Image Edge Detection in C++ using OpenCL produces Rotated Image

我目前正在尝试使用 OpenCL 在 C++ 中实现 Sobel 边缘检测方法,以并行实现部分代码。我设法正确检测输入图像的边缘,但是,我的输出图像是输入图像的旋转和反射版本。请参阅下面的图片作为参考:






/* Commands needed to run this file:
*   g++ sobel.cpp -o sobel.out -lOpenCL ----> compiles file and creates an executable file
*   ./sobel.out chess.pgm 100 35  ----> runs the executable file on the chess image for a high threshold of 100 and
*                                       low threshold value of 35

#include <tuple>

using namespace std;

int main(int argc, char **argv)
    if (argc != 4)
        cout << "Proper syntax: ./a.out <input_filename> <high_threshold> <low_threshold>" << endl;
        return 0;

    // Exit program if file doesn't open
    string filename(argv[1]);
    string path = "./input_images/" + filename;
    ifstream infile(path, ios::binary);
    if (!infile.is_open())
        cout << "File " << path << " not found in directory." << endl;
        return 0;

    ofstream img_mag("./output_images/sobel_mag.pgm", ios::binary);
    ofstream img_hi("./output_images/sobel_hi.pgm", ios::binary);
    ofstream img_lo("./output_images/sobel_lo.pgm", ios::binary);
    ofstream img_x("./output_images/sobel_x.pgm", ios::binary);
    ofstream img_y("./output_images/sobel_y.pgm", ios::binary);

    char buffer[1024];
    int width, height, intensity, hi = stoi(argv[2]), lo = stoi(argv[3]);
    int sumx, sumy;

    // Storing header information and copying into the new ouput images
    infile  >> buffer >> width >> height >> intensity;
    img_mag << buffer << endl << width << " " << height << endl << intensity << endl;
    img_hi  << buffer << endl << width << " " << height << endl << intensity << endl;
    img_lo  << buffer << endl << width << " " << height << endl << intensity << endl;
    img_x   << buffer << endl << width << " " << height << endl << intensity << endl;
    img_y   << buffer << endl << width << " " << height << endl << intensity << endl;

    // These matrices will hold the integer values of the input image
    int Size = width * height;
    int pic[Size];

    // Reading in the input image
    for (int i = 0; i < Size; i++){
        pic[i] = (int)infile.get();

    // setting up the OpenCL
    clock_t start, end;  //Timers to for execution timing & performance
    //Initialize Buffers, memory space the allows for communication between the host and the target device
    cl_mem width_buffer, height_buffer, input_buffer, xConv_buffer, yConv_buffer, size_buffer, magOutput_buffer;

    //Get the platform you want to use
    cl_uint platformCount; //keeps track of the number of platforms you have installed on your device
    cl_platform_id *platforms;
    // get platform count
    clGetPlatformIDs(5, NULL, &platformCount); //sets platformCount to the number of platforms

    // get all platforms
    platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
    clGetPlatformIDs(platformCount, platforms, NULL); //saves a list of platforms in the platforms variable
    //Select the platform you would like to use in this program (change index to do this). If you would like to see all available platforms run platform.cpp.
    cl_platform_id platform = platforms[0]; 
    //Outputs the information of the chosen platform
    char* Info = (char*)malloc(0x1000*sizeof(char));
    clGetPlatformInfo(platform, CL_PLATFORM_NAME      , 0x1000, Info, 0);
    printf("Name      : %s\n", Info);
    clGetPlatformInfo(platform, CL_PLATFORM_VENDOR    , 0x1000, Info, 0);
    printf("Vendor    : %s\n", Info);
    clGetPlatformInfo(platform, CL_PLATFORM_VERSION   , 0x1000, Info, 0);
    printf("Version   : %s\n", Info);
    clGetPlatformInfo(platform, CL_PLATFORM_PROFILE   , 0x1000, Info, 0);
    printf("Profile   : %s\n", Info);

    // get device ID must first get platform
    cl_device_id device; //this is your deviceID
    cl_int err, err1, err2;
    // Access a device
    //The if statement checks to see if the chosen platform uses a GPU, if not it setups the device using the CPU
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if(err == CL_DEVICE_NOT_FOUND) {
        err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
    printf("Device ID = %i\n",err);

    // creates a context that allows devices to send and receive kernels and transfer data
    cl_context context; //This is your contextID, the line below must just run
    context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);

    // get details about the kernel.cl file in order to create it (read the kernel.cl file and place it in a buffer)
    //read file in  
    FILE *program_handle;
    program_handle = fopen("OpenCL/Kernel.cl", "r");

    //get program size
    size_t program_size;//, log_size;
    fseek(program_handle, 0, SEEK_END);
    program_size = ftell(program_handle);

    //sort buffer out
    char *program_buffer;//, *program_log;
    program_buffer = (char*)malloc(program_size + 1);
    program_buffer[program_size] = '[=10=]';
    fread(program_buffer, sizeof(char), program_size, program_handle);
    // create program from source because the kernel is in a separate file 'kernel.cl', therefore the compiler must run twice once on main and once on kernel
    cl_program program = clCreateProgramWithSource(context, 1, (const char**)&program_buffer, &program_size, NULL); //this compiles the kernels code

    // build the program, this compiles the source code from above for the devices that the code has to run on (ie GPU or CPU)
    cl_int err3= clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    printf("program ID = %i\n", err3);

    // creates the kernel, this creates a kernel from one of the functions in the cl_program you just built
    // select the kernel you are running
    cl_kernel kernel = clCreateKernel(program, "sobelEdgeDetection", &err);
    // create command queue to the target device. This is the queue that the kernels get dispatched too, to get the the desired device.
    cl_command_queue queue = clCreateCommandQueueWithProperties(context, device, 0, NULL);

    // create data buffers for memory management between the host and the target device
    size_t global_size = Size; //total number of work items
    size_t local_size = height; //Size of each work group
    cl_int num_groups = global_size/local_size; //number of work groups needed
    int magOutput[global_size];
    int xConv[global_size];
    int yConv[global_size];
    //Buffer (memory block) that both the host and target device can access 
    width_buffer = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,sizeof(int), &width, &err);
    height_buffer = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,sizeof(int), &height, &err);
    input_buffer = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,global_size*sizeof(int), &pic, &err);
    xConv_buffer = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,global_size*sizeof(int), &xConv, &err);
    yConv_buffer = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,global_size*sizeof(int), &yConv, &err);
    size_buffer = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,sizeof(int), &Size, &err);
    magOutput_buffer = clCreateBuffer(context,CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,global_size*sizeof(int), &magOutput, &err);

    // create the arguments for the kernel (link these to the buffers set above, using the pointers for the respective buffers)
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &width_buffer);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &height_buffer);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &input_buffer);
    clSetKernelArg(kernel, 3, sizeof(cl_mem), &xConv_buffer);
    clSetKernelArg(kernel, 4, sizeof(cl_mem), &yConv_buffer);
    clSetKernelArg(kernel, 5, sizeof(cl_mem), &size_buffer);
    clSetKernelArg(kernel, 6, sizeof(cl_mem), &magOutput_buffer);
    //enqueue kernel, deploys the kernels and determines the number of work-items that should be generated to execute the kernel (global_size) and the number of work-items in each work-group (local_size).
    cl_int err4 = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL); 
    printf("\nKernel check: %i \n",err4);

    // Allows the host to read from the buffer object 
    err = clEnqueueReadBuffer(queue, xConv_buffer, CL_TRUE, 0, sizeof(xConv), xConv, 0, NULL, NULL);
    err = clEnqueueReadBuffer(queue, yConv_buffer, CL_TRUE, 0, sizeof(yConv), yConv, 0, NULL, NULL);
    err = clEnqueueReadBuffer(queue, magOutput_buffer, CL_TRUE, 0, sizeof(magOutput), magOutput, 0, NULL, NULL);
    //This command stops the program here until everything in the queue has been run

    // Once OpenCL has been used finish off the processing by normalising the magOutput array
    // Make sure all the x,y and output magnitude values are between 0-255
    int maxVal = 0; 
    int maxx = 0; 
    int maxy = 0;

    for (int j = 0; j < Size; j++){
        if (xConv[j] > maxx)
            maxx = xConv[j];

        if (yConv[j] > maxy)
            maxy = yConv[j];

        if (magOutput[j] > maxy)
            maxVal = magOutput[j];

    int tempx;
    // Make sure all the magnitude values are between 0-255
    for (int z = 0; z < Size; z++){
        xConv[z] = xConv[z] * 255 / maxx;
        yConv[z] = yConv[z] * 255 / maxy;
        magOutput[z] = magOutput[z] * 255 / maxVal;

    printf("\nMaxx: %i \n",maxx); 
    printf("Maxy: %i \n",maxy);
    printf("MaxVal: %i \n",maxVal);  

    // Make sure to cast back to char before outputting
    // Also to avoid any wonky results, get rid of any decimals by casting to int first
    for (int j = 0; j < Size; j++){
        // Output the x image
        img_x << (char)((int)xConv[j]);

        // Output the y image
        img_y << (char)((int)yConv[j]);

        // Output the magnitude image
        img_mag << (char)((int)magOutput[j]);

        // Ouput the low threshold image
        if (magOutput[j] > lo)
            img_lo << (char)255;
            img_lo << (char)0;

        // Ouput the high threshold image
        if (magOutput[j] > hi)
            img_hi << (char)255;
            img_hi << (char)0;
    // Deallocate all the OpenCL resources          

    return 0;;


__kernel void sobelEdgeDetection(__global int* width,__global int* height, __global int* pic, __global int* xConv, __global int* yConv, __global int* Size, __global int* magOutput){
    int workItemNum = get_global_id(0); //Work item ID
    int workGroupNum = get_group_id(0); //Work group ID
    int localGroupID = get_local_id(0); //Work items ID within each work group
    // size refers to the total size of a matrix. So for a 3x3 size = 9
    int dim = *Size;
    int row = *height; // only square matrices are used and as such the sqrt of size produces the row length
    int col = *width; // only square matrices are used and as such the sqrt of size produces the column length

    int current_row = workItemNum/col; // the current row is calculated by using the current workitem number divided by the total size of the matrix
    int current_col = workItemNum % col; // the current column is calculated by using the current workitem number modulus by the total size of the matrix

    if (workItemNum == dim-1)
        printf("\nColumn size:  %i \n",col);
        printf("Row size:  %i \n",row);
        printf("Image Size:  %i \n",dim);

    // This if statement excludes all boundary pixels from the calculation as you require the neighbouring pixel cells 
    // for this calculation
    if (current_col == 0 || current_col == col-1 || current_row == 0 || current_row == row - 1){
        xConv[workItemNum] = 0;
        yConv[workItemNum] = 0;
        magOutput[workItemNum] = 0; // do not assess the bondary pixels and just set the value of the output array to zero
        //printf("Workitemnum: %i \n", workItemNum);

        * The xConv array performs the kernal convultion of the input grey scale values with the following matrix:
        *                            [-1  0 +1]
        * X - Directional Kernel  =  [-2  0 +2]
        *                            [-1  0 +1]
        * This scans across the X direction of the image and enhances all edges in the X-direction 
        xConv[workItemNum] = pic[(current_col - 1)*col + current_row - 1]*-1 
                 + pic[(current_col)*col + current_row - 1]*-2 
                 + pic[(current_col + 1)*col + current_row - 1]*-1 
                 + pic[(current_col - 1)*col + current_row]*0 
                 + pic[(current_col)*col + current_row]*0 
                 + pic[(current_col + 1)*col + current_row]*0 
                 + pic[(current_col - 1)*col + current_row + 1]*1 
                 + pic[(current_col)*col + current_row + 1]*2 
                 + pic[(current_col + 1)*col + current_row + 1]*1;

        * The xConv array performs the kernal convultion of the input grey scale values with the following matrix:
        *                            [+1 +2 +1]
        * Y - Directional Kernel  =  [ 0  0  0]
        *                            [-1 -2 -1]
        * This scans across the Y direction of the image and enhances all edges in the Y-direction 
        yConv[workItemNum] =  pic[(current_col - 1)*col + current_row - 1]*1 
                 + pic[(current_col)*col + current_row - 1]*0 
                 + pic[(current_col + 1)*col + current_row - 1]*-1 
                 + pic[(current_col - 1)*col + current_row]*2 
                 + pic[(current_col)*col + current_row]*0 
                 + pic[(current_col + 1)*col + current_row]*-2 
                 + pic[(current_col - 1)*col + current_row + 1]*1 
                 + pic[(current_col)*col + current_row + 1]*0 
                 + pic[(current_col + 1)*col + current_row + 1]*-1;

        * Calculates the convolution matrix of the X and Y arrays. Does so by squaring each item of the X and Y arrays,  
        * adding them and taking the square root. This is the basic magnitude formula. This is done for by each workItem
        const float xConvf = (float)xConv[workItemNum], yConvf = (float)yConv[workItemNum];
        magOutput[workItemNum] = (int)(sqrt(xConvf*xConvf + yConvf*yConvf)+0.5f);

您的主机 (c++) 代码看起来不错,但您的内核代码包含错误:

 xConv[workItemNum] = pic[(current_col - 1)*col + current_row - 1]*-1 
             + pic[(current_col)*col + current_row - 1]*-2 
             + pic[(current_col + 1)*col + current_row - 1]*-1 
             + pic[(current_col - 1)*col + current_row]*0 
             + pic[(current_col)*col + current_row]*0 
             + pic[(current_col + 1)*col + current_row]*0 
             + pic[(current_col - 1)*col + current_row + 1]*1 
             + pic[(current_col)*col + current_row + 1]*2 
             + pic[(current_col + 1)*col + current_row + 1]*1;

    * The xConv array performs the kernal convultion of the input grey scale values with the following matrix:
    *                            [+1 +2 +1]
    * Y - Directional Kernel  =  [ 0  0  0]
    *                            [-1 -2 -1]
    * This scans across the Y direction of the image and enhances all edges in the Y-direction 
    yConv[workItemNum] =  pic[(current_col - 1)*col + current_row - 1]*1 
             + pic[(current_col)*col + current_row - 1]*0 
             + pic[(current_col + 1)*col + current_row - 1]*-1 
             + pic[(current_col - 1)*col + current_row]*2 
             + pic[(current_col)*col + current_row]*0 
             + pic[(current_col + 1)*col + current_row]*-2 
             + pic[(current_col - 1)*col + current_row + 1]*1 
             + pic[(current_col)*col + current_row + 1]*0 
             + pic[(current_col + 1)*col + current_row + 1]*-1;

我不熟悉 sobel 算法,但您似乎错误地索引了 pic 数组。 如果您的意图是 select (row=current_row,col=current_col) 处的像素, 那么你应该像 pic[(current_row)*col+current_col].


如果您打算索引 (row=current_col,col=current_row) 处的像素,那么您的原始代码可以工作,但是如果 rowcol 是,您只能索引 (row=current_col,col=current_row)完全相同的。使用您提供的图像,您最终会索引超出数组的边界。 请重新检查您的内核代码。

P.S。我强烈建议将 row 重命名为 numRows,将 col 重命名为 numCols