OpenCL 映射输入到输出
OpenCL mapping input to output
我有一个大小为 14 的输入 `[1,2,3,4,1,3,3,5,1,2,5,4,1,3]`,并把它作为内核的输入。内核会从给定的输出缓冲区索引开始,把该输入复制到输出缓冲区;出于实验目的,这项工作只在单个工作项(工作项 0)中完成。随后,我按 `(input_int --> work_item_id --> output_int)` 的格式打印映射结果,并打印复制回主机的 `output`。映射结果显示整个输入(大小为 14)都已映射到 `output`,但在结果截图中,复制回来的 `output`(即 "Resulting output")只打印了 7 个元素。7 是工作项的数量,而我的本意是让工作项 0 把整个输入复制到输出。这里发生了什么?
程序:
#!/usr/bin/env python3
# NOTE: the two "#pragma OPENCL EXTENSION" lines below sit outside the kernel
# string, so Python treats them as ordinary comments.
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
import pyopencl as cl
import numpy as np
# NOTE(review): seq is not referenced anywhere in the visible script.
import seq
# OpenCL kernel source, kept as a multiline string and compiled at runtime.
# Only the work-item whose global id is 0 does any work: it copies 14 bytes
# from AplusB into output and prints each (input --> work-item id --> output)
# mapping. The N argument is accepted but never read by the kernel body.
kernel = """
__kernel void dragon(
const int N,
__global char *AplusB,
__global char *output
)
{
int idx = get_global_id(0);
if(idx == 0){ //A+B
printf ("\n mappings from input to output in kernel at idx %d",idx);
for (int i = 0; i <14; i++){
output[i]= AplusB[i];
int a = output[i];
int b =AplusB[i];
printf("\n %d --> %d --> %d \n",b,idx, a);
}
}
}
"""
# Declare constants.
# number_of_expansions and resulting_str_size are not used in the visible
# script. total_probelem_size is used below as the host output element count,
# the kernel's N argument and the global work size.
number_of_expansions = 4
total_probelem_size =7
resulting_str_size=62
# Step 1: Create a context.
# This may ask the user to select the device to be used.
context = cl.create_some_context()
# Create a command queue on that context.
queue = cl.CommandQueue(context)
# Compile the kernel source for the context.
program = cl.Program(context, kernel).build()
# Create the input data: 14 int8 values.
AplusB = np.array([1,2,3,4,1,3,3,5,1,2,5,4,1,3], dtype = np.int8)
# Prepare the host-side output array.
# NOTE: it holds total_probelem_size (7) int8 elements, not 14.
output = np.empty(total_probelem_size).astype(np.int8)
output.fill(0)
# Create the device buffer the kernel writes its result into.
# NOTE: its size is output.nbytes (7 bytes), while the kernel loop writes
# indices 0..13.
expanded_output = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, output.nbytes)
# Create a read-only device buffer initialised from the host input array.
mf = cl.mem_flags
AplusBBuf = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=AplusB)
# Look up the kernel and declare its argument types: the first argument is an
# int32 scalar, the other two are buffers.
dragon = program.dragon
dragon.set_scalar_arg_dtypes([np.int32, None,None])
global_work_size = total_probelem_size
# Launch the kernel over global_work_size (7) work-items; only work-item 0
# performs the copy from AplusBBuf into expanded_output.
dragon(queue, (global_work_size,), None,total_probelem_size,AplusBBuf,expanded_output)
# Wait for the queue to be completely processed.
queue.finish()
# Copy the device buffer back into the host array (7 elements).
cl.enqueue_copy(queue, output, expanded_output).wait()
print("----------- Resulting output -------------------")
print (output)
结果:
`expanded_output` 和 `output` 都只分配了 7 个元素的大小(`total_probelem_size`)。因此,内核对 `output[7]` 到 `output[13]` 的写入是越界的,写入的是无效的内存位置。又因为主机端的 `output` 数组只包含 7 个元素,所以最终只打印出 7 个元素。
我有一个大小为 14 的输入 `[1,2,3,4,1,3,3,5,1,2,5,4,1,3]`,并把它作为内核的输入。内核会从给定的输出缓冲区索引开始,把该输入复制到输出缓冲区;出于实验目的,这项工作只在单个工作项(工作项 0)中完成。随后,我按 `(input_int --> work_item_id --> output_int)` 的格式打印映射结果,并打印复制回主机的 `output`。映射结果显示整个输入(大小为 14)都已映射到 `output`,但在结果截图中,复制回来的 `output`(即 "Resulting output")只打印了 7 个元素。7 是工作项的数量,而我的本意是让工作项 0 把整个输入复制到输出。这里发生了什么?
程序:
#!/usr/bin/env python3
# NOTE: the two "#pragma OPENCL EXTENSION" lines below sit outside the kernel
# string, so Python treats them as ordinary comments.
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
import pyopencl as cl
import numpy as np
# NOTE(review): seq is not referenced anywhere in the visible script.
import seq
# OpenCL kernel source, kept as a multiline string and compiled at runtime.
# Only the work-item whose global id is 0 does any work: it copies 14 bytes
# from AplusB into output and prints each (input --> work-item id --> output)
# mapping. The N argument is accepted but never read by the kernel body.
kernel = """
__kernel void dragon(
const int N,
__global char *AplusB,
__global char *output
)
{
int idx = get_global_id(0);
if(idx == 0){ //A+B
printf ("\n mappings from input to output in kernel at idx %d",idx);
for (int i = 0; i <14; i++){
output[i]= AplusB[i];
int a = output[i];
int b =AplusB[i];
printf("\n %d --> %d --> %d \n",b,idx, a);
}
}
}
"""
# Declare constants.
# number_of_expansions and resulting_str_size are not used in the visible
# script. total_probelem_size is used below as the host output element count,
# the kernel's N argument and the global work size.
number_of_expansions = 4
total_probelem_size =7
resulting_str_size=62
# Step 1: Create a context.
# This may ask the user to select the device to be used.
context = cl.create_some_context()
# Create a command queue on that context.
queue = cl.CommandQueue(context)
# Compile the kernel source for the context.
program = cl.Program(context, kernel).build()
# Create the input data: 14 int8 values.
AplusB = np.array([1,2,3,4,1,3,3,5,1,2,5,4,1,3], dtype = np.int8)
# Prepare the host-side output array.
# NOTE: it holds total_probelem_size (7) int8 elements, not 14.
output = np.empty(total_probelem_size).astype(np.int8)
output.fill(0)
# Create the device buffer the kernel writes its result into.
# NOTE: its size is output.nbytes (7 bytes), while the kernel loop writes
# indices 0..13.
expanded_output = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, output.nbytes)
# Create a read-only device buffer initialised from the host input array.
mf = cl.mem_flags
AplusBBuf = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=AplusB)
# Look up the kernel and declare its argument types: the first argument is an
# int32 scalar, the other two are buffers.
dragon = program.dragon
dragon.set_scalar_arg_dtypes([np.int32, None,None])
global_work_size = total_probelem_size
# Launch the kernel over global_work_size (7) work-items; only work-item 0
# performs the copy from AplusBBuf into expanded_output.
dragon(queue, (global_work_size,), None,total_probelem_size,AplusBBuf,expanded_output)
# Wait for the queue to be completely processed.
queue.finish()
# Copy the device buffer back into the host array (7 elements).
cl.enqueue_copy(queue, output, expanded_output).wait()
print("----------- Resulting output -------------------")
print (output)
结果:
`expanded_output` 和 `output` 都只分配了 7 个元素的大小(`total_probelem_size`)。因此,内核对 `output[7]` 到 `output[13]` 的写入是越界的,写入的是无效的内存位置。又因为主机端的 `output` 数组只包含 7 个元素,所以最终只打印出 7 个元素。