如何在 JCuda 中创建本地指针结构
How can I create a struct of native pointers in JCuda
我有一个采用结构列表的 CUDA 内核。
kernel<<<blockCount,blockSize>>>(MyStruct *structs);
每个结构包含 3 个指针。
// One node of the struct list handed to the kernel: three device pointers.
// __align__(16) is the CUDA alignment specifier (the original spelling
// "__align(16)__" does not compile); the typedef is also given the name
// MyStruct that the kernel signature uses, plus its terminating semicolon.
typedef struct __align__(16) {
    float* pointer1;
    float* pointer2;
    float* pointer3;
} MyStruct;
我有三个包含浮点数的设备数组,结构中的每个指针都指向三个设备数组之一中的一个浮点数。
结构列表表示一个树/图结构,它允许内核根据发送给内核的结构列表的顺序执行递归操作。(这部分在 C++ 中可以正常工作,因此与我的问题无关)
我想做的是能够从 JCuda 发送我的指针结构。我知道,除非像 this post 中那样将其展平为带填充的数组,否则这在 JCuda 中本身是做不到的。
我了解发送结构列表时可能发生的所有对齐和填充问题,它本质上是一个重复的填充数组,我可以接受。
我不确定该怎么做的地方是:如何用指针来填充展平后的结构缓冲区。例如,我想我可以这样做:
// Sketch only (not compilable): the "...." initializers stand for existing
// device allocations, and someGetUnderlyingPointerValueFunction() is a
// placeholder for the accessor this question asks about.
Pointer A = ....(underlying device array1)
Pointer B = ....(underlying device array2)
Pointer C = ....(underlying device array3)
// NOTE(review): 16 bytes per step only holds four 4-byte slots, but four
// 8-byte longs are appended per step below - confirm the intended pointer size.
ByteBuffer structListBuffer = ByteBuffer.allocate(16*noSteps);
for(int x = 0; x<noSteps; x++) {
// Get the underlying pointer values
long pointer1 = A.withByteOffset(getStepOffsetA(x)).someGetUnderlyingPointerValueFunction();
long pointer2 = B.withByteOffset(getStepOffsetB(x)).someGetUnderlyingPointerValueFunction();
long pointer3 = C.withByteOffset(getStepOffsetC(x)).someGetUnderlyingPointerValueFunction();
// Build the struct
// NOTE(review): LongBuffer offers put(), not append(), and every
// asLongBuffer() call returns a new view - a real implementation would
// presumably create the view once outside the loop.
structListBuffer.asLongBuffer().append(pointer1);
structListBuffer.asLongBuffer().append(pointer2);
structListBuffer.asLongBuffer().append(pointer3);
structListBuffer.asLongBuffer().append(0); //padding
}
这样,structListBuffer 将以内核期望的方式包含一个结构列表。
那么,有什么方法可以从 ByteBuffer 中实现 someGetUnderlyingPointerValueFunction() 的功能吗?
如果我没理解错的话,问题的重点是:是否存在类似下面这样的“神奇”函数
long address = pointer.someGetUnderlyingPointerValueFunction();
它返回本机指针的地址。
简短的回答:不,没有这样的功能。
(旁注:很久以前就有人要求过类似的函数,但我一直没有添加。主要原因是,这样的函数对于指向 Java 数组或(非直接)字节缓冲区的指针没有意义。此外,在 32 位和 64 位机器上手动处理带填充和对齐的结构、大小不同的指针,以及大端或小端的缓冲区,是无穷无尽的麻烦之源。但我理解这个需求以及可能的应用场景,所以我很可能会添加类似 getAddress() 的函数。也许只添加到 CUdeviceptr 类中,在那里它确实有意义——至少比放在 Pointer 类中更有意义。人们一定会用这个方法做一些奇怪的事情,也一定会做出导致 VM 严重崩溃的事情,但 JCuda 本身只是一个很薄的抽象层,在这方面本来就没有安全网……)
也就是说,您可以使用如下方法解决当前限制:
/**
 * Returns the numeric address the given device pointer refers to, i.e. its
 * native pointer value plus its byte offset.
 *
 * @param p the device pointer to inspect
 * @return the native pointer value plus the byte offset of {@code p}
 */
private static long getPointerAddress(CUdeviceptr p)
{
    // WORKAROUND until a method like CUdeviceptr#getAddress exists:
    // an anonymous subclass of Pointer is created only to reach the two
    // accessors that the address is composed of.
    return new Pointer(p)
    {
        long resolveAddress()
        {
            long base = getNativePointer();
            long offset = getByteOffset();
            return base + offset;
        }
    }.resolveAddress();
}
当然,这很丑陋,而且显然违背了将 getNativePointer() 和 getByteOffset() 方法声明为 protected 的初衷。但它最终可能会被某个“官方”方法取代:
// Hypothetical future form of the helper: it relies on a
// CUdeviceptr#getAddress() method which, according to the surrounding
// text, had not been added to JCuda when this was written.
private static long getPointerAddress(CUdeviceptr p)
{
return p.getAddress();
}
在那之前,这可能是最接近您在 C 端所能做到的效果的解决方案。
这是我为测试这一点而写的一个例子。内核只是一个占位内核,它用“可识别”的值填充结构(以查看它们最终是否落在正确的位置),并且应当只用 1 个线程启动:
// Struct of three device pointers, aligned to 16 bytes.
// __align__(16) is CUDA's portable alignment specifier; the previous
// __declspec(align(16)) is only understood when MSVC is the host compiler.
// The host program lays each struct out as four pointer-sized slots
// (three pointers followed by one slot of padding).
typedef struct __align__(16) {
    float* pointer1;
    float* pointer2;
    float* pointer3;
} MyStruct;
// Writes three floats to target[0..2].
static __device__ void storeTriple(float *target, float v0, float v1, float v2)
{
    target[0] = v0;
    target[1] = v1;
    target[2] = v2;
}

// Dummy kernel: fills the three floats behind every pointer of the first
// two structs with identifiable values (1.x / 2.x / 3.x for structs[0],
// 11.x / 12.x / 13.x for structs[1]). It does not use the thread index,
// so it is meant to be launched with a single thread.
extern "C"
__global__ void kernel(MyStruct *structs)
{
    const MyStruct &first = structs[0];
    storeTriple(first.pointer1, 1.0f, 1.1f, 1.2f);
    storeTriple(first.pointer2, 2.0f, 2.1f, 2.2f);
    storeTriple(first.pointer3, 3.0f, 3.1f, 3.2f);

    const MyStruct &second = structs[1];
    storeTriple(second.pointer1, 11.0f, 11.1f, 11.2f);
    storeTriple(second.pointer2, 12.0f, 12.1f, 12.2f);
    storeTriple(second.pointer3, 13.0f, 13.1f, 13.2f);
}
此内核在以下程序中启动(注意:PTX 文件的编译在此处即时完成,其设置可能与您的应用程序不匹配。如有疑问,您可以手动编译 PTX 文件)。
每个结构的 pointer1、pointer2 和 pointer3 指针被初始化为分别指向设备缓冲区 A、B 和 C 中的连续元素,每个指针都带有一个偏移量,以便识别内核写入的值。(请注意,我试图同时处理在 32 位或 64 位机器上运行这两种可能的情况,它们的指针大小不同——不过目前我只能测试 32 位版本)
import static jcuda.driver.JCudaDriver.*;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.IntBuffer;
import java.nio.LongBuffer;
import java.util.Arrays;
import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.driver.CUcontext;
import jcuda.driver.CUdevice;
import jcuda.driver.CUdeviceptr;
import jcuda.driver.CUfunction;
import jcuda.driver.CUmodule;
import jcuda.driver.JCudaDriver;
/**
 * Example that passes a list of structs containing device pointers to a
 * CUDA kernel. Each struct is flattened on the host into four pointer-sized
 * slots (three addresses plus one slot of padding) and copied to the device.
 */
public class JCudaPointersInStruct
{
    /** Pointer-sized slots per struct: three pointers and one padding slot. */
    private static final int SLOTS_PER_STRUCT = 4;

    /** Distance, in floats, between the targets of two consecutive structs. */
    private static final int FLOATS_PER_STEP = 4;

    /**
     * Compiles and loads the kernel, builds the struct list, launches the
     * kernel with a single thread and prints the three device arrays.
     *
     * @param args unused
     * @throws IOException if the PTX file cannot be created
     */
    public static void main(String args[]) throws IOException
    {
        JCudaDriver.setExceptionsEnabled(true);
        String ptxFileName = preparePtxFile("JCudaPointersInStructKernel.cu");

        cuInit(0);
        CUdevice device = new CUdevice();
        cuDeviceGet(device, 0);
        CUcontext context = new CUcontext();
        cuCtxCreate(context, 0, device);
        try
        {
            CUmodule module = new CUmodule();
            cuModuleLoad(module, ptxFileName);
            CUfunction function = new CUfunction();
            cuModuleGetFunction(function, module, "kernel");

            int numElements = 9;
            CUdeviceptr A = allocateZeroedFloats(numElements);
            CUdeviceptr B = allocateZeroedFloats(numElements);
            CUdeviceptr C = allocateZeroedFloats(numElements);

            int numSteps = 2;
            ByteBuffer hostStructsBuffer = createHostStructs(A, B, C, numSteps);
            int structsSizeInBytes = hostStructsBuffer.capacity();

            CUdeviceptr structs = new CUdeviceptr();
            cuMemAlloc(structs, structsSizeInBytes);
            cuMemcpyHtoD(structs, Pointer.to(hostStructsBuffer),
                structsSizeInBytes);

            Pointer kernelParameters = Pointer.to(
                Pointer.to(structs)
            );
            cuLaunchKernel(function,
                1, 1, 1,
                1, 1, 1,
                0, null, kernelParameters, null);
            cuCtxSynchronize();

            float hostA[] = downloadFloats(A, numElements);
            float hostB[] = downloadFloats(B, numElements);
            float hostC[] = downloadFloats(C, numElements);
            System.out.println("A "+Arrays.toString(hostA));
            System.out.println("B "+Arrays.toString(hostB));
            System.out.println("C "+Arrays.toString(hostC));

            cuMemFree(structs);
            cuMemFree(A);
            cuMemFree(B);
            cuMemFree(C);
        }
        finally
        {
            // Also runs on failure paths, so the context is not left behind
            // when one of the calls above throws.
            cuCtxDestroy(context);
        }
    }

    /**
     * Allocates a device buffer for the given number of floats and sets
     * every 32-bit element to zero.
     */
    private static CUdeviceptr allocateZeroedFloats(int numElements)
    {
        CUdeviceptr pointer = new CUdeviceptr();
        cuMemAlloc(pointer, numElements * Sizeof.FLOAT);
        cuMemsetD32(pointer, 0, numElements);
        return pointer;
    }

    /**
     * Copies the given number of floats from device memory into a new
     * host array.
     */
    private static float[] downloadFloats(CUdeviceptr source, int numElements)
    {
        float host[] = new float[numElements];
        cuMemcpyDtoH(Pointer.to(host), source, numElements * Sizeof.FLOAT);
        return host;
    }

    /**
     * Builds the host-side image of the struct list. Each struct occupies
     * {@link #SLOTS_PER_STRUCT} pointer-sized slots in native byte order:
     * the addresses for pointer1, pointer2, pointer3, then a zero slot.
     * Absolute puts are used, so the position of the returned buffer is 0.
     *
     * @param deviceA buffer that pointer1 of every struct points into
     * @param deviceB buffer that pointer2 of every struct points into
     * @param deviceC buffer that pointer3 of every struct points into
     * @param numSteps number of structs to create
     * @return a heap buffer of numSteps * SLOTS_PER_STRUCT * Sizeof.POINTER bytes
     */
    private static ByteBuffer createHostStructs(CUdeviceptr deviceA,
        CUdeviceptr deviceB, CUdeviceptr deviceC, int numSteps)
    {
        int sizeOfStruct = Sizeof.POINTER * SLOTS_PER_STRUCT;
        ByteBuffer buffer = ByteBuffer.allocate(numSteps * sizeOfStruct);
        buffer.order(ByteOrder.nativeOrder());
        for (int x = 0; x < numSteps; x++)
        {
            int structStart = x * sizeOfStruct;
            putPointer(buffer, structStart,
                getPointerAddress(deviceA.withByteOffset(getStepOffsetA(x))));
            putPointer(buffer, structStart + Sizeof.POINTER,
                getPointerAddress(deviceB.withByteOffset(getStepOffsetB(x))));
            putPointer(buffer, structStart + 2 * Sizeof.POINTER,
                getPointerAddress(deviceC.withByteOffset(getStepOffsetC(x))));
            // Padding slot
            putPointer(buffer, structStart + 3 * Sizeof.POINTER, 0L);
        }
        return buffer;
    }

    /**
     * Writes an address at the given byte offset, as a 4-byte int when
     * Sizeof.POINTER is 4 and as an 8-byte long otherwise.
     */
    private static void putPointer(ByteBuffer buffer, int byteOffset,
        long address)
    {
        if (Sizeof.POINTER == 4)
        {
            buffer.putInt(byteOffset, (int)address);
        }
        else
        {
            buffer.putLong(byteOffset, address);
        }
    }

    // The offsets are computed in long arithmetic so that large step
    // indices cannot overflow int before the result is widened.
    private static long getStepOffsetA(int x)
    {
        return (long)x * Sizeof.FLOAT * FLOATS_PER_STEP + 0L * Sizeof.FLOAT;
    }

    private static long getStepOffsetB(int x)
    {
        return (long)x * Sizeof.FLOAT * FLOATS_PER_STEP + 1L * Sizeof.FLOAT;
    }

    private static long getStepOffsetC(int x)
    {
        return (long)x * Sizeof.FLOAT * FLOATS_PER_STEP + 2L * Sizeof.FLOAT;
    }

    /**
     * Returns the native pointer value of the given device pointer plus
     * its byte offset.
     */
    private static long getPointerAddress(CUdeviceptr p)
    {
        // WORKAROUND until a method like CUdeviceptr#getAddress exists
        class PointerWithAddress extends Pointer
        {
            PointerWithAddress(Pointer other)
            {
                super(other);
            }
            long getAddress()
            {
                return getNativePointer() + getByteOffset();
            }
        }
        return new PointerWithAddress(p).getAddress();
    }

    //-------------------------------------------------------------------------
    // Ignore this - in practice, you'll compile the PTX manually

    /**
     * Compiles the given .cu file into a PTX file by running nvcc, and
     * returns the name of the PTX file.
     *
     * @param cuFileName name of the CUDA source file
     * @return the PTX file name: the input name with its extension replaced
     *         by (or, if it has none, extended with) ".ptx"
     * @throws IOException if the input file does not exist, nvcc cannot be
     *         started or exits with a non-zero value, or the wait is interrupted
     */
    private static String preparePtxFile(String cuFileName) throws IOException
    {
        int endIndex = cuFileName.lastIndexOf('.');
        String baseName =
            (endIndex == -1) ? cuFileName : cuFileName.substring(0, endIndex);
        String ptxFileName = baseName + ".ptx";

        File cuFile = new File(cuFileName);
        if (!cuFile.exists())
        {
            throw new IOException("Input file not found: "+cuFileName);
        }

        // The property is not guaranteed to be set on every JVM; fall back
        // to the pointer size reported by JCuda.
        String dataModel = System.getProperty("sun.arch.data.model");
        if (dataModel == null)
        {
            dataModel = String.valueOf(Sizeof.POINTER * 8);
        }
        String modelString = "-m"+dataModel;

        // Passing the arguments as a list (instead of one command string)
        // keeps file names that contain spaces intact.
        String commandLine[] = {
            "nvcc", modelString, "-ptx", "-arch", "sm_11", "-lineinfo",
            cuFile.getPath(), "-o", ptxFileName
        };
        StringBuilder command = new StringBuilder();
        for (String argument : commandLine)
        {
            if (command.length() > 0)
            {
                command.append(' ');
            }
            command.append(argument);
        }
        System.out.println("Executing\n"+command);
        Process process = new ProcessBuilder(commandLine).start();

        // stderr is drained on a second thread while this thread drains
        // stdout. Reading one stream to its end before touching the other
        // can block forever once the process fills the unread pipe.
        StreamDrainer errorDrainer =
            new StreamDrainer(process.getErrorStream());
        errorDrainer.start();
        String outputMessage =
            new String(toByteArray(process.getInputStream()));
        String errorMessage;
        int exitValue;
        try
        {
            errorMessage = new String(errorDrainer.await());
            exitValue = process.waitFor();
        }
        catch (InterruptedException e)
        {
            Thread.currentThread().interrupt();
            throw new IOException(
                "Interrupted while waiting for nvcc output", e);
        }

        if (exitValue != 0)
        {
            System.out.println("nvcc process exitValue "+exitValue);
            System.out.println("errorMessage:\n"+errorMessage);
            System.out.println("outputMessage:\n"+outputMessage);
            throw new IOException(
                "Could not create .ptx file: "+errorMessage);
        }
        System.out.println("Finished creating PTX file");
        return ptxFileName;
    }

    /** Reads the given stream until its end and returns all bytes read. */
    private static byte[] toByteArray(InputStream inputStream)
        throws IOException
    {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        byte buffer[] = new byte[8192];
        while (true)
        {
            int read = inputStream.read(buffer);
            if (read == -1)
            {
                break;
            }
            baos.write(buffer, 0, read);
        }
        return baos.toByteArray();
    }

    /** Background thread that reads one stream to its end. */
    private static final class StreamDrainer extends Thread
    {
        private final InputStream inputStream;
        private byte result[] = new byte[0];
        private IOException failure;

        StreamDrainer(InputStream inputStream)
        {
            this.inputStream = inputStream;
            setDaemon(true);
        }

        @Override
        public void run()
        {
            try
            {
                result = toByteArray(inputStream);
            }
            catch (IOException e)
            {
                failure = e;
            }
        }

        /**
         * Waits for the thread to finish and returns the bytes it read,
         * rethrowing any IOException raised while reading. The fields are
         * only read after join(), which makes the thread's writes visible.
         */
        byte[] await() throws IOException, InterruptedException
        {
            join();
            if (failure != null)
            {
                throw failure;
            }
            return result;
        }
    }
}
结果符合预期:
A [1.0, 1.1, 1.2, 0.0, 11.0, 11.1, 11.2, 0.0, 0.0]
B [0.0, 2.0, 2.1, 2.2, 0.0, 12.0, 12.1, 12.2, 0.0]
C [0.0, 0.0, 3.0, 3.1, 3.2, 0.0, 13.0, 13.1, 13.2]
我有一个采用结构列表的 CUDA 内核。
kernel<<<blockCount,blockSize>>>(MyStruct *structs);
每个结构包含 3 个指针。
// One node of the struct list handed to the kernel: three device pointers.
// __align__(16) is the CUDA alignment specifier (the original spelling
// "__align(16)__" does not compile); the typedef is also given the name
// MyStruct that the kernel signature uses, plus its terminating semicolon.
typedef struct __align__(16) {
    float* pointer1;
    float* pointer2;
    float* pointer3;
} MyStruct;
我有三个包含浮点数的设备数组,结构中的每个指针都指向三个设备数组之一中的一个浮点数。
结构列表表示一个树/图结构,它允许内核根据发送给内核的结构列表的顺序执行递归操作。(这部分在 C++ 中可以正常工作,因此与我的问题无关)
我想做的是能够从 JCuda 发送我的指针结构。我知道,除非像 this post 中那样将其展平为带填充的数组,否则这在 JCuda 中本身是做不到的。
我了解发送结构列表时可能出现的所有对齐和填充问题,它本质上是一个重复的带填充数组,这一点我可以接受。
我不确定该怎么做的地方是:如何用指针来填充展平后的结构缓冲区。例如,我想我可以这样做:
// Sketch only (not compilable): the "...." initializers stand for existing
// device allocations, and someGetUnderlyingPointerValueFunction() is a
// placeholder for the accessor this question asks about.
Pointer A = ....(underlying device array1)
Pointer B = ....(underlying device array2)
Pointer C = ....(underlying device array3)
// NOTE(review): 16 bytes per step only holds four 4-byte slots, but four
// 8-byte longs are appended per step below - confirm the intended pointer size.
ByteBuffer structListBuffer = ByteBuffer.allocate(16*noSteps);
for(int x = 0; x<noSteps; x++) {
// Get the underlying pointer values
long pointer1 = A.withByteOffset(getStepOffsetA(x)).someGetUnderlyingPointerValueFunction();
long pointer2 = B.withByteOffset(getStepOffsetB(x)).someGetUnderlyingPointerValueFunction();
long pointer3 = C.withByteOffset(getStepOffsetC(x)).someGetUnderlyingPointerValueFunction();
// Build the struct
// NOTE(review): LongBuffer offers put(), not append(), and every
// asLongBuffer() call returns a new view - a real implementation would
// presumably create the view once outside the loop.
structListBuffer.asLongBuffer().append(pointer1);
structListBuffer.asLongBuffer().append(pointer2);
structListBuffer.asLongBuffer().append(pointer3);
structListBuffer.asLongBuffer().append(0); //padding
}
这样,structListBuffer 将以内核期望的方式包含一个结构列表。
那么,有什么方法可以从 ByteBuffer 中实现 someGetUnderlyingPointerValueFunction() 的功能吗?
如果我没理解错的话,问题的重点是:是否存在类似下面这样的“神奇”函数
long address = pointer.someGetUnderlyingPointerValueFunction();
它返回本机指针的地址。
简短的回答:不,没有这样的功能。
(旁注:很久以前就有人要求过类似的函数,但我一直没有添加。主要原因是,这样的函数对于指向 Java 数组或(非直接)字节缓冲区的指针没有意义。此外,在 32 位和 64 位机器上手动处理带填充和对齐的结构、大小不同的指针,以及大端或小端的缓冲区,是无穷无尽的麻烦之源。但我理解这个需求以及可能的应用场景,所以我很可能会添加类似 getAddress() 的函数。也许只添加到 CUdeviceptr 类中,在那里它确实有意义——至少比放在 Pointer 类中更有意义。人们一定会用这个方法做一些奇怪的事情,也一定会做出导致 VM 严重崩溃的事情,但 JCuda 本身只是一个很薄的抽象层,在这方面本来就没有安全网……)
也就是说,您可以使用如下方法解决当前限制:
/**
 * Returns the numeric address the given device pointer refers to, i.e. its
 * native pointer value plus its byte offset.
 *
 * @param p the device pointer to inspect
 * @return the native pointer value plus the byte offset of {@code p}
 */
private static long getPointerAddress(CUdeviceptr p)
{
    // WORKAROUND until a method like CUdeviceptr#getAddress exists:
    // an anonymous subclass of Pointer is created only to reach the two
    // accessors that the address is composed of.
    return new Pointer(p)
    {
        long resolveAddress()
        {
            long base = getNativePointer();
            long offset = getByteOffset();
            return base + offset;
        }
    }.resolveAddress();
}
当然,这很丑陋,而且显然违背了将 getNativePointer() 和 getByteOffset() 方法声明为 protected 的初衷。但它最终可能会被某个“官方”方法取代:
// Hypothetical future form of the helper: it relies on a
// CUdeviceptr#getAddress() method which, according to the surrounding
// text, had not been added to JCuda when this was written.
private static long getPointerAddress(CUdeviceptr p)
{
return p.getAddress();
}
在那之前,这可能是最接近您在 C 端所能做到的效果的解决方案。
这是我为测试这一点而写的一个例子。内核只是一个占位内核,它用“可识别”的值填充结构(以查看它们最终是否落在正确的位置),并且应当只用 1 个线程启动:
// Struct of three device pointers, aligned to 16 bytes.
// __align__(16) is CUDA's portable alignment specifier; the previous
// __declspec(align(16)) is only understood when MSVC is the host compiler.
// The host program lays each struct out as four pointer-sized slots
// (three pointers followed by one slot of padding).
typedef struct __align__(16) {
    float* pointer1;
    float* pointer2;
    float* pointer3;
} MyStruct;
// Writes three floats to target[0..2].
static __device__ void storeTriple(float *target, float v0, float v1, float v2)
{
    target[0] = v0;
    target[1] = v1;
    target[2] = v2;
}

// Dummy kernel: fills the three floats behind every pointer of the first
// two structs with identifiable values (1.x / 2.x / 3.x for structs[0],
// 11.x / 12.x / 13.x for structs[1]). It does not use the thread index,
// so it is meant to be launched with a single thread.
extern "C"
__global__ void kernel(MyStruct *structs)
{
    const MyStruct &first = structs[0];
    storeTriple(first.pointer1, 1.0f, 1.1f, 1.2f);
    storeTriple(first.pointer2, 2.0f, 2.1f, 2.2f);
    storeTriple(first.pointer3, 3.0f, 3.1f, 3.2f);

    const MyStruct &second = structs[1];
    storeTriple(second.pointer1, 11.0f, 11.1f, 11.2f);
    storeTriple(second.pointer2, 12.0f, 12.1f, 12.2f);
    storeTriple(second.pointer3, 13.0f, 13.1f, 13.2f);
}
此内核在以下程序中启动(注意:PTX 文件的编译在此处即时完成,其设置可能与您的应用程序不匹配。如有疑问,您可以手动编译 PTX 文件)。
每个结构的 pointer1、pointer2 和 pointer3 指针被初始化为分别指向设备缓冲区 A、B 和 C 中的连续元素,每个指针都带有一个偏移量,以便识别内核写入的值。(请注意,我试图同时处理在 32 位或 64 位机器上运行这两种可能的情况,它们的指针大小不同——不过目前我只能测试 32 位版本)
import static jcuda.driver.JCudaDriver.*;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.IntBuffer;
import java.nio.LongBuffer;
import java.util.Arrays;
import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.driver.CUcontext;
import jcuda.driver.CUdevice;
import jcuda.driver.CUdeviceptr;
import jcuda.driver.CUfunction;
import jcuda.driver.CUmodule;
import jcuda.driver.JCudaDriver;
/**
 * Example that passes a list of structs containing device pointers to a
 * CUDA kernel. Each struct is flattened on the host into four pointer-sized
 * slots (three addresses plus one slot of padding) and copied to the device.
 */
public class JCudaPointersInStruct
{
    /** Pointer-sized slots per struct: three pointers and one padding slot. */
    private static final int SLOTS_PER_STRUCT = 4;

    /** Distance, in floats, between the targets of two consecutive structs. */
    private static final int FLOATS_PER_STEP = 4;

    /**
     * Compiles and loads the kernel, builds the struct list, launches the
     * kernel with a single thread and prints the three device arrays.
     *
     * @param args unused
     * @throws IOException if the PTX file cannot be created
     */
    public static void main(String args[]) throws IOException
    {
        JCudaDriver.setExceptionsEnabled(true);
        String ptxFileName = preparePtxFile("JCudaPointersInStructKernel.cu");

        cuInit(0);
        CUdevice device = new CUdevice();
        cuDeviceGet(device, 0);
        CUcontext context = new CUcontext();
        cuCtxCreate(context, 0, device);
        try
        {
            CUmodule module = new CUmodule();
            cuModuleLoad(module, ptxFileName);
            CUfunction function = new CUfunction();
            cuModuleGetFunction(function, module, "kernel");

            int numElements = 9;
            CUdeviceptr A = allocateZeroedFloats(numElements);
            CUdeviceptr B = allocateZeroedFloats(numElements);
            CUdeviceptr C = allocateZeroedFloats(numElements);

            int numSteps = 2;
            ByteBuffer hostStructsBuffer = createHostStructs(A, B, C, numSteps);
            int structsSizeInBytes = hostStructsBuffer.capacity();

            CUdeviceptr structs = new CUdeviceptr();
            cuMemAlloc(structs, structsSizeInBytes);
            cuMemcpyHtoD(structs, Pointer.to(hostStructsBuffer),
                structsSizeInBytes);

            Pointer kernelParameters = Pointer.to(
                Pointer.to(structs)
            );
            cuLaunchKernel(function,
                1, 1, 1,
                1, 1, 1,
                0, null, kernelParameters, null);
            cuCtxSynchronize();

            float hostA[] = downloadFloats(A, numElements);
            float hostB[] = downloadFloats(B, numElements);
            float hostC[] = downloadFloats(C, numElements);
            System.out.println("A "+Arrays.toString(hostA));
            System.out.println("B "+Arrays.toString(hostB));
            System.out.println("C "+Arrays.toString(hostC));

            cuMemFree(structs);
            cuMemFree(A);
            cuMemFree(B);
            cuMemFree(C);
        }
        finally
        {
            // Also runs on failure paths, so the context is not left behind
            // when one of the calls above throws.
            cuCtxDestroy(context);
        }
    }

    /**
     * Allocates a device buffer for the given number of floats and sets
     * every 32-bit element to zero.
     */
    private static CUdeviceptr allocateZeroedFloats(int numElements)
    {
        CUdeviceptr pointer = new CUdeviceptr();
        cuMemAlloc(pointer, numElements * Sizeof.FLOAT);
        cuMemsetD32(pointer, 0, numElements);
        return pointer;
    }

    /**
     * Copies the given number of floats from device memory into a new
     * host array.
     */
    private static float[] downloadFloats(CUdeviceptr source, int numElements)
    {
        float host[] = new float[numElements];
        cuMemcpyDtoH(Pointer.to(host), source, numElements * Sizeof.FLOAT);
        return host;
    }

    /**
     * Builds the host-side image of the struct list. Each struct occupies
     * {@link #SLOTS_PER_STRUCT} pointer-sized slots in native byte order:
     * the addresses for pointer1, pointer2, pointer3, then a zero slot.
     * Absolute puts are used, so the position of the returned buffer is 0.
     *
     * @param deviceA buffer that pointer1 of every struct points into
     * @param deviceB buffer that pointer2 of every struct points into
     * @param deviceC buffer that pointer3 of every struct points into
     * @param numSteps number of structs to create
     * @return a heap buffer of numSteps * SLOTS_PER_STRUCT * Sizeof.POINTER bytes
     */
    private static ByteBuffer createHostStructs(CUdeviceptr deviceA,
        CUdeviceptr deviceB, CUdeviceptr deviceC, int numSteps)
    {
        int sizeOfStruct = Sizeof.POINTER * SLOTS_PER_STRUCT;
        ByteBuffer buffer = ByteBuffer.allocate(numSteps * sizeOfStruct);
        buffer.order(ByteOrder.nativeOrder());
        for (int x = 0; x < numSteps; x++)
        {
            int structStart = x * sizeOfStruct;
            putPointer(buffer, structStart,
                getPointerAddress(deviceA.withByteOffset(getStepOffsetA(x))));
            putPointer(buffer, structStart + Sizeof.POINTER,
                getPointerAddress(deviceB.withByteOffset(getStepOffsetB(x))));
            putPointer(buffer, structStart + 2 * Sizeof.POINTER,
                getPointerAddress(deviceC.withByteOffset(getStepOffsetC(x))));
            // Padding slot
            putPointer(buffer, structStart + 3 * Sizeof.POINTER, 0L);
        }
        return buffer;
    }

    /**
     * Writes an address at the given byte offset, as a 4-byte int when
     * Sizeof.POINTER is 4 and as an 8-byte long otherwise.
     */
    private static void putPointer(ByteBuffer buffer, int byteOffset,
        long address)
    {
        if (Sizeof.POINTER == 4)
        {
            buffer.putInt(byteOffset, (int)address);
        }
        else
        {
            buffer.putLong(byteOffset, address);
        }
    }

    // The offsets are computed in long arithmetic so that large step
    // indices cannot overflow int before the result is widened.
    private static long getStepOffsetA(int x)
    {
        return (long)x * Sizeof.FLOAT * FLOATS_PER_STEP + 0L * Sizeof.FLOAT;
    }

    private static long getStepOffsetB(int x)
    {
        return (long)x * Sizeof.FLOAT * FLOATS_PER_STEP + 1L * Sizeof.FLOAT;
    }

    private static long getStepOffsetC(int x)
    {
        return (long)x * Sizeof.FLOAT * FLOATS_PER_STEP + 2L * Sizeof.FLOAT;
    }

    /**
     * Returns the native pointer value of the given device pointer plus
     * its byte offset.
     */
    private static long getPointerAddress(CUdeviceptr p)
    {
        // WORKAROUND until a method like CUdeviceptr#getAddress exists
        class PointerWithAddress extends Pointer
        {
            PointerWithAddress(Pointer other)
            {
                super(other);
            }
            long getAddress()
            {
                return getNativePointer() + getByteOffset();
            }
        }
        return new PointerWithAddress(p).getAddress();
    }

    //-------------------------------------------------------------------------
    // Ignore this - in practice, you'll compile the PTX manually

    /**
     * Compiles the given .cu file into a PTX file by running nvcc, and
     * returns the name of the PTX file.
     *
     * @param cuFileName name of the CUDA source file
     * @return the PTX file name: the input name with its extension replaced
     *         by (or, if it has none, extended with) ".ptx"
     * @throws IOException if the input file does not exist, nvcc cannot be
     *         started or exits with a non-zero value, or the wait is interrupted
     */
    private static String preparePtxFile(String cuFileName) throws IOException
    {
        int endIndex = cuFileName.lastIndexOf('.');
        String baseName =
            (endIndex == -1) ? cuFileName : cuFileName.substring(0, endIndex);
        String ptxFileName = baseName + ".ptx";

        File cuFile = new File(cuFileName);
        if (!cuFile.exists())
        {
            throw new IOException("Input file not found: "+cuFileName);
        }

        // The property is not guaranteed to be set on every JVM; fall back
        // to the pointer size reported by JCuda.
        String dataModel = System.getProperty("sun.arch.data.model");
        if (dataModel == null)
        {
            dataModel = String.valueOf(Sizeof.POINTER * 8);
        }
        String modelString = "-m"+dataModel;

        // Passing the arguments as a list (instead of one command string)
        // keeps file names that contain spaces intact.
        String commandLine[] = {
            "nvcc", modelString, "-ptx", "-arch", "sm_11", "-lineinfo",
            cuFile.getPath(), "-o", ptxFileName
        };
        StringBuilder command = new StringBuilder();
        for (String argument : commandLine)
        {
            if (command.length() > 0)
            {
                command.append(' ');
            }
            command.append(argument);
        }
        System.out.println("Executing\n"+command);
        Process process = new ProcessBuilder(commandLine).start();

        // stderr is drained on a second thread while this thread drains
        // stdout. Reading one stream to its end before touching the other
        // can block forever once the process fills the unread pipe.
        StreamDrainer errorDrainer =
            new StreamDrainer(process.getErrorStream());
        errorDrainer.start();
        String outputMessage =
            new String(toByteArray(process.getInputStream()));
        String errorMessage;
        int exitValue;
        try
        {
            errorMessage = new String(errorDrainer.await());
            exitValue = process.waitFor();
        }
        catch (InterruptedException e)
        {
            Thread.currentThread().interrupt();
            throw new IOException(
                "Interrupted while waiting for nvcc output", e);
        }

        if (exitValue != 0)
        {
            System.out.println("nvcc process exitValue "+exitValue);
            System.out.println("errorMessage:\n"+errorMessage);
            System.out.println("outputMessage:\n"+outputMessage);
            throw new IOException(
                "Could not create .ptx file: "+errorMessage);
        }
        System.out.println("Finished creating PTX file");
        return ptxFileName;
    }

    /** Reads the given stream until its end and returns all bytes read. */
    private static byte[] toByteArray(InputStream inputStream)
        throws IOException
    {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        byte buffer[] = new byte[8192];
        while (true)
        {
            int read = inputStream.read(buffer);
            if (read == -1)
            {
                break;
            }
            baos.write(buffer, 0, read);
        }
        return baos.toByteArray();
    }

    /** Background thread that reads one stream to its end. */
    private static final class StreamDrainer extends Thread
    {
        private final InputStream inputStream;
        private byte result[] = new byte[0];
        private IOException failure;

        StreamDrainer(InputStream inputStream)
        {
            this.inputStream = inputStream;
            setDaemon(true);
        }

        @Override
        public void run()
        {
            try
            {
                result = toByteArray(inputStream);
            }
            catch (IOException e)
            {
                failure = e;
            }
        }

        /**
         * Waits for the thread to finish and returns the bytes it read,
         * rethrowing any IOException raised while reading. The fields are
         * only read after join(), which makes the thread's writes visible.
         */
        byte[] await() throws IOException, InterruptedException
        {
            join();
            if (failure != null)
            {
                throw failure;
            }
            return result;
        }
    }
}
结果符合预期:
A [1.0, 1.1, 1.2, 0.0, 11.0, 11.1, 11.2, 0.0, 0.0]
B [0.0, 2.0, 2.1, 2.2, 0.0, 12.0, 12.1, 12.2, 0.0]
C [0.0, 0.0, 3.0, 3.1, 3.2, 0.0, 13.0, 13.1, 13.2]