CUDA 中运行时间与获取结果时间之间的差异
the Difference between running time and time of obtaining results in CUDA
我正在尝试使用 CUDA 在 GPU 上实现我的算法。这个程序运行良好,但有一个问题。当我尝试打印结果时,它们显示得太晚了。
这是我的一些代码。假设真实结果无关紧要。
// 128-byte state buffer in device global memory. main() fills it from the
// host via the symbol-copy API and the AESROUND kernel updates it in place.
__device__ unsigned char dev_state[128];
// Multiplies two bytes in the Galois field GF(2^8) with the shift-and-XOR
// method, reducing by 0x1b (x^8 + x^4 + x^3 + x + 1) whenever the running
// multiple of `a` overflows 8 bits.
//   a, b    : the two field elements (one byte each)
//   returns : the product a * b as a single byte
__device__ unsigned char GMul(unsigned char a, unsigned char b) {
    unsigned char product = 0;
    for (int bit = 0; bit < 8; ++bit) {
        // Accumulate the current multiple of `a` when the low bit of `b` is set.
        product ^= (b & 1) ? a : 0;
        // Double `a`; fold the carried-out top bit back in via the reduction constant.
        const unsigned char reduce = (a & 0x80) ? 0x1b : 0x00;
        a = (unsigned char)((a << 1) ^ reduce);
        b >>= 1;
    }
    return product;
}
// Benchmark kernel: repeatedly mixes one byte of dev_state with GMul().
// main() launches it as <<<128, 128>>>. The byte index comes from blockIdx.x
// only, so every thread of a block operates on the same dev_state element.
// NOTE(review): dev_rkey / dev_sh_state are __shared__ scalars written by
// every thread with no __syncthreads(), and dev_state is read and written by
// many threads and blocks without synchronization - this looks like a data
// race, so the final contents may differ between runs. Confirm before relying
// on the output.
__global__ void AESROUND()
{
__shared__ unsigned char dev_rkey;
__shared__ unsigned char dev_sh_state;
// One state byte per block.
int state_idx = blockIdx.x;
// state_idx rounded down to a multiple of 4.
int offset = ((state_idx / 4)) *4;
for (int i = 0; i < 512; i++)
{
// Keep the current byte so it can be XORed back in below.
dev_rkey = dev_state[state_idx];
// The "% 16" indices always read from the first 16 bytes of dev_state.
// The last two XOR terms are identical, so together they contribute 0.
dev_sh_state= GMul(dev_state[state_idx], 0x02) ^ GMul(dev_state[(state_idx + 5) % 16], 0x03) ^ dev_state[(offset + 5) % 16] ^ dev_state[(offset + 5) % 16];
dev_state[state_idx] = dev_sh_state ^ dev_rkey;
}
}
调用 AESROUND
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: uploads a 128-byte state, enqueues 1024 AESROUND launches,
// prints the host-side time spent in the launch loop, then copies the state
// back and prints its first 16 bytes. Returns 0 on success, 1 on a CUDA error.
int main()
{
    unsigned char p[] = { 0x19, 0x3d, 0xe3, 0xbe, 0xa0, 0xf4, 0xe2, 0x2b, 0x9a, 0xc6, 0x8d, 0x2a, 0xe9, 0xf8, 0x48, 0x08 };
    unsigned char h_state[128];
    // Fill the host state with eight copies of the 16-byte block.
    for (int i = 0; i < 128; i++)
        h_state[i] = p[i % 16];

    if (!cudaOk(cudaMemcpyToSymbol(dev_state, h_state, sizeof(h_state), 0, cudaMemcpyHostToDevice),
                "cudaMemcpyToSymbol"))
        return 1;

    // NOTE: kernel launches are asynchronous, so this interval only covers
    // enqueueing the 1024 launches, not their execution. Call
    // cudaDeviceSynchronize() before the second clock() to time the kernels.
    clock_t start, finish;
    start = clock();
    for (int i = 0; i < 1024; i++)
        AESROUND<<<128, 128>>>();
    finish = clock();

    // Checked outside the timed region; catches invalid launch configurations.
    if (!cudaOk(cudaGetLastError(), "AESROUND launch"))
        return 1;

    // clock() counts ticks; convert to milliseconds to match the label.
    double Time = 1000.0 * (double)(finish - start) / CLOCKS_PER_SEC;
    printf("\n\nprocessing time: %2.15f (ms)\n", Time);

    // Blocking copy: h_state is read immediately below, so the data must be
    // on the host when this returns. It is ordered after the kernels queued
    // above, which is where the program actually waits for the GPU.
    if (!cudaOk(cudaMemcpyFromSymbol(h_state, dev_state, sizeof(h_state), 0, cudaMemcpyDeviceToHost),
                "cudaMemcpyFromSymbol"))
        return 1;

    // The original literal contained the invalid escape "\s".
    printf("\nstate After Encryption:\n ");
    for (int i = 0; i < 16; i++)
        printf("%x ", h_state[i]);
    getchar();
    return 0;
}
这是结果:
processing time: 1.0000000000000 (ms)
-经过很长一段时间(~5秒),将显示下一行-
state after encryption:
88 91 23 09 78 65 11 87 65 43 56 71 20 93 18 70
如您所见,测得的处理时间非常短,但这 128 字节的结果却要过很久才显示出来。
为什么会这样?这与GPU有关吗?
我该如何解决?
这里的困惑似乎源于使用基于主机的计时方法来测量(主要是)设备端的活动。
内核启动是异步的:主机代码启动内核后,不会等待内核完成就继续往下执行。因此这样的计时:
// Host timer around the launch loop only; nothing here waits for the kernels.
start = clock();
for (long long i = 0; i < 1024; i++)
AESROUND << <128, 128 >> >();
finish = clock();
仅测量了内核的启动时间。(即使在循环中重复启动内核也是如此:只要没有超出设备启动队列的容量,每次内核启动都是异步的,主机线程(即 for 循环)可以立即继续执行。)
为了测量完整的设备执行时间,您可以这样做:
// Host timer around the launches plus a device-wide wait.
start = clock();
// The unbraced loop body is the launch line only.
for (long long i = 0; i < 1024; i++)
AESROUND << <128, 128 >> >();
// Runs once, after the loop.
cudaDeviceSynchronize(); // block the host until the device has finished
finish = clock();
我正在尝试使用 CUDA 在 GPU 上实现我的算法。这个程序运行良好,但有一个问题。当我尝试打印结果时,它们显示得太晚了。 这是我的一些代码。假设真实结果无关紧要。
// 128-byte state buffer in device global memory. main() fills it from the
// host via the symbol-copy API and the AESROUND kernel updates it in place.
__device__ unsigned char dev_state[128];
// Multiplies two bytes in the Galois field GF(2^8) with the shift-and-XOR
// method, reducing by 0x1b (x^8 + x^4 + x^3 + x + 1) whenever the running
// multiple of `a` overflows 8 bits.
//   a, b    : the two field elements (one byte each)
//   returns : the product a * b as a single byte
__device__ unsigned char GMul(unsigned char a, unsigned char b) {
    unsigned char product = 0;
    for (int bit = 0; bit < 8; ++bit) {
        // Accumulate the current multiple of `a` when the low bit of `b` is set.
        product ^= (b & 1) ? a : 0;
        // Double `a`; fold the carried-out top bit back in via the reduction constant.
        const unsigned char reduce = (a & 0x80) ? 0x1b : 0x00;
        a = (unsigned char)((a << 1) ^ reduce);
        b >>= 1;
    }
    return product;
}
// Benchmark kernel: repeatedly mixes one byte of dev_state with GMul().
// main() launches it as <<<128, 128>>>. The byte index comes from blockIdx.x
// only, so every thread of a block operates on the same dev_state element.
// NOTE(review): dev_rkey / dev_sh_state are __shared__ scalars written by
// every thread with no __syncthreads(), and dev_state is read and written by
// many threads and blocks without synchronization - this looks like a data
// race, so the final contents may differ between runs. Confirm before relying
// on the output.
__global__ void AESROUND()
{
__shared__ unsigned char dev_rkey;
__shared__ unsigned char dev_sh_state;
// One state byte per block.
int state_idx = blockIdx.x;
// state_idx rounded down to a multiple of 4.
int offset = ((state_idx / 4)) *4;
for (int i = 0; i < 512; i++)
{
// Keep the current byte so it can be XORed back in below.
dev_rkey = dev_state[state_idx];
// The "% 16" indices always read from the first 16 bytes of dev_state.
// The last two XOR terms are identical, so together they contribute 0.
dev_sh_state= GMul(dev_state[state_idx], 0x02) ^ GMul(dev_state[(state_idx + 5) % 16], 0x03) ^ dev_state[(offset + 5) % 16] ^ dev_state[(offset + 5) % 16];
dev_state[state_idx] = dev_sh_state ^ dev_rkey;
}
}
调用 AESROUND
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: uploads a 128-byte state, enqueues 1024 AESROUND launches,
// prints the host-side time spent in the launch loop, then copies the state
// back and prints its first 16 bytes. Returns 0 on success, 1 on a CUDA error.
int main()
{
    unsigned char p[] = { 0x19, 0x3d, 0xe3, 0xbe, 0xa0, 0xf4, 0xe2, 0x2b, 0x9a, 0xc6, 0x8d, 0x2a, 0xe9, 0xf8, 0x48, 0x08 };
    unsigned char h_state[128];
    // Fill the host state with eight copies of the 16-byte block.
    for (int i = 0; i < 128; i++)
        h_state[i] = p[i % 16];

    if (!cudaOk(cudaMemcpyToSymbol(dev_state, h_state, sizeof(h_state), 0, cudaMemcpyHostToDevice),
                "cudaMemcpyToSymbol"))
        return 1;

    // NOTE: kernel launches are asynchronous, so this interval only covers
    // enqueueing the 1024 launches, not their execution. Call
    // cudaDeviceSynchronize() before the second clock() to time the kernels.
    clock_t start, finish;
    start = clock();
    for (int i = 0; i < 1024; i++)
        AESROUND<<<128, 128>>>();
    finish = clock();

    // Checked outside the timed region; catches invalid launch configurations.
    if (!cudaOk(cudaGetLastError(), "AESROUND launch"))
        return 1;

    // clock() counts ticks; convert to milliseconds to match the label.
    double Time = 1000.0 * (double)(finish - start) / CLOCKS_PER_SEC;
    printf("\n\nprocessing time: %2.15f (ms)\n", Time);

    // Blocking copy: h_state is read immediately below, so the data must be
    // on the host when this returns. It is ordered after the kernels queued
    // above, which is where the program actually waits for the GPU.
    if (!cudaOk(cudaMemcpyFromSymbol(h_state, dev_state, sizeof(h_state), 0, cudaMemcpyDeviceToHost),
                "cudaMemcpyFromSymbol"))
        return 1;

    // The original literal contained the invalid escape "\s".
    printf("\nstate After Encryption:\n ");
    for (int i = 0; i < 16; i++)
        printf("%x ", h_state[i]);
    getchar();
    return 0;
}
这是结果:
processing time: 1.0000000000000 (ms)
-经过很长一段时间(~5秒),将显示下一行-
state after encryption:
88 91 23 09 78 65 11 87 65 43 56 71 20 93 18 70
如您所见,测得的处理时间非常短,但这 128 字节的结果却要过很久才显示出来。 为什么会这样?这与 GPU 有关吗? 我该如何解决?
这里的困惑似乎源于使用基于主机的计时方法来测量(主要是)设备端的活动。
内核启动是异步的:主机代码启动内核后,不会等待内核完成就继续往下执行。因此这样的计时:
// Host timer around the launch loop only; nothing here waits for the kernels.
start = clock();
for (long long i = 0; i < 1024; i++)
AESROUND << <128, 128 >> >();
finish = clock();
仅测量了内核的启动时间。(即使在循环中重复启动内核也是如此:只要没有超出设备启动队列的容量,每次内核启动都是异步的,主机线程(即 for 循环)可以立即继续执行。)
为了测量完整的设备执行时间,您可以这样做:
// Host timer around the launches plus a device-wide wait.
start = clock();
// The unbraced loop body is the launch line only.
for (long long i = 0; i < 1024; i++)
AESROUND << <128, 128 >> >();
// Runs once, after the loop.
cudaDeviceSynchronize(); // block the host until the device has finished
finish = clock();