为什么 cuda_free 有这么高的开销? (是因为归零吗?)
Why does cuda_free have such a high overhead? (is it due to zeroing?)
我观察到 cudaFree 操作需要很长时间才能运行。为了验证它,我编写了一个简单的微型基准代码来测试 cudaFree 开销。它显示了类似的趋势。
第一步:cudaMallocManaged
不需要那么长时间。实际上仅对于 cudaMallocManaged 不到 1ms。
第二步:init
要初始化 malloced 内存区域,将调用 init 函数。而且也不会少于1ms。
第三步:cudaFree
现在,开销出现了。 需要很多时间。还有一件事是更大的内存大小,需要更长的时间。(漂亮的线性)
问题是"Why does cudaFree have such a huge overhead? Does it fill whole memory region with zero for security issue? Or any other critical path it walks through?"
这里是代码和实测结果。
提前谢谢你:) !!!
10 int getMilliCount(){
11 timeb tb;
12 ftime(&tb);
13 int nCount = tb.millitm + (tb.time & 0xfffff) * 1000;
14 return nCount;
15 }
16
17 int getTimeDiff(int baseTime){
18 int diff = getMilliCount() - baseTime;
19 return diff;
20 }
21
22 __global__ void init(int* x, size_t bytes_){
23 int num_ = bytes_/sizeof(int);
24 for (int i=0; i<num_; i++){
25 x[i] = i;
26 }
27 }
28
29 int main(){
30 printf("sizeof(size_t): %zu\n", sizeof(size_t));
31 printf("sizeof(unsigned int): %zu\n", sizeof(unsigned int));
32 printf("sizeof(int): %zu\n", sizeof(int));
33 printf("sizeof(long): %zu\n", sizeof(long));
34
35 std::ofstream myfile;
36 myfile.open("output3.csv");
37 myfile<<"operation, num_bytes, start, end, duration\n";
38 int baseTime = getMilliCount();
39 int* dptr;
40 int ts1 = 0;
41 int ts2 = 0;
42 size_t KB = 1024; // start from 1KB
43 int num_trial_ = 1;
44 for (int j=10; j<25; j++){
45 size_t num_bytes_ = KB<<j;
46 for (int i=0; i<num_trial_; i++){
47 // measuring cudaMallocManaged
48 ts1 = getTimeDiff(baseTime);
>> 49 cudaMallocManaged((void**)&dptr, num_bytes_);
50 ts2 = getTimeDiff(baseTime);
51 myfile<<"cudaMallocManaged, "<<num_bytes_/(1024*1024)<<","<<ts1<<","<<ts2<<","<<ts2-ts1<<"\n";
52 //printf("cudaMallocManaged, memory_size:%zuMB, start:%d, end:%d, duration:%d\n", num_bytes_/(1024*1024), ts1, ts2, ts2-ts1);
53 printf("cudaMallocManaged, memory_size:%zuMB, duration:%d\n", num_bytes_/(1024*1024), ts2-ts1);
54
55 // measuring initialization
56 ts1 = getTimeDiff(baseTime);
>> 57 init<<<1,1>>>(dptr, num_bytes_);
58 ts2 = getTimeDiff(baseTime);
59 myfile<<"initialization, "<<num_bytes_/(1024*1024)<<","<<ts1<<","<<ts2<<","<<ts2-ts1<<"\n";
60 //printf("init, memory_size:%zuMB, start:%d, end:%d, duration:%d\n", num_bytes_/(1024*1024), ts1, ts2, ts2-ts1);
61 printf("init, memory_size:%zuMB, duration:%d\n", num_bytes_/(1024*1024), ts2-ts1);
62
63 // measuring cudaFree
64 ts1 = getTimeDiff(baseTime);
>> 65 cudaFree(dptr);
66 ts2 = getTimeDiff(baseTime);
67 myfile<<"cudaFree, "<<num_bytes_/(1024*1024)<<","<<ts1<<","<<ts2<<","<<ts2-ts1<<"\n";
68 //printf("cudaFree, memory_size:%zuMB, start:%d, end:%d, duration:%d\n", num_bytes_/(1024*1024), ts1, ts2, ts2-ts1);
69 printf("cudaFree, memory_size:%zuMB, duration:%d\n", num_bytes_/(1024*1024), ts2-ts1);
70 sleep(1);
71 printf("\n");
72 }
73 }
74 myfile.close();
75 return 1;
76 }
结果
cudaMallocManaged, memory_size:1MB, duration:360
init, memory_size:1MB, duration:0
cudaFree, memory_size:1MB, **duration:2**
cudaMallocManaged, memory_size:2MB, duration:1
init, memory_size:2MB, duration:0
cudaFree, memory_size:2MB, **duration:4**
cudaMallocManaged, memory_size:4MB, duration:0
init, memory_size:4MB, duration:0
cudaFree, memory_size:4MB, **duration:9**
cudaMallocManaged, memory_size:8MB, duration:0
init, memory_size:8MB, duration:0
cudaFree, memory_size:8MB, **duration:18**
cudaMallocManaged, memory_size:16MB, duration:0
init, memory_size:16MB, duration:0
cudaFree, memory_size:16MB, **duration:34**
cudaMallocManaged, memory_size:32MB, duration:0
init, memory_size:32MB, duration:0
cudaFree, memory_size:32MB, **duration:69**
cudaMallocManaged, memory_size:64MB, duration:0
init, memory_size:64MB, duration:0
cudaFree, memory_size:64MB, **duration:132**
cudaMallocManaged, memory_size:128MB, duration:0
init, memory_size:128MB, duration:0
cudaFree, memory_size:128MB, **duration:241**
cudaMallocManaged, memory_size:256MB, duration:0
init, memory_size:256MB, duration:0
cudaFree, memory_size:256MB, **duration:476**
cudaMallocManaged, memory_size:512MB, duration:0
init, memory_size:512MB, duration:0
cudaFree, memory_size:512MB, **duration:984**
cudaMallocManaged, memory_size:1024MB, duration:0
init, memory_size:1024MB, duration:0
cudaFree, memory_size:1024MB, **duration:1910**
cudaMallocManaged, memory_size:2048MB, duration:0
init, memory_size:2048MB, duration:1
cudaFree, memory_size:2048MB, **duration:3830**
cudaMallocManaged, memory_size:4096MB, duration:0
init, memory_size:4096MB, duration:0
cudaFree, memory_size:4096MB, **duration:7715**
cudaMallocManaged, memory_size:8192MB, duration:0
init, memory_size:8192MB, duration:0
cudaFree, memory_size:8192MB, **duration:0**
cudaMallocManaged, memory_size:16384MB, duration:0
init, memory_size:16384MB, duration:0
cudaFree, memory_size:16384MB, **duration:0**
- 奇怪的是,8192MB 和 16384MB 不仅对于 cudaManagedMalloc 和 init,而且对于 cudaFree 都显示小于 0ms....
请赐教
Why does cudaFree have such a huge overhead?
没有。您使用的计时方法不正确,您归因于 cudaFree
的时间来自之前的异步操作
Does it fill whole memory region with zero for security issue?
没有
Or any other critical path it walks through?
不,除非它要求设备处于空闲状态,而您的情况并非如此。
让我们用您的代码解决最明显的问题:
// measuring initialization
ts1 = getTimeDiff(baseTime);
init<<<1,1>>>(dptr, num_bytes_);
cudaDeviceSynchronize(); // Wait until the GPU is idle
ts2 = getTimeDiff(baseTime);
然后 运行 你的实验:
$ nvcc -std=c++11 -o fliestime fliestime.cu
$ ./fliestime
sizeof(size_t): 8
sizeof(unsigned int): 4
sizeof(int): 4
sizeof(long): 8
cudaMallocManaged, memory_size:1MB, duration:102
init, memory_size:1MB, duration:2
cudaFree, memory_size:1MB, duration:1
cudaMallocManaged, memory_size:2MB, duration:0
init, memory_size:2MB, duration:5
cudaFree, memory_size:2MB, duration:0
cudaMallocManaged, memory_size:4MB, duration:1
init, memory_size:4MB, duration:8
cudaFree, memory_size:4MB, duration:0
cudaMallocManaged, memory_size:8MB, duration:1
init, memory_size:8MB, duration:17
cudaFree, memory_size:8MB, duration:1
cudaMallocManaged, memory_size:16MB, duration:1
init, memory_size:16MB, duration:33
cudaFree, memory_size:16MB, duration:1
cudaMallocManaged, memory_size:32MB, duration:3
init, memory_size:32MB, duration:65
cudaFree, memory_size:32MB, duration:2
cudaMallocManaged, memory_size:64MB, duration:5
init, memory_size:64MB, duration:121
cudaFree, memory_size:64MB, duration:4
cudaMallocManaged, memory_size:128MB, duration:9
init, memory_size:128MB, duration:219
cudaFree, memory_size:128MB, duration:8
cudaMallocManaged, memory_size:256MB, duration:17
init, memory_size:256MB, duration:427
cudaFree, memory_size:256MB, duration:18
cudaMallocManaged, memory_size:512MB, duration:34
init, memory_size:512MB, duration:854
cudaFree, memory_size:512MB, duration:35
cudaMallocManaged, memory_size:1024MB, duration:67
init, memory_size:1024MB, duration:1709
cudaFree, memory_size:1024MB, duration:70
cudaMallocManaged, memory_size:2048MB, duration:133
init, memory_size:2048MB, duration:3418
cudaFree, memory_size:2048MB, duration:141
cudaMallocManaged, memory_size:4096MB, duration:786
init, memory_size:4096MB, duration:4
cudaFree, memory_size:4096MB, duration:0
cudaMallocManaged, memory_size:8192MB, duration:0
init, memory_size:8192MB, duration:0
cudaFree, memory_size:8192MB, duration:0
cudaMallocManaged, memory_size:16384MB, duration:0
init, memory_size:16384MB, duration:0
cudaFree, memory_size:16384MB, duration:0
您可以看到您假设的时间 cudaFree
是您的完全串行 init
内核花费的时间 运行。随着分配大小的增加,cudaMallocManaged
和 cudaFree
确实使用了更多时间,但这并非不合理恕我直言。
weird thing is that 8192MB and 16384MB shows less than 0ms not only for cudaManagedMalloc and init but also for cudaFree....
那是因为什么都运行宁。如果您使用正确的 runtime error checking,您会看到一切都失败了,内存不足 运行 时间错误。
我观察到 cudaFree 操作需要很长时间才能运行。为了验证它,我编写了一个简单的微型基准代码来测试 cudaFree 开销。它显示了类似的趋势。
第一步:cudaMallocManaged
不需要那么长时间。实际上仅对于 cudaMallocManaged 不到 1ms。
第二步:init
要初始化 malloced 内存区域,将调用 init 函数。而且也不会少于1ms。
第三步:cudaFree
现在,开销出现了。 需要很多时间。还有一件事是更大的内存大小,需要更长的时间。(漂亮的线性)
问题是"Why does cudaFree have such a huge overhead? Does it fill whole memory region with zero for security issue? Or any other critical path it walks through?"
这里是代码和实测结果。 提前谢谢你:) !!!
10 int getMilliCount(){
11 timeb tb;
12 ftime(&tb);
13 int nCount = tb.millitm + (tb.time & 0xfffff) * 1000;
14 return nCount;
15 }
16
17 int getTimeDiff(int baseTime){
18 int diff = getMilliCount() - baseTime;
19 return diff;
20 }
21
22 __global__ void init(int* x, size_t bytes_){
23 int num_ = bytes_/sizeof(int);
24 for (int i=0; i<num_; i++){
25 x[i] = i;
26 }
27 }
28
29 int main(){
30 printf("sizeof(size_t): %zu\n", sizeof(size_t));
31 printf("sizeof(unsigned int): %zu\n", sizeof(unsigned int));
32 printf("sizeof(int): %zu\n", sizeof(int));
33 printf("sizeof(long): %zu\n", sizeof(long));
34
35 std::ofstream myfile;
36 myfile.open("output3.csv");
37 myfile<<"operation, num_bytes, start, end, duration\n";
38 int baseTime = getMilliCount();
39 int* dptr;
40 int ts1 = 0;
41 int ts2 = 0;
42 size_t KB = 1024; // start from 1KB
43 int num_trial_ = 1;
44 for (int j=10; j<25; j++){
45 size_t num_bytes_ = KB<<j;
46 for (int i=0; i<num_trial_; i++){
47 // measuring cudaMallocManaged
48 ts1 = getTimeDiff(baseTime);
>> 49 cudaMallocManaged((void**)&dptr, num_bytes_);
50 ts2 = getTimeDiff(baseTime);
51 myfile<<"cudaMallocManaged, "<<num_bytes_/(1024*1024)<<","<<ts1<<","<<ts2<<","<<ts2-ts1<<"\n";
52 //printf("cudaMallocManaged, memory_size:%zuMB, start:%d, end:%d, duration:%d\n", num_bytes_/(1024*1024), ts1, ts2, ts2-ts1);
53 printf("cudaMallocManaged, memory_size:%zuMB, duration:%d\n", num_bytes_/(1024*1024), ts2-ts1);
54
55 // measuring initialization
56 ts1 = getTimeDiff(baseTime);
>> 57 init<<<1,1>>>(dptr, num_bytes_);
58 ts2 = getTimeDiff(baseTime);
59 myfile<<"initialization, "<<num_bytes_/(1024*1024)<<","<<ts1<<","<<ts2<<","<<ts2-ts1<<"\n";
60 //printf("init, memory_size:%zuMB, start:%d, end:%d, duration:%d\n", num_bytes_/(1024*1024), ts1, ts2, ts2-ts1);
61 printf("init, memory_size:%zuMB, duration:%d\n", num_bytes_/(1024*1024), ts2-ts1);
62
63 // measuring cudaFree
64 ts1 = getTimeDiff(baseTime);
>> 65 cudaFree(dptr);
66 ts2 = getTimeDiff(baseTime);
67 myfile<<"cudaFree, "<<num_bytes_/(1024*1024)<<","<<ts1<<","<<ts2<<","<<ts2-ts1<<"\n";
68 //printf("cudaFree, memory_size:%zuMB, start:%d, end:%d, duration:%d\n", num_bytes_/(1024*1024), ts1, ts2, ts2-ts1);
69 printf("cudaFree, memory_size:%zuMB, duration:%d\n", num_bytes_/(1024*1024), ts2-ts1);
70 sleep(1);
71 printf("\n");
72 }
73 }
74 myfile.close();
75 return 1;
76 }
结果
cudaMallocManaged, memory_size:1MB, duration:360
init, memory_size:1MB, duration:0
cudaFree, memory_size:1MB, **duration:2**
cudaMallocManaged, memory_size:2MB, duration:1
init, memory_size:2MB, duration:0
cudaFree, memory_size:2MB, **duration:4**
cudaMallocManaged, memory_size:4MB, duration:0
init, memory_size:4MB, duration:0
cudaFree, memory_size:4MB, **duration:9**
cudaMallocManaged, memory_size:8MB, duration:0
init, memory_size:8MB, duration:0
cudaFree, memory_size:8MB, **duration:18**
cudaMallocManaged, memory_size:16MB, duration:0
init, memory_size:16MB, duration:0
cudaFree, memory_size:16MB, **duration:34**
cudaMallocManaged, memory_size:32MB, duration:0
init, memory_size:32MB, duration:0
cudaFree, memory_size:32MB, **duration:69**
cudaMallocManaged, memory_size:64MB, duration:0
init, memory_size:64MB, duration:0
cudaFree, memory_size:64MB, **duration:132**
cudaMallocManaged, memory_size:128MB, duration:0
init, memory_size:128MB, duration:0
cudaFree, memory_size:128MB, **duration:241**
cudaMallocManaged, memory_size:256MB, duration:0
init, memory_size:256MB, duration:0
cudaFree, memory_size:256MB, **duration:476**
cudaMallocManaged, memory_size:512MB, duration:0
init, memory_size:512MB, duration:0
cudaFree, memory_size:512MB, **duration:984**
cudaMallocManaged, memory_size:1024MB, duration:0
init, memory_size:1024MB, duration:0
cudaFree, memory_size:1024MB, **duration:1910**
cudaMallocManaged, memory_size:2048MB, duration:0
init, memory_size:2048MB, duration:1
cudaFree, memory_size:2048MB, **duration:3830**
cudaMallocManaged, memory_size:4096MB, duration:0
init, memory_size:4096MB, duration:0
cudaFree, memory_size:4096MB, **duration:7715**
cudaMallocManaged, memory_size:8192MB, duration:0
init, memory_size:8192MB, duration:0
cudaFree, memory_size:8192MB, **duration:0**
cudaMallocManaged, memory_size:16384MB, duration:0
init, memory_size:16384MB, duration:0
cudaFree, memory_size:16384MB, **duration:0**
- 奇怪的是,8192MB 和 16384MB 不仅对于 cudaManagedMalloc 和 init,而且对于 cudaFree 都显示小于 0ms....
请赐教
Why does cudaFree have such a huge overhead?
没有。您使用的计时方法不正确,您归因于 cudaFree
的时间来自之前的异步操作
Does it fill whole memory region with zero for security issue?
没有
Or any other critical path it walks through?
不,除非它要求设备处于空闲状态,而您的情况并非如此。
让我们用您的代码解决最明显的问题:
// measuring initialization
ts1 = getTimeDiff(baseTime);
init<<<1,1>>>(dptr, num_bytes_);
cudaDeviceSynchronize(); // Wait until the GPU is idle
ts2 = getTimeDiff(baseTime);
然后 运行 你的实验:
$ nvcc -std=c++11 -o fliestime fliestime.cu
$ ./fliestime
sizeof(size_t): 8
sizeof(unsigned int): 4
sizeof(int): 4
sizeof(long): 8
cudaMallocManaged, memory_size:1MB, duration:102
init, memory_size:1MB, duration:2
cudaFree, memory_size:1MB, duration:1
cudaMallocManaged, memory_size:2MB, duration:0
init, memory_size:2MB, duration:5
cudaFree, memory_size:2MB, duration:0
cudaMallocManaged, memory_size:4MB, duration:1
init, memory_size:4MB, duration:8
cudaFree, memory_size:4MB, duration:0
cudaMallocManaged, memory_size:8MB, duration:1
init, memory_size:8MB, duration:17
cudaFree, memory_size:8MB, duration:1
cudaMallocManaged, memory_size:16MB, duration:1
init, memory_size:16MB, duration:33
cudaFree, memory_size:16MB, duration:1
cudaMallocManaged, memory_size:32MB, duration:3
init, memory_size:32MB, duration:65
cudaFree, memory_size:32MB, duration:2
cudaMallocManaged, memory_size:64MB, duration:5
init, memory_size:64MB, duration:121
cudaFree, memory_size:64MB, duration:4
cudaMallocManaged, memory_size:128MB, duration:9
init, memory_size:128MB, duration:219
cudaFree, memory_size:128MB, duration:8
cudaMallocManaged, memory_size:256MB, duration:17
init, memory_size:256MB, duration:427
cudaFree, memory_size:256MB, duration:18
cudaMallocManaged, memory_size:512MB, duration:34
init, memory_size:512MB, duration:854
cudaFree, memory_size:512MB, duration:35
cudaMallocManaged, memory_size:1024MB, duration:67
init, memory_size:1024MB, duration:1709
cudaFree, memory_size:1024MB, duration:70
cudaMallocManaged, memory_size:2048MB, duration:133
init, memory_size:2048MB, duration:3418
cudaFree, memory_size:2048MB, duration:141
cudaMallocManaged, memory_size:4096MB, duration:786
init, memory_size:4096MB, duration:4
cudaFree, memory_size:4096MB, duration:0
cudaMallocManaged, memory_size:8192MB, duration:0
init, memory_size:8192MB, duration:0
cudaFree, memory_size:8192MB, duration:0
cudaMallocManaged, memory_size:16384MB, duration:0
init, memory_size:16384MB, duration:0
cudaFree, memory_size:16384MB, duration:0
您可以看到您假设的时间 cudaFree
是您的完全串行 init
内核花费的时间 运行。随着分配大小的增加,cudaMallocManaged
和 cudaFree
确实使用了更多时间,但这并非不合理恕我直言。
weird thing is that 8192MB and 16384MB shows less than 0ms not only for cudaManagedMalloc and init but also for cudaFree....
那是因为什么都运行宁。如果您使用正确的 runtime error checking,您会看到一切都失败了,内存不足 运行 时间错误。