CUDA 程序使用系统 ram 的指数数量
CUDA program using exponential amount of system ram
我的粒子模拟使用了太多内存。我关心的不是数量本身,而是当我有理由相信它不应该增长时它呈指数增长的事实。我正在使用 CUDA,这是我最近添加的组件,因此我怀疑是导致问题的原因。我已经确定这不是(仅)内核中的问题,因为即使内核没有 运行,ram 使用量也会增加。我怀疑这与我分配内存的方式有关,但我不明白哪里出错了。对于这样一个微不足道的问题,我深表歉意,我是 CUDA 的新手(如果这不是很明显的话)。这是有问题的意大利面,谢谢你的时间。
int main() {
std::srand(time(0));
window.setFramerateLimit(limit);
window.setVerticalSyncEnabled(true);
sf::Clock clock;
while (window.isOpen()) {
sf::Event evnt;
while (window.pollEvent(evnt)) {
switch (evnt.type) {
case sf::Event::Closed:
window.close();
break;
case sf::Event::TextEntered:
if (evnt.text.unicode < 128) {
//printf("%c", evnt.text.unicode);
}
}
}
if (sf::Keyboard::isKeyPressed(sf::Keyboard::Key::Space)) {
spawnParticle();
}
if (sf::Keyboard::isKeyPressed(sf::Keyboard::Key::R)) {
for (auto particle : particleList) {
delete particle;
}
particleList.clear();
}
window.clear(sf::Color::Color::Black);
background.setFillColor(sf::Color::Color(25, 25, 25, 255));
background.setPosition(-8, -8);
window.draw(background);
for (int i = 0; i < particleList.size(); i++) {
particleList[i]->write(i);
}
int Num = particleList.size();
// Vectors for holding the host-side (CPU-side) data
float* h_big_algo, * h_big_relationships, * h_location, * h_destinations, * h_energies, * h_frequencies;
int* h_N;
cudaMallocHost(&h_big_algo, Num * Num * 8 * sizeof(float));
cudaMallocHost(&h_big_relationships, Num * Num * 3 * sizeof(float));
cudaMallocHost(&h_location, Num * 2 * sizeof(float));
cudaMallocHost(&h_N, sizeof(int));
cudaMallocHost(&h_destinations, Num * 2 * sizeof(float));
cudaMallocHost(&h_energies, Num * sizeof(float));
cudaMallocHost(&h_frequencies, Num * sizeof(float));
h_big_algo = big_algo.data();
h_big_relationships = big_relationships.data();
h_location = location_list.data();
h_N = &Num;
h_frequencies = frequencies.data();
// Allocate device memory
float* d_big_algo, * d_big_relationships, * d_location, *d_destinations, *d_energies, *d_frequencies;
int* d_N, * d_influence_N;
cudaMalloc(&d_big_algo, Num * Num *8*sizeof(float));
cudaMalloc(&d_big_relationships, Num * Num *3*sizeof(float));
cudaMalloc(&d_location, Num *2*sizeof(float));
cudaMalloc(&d_N, sizeof(int));
cudaMalloc(&d_destinations, Num * 2 * sizeof(float));
cudaMalloc(&d_influence_N, Num *sizeof(int));
cudaMalloc(&d_energies, Num * sizeof(float));
cudaMalloc(&d_frequencies, Num * sizeof(float));
// Copy data to the device
cudaMemcpy(d_big_algo, h_big_algo, Num * 8 * Num * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_big_relationships, h_big_relationships, Num * Num * 3 * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_location, h_location, Num * 2 * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_N, h_N, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_energies, energies.data(), Num * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_frequencies, h_frequencies, Num * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_destinations, h_location, Num * 2 * sizeof(float), cudaMemcpyHostToDevice);
cudaMemset(d_influence_N, 0, Num * sizeof(int));
int NUM_THREADS = 1024;
int NUM_BLOCKS = (pow(Num,2) + NUM_THREADS - 1) / NUM_THREADS;
move <<<NUM_BLOCKS, NUM_THREADS>>> (d_big_algo, d_big_relationships, d_location, d_N,
d_destinations, d_influence_N, d_energies, d_frequencies);
// Copy back to the host
cudaMemcpy(h_destinations, d_destinations, Num * 2 * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(h_energies, d_energies, Num * sizeof(float), cudaMemcpyDeviceToHost);
// Free memory on device
cudaFree(d_big_algo);
cudaFree(d_big_relationships);
cudaFree(d_location);
cudaFree(d_N);
cudaFree(d_destinations);
cudaFree(d_influence_N);
cudaFree(d_energies);
cudaFree(d_frequencies);
big_algo.clear();
big_relationships.clear();
location_list.clear();
energies.clear();
frequencies.clear();
//read from h_locations and h_energies
cudaFreeHost(h_big_algo);
cudaFreeHost(h_big_relationships);
cudaFreeHost(h_N);
cudaFreeHost(h_frequencies);
cudaFreeHost(h_location);
apply_all(h_destinations, h_energies);
cudaFreeHost(h_energies);
cudaFreeHost(h_destinations);
for (int i = 0; i < particleList.size(); i++) {
particleList[i]->draw_self();
/*if (particleList[i]->energy < 0) {
cout << "particle died" << endl;
particleList[i]->seppuku();
//doomed_particles.push_back({ i, particleList[i] });
}
if (particleList[i]->energy > 10) {
particleList[i]->reproduce();
particleList[i]->energy -= reproduction_cost;
}*/
}
window.display();
}
return 0;
}
还有内核:
__global__ void move(float* d_big_algo, float* d_big_relationships, float* d_location,
int* N, float* d_destinations, int * d_influence_N, float *d_energies, float*d_frequencies) {
int id = (blockIdx.x * blockDim.x) + threadIdx.x;
if (id < (*N)*(*N)) {
//printf("%i ", *N);
int subject = (id-((id+*N)%*N))/ *N;
int object = (id + *N) % *N;
float distance = sqrt(powf((d_location[object*2] - d_location[subject*2]), 2.0f)
+ powf((d_location[object * 2 +1] - d_location[subject * 2+ 1]), 2.0f));
float relative_maximum = d_big_relationships[(object * 3) + (subject * *N * 3) +2];
if ((distance < relative_maximum)&&(distance > 0)) {
float relative_minimum = d_big_relationships[(object * 3) + (subject * *N * 3)];
float relative_medium = d_big_relationships[(object * 3) + (subject * *N * 3) + 1];
/*if (distance < 12) {
if (abs(d_frequencies[subject] - d_frequencies[object]) > 0.1) {
if (d_energies[subject] > d_energies[object]) {
d_energies[subject]+=0.1;
d_energies[object]-=0.1;
}
if (d_energies[subject] < d_energies[object]) {
d_energies[subject]-=0.1;
d_energies[object]+=0.1;
}
}
//else {
// d_energies[subject]+= (d_energies[subject] - d_energies[object])/100;
//}
}*/
if ((distance < 8) && (distance > 0)) {
//printf("%i moving\n", id);
float force = 2.0f * (-distance / powf(distance, 2));
d_influence_N[subject] += 1;
//printf("%i ready to sync\n", id);
//__syncthreads();
//printf("%i INFLUENCE ", d_influence_N[subject]);
d_destinations[subject*2] += force * (d_location[object * 2] - d_location[subject * 2]);
d_destinations[subject * 2+1] += force * (d_location[object * 2 + 1] - d_location[subject * 2 + 1]);
//delete& force;
}
else if ((distance < relative_medium) && (distance > relative_minimum)) {
//printf("%i moving\n", id);
float force = d_big_algo[(object * 8) + (subject * *N * 8) + 4] * abs((d_big_algo[(object * 8) + (subject * *N * 8) + 5] * distance)
- d_big_algo[(object * 8) + (subject * *N * 8) + 6]) + d_big_algo[(object * 8) + (subject * *N * 8) + 7];
d_influence_N[subject] += 1;
//printf("%i ready to sync\n", id);
//__syncthreads();
//printf("%i INFLUENCE ", d_influence_N[subject]);
float destination_mod = (2.0f * d_influence_N[subject]) / powf(d_influence_N[subject], 2.0f);
d_destinations[subject * 2] += force * (d_location[object * 2] - d_location[subject * 2]) * destination_mod;
d_destinations[subject * 2 + 1] += force * (d_location[object * 2 + 1] - d_location[subject * 2 + 1]) * destination_mod;
//delete& force;
//delete& destination_mod;
}
else if (distance > relative_medium) {
//printf("%i moving\n", id);
float force = d_big_algo[(object * 8) + (subject * *N * 8)] * abs((d_big_algo[(object * 8) + (subject * *N * 8) +1] * distance)
- d_big_algo[(object * 8) + (subject * *N * 8) + 2]) + d_big_algo[(object * 8) + (subject * *N * 8) + 3];
d_influence_N[subject] += 1;
//printf("%i ready to sync\n", id);
//__syncthreads();
//printf("%i INFLUENCE ", d_influence_N[subject]);
float destination_mod = (2.0f * d_influence_N[subject]) / powf(d_influence_N[subject], 2.0f);
d_destinations[subject * 2] += force * (d_location[object * 2] - d_location[subject * 2]) * destination_mod;
d_destinations[subject * 2 + 1] += force * (d_location[object * 2 + 1] - d_location[subject * 2 + 1]) * destination_mod;
//delete& force;
//delete& destination_mod;
}
//delete& relative_minimum;
//delete& relative_medium;
}
//delete& subject;
//delete& object;
//delete& distance;
//delete& relative_maximum;
}
//__syncthreads();
if (id < *N) {
if (d_location[id * 2+1] < 2 || d_location[id * 2 + 1] > HEIGHT - 2) {
d_destinations[id * 2+1] = HEIGHT / 2;
}
if (d_location[id * 2]<2 || d_location[id * 2] > WIDTH - 2) {
d_destinations[id * 2] = WIDTH / 2;
}
if (d_location[id * 2] >= WIDTH - 10) {
d_destinations[id * 2] = abs(d_location[id * 2]) - ((d_location[id * 2] - (WIDTH - 10)) / 2) / (WIDTH / abs(d_location[id * 2]));
}
if (d_location[id * 2] < 10) {
d_destinations[id * 2] = abs(d_location[id * 2]) + ((d_location[id * 2] + 10) / 2) / (abs(d_location[id * 2]) + 0.1);
}
if (d_location[id * 2 + 1] >= HEIGHT - 10) {
d_destinations[id * 2 + 1] = abs(d_location[id * 2 + 1]) - ((d_location[id * 2 + 1] - (HEIGHT - 10)) / 2) / (HEIGHT / abs(d_location[id * 2 + 1]));
}
if (d_location[id * 2 + 1] < 10) {
d_destinations[id * 2 + 1] = abs(d_location[id * 2 + 1]) + ((d_location[id * 2 + 1] + 10) / 2) / (abs(d_location[id * 2 + 1]) + 0.1);
}
if (2.0f * (sqrt(powf(d_location[id * 2] - WIDTH / 2.0f, 2) + powf(d_location[id * 2 + 1] - WIDTH / 2.0f, 2))) > WIDTH) {
if (d_location[id * 2 + 1] >= HEIGHT / 2.0f) {
d_destinations[id * 2 + 1] = sqrt(abs(powf(HEIGHT / 2.0f, 2) - powf(d_location[id * 2] - HEIGHT / 2.0f, 2))) + HEIGHT / 2.0f;
}
if (d_location[id * 2 + 1] <= HEIGHT / 2.0f) {
d_destinations[id * 2 + 1] = -sqrt(abs(powf(HEIGHT / 2.0f, 2) - powf(d_location[id * 2] - HEIGHT / 2.0f, 2))) + HEIGHT / 2.0f;
}
if (d_location[id * 2] <= WIDTH / 2.0f) {
d_destinations[id * 2] = -sqrt(abs(powf(WIDTH / 2.0f, 2) - powf(d_location[id * 2 + 1] - WIDTH / 2.0f, 2))) + WIDTH / 2.0f;
}
if (d_location[id * 2] >= WIDTH / 2.0f) {
d_destinations[id * 2] = sqrt(abs(powf(WIDTH / 2.0f, 2) - powf(d_location[id * 2 + 1] - WIDTH / 2.0f, 2))) + WIDTH / 2.0f;
//send out
}
}
}
delete &id;
}
代码还有更多内容,但我认为这些是导致问题的部分(也许值得注意我也在使用 SFML)。
你多次调用cudaMallocHost
来分配space并将其存储在局部变量(h_big_algo
、h_N
等)中,然后立即覆盖返回的指针其他数据(h_big_algo = big_algo.data();
、h_N = &Num;
等)。
这将泄漏 cudaMallocHost
分配的内存。
您稍后调用 cudaFreeHost
,这将释放函数调用返回的内存,而不是 cudaMallocHost
分配的内存。更糟糕的是,对 cudaFreeHost(h_N);
的调用将传递一个指向本地基于堆栈的变量的指针。
您不应该为那些赋值的指针调用 cudaMallocHost
。
我的粒子模拟使用了太多内存。我关心的不是数量本身,而是当我有理由相信它不应该增长时它呈指数增长的事实。我正在使用 CUDA,这是我最近添加的组件,因此我怀疑是导致问题的原因。我已经确定这不是(仅)内核中的问题,因为即使内核没有 运行,ram 使用量也会增加。我怀疑这与我分配内存的方式有关,但我不明白哪里出错了。对于这样一个微不足道的问题,我深表歉意,我是 CUDA 的新手(如果这不是很明显的话)。这是有问题的意大利面,谢谢你的时间。
int main() {
std::srand(time(0));
window.setFramerateLimit(limit);
window.setVerticalSyncEnabled(true);
sf::Clock clock;
while (window.isOpen()) {
sf::Event evnt;
while (window.pollEvent(evnt)) {
switch (evnt.type) {
case sf::Event::Closed:
window.close();
break;
case sf::Event::TextEntered:
if (evnt.text.unicode < 128) {
//printf("%c", evnt.text.unicode);
}
}
}
if (sf::Keyboard::isKeyPressed(sf::Keyboard::Key::Space)) {
spawnParticle();
}
if (sf::Keyboard::isKeyPressed(sf::Keyboard::Key::R)) {
for (auto particle : particleList) {
delete particle;
}
particleList.clear();
}
window.clear(sf::Color::Color::Black);
background.setFillColor(sf::Color::Color(25, 25, 25, 255));
background.setPosition(-8, -8);
window.draw(background);
for (int i = 0; i < particleList.size(); i++) {
particleList[i]->write(i);
}
int Num = particleList.size();
// Vectors for holding the host-side (CPU-side) data
float* h_big_algo, * h_big_relationships, * h_location, * h_destinations, * h_energies, * h_frequencies;
int* h_N;
cudaMallocHost(&h_big_algo, Num * Num * 8 * sizeof(float));
cudaMallocHost(&h_big_relationships, Num * Num * 3 * sizeof(float));
cudaMallocHost(&h_location, Num * 2 * sizeof(float));
cudaMallocHost(&h_N, sizeof(int));
cudaMallocHost(&h_destinations, Num * 2 * sizeof(float));
cudaMallocHost(&h_energies, Num * sizeof(float));
cudaMallocHost(&h_frequencies, Num * sizeof(float));
h_big_algo = big_algo.data();
h_big_relationships = big_relationships.data();
h_location = location_list.data();
h_N = &Num;
h_frequencies = frequencies.data();
// Allocate device memory
float* d_big_algo, * d_big_relationships, * d_location, *d_destinations, *d_energies, *d_frequencies;
int* d_N, * d_influence_N;
cudaMalloc(&d_big_algo, Num * Num *8*sizeof(float));
cudaMalloc(&d_big_relationships, Num * Num *3*sizeof(float));
cudaMalloc(&d_location, Num *2*sizeof(float));
cudaMalloc(&d_N, sizeof(int));
cudaMalloc(&d_destinations, Num * 2 * sizeof(float));
cudaMalloc(&d_influence_N, Num *sizeof(int));
cudaMalloc(&d_energies, Num * sizeof(float));
cudaMalloc(&d_frequencies, Num * sizeof(float));
// Copy data to the device
cudaMemcpy(d_big_algo, h_big_algo, Num * 8 * Num * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_big_relationships, h_big_relationships, Num * Num * 3 * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_location, h_location, Num * 2 * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_N, h_N, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_energies, energies.data(), Num * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_frequencies, h_frequencies, Num * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_destinations, h_location, Num * 2 * sizeof(float), cudaMemcpyHostToDevice);
cudaMemset(d_influence_N, 0, Num * sizeof(int));
int NUM_THREADS = 1024;
int NUM_BLOCKS = (pow(Num,2) + NUM_THREADS - 1) / NUM_THREADS;
move <<<NUM_BLOCKS, NUM_THREADS>>> (d_big_algo, d_big_relationships, d_location, d_N,
d_destinations, d_influence_N, d_energies, d_frequencies);
// Copy back to the host
cudaMemcpy(h_destinations, d_destinations, Num * 2 * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(h_energies, d_energies, Num * sizeof(float), cudaMemcpyDeviceToHost);
// Free memory on device
cudaFree(d_big_algo);
cudaFree(d_big_relationships);
cudaFree(d_location);
cudaFree(d_N);
cudaFree(d_destinations);
cudaFree(d_influence_N);
cudaFree(d_energies);
cudaFree(d_frequencies);
big_algo.clear();
big_relationships.clear();
location_list.clear();
energies.clear();
frequencies.clear();
//read from h_locations and h_energies
cudaFreeHost(h_big_algo);
cudaFreeHost(h_big_relationships);
cudaFreeHost(h_N);
cudaFreeHost(h_frequencies);
cudaFreeHost(h_location);
apply_all(h_destinations, h_energies);
cudaFreeHost(h_energies);
cudaFreeHost(h_destinations);
for (int i = 0; i < particleList.size(); i++) {
particleList[i]->draw_self();
/*if (particleList[i]->energy < 0) {
cout << "particle died" << endl;
particleList[i]->seppuku();
//doomed_particles.push_back({ i, particleList[i] });
}
if (particleList[i]->energy > 10) {
particleList[i]->reproduce();
particleList[i]->energy -= reproduction_cost;
}*/
}
window.display();
}
return 0;
}
还有内核:
__global__ void move(float* d_big_algo, float* d_big_relationships, float* d_location,
int* N, float* d_destinations, int * d_influence_N, float *d_energies, float*d_frequencies) {
int id = (blockIdx.x * blockDim.x) + threadIdx.x;
if (id < (*N)*(*N)) {
//printf("%i ", *N);
int subject = (id-((id+*N)%*N))/ *N;
int object = (id + *N) % *N;
float distance = sqrt(powf((d_location[object*2] - d_location[subject*2]), 2.0f)
+ powf((d_location[object * 2 +1] - d_location[subject * 2+ 1]), 2.0f));
float relative_maximum = d_big_relationships[(object * 3) + (subject * *N * 3) +2];
if ((distance < relative_maximum)&&(distance > 0)) {
float relative_minimum = d_big_relationships[(object * 3) + (subject * *N * 3)];
float relative_medium = d_big_relationships[(object * 3) + (subject * *N * 3) + 1];
/*if (distance < 12) {
if (abs(d_frequencies[subject] - d_frequencies[object]) > 0.1) {
if (d_energies[subject] > d_energies[object]) {
d_energies[subject]+=0.1;
d_energies[object]-=0.1;
}
if (d_energies[subject] < d_energies[object]) {
d_energies[subject]-=0.1;
d_energies[object]+=0.1;
}
}
//else {
// d_energies[subject]+= (d_energies[subject] - d_energies[object])/100;
//}
}*/
if ((distance < 8) && (distance > 0)) {
//printf("%i moving\n", id);
float force = 2.0f * (-distance / powf(distance, 2));
d_influence_N[subject] += 1;
//printf("%i ready to sync\n", id);
//__syncthreads();
//printf("%i INFLUENCE ", d_influence_N[subject]);
d_destinations[subject*2] += force * (d_location[object * 2] - d_location[subject * 2]);
d_destinations[subject * 2+1] += force * (d_location[object * 2 + 1] - d_location[subject * 2 + 1]);
//delete& force;
}
else if ((distance < relative_medium) && (distance > relative_minimum)) {
//printf("%i moving\n", id);
float force = d_big_algo[(object * 8) + (subject * *N * 8) + 4] * abs((d_big_algo[(object * 8) + (subject * *N * 8) + 5] * distance)
- d_big_algo[(object * 8) + (subject * *N * 8) + 6]) + d_big_algo[(object * 8) + (subject * *N * 8) + 7];
d_influence_N[subject] += 1;
//printf("%i ready to sync\n", id);
//__syncthreads();
//printf("%i INFLUENCE ", d_influence_N[subject]);
float destination_mod = (2.0f * d_influence_N[subject]) / powf(d_influence_N[subject], 2.0f);
d_destinations[subject * 2] += force * (d_location[object * 2] - d_location[subject * 2]) * destination_mod;
d_destinations[subject * 2 + 1] += force * (d_location[object * 2 + 1] - d_location[subject * 2 + 1]) * destination_mod;
//delete& force;
//delete& destination_mod;
}
else if (distance > relative_medium) {
//printf("%i moving\n", id);
float force = d_big_algo[(object * 8) + (subject * *N * 8)] * abs((d_big_algo[(object * 8) + (subject * *N * 8) +1] * distance)
- d_big_algo[(object * 8) + (subject * *N * 8) + 2]) + d_big_algo[(object * 8) + (subject * *N * 8) + 3];
d_influence_N[subject] += 1;
//printf("%i ready to sync\n", id);
//__syncthreads();
//printf("%i INFLUENCE ", d_influence_N[subject]);
float destination_mod = (2.0f * d_influence_N[subject]) / powf(d_influence_N[subject], 2.0f);
d_destinations[subject * 2] += force * (d_location[object * 2] - d_location[subject * 2]) * destination_mod;
d_destinations[subject * 2 + 1] += force * (d_location[object * 2 + 1] - d_location[subject * 2 + 1]) * destination_mod;
//delete& force;
//delete& destination_mod;
}
//delete& relative_minimum;
//delete& relative_medium;
}
//delete& subject;
//delete& object;
//delete& distance;
//delete& relative_maximum;
}
//__syncthreads();
if (id < *N) {
if (d_location[id * 2+1] < 2 || d_location[id * 2 + 1] > HEIGHT - 2) {
d_destinations[id * 2+1] = HEIGHT / 2;
}
if (d_location[id * 2]<2 || d_location[id * 2] > WIDTH - 2) {
d_destinations[id * 2] = WIDTH / 2;
}
if (d_location[id * 2] >= WIDTH - 10) {
d_destinations[id * 2] = abs(d_location[id * 2]) - ((d_location[id * 2] - (WIDTH - 10)) / 2) / (WIDTH / abs(d_location[id * 2]));
}
if (d_location[id * 2] < 10) {
d_destinations[id * 2] = abs(d_location[id * 2]) + ((d_location[id * 2] + 10) / 2) / (abs(d_location[id * 2]) + 0.1);
}
if (d_location[id * 2 + 1] >= HEIGHT - 10) {
d_destinations[id * 2 + 1] = abs(d_location[id * 2 + 1]) - ((d_location[id * 2 + 1] - (HEIGHT - 10)) / 2) / (HEIGHT / abs(d_location[id * 2 + 1]));
}
if (d_location[id * 2 + 1] < 10) {
d_destinations[id * 2 + 1] = abs(d_location[id * 2 + 1]) + ((d_location[id * 2 + 1] + 10) / 2) / (abs(d_location[id * 2 + 1]) + 0.1);
}
if (2.0f * (sqrt(powf(d_location[id * 2] - WIDTH / 2.0f, 2) + powf(d_location[id * 2 + 1] - WIDTH / 2.0f, 2))) > WIDTH) {
if (d_location[id * 2 + 1] >= HEIGHT / 2.0f) {
d_destinations[id * 2 + 1] = sqrt(abs(powf(HEIGHT / 2.0f, 2) - powf(d_location[id * 2] - HEIGHT / 2.0f, 2))) + HEIGHT / 2.0f;
}
if (d_location[id * 2 + 1] <= HEIGHT / 2.0f) {
d_destinations[id * 2 + 1] = -sqrt(abs(powf(HEIGHT / 2.0f, 2) - powf(d_location[id * 2] - HEIGHT / 2.0f, 2))) + HEIGHT / 2.0f;
}
if (d_location[id * 2] <= WIDTH / 2.0f) {
d_destinations[id * 2] = -sqrt(abs(powf(WIDTH / 2.0f, 2) - powf(d_location[id * 2 + 1] - WIDTH / 2.0f, 2))) + WIDTH / 2.0f;
}
if (d_location[id * 2] >= WIDTH / 2.0f) {
d_destinations[id * 2] = sqrt(abs(powf(WIDTH / 2.0f, 2) - powf(d_location[id * 2 + 1] - WIDTH / 2.0f, 2))) + WIDTH / 2.0f;
//send out
}
}
}
delete &id;
}
代码还有更多内容,但我认为这些是导致问题的部分(也许值得注意我也在使用 SFML)。
你多次调用cudaMallocHost
来分配space并将其存储在局部变量(h_big_algo
、h_N
等)中,然后立即覆盖返回的指针其他数据(h_big_algo = big_algo.data();
、h_N = &Num;
等)。
这将泄漏 cudaMallocHost
分配的内存。
您稍后调用 cudaFreeHost
,这将释放函数调用返回的内存,而不是 cudaMallocHost
分配的内存。更糟糕的是,对 cudaFreeHost(h_N);
的调用将传递一个指向本地基于堆栈的变量的指针。
您不应该为那些赋值的指针调用 cudaMallocHost
。