CUDA 程序使用系统 ram 的指数数量

Question

我的粒子模拟使用了太多内存。我关心的不是数量本身，而是当我有理由相信它不应该增长时它呈指数增长的事实。我正在使用 CUDA，这是我最近添加的组件，因此我怀疑是导致问题的原因。我已经确定这不是（仅）内核中的问题，因为即使内核没有运行，ram 使用量也会增加。我怀疑这与我分配内存的方式有关，但我不明白哪里出错了。对于这样一个微不足道的问题，我深表歉意，我是 CUDA 的新手（如果这不是很明显的话）。这是有问题的意大利面，谢谢你的时间。


int main() {
    std::srand(time(0));
    window.setFramerateLimit(limit);
    window.setVerticalSyncEnabled(true);
    sf::Clock clock;
    
    while (window.isOpen()) {
        sf::Event evnt;
        while (window.pollEvent(evnt)) {
            switch (evnt.type) {
            case sf::Event::Closed:
                window.close();
                break;
            case sf::Event::TextEntered:
                if (evnt.text.unicode < 128) {
                    //printf("%c", evnt.text.unicode);
                }
            }
            
        }


        if (sf::Keyboard::isKeyPressed(sf::Keyboard::Key::Space)) {
            spawnParticle();
            
        }
        if (sf::Keyboard::isKeyPressed(sf::Keyboard::Key::R)) {
            for (auto particle : particleList) {
                delete particle;
            }
            particleList.clear();
        }

        window.clear(sf::Color::Color::Black);
        background.setFillColor(sf::Color::Color(25, 25, 25, 255));
        background.setPosition(-8, -8);
        window.draw(background);

        for (int i = 0; i < particleList.size(); i++) {
            particleList[i]->write(i);

        }


        int Num = particleList.size();

        // Vectors for holding the host-side (CPU-side) data
        float* h_big_algo, * h_big_relationships, * h_location,  * h_destinations, * h_energies, * h_frequencies;
        int* h_N;
        cudaMallocHost(&h_big_algo, Num * Num * 8 * sizeof(float));
        cudaMallocHost(&h_big_relationships, Num * Num * 3 * sizeof(float));
        cudaMallocHost(&h_location, Num * 2 * sizeof(float));
        cudaMallocHost(&h_N, sizeof(int));
        cudaMallocHost(&h_destinations, Num * 2 * sizeof(float));
        cudaMallocHost(&h_energies, Num * sizeof(float));
        cudaMallocHost(&h_frequencies, Num * sizeof(float));

        h_big_algo = big_algo.data();
        h_big_relationships = big_relationships.data();
        h_location = location_list.data();
        h_N = &Num;
        h_frequencies = frequencies.data();

        // Allocate device memory
        float* d_big_algo, * d_big_relationships, * d_location, *d_destinations,  *d_energies, *d_frequencies;
        int* d_N, * d_influence_N;
        cudaMalloc(&d_big_algo, Num * Num *8*sizeof(float));
        cudaMalloc(&d_big_relationships, Num * Num *3*sizeof(float));
        cudaMalloc(&d_location, Num *2*sizeof(float));
        cudaMalloc(&d_N, sizeof(int));
        cudaMalloc(&d_destinations, Num * 2 * sizeof(float));
        cudaMalloc(&d_influence_N, Num *sizeof(int));
        cudaMalloc(&d_energies, Num * sizeof(float));
        cudaMalloc(&d_frequencies, Num * sizeof(float));

        // Copy data to the device
        cudaMemcpy(d_big_algo, h_big_algo, Num * 8 * Num * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_big_relationships, h_big_relationships, Num * Num * 3 * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_location, h_location, Num * 2 * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_N, h_N, sizeof(int), cudaMemcpyHostToDevice);
        cudaMemcpy(d_energies, energies.data(), Num * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_frequencies, h_frequencies, Num * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_destinations, h_location, Num * 2 * sizeof(float), cudaMemcpyHostToDevice);

        cudaMemset(d_influence_N, 0, Num * sizeof(int));

        int NUM_THREADS = 1024;

        int NUM_BLOCKS = (pow(Num,2) + NUM_THREADS - 1) / NUM_THREADS;

        move <<<NUM_BLOCKS, NUM_THREADS>>> (d_big_algo, d_big_relationships, d_location, d_N, 
                                                d_destinations, d_influence_N, d_energies, d_frequencies);
         
        // Copy back to the host
        cudaMemcpy(h_destinations, d_destinations, Num * 2 * sizeof(float), cudaMemcpyDeviceToHost);
        cudaMemcpy(h_energies, d_energies, Num * sizeof(float), cudaMemcpyDeviceToHost);

        // Free memory on device
        cudaFree(d_big_algo);
        cudaFree(d_big_relationships);
        cudaFree(d_location);
        cudaFree(d_N);
        cudaFree(d_destinations);
        cudaFree(d_influence_N);
        cudaFree(d_energies);
        cudaFree(d_frequencies);

        big_algo.clear();
        big_relationships.clear();

        location_list.clear();
        energies.clear();
        frequencies.clear();

        //read from h_locations and h_energies

        cudaFreeHost(h_big_algo);
        cudaFreeHost(h_big_relationships);
        cudaFreeHost(h_N);
        cudaFreeHost(h_frequencies);
        cudaFreeHost(h_location);
        apply_all(h_destinations, h_energies);
        cudaFreeHost(h_energies);
        cudaFreeHost(h_destinations);

        for (int i = 0; i < particleList.size(); i++) {
            particleList[i]->draw_self();

            /*if (particleList[i]->energy < 0) {
                cout << "particle died" << endl;
                particleList[i]->seppuku();
                //doomed_particles.push_back({ i, particleList[i] });
            }
            if (particleList[i]->energy > 10) {
                particleList[i]->reproduce();
                particleList[i]->energy -= reproduction_cost;
            }*/
            
        }
        

        window.display();
        }
        
    return 0;
}

还有内核：

__global__ void move(float* d_big_algo, float* d_big_relationships, float* d_location, 
                int* N,  float* d_destinations, int * d_influence_N, float *d_energies, float*d_frequencies) {
    
    int id = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (id < (*N)*(*N)) {
        //printf("%i ", *N);

        int subject = (id-((id+*N)%*N))/ *N;
        int object = (id + *N) % *N;
        
        float distance = sqrt(powf((d_location[object*2] - d_location[subject*2]), 2.0f) 
                    + powf((d_location[object * 2 +1] - d_location[subject * 2+ 1]), 2.0f));

        float relative_maximum = d_big_relationships[(object * 3) + (subject * *N * 3) +2];
        
        if ((distance < relative_maximum)&&(distance > 0)) {
            float relative_minimum = d_big_relationships[(object * 3) + (subject * *N * 3)];
            float relative_medium = d_big_relationships[(object * 3) + (subject * *N * 3) + 1];

            /*if (distance < 12) {
                if (abs(d_frequencies[subject] - d_frequencies[object]) > 0.1) {
                    if (d_energies[subject] > d_energies[object]) {
                        d_energies[subject]+=0.1;
                        d_energies[object]-=0.1;
                    }
                    if (d_energies[subject] < d_energies[object]) {
                        d_energies[subject]-=0.1;
                        d_energies[object]+=0.1;
                    }
                }
                //else {
                //  d_energies[subject]+= (d_energies[subject] - d_energies[object])/100;
                //}
            }*/

            if ((distance < 8) && (distance > 0)) {
                //printf("%i moving\n", id);
                float force = 2.0f * (-distance / powf(distance, 2));
                d_influence_N[subject] += 1;
                //printf("%i ready to sync\n", id);
                //__syncthreads();
                //printf("%i INFLUENCE ", d_influence_N[subject]);
                d_destinations[subject*2] += force * (d_location[object * 2] - d_location[subject * 2]);
                d_destinations[subject * 2+1] += force * (d_location[object * 2 + 1] - d_location[subject * 2 + 1]);
                //delete& force;

            }
            else if ((distance < relative_medium) && (distance > relative_minimum)) {
                //printf("%i moving\n", id);
                float force = d_big_algo[(object * 8) + (subject * *N * 8) + 4] * abs((d_big_algo[(object * 8) + (subject * *N * 8) + 5] * distance)
                                                - d_big_algo[(object * 8) + (subject * *N * 8) + 6]) + d_big_algo[(object * 8) + (subject * *N * 8) + 7];
                d_influence_N[subject] += 1;
                //printf("%i ready to sync\n", id);
                //__syncthreads();
                //printf("%i INFLUENCE ", d_influence_N[subject]);
                float destination_mod = (2.0f * d_influence_N[subject]) / powf(d_influence_N[subject], 2.0f);
                d_destinations[subject * 2] += force * (d_location[object * 2] - d_location[subject * 2]) * destination_mod;
                d_destinations[subject * 2 + 1] += force * (d_location[object * 2 + 1] - d_location[subject * 2 + 1]) * destination_mod;
                //delete& force;
                //delete& destination_mod;
            }

            else if (distance > relative_medium) {
                //printf("%i moving\n", id);
                float force = d_big_algo[(object * 8) + (subject * *N * 8)] * abs((d_big_algo[(object * 8) + (subject * *N * 8) +1] * distance)
                                                - d_big_algo[(object * 8) + (subject * *N * 8) + 2]) + d_big_algo[(object * 8) + (subject * *N * 8) + 3];
                d_influence_N[subject] += 1;
                //printf("%i ready to sync\n", id);
                //__syncthreads();
                //printf("%i INFLUENCE ", d_influence_N[subject]);
                float destination_mod = (2.0f * d_influence_N[subject]) / powf(d_influence_N[subject], 2.0f);
                d_destinations[subject * 2] += force * (d_location[object * 2] - d_location[subject * 2]) * destination_mod;
                d_destinations[subject * 2 + 1] += force * (d_location[object * 2 + 1] - d_location[subject * 2 + 1]) * destination_mod;
                //delete& force;
                //delete& destination_mod;
                
            }
            //delete& relative_minimum;
            //delete& relative_medium;
        }
        //delete& subject;
        //delete& object;
        //delete& distance;
        //delete& relative_maximum;
    }

    //__syncthreads();
    if (id < *N) {
        if (d_location[id * 2+1] < 2 || d_location[id * 2 + 1] > HEIGHT - 2) {
            d_destinations[id * 2+1] = HEIGHT / 2;
        }
        if (d_location[id * 2]<2 || d_location[id * 2] > WIDTH - 2) {
            d_destinations[id * 2] = WIDTH / 2;
        }

        if (d_location[id * 2] >= WIDTH - 10) {
            d_destinations[id * 2] = abs(d_location[id * 2]) - ((d_location[id * 2] - (WIDTH - 10)) / 2) / (WIDTH / abs(d_location[id * 2]));
        }
        if (d_location[id * 2] < 10) {
            d_destinations[id * 2] = abs(d_location[id * 2]) + ((d_location[id * 2] + 10) / 2) / (abs(d_location[id * 2]) + 0.1);
        }
        if (d_location[id * 2 + 1] >= HEIGHT - 10) {
            d_destinations[id * 2 + 1] = abs(d_location[id * 2 + 1]) - ((d_location[id * 2 + 1] - (HEIGHT - 10)) / 2) / (HEIGHT / abs(d_location[id * 2 + 1]));
        }
        if (d_location[id * 2 + 1] < 10) {
            d_destinations[id * 2 + 1] = abs(d_location[id * 2 + 1]) + ((d_location[id * 2 + 1] + 10) / 2) / (abs(d_location[id * 2 + 1]) + 0.1);
        }

        if (2.0f * (sqrt(powf(d_location[id * 2] - WIDTH / 2.0f, 2) + powf(d_location[id * 2 + 1] - WIDTH / 2.0f, 2))) > WIDTH) {
            if (d_location[id * 2 + 1] >= HEIGHT / 2.0f) {
                d_destinations[id * 2 + 1] = sqrt(abs(powf(HEIGHT / 2.0f, 2) - powf(d_location[id * 2] - HEIGHT / 2.0f, 2))) + HEIGHT / 2.0f;
            }
            if (d_location[id * 2 + 1] <= HEIGHT / 2.0f) {
                d_destinations[id * 2 + 1] = -sqrt(abs(powf(HEIGHT / 2.0f, 2) - powf(d_location[id * 2] - HEIGHT / 2.0f, 2))) + HEIGHT / 2.0f;
            }
            if (d_location[id * 2] <= WIDTH / 2.0f) {
                d_destinations[id * 2] = -sqrt(abs(powf(WIDTH / 2.0f, 2) - powf(d_location[id * 2 + 1] - WIDTH / 2.0f, 2))) + WIDTH / 2.0f;
            }
            if (d_location[id * 2] >= WIDTH / 2.0f) {
                d_destinations[id * 2] = sqrt(abs(powf(WIDTH / 2.0f, 2) - powf(d_location[id * 2 + 1] - WIDTH / 2.0f, 2))) + WIDTH / 2.0f;

                //send out
            }
        }
    }
    delete &id;
    
    
}

代码还有更多内容，但我认为这些是导致问题的部分（也许值得注意我也在使用 SFML）。

Answer 1

你多次调用cudaMallocHost来分配space并将其存储在局部变量（h_big_algo、h_N等）中，然后立即覆盖返回的指针其他数据（h_big_algo = big_algo.data();、h_N = &Num; 等）。

这将泄漏 cudaMallocHost 分配的内存。

您稍后调用 cudaFreeHost，这将释放函数调用返回的内存，而不是 cudaMallocHost 分配的内存。更糟糕的是，对 cudaFreeHost(h_N); 的调用将传递一个指向本地基于堆栈的变量的指针。

您不应该为那些赋值的指针调用 cudaMallocHost。

CUDA 程序使用系统 ram 的指数数量

CUDA program using exponential amount of system ram

c++

cuda

sfml