在 OpenGL 中实例化数百万个对象:提高每秒帧数

Instancing millions of objects in OpenGL: improving frames-per-second

我的最终目标是以 60 fps 的速度渲染 100 万个不同大小和颜色的球体。我也希望能够在屏幕上移动相机。

我修改了 this page of the tutorial I am studying 上的代码以尝试实例化多个球体。然而,我发现在低至 64 个球体时我的 fps 低于 60,而在 900 个球体时我的 fps 是可怜的 4。我对实例化的理解是幼稚的,但我相信我应该比每秒获得更多的帧数这个。只有 64 个球体应该可以达到 60 fps。我相信我在某种程度上导致 CPU 和 GPU 比他们应该更频繁地进行通信。所以我的问题是:如何在不导致 fps 降低(理想情况下为 60 fps)的情况下实例化这么多对象(最好是数百万个)?

我通过每 10 帧计算 (10 / time_elapsed) 来计算 fps,其中 time_elapsed 是自上次调用 fps 以来经过的时间。我在代码的第 118 行使用 printf 打印出来。

我一直在通过 this tutorial and so I use 32-bit GLEW and 32-bit GLFW in Visual Studio 2013. I have 8 GB of RAM on a 64-bit operating system (Windows 7) with a 2.30 GHz CPU 学习 OpenGL。

我已经尝试根据上面的教程编写自己的示例。源代码:

(将第 2 行设置为要实例化的球体数量。确保第 2 行具有整数平方根。将第 4 行设置为球体的细节,它可以达到的最低值是0. 数字越大 = 越详细。)

// Make sure NUM_INS is a square number
#define NUM_INS 1

// Detail up to 4 is probably good enough
#define SPHERE_DETAIL 4

#include <vector>

// GLEW
#define GLEW_STATIC
#include <GL/glew.h>

// GLFW
#include <GLFW/glfw3.h>

// GL includes
#include "Shader.h"

// GLM Mathemtics
#include <glm/glm.hpp>
#include <glm/gtc/matrix_transform.hpp>
#include <glm/gtc/type_ptr.hpp>

// Properties
GLuint screenWidth = 800, screenHeight = 600;

// Function prototypes
void key_callback(GLFWwindow* window, int key, int scancode, int action, int mode);
std::vector<GLfloat> create_sphere(int recursion);

// The MAIN function, from here we start our application and run the Game loop
int main()
{
  // Init GLFW
  glfwInit();
  glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
  glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
  glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
  glfwWindowHint(GLFW_RESIZABLE, GL_FALSE);

  GLFWwindow* window = glfwCreateWindow(screenWidth, screenHeight, "LearnOpenGL", nullptr, nullptr); // Windowed
  glfwMakeContextCurrent(window);

  // Set the required callback functions
  glfwSetKeyCallback(window, key_callback);

  // Initialize GLEW to setup the OpenGL Function pointers
  glewExperimental = GL_TRUE;
  glewInit();

  // Define the viewport dimensions
  glViewport(0, 0, screenWidth, screenHeight);
  glPolygonMode(GL_FRONT_AND_BACK, GL_LINE); // Comment to remove wireframe mode

  // Setup OpenGL options
  glEnable(GL_DEPTH_TEST);

  // Setup and compile our shader(s)
  Shader shader("core.vs", "core.frag");

  // Generate a list of 100 quad locations/translation-vectors
  std::vector<glm::vec2> translations(NUM_INS);
  //glm::vec2 translations[NUM_INS];
  int index = 0;
  GLfloat offset = 1.0f / (float)sqrt(NUM_INS);
  for (GLint y = -(float)sqrt(NUM_INS); y < (float)sqrt(NUM_INS); y += 2)
  {
    for (GLint x = -(float)sqrt(NUM_INS); x < (float)sqrt(NUM_INS); x += 2)
    {
      glm::vec2 translation;
      translation.x = (GLfloat)x / (float)sqrt(NUM_INS) + offset;
      translation.y = (GLfloat)y / (float)sqrt(NUM_INS) + offset;
      translations[index++] = translation;
    }
  }

  // Store instance data in an array buffer
  GLuint instanceVBO;
  glGenBuffers(1, &instanceVBO);
  glBindBuffer(GL_ARRAY_BUFFER, instanceVBO);
  glBufferData(GL_ARRAY_BUFFER, sizeof(glm::vec2) * NUM_INS, &translations[0], GL_STATIC_DRAW);
  glBindBuffer(GL_ARRAY_BUFFER, 0);

  // create 12 vertices of a icosahedron
  std::vector<GLfloat> vv = create_sphere(SPHERE_DETAIL);

  GLuint quadVAO, quadVBO;
  glGenVertexArrays(1, &quadVAO);
  glGenBuffers(1, &quadVBO);
  glBindVertexArray(quadVAO);
  glBindBuffer(GL_ARRAY_BUFFER, quadVBO);
  glBufferData(GL_ARRAY_BUFFER, vv.size() * sizeof(GLfloat), &vv[0], GL_STATIC_DRAW);
  glEnableVertexAttribArray(0);
  glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, 6 * sizeof(GLfloat), (GLvoid*)0);
  glEnableVertexAttribArray(1);
  glVertexAttribPointer(1, 3, GL_FLOAT, GL_FALSE, 6 * sizeof(GLfloat), (GLvoid*)(2 * sizeof(GLfloat)));
  // Also set instance data
  glEnableVertexAttribArray(2);
  glBindBuffer(GL_ARRAY_BUFFER, instanceVBO);
  glVertexAttribPointer(2, 2, GL_FLOAT, GL_FALSE, 2 * sizeof(GLfloat), (GLvoid*)0);
  glBindBuffer(GL_ARRAY_BUFFER, 0);
  glVertexAttribDivisor(2, 1); // Tell OpenGL this is an instanced vertex attribute.
  glBindVertexArray(0);

  // For printing frames-per-second
  float counter = 0;
  double get_time = 0;
  double new_time;

  // Game loop
  while (!glfwWindowShouldClose(window))
  {
    // Print fps by printing (number_of_frames / time_elapsed)
    counter += 1;
    if (counter > 10) {
      counter -= 10;
      new_time = glfwGetTime();
      printf("fps: %.2f ", (10/(new_time - get_time)));
      get_time = new_time;
    }

    // Check and call events
    glfwPollEvents();

    // Clear buffers
    //glClearColor(0.2f, 0.3f, 0.3f, 1.0f);
    glClearColor(0.2f, 0.3f, 0.3f, 1.0f);
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);

    // Draw 100 instanced quads
    shader.Use();

    glm::mat4 model;
    model = glm::rotate(model, 0.0f, glm::vec3(1.0f, 0.0f, 0.0f));
    // Camera/View transformation
    glm::mat4 view;
    GLfloat radius = 10.0f;
    GLfloat camX = sin(glfwGetTime()) * radius;
    GLfloat camZ = cos(glfwGetTime()) * radius;
    view = glm::lookAt(glm::vec3(camX, 0.0f, camZ), glm::vec3(0.0f, 0.0f, 0.0f), glm::vec3(0.0f, 1.0f, 0.0f));
    // Projection 
    glm::mat4 projection;
    projection = glm::perspective(45.0f, (GLfloat)screenWidth / (GLfloat)screenHeight, 0.1f, 100.0f);
    // Get the uniform locations
    GLint modelLoc = glGetUniformLocation(shader.Program, "model");
    GLint viewLoc = glGetUniformLocation(shader.Program, "view");
    GLint projLoc = glGetUniformLocation(shader.Program, "projection");
    // Pass the matrices to the shader
    glUniformMatrix4fv(modelLoc, 1, GL_FALSE, glm::value_ptr(model));
    glUniformMatrix4fv(viewLoc, 1, GL_FALSE, glm::value_ptr(view));
    glUniformMatrix4fv(projLoc, 1, GL_FALSE, glm::value_ptr(projection));

    glBindVertexArray(quadVAO);
    glDrawArraysInstanced(GL_TRIANGLES, 0, vv.size() / 3, NUM_INS); // 100 triangles of 6 vertices each
    glBindVertexArray(0);

    // Swap the buffers
    glfwSwapBuffers(window);
  }

  glfwTerminate();
  return 0;
}

// Is called whenever a key is pressed/released via GLFW
void key_callback(GLFWwindow* window, int key, int scancode, int action, int mode)
{
  if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS)
    glfwSetWindowShouldClose(window, GL_TRUE);
}

std::vector<GLfloat> add_color(std::vector<GLfloat> sphere) {
  // Add color
  std::vector<GLfloat> colored_sphere;
  for (GLint i = 0; i < sphere.size(); i+=9) {
    colored_sphere.push_back(sphere[i]);
    colored_sphere.push_back(sphere[i+1]);
    colored_sphere.push_back(sphere[i+2]);
    colored_sphere.push_back(0.0f);
    colored_sphere.push_back(0.0f);
    colored_sphere.push_back(0.0f);
    colored_sphere.push_back(sphere[i+3]);
    colored_sphere.push_back(sphere[i+4]);
    colored_sphere.push_back(sphere[i+5]);
    colored_sphere.push_back(0.0f);
    colored_sphere.push_back(0.0f);
    colored_sphere.push_back(0.0f);
    colored_sphere.push_back(sphere[i+6]);
    colored_sphere.push_back(sphere[i+7]);
    colored_sphere.push_back(sphere[i+8]);
    colored_sphere.push_back(0.0f);
    colored_sphere.push_back(0.0f);
    colored_sphere.push_back(0.0f);
  }

  return colored_sphere;
}

std::vector<GLfloat> tesselate(std::vector<GLfloat> shape, int recursion) {

  if (recursion > 0) {
    std::vector<GLfloat> new_sphere = {};
    for (GLint i = 0; i < shape.size(); i += 9) {

      // 1.902113 approximately
      GLfloat radius = sqrt(1.0f + pow((1.0f + sqrt(5.0f)) / 2.0f, 2));

      // Every 9 points is a triangle.  Take 1 triangle and turn it into 4 triangles.  
      GLfloat p_one[] = {shape[i], shape[i + 1], shape[i + 2]};
      GLfloat p_two[] = {shape[i + 3], shape[i + 4], shape[i + 5]};
      GLfloat p_thr[] = {shape[i + 6], shape[i + 7], shape[i + 8]};
      GLfloat p_one_two[] = { (p_one[0] + p_two[0]) / 2.0f, (p_one[1] + p_two[1]) / 2.0f, (p_one[2] + p_two[2]) / 2.0f };
      GLfloat p_one_thr[] = { (p_one[0] + p_thr[0]) / 2.0f, (p_one[1] + p_thr[1]) / 2.0f, (p_one[2] + p_thr[2]) / 2.0f };
      GLfloat p_two_thr[] = { (p_two[0] + p_thr[0]) / 2.0f, (p_two[1] + p_thr[1]) / 2.0f, (p_two[2] + p_thr[2]) / 2.0f };

      GLfloat r_one_two = sqrt((p_one_two[0]*p_one_two[0]) + (p_one_two[1]*p_one_two[1]) + (p_one_two[2]*p_one_two[2]));
      GLfloat r_one_thr = sqrt((p_one_thr[0]*p_one_thr[0]) + (p_one_thr[1]*p_one_thr[1]) + (p_one_thr[2]*p_one_thr[2]));
      GLfloat r_two_thr = sqrt((p_two_thr[0]*p_two_thr[0]) + (p_two_thr[1]*p_two_thr[1]) + (p_two_thr[2]*p_two_thr[2]));

      GLfloat t_one_two[] = { radius * p_one_two[0] / r_one_two, radius * p_one_two[1] / r_one_two, radius * p_one_two[2] / r_one_two };
      GLfloat t_one_thr[] = { radius * p_one_thr[0] / r_one_thr, radius * p_one_thr[1] / r_one_thr, radius * p_one_thr[2] / r_one_thr };
      GLfloat t_two_thr[] = { radius * p_two_thr[0] / r_two_thr, radius * p_two_thr[1] / r_two_thr, radius * p_two_thr[2] / r_two_thr };

      // Triangle 1:
        new_sphere.push_back(p_one[0]);
      new_sphere.push_back(p_one[1]);
      new_sphere.push_back(p_one[2]);

      new_sphere.push_back(t_one_two[0]);
      new_sphere.push_back(t_one_two[1]);
      new_sphere.push_back(t_one_two[2]);

      new_sphere.push_back(t_one_thr[0]);
      new_sphere.push_back(t_one_thr[1]);
      new_sphere.push_back(t_one_thr[2]);

      // Triangle 2:
        new_sphere.push_back(p_two[0]);
      new_sphere.push_back(p_two[1]);
      new_sphere.push_back(p_two[2]);

      new_sphere.push_back(t_one_two[0]);
      new_sphere.push_back(t_one_two[1]);
      new_sphere.push_back(t_one_two[2]);

      new_sphere.push_back(t_two_thr[0]);
      new_sphere.push_back(t_two_thr[1]);
      new_sphere.push_back(t_two_thr[2]);

      // Triangle 3: 
        new_sphere.push_back(p_thr[0]);
      new_sphere.push_back(p_thr[1]);
      new_sphere.push_back(p_thr[2]);

      new_sphere.push_back(t_one_thr[0]);
      new_sphere.push_back(t_one_thr[1]);
      new_sphere.push_back(t_one_thr[2]);

      new_sphere.push_back(t_two_thr[0]);
      new_sphere.push_back(t_two_thr[1]);
      new_sphere.push_back(t_two_thr[2]);

      // Center Triangle:

        new_sphere.push_back(t_one_two[0]);
      new_sphere.push_back(t_one_two[1]);
      new_sphere.push_back(t_one_two[2]);

      new_sphere.push_back(t_one_thr[0]);
      new_sphere.push_back(t_one_thr[1]);
      new_sphere.push_back(t_one_thr[2]);

      new_sphere.push_back(t_two_thr[0]);
      new_sphere.push_back(t_two_thr[1]);
      new_sphere.push_back(t_two_thr[2]);
    }
    return tesselate(new_sphere, recursion - 1);
  }

  printf("number of vertices to be rendered: %d || ", shape.size());
  return shape;
}

std::vector<GLfloat> create_sphere(int recursion) {

  // Define the starting icosahedron
  GLfloat t_ = (1.0f + sqrt(5.0f)) / 2.0f;

  std::vector<GLfloat> icosahedron = {
    -1.0f,  t_,  0.0f, -t_,  0.0f,  1.0f, 0.0f,  1.0f,  t_,
    -1.0f,  t_,  0.0f, 0.0f,  1.0f,  t_, 1.0f,  t_,  0.0f,
    -1.0f,  t_,  0.0f, 1.0f,  t_,  0.0f, 0.0f,  1.0f, -t_,
    -1.0f,  t_,  0.0f, 0.0f,  1.0f, -t_, -t_,  0.0f, -1.0f,
    -1.0f,  t_,  0.0f, -t_,  0.0f, -1.0f, -t_,  0.0f,  1.0f,

    1.0f,  t_,  0.0f, 0.0f,  1.0f,  t_,  t_,  0.0f,  1.0f, 
    0.0f,  1.0f,  t_, -t_,  0.0f,  1.0f,  0.0f, -1.0f,  t_, 
    -t_,  0.0f,  1.0f, -t_,  0.0f, -1.0f, -1.0f, -t_,  0.0f, 
    -t_,  0.0f, -1.0f,  0.0f,  1.0f, -t_,  0.0f, -1.0f, -t_, 
    0.0f,  1.0f, -t_,  1.0f,  t_,  0.0f,  t_,  0.0f, -1.0f, 

    1.0f, -t_,  0.0f, t_,  0.0f,  1.0f, 0.0f, -1.0f,  t_,
    1.0f, -t_,  0.0f, 0.0f, -1.0f,  t_,-1.0f, -t_,  0.0f,
    1.0f, -t_,  0.0f,-1.0f, -t_,  0.0f, 0.0f, -1.0f, -t_,
    1.0f, -t_,  0.0f, 0.0f, -1.0f, -t_, t_,  0.0f, -1.0f,
    1.0f, -t_,  0.0f, t_,  0.0f, -1.0f, t_,  0.0f,  1.0f,

    0.0f, -1.0f,  t_, t_,  0.0f,  1.0f, 0.0f,  1.0f,  t_,
    -1.0f, -t_,  0.0f, 0.0f, -1.0f,  t_,-t_,  0.0f,  1.0f,
    0.0f, -1.0f, -t_,-1.0f, -t_,  0.0f,-t_,  0.0f, -1.0f,
    t_,  0.0f, -1.0f, 0.0f, -1.0f, -t_, 0.0f,  1.0f, -t_,
    t_,  0.0f,  1.0f, t_,  0.0f, -1.0f, 1.0f,  t_,  0.0f,
  };

  // Tesselate the icososphere the number of times recursion
  std::vector<GLfloat> colorless_sphere = tesselate(icosahedron, recursion);

  // Add color and return
  return add_color(colorless_sphere);
}

顶点着色器:(命名为core.vs)

#version 330 core
layout (location = 0) in vec3 position;
layout (location = 1) in vec3 color;
layout (location = 2) in vec2 offset;

out vec3 fColor;

uniform mat4 model;
uniform mat4 view;
uniform mat4 projection;

void main()
{
  gl_Position = projection * view * model * vec4(position.x + offset.x, position.y + offset.y, position.z, 1.0f);
  fColor = color;
} 

片段着色器:(名为 core.frag)

#version 330 core
in vec3 fColor;
out vec4 color;

void main()
{
  color = vec4(fColor, 1.0f);
}

着色器class:(命名为Shader.h)

#ifndef SHADER_H
#define SHADER_H

#include <string>
#include <fstream>
#include <sstream>
#include <iostream>

#include <GL/glew.h>

class Shader
{
  public:
  GLuint Program;
  // Constructor generates the shader on the fly
  Shader(const GLchar* vertexPath, const GLchar* fragmentPath)
  {
    // 1. Retrieve the vertex/fragment source code from filePath
    std::string vertexCode;
    std::string fragmentCode;
    std::ifstream vShaderFile;
    std::ifstream fShaderFile;
    // ensures ifstream objects can throw exceptions:
      vShaderFile.exceptions(std::ifstream::badbit);
    fShaderFile.exceptions(std::ifstream::badbit);
    try
    {
      // Open files
      vShaderFile.open(vertexPath);
      fShaderFile.open(fragmentPath);
      std::stringstream vShaderStream, fShaderStream;
      // Read file's buffer contents into streams
      vShaderStream << vShaderFile.rdbuf();
      fShaderStream << fShaderFile.rdbuf();
      // close file handlers
      vShaderFile.close();
      fShaderFile.close();
      // Convert stream into string
      vertexCode = vShaderStream.str();
      fragmentCode = fShaderStream.str();
    }
      catch (std::ifstream::failure e)
      {
      std::cout << "ERROR::SHADER::FILE_NOT_SUCCESFULLY_READ" << std::endl;
      }
      const GLchar* vShaderCode = vertexCode.c_str();
      const GLchar * fShaderCode = fragmentCode.c_str();
      // 2. Compile shaders
      GLuint vertex, fragment;
      GLint success;
      GLchar infoLog[512];
      // Vertex Shader
      vertex = glCreateShader(GL_VERTEX_SHADER);
      glShaderSource(vertex, 1, &vShaderCode, NULL);
      glCompileShader(vertex);
      // Print compile errors if any
      glGetShaderiv(vertex, GL_COMPILE_STATUS, &success);
      if (!success)
      {
      glGetShaderInfoLog(vertex, 512, NULL, infoLog);
      std::cout << "ERROR::SHADER::VERTEX::COMPILATION_FAILED\n" << infoLog << std::endl;
      }
      // Fragment Shader
      fragment = glCreateShader(GL_FRAGMENT_SHADER);
      glShaderSource(fragment, 1, &fShaderCode, NULL);
      glCompileShader(fragment);
      // Print compile errors if any
      glGetShaderiv(fragment, GL_COMPILE_STATUS, &success);
      if (!success)
      {
      glGetShaderInfoLog(fragment, 512, NULL, infoLog);
      std::cout << "ERROR::SHADER::FRAGMENT::COMPILATION_FAILED\n" << infoLog << std::endl;
      }
      // Shader Program
      this->Program = glCreateProgram();
      glAttachShader(this->Program, vertex);
      glAttachShader(this->Program, fragment);
      glLinkProgram(this->Program);
      // Print linking errors if any
      glGetProgramiv(this->Program, GL_LINK_STATUS, &success);
      if (!success)
      {
      glGetProgramInfoLog(this->Program, 512, NULL, infoLog);
      std::cout << "ERROR::SHADER::PROGRAM::LINKING_FAILED\n" << infoLog << std::endl;
      }
      // Delete the shaders as they're linked into our program now and no longer necessery
      glDeleteShader(vertex);
      glDeleteShader(fragment);

  }
    // Uses the current shader
    void Use()
    {
      glUseProgram(this->Program);
    }
};

#endif

My ultimate goal is to render 1 million spheres of different sizes and colors at 60 fps.

这是一个不合理的期望。

假设每个球体由 50 个三角形组成。对于良好的球形来说有点小,但让我们假设它们是那么小。

100 万个球体,每个球体有 50 个三角形,即每帧有 5000 万个三角形 。在 60 FPS 下,即每秒 3 亿 个三角形。

市售的 GPU 都不足以做到这一点。那只是一个 50 三角形球体;你的 4x 镶嵌二十面体将超过 5,000 个三角形。

现在是的,绘制 60 个这样的球体每帧仅约 300,000 个三角形。但即使是 60 FPS,也是每秒约 1800 万个三角形。确实存在可以处理那么多三角形的硬件,但很明显它是一个很多。而且你绝对不会得到其中的 100 万个。

这不是 GPU/CPU 通信或开销的问题。您只是在 GPU 上投入了超出其处理能力的更多工作。您可能可以在这里和那里改进一些东西,但没有什么可以让您获得想要的东西的十分之一。

至少,不是这种整体方法。


对于您想要绘制数百万个球体的特殊情况,我会使用光线跟踪冒名顶替者而不是球体的实际几何形状。也就是说,您绘制四边形,其位置由顶点(或几何)着色器生成。您为每个球体生成一个四边形,这样四边形就可以包围球体。然后片段着色器做一个简单的光线球相交测试,看看有问题的片段(从相机视图的方向)是否击中了球体。如果射线没有击中球体,则丢弃该片段。

您还需要修改 gl_FragDepth 以给冒名顶替者适当的深度值,以便相交的球体可以工作。