GLSL 计算着色器不 运行 大输入

GLSL Compute Shader doesn't run for large inputs

着色器采用具有位置、方向、波长和强度的光子 SSBO,每个线程负责准确地跟踪一个光子通过网格,在每个网格单元中,光子命中,强度累积每个波长为每个网格单元创建光谱分布。

问题是着色器对 100,000 个光子完美运行,但 return 对 1,000,000 个光子没有结果。

我查看了 SSBO 的大小,发现它们都在我的 GPU (NVIDIA Quadro P6000) 2GB 限制范围内:

如果我在某些地方更改逻辑,它可以处理一百万个光子(请参阅第 87 行和第 114 行的评论)。

我目前无法解释为什么着色器对 1,000,000 个光子失败,但对 100,000 个光子有效。逻辑相同,缓冲区大小在限制范围内。 (缓冲区大小不会成为问题也被它在更改逻辑时起作用所证实。)

下面是源代码。如果您想自己尝试,这里是 github 上的代码:https://github.com/TheJhonny007/TextureTracerDebug

计算着色器:

#version 430

#extension GL_EXT_compute_shader: enable
#extension GL_EXT_shader_storage_buffer_object: enable
#extension GL_ARB_compute_variable_group_size: enable

const uint TEX_WIDTH = 1024u;
const uint TEX_HEIGHT = TEX_WIDTH;

const uint MIN_WAVELENGTH = 380u;
const uint MAX_WAVELENGTH = 740u;
const uint NUM_WAVELENGTHS = MAX_WAVELENGTH - MIN_WAVELENGTH;

// Size: 24 bytes -> ~40,000,000 photons per available gigabyte of ram
struct Photon {
    vec2 position;// m
    vec2 direction;// normalized
    uint wavelength;// nm
    float intensity;// 0..1 should start at 1
};

layout(std430, binding = 0) buffer Photons {
    Photon photons[];
};

// Size: 1440 bytes -> ~700,000 pixels per available gigabyte of ram
struct Pixel {
    uint intensityAtWavelengths[NUM_WAVELENGTHS];// [0..1000]
};

layout(std430, binding = 1) buffer Pixels {
//Pixel pixels[TEX_WIDTH][TEX_HEIGHT];
// NVIDIAs linker takes ages to link if the sizes are specified :(
    Pixel[] pixels;
};

uniform float xAxisScalingFactor;

vec2 getHorizontalRectangleAt(int i) {
    float x = pow(float(i), xAxisScalingFactor);
    float w = pow(float(i + 1), xAxisScalingFactor);
    return vec2(x, w);
}

uniform float rectangleHeight;

struct Rectangle {
    float x;
    float y;
    float w;
    float h;
};

layout (local_size_variable) in;

void addToPixel(uvec2 idx, uint wavelength, uint intensity) {
    if (idx.x >= 0u && idx.x < TEX_WIDTH && idx.y >= 0u && idx.y < TEX_HEIGHT) {
        uint index = (idx.y * TEX_WIDTH) + idx.x;
        atomicAdd(pixels[index].intensityAtWavelengths[wavelength - MIN_WAVELENGTH], intensity);
    }
}

/// Returns the rectangle at the given indices.
Rectangle getRectangleAt(ivec2 indices) {
    vec2 horRect = getHorizontalRectangleAt(indices.x);
    return Rectangle(horRect.x, rectangleHeight * float(indices.y), horRect.y, rectangleHeight);
}

uniform float shadowLength;
uniform float shadowHeight;

/// Returns the indices of the rectangle at the given location
ivec2 getRectangleIdxAt(vec2 location) {
    int x = 0;
    int y = int(location.y / rectangleHeight);
    return ivec2(x, y);
}

float getRayIntersectAtX(Photon ray, float x) {
    float slope = ray.direction.y / ray.direction.x;
    return slope * (x - ray.position.x) + ray.position.y;
}

ivec2 getRayRectangleExitEdge(Photon ray, Rectangle rect) {
    float intersectHeight = getRayIntersectAtX(ray, rect.x + rect.w);

    // IF ONE OF THE FIRST TWO CONDITIONS GETS REMOVED IT WORKS WITH 1'000'000 PHOTONS OTHERWISE ONLY 100'000 WHY?
    if (intersectHeight < rect.y) {
        return ivec2(0, -1);
    } else if (intersectHeight > rect.y + rect.h) {
        return ivec2(0, 1);
    } else {
        return ivec2(1, 0);
    }
}

void main() {
    uint gid = gl_GlobalInvocationID.x;
    if (gid >= photons.length()) return;

    Photon photon = photons[gid];

    ivec2 photonTexIndices = getRectangleIdxAt(photon.position);
    while (photonTexIndices.x < TEX_WIDTH && photonTexIndices.y < TEX_HEIGHT &&
    photonTexIndices.x >= 0        && photonTexIndices.y >= 0) {
        // need to convert to uint for atomic add operations...
        addToPixel(uvec2(photonTexIndices), photon.wavelength, uint(photon.intensity * 100.0));

        ivec2 dir = getRayRectangleExitEdge(photon, getRectangleAt(photonTexIndices));
        photonTexIndices += dir;

        // When the ray goes out of bounds on the bottom then mirror it to simulate rays coming from
        // the other side of the planet. This works because of the rotational symmetry of the system.
        // IF COMMENTET OUT IT WORKS WITH 1'000'000 PHOTONS OTHERWISE ONLY 100'000 WHY?
        if (photonTexIndices.y < 0) {
            photonTexIndices.y = 0;
            photon.position.y *= -1.0;
            photon.direction.y *= -1.0;
        }
    }
}

Tracer.hpp

#ifndef TEXTURE_TRACER_HPP
#define TEXTURE_TRACER_HPP

#include <glm/glm.hpp>
#include <random>

namespace gpu {

    // 6 * 4 = 24 Bytes
    struct Photon {
        glm::vec2 position;  // m
        glm::vec2 direction; // normalized
        uint32_t waveLength; // nm
        float intensity;     // 0..1 should start at 1
    };

    class TextureTracer {
    public:
        TextureTracer();
        uint32_t createShadowMap(size_t numPhotons);

    private:
        void initTextureTracer();
        void traceThroughTexture(uint32_t ssboPhotons, size_t numPhotons);
        Photon emitPhoton();
        std::vector<Photon> generatePhotons(uint32_t count);

        struct {
            uint32_t uRectangleHeight;
            uint32_t uShadowLength;
            uint32_t uShadowHeight;
            uint32_t uXAxisScalingFactor;
        } mTextureTracerUniforms;

        uint32_t mTextureTracerProgram;

        std::mt19937_64 mRNG;
        std::uniform_real_distribution<> mDistributionSun;
        std::uniform_int_distribution<uint32_t> mDistributionWavelength;
        std::bernoulli_distribution mDistributionBoolean;
    };

} // namespace gpu

#endif // TEXTURE_TRACER_HPP

Tracer.cpp

#include "TextureTracer.hpp"
#include <GL/glew.h>
#include <algorithm>
#include <fstream>
#include <iostream>
#include <random>
#include <string>
#include <vector>

void GLAPIENTRY MessageCallback(GLenum source, GLenum type, GLuint id,
                                GLenum severity, GLsizei length,
                                const GLchar *message, const void *userParam) {
  if (type == GL_DEBUG_TYPE_ERROR)
    fprintf(stderr, "GL ERROR: type = 0x%x, severity = 0x%x, message = %s\n",
            type, severity, message);
  else
    fprintf(stdout, "GL INFO: type = 0x%x, severity = 0x%x, message = %s\n",
            type, severity, message);
}

namespace gpu {
const double TEX_HEIGHT_TO_RADIUS_FACTOR = 4;
const double TEX_SHADOW_LENGTH_FACTOR = 8;

const uint32_t TEX_WIDTH = 1024u;
const uint32_t TEX_HEIGHT = TEX_WIDTH;

const double RADIUS = 6'371'000.0;
const double RADIUS_FACTORED = RADIUS * TEX_HEIGHT_TO_RADIUS_FACTOR;

const double SUN_RADIUS = 695'510'000.0;
const double DIST_TO_SUN = 149'600'000'000.0;
const double ATMO_HEIGHT = 42'000.0;

std::string loadShader(const std::string &fileName) {
  std::ifstream shaderFileStream(fileName, std::ios::in);
  if (!shaderFileStream.is_open()) {
    std::cerr << "Could not load the GLSL shader from '" << fileName << "'!"
              << std::endl;
    exit(-1);
  }

  std::string shaderCode;
  while (!shaderFileStream.eof()) {
    std::string line;
    std::getline(shaderFileStream, line);
    shaderCode.append(line + "\n");
  }

  return shaderCode;
}

void TextureTracer::initTextureTracer() {
  mTextureTracerProgram = glCreateProgram();
  uint32_t rayTracingComputeShader = glCreateShader(GL_COMPUTE_SHADER);

  std::string code = loadShader("../resources/TextureTracer.glsl");
  const char *shader = code.c_str();
  glShaderSource(rayTracingComputeShader, 1, &shader, nullptr);
  glCompileShader(rayTracingComputeShader);

  glAttachShader(mTextureTracerProgram, rayTracingComputeShader);
  glLinkProgram(mTextureTracerProgram);

  mTextureTracerUniforms.uRectangleHeight =
      glGetUniformLocation(mTextureTracerProgram, "rectangleHeight");
  mTextureTracerUniforms.uShadowHeight =
      glGetUniformLocation(mTextureTracerProgram, "shadowHeight");
  mTextureTracerUniforms.uShadowLength =
      glGetUniformLocation(mTextureTracerProgram, "shadowLength");
  mTextureTracerUniforms.uXAxisScalingFactor =
      glGetUniformLocation(mTextureTracerProgram, "xAxisScalingFactor");

  glDetachShader(mTextureTracerProgram, rayTracingComputeShader);
  glDeleteShader(rayTracingComputeShader);
}

TextureTracer::TextureTracer()
    : mRNG(1L), mDistributionSun(
                    std::uniform_real_distribution<>(-SUN_RADIUS, SUN_RADIUS)),
      mDistributionWavelength(
          std::uniform_int_distribution<uint32_t>(380, 739)),
      mDistributionBoolean(std::bernoulli_distribution(0.5)) {
  glEnable(GL_DEBUG_OUTPUT);
  glDebugMessageCallback(MessageCallback, nullptr);

  initTextureTracer();
}

double raySphereDistance(glm::dvec2 origin, glm::dvec2 direction,
                         glm::dvec2 center, double radius) {
  glm::dvec2 m = origin - center;
  double b = glm::dot(m, direction);
  double c = glm::dot(m, m) - (radius * radius);
  if (c > 0.0 && b > 0.0)
    return -1.0;

  double discr = b * b - c;

  // A negative discriminant corresponds to ray missing sphere
  if (discr < 0.0)
    return -1.0;

  // Ray now found to intersect sphere, compute smallest t value of intersection
  return glm::max(0.0, -b - glm::sqrt(discr));
}

Photon TextureTracer::emitPhoton() {
  std::uniform_real_distribution<> distributionEarth(0.0, ATMO_HEIGHT);
  glm::dvec2 target = {0.0, RADIUS + distributionEarth(mRNG)};

  double d;
  do {
    d = glm::length(glm::dvec2(mDistributionSun(mRNG), mDistributionSun(mRNG)));
  } while (d > SUN_RADIUS);

  glm::dvec2 startPosition =
      glm::dvec2(-DIST_TO_SUN, mDistributionBoolean(mRNG) ? d : -d);
  glm::dvec2 direction = glm::normalize(target - startPosition);

  startPosition +=
      direction * raySphereDistance(startPosition, direction, {0.0, 0.0},
                                    RADIUS + ATMO_HEIGHT);

  return {glm::vec2(0.0, startPosition.y), glm::vec2(direction),
          mDistributionWavelength(mRNG), 1.0f};
}

std::vector<Photon> TextureTracer::generatePhotons(uint32_t count) {
  std::vector<Photon> photons(count);
  std::generate(photons.begin(), photons.end(),
                [this]() { return emitPhoton(); });
  return photons;
}

void TextureTracer::traceThroughTexture(uint32_t ssboPhotons,
                                        size_t numPhotons) {
  glUseProgram(mTextureTracerProgram);

  glUniform1f(mTextureTracerUniforms.uRectangleHeight,
              RADIUS_FACTORED / TEX_HEIGHT);

  const double shadowLength =
      TEX_SHADOW_LENGTH_FACTOR * (DIST_TO_SUN * RADIUS) / (SUN_RADIUS - RADIUS);

  glUniform1f(mTextureTracerUniforms.uShadowLength, shadowLength);
  glUniform1f(mTextureTracerUniforms.uShadowHeight, RADIUS_FACTORED);

  const double xAxisScalingFactor =
      glm::log(shadowLength) / glm::log(static_cast<double>(TEX_WIDTH));

  glUniform1f(mTextureTracerUniforms.uXAxisScalingFactor,
              static_cast<float>(xAxisScalingFactor));

  const uint32_t MIN_WAVELENGTH = 380u;
  const uint32_t MAX_WAVELENGTH = 740u;
  const uint32_t NUM_WAVELENGTHS = MAX_WAVELENGTH - MIN_WAVELENGTH;

  size_t pixelBufferSize =
      TEX_WIDTH * TEX_HEIGHT * NUM_WAVELENGTHS * sizeof(uint32_t);
  uint32_t ssboPixels;
  glGenBuffers(1, &ssboPixels);
  glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssboPixels);
  glBufferData(GL_SHADER_STORAGE_BUFFER, pixelBufferSize, nullptr,
               GL_DYNAMIC_COPY);

  glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ssboPhotons);
  glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, ssboPixels);

  const uint32_t numThreads = 32u;
  const uint32_t numBlocks = numPhotons / numThreads;
  std::cout << "numBlocks: " << numBlocks << std::endl;

  glDispatchComputeGroupSizeARB(numBlocks, 1, 1, numThreads, 1, 1);
  glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);

  struct Pixel {
    uint32_t intensityAtWavelengths[NUM_WAVELENGTHS];
  };

  std::vector<Pixel> pixels(TEX_WIDTH * TEX_HEIGHT);

  glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssboPixels);
  glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, pixelBufferSize,
                     pixels.data());
  glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);

  for (int y = 0; y < TEX_HEIGHT; ++y) {
    printf("%4i | ", y);
    for (int x = 0; x < TEX_WIDTH; ++x) {
      Pixel p = pixels[y * TEX_WIDTH + x];
      int counter = 0;
      for (uint32_t i : p.intensityAtWavelengths) {
        counter += i;
      }

      if (counter == 0) {
        printf("  ");
      } else if (counter > 100'000'000) {
        printf("%4s", "\u25A0");
      } else if (counter > 10'000'000) {
        printf("%4s", "\u25A3");
      } else if (counter > 1'000'000) {
        printf("%4s", "\u25A6");
      } else if (counter > 100'000) {
        printf("%4s", "\u25A4");
      } else {
        printf("%4s", "\u25A1");
      }
    }

    std::cout << std::endl;
  }

  glDeleteBuffers(1, &ssboPixels);

  glUseProgram(0);
}

uint32_t TextureTracer::createShadowMap(size_t numPhotons) {
  std::vector<Photon> photons = generatePhotons(numPhotons);

  uint32_t ssboPhotons;
  glGenBuffers(1, &ssboPhotons);
  glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssboPhotons);
  glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(Photon) * photons.size(),
               photons.data(), GL_DYNAMIC_COPY);

  traceThroughTexture(ssboPhotons, photons.size());

  glDeleteBuffers(1, &ssboPhotons);
  glDeleteProgram(mTextureTracerProgram);

  glDisable(GL_DEBUG_OUTPUT);
  glDebugMessageCallback(nullptr, nullptr);

  return 0;
}
}

main.cpp

#include <GL/glew.h>
#include <GL/glut.h>

#include "TextureTracer.hpp"

int main(int argc, char *argv[]) {
    glutInit(&argc, argv);
    glutCreateWindow("OpenGL needs a window o.O");
    glewInit();

    auto mapper = gpu::TextureTracer();

    // WITH 100'000 PHOTONS IT WORKS, WITH 1'000'000 PHOTONS NOT WHY?
    mapper.createShadowMap(100'000);

    return 0;
}

如果 GPU 程序执行时间过长,操作系统会取消它们的执行。在 Windows 上通常是两秒,在 Linux 上大部分时间是五秒,但可能会有所不同。

这是为了检测卡住的GPU程序并取消它们。有不同的方法来解决此超时,但它们都需要 admin/root 权限,这并不总是可用的。

如果可能,可以将执行拆分为多个调用,如以下代码段所示:

const uint32_t passSize   = 2048u;
const uint32_t numPasses  = (numPhotons / passSize) + 1;
const uint32_t numThreads = 64u;
const uint32_t numBlocks  = passSize / numThreads;

glUniform1ui(glGetUniformLocation(mTextureTracerProgram, "passSize"), passSize);
for (uint32_t pass = 0u; pass < numPasses; ++pass) {
  glUniform1ui(glGetUniformLocation(mTextureTracerProgram, "pass"), pass);

  glDispatchComputeGroupSizeARB(numBlocks, 1, 1, numThreads, 1, 1);
  glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
  glFlush();
  glFinish();
}

glFlush()glFinish() 调用很重要,否则执行将捆绑在一起并且 OS 无论如何都会触发超时。

在着色器中,您只需像这样访问输入数据的正确部分:

// other stuff

uniform uint pass;
uniform uint passSize;

void main() {
  uint gid = gl_GlobalInvocationID.x;
  uint passId = pass * passSize + gid;
  if (passId >= photons.length()) return;

  Photon photon = photons[passId];

  // rest of program
}

这就是全部。

如果您想禁用 OS 超时,这里是 Linux 的相关 post:

这是关于 Windows 的 post: