如何修复 "Buffer preloading failed" 计算着色器性能问题?
How to fix "Buffer preloading failed" compute shader performance issue?
从我的应用程序捕获 GPU 帧时,我在 Pipeline Statistics -> Remarks 下看到以下消息:
Buffer preloading failed
Make sure your data size is a multiple of 4 bytes and aligned to 4 bytes and try using a simple access pattern. For constant buffers, try using a fixed buffer size.
pointLightBufferCenterAndRadius could not be promoted - lightCuller.metal:light_culler
这是我的缓冲区初始化:
// Number of Vec4 entries the point light buffer is sized for.
const int MaxLights = 2048;
// The buffer is created at its full, fixed byte length: one Vec4 per possible light.
const NSUInteger pointLightBufferLength = MaxLights * sizeof( Vec4 );
pointLightCenterAndRadiusBuffer = [GfxDevice::GetMetalDevice() newBufferWithLength:pointLightBufferLength
options:MTLResourceCPUCacheModeDefaultCache];
// Give the buffer a readable name.
[pointLightCenterAndRadiusBuffer setLabel:@"pointLightCenterAndRadiusBuffer"];
这是我的着色器的相关部分:
// Compute kernel that builds, for each screen tile (one threadgroup handles one tile), a list of
// the point lights and spot lights whose bounding spheres pass the tile's depth-range test and
// its four side-plane tests.
//
// Bindings:
//   texture(0) depthNormalsTexture             - read-only; only the .x channel is read here, as depth.
//   buffer(0)  uniforms                        - windowWidth/windowHeight, clipToView, localToView,
//                                                numLights (point light count in the low 16 bits,
//                                                spot light count in the high 16 bits) and
//                                                maxNumLightsPerTile.
//   buffer(1)  pointLightBufferCenterAndRadius - one float4 per point light: xyz = center, w = radius.
//   buffer(2)  perTileLightIndexBufferOut      - output; each tile's region starts at
//                                                maxNumLightsPerTile * tileIdxFlattened.
//   buffer(3)  spotLightBufferCenterAndRadius  - one float4 per spot light: xyz = center, w = radius.
//
// Output layout per tile: [point light indices][sentinel][spot light indices][sentinel].
//
// NOTE(review): the indexing below assumes a TILE_RES x TILE_RES threadgroup and that
// NUM_THREADS_PER_TILE equals the threadgroup's thread count - confirm against the dispatch code.
kernel void light_culler(texture2d<float, access::read> depthNormalsTexture [[texture(0)]],
constant Uniforms& uniforms [[ buffer(0) ]],
constant float4* pointLightBufferCenterAndRadius [[ buffer(1) ]],
device uint* perTileLightIndexBufferOut [[ buffer(2) ]],
constant float4* spotLightBufferCenterAndRadius [[ buffer(3) ]],
uint2 gid [[thread_position_in_grid]],
uint2 tid [[thread_position_in_threadgroup]],
uint2 dtid [[threadgroup_position_in_grid]])
{
// Threadgroup-shared state: the tile's light index list, the tile's depth bounds (kept as float
// bit patterns so integer atomics can update them) and the number of entries in the list.
threadgroup uint ldsLightIdx[ MAX_NUM_LIGHTS_PER_TILE ];
threadgroup atomic_uint ldsZMax;
threadgroup atomic_uint ldsZMin;
threadgroup atomic_uint ldsLightIdxCounter;
uint2 globalIdx = gid;
uint2 localIdx = tid;
uint2 groupIdx = dtid;
// Linear index of this thread inside its tile, and linear index of the tile itself.
uint localIdxFlattened = localIdx.x + localIdx.y * TILE_RES;
uint tileIdxFlattened = groupIdx.x + groupIdx.y * GetNumTilesX( uniforms.windowWidth );
// Only the first thread of the tile resets the shared state; the threadgroup barrier further
// down is reached by every thread before any of them touches these values.
if (localIdxFlattened == 0)
{
atomic_store_explicit( &ldsZMin, 0x7f7fffff, memory_order_relaxed ); // FLT_MAX as uint
atomic_store_explicit( &ldsZMax, 0, memory_order_relaxed );
atomic_store_explicit( &ldsLightIdxCounter, 0, memory_order_relaxed );
}
// One plane per tile edge, built from the tile's four corners.
float4 frustumEqn[ 4 ];
{
// Tile bounds in pixels: (pxm, pym) is the first pixel of this tile, (pxp, pyp) the first pixel
// of the next tile in x and y.
uint pxm = TILE_RES * groupIdx.x;
uint pym = TILE_RES * groupIdx.y;
uint pxp = TILE_RES * (groupIdx.x + 1);
uint pyp = TILE_RES * (groupIdx.y + 1);
// Window size expressed in whole tiles (TILE_RES times the tile counts from GetNumTilesX/Y).
float winWidth = float( TILE_RES * GetNumTilesX( uniforms.windowWidth ) );
float winHeight = float( TILE_RES * GetNumTilesY( uniforms.windowHeight) );
// Corners mapped to [-1, 1] with z = 1 and w = 1. Y is flipped: tile row 0 maps to y = +1.
float4 v0 = float4( pxm / winWidth * 2.0f - 1.0f, (winHeight - pym) / winHeight * 2.0f - 1.0f, 1.0f, 1.0f );
float4 v1 = float4( pxp / winWidth * 2.0f - 1.0f, (winHeight - pym) / winHeight * 2.0f - 1.0f, 1.0f, 1.0f );
float4 v2 = float4( pxp / winWidth * 2.0f - 1.0f, (winHeight - pyp) / winHeight * 2.0f - 1.0f, 1.0f, 1.0f );
float4 v3 = float4( pxm / winWidth * 2.0f - 1.0f, (winHeight - pyp) / winHeight * 2.0f - 1.0f, 1.0f, 1.0f );
float4 frustum[ 4 ];
frustum[ 0 ] = ConvertClipToView( v0, uniforms.clipToView );
frustum[ 1 ] = ConvertClipToView( v1, uniforms.clipToView );
frustum[ 2 ] = ConvertClipToView( v2, uniforms.clipToView );
frustum[ 3 ] = ConvertClipToView( v3, uniforms.clipToView );
// Each plane is created from a pair of adjacent corners; (i + 1) & 3 wraps corner 3 back to 0.
for (uint i = 0; i < 4; ++i)
{
frustumEqn[ i ] = CreatePlaneEquation( frustum[ i ], frustum[ (i + 1) & 3 ] );
}
}
// All threads wait here, so the reset done by thread 0 precedes the atomic min/max below.
threadgroup_barrier( mem_flags::mem_threadgroup );
// These initial values are placeholders; both variables are overwritten after the reduction.
float minZ = FLT_MAX;
float maxZ = 0.0f;
// Depth of this thread's pixel, reinterpreted bitwise as uint for the integer atomics.
// NOTE(review): comparing the bit patterns as uint presumably relies on depth being
// non-negative - confirm the value range stored in the texture.
float depth = depthNormalsTexture.read( globalIdx.xy ).x;
uint z = as_type< uint >( depth );
// Pixels whose depth is exactly 0 do not contribute to the tile's depth bounds.
if (depth != 0.0f)
{
/*uint i =*/ atomic_fetch_min_explicit( &ldsZMin, z, memory_order::memory_order_relaxed );
/*uint j =*/ atomic_fetch_max_explicit( &ldsZMax, z, memory_order::memory_order_relaxed );
}
// Wait until every thread has contributed its depth before reading the reduced bounds.
threadgroup_barrier( mem_flags::mem_threadgroup );
uint zMin = atomic_load_explicit( &ldsZMin, memory_order::memory_order_relaxed );
uint zMax = atomic_load_explicit( &ldsZMax, memory_order::memory_order_relaxed );
// NOTE(review): minZ receives the tile's maximum and maxZ the tile's minimum. Confirm this
// cross-assignment is intentional; the depth tests below depend on it.
minZ = as_type< float >( zMax );
maxZ = as_type< float >( zMin );
// Point lights: the count is stored in the low 16 bits of numLights.
int numPointLights = uniforms.numLights & 0xFFFFu;
// Each thread tests lights localIdxFlattened, localIdxFlattened + NUM_THREADS_PER_TILE, ...
for (int i = 0; i < numPointLights; i += NUM_THREADS_PER_TILE)
{
int il = localIdxFlattened + i;
if (il < numPointLights)
{
float4 center = pointLightBufferCenterAndRadius[ il ];
float radius = center.w;
// Transform the center with localToView before testing it against the tile.
center.xyz = (uniforms.localToView * float4( center.xyz, 1.0f ) ).xyz;
// First the depth-range test, then the four side planes; the light is kept only if all pass.
if (-center.z + minZ < radius && center.z - maxZ < radius)
{
if ((GetSignedDistanceFromPlane( center, frustumEqn[ 0 ] ) < radius) &&
(GetSignedDistanceFromPlane( center, frustumEqn[ 1 ] ) < radius) &&
(GetSignedDistanceFromPlane( center, frustumEqn[ 2 ] ) < radius) &&
(GetSignedDistanceFromPlane( center, frustumEqn[ 3 ] ) < radius))
{
// do a thread-safe increment of the list counter
// and put the index of this light into the list
// NOTE(review): dstIdx is not checked against MAX_NUM_LIGHTS_PER_TILE, so this write goes past
// the end of ldsLightIdx if more lights pass than the array holds - confirm the limits.
int dstIdx = atomic_fetch_add_explicit( &ldsLightIdxCounter, 1, memory_order::memory_order_relaxed );
ldsLightIdx[ dstIdx ] = il;
}
}
}
}
// After this barrier the counter holds the number of point lights accepted for the tile;
// it is captured now because the spot light pass keeps incrementing the same counter.
threadgroup_barrier( mem_flags::mem_threadgroup );
int numPointLightsInThisTile = atomic_load_explicit( &ldsLightIdxCounter, memory_order::memory_order_relaxed );
// Spot lights.
// The count is stored in the high 16 bits of numLights. Accepted spot lights are appended to
// the same shared list, after the point lights.
int numSpotLights = (uniforms.numLights & 0xFFFF0000u) >> 16;
for (int i = 0; i < numSpotLights; i += NUM_THREADS_PER_TILE)
{
int il = localIdxFlattened + i;
if (il < numSpotLights)
{
float4 center = spotLightBufferCenterAndRadius[ il ];
float radius = center.w * 5.0f; // FIXME: Multiply was added, but more clever culling should be done instead.
center.xyz = (uniforms.localToView * float4( center.xyz, 1.0f )).xyz;
// Same depth-range and side-plane tests as for point lights, using the enlarged radius.
if (-center.z + minZ < radius && center.z - maxZ < radius)
{
if ((GetSignedDistanceFromPlane( center, frustumEqn[ 0 ] ) < radius) &&
(GetSignedDistanceFromPlane( center, frustumEqn[ 1 ] ) < radius) &&
(GetSignedDistanceFromPlane( center, frustumEqn[ 2 ] ) < radius) &&
(GetSignedDistanceFromPlane( center, frustumEqn[ 3 ] ) < radius))
{
// NOTE(review): same unchecked write into ldsLightIdx as in the point light pass.
int dstIdx = atomic_fetch_add_explicit( &ldsLightIdxCounter, 1, memory_order::memory_order_relaxed );
ldsLightIdx[ dstIdx ] = il;
}
}
}
}
// Wait until the shared list is complete before copying it to the output buffer.
threadgroup_barrier( mem_flags::mem_threadgroup );
{ // write back
// NOTE(review): up to (total accepted lights + 2) entries are written from startOffset on;
// this assumes maxNumLightsPerTile leaves room for them - confirm.
int startOffset = uniforms.maxNumLightsPerTile * tileIdxFlattened;
// Point light section: the threads of the tile copy the entries in a strided fashion.
for (int i = localIdxFlattened; i < numPointLightsInThisTile; i += NUM_THREADS_PER_TILE)
{
// per-tile list of light indices
perTileLightIndexBufferOut[ startOffset + i ] = ldsLightIdx[ i ];
}
// Spot light section: entries [numPointLightsInThisTile, jMax) of the shared list. The "+ 1"
// on the destination skips the sentinel slot that follows the point light section.
int jMax = atomic_load_explicit( &ldsLightIdxCounter, memory_order::memory_order_relaxed );
for (int j = localIdxFlattened + numPointLightsInThisTile; j < jMax; j += NUM_THREADS_PER_TILE)
{
// per-tile list of light indices
perTileLightIndexBufferOut[ startOffset + j + 1 ] = ldsLightIdx[ j ];
}
// A single thread writes the two sentinels: one after the point lights, one after the spot lights.
if (localIdxFlattened == 0)
{
perTileLightIndexBufferOut[ startOffset + numPointLightsInThisTile ] = LIGHT_INDEX_BUFFER_SENTINEL;
int offs = atomic_load_explicit( &ldsLightIdxCounter, memory_order::memory_order_relaxed );
perTileLightIndexBufferOut[ startOffset + offs + 1 ] = LIGHT_INDEX_BUFFER_SENTINEL;
}
}
}
我正在使用 iOS 11.4 和 Xcode 9.4 在 iPad Pro 10.5" 上调试应用程序。如何修复警告?
我还尝试过将该缓冲区参数的类型从 constant float4* 更改为 constant PointLight& pointLightBufferCenterAndRadius(其中 PointLight 定义为 struct PointLight { float4 d[ 2048 ]; }),正如 Apple 的 Metal WWDC 演讲中所建议的那样。
此警告通常并不意味着灾难性的性能损失,因此消除它可能不会带来明显的收益;优化内核的其他部分或许更值得。
避免出现此警告的主要方法,是在顶点着色器或计算内核中通过 [[ stage_in ]] 输入来获取每个顶点/线程的数据。但这并不总是可行,具体取决于所使用的算法,因为你的算法未必像 [[ stage_in ]] 输入那样按顺序("in order")访问数据。
从我的应用程序捕获 GPU 帧时,我在 Pipeline Statistics -> Remarks 下看到以下消息:
Buffer preloading failed
Make sure your data size is a multiple of 4 bytes and aligned to 4 bytes and try using a simple access pattern. For constant buffers, try using a fixed buffer size.
pointLightBufferCenterAndRadius could not be promoted - lightCuller.metal:light_culler
这是我的缓冲区初始化:
// Number of Vec4 entries the point light buffer is sized for.
const int MaxLights = 2048;
// The buffer is created at its full, fixed byte length: one Vec4 per possible light.
const NSUInteger pointLightBufferLength = MaxLights * sizeof( Vec4 );
pointLightCenterAndRadiusBuffer = [GfxDevice::GetMetalDevice() newBufferWithLength:pointLightBufferLength
options:MTLResourceCPUCacheModeDefaultCache];
// Give the buffer a readable name.
[pointLightCenterAndRadiusBuffer setLabel:@"pointLightCenterAndRadiusBuffer"];
这是我的着色器的相关部分:
// Compute kernel that builds, for each screen tile (one threadgroup handles one tile), a list of
// the point lights and spot lights whose bounding spheres pass the tile's depth-range test and
// its four side-plane tests.
//
// Bindings:
//   texture(0) depthNormalsTexture             - read-only; only the .x channel is read here, as depth.
//   buffer(0)  uniforms                        - windowWidth/windowHeight, clipToView, localToView,
//                                                numLights (point light count in the low 16 bits,
//                                                spot light count in the high 16 bits) and
//                                                maxNumLightsPerTile.
//   buffer(1)  pointLightBufferCenterAndRadius - one float4 per point light: xyz = center, w = radius.
//   buffer(2)  perTileLightIndexBufferOut      - output; each tile's region starts at
//                                                maxNumLightsPerTile * tileIdxFlattened.
//   buffer(3)  spotLightBufferCenterAndRadius  - one float4 per spot light: xyz = center, w = radius.
//
// Output layout per tile: [point light indices][sentinel][spot light indices][sentinel].
//
// NOTE(review): the indexing below assumes a TILE_RES x TILE_RES threadgroup and that
// NUM_THREADS_PER_TILE equals the threadgroup's thread count - confirm against the dispatch code.
kernel void light_culler(texture2d<float, access::read> depthNormalsTexture [[texture(0)]],
constant Uniforms& uniforms [[ buffer(0) ]],
constant float4* pointLightBufferCenterAndRadius [[ buffer(1) ]],
device uint* perTileLightIndexBufferOut [[ buffer(2) ]],
constant float4* spotLightBufferCenterAndRadius [[ buffer(3) ]],
uint2 gid [[thread_position_in_grid]],
uint2 tid [[thread_position_in_threadgroup]],
uint2 dtid [[threadgroup_position_in_grid]])
{
// Threadgroup-shared state: the tile's light index list, the tile's depth bounds (kept as float
// bit patterns so integer atomics can update them) and the number of entries in the list.
threadgroup uint ldsLightIdx[ MAX_NUM_LIGHTS_PER_TILE ];
threadgroup atomic_uint ldsZMax;
threadgroup atomic_uint ldsZMin;
threadgroup atomic_uint ldsLightIdxCounter;
uint2 globalIdx = gid;
uint2 localIdx = tid;
uint2 groupIdx = dtid;
// Linear index of this thread inside its tile, and linear index of the tile itself.
uint localIdxFlattened = localIdx.x + localIdx.y * TILE_RES;
uint tileIdxFlattened = groupIdx.x + groupIdx.y * GetNumTilesX( uniforms.windowWidth );
// Only the first thread of the tile resets the shared state; the threadgroup barrier further
// down is reached by every thread before any of them touches these values.
if (localIdxFlattened == 0)
{
atomic_store_explicit( &ldsZMin, 0x7f7fffff, memory_order_relaxed ); // FLT_MAX as uint
atomic_store_explicit( &ldsZMax, 0, memory_order_relaxed );
atomic_store_explicit( &ldsLightIdxCounter, 0, memory_order_relaxed );
}
// One plane per tile edge, built from the tile's four corners.
float4 frustumEqn[ 4 ];
{
// Tile bounds in pixels: (pxm, pym) is the first pixel of this tile, (pxp, pyp) the first pixel
// of the next tile in x and y.
uint pxm = TILE_RES * groupIdx.x;
uint pym = TILE_RES * groupIdx.y;
uint pxp = TILE_RES * (groupIdx.x + 1);
uint pyp = TILE_RES * (groupIdx.y + 1);
// Window size expressed in whole tiles (TILE_RES times the tile counts from GetNumTilesX/Y).
float winWidth = float( TILE_RES * GetNumTilesX( uniforms.windowWidth ) );
float winHeight = float( TILE_RES * GetNumTilesY( uniforms.windowHeight) );
// Corners mapped to [-1, 1] with z = 1 and w = 1. Y is flipped: tile row 0 maps to y = +1.
float4 v0 = float4( pxm / winWidth * 2.0f - 1.0f, (winHeight - pym) / winHeight * 2.0f - 1.0f, 1.0f, 1.0f );
float4 v1 = float4( pxp / winWidth * 2.0f - 1.0f, (winHeight - pym) / winHeight * 2.0f - 1.0f, 1.0f, 1.0f );
float4 v2 = float4( pxp / winWidth * 2.0f - 1.0f, (winHeight - pyp) / winHeight * 2.0f - 1.0f, 1.0f, 1.0f );
float4 v3 = float4( pxm / winWidth * 2.0f - 1.0f, (winHeight - pyp) / winHeight * 2.0f - 1.0f, 1.0f, 1.0f );
float4 frustum[ 4 ];
frustum[ 0 ] = ConvertClipToView( v0, uniforms.clipToView );
frustum[ 1 ] = ConvertClipToView( v1, uniforms.clipToView );
frustum[ 2 ] = ConvertClipToView( v2, uniforms.clipToView );
frustum[ 3 ] = ConvertClipToView( v3, uniforms.clipToView );
// Each plane is created from a pair of adjacent corners; (i + 1) & 3 wraps corner 3 back to 0.
for (uint i = 0; i < 4; ++i)
{
frustumEqn[ i ] = CreatePlaneEquation( frustum[ i ], frustum[ (i + 1) & 3 ] );
}
}
// All threads wait here, so the reset done by thread 0 precedes the atomic min/max below.
threadgroup_barrier( mem_flags::mem_threadgroup );
// These initial values are placeholders; both variables are overwritten after the reduction.
float minZ = FLT_MAX;
float maxZ = 0.0f;
// Depth of this thread's pixel, reinterpreted bitwise as uint for the integer atomics.
// NOTE(review): comparing the bit patterns as uint presumably relies on depth being
// non-negative - confirm the value range stored in the texture.
float depth = depthNormalsTexture.read( globalIdx.xy ).x;
uint z = as_type< uint >( depth );
// Pixels whose depth is exactly 0 do not contribute to the tile's depth bounds.
if (depth != 0.0f)
{
/*uint i =*/ atomic_fetch_min_explicit( &ldsZMin, z, memory_order::memory_order_relaxed );
/*uint j =*/ atomic_fetch_max_explicit( &ldsZMax, z, memory_order::memory_order_relaxed );
}
// Wait until every thread has contributed its depth before reading the reduced bounds.
threadgroup_barrier( mem_flags::mem_threadgroup );
uint zMin = atomic_load_explicit( &ldsZMin, memory_order::memory_order_relaxed );
uint zMax = atomic_load_explicit( &ldsZMax, memory_order::memory_order_relaxed );
// NOTE(review): minZ receives the tile's maximum and maxZ the tile's minimum. Confirm this
// cross-assignment is intentional; the depth tests below depend on it.
minZ = as_type< float >( zMax );
maxZ = as_type< float >( zMin );
// Point lights: the count is stored in the low 16 bits of numLights.
int numPointLights = uniforms.numLights & 0xFFFFu;
// Each thread tests lights localIdxFlattened, localIdxFlattened + NUM_THREADS_PER_TILE, ...
for (int i = 0; i < numPointLights; i += NUM_THREADS_PER_TILE)
{
int il = localIdxFlattened + i;
if (il < numPointLights)
{
float4 center = pointLightBufferCenterAndRadius[ il ];
float radius = center.w;
// Transform the center with localToView before testing it against the tile.
center.xyz = (uniforms.localToView * float4( center.xyz, 1.0f ) ).xyz;
// First the depth-range test, then the four side planes; the light is kept only if all pass.
if (-center.z + minZ < radius && center.z - maxZ < radius)
{
if ((GetSignedDistanceFromPlane( center, frustumEqn[ 0 ] ) < radius) &&
(GetSignedDistanceFromPlane( center, frustumEqn[ 1 ] ) < radius) &&
(GetSignedDistanceFromPlane( center, frustumEqn[ 2 ] ) < radius) &&
(GetSignedDistanceFromPlane( center, frustumEqn[ 3 ] ) < radius))
{
// do a thread-safe increment of the list counter
// and put the index of this light into the list
// NOTE(review): dstIdx is not checked against MAX_NUM_LIGHTS_PER_TILE, so this write goes past
// the end of ldsLightIdx if more lights pass than the array holds - confirm the limits.
int dstIdx = atomic_fetch_add_explicit( &ldsLightIdxCounter, 1, memory_order::memory_order_relaxed );
ldsLightIdx[ dstIdx ] = il;
}
}
}
}
// After this barrier the counter holds the number of point lights accepted for the tile;
// it is captured now because the spot light pass keeps incrementing the same counter.
threadgroup_barrier( mem_flags::mem_threadgroup );
int numPointLightsInThisTile = atomic_load_explicit( &ldsLightIdxCounter, memory_order::memory_order_relaxed );
// Spot lights.
// The count is stored in the high 16 bits of numLights. Accepted spot lights are appended to
// the same shared list, after the point lights.
int numSpotLights = (uniforms.numLights & 0xFFFF0000u) >> 16;
for (int i = 0; i < numSpotLights; i += NUM_THREADS_PER_TILE)
{
int il = localIdxFlattened + i;
if (il < numSpotLights)
{
float4 center = spotLightBufferCenterAndRadius[ il ];
float radius = center.w * 5.0f; // FIXME: Multiply was added, but more clever culling should be done instead.
center.xyz = (uniforms.localToView * float4( center.xyz, 1.0f )).xyz;
// Same depth-range and side-plane tests as for point lights, using the enlarged radius.
if (-center.z + minZ < radius && center.z - maxZ < radius)
{
if ((GetSignedDistanceFromPlane( center, frustumEqn[ 0 ] ) < radius) &&
(GetSignedDistanceFromPlane( center, frustumEqn[ 1 ] ) < radius) &&
(GetSignedDistanceFromPlane( center, frustumEqn[ 2 ] ) < radius) &&
(GetSignedDistanceFromPlane( center, frustumEqn[ 3 ] ) < radius))
{
// NOTE(review): same unchecked write into ldsLightIdx as in the point light pass.
int dstIdx = atomic_fetch_add_explicit( &ldsLightIdxCounter, 1, memory_order::memory_order_relaxed );
ldsLightIdx[ dstIdx ] = il;
}
}
}
}
// Wait until the shared list is complete before copying it to the output buffer.
threadgroup_barrier( mem_flags::mem_threadgroup );
{ // write back
// NOTE(review): up to (total accepted lights + 2) entries are written from startOffset on;
// this assumes maxNumLightsPerTile leaves room for them - confirm.
int startOffset = uniforms.maxNumLightsPerTile * tileIdxFlattened;
// Point light section: the threads of the tile copy the entries in a strided fashion.
for (int i = localIdxFlattened; i < numPointLightsInThisTile; i += NUM_THREADS_PER_TILE)
{
// per-tile list of light indices
perTileLightIndexBufferOut[ startOffset + i ] = ldsLightIdx[ i ];
}
// Spot light section: entries [numPointLightsInThisTile, jMax) of the shared list. The "+ 1"
// on the destination skips the sentinel slot that follows the point light section.
int jMax = atomic_load_explicit( &ldsLightIdxCounter, memory_order::memory_order_relaxed );
for (int j = localIdxFlattened + numPointLightsInThisTile; j < jMax; j += NUM_THREADS_PER_TILE)
{
// per-tile list of light indices
perTileLightIndexBufferOut[ startOffset + j + 1 ] = ldsLightIdx[ j ];
}
// A single thread writes the two sentinels: one after the point lights, one after the spot lights.
if (localIdxFlattened == 0)
{
perTileLightIndexBufferOut[ startOffset + numPointLightsInThisTile ] = LIGHT_INDEX_BUFFER_SENTINEL;
int offs = atomic_load_explicit( &ldsLightIdxCounter, memory_order::memory_order_relaxed );
perTileLightIndexBufferOut[ startOffset + offs + 1 ] = LIGHT_INDEX_BUFFER_SENTINEL;
}
}
}
我正在使用 iOS 11.4 和 Xcode 9.4 在 iPad Pro 10.5" 上调试应用程序。如何修复警告?
我还尝试过将该缓冲区参数的类型从 constant float4* 更改为 constant PointLight& pointLightBufferCenterAndRadius(其中 PointLight 定义为 struct PointLight { float4 d[ 2048 ]; }),正如 Apple 的 Metal WWDC 演讲中所建议的那样。
此警告通常并不意味着灾难性的性能损失,因此消除它可能不会带来明显的收益;优化内核的其他部分或许更值得。
避免出现此警告的主要方法,是在顶点着色器或计算内核中通过 [[ stage_in ]] 输入来获取每个顶点/线程的数据。但这并不总是可行,具体取决于所使用的算法,因为你的算法未必像 [[ stage_in ]] 输入那样按顺序("in order")访问数据。