Cuda L2 传输开销
Cuda L2 transfer overhead
我有一个使用 atomicMin 来测试点渲染的内核。测试设置中包含大量的点,并采用理想情况下的内存布局。共有两个缓冲区:每个由 256 个 uint32 组成的簇,对应另一个缓冲区中的一个 uint32。
namespace Point
{
    // Bit-level view of one packed point. The seven fields add up to
    // exactly 32 bits (3 * 6 + 3 * 4 + 2).
    // NOTE(review): going by the names alone, x/y/z look like a quantized
    // position and nx/ny/nz like a quantized normal -- confirm with the
    // code that produces these records.
    struct PackedBitfield
    {
        glm::uint32_t x : 6, y : 6, z : 6;     // three 6-bit fields
        glm::uint32_t nx : 4, ny : 4, nz : 4;  // three 4-bit fields
        glm::uint32_t unused : 2;              // fills the remaining 2 bits
    };

    // One point record, readable either as a raw 32-bit word (`bits`) or
    // field by field (`field`). Explicitly aligned to 4 bytes.
    union __align__(4) Packed
    {
        glm::uint32_t bits;
        PackedBitfield field;
    };

    // Bit-level view of a cluster position: three 10-bit fields plus a
    // 2-bit `w`, 32 bits in total.
    struct ClusterPositionBitfield
    {
        glm::uint32_t x : 10, y : 10, z : 10;
        glm::uint32_t w : 2;
    };

    // One cluster position, readable as a raw 32-bit word or field by field.
    union ClusterPosition
    {
        glm::uint32_t bits;
        ClusterPositionBitfield field;
    };
}
//
// Point rendering kernel as posted in the question; the part of the body that
// consumes the loaded values is elided by the author (see the "..." line).
//
// launch with blockSize=(256, 1, 1) and grid=(numberOfClusters, 1, 1)
//
// Every thread of a block loads the same clusterPosition[blockIdx.x] and its
// own points[blockIdx.x*256 + threadIdx.x]. The literal 256 in the index ties
// the kernel to the 256-thread block size stated above.
// NOTE(review): no bounds check is visible; the buffers are assumed to hold
// at least gridDim.x cluster positions and gridDim.x*256 points -- confirm
// against the caller.
// NOTE(review): u_mvp, u_resolution and rasterBuffer are not referenced in
// the visible lines; presumably they are used by the elided part.
//
extern "C" __global__ void pointsRenderKernel(mat4 u_mvp,
ivec2 u_resolution,
uint64_t* rasterBuffer,
Point::Packed* points,
Point::ClusterPosition* clusterPosition)
{
// extract and compute world position
// One cluster position per block: indexed by blockIdx.x only.
const Point::ClusterPosition cPosition(clusterPosition[blockIdx.x]);
// One point per thread: consecutive threads read consecutive 32-bit elements.
const Point::Packed point(points[blockIdx.x*256 + threadIdx.x]);
...use points and write to buffer...
}
结果 SASS 如下所示:
查看内存分析器的输出:读取 Point::Packed* 缓冲区时的 L2 传输开销为 3.0。这是为什么呢?内存访问应该是完全对齐且连续的。另外,为什么(在 compute_50, sm_50 下)会自动生成 LDG 指令?我并不需要缓存这些数据。
在 L2 Transfer Overhead 的工具提示中,它表示它测量 "the number of bytes actually transferred between L1 and L2 for each requested byte in L1",并且还表示 "lower is better"。
在我的测试中,读取 Point::Packed 的 L2 传输开销是 1.0。
测试代码
namespace Point
{
    // Bit-level view of one packed point. The seven fields add up to
    // exactly 32 bits (3 * 6 + 3 * 4 + 2).
    // NOTE(review): going by the names alone, x/y/z look like a quantized
    // position and nx/ny/nz like a quantized normal -- confirm with the
    // code that produces these records.
    struct PackedBitfield
    {
        uint32_t x : 6, y : 6, z : 6;     // three 6-bit fields
        uint32_t nx : 4, ny : 4, nz : 4;  // three 4-bit fields
        uint32_t unused : 2;              // fills the remaining 2 bits
    };

    // One point record, readable either as a raw 32-bit word (`bits`) or
    // field by field (`field`). Explicitly aligned to 4 bytes.
    union __align__(4) Packed
    {
        uint32_t bits;
        PackedBitfield field;
    };

    // Bit-level view of a cluster position: three 10-bit fields plus a
    // 2-bit `w`, 32 bits in total.
    struct ClusterPositionBitfield
    {
        uint32_t x : 10, y : 10, z : 10;
        uint32_t w : 2;
    };

    // One cluster position, readable as a raw 32-bit word or field by field.
    union ClusterPosition
    {
        uint32_t bits;
        ClusterPositionBitfield field;
    };
}
// Profiling repro: copies every element of the first half of each buffer into
// the second half so that both the loads and the stores show up in the memory
// profiler.
//
// Launch layout: 1D grid, 1D block; one block per cluster, one thread per
// point. No shared memory is used.
//
// Preconditions (no bounds checks are performed):
//   points          holds at least 2 * gridDim.x * blockDim.x elements
//   clusterPosition holds at least 2 * gridDim.x elements
__global__ void pointsRenderKernel(Point::Packed* points, Point::ClusterPosition* clusterPosition)
{
    const int t_id = blockIdx.x * blockDim.x + threadIdx.x;

    // There is one cluster position per block, so the second half of the
    // buffer starts at gridDim.x. Offsetting by blockDim.x is only correct
    // when the block size happens to equal the grid size; otherwise it
    // writes out of bounds or into the half that is still being read.
    // Every thread of the block performs this same-value copy on purpose:
    // the block-wide load of one element is part of what is being profiled.
    clusterPosition[blockIdx.x + gridDim.x] = clusterPosition[blockIdx.x];

    // Consecutive threads copy consecutive 32-bit elements.
    points[t_id + blockDim.x * gridDim.x] = points[t_id];
}
// Host driver for the profiling repro: prints the sizes of the two packed
// types, allocates both device buffers with room for a source half and a
// destination half, launches pointsRenderKernel with one block per cluster,
// and waits for it to finish.
//
// Returns 0 on success and 1 if any CUDA call or the kernel itself failed.
int main()
{
    const int blockSize = 256;
    const int numberOfClusters = 256;
    std::cout << sizeof(Point::Packed) << std::endl;
    std::cout << sizeof(Point::ClusterPosition) << std::endl;

    // "* 2": the kernel reads the first half and writes the second half.
    const size_t pointBytes = sizeof(Point::Packed) * numberOfClusters * blockSize * 2;
    const size_t clusterBytes = sizeof(Point::ClusterPosition) * numberOfClusters * 2;

    // Start from null so the cudaFree calls below are safe on every path.
    Point::Packed* d_points = 0;
    Point::ClusterPosition* d_clusterPositions = 0;

    cudaError_t status = cudaMalloc(&d_points, pointBytes);
    if (status == cudaSuccess)
    {
        // Allocate into d_clusterPositions. Reusing &d_points here would leak
        // the first buffer and hand the kernel an uninitialized pointer.
        status = cudaMalloc(&d_clusterPositions, clusterBytes);
    }

    // Zero both buffers so the kernel's loads read defined values.
    if (status == cudaSuccess)
        status = cudaMemset(d_points, 0, pointBytes);
    if (status == cudaSuccess)
        status = cudaMemset(d_clusterPositions, 0, clusterBytes);

    if (status == cudaSuccess)
    {
        pointsRenderKernel<<<numberOfClusters, blockSize>>>(d_points, d_clusterPositions);
        // A launch does not return an error itself; configuration errors are
        // reported through cudaGetLastError().
        status = cudaGetLastError();
    }

    // Wait for the kernel so execution errors surface before the process exits.
    if (status == cudaSuccess)
        status = cudaDeviceSynchronize();

    if (status != cudaSuccess)
        std::cerr << "CUDA error: " << cudaGetErrorString(status) << std::endl;

    // cudaFree on a null pointer is a no-op, so this is valid after any failure.
    cudaFree(d_clusterPositions);
    cudaFree(d_points);

    return status == cudaSuccess ? 0 : 1;
}
更新
之前使用最新的驱动程序时,我在 Nsight 中遇到过一些其他问题。我把驱动程序降级到 CUDA 8.0.61 默认安装程序(可从此处下载)附带的版本后,问题得到了解决。安装程序附带的驱动版本是 376.51。测试环境为 64 位 Windows 10 和 Visual Studio 2015,Nsight 版本为 5.2,我的显卡的计算能力(compute capability)为 6.1。
这是我的完整编译器命令:
nvcc.exe -gencode=arch=compute_61,code=\"sm_61,compute_61\" --use-local-env --cl-version 2015 -Xcompiler "/wd 4819" -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\x86_amd64" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\include" -lineinfo --keep-dir x64\Release -maxrregcount=0 --machine 64 --compile -cudart static -DWIN32 -DWIN64 -DNDEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /O2 /FS /Zi /MD " -o x64\Release\kernel.cu.obj kernel.cu
更新 2
使用 sm_50,compute_50 选项编译时,我得到了相同的结果:L2 传输开销为 1.0。
我有一个使用 atomicMin 来测试点渲染的内核。测试设置中包含大量的点,并采用理想情况下的内存布局。共有两个缓冲区:每个由 256 个 uint32 组成的簇,对应另一个缓冲区中的一个 uint32。
namespace Point
{
    // Bit-level view of one packed point. The seven fields add up to
    // exactly 32 bits (3 * 6 + 3 * 4 + 2).
    // NOTE(review): going by the names alone, x/y/z look like a quantized
    // position and nx/ny/nz like a quantized normal -- confirm with the
    // code that produces these records.
    struct PackedBitfield
    {
        glm::uint32_t x : 6, y : 6, z : 6;     // three 6-bit fields
        glm::uint32_t nx : 4, ny : 4, nz : 4;  // three 4-bit fields
        glm::uint32_t unused : 2;              // fills the remaining 2 bits
    };

    // One point record, readable either as a raw 32-bit word (`bits`) or
    // field by field (`field`). Explicitly aligned to 4 bytes.
    union __align__(4) Packed
    {
        glm::uint32_t bits;
        PackedBitfield field;
    };

    // Bit-level view of a cluster position: three 10-bit fields plus a
    // 2-bit `w`, 32 bits in total.
    struct ClusterPositionBitfield
    {
        glm::uint32_t x : 10, y : 10, z : 10;
        glm::uint32_t w : 2;
    };

    // One cluster position, readable as a raw 32-bit word or field by field.
    union ClusterPosition
    {
        glm::uint32_t bits;
        ClusterPositionBitfield field;
    };
}
//
// Point rendering kernel as posted in the question; the part of the body that
// consumes the loaded values is elided by the author (see the "..." line).
//
// launch with blockSize=(256, 1, 1) and grid=(numberOfClusters, 1, 1)
//
// Every thread of a block loads the same clusterPosition[blockIdx.x] and its
// own points[blockIdx.x*256 + threadIdx.x]. The literal 256 in the index ties
// the kernel to the 256-thread block size stated above.
// NOTE(review): no bounds check is visible; the buffers are assumed to hold
// at least gridDim.x cluster positions and gridDim.x*256 points -- confirm
// against the caller.
// NOTE(review): u_mvp, u_resolution and rasterBuffer are not referenced in
// the visible lines; presumably they are used by the elided part.
//
extern "C" __global__ void pointsRenderKernel(mat4 u_mvp,
ivec2 u_resolution,
uint64_t* rasterBuffer,
Point::Packed* points,
Point::ClusterPosition* clusterPosition)
{
// extract and compute world position
// One cluster position per block: indexed by blockIdx.x only.
const Point::ClusterPosition cPosition(clusterPosition[blockIdx.x]);
// One point per thread: consecutive threads read consecutive 32-bit elements.
const Point::Packed point(points[blockIdx.x*256 + threadIdx.x]);
...use points and write to buffer...
}
结果 SASS 如下所示:
查看内存分析器的输出:读取 Point::Packed* 缓冲区时的 L2 传输开销为 3.0。这是为什么呢?内存访问应该是完全对齐且连续的。另外,为什么(在 compute_50, sm_50 下)会自动生成 LDG 指令?我并不需要缓存这些数据。
在 L2 Transfer Overhead 的工具提示中,它表示它测量 "the number of bytes actually transferred between L1 and L2 for each requested byte in L1",并且还表示 "lower is better"。
在我的测试中,读取 Point::Packed 的 L2 传输开销是 1.0。
测试代码
namespace Point
{
    // Bit-level view of one packed point. The seven fields add up to
    // exactly 32 bits (3 * 6 + 3 * 4 + 2).
    // NOTE(review): going by the names alone, x/y/z look like a quantized
    // position and nx/ny/nz like a quantized normal -- confirm with the
    // code that produces these records.
    struct PackedBitfield
    {
        uint32_t x : 6, y : 6, z : 6;     // three 6-bit fields
        uint32_t nx : 4, ny : 4, nz : 4;  // three 4-bit fields
        uint32_t unused : 2;              // fills the remaining 2 bits
    };

    // One point record, readable either as a raw 32-bit word (`bits`) or
    // field by field (`field`). Explicitly aligned to 4 bytes.
    union __align__(4) Packed
    {
        uint32_t bits;
        PackedBitfield field;
    };

    // Bit-level view of a cluster position: three 10-bit fields plus a
    // 2-bit `w`, 32 bits in total.
    struct ClusterPositionBitfield
    {
        uint32_t x : 10, y : 10, z : 10;
        uint32_t w : 2;
    };

    // One cluster position, readable as a raw 32-bit word or field by field.
    union ClusterPosition
    {
        uint32_t bits;
        ClusterPositionBitfield field;
    };
}
// Profiling repro: copies every element of the first half of each buffer into
// the second half so that both the loads and the stores show up in the memory
// profiler.
//
// Launch layout: 1D grid, 1D block; one block per cluster, one thread per
// point. No shared memory is used.
//
// Preconditions (no bounds checks are performed):
//   points          holds at least 2 * gridDim.x * blockDim.x elements
//   clusterPosition holds at least 2 * gridDim.x elements
__global__ void pointsRenderKernel(Point::Packed* points, Point::ClusterPosition* clusterPosition)
{
    const int t_id = blockIdx.x * blockDim.x + threadIdx.x;

    // There is one cluster position per block, so the second half of the
    // buffer starts at gridDim.x. Offsetting by blockDim.x is only correct
    // when the block size happens to equal the grid size; otherwise it
    // writes out of bounds or into the half that is still being read.
    // Every thread of the block performs this same-value copy on purpose:
    // the block-wide load of one element is part of what is being profiled.
    clusterPosition[blockIdx.x + gridDim.x] = clusterPosition[blockIdx.x];

    // Consecutive threads copy consecutive 32-bit elements.
    points[t_id + blockDim.x * gridDim.x] = points[t_id];
}
// Host driver for the profiling repro: prints the sizes of the two packed
// types, allocates both device buffers with room for a source half and a
// destination half, launches pointsRenderKernel with one block per cluster,
// and waits for it to finish.
//
// Returns 0 on success and 1 if any CUDA call or the kernel itself failed.
int main()
{
    const int blockSize = 256;
    const int numberOfClusters = 256;
    std::cout << sizeof(Point::Packed) << std::endl;
    std::cout << sizeof(Point::ClusterPosition) << std::endl;

    // "* 2": the kernel reads the first half and writes the second half.
    const size_t pointBytes = sizeof(Point::Packed) * numberOfClusters * blockSize * 2;
    const size_t clusterBytes = sizeof(Point::ClusterPosition) * numberOfClusters * 2;

    // Start from null so the cudaFree calls below are safe on every path.
    Point::Packed* d_points = 0;
    Point::ClusterPosition* d_clusterPositions = 0;

    cudaError_t status = cudaMalloc(&d_points, pointBytes);
    if (status == cudaSuccess)
    {
        // Allocate into d_clusterPositions. Reusing &d_points here would leak
        // the first buffer and hand the kernel an uninitialized pointer.
        status = cudaMalloc(&d_clusterPositions, clusterBytes);
    }

    // Zero both buffers so the kernel's loads read defined values.
    if (status == cudaSuccess)
        status = cudaMemset(d_points, 0, pointBytes);
    if (status == cudaSuccess)
        status = cudaMemset(d_clusterPositions, 0, clusterBytes);

    if (status == cudaSuccess)
    {
        pointsRenderKernel<<<numberOfClusters, blockSize>>>(d_points, d_clusterPositions);
        // A launch does not return an error itself; configuration errors are
        // reported through cudaGetLastError().
        status = cudaGetLastError();
    }

    // Wait for the kernel so execution errors surface before the process exits.
    if (status == cudaSuccess)
        status = cudaDeviceSynchronize();

    if (status != cudaSuccess)
        std::cerr << "CUDA error: " << cudaGetErrorString(status) << std::endl;

    // cudaFree on a null pointer is a no-op, so this is valid after any failure.
    cudaFree(d_clusterPositions);
    cudaFree(d_points);

    return status == cudaSuccess ? 0 : 1;
}
更新
之前使用最新的驱动程序时,我在 Nsight 中遇到过一些其他问题。我把驱动程序降级到 CUDA 8.0.61 默认安装程序(可从此处下载)附带的版本后,问题得到了解决。安装程序附带的驱动版本是 376.51。测试环境为 64 位 Windows 10 和 Visual Studio 2015,Nsight 版本为 5.2,我的显卡的计算能力(compute capability)为 6.1。
这是我的完整编译器命令:
nvcc.exe -gencode=arch=compute_61,code=\"sm_61,compute_61\" --use-local-env --cl-version 2015 -Xcompiler "/wd 4819" -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\x86_amd64" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\include" -lineinfo --keep-dir x64\Release -maxrregcount=0 --machine 64 --compile -cudart static -DWIN32 -DWIN64 -DNDEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /O2 /FS /Zi /MD " -o x64\Release\kernel.cu.obj kernel.cu
更新 2
使用 sm_50,compute_50 选项编译时,我得到了相同的结果:L2 传输开销为 1.0。