将 Directx 12 纹理导入 cuda 表面:列好,行搞砸了
Importing Directx 12 texture to cuda surface: columns OK, rows screwed
我正在尝试从 cuda 编写 D3D12 纹理。为此,我将 D3D 纹理映射到 mipmap,然后映射到 cudaArray。我可以看到预期的结果模数很多列都丢失了。我错过了什么?
完整来源:https://github.com/mprevot/CudaD3D12Update
D2D12部分:
TextureChannels = 4;
TextureWidth = m_width;
TextureHeight = m_height;
const auto textureSurface = TextureWidth * TextureHeight;
const auto texturePixels = textureSurface * TextureChannels;
const auto textureSizeBytes = sizeof(float)* texturePixels;
D3D12_RESOURCE_DESC d3dTexDesc{};
d3dTexDesc.MipLevels = 1;
d3dTexDesc.Format = TextureChannels == 4 ? DXGI_FORMAT_R32G32B32A32_FLOAT : DXGI_FORMAT_R32G32B32_FLOAT;
d3dTexDesc.Width = TextureWidth;
d3dTexDesc.Height = TextureHeight;
d3dTexDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
d3dTexDesc.DepthOrArraySize = 1;
d3dTexDesc.SampleDesc.Count = 1;
d3dTexDesc.SampleDesc.Quality = 0;
d3dTexDesc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D;
ThrowIfFailed(m_device->CreateCommittedResource(&CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT), D3D12_HEAP_FLAG_SHARED,
&d3dTexDesc, D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE, nullptr, IID_PPV_ARGS(&TextureArray)));
NAME_D3D12_OBJECT(TextureArray);
Import/mapping部分:
HANDLE sharedHandle{};
WindowsSecurityAttributes secAttr{};
LPCWSTR name{};
ThrowIfFailed(m_device->CreateSharedHandle(TextureArray.Get(), &secAttr, GENERIC_ALL, name, &sharedHandle));
const auto allocInfo = m_device->GetResourceAllocationInfo(m_nodeMask, 1, &d3dTexDesc);
cudaExternalMemoryHandleDesc cuExtmemHandleDesc{};
cuExtmemHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
cuExtmemHandleDesc.handle.win32.handle = sharedHandle;
cuExtmemHandleDesc.size = allocInfo.SizeInBytes;
cuExtmemHandleDesc.flags = cudaExternalMemoryDedicated;
CheckCudaErrors(cudaImportExternalMemory(&m_externalMemory, &cuExtmemHandleDesc));
cudaExternalMemoryMipmappedArrayDesc cuExtmemMipDesc{};
cuExtmemMipDesc.extent = make_cudaExtent(TextureWidth, TextureHeight, 0);
cuExtmemMipDesc.formatDesc = cudaCreateChannelDesc<float4>();
cuExtmemMipDesc.numLevels = 1;
CheckCudaErrors(cudaExternalMemoryGetMappedMipmappedArray(&cuMipArray, m_externalMemory, &cuExtmemMipDesc));
CheckCudaErrors(cudaGetMipmappedArrayLevel(&cuArray, cuMipArray, 0));
cudaResourceDesc cuResDesc{};
cuResDesc.resType = cudaResourceTypeArray;
cuResDesc.res.array.array = cuArray;
checkCudaErrors(cudaCreateSurfaceObject(&cuSurface, &cuResDesc));
库达部分:
int iDivUp(int a, int b) { return a % b != 0 ? a / b + 1 : a / b; }
__global__ void UpdateSurface(cudaSurfaceObject_t surf, unsigned int width, unsigned int height, float time)
{
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y >= height | x >= width) return;
auto xVar = (float)x / (float)width;
auto yVar = (float)y / (float)height;
auto cost = __cosf(time) * 0.5f + 0.5f;
auto costx = __cosf(time) * 0.5f + xVar;
auto costy = __cosf(time) * 0.5f + yVar;
auto costxx = (__cosf(time) * 0.5f + 0.5f) * width;
auto costyy = (__cosf(time) * 0.5f + 0.5f) * height;
auto costxMany = __cosf(y * time) * 0.5f + yVar;
auto costyMany = __cosf((float)x/100 * time) * 0.5f + xVar;
auto margin = 1;
float4 pixel{};
if (y == 0)
pixel = make_float4(costyMany * 0.3, costyMany * 1, costyMany * 0.4, 1);
else if (y == height - 1)
pixel = make_float4(costyMany * 0.6, costyMany * 0.7, costyMany * 1, 1);
else if (x % 2 == 0)
{
if (x > width / 2)
pixel = make_float4(0.1, 0.5, costx * 1, 1);
else
pixel = make_float4(costx * 1, 0.1, 0.2, 1);
}
else if (x > width - margin - 1 | x <= margin)
pixel = make_float4(costxMany, costxMany * 0.9, costxMany * 0.6, 1);
else
pixel = make_float4(costx * 0.3, costx * 0.4, costx * 0.6, 1);
surf2Dwrite(pixel, surf, x, y);
}
void RunKernel(size_t textureW, size_t textureH, cudaSurfaceObject_t surfaceObject, cudaStream_t streamToRun, float animTime)
{
auto unit = 16;
dim3 threads(unit, unit);
dim3 grid(iDivUp(textureW, unit), iDivUp(textureH, unit));
UpdateSurface <<<grid, threads, 0, streamToRun >>> (surfaceObject, textureW, textureH, animTime);
getLastCudaError("UpdateSurface execution failed.\n");
}
所以,最后是cuda部分不正确:
surf2Dwrite
的x
必须乘以16.
surf2Dwrite(pixel, surf, x * 16, y);
找到
我正在尝试从 cuda 编写 D3D12 纹理。为此,我将 D3D 纹理映射到 mipmap,然后映射到 cudaArray。我可以看到预期的结果模数很多列都丢失了。我错过了什么?
完整来源:https://github.com/mprevot/CudaD3D12Update
D2D12部分:
TextureChannels = 4;
TextureWidth = m_width;
TextureHeight = m_height;
const auto textureSurface = TextureWidth * TextureHeight;
const auto texturePixels = textureSurface * TextureChannels;
const auto textureSizeBytes = sizeof(float)* texturePixels;
D3D12_RESOURCE_DESC d3dTexDesc{};
d3dTexDesc.MipLevels = 1;
d3dTexDesc.Format = TextureChannels == 4 ? DXGI_FORMAT_R32G32B32A32_FLOAT : DXGI_FORMAT_R32G32B32_FLOAT;
d3dTexDesc.Width = TextureWidth;
d3dTexDesc.Height = TextureHeight;
d3dTexDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
d3dTexDesc.DepthOrArraySize = 1;
d3dTexDesc.SampleDesc.Count = 1;
d3dTexDesc.SampleDesc.Quality = 0;
d3dTexDesc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D;
ThrowIfFailed(m_device->CreateCommittedResource(&CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT), D3D12_HEAP_FLAG_SHARED,
&d3dTexDesc, D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE, nullptr, IID_PPV_ARGS(&TextureArray)));
NAME_D3D12_OBJECT(TextureArray);
Import/mapping部分:
HANDLE sharedHandle{};
WindowsSecurityAttributes secAttr{};
LPCWSTR name{};
ThrowIfFailed(m_device->CreateSharedHandle(TextureArray.Get(), &secAttr, GENERIC_ALL, name, &sharedHandle));
const auto allocInfo = m_device->GetResourceAllocationInfo(m_nodeMask, 1, &d3dTexDesc);
cudaExternalMemoryHandleDesc cuExtmemHandleDesc{};
cuExtmemHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
cuExtmemHandleDesc.handle.win32.handle = sharedHandle;
cuExtmemHandleDesc.size = allocInfo.SizeInBytes;
cuExtmemHandleDesc.flags = cudaExternalMemoryDedicated;
CheckCudaErrors(cudaImportExternalMemory(&m_externalMemory, &cuExtmemHandleDesc));
cudaExternalMemoryMipmappedArrayDesc cuExtmemMipDesc{};
cuExtmemMipDesc.extent = make_cudaExtent(TextureWidth, TextureHeight, 0);
cuExtmemMipDesc.formatDesc = cudaCreateChannelDesc<float4>();
cuExtmemMipDesc.numLevels = 1;
CheckCudaErrors(cudaExternalMemoryGetMappedMipmappedArray(&cuMipArray, m_externalMemory, &cuExtmemMipDesc));
CheckCudaErrors(cudaGetMipmappedArrayLevel(&cuArray, cuMipArray, 0));
cudaResourceDesc cuResDesc{};
cuResDesc.resType = cudaResourceTypeArray;
cuResDesc.res.array.array = cuArray;
checkCudaErrors(cudaCreateSurfaceObject(&cuSurface, &cuResDesc));
库达部分:
int iDivUp(int a, int b) { return a % b != 0 ? a / b + 1 : a / b; }
__global__ void UpdateSurface(cudaSurfaceObject_t surf, unsigned int width, unsigned int height, float time)
{
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y >= height | x >= width) return;
auto xVar = (float)x / (float)width;
auto yVar = (float)y / (float)height;
auto cost = __cosf(time) * 0.5f + 0.5f;
auto costx = __cosf(time) * 0.5f + xVar;
auto costy = __cosf(time) * 0.5f + yVar;
auto costxx = (__cosf(time) * 0.5f + 0.5f) * width;
auto costyy = (__cosf(time) * 0.5f + 0.5f) * height;
auto costxMany = __cosf(y * time) * 0.5f + yVar;
auto costyMany = __cosf((float)x/100 * time) * 0.5f + xVar;
auto margin = 1;
float4 pixel{};
if (y == 0)
pixel = make_float4(costyMany * 0.3, costyMany * 1, costyMany * 0.4, 1);
else if (y == height - 1)
pixel = make_float4(costyMany * 0.6, costyMany * 0.7, costyMany * 1, 1);
else if (x % 2 == 0)
{
if (x > width / 2)
pixel = make_float4(0.1, 0.5, costx * 1, 1);
else
pixel = make_float4(costx * 1, 0.1, 0.2, 1);
}
else if (x > width - margin - 1 | x <= margin)
pixel = make_float4(costxMany, costxMany * 0.9, costxMany * 0.6, 1);
else
pixel = make_float4(costx * 0.3, costx * 0.4, costx * 0.6, 1);
surf2Dwrite(pixel, surf, x, y);
}
void RunKernel(size_t textureW, size_t textureH, cudaSurfaceObject_t surfaceObject, cudaStream_t streamToRun, float animTime)
{
auto unit = 16;
dim3 threads(unit, unit);
dim3 grid(iDivUp(textureW, unit), iDivUp(textureH, unit));
UpdateSurface <<<grid, threads, 0, streamToRun >>> (surfaceObject, textureW, textureH, animTime);
getLastCudaError("UpdateSurface execution failed.\n");
}
所以,最后是cuda部分不正确:
surf2Dwrite
的x
必须乘以16.
surf2Dwrite(pixel, surf, x * 16, y);
找到