从用户空间应用程序读取原始 GPU 内存

Reading raw GPU memory from userspace application

我正在尝试从用户空间应用程序读取原始 gpu 内存。这个想法是从应用程序映射 /sys/bus/pci/devices/[device addr]/resource1 并对其进行加载和存储。

此处的设备是 Nvidia 3060Ti,板载内存为 8GiB。 BAR 配置为可调整大小,因此应该可以访问所有 8GiB 内存:

(base) [xps] pcimem git:(master) ✗ ls -lah /sys/bus/pci/devices/0000:01:00.0/resource*                   
-r--r--r-- 1 root root 4,0K avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource
-rw------- 1 root root  16M avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource0
-rw------- 1 root root 8,0G avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource1
-rw------- 1 root root 8,0G avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource1_wc
-rw------- 1 root root  32M avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource3
-rw------- 1 root root  32M avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource3_wc
-rw------- 1 root root  128 avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource5

使用 pcimem 访问该内存不起作用。向某个位置写入 0 之后,紧接着的第一次读取会返回 0,但之后的每一次读取都会返回 0x000000005665BDF5。第一次读取之后,所有位置读到的值都是同一个 0x000000005665BDF5。

对这些(失败的)读/写操作进行基准测试,结果似乎表明它们确实到达了 GPU。读取延迟约为 900 ns,接近 PCIe 往返时间。

我也尝试过直接 mmap 帧缓冲区(/dev/fb0)并对其进行读写。这是可行的,而且读写延迟与上面类似。但是,帧缓冲区对于我的用例来说太小了。

CUDA 不工作,因为从设备内存读取时,GPU 会将该页面移动到主机。

有没有办法从 Linux 访问 GPU 上的内存?

我的目标是能够在用户空间应用程序中映射 GPU 的内存,并将其用作内存扩展。用户空间应用程序(运行在 CPU 上)将直接在 GPU 内存上分配和访问数据结构。

TIA

您似乎可以使用 GDRCopy 库,或者至少可以使用它的内核驱动程序。来自 website:

GDRCopy is a low-latency GPU memory copy library based on GPUDirect RDMA technology that allows the CPU to directly map and access GPU memory.

解决方案是使用 Vulkan API 在 GPU 上分配一个堆并访问它。但是,由于 x86 无法缓存 MMIO 地址,每次访问都要经过 PCIe 到达 GPU。

该实现的延迟与 Nvidia 的服务器解决方案大致相同。

下面是一个用 C++ 写的简单粗糙(quick and dirty)的实现,它把 GPU 抽象为堆内存,并允许在其上使用 malloc() 和 free():

要找出堆类型,请检查:http://vulkan.gpuinfo.org/displayreport.php?id=14928#memory

createVertexBuffer()

调用 findMemoryType() 时,您需要检查您的 GPU 支持哪些标志。
#include <vulkan/vulkan.h>

#include <algorithm>
#include <array>
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <limits>
#include <memory>
#include <optional>
#include <set>
#include <stdexcept>
#include <string>
#include <vector>

#include "libvram/libvram.hh"
class VRamWrapper;

// Process-wide wrapper instance. It is assigned by ctor_libvram() and used by
// the libvram::malloc() / libvram::free() entry points; null until then.
VRamWrapper *vrw_obj = nullptr;

// Device extensions a physical device must offer to be selected; the same list
// is enabled when the logical device is created.
const char *deviceExtensions[] = {VK_KHR_SWAPCHAIN_EXTENSION_NAME};

// Number of entries in deviceExtensions. Derived from the array itself so the
// count cannot drift when an extension is added or removed.
const size_t DEV_EXT_LEN =
    sizeof(deviceExtensions) / sizeof(deviceExtensions[0]);

/// Queue family indices discovered on a physical device.
struct QueueFamilyIndices {
  /// Index of a queue family whose flags include VK_QUEUE_GRAPHICS_BIT;
  /// empty until such a family has been found.
  std::optional<uint32_t> graphicsFamily;

  /// Returns true once every required family index has been filled in.
  /// Declared const so it can be queried through const objects/references.
  bool isComplete() const noexcept { return graphicsFamily.has_value(); }
};

class VRamWrapper {
public:
  void init() { initVulkan(); }

  void *malloc(size_t bytes) { return this->createVertexBuffer(bytes); }
  void free(void *buf) { assert(0); }

private:
  VkInstance instance;

  VkPhysicalDevice physicalDevice = VK_NULL_HANDLE;
  VkDevice device;

  VkQueue graphicsQueue;

  std::vector<VkBuffer> buffers;
  std::vector<VkDeviceMemory> bufferMemories;

  void initVulkan() {
    createInstance();
    pickPhysicalDevice();
    createLogicalDevice();
  }

  void cleanup() {
    for (auto buf : buffers) {
      vkDestroyBuffer(device, buf, nullptr);
    }

    for (auto mem : bufferMemories) {
      vkFreeMemory(device, mem, nullptr);
    }

    vkDestroyDevice(device, nullptr);
    vkDestroyInstance(instance, nullptr);
  }

  void createInstance() {
    VkApplicationInfo appInfo{};
    appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
    appInfo.pApplicationName = "Hello Triangle";
    appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0);
    appInfo.pEngineName = "No Engine";
    appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0);
    appInfo.apiVersion = VK_API_VERSION_1_0;

    VkInstanceCreateInfo createInfo{};
    createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
    createInfo.pApplicationInfo = &appInfo;

    createInfo.enabledLayerCount = 0;

    createInfo.pNext = nullptr;

    if (vkCreateInstance(&createInfo, nullptr, &instance) != VK_SUCCESS) {
      throw std::runtime_error("failed to create instance!");
    }
  }

  void pickPhysicalDevice() {
    uint32_t deviceCount = 0;
    vkEnumeratePhysicalDevices(instance, &deviceCount, nullptr);

    if (deviceCount == 0) {
      throw std::runtime_error("failed to find GPUs with Vulkan support!");
    }

    std::vector<VkPhysicalDevice> devices(deviceCount);
    vkEnumeratePhysicalDevices(instance, &deviceCount, devices.data());

    for (const auto &device : devices) {
      if (isDeviceSuitable(device)) {
        physicalDevice = device;
        break;
      }
    }

    if (physicalDevice == VK_NULL_HANDLE) {
      throw std::runtime_error("failed to find a suitable GPU!");
    }
  }

  void createLogicalDevice() {
    QueueFamilyIndices indices = findQueueFamilies(physicalDevice);

    std::vector<VkDeviceQueueCreateInfo> queueCreateInfos;
    std::set<uint32_t> uniqueQueueFamilies = {indices.graphicsFamily.value()};

    float queuePriority = 1.0f;
    for (uint32_t queueFamily : uniqueQueueFamilies) {
      VkDeviceQueueCreateInfo queueCreateInfo{};
      queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
      queueCreateInfo.queueFamilyIndex = queueFamily;
      queueCreateInfo.queueCount = 1;
      queueCreateInfo.pQueuePriorities = &queuePriority;
      queueCreateInfos.push_back(queueCreateInfo);
    }

    VkPhysicalDeviceFeatures deviceFeatures{};

    VkDeviceCreateInfo createInfo{};
    createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;

    createInfo.queueCreateInfoCount =
        static_cast<uint32_t>(queueCreateInfos.size());
    createInfo.pQueueCreateInfos = queueCreateInfos.data();

    createInfo.pEnabledFeatures = &deviceFeatures;

    createInfo.enabledExtensionCount = static_cast<uint32_t>(DEV_EXT_LEN);
    createInfo.ppEnabledExtensionNames = deviceExtensions;

    createInfo.enabledLayerCount = 0;

    if (vkCreateDevice(physicalDevice, &createInfo, nullptr, &device) !=
        VK_SUCCESS) {
      throw std::runtime_error("failed to create logical device!");
    }

    vkGetDeviceQueue(device, indices.graphicsFamily.value(), 0, &graphicsQueue);
  }

  void *createVertexBuffer(size_t bytes) {
    VkBuffer buffer;
    VkDeviceMemory bufferMemory;

    VkBufferCreateInfo bufferInfo{};
    bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
    bufferInfo.size = bytes;
    bufferInfo.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
    bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;

    if (vkCreateBuffer(device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) {
      throw std::runtime_error("failed to create vertex buffer!");
    }

    VkMemoryRequirements memRequirements;
    vkGetBufferMemoryRequirements(device, buffer, &memRequirements);

    assert(memRequirements.size == bytes);

    VkMemoryAllocateInfo allocInfo{};
    allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
    allocInfo.allocationSize = memRequirements.size;
    allocInfo.memoryTypeIndex =
        findMemoryType(memRequirements.memoryTypeBits,
                       VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
                           VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
                           VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);

    if (auto res = vkAllocateMemory(device, &allocInfo, nullptr, &bufferMemory);
        res != VK_SUCCESS) {
      throw std::runtime_error("failed to allocate vertex buffer memory");
    }

    vkBindBufferMemory(device, buffer, bufferMemory, 0);

    void *data;
    auto res = vkMapMemory(device, bufferMemory, 0, bytes, 0, &data);
    if (res != VK_SUCCESS) {
      throw std::runtime_error("Map failed");
    }

    fprintf(stderr, "Map completed. Allocated %lu MiB at %p\n",
            (bytes) / (1024UL * 1024), data);

    this->buffers.push_back(buffer);
    this->bufferMemories.push_back(bufferMemory);

    return data;
  }

  uint32_t findMemoryType(uint32_t typeFilter,
                          VkMemoryPropertyFlags properties) {
    VkPhysicalDeviceMemoryProperties memProperties;
    vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProperties);

    for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) {
      if ((typeFilter & (1 << i)) &&
          (memProperties.memoryTypes[i].propertyFlags & properties) ==
              properties) {
        return i;
      }
    }

    throw std::runtime_error("failed to find suitable memory type!");
  }

  bool isDeviceSuitable(VkPhysicalDevice device) {
    QueueFamilyIndices indices = findQueueFamilies(device);

    bool extensionsSupported = checkDeviceExtensionSupport(device);

    return indices.isComplete() &&
           extensionsSupported /* && swapChainAdequate */;
  }

  bool checkDeviceExtensionSupport(VkPhysicalDevice device) {
    uint32_t extensionCount;
    vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount,
                                         nullptr);

    std::vector<VkExtensionProperties> availableExtensions(extensionCount);
    vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount,
                                         availableExtensions.data());

    std::set<std::string> requiredExtensions(deviceExtensions,
                                             deviceExtensions + DEV_EXT_LEN);

    for (const auto &extension : availableExtensions) {
      requiredExtensions.erase(extension.extensionName);
    }

    return requiredExtensions.empty();
  }

  QueueFamilyIndices findQueueFamilies(VkPhysicalDevice device) {
    QueueFamilyIndices indices;

    uint32_t queueFamilyCount = 0;
    vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount,
                                             nullptr);

    std::vector<VkQueueFamilyProperties> queueFamilies(queueFamilyCount);
    vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount,
                                             queueFamilies.data());

    int i = 0;
    for (const auto &queueFamily : queueFamilies) {
      if (queueFamily.queueFlags & VK_QUEUE_GRAPHICS_BIT) {
        indices.graphicsFamily = i;
      }

      if (indices.isComplete()) {
        break;
      }

      i++;
    }

    return indices;
  }
};

void ctor_libvram() {
  fprintf(stderr, "%s() called\n", __FUNCTION__);
  vrw_obj = new VRamWrapper();
  vrw_obj->init();
}

/// Allocates `bytes` through the process-wide wrapper and returns whatever
/// its malloc() returns.
/// @throws std::runtime_error if the wrapper has not been created yet
///         (ctor_libvram() has not run), instead of dereferencing null.
void *libvram::malloc(size_t bytes) {
  if (vrw_obj == nullptr) {
    throw std::runtime_error("libvram is not initialised");
  }
  return vrw_obj->malloc(bytes);
}

/// Hands `ptr` to the process-wide wrapper's free().
/// A null `ptr` is ignored, mirroring the C library's free(nullptr).
/// @throws std::runtime_error if the wrapper has not been created yet
///         (ctor_libvram() has not run), instead of dereferencing null.
void libvram::free(void *ptr) {
  if (ptr == nullptr) {
    return;
  }
  if (vrw_obj == nullptr) {
    throw std::runtime_error("libvram is not initialised");
  }
  vrw_obj->free(ptr);
}