gpu: use distinct allocator/memory for staging buffers

Fixes an easily-encountered GPU OOM on discrete cards.

Currently, when mapping CPU-accessible GPU memory, there are only two
mapping types: write and read.

The "write" allocations try to use the special 256MB pinned memory
region, with the thought that since this memory is usually for vertices,
uniforms, etc. it should be fast.
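
For context, that 256MB region is the memory type Vulkan reports as both
device-local and host-visible.  A minimal sketch of how such a type is
typically located (illustrative helper name, not the engine's code):

#include <stdint.h>
#include <vulkan/vulkan.h>

// Minimal sketch: locate the memory type backing the small DEVICE_LOCAL |
// HOST_VISIBLE heap (the ~256MB pinned region on discrete GPUs).
static uint32_t findPinnedMemoryType(VkPhysicalDevice physicalDevice) {
  VkPhysicalDeviceMemoryProperties props;
  vkGetPhysicalDeviceMemoryProperties(physicalDevice, &props);

  VkMemoryPropertyFlags wanted =
    VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;

  for (uint32_t i = 0; i < props.memoryTypeCount; i++) {
    if ((props.memoryTypes[i].propertyFlags & wanted) == wanted) {
      return i; // Fast for the GPU to read, and mappable by the CPU
    }
  }

  return UINT32_MAX; // Not present; fall back to plain host-visible memory
}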

However, the same pinned memory is also used for the staging buffers that
upload buffer and texture data, which can easily blow past the 256MB (or
246MB on NV) limit after creating just a handful of large textures.

To fix this, we're going to separate WRITE mappings into STREAM and
STAGING.  STREAM will act like the old CPU_WRITE mapping type and use
the same memory type.  STAGING will use plain host-visible memory and
avoid hogging the precious 256MB memory region.

STAGING also uses a different allocation strategy.  Instead of creating
a big buffer with a zone for each tick, it's a more traditional linear
allocator that allocates in 4MB chunks and condemns the chunk if it ever
fills up.  This is a better fit for staging buffer lifetimes, since there are
usually a bunch of them at startup and only small, sporadic uploads
afterwards.  The buffer doesn't need to double in size, and it doesn't
need to be kept around after the transfers are issued.  The memory
really is single-use and won't roll over from frame to frame like the
other scratchpads.
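
Roughly, the strategy looks like the sketch below; malloc stands in for the
host-visible buffer backing each chunk, stagingAlloc is a made-up name, and
alignment is omitted for brevity:

#include <stdint.h>
#include <stdlib.h>

// Simplified sketch of the STAGING strategy: a linear allocator that hands out
// space from 4MB chunks and "condemns" a chunk as soon as a request doesn't fit.
// The real allocator backs each chunk with plain host-visible memory and only
// frees it once the GPU has finished the transfer; malloc stands in for that here.

#define CHUNK_SIZE (1 << 22) // 4MB

typedef struct {
  char* data;      // mapped pointer in the real allocator
  uint32_t cursor; // bump-allocation offset
  uint32_t size;   // chunk capacity
} Chunk;

static void* stagingAlloc(Chunk* chunk, uint32_t size) {
  if (chunk->data == NULL || chunk->cursor + size > chunk->size) {
    // Condemn the current chunk (intentionally not freed here; the real allocator
    // releases it after the copy completes) and start a fresh one.  Oversized
    // requests just get a dedicated chunk of their own.
    chunk->size = size > CHUNK_SIZE ? size : CHUNK_SIZE;
    chunk->data = malloc(chunk->size);
    chunk->cursor = 0;
  }

  void* pointer = chunk->data + chunk->cursor;
  chunk->cursor += size;
  return pointer;
}
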
bjorn 2022-11-07 20:46:41 -08:00
parent c682ced654
commit 3775ed1be6
3 changed files with 83 additions and 59 deletions


@ -36,8 +36,9 @@ bool gpu_buffer_init(gpu_buffer* buffer, gpu_buffer_info* info);
void gpu_buffer_destroy(gpu_buffer* buffer);
typedef enum {
- GPU_MAP_WRITE,
- GPU_MAP_READ
+ GPU_MAP_STREAM,
+ GPU_MAP_STAGING,
+ GPU_MAP_READBACK
} gpu_map_mode;
void* gpu_map(gpu_buffer* buffer, uint32_t size, uint32_t align, gpu_map_mode mode);


@ -83,8 +83,9 @@ typedef struct {
typedef enum {
GPU_MEMORY_BUFFER_GPU,
- GPU_MEMORY_BUFFER_CPU_WRITE,
- GPU_MEMORY_BUFFER_CPU_READ,
+ GPU_MEMORY_BUFFER_MAP_STREAM,
+ GPU_MEMORY_BUFFER_MAP_STAGING,
+ GPU_MEMORY_BUFFER_MAP_READBACK,
GPU_MEMORY_TEXTURE_COLOR,
GPU_MEMORY_TEXTURE_D16,
GPU_MEMORY_TEXTURE_D32F,
@ -179,7 +180,7 @@ static struct {
gpu_cache_entry framebuffers[16][4];
gpu_allocator allocators[GPU_MEMORY_COUNT];
uint8_t allocatorLookup[GPU_MEMORY_COUNT];
- gpu_scratchpad scratchpad[2];
+ gpu_scratchpad scratchpad[3];
gpu_memory memory[256];
uint32_t streamCount;
uint32_t tick[2];
@ -401,40 +402,56 @@ void gpu_buffer_destroy(gpu_buffer* buffer) {
gpu_release(&state.memory[buffer->memory]);
}
+ // There are 3 mapping modes, which use different strategies/memory types:
+ // - MAP_STREAM: Used to "stream" data to the GPU, to be read by shaders. This tries to use the
+ // special 256MB memory type present on discrete GPUs because it's both device local and host-
+ // visible and that supposedly makes it fast. A single buffer is allocated with a "zone" for each
+ // tick. If one of the zones fills up, a new bigger buffer is allocated. It's important to have
+ // one buffer and keep it alive since streaming is expected to happen very frequently.
+ // - MAP_STAGING: Used to stage data to upload to buffers/textures. Can only be used for transfers.
+ // Uses uncached host-visible memory so as to not pollute the CPU cache. This uses a slightly
+ // different allocation strategy where blocks of memory are allocated, linearly allocated from,
+ // and condemned once they fill up. This is because uploads are much less frequent than
+ // streaming and are usually too big to fit in the 256MB memory.
+ // - MAP_READBACK: Used for readbacks. Uses cached memory when available since reading from
+ // uncached memory on the CPU is super duper slow. Uses the same "zone" system as STREAM, since
+ // we want to be able to handle per-frame readbacks without thrashing.
void* gpu_map(gpu_buffer* buffer, uint32_t size, uint32_t align, gpu_map_mode mode) {
gpu_scratchpad* pool = &state.scratchpad[mode];
uint32_t cursor = ALIGN(pool->cursor, align);
+ uint32_t zone = mode == GPU_MAP_STAGING ? 0 : (state.tick[CPU] & TICK_MASK);
- // "Big" buffers don't pollute the scratchpad (heuristic)
- bool oversized = size > (1 << 26);
- // There's only 1 buffer per mode, split into a zone for each tick.
- uint32_t zone = state.tick[CPU] & TICK_MASK;
- // If the scratchpad buffer fills up, condemn it and allocate a bigger one to use.
- if (oversized || cursor + size > pool->size) {
- uint32_t bufferSize;
- if (oversized) {
- bufferSize = size;
- } else {
- while (pool->size < cursor + size) {
- pool->size = pool->size ? (pool->size << 1) : (1 << 22);
- }
- bufferSize = pool->size * COUNTOF(state.ticks);
- }
+ // If the scratchpad buffer fills up, condemn it and allocate a new/bigger one to use.
+ if (cursor + size > pool->size) {
+ VkBufferUsageFlags usages[] = {
+ [GPU_MAP_STREAM] =
+ VK_BUFFER_USAGE_VERTEX_BUFFER_BIT |
+ VK_BUFFER_USAGE_INDEX_BUFFER_BIT |
+ VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT |
+ VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
+ [GPU_MAP_STAGING] = VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
+ [GPU_MAP_READBACK] = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT
+ };
VkBufferCreateInfo info = {
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
- .size = bufferSize,
- .usage = mode == GPU_MAP_WRITE ?
- (VK_BUFFER_USAGE_VERTEX_BUFFER_BIT |
- VK_BUFFER_USAGE_INDEX_BUFFER_BIT |
- VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT |
- VK_BUFFER_USAGE_TRANSFER_SRC_BIT) :
- VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT
+ .usage = usages[mode]
};
+ // Staging buffers use 4MB block sizes, stream/download start out at 4MB and double after that
+ if (pool->size == 0) {
+ pool->size = 1 << 22;
+ }
+ if (mode == GPU_MAP_STAGING) {
+ info.size = MAX(pool->size, size);
+ } else {
+ while (pool->size < size) {
+ pool->size <<= 1;
+ }
+ info.size = pool->size * COUNTOF(state.ticks);
+ }
VkBuffer handle;
VK(vkCreateBuffer(state.device, &info, NULL, &handle), "Could not create scratch buffer") return NULL;
nickname(handle, VK_OBJECT_TYPE_BUFFER, "Scratchpad");
@ -442,7 +459,7 @@ void* gpu_map(gpu_buffer* buffer, uint32_t size, uint32_t align, gpu_map_mode mo
VkDeviceSize offset;
VkMemoryRequirements requirements;
vkGetBufferMemoryRequirements(state.device, handle, &requirements);
- gpu_memory* memory = gpu_allocate(GPU_MEMORY_BUFFER_CPU_WRITE + mode, requirements, &offset);
+ gpu_memory* memory = gpu_allocate(GPU_MEMORY_BUFFER_MAP_STREAM + mode, requirements, &offset);
VK(vkBindBufferMemory(state.device, handle, memory->handle, offset), "Could not bind scratchpad memory") {
vkDestroyBuffer(state.device, handle, NULL);
@ -450,7 +467,8 @@ void* gpu_map(gpu_buffer* buffer, uint32_t size, uint32_t align, gpu_map_mode mo
return NULL;
}
- if (oversized) {
+ // If this was an oversized allocation, condemn it immediately, don't touch the pool
+ if (size > pool->size) {
gpu_release(memory);
condemn(handle, VK_OBJECT_TYPE_BUFFER);
buffer->handle = handle;
@ -2079,7 +2097,7 @@ bool gpu_init(gpu_config* config) {
VK_BUFFER_USAGE_TRANSFER_DST_BIT,
.flags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT
},
- [GPU_MEMORY_BUFFER_CPU_WRITE] = {
+ [GPU_MEMORY_BUFFER_MAP_STREAM] = {
.usage =
VK_BUFFER_USAGE_VERTEX_BUFFER_BIT |
VK_BUFFER_USAGE_INDEX_BUFFER_BIT |
@ -2087,8 +2105,12 @@ bool gpu_init(gpu_config* config) {
VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
.flags = hostVisible | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT
},
- [GPU_MEMORY_BUFFER_CPU_READ] = {
- .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+ [GPU_MEMORY_BUFFER_MAP_STAGING] = {
+ .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
+ .flags = hostVisible
+ },
+ [GPU_MEMORY_BUFFER_MAP_READBACK] = {
+ .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
.flags = hostVisible | VK_MEMORY_PROPERTY_HOST_CACHED_BIT
}
};
@ -2116,16 +2138,16 @@ bool gpu_init(gpu_config* config) {
continue;
}
+ if ((memoryTypes[j].propertyFlags & fallback) == fallback) {
+ allocator->memoryFlags = memoryTypes[j].propertyFlags;
+ allocator->memoryType = j;
+ }
if ((memoryTypes[j].propertyFlags & bufferFlags[i].flags) == bufferFlags[i].flags) {
allocator->memoryFlags = memoryTypes[j].propertyFlags;
allocator->memoryType = j;
break;
}
- if ((memoryTypes[j].propertyFlags & fallback) == fallback) {
- allocator->memoryFlags = memoryTypes[j].propertyFlags;
- allocator->memoryType = j;
- }
}
}
@ -2370,8 +2392,8 @@ uint32_t gpu_begin() {
gpu_tick* tick = &state.ticks[state.tick[CPU] & TICK_MASK];
VK(vkResetFences(state.device, 1, &tick->fence), "Fence reset failed") return 0;
VK(vkResetCommandPool(state.device, tick->pool, 0), "Command pool reset failed") return 0;
- state.scratchpad[GPU_MAP_WRITE].cursor = 0;
- state.scratchpad[GPU_MAP_READ].cursor = 0;
+ state.scratchpad[GPU_MAP_STREAM].cursor = 0;
+ state.scratchpad[GPU_MAP_READBACK].cursor = 0;
state.streamCount = 0;
expunge();
return state.tick[CPU];
@ -2475,8 +2497,9 @@ static gpu_memory* gpu_allocate(gpu_memory_type type, VkMemoryRequirements info,
static const uint32_t blockSizes[] = {
[GPU_MEMORY_BUFFER_GPU] = 1 << 26,
- [GPU_MEMORY_BUFFER_CPU_WRITE] = 0,
- [GPU_MEMORY_BUFFER_CPU_READ] = 0,
+ [GPU_MEMORY_BUFFER_MAP_STREAM] = 0,
+ [GPU_MEMORY_BUFFER_MAP_STAGING] = 0,
+ [GPU_MEMORY_BUFFER_MAP_READBACK] = 0,
[GPU_MEMORY_TEXTURE_COLOR] = 1 << 28,
[GPU_MEMORY_TEXTURE_D16] = 1 << 28,
[GPU_MEMORY_TEXTURE_D32F] = 1 << 28,


@ -525,7 +525,7 @@ bool lovrGraphicsInit(GraphicsConfig* config) {
beginFrame();
gpu_buffer* scratchpad = tempAlloc(gpu_sizeof_buffer());
- float* pointer = gpu_map(scratchpad, sizeof(data), 4, GPU_MAP_WRITE);
+ float* pointer = gpu_map(scratchpad, sizeof(data), 4, GPU_MAP_STAGING);
memcpy(pointer, data, sizeof(data));
gpu_copy_buffers(state.stream, scratchpad, state.defaultBuffer->gpu, 0, 0, sizeof(data));
@ -1113,7 +1113,7 @@ Buffer* lovrGraphicsGetBuffer(BufferInfo* info, void** data) {
buffer->hash = hash64(info->fields, info->fieldCount * sizeof(BufferField));
beginFrame();
- buffer->pointer = gpu_map(buffer->gpu, size, state.limits.uniformBufferAlign, GPU_MAP_WRITE);
+ buffer->pointer = gpu_map(buffer->gpu, size, state.limits.uniformBufferAlign, GPU_MAP_STREAM);
buffer->tick = state.tick;
if (data) {
@ -1145,7 +1145,7 @@ Buffer* lovrBufferCreate(const BufferInfo* info, void** data) {
if (data && *data == NULL) {
beginFrame();
gpu_buffer* scratchpad = tempAlloc(gpu_sizeof_buffer());
- *data = gpu_map(scratchpad, size, 4, GPU_MAP_WRITE);
+ *data = gpu_map(scratchpad, size, 4, GPU_MAP_STAGING);
gpu_copy_buffers(state.stream, scratchpad, buffer->gpu, 0, 0, size);
buffer->sync.writePhase = GPU_PHASE_TRANSFER;
buffer->sync.pendingWrite = GPU_CACHE_TRANSFER_WRITE;
@ -1262,7 +1262,7 @@ Texture* lovrTextureCreate(const TextureInfo* info) {
}
scratchpad = tempAlloc(gpu_sizeof_buffer());
- char* data = gpu_map(scratchpad, total, 64, GPU_MAP_WRITE);
+ char* data = gpu_map(scratchpad, total, 64, GPU_MAP_STAGING);
for (uint32_t level = 0; level < levelCount; level++) {
for (uint32_t layer = 0; layer < info->layers; layer++) {
@ -2029,7 +2029,7 @@ Material* lovrMaterialCreate(const MaterialInfo* info) {
beginFrame();
uint32_t size = stride * MATERIALS_PER_BLOCK;
gpu_buffer* scratchpad = tempAlloc(gpu_sizeof_buffer());
- data = gpu_map(scratchpad, size, 4, GPU_MAP_WRITE);
+ data = gpu_map(scratchpad, size, 4, GPU_MAP_STAGING);
gpu_copy_buffers(state.stream, scratchpad, block->buffer, 0, stride * material->index, stride);
state.hasMaterialUpload = true;
}
@ -2298,7 +2298,7 @@ static Glyph* lovrFontGetGlyph(Font* font, uint32_t codepoint, bool* resized) {
size_t stack = tempPush();
float* pixels = tempAlloc(pixelWidth * pixelHeight * 4 * sizeof(float));
lovrRasterizerGetPixels(font->info.rasterizer, glyph->codepoint, pixels, pixelWidth, pixelHeight, font->info.spread);
- uint8_t* dst = gpu_map(scratchpad, pixelWidth * pixelHeight * 4 * sizeof(uint8_t), 4, GPU_MAP_WRITE);
+ uint8_t* dst = gpu_map(scratchpad, pixelWidth * pixelHeight * 4 * sizeof(uint8_t), 4, GPU_MAP_STAGING);
float* src = pixels;
for (uint32_t y = 0; y < pixelHeight; y++) {
for (uint32_t x = 0; x < pixelWidth; x++) {
@ -2995,7 +2995,7 @@ static void lovrModelReskin(Model* model) {
float transform[16];
uint32_t size = bindings[3].buffer.extent = skin->jointCount * 16 * sizeof(float);
- float* joint = gpu_map(joints, size, state.limits.uniformBufferAlign, GPU_MAP_WRITE);
+ float* joint = gpu_map(joints, size, state.limits.uniformBufferAlign, GPU_MAP_STREAM);
for (uint32_t j = 0; j < skin->jointCount; j++) {
mat4_init(transform, model->globalTransforms + 16 * skin->joints[j]);
mat4_mul(transform, skin->inverseBindMatrices + 16 * j);
@ -3057,7 +3057,7 @@ Readback* lovrReadbackCreate(const ReadbackInfo* info) {
break;
}
- readback->pointer = gpu_map(readback->buffer, readback->size, 16, GPU_MAP_READ);
+ readback->pointer = gpu_map(readback->buffer, readback->size, 16, GPU_MAP_READBACK);
return readback;
}
@ -3432,7 +3432,7 @@ Pass* lovrGraphicsGetPass(PassInfo* info) {
pass->builtins[2] = (gpu_binding) { 2, GPU_SLOT_UNIFORM_BUFFER, .buffer = draws };
pass->builtins[3] = (gpu_binding) { 3, GPU_SLOT_SAMPLER, .sampler = NULL };
- Globals* global = gpu_map(pass->builtins[0].buffer.object, sizeof(Globals), state.limits.uniformBufferAlign, GPU_MAP_WRITE);
+ Globals* global = gpu_map(pass->builtins[0].buffer.object, sizeof(Globals), state.limits.uniformBufferAlign, GPU_MAP_STREAM);
global->resolution[0] = pass->width;
global->resolution[1] = pass->height;
@ -4109,7 +4109,7 @@ static void bindBundles(Pass* pass, Draw* draw, Shader* shader) {
}
uint32_t size = pass->viewCount * sizeof(Camera);
- void* data = gpu_map(pass->builtins[1].buffer.object, size, state.limits.uniformBufferAlign, GPU_MAP_WRITE);
+ void* data = gpu_map(pass->builtins[1].buffer.object, size, state.limits.uniformBufferAlign, GPU_MAP_STREAM);
memcpy(data, pass->cameras, size);
pass->cameraDirty = false;
builtinsDirty = true;
@ -4117,7 +4117,7 @@ static void bindBundles(Pass* pass, Draw* draw, Shader* shader) {
if (pass->drawCount % 256 == 0) {
uint32_t size = 256 * sizeof(DrawData);
- pass->drawData = gpu_map(pass->builtins[2].buffer.object, size, state.limits.uniformBufferAlign, GPU_MAP_WRITE);
+ pass->drawData = gpu_map(pass->builtins[2].buffer.object, size, state.limits.uniformBufferAlign, GPU_MAP_STREAM);
builtinsDirty = true;
}
@ -4254,7 +4254,7 @@ static void bindBuffers(Pass* pass, Draw* draw) {
uint32_t size = draw->vertex.count * stride;
gpu_buffer* scratchpad = tempAlloc(gpu_sizeof_buffer());
- *draw->vertex.pointer = gpu_map(scratchpad, size, stride, GPU_MAP_WRITE);
+ *draw->vertex.pointer = gpu_map(scratchpad, size, stride, GPU_MAP_STREAM);
gpu_bind_vertex_buffers(pass->stream, &scratchpad, NULL, 0, 1);
pass->vertexBuffer = scratchpad;
@ -4269,7 +4269,7 @@ static void bindBuffers(Pass* pass, Draw* draw) {
uint32_t size = draw->index.count * sizeof(uint16_t);
gpu_buffer* scratchpad = tempAlloc(gpu_sizeof_buffer());
- *draw->index.pointer = gpu_map(scratchpad, size, sizeof(uint16_t), GPU_MAP_WRITE);
+ *draw->index.pointer = gpu_map(scratchpad, size, sizeof(uint16_t), GPU_MAP_STREAM);
gpu_bind_index_buffer(pass->stream, scratchpad, 0, GPU_INDEX_U16);
pass->indexBuffer = scratchpad;
@ -5232,7 +5232,7 @@ void* lovrPassCopyDataToBuffer(Pass* pass, Buffer* buffer, uint32_t offset, uint
lovrCheck(!lovrBufferIsTemporary(buffer), "Temporary buffers can not be copied to, use Buffer:setData");
lovrCheck(offset + extent <= buffer->size, "Buffer copy range goes past the end of the Buffer");
gpu_buffer* scratchpad = tempAlloc(gpu_sizeof_buffer());
- void* pointer = gpu_map(scratchpad, extent, 4, GPU_MAP_WRITE);
+ void* pointer = gpu_map(scratchpad, extent, 4, GPU_MAP_STAGING);
gpu_copy_buffers(pass->stream, scratchpad, buffer->gpu, 0, offset, extent);
trackBuffer(pass, buffer, GPU_PHASE_TRANSFER, GPU_CACHE_TRANSFER_WRITE);
return pointer;
@ -5289,7 +5289,7 @@ void lovrPassCopyImageToTexture(Pass* pass, Image* image, Texture* texture, uint
layerOffset += measureTexture(texture->info.format, srcOffset[0], 1, 1);
uint32_t pitch = measureTexture(texture->info.format, lovrImageGetWidth(image, srcOffset[3]), 1, 1);
gpu_buffer* buffer = tempAlloc(gpu_sizeof_buffer());
- char* dst = gpu_map(buffer, totalSize, 64, GPU_MAP_WRITE);
+ char* dst = gpu_map(buffer, totalSize, 64, GPU_MAP_STAGING);
for (uint32_t z = 0; z < extent[2]; z++) {
const char* src = (char*) lovrImageGetLayerData(image, srcOffset[3], z) + layerOffset;
for (uint32_t y = 0; y < extent[1]; y++) {