From 7c165d7fa81a2d8f345b66431211cf2b83e18379 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Thu, 6 Feb 2025 00:02:18 -0600 Subject: [PATCH] vulkan: use smaller combined allocations to avoid fragmentation (llama/11551) --- ggml/src/ggml-alloc.c | 14 +------------- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 19 +++++++++++++++++-- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index 9a3bf9f2..7244a9cb 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -989,19 +989,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment); } - if (this_size > max_size) { - GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n", - __func__, t->name, - ggml_backend_buft_name(buft), - this_size, max_size); - for (size_t i = 0; i < n_buffers; i++) { - ggml_backend_buffer_free(buffers[i]); - } - free(buffers); - return NULL; - } - - if ((cur_buf_size + this_size) > max_size) { + if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) { // allocate tensors in the current buffer if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) { return NULL; diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 48ac489a..2e1bcf69 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -156,6 +156,7 @@ struct vk_device_struct { vk::PhysicalDeviceProperties properties; std::string name; uint64_t max_memory_allocation_size; + uint64_t suballocation_block_size; bool fp16; bool pipeline_robustness; vk::Device device; @@ -2269,6 +2270,7 @@ static vk_device ggml_vk_get_device(size_t idx) { device->physical_device.getProperties2(&props2); device->properties = props2.properties; + device->vendor_id = device->properties.vendorID; const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE"); @@ -2280,7 +2282,20 @@ static vk_device ggml_vk_get_device(size_t idx) { device->max_memory_allocation_size = props3.maxMemoryAllocationSize; } - device->vendor_id = device->properties.vendorID; + const char* GGML_VK_SUBALLOCATION_BLOCK_SIZE = getenv("GGML_VK_SUBALLOCATION_BLOCK_SIZE"); + + if (GGML_VK_SUBALLOCATION_BLOCK_SIZE != nullptr) { + device->suballocation_block_size = std::stoul(GGML_VK_SUBALLOCATION_BLOCK_SIZE); +#if defined(_WIN32) + } else if (device->vendor_id == VK_VENDOR_ID_NVIDIA) { + // Limit batching of allocations to 1GB by default to avoid fragmentation issues + device->suballocation_block_size = 1024*1024*1024; +#endif + } else { + device->suballocation_block_size = device->max_memory_allocation_size; + } + device->suballocation_block_size = std::min(device->suballocation_block_size, device->max_memory_allocation_size); + device->subgroup_size = subgroup_props.subgroupSize; device->uma = device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; if (sm_builtins) { @@ -7561,7 +7576,7 @@ static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; - return ctx->device->max_memory_allocation_size; + return ctx->device->suballocation_block_size; } static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {