metal : use residency sets (llama/11427)

* metal : use residency sets

ggml-ci

* metal : restore commandBufferWithUnretainedReferences calls [no ci]

* metal : release descriptors

ggml-ci

* metal : check env GGML_METAL_NO_RESIDENCY

ggml-ci

* metal : fix build + clean-up

ggml-ci
Georgi Gerganov 2025-01-26 20:06:16 +02:00
parent d5d831da65
commit 8639c003a9


@@ -19,7 +19,10 @@
 // max number of MTLCommandBuffer used to submit a graph for processing
 #define GGML_METAL_MAX_COMMAND_BUFFERS 8
 
-#define UNUSED(x) (void)(x)
+// create residency sets only on macOS >= 15.0
+#if TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000
+#define GGML_METAL_HAS_RESIDENCY_SETS 1
+#endif
 
 // globals
@@ -39,6 +42,7 @@ static struct ggml_backend_metal_device_context {
     bool has_simdgroup_reduction;
     bool has_simdgroup_mm;
+    bool has_residency_sets;
     bool has_bfloat;
     bool use_bfloat;
@@ -48,6 +52,7 @@ static struct ggml_backend_metal_device_context {
     /*.mtl_device_ref_count   =*/ 0,
     /*.has_simdgroup_reduction =*/ false,
     /*.has_simdgroup_mm       =*/ false,
+    /*.has_residency_sets     =*/ false,
     /*.has_bfloat             =*/ false,
     /*.use_bfloat             =*/ false,
     /*.name                   =*/ "",
@@ -65,6 +70,10 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
         ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
 
+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+        ctx->has_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == NULL;
+#endif
+
         ctx->has_bfloat  = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
         ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];
@@ -483,6 +492,11 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
     GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
 
     ctx->queue = [device newCommandQueue];
+    if (ctx->queue == nil) {
+        GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
+        return NULL;
+    }
+
     ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
 
     id<MTLLibrary> metal_library;
@@ -649,6 +663,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
     GGML_LOG_INFO("%s: simdgroup reduction   = %s\n", __func__, ctx_dev->has_simdgroup_reduction ? "true" : "false");
     GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, ctx_dev->has_simdgroup_mm        ? "true" : "false");
+    GGML_LOG_INFO("%s: has residency sets    = %s\n", __func__, ctx_dev->has_residency_sets      ? "true" : "false");
     GGML_LOG_INFO("%s: has bfloat            = %s\n", __func__, ctx_dev->has_bfloat              ? "true" : "false");
     GGML_LOG_INFO("%s: use bfloat            = %s\n", __func__, ctx_dev->use_bfloat              ? "true" : "false");
     GGML_LOG_INFO("%s: hasUnifiedMemory      = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false");
@@ -1035,8 +1050,70 @@ struct ggml_backend_metal_buffer_context {
     // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
     int n_buffers;
     struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
+
+    // optional MTLResidencySet
+    id rset;
 };
 
+// rset init
+static bool ggml_backend_metal_buffer_rset_init(
+        struct ggml_backend_metal_buffer_context * ctx,
+        struct ggml_backend_metal_device_context * ctx_dev,
+        id<MTLDevice> device) {
+    ctx->rset = nil;
+
+    if (!ctx_dev->has_residency_sets) {
+        return true;
+    }
+
+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+    if (@available(macOS 15.0, *)) {
+        MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
+        desc.label = @"ggml_backend_metal";
+        desc.initialCapacity = ctx->n_buffers;
+
+        NSError * error;
+        ctx->rset = [device newResidencySetWithDescriptor:desc error:&error];
+        if (error) {
+            GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+            [desc release];
+            return false;
+        }
+
+        [desc release];
+
+        for (int i = 0; i < ctx->n_buffers; i++) {
+            [ctx->rset addAllocation:ctx->buffers[i].metal];
+        }
+
+        [ctx->rset commit];
+        [ctx->rset requestResidency];
+
+        return true;
+    }
+#else
+    GGML_UNUSED(ctx_dev);
+    GGML_UNUSED(device);
+#endif
+
+    return true;
+}
+
+// rset free
+static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer_context * ctx) {
+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+    if (@available(macOS 15.0, *)) {
+        if (ctx->rset) {
+            [ctx->rset endResidency];
+            [ctx->rset removeAllAllocations];
+            [ctx->rset release];
+        }
+    }
+#else
+    GGML_UNUSED(ctx);
+#endif
+}
+
 // finds the Metal buffer that contains the tensor data on the GPU device
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
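
The hunks that follow wire these helpers into the buffer lifecycle: each allocation path calls ggml_backend_metal_buffer_rset_init() once the MTLBuffer objects exist, and the free path calls ggml_backend_metal_buffer_rset_free() before releasing the device. A condensed sketch of that call pattern (not a verbatim copy of any single hunk; error handling abbreviated):

    // allocation paths (buffer_type_alloc_buffer / buffer_from_ptr):
    if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
        free(ctx);
        ggml_backend_metal_device_rel(ctx_dev);
        return NULL;
    }

    // teardown path (buffer_free_buffer):
    ggml_backend_metal_buffer_rset_free(ctx);
    ggml_backend_metal_device_rel(buffer->buft->device->context);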
@@ -4176,6 +4253,8 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
     for (int i = 0; i < ctx->n_buffers; i++) {
         [ctx->buffers[i].metal release];
     }
+
+    ggml_backend_metal_buffer_rset_free(ctx);
     ggml_backend_metal_device_rel(buffer->buft->device->context);
 
     if (ctx->owned) {
@@ -4198,19 +4277,19 @@ static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
 static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
     memset((char *)tensor->data + offset, value, size);
 
-    UNUSED(buffer);
+    GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     memcpy((char *)tensor->data + offset, data, size);
 
-    UNUSED(buffer);
+    GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     memcpy(data, (const char *)tensor->data + offset, size);
 
-    UNUSED(buffer);
+    GGML_UNUSED(buffer);
 }
 
 static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
@@ -4220,7 +4299,7 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c
     }
 
     return false;
 
-    UNUSED(buffer);
+    GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -4246,7 +4325,7 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
 static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
     return "Metal";
 
-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }
 
 static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
@@ -4270,8 +4349,8 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
     }
 #endif
 #endif
 
-    UNUSED(device);
-    UNUSED(size_aligned);
+    GGML_UNUSED(device);
+    GGML_UNUSED(size_aligned);
 }
 
 static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@@ -4284,7 +4363,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
         size_aligned += (size_page - (size_aligned % size_page));
     }
 
-    id<MTLDevice> device = ggml_backend_metal_device_acq(buft->device->context);
+    struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device->context;
+    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
 
     ctx->all_data = ggml_metal_host_malloc(size_aligned);
     ctx->all_size = size_aligned;
@@ -4307,7 +4387,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
     if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
         GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
         free(ctx);
-        ggml_backend_metal_device_rel(buft->device->context);
+        ggml_backend_metal_device_rel(ctx_dev);
+        return NULL;
+    }
+
+    if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
+        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+        free(ctx);
+        ggml_backend_metal_device_rel(ctx_dev);
         return NULL;
     }
@@ -4318,7 +4405,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
 static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 32;
 
-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }
 
 static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
@@ -4328,13 +4415,13 @@ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty
     return max_size;
 
-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }
 
 static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;
 
-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }
 
 ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
@@ -4357,7 +4444,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
 static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
     return "Metal_Mapped";
 
-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }
 
 static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) {
@@ -4400,7 +4487,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
         size_aligned += (size_page - (size_aligned % size_page));
     }
 
-    id<MTLDevice> device = ggml_backend_metal_device_acq(&g_ggml_ctx_dev_main);
+    struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
+    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
 
     // the buffer fits into the max buffer size allowed by the device
     if (size_aligned <= device.maxBufferLength) {
@@ -4453,6 +4541,13 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
         }
     }
 
+    if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
+        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+        free(ctx);
+        ggml_backend_metal_device_rel(ctx_dev);
+        return NULL;
+    }
+
     return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
 }
@@ -4461,7 +4556,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
 static const char * ggml_backend_metal_name(ggml_backend_t backend) {
     return "Metal";
 
-    UNUSED(backend);
+    GGML_UNUSED(backend);
 }
 
 static void ggml_backend_metal_free(ggml_backend_t backend) {
@@ -4766,6 +4861,13 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
         }
     }
 
+    if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
+        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+        free(ctx);
+        ggml_backend_metal_device_rel(ctx_dev);
+        return NULL;
+    }
+
     return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
 }
@@ -4779,7 +4881,7 @@ static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml
     return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name ||
            buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name;
 
-    UNUSED(dev);
+    GGML_UNUSED(dev);
 }
 
 static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {