diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index 0e5bf0ae..2e9ba58a 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -91,13 +91,14 @@ extern "C" { // (optional) complete all pending operations void (*GGML_CALL synchronize)(ggml_backend_t backend); - // compute graph with a plan + // create a plan for ggml_cgraph and free it ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); - void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); + // compute graph with a plan + enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); // compute graph without a plan (async) - bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph); + enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); // check if the backend supports an operation bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); diff --git a/ggml-backend.c b/ggml-backend.c index c86673b0..d60d9841 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -262,11 +262,11 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla backend->iface.graph_plan_free(backend, plan); } -void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { - backend->iface.graph_plan_compute(backend, plan); +enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + return backend->iface.graph_plan_compute(backend, plan); } -bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { return backend->iface.graph_compute(backend, cgraph); } @@ -732,15 +732,15 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g GGML_UNUSED(backend); } -GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { +GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; - ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); + return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); GGML_UNUSED(backend); } -GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); @@ -755,8 +755,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str cplan.abort_callback = cpu_ctx->abort_callback; cplan.abort_callback_data = cpu_ctx->abort_callback_data; - ggml_graph_compute(cgraph, &cplan); - return true; + return ggml_graph_compute(cgraph, &cplan); } GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { @@ -1437,7 +1436,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { return true; } -static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { +static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { uint64_t copy_us[GGML_MAX_BACKENDS] = {0}; uint64_t compute_us[GGML_MAX_BACKENDS] = {0}; @@ -1472,8 +1471,9 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { uint64_t compute_start_us = ggml_time_us(); if (!sched->callback_eval) { - if (!ggml_backend_graph_compute(split_backend, &split->graph)) { - return false; + enum ggml_status ec = ggml_backend_graph_compute(split_backend, &split->graph); + if (ec != GGML_STATUS_SUCCESS) { + return ec; } //ggml_backend_synchronize(split_backend); // necessary to measure compute time } else { @@ -1494,8 +1494,9 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); - if (!ggml_backend_graph_compute(split_backend, &gv)) { - return false; + enum ggml_status ec = ggml_backend_graph_compute(split_backend, &gv); + if (ec != GGML_STATUS_SUCCESS) { + return ec; } if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) { @@ -1519,7 +1520,7 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { } #endif - return true; + return GGML_STATUS_SUCCESS; } ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) { @@ -1581,7 +1582,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * return true; } -bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { +enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); if (!sched->is_reset) { @@ -1590,14 +1591,10 @@ bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cg ggml_backend_sched_split_graph(sched, graph); if (!ggml_backend_sched_alloc_splits(sched)) { - return false; + return GGML_STATUS_ALLOC_FAILED; } - if (!ggml_backend_sched_compute_splits(sched)) { - return false; - } - - return true; + return ggml_backend_sched_compute_splits(sched); } void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) { diff --git a/ggml-backend.h b/ggml-backend.h index 8fb54bd9..8bed2257 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -66,12 +66,13 @@ extern "C" { GGML_API void ggml_backend_synchronize(ggml_backend_t backend); - GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph); + GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph); + GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); - GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); - GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan); - GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); - GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op); + GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan); + GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); + + GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); // tensor copy between different backends GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); @@ -157,26 +158,26 @@ extern "C" { typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); // Initialize a backend scheduler - GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size); - GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); + GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size); + GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); // Initialize backend buffers from a measure graph - GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); + GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // Get the number of splits of the last graph - GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched); + GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched); - GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); + GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); - GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); - GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); + GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); + GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); // Allocate and compute graph on the backend scheduler - GGML_API bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); + GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // Reset all assignments and allocators - must be called before changing the node backends - GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched); + GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched); // Set a callback to be called for each resulting node during graph compute - GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data); + GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data); // // Utils diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 7d027a30..72bcec8c 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -12241,7 +12241,7 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { UNUSED(backend); } -GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_cuda_set_main_device(cuda_ctx->device); @@ -12277,7 +12277,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg GGML_ASSERT(ok); } - return true; + return GGML_STATUS_SUCCESS; } GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) { diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index e740a76d..83a7822f 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -1927,10 +1927,10 @@ static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(g return ggml_backend_kompute_buffer_type(ctx->device); } -static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { auto * ctx = static_cast(backend->context); ggml_vk_graph_compute(ctx, cgraph); - return true; + return GGML_STATUS_SUCCESS; } static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { diff --git a/ggml-metal.m b/ggml-metal.m index 6b5a8fdf..00df2283 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -748,7 +748,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const } } -static bool ggml_metal_graph_compute( +static enum ggml_status ggml_metal_graph_compute( struct ggml_metal_context * ctx, struct ggml_cgraph * gf) { @@ -2484,7 +2484,7 @@ static bool ggml_metal_graph_compute( MTLCommandBufferStatus status = [command_buffer status]; if (status != MTLCommandBufferStatusCompleted) { GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); - return false; + return GGML_STATUS_FAILED; } } @@ -2493,7 +2493,7 @@ static bool ggml_metal_graph_compute( } } - return true; + return GGML_STATUS_SUCCESS; } //////////////////////////////////////////////////////////////////////////////// @@ -2795,7 +2795,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe UNUSED(backend); } -GGML_CALL static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; return ggml_metal_graph_compute(metal_ctx, cgraph); diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index df619a88..aa73d67d 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -2231,7 +2231,7 @@ static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(gg GGML_UNUSED(backend); } -static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) { +static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) { for (int i = 0; i < graph->n_nodes; ++i) { ggml_tensor * node = graph->nodes[i]; switch (node->op) { @@ -2246,7 +2246,7 @@ static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgrap } } - return true; + return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); } diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index cad08d61..47a605b0 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -15581,7 +15581,7 @@ catch (sycl::exception const &exc) { std::exit(1); } -GGML_CALL static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; ggml_sycl_set_main_device(sycl_ctx->device); @@ -15613,7 +15613,7 @@ GGML_CALL static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, gg GGML_ASSERT(ok); } - return true; + return GGML_STATUS_SUCCESS; } GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) { diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index ae9cb3c1..bc316c3f 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -5092,7 +5092,7 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) { ctx->transfer_ctx = nullptr; } -GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; for (int i = 0; i < cgraph->n_nodes; i++) { @@ -5135,7 +5135,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml ggml_vk_graph_cleanup(ctx); - return true; + return GGML_STATUS_SUCCESS; UNUSED(backend); } diff --git a/ggml.c b/ggml.c index 870e4161..3c2e94c1 100644 --- a/ggml.c +++ b/ggml.c @@ -320,6 +320,16 @@ static ggml_fp16_t ggml_table_exp_f16[1 << 16]; // precomputed f32 table for f16 (256 KB) (ggml-impl.h) float ggml_table_f32_f16[1 << 16]; +const char * ggml_status_to_string(enum ggml_status status) { + switch (status) { + case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)"; + case GGML_STATUS_FAILED: return "GGML status: error (operation failed)"; + case GGML_STATUS_SUCCESS: return "GGML status: success"; + case GGML_STATUS_ABORTED: return "GGML status: warning (operation aborted)"; + default: GGML_ASSERT(false); + } +} + // note: do not use these inside ggml.c // these are meant to be used via the ggml.h API float ggml_fp16_to_fp32(ggml_fp16_t x) { @@ -17400,6 +17410,7 @@ struct ggml_compute_state { ggml_thread_t thrd; int ith; struct ggml_compute_state_shared * shared; + enum ggml_status ec; }; static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) { @@ -17693,7 +17704,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { while (true) { if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { state->shared->node_n += 1; - return (thread_ret_t) GGML_EXIT_ABORTED; + state->ec = GGML_STATUS_ABORTED; + return 0; } if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { @@ -17815,7 +17827,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } } - return GGML_EXIT_SUCCESS; + return 0; } struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) { @@ -18011,7 +18023,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa return cplan; } -int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { +enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { { GGML_ASSERT(cplan); GGML_ASSERT(cplan->n_threads > 0); @@ -18055,6 +18067,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { .thrd = 0, .ith = j, .shared = &state_shared, + .ec = GGML_STATUS_SUCCESS, }; const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); @@ -18065,12 +18078,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { workers[0].ith = 0; workers[0].shared = &state_shared; + workers[0].ec = GGML_STATUS_SUCCESS; const int64_t perf_start_cycles = ggml_perf_cycles(); const int64_t perf_start_time_us = ggml_perf_time_us(); // this is a work thread too - int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]); + ggml_graph_compute_thread(&workers[0]); + enum ggml_status compute_status = workers[0].ec; // don't leave affinity set on the main thread clear_numa_thread_affinity(); @@ -18080,6 +18095,8 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { for (int j = 1; j < n_threads; j++) { const int rc = ggml_thread_join(workers[j].thrd, NULL); GGML_ASSERT(rc == 0); + if (workers[j].ec != GGML_STATUS_SUCCESS) + compute_status = workers[j].ec; } } @@ -18107,14 +18124,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { return compute_status; } -void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { +enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; - ggml_graph_compute(cgraph, &cplan); + return ggml_graph_compute(cgraph, &cplan); } struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { diff --git a/ggml.h b/ggml.h index 98cfc7bf..0ea4f884 100644 --- a/ggml.h +++ b/ggml.h @@ -315,6 +315,16 @@ extern "C" { #endif + enum ggml_status { + GGML_STATUS_ALLOC_FAILED = -2, + GGML_STATUS_FAILED = -1, + GGML_STATUS_SUCCESS = 0, + GGML_STATUS_ABORTED = 1, + }; + + // get ggml_status name string + GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status); + typedef uint16_t ggml_fp16_t; // convert FP16 <-> FP32 @@ -1940,12 +1950,11 @@ extern "C" { // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data - GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); - GGML_API int ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); - + GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); + GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); // same as ggml_graph_compute() but the work data is allocated as a part of the context // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data - GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); + GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);