mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2024-12-19 04:37:51 +00:00
Vulkan k-quant mmq and ggml-backend offload functionality (llama/6155)
* Fix Vulkan no kv offload incoherence
* Add k-quant mul mat mat shaders
* Rework working buffer allocation, reduces vram use noticeably
  Clean up cpu assist code, replaced with ggml-backend offload function
* Default to all dedicated GPUs
* Add fallback for integrated GPUs if no dedicated GPUs are found
* Add debug info which device is allocating memory
* Fix Intel dequant issue
  Fix validation issue
* Fix Vulkan GGML_OP_GET_ROWS implementation
* Clean up merge artifacts
* Remove Vulkan warning
This commit is contained in:
parent
b83a9fc9d3
commit
fa966b9b40
633
ggml-vulkan.cpp
633
ggml-vulkan.cpp
File diff suppressed because it is too large
Load Diff
@ -11,17 +11,6 @@ extern "C" {
|
|||||||
#define GGML_VK_MAX_DEVICES 16
|
#define GGML_VK_MAX_DEVICES 16
|
||||||
|
|
||||||
GGML_API void ggml_vk_instance_init(void);
|
GGML_API void ggml_vk_instance_init(void);
|
||||||
GGML_API void ggml_vk_init_cpu_assist(void);
|
|
||||||
|
|
||||||
GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
|
|
||||||
GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
|
|
||||||
GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
|
|
||||||
GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
|
||||||
#ifdef GGML_VULKAN_CHECK_RESULTS
|
|
||||||
void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
|
||||||
#endif
|
|
||||||
GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
|
|
||||||
GGML_API void ggml_vk_free_cpu_assist(void);
|
|
||||||
|
|
||||||
// backend API
|
// backend API
|
||||||
GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
|
GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
|
||||||
|
35
ggml.c
35
ggml.c
@ -278,8 +278,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
|
|||||||
#include <Accelerate/Accelerate.h>
|
#include <Accelerate/Accelerate.h>
|
||||||
#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
|
#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
|
||||||
#include "ggml-opencl.h"
|
#include "ggml-opencl.h"
|
||||||
#elif defined(GGML_USE_VULKAN)
|
|
||||||
#include "ggml-vulkan.h"
|
|
||||||
#endif
|
#endif
|
||||||
#elif defined(GGML_USE_OPENBLAS)
|
#elif defined(GGML_USE_OPENBLAS)
|
||||||
#if defined(GGML_BLAS_USE_MKL)
|
#if defined(GGML_BLAS_USE_MKL)
|
||||||
@ -289,8 +287,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
|
|||||||
#endif
|
#endif
|
||||||
#elif defined(GGML_USE_CLBLAST)
|
#elif defined(GGML_USE_CLBLAST)
|
||||||
#include "ggml-opencl.h"
|
#include "ggml-opencl.h"
|
||||||
#elif defined(GGML_USE_VULKAN)
|
|
||||||
#include "ggml-vulkan.h"
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// floating point type used to accumulate sums
|
// floating point type used to accumulate sums
|
||||||
@ -2717,8 +2713,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|||||||
|
|
||||||
#if defined(GGML_USE_CLBLAST)
|
#if defined(GGML_USE_CLBLAST)
|
||||||
ggml_cl_init();
|
ggml_cl_init();
|
||||||
#elif defined(GGML_USE_VULKAN)
|
|
||||||
ggml_vk_init_cpu_assist();
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
ggml_setup_op_has_task_pass();
|
ggml_setup_op_has_task_pass();
|
||||||
@ -16128,20 +16122,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(GGML_USE_VULKAN)
|
|
||||||
const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
|
|
||||||
#ifdef GGML_VULKAN_CHECK_RESULTS
|
|
||||||
if (skip_cpu) {
|
|
||||||
ggml_vk_check_results_1_cpu_assist(params, tensor);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
if (skip_cpu) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
|
|
||||||
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
|
|
||||||
#endif // GGML_USE_VULKAN
|
|
||||||
|
|
||||||
switch (tensor->op) {
|
switch (tensor->op) {
|
||||||
case GGML_OP_DUP:
|
case GGML_OP_DUP:
|
||||||
{
|
{
|
||||||
@ -18617,17 +18597,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef GGML_USE_VULKAN
|
|
||||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
||||||
ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
|
|
||||||
}
|
|
||||||
ggml_vk_preallocate_buffers_cpu_assist();
|
|
||||||
|
|
||||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
||||||
ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
const int n_threads = cplan->n_threads;
|
const int n_threads = cplan->n_threads;
|
||||||
|
|
||||||
struct ggml_compute_state_shared state_shared = {
|
struct ggml_compute_state_shared state_shared = {
|
||||||
@ -18684,10 +18653,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef GGML_USE_VULKAN
|
|
||||||
ggml_vk_graph_cleanup_cpu_assist();
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// performance stats (graph)
|
// performance stats (graph)
|
||||||
{
|
{
|
||||||
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
|
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
|
||||||
|
Loading…
Reference in New Issue
Block a user