Mirror of https://github.com/ggerganov/whisper.cpp.git
Basic Vulkan Multi-GPU implementation (llama/5321)
* Initial Vulkan multi-gpu implementation

Move most global variables into backend context

* Add names to backend device functions

* Add further missing cleanup code

* Reduce code duplication in tensor split layer assignment

* generalize LLAMA_SPLIT_LAYER for all backends, do not expose device count and memory in llama.h

* Only do device info print in the beginning and initialize one backend for cpu assist

Add missing cleanup code

* Rework backend memory management to make sure devices and buffers get properly allocated and freed

* Rename cpu assist free function

---------

Co-authored-by: slaren <slarengh@gmail.com>
commit ef5e6b746f
parent 77bf6b5f56
ggml.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
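For orientation before the diff: the ggml.c changes are pure renames that give the Vulkan "CPU assist" entry points their own names, distinct from the new multi-GPU backend API. They are collected as declarations below. The argument lists are read off the call sites in the hunks; the return types (except the bool, which is visible in the diff) are assumptions, since the real prototypes live in the Vulkan backend headers, not here.

// Renames applied in ggml.c (old name in the trailing comment). Prototypes
// are sketched from the call sites in this diff; return types other than
// the bool are assumptions.
void ggml_vk_init_cpu_assist(void);                                              // was ggml_vk_init
bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params,
                                        struct ggml_tensor * tensor);            // was ggml_vk_compute_forward
void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params,
                                        struct ggml_tensor * tensor);            // was ggml_vk_check_results_1
void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);    // was ggml_vk_preallocate_buffers_graph
void ggml_vk_preallocate_buffers_cpu_assist(void);                               // was ggml_vk_preallocate_buffers
void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);  // was ggml_vk_build_graph
void ggml_vk_graph_cleanup_cpu_assist(void);                                     // was ggml_vk_graph_cleanup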
--- a/ggml.c
+++ b/ggml.c
@@ -2343,7 +2343,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 #elif defined(GGML_USE_CLBLAST)
         ggml_cl_init();
 #elif defined(GGML_USE_VULKAN)
-        ggml_vk_init();
+        ggml_vk_init_cpu_assist();
 #elif defined(GGML_USE_SYCL)
         ggml_init_sycl();
 #endif
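The rename in this first hunk separates single-device "CPU assist" initialization from full multi-GPU backend setup. Going only by the commit message bullet ("Only do device info print in the beginning and initialize one backend for cpu assist"), a plausible shape of the new function is sketched below; every helper named in it is an assumption, not verbatim ggml-vulkan API.

// Hedged sketch of ggml_vk_init_cpu_assist(), inferred from the commit
// message alone. All helpers called here are assumptions about ggml-vulkan.
void ggml_vk_init_cpu_assist(void) {
    ggml_vk_instance_init();                        // assumed: one-time VkInstance setup
    const int n_dev = ggml_vk_get_device_count();   // assumed helper
    for (int i = 0; i < n_dev; i++) {
        ggml_vk_print_gpu_info(i);                  // device info print, once, up front
    }
    ggml_backend_vk_init(0);                        // assumed: bring up a single backend for CPU assist
}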
@@ -14850,10 +14850,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
     GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
     GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
 #elif defined(GGML_USE_VULKAN)
-    const bool skip_cpu = ggml_vk_compute_forward(params, tensor);
+    const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
 #ifdef GGML_VULKAN_CHECK_RESULTS
     if (skip_cpu) {
-        ggml_vk_check_results_1(params, tensor);
+        ggml_vk_check_results_1_cpu_assist(params, tensor);
     }
 #endif
     if (skip_cpu) {
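Apart from the renames, the pattern in this hunk is unchanged: the Vulkan CPU-assist hook sees each op first and returns true when it ran the op on the GPU, in which case the CPU kernel is skipped; under GGML_VULKAN_CHECK_RESULTS the GPU result is additionally verified against the CPU one. A self-contained sketch of that dispatch, where compute_one_op stands in for the body of ggml_compute_forward and is not a real ggml symbol:

#include <stdbool.h>

struct ggml_compute_params;
struct ggml_tensor;

// Declarations as used at the call sites in this hunk.
bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);

static void compute_one_op(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
#if defined(GGML_USE_VULKAN)
    // Give the Vulkan CPU-assist path first crack at the op.
    const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
#ifdef GGML_VULKAN_CHECK_RESULTS
    if (skip_cpu) {
        // Debug builds: re-run on CPU and compare with the GPU result.
        ggml_vk_check_results_1_cpu_assist(params, tensor);
    }
#endif
    if (skip_cpu) {
        return; // the GPU already produced this tensor
    }
#endif
    // ... per-op CPU implementations follow in the real function ...
    (void) params; (void) tensor;
}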
@@ -17269,12 +17269,12 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
 
 #ifdef GGML_USE_VULKAN
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]);
+        ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
     }
-    ggml_vk_preallocate_buffers();
+    ggml_vk_preallocate_buffers_cpu_assist();
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
+        ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
     }
 #endif
 
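Note the two-pass structure this hunk preserves: a first loop over the nodes, one preallocation call, then a second loop that builds per-node work. That the first pass sizes buffers and the single call then allocates them once for the whole graph is a reading suggested by the function names (and by the commit's reworked backend memory management); the fragment below restates the hunk with that reading spelled out. It lives inside ggml_graph_compute(), so cgraph is the graph being executed.

#ifdef GGML_USE_VULKAN
    // Pass 1 (inferred): let the backend record how much buffer space each
    // node in the graph will need.
    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
    }
    // One allocation for the whole graph rather than per-node allocations.
    ggml_vk_preallocate_buffers_cpu_assist();

    // Pass 2: build the per-node GPU work; the flag marks the final node.
    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
    }
#endif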
@@ -17330,7 +17330,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
     }
 
 #ifdef GGML_USE_VULKAN
-    ggml_vk_graph_cleanup();
+    ggml_vk_graph_cleanup_cpu_assist();
 #endif
 
     // performance stats (graph)