Basic Vulkan Multi-GPU implementation (llama/5321)

* Initial Vulkan multi-GPU implementation

Move most global variables into backend context
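
A minimal sketch of this pattern, with hypothetical names (`vk_device_ctx`, `vk_device_ctx_new`, `vk_device_ctx_free` are illustrative, not the actual ggml-vulkan symbols): state that previously lived in file-scope globals becomes per-backend, so several devices can coexist.

    /* Illustrative only: per-device state grouped into a context that each
     * backend instance owns, instead of file-scope globals shared by all. */
    #include <stdlib.h>

    struct vk_device_ctx {
        int   device_idx;  /* index of the Vulkan physical device */
        void *pipelines;   /* placeholder for compiled compute pipelines */
        void *buffer_pool; /* placeholder for preallocated buffers */
    };

    static struct vk_device_ctx * vk_device_ctx_new(int device_idx) {
        struct vk_device_ctx *ctx = calloc(1, sizeof(*ctx));
        if (ctx) {
            ctx->device_idx = device_idx;
        }
        return ctx;
    }

    static void vk_device_ctx_free(struct vk_device_ctx *ctx) {
        /* pipelines and buffers would be released here before the context */
        free(ctx);
    }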

* Add names to backend device functions

* Add further missing cleanup code

* Reduce code duplication in tensor split layer assignment

* Generalize LLAMA_SPLIT_LAYER for all backends; do not expose device count and memory in llama.h
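
A rough sketch of proportional layer assignment (illustrative names only, assuming per-device split fractions that need not sum to 1; this is not the actual llama.cpp code):

    /* Illustrative only: map layer i of n_layer onto one of n_device devices
     * according to cumulative tensor_split fractions, so whole layers are
     * distributed in proportion to each device's share. */
    static int layer_to_device(int layer, int n_layer,
                               const float *tensor_split, int n_device) {
        float total = 0.0f;
        for (int d = 0; d < n_device; d++) {
            total += tensor_split[d];
        }
        const float point = (layer + 0.5f) / (float) n_layer;
        float boundary = 0.0f;
        for (int d = 0; d < n_device; d++) {
            boundary += tensor_split[d] / total;
            if (point <= boundary) {
                return d;
            }
        }
        return n_device - 1; /* guard against float rounding at the top end */
    }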

* Only print device info at startup and initialize a single backend for CPU assist

Add missing cleanup code

* Rework backend memory management to ensure devices and buffers are properly allocated and freed
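
The ownership rule this aims at, sketched with hypothetical names (`backend_buffer` and `backend_free` are illustrative): every buffer is tied to the backend that allocated it, and freeing the backend releases everything it still owns.

    /* Illustrative only: an intrusive list of live allocations per backend,
     * freed in one pass so no device memory outlives its backend. */
    #include <stdlib.h>

    struct backend_buffer {
        struct backend_buffer *next;
        void                  *data; /* stands in for device memory */
    };

    struct backend {
        struct backend_buffer *buffers; /* head of the live-allocation list */
    };

    static void backend_free(struct backend *b) {
        for (struct backend_buffer *buf = b->buffers; buf != NULL; ) {
            struct backend_buffer *next = buf->next;
            free(buf->data);
            free(buf);
            buf = next;
        }
        free(b);
    }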

* Rename the CPU assist free function

---------

Co-authored-by: slaren <slarengh@gmail.com>
commit ef5e6b746f (parent 77bf6b5f56)
Author: 0cc4m
Date: 2024-02-07 07:54:50 +01:00
Committed by: Georgi Gerganov

 ggml.c | 14 +++++++-------

--- a/ggml.c
+++ b/ggml.c

@@ -2343,7 +2343,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 #elif defined(GGML_USE_CLBLAST)
     ggml_cl_init();
 #elif defined(GGML_USE_VULKAN)
-    ggml_vk_init();
+    ggml_vk_init_cpu_assist();
 #elif defined(GGML_USE_SYCL)
     ggml_init_sycl();
 #endif
@@ -14850,10 +14850,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
     GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
 #elif defined(GGML_USE_VULKAN)
-    const bool skip_cpu = ggml_vk_compute_forward(params, tensor);
+    const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
 #ifdef GGML_VULKAN_CHECK_RESULTS
     if (skip_cpu) {
-        ggml_vk_check_results_1(params, tensor);
+        ggml_vk_check_results_1_cpu_assist(params, tensor);
     }
 #endif
     if (skip_cpu) {
@@ -17269,12 +17269,12 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
 #ifdef GGML_USE_VULKAN
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]);
+        ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
     }
-    ggml_vk_preallocate_buffers();
+    ggml_vk_preallocate_buffers_cpu_assist();
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
+        ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
     }
 #endif
@@ -17330,7 +17330,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
     }
 #ifdef GGML_USE_VULKAN
-    ggml_vk_graph_cleanup();
+    ggml_vk_graph_cleanup_cpu_assist();
 #endif
     // performance stats (graph)