From 43e3d25d9365676ab24ac82e8dee624cbdfe0683 Mon Sep 17 00:00:00 2001 From: David Huang <1969802+hjc4869@users.noreply.github.com> Date: Tue, 15 Apr 2025 17:20:38 +0800 Subject: [PATCH] CUDA/HIP: Share the same unified memory allocation logic. (llama/12934) Replace compile-time `GGML_HIP_UMA` with environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY`. This unifies the usage on NVIDIA and AMD GPUs, and allows a single binary to be shared between integrated and dedicated GPUs. --- ggml/CMakeLists.txt | 1 - ggml/src/ggml-cuda/ggml-cuda.cu | 31 ++++++++++++++++--------------- ggml/src/ggml-cuda/vendors/hip.h | 2 ++ ggml/src/ggml-hip/CMakeLists.txt | 4 ---- 4 files changed, 18 insertions(+), 20 deletions(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index d33f843b..438c2a73 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -170,7 +170,6 @@ option(GGML_HIP "ggml: use HIP" option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF) option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON) option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF) -option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF) option(GGML_VULKAN "ggml: use Vulkan" OFF) option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF) option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 4af18970..9ced4665 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -96,31 +96,32 @@ int ggml_cuda_get_device() { static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) { ggml_cuda_set_device(device); -#if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA) - auto res = hipMallocManaged(ptr, size); - if (res == hipSuccess) { - // if error we "need" to know why... - CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device)); - } - return res; -#else - -#if !defined(GGML_USE_HIP) cudaError_t err; if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) { err = cudaMallocManaged(ptr, size); +#if defined(GGML_USE_HIP) + if (err == hipSuccess) { + CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device)); + } + + // fall back to cudaMalloc if not supported (e.g. on Windows) + if (err == hipErrorNotSupported) { + static bool warned_unsupported = false; + if (!warned_unsupported) { + GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n"); + warned_unsupported = true; + } + + err = cudaMalloc(ptr, size); + } +#endif // defined(GGML_USE_HIP) } else { err = cudaMalloc(ptr, size); } return err; -#else - return cudaMalloc(ptr, size); -#endif // !defined(GGML_USE_HIP) - -#endif } #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index 420b41b8..1a28831b 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -71,6 +71,8 @@ #define cudaLaunchHostFunc hipLaunchHostFunc #define cudaMalloc hipMalloc #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) +#define cudaMallocManaged hipMallocManaged +#define cudaMemAdvise hipMemAdvise #define cudaMemcpy hipMemcpy #define cudaMemcpyAsync hipMemcpyAsync #define cudaMemcpyPeerAsync hipMemcpyPeerAsync diff --git a/ggml/src/ggml-hip/CMakeLists.txt b/ggml/src/ggml-hip/CMakeLists.txt index e3762649..1fe8fe3b 100644 --- a/ggml/src/ggml-hip/CMakeLists.txt +++ b/ggml/src/ggml-hip/CMakeLists.txt @@ -89,10 +89,6 @@ endif() add_compile_definitions(GGML_USE_HIP) -if (GGML_HIP_UMA) - add_compile_definitions(GGML_HIP_UMA) -endif() - if (GGML_CUDA_FORCE_MMQ) add_compile_definitions(GGML_CUDA_FORCE_MMQ) endif()