mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-04-28 06:49:42 +00:00
CUDA/HIP: Share the same unified memory allocation logic. (llama/12934)
Replace compile-time `GGML_HIP_UMA` with environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY`. This unifies the usage on NVIDIA and AMD GPUs, and allows a single binary to be shared between integrated and dedicated GPUs.
This commit is contained in:
parent
e1dbf9a42e
commit
43e3d25d93
@ -170,7 +170,6 @@ option(GGML_HIP "ggml: use HIP"
|
|||||||
option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
|
option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
|
||||||
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
|
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
|
||||||
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
|
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
|
||||||
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
|
|
||||||
option(GGML_VULKAN "ggml: use Vulkan" OFF)
|
option(GGML_VULKAN "ggml: use Vulkan" OFF)
|
||||||
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
|
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
|
||||||
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
|
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
|
||||||
|
@ -96,31 +96,32 @@ int ggml_cuda_get_device() {
|
|||||||
|
|
||||||
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
|
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
|
||||||
ggml_cuda_set_device(device);
|
ggml_cuda_set_device(device);
|
||||||
#if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
|
|
||||||
auto res = hipMallocManaged(ptr, size);
|
|
||||||
if (res == hipSuccess) {
|
|
||||||
// if error we "need" to know why...
|
|
||||||
CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
|
|
||||||
}
|
|
||||||
return res;
|
|
||||||
#else
|
|
||||||
|
|
||||||
#if !defined(GGML_USE_HIP)
|
|
||||||
cudaError_t err;
|
cudaError_t err;
|
||||||
if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
|
if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
|
||||||
{
|
{
|
||||||
err = cudaMallocManaged(ptr, size);
|
err = cudaMallocManaged(ptr, size);
|
||||||
|
#if defined(GGML_USE_HIP)
|
||||||
|
if (err == hipSuccess) {
|
||||||
|
CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
|
||||||
|
}
|
||||||
|
|
||||||
|
// fall back to cudaMalloc if not supported (e.g. on Windows)
|
||||||
|
if (err == hipErrorNotSupported) {
|
||||||
|
static bool warned_unsupported = false;
|
||||||
|
if (!warned_unsupported) {
|
||||||
|
GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n");
|
||||||
|
warned_unsupported = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
err = cudaMalloc(ptr, size);
|
||||||
|
}
|
||||||
|
#endif // defined(GGML_USE_HIP)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
err = cudaMalloc(ptr, size);
|
err = cudaMalloc(ptr, size);
|
||||||
}
|
}
|
||||||
return err;
|
return err;
|
||||||
#else
|
|
||||||
return cudaMalloc(ptr, size);
|
|
||||||
#endif // !defined(GGML_USE_HIP)
|
|
||||||
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
|
2
ggml/src/ggml-cuda/vendors/hip.h
vendored
2
ggml/src/ggml-cuda/vendors/hip.h
vendored
@ -71,6 +71,8 @@
|
|||||||
#define cudaLaunchHostFunc hipLaunchHostFunc
|
#define cudaLaunchHostFunc hipLaunchHostFunc
|
||||||
#define cudaMalloc hipMalloc
|
#define cudaMalloc hipMalloc
|
||||||
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
||||||
|
#define cudaMallocManaged hipMallocManaged
|
||||||
|
#define cudaMemAdvise hipMemAdvise
|
||||||
#define cudaMemcpy hipMemcpy
|
#define cudaMemcpy hipMemcpy
|
||||||
#define cudaMemcpyAsync hipMemcpyAsync
|
#define cudaMemcpyAsync hipMemcpyAsync
|
||||||
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
|
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
|
||||||
|
@ -89,10 +89,6 @@ endif()
|
|||||||
|
|
||||||
add_compile_definitions(GGML_USE_HIP)
|
add_compile_definitions(GGML_USE_HIP)
|
||||||
|
|
||||||
if (GGML_HIP_UMA)
|
|
||||||
add_compile_definitions(GGML_HIP_UMA)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (GGML_CUDA_FORCE_MMQ)
|
if (GGML_CUDA_FORCE_MMQ)
|
||||||
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
|
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
|
||||||
endif()
|
endif()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user