CANN: Optimize CANN buffer pool memory management (llama/12875)

Multiple optional memory pools are provided for CANN, including VMM,
priority queue-based, and traditional memory pools.
1. When VMM is available on the device and the environment variable
   GGML_CANN_DISABLE_VMM_POOL is not set, the VMM pool is selected by default.
2. Otherwise, if the environment variable GGML_CANN_ENABLE_BUF_PRIO_POOL is set,
   the priority queue-based memory pool is used.
3. If neither condition is met, the default memory pool (the buffer pool) is used.
This commit is contained in:
Xinpeng Dou 2025-04-15 10:04:24 +08:00 committed by Georgi Gerganov
parent d049d67065
commit 622f981853
2 changed files with 335 additions and 64 deletions

View File

@ -1783,7 +1783,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb, src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb,
GGML_MAX_DIMS + 1); GGML_MAX_DIMS + 1);
aclTensor* acl_scale_tensor = ggml_cann_create_tensor( aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
src0->data, ACL_FLOAT16, sizeof(float16_t), scale_ne, scale_nb, src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset); GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
aclTensor* dequant_tensor = ggml_cann_create_tensor( aclTensor* dequant_tensor = ggml_cann_create_tensor(
dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float_t), dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float_t),

View File

@ -29,6 +29,8 @@
#include <cstdio> #include <cstdio>
#include <cstring> #include <cstring>
#include <mutex> #include <mutex>
#include <queue>
#include <chrono>
#include "ggml-impl.h" #include "ggml-impl.h"
#include "ggml-backend-impl.h" #include "ggml-backend-impl.h"
@ -119,9 +121,10 @@ static ggml_cann_device_info ggml_cann_init() {
prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE; prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
prop.location.id = id; prop.location.id = id;
prop.reserve = 0; prop.reserve = 0;
ACL_CHECK(aclrtMemGetAllocationGranularity( err = aclrtMemGetAllocationGranularity(
&prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED, &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
&info.devices[id].vmm_granularity)); &info.devices[id].vmm_granularity);
info.devices[id].vmm = err == ACL_SUCCESS;
size_t free, total; size_t free, total;
ggml_backend_cann_get_device_memory(id, &free, &total); ggml_backend_cann_get_device_memory(id, &free, &total);
@ -148,11 +151,222 @@ const ggml_cann_device_info& ggml_cann_info() {
//#define DEBUG_CANN_MALLOC //#define DEBUG_CANN_MALLOC
/** /**
* @brief A pool of CANN buffers(legacy). * @brief A pool of CANN buffers(priority segment buffer).
* *
* This class manages a pool of CANN buffers for a specific device. * This class manages a pool of CANN buffers for a specific device.
*/ */
struct ggml_cann_pool_leg : public ggml_cann_pool { struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
/**
* @brief The maximum reuse margin for a buffer.
*/
static const size_t max_reuse_margin = 1ull << 22; // 4MB
/**
* @brief The minimum free margin for a buffer.
*/
static const size_t min_free_margin = 1ull << 20; // 1MB
/**
* @brief The alignment for buffer allocation.
*/
static const size_t alignment = 128;
/**
* @brief The device ID associated with this buffer pool.
*/
int device;
/**
* @brief Whether to disable clean during buffer allocation.
*/
bool disable_clean = false;
/**
* @brief Structure representing a CANN buffer.
*/
struct ggml_cann_buffer {
    void*  ptr  = nullptr;  ///< Address of the buffer memory.
    size_t size = 0;        ///< Size recorded for the buffer.
    std::chrono::steady_clock::time_point last_used;  ///< Time the buffer was last handed back.

    /**
     * @brief Greater-than comparison used to order buffers in the free queue.
     *
     * Only the size takes part in the comparison; pointer and timestamp are
     * ignored.
     */
    bool operator>(const ggml_cann_buffer& rhs) const {
        return rhs.size < size;
    }
};
/**
 * @brief Bookkeeping for the buffers owned by this pool.
 *
 * buffer_pool maps every pointer allocated by alloc() to the size it was
 * allocated with. An entry stays in the map while the buffer is handed out
 * and while it sits in the free queue; it is erased only when the buffer is
 * released with aclrtFree.
 *
 * free_buffers holds the buffers that were handed back through free() and
 * can be reused. Because the comparator is std::greater<> (which relies on
 * ggml_cann_buffer::operator>, i.e. size only), top() is the smallest free
 * buffer.
 */
std::unordered_map<void*, size_t> buffer_pool;
std::priority_queue<ggml_cann_buffer,
std::vector<ggml_cann_buffer>,
std::greater<>> free_buffers ;
/**
* @brief Total size of all buffers in the pool.
*/
size_t pool_size = 0;
/**
* @brief Constructor to initialize the buffer pool for a specific device.
*
* @param device The device ID to associate with this buffer pool.
*/
explicit ggml_cann_pool_buf_prio(int device) : device(device) {
    // Cleaning of idle buffers is turned off as soon as the variable exists
    // in the environment; its value is not inspected.
    const char* no_clean_env = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN");
    disable_clean = (no_clean_env != nullptr);
}
/**
* @brief Destructor to free all buffers in the pool.
*/
~ggml_cann_pool_buf_prio() {
    ggml_cann_set_device(device);

    // buffer_pool tracks every buffer the pool still owns (handed out or
    // sitting in the free queue), so walking it releases all of them.
    // The result of aclrtFree is not checked here.
    for (const auto& entry : buffer_pool) {
        aclrtFree(entry.first);
        pool_size -= entry.second;
    }
    buffer_pool.clear();

    // Every tracked size has been subtracted, so nothing may remain.
    GGML_ASSERT(pool_size == 0);
}
/**
* @brief Allocate a buffer of the given size.
*
* @param size The size of the buffer to allocate.
* @param actual_size A pointer to a variable to receive the actual size of
* the allocated buffer.
* @return A pointer to the allocated buffer.
*/
void* alloc(size_t size, size_t* actual_size) override {
// Serves the request from the free queue when a suitable buffer exists,
// otherwise allocates a new buffer with aclrtMalloc. While the free queue is
// scanned, buffers that are not reused may be released (see should_clean).
// *actual_size always receives the size of the buffer that is returned, which
// can be larger than the requested size when a buffer is reused.
//
// GGML_PAD is applied with `alignment` (presumably rounding size up to a
// multiple of it -- confirm against its definition); a result of 0 is replaced
// by one alignment unit so that a zero-sized buffer is never requested.
size = GGML_PAD(size, alignment);
if (size == 0) {
size = alignment;
}
void* ptr = nullptr;
// One timestamp is taken up front and used for every idle-time check below.
auto now = std::chrono::steady_clock::now();
// Buffers popped from the queue that are neither reused nor released are
// parked here and pushed back once the scan is over.
std::vector<ggml_cann_buffer> free_buffers_rest;
free_buffers_rest.reserve(free_buffers.size());
// free_buffers is ordered with std::greater<>, so top() yields the smallest
// free buffer first.
while (!free_buffers.empty()) {
auto b = free_buffers.top();
free_buffers.pop();
if (b.size >= size) {
// reuse the buffer if it is large enough and wastes at most
// max_reuse_margin bytes beyond the request
const size_t margin = b.size - size;
if (margin <= max_reuse_margin) {
*actual_size = b.size;
ptr = b.ptr;
#ifdef DEBUG_CANN_MALLOC
GGML_LOG_INFO(
"cann pool[%d]: reused %p, "
"pool_size = %5u MB, "
"size = %5u MB, "
"margin = %5u MB\n",
device, b.ptr,
(uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
(uint32_t)(GGML_PAD(size, 1048576) / 1048576),
(uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
#endif
// The chosen buffer was already popped, so it is no longer in the
// free queue; the remaining queue entries are left untouched.
break;
}
}
// A buffer that was not reused (too small, or too large for the reuse
// margin) is released when cleaning is enabled, it is larger than
// min_free_margin, and more than 100 ms have passed since free() stamped it.
bool should_clean = !disable_clean &&
b.size > min_free_margin &&
std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
if (should_clean) {
// release the idle buffer and drop it from the pool's bookkeeping
ACL_CHECK(aclrtFree(b.ptr));
pool_size -= b.size;
buffer_pool.erase(b.ptr);
#ifdef DEBUG_CANN_MALLOC
GGML_LOG_INFO(
"cann pool[%d]: clean %p, "
"pool_size = %5u MB, "
"size = %5u MB\n",
device, b.ptr,
(uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
(uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
#endif
continue;
}
free_buffers_rest.push_back(b);
}
// Put the parked buffers back into the free queue.
for (ggml_cann_buffer &b : free_buffers_rest) {
free_buffers.push(std::move(b));
}
#ifdef DEBUG_CANN_MALLOC
GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
#endif
// A reusable buffer was found during the scan.
if (ptr != nullptr) {
return ptr;
}
// allocate a new buffer if no buffer can be reused
ggml_cann_set_device(device);
ACL_CHECK(aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
*actual_size = size;
pool_size += size;
#ifdef DEBUG_CANN_MALLOC
GGML_LOG_INFO(
"cann pool[%d]: allocate %p, "
"pool_size = %5u MB, "
"size = %5u MB\n",
device, ptr, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
(uint32_t)(GGML_PAD(size, 1048576) / 1048576));
#endif
// Record the new buffer so that free() and the destructor can find it.
buffer_pool.emplace(ptr, size);
return ptr;
}
/**
* @brief Free a buffer and return it to the pool.
*
* @param ptr Pointer to the buffer to free.
* @param size Size of the buffer to free.
*/
void free(void* ptr, size_t size) override {
auto it = buffer_pool.find(ptr);
if (it == buffer_pool.end()) {
GGML_ABORT("cann pool[%d]: buffer %p not found in pool\n", device, ptr);
}
auto now = std::chrono::steady_clock::now();
free_buffers.emplace(ggml_cann_buffer{ptr, it->second, now});
#ifdef DEBUG_CANN_MALLOC
GGML_LOG_INFO(
"cann pool[%d]: return %p, "
"pool_size = %5u MB\n",
device, ptr,
(uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
#endif
}
};
/**
* @brief A pool of CANN buffers(segment buffer).
*
* This class manages a pool of CANN buffers for a specific device.
*/
struct ggml_cann_pool_buf : public ggml_cann_pool {
/**
* @brief The maximum reuse margin for a buffer.
*/
static const size_t max_reuse_margin = 1ull << 22; // 4MB
/**
* @brief The minimum free margin for a buffer.
*/
static const size_t min_free_margin = 1ull << 20; // 1MB
/**
* @brief The alignment for buffer allocation.
*/
static const size_t alignment = 128;
/** /**
* @brief The maximum number of buffers in the pool. * @brief The maximum number of buffers in the pool.
*/ */
@ -163,12 +377,19 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
*/ */
int device; int device;
/**
* @brief Whether to disable clean during buffer allocation.
*/
bool disable_clean = false;
/** /**
* @brief Structure representing a CANN buffer. * @brief Structure representing a CANN buffer.
*/ */
struct ggml_cann_buffer { struct ggml_cann_buffer {
void* ptr = nullptr; ///< Pointer to the buffer memory. void* ptr = nullptr; ///< Pointer to the buffer memory.
size_t size = 0; ///< Size of the buffer. size_t size = 0; ///< Size of the buffer.
bool used = false; ///< Whether the buffer is currently in use.
std::chrono::steady_clock::time_point last_used; ///< Last used time.
}; };
/** /**
@ -186,17 +407,19 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
* *
* @param device The device ID to associate with this buffer pool. * @param device The device ID to associate with this buffer pool.
*/ */
explicit ggml_cann_pool_leg(int device) : device(device) {} explicit ggml_cann_pool_buf(int device) : device(device) {
disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
}
/** /**
* @brief Destructor to free all buffers in the pool. * @brief Destructor to free all buffers in the pool.
*/ */
~ggml_cann_pool_leg() { ~ggml_cann_pool_buf() {
ggml_cann_set_device(device); ggml_cann_set_device(device);
for (int i = 0; i < MAX_BUFFERS; ++i) { for (int i = 0; i < MAX_BUFFERS; ++i) {
ggml_cann_buffer& b = buffer_pool[i]; ggml_cann_buffer& b = buffer_pool[i];
if (b.ptr != nullptr) { if (b.ptr != nullptr) {
ACL_CHECK(aclrtFree(b.ptr)); aclrtFree(b.ptr);
pool_size -= b.size; pool_size -= b.size;
} }
} }
@ -212,65 +435,95 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
* @return A pointer to the allocated buffer. * @return A pointer to the allocated buffer.
*/ */
void* alloc(size_t size, size_t* actual_size) override { void* alloc(size_t size, size_t* actual_size) override {
const size_t alignment = 128;
size = GGML_PAD(size, alignment); size = GGML_PAD(size, alignment);
if (size == 0) { if (size == 0) {
size = alignment; size = alignment;
} }
#ifdef DEBUG_CANN_MALLOC
int nnz = 0; void* ptr = nullptr;
size_t max_size = 0; auto now = std::chrono::steady_clock::now();
#endif
size_t best_diff = 1ull << 36; int i = 0;
int ibest = -1; for (; i < MAX_BUFFERS; ++i) {
for (int i = 0; i < MAX_BUFFERS; ++i) {
ggml_cann_buffer& b = buffer_pool[i]; ggml_cann_buffer& b = buffer_pool[i];
if (b.ptr != nullptr) { if (b.ptr == nullptr) {
#ifdef DEBUG_CANN_MALLOC break;
++nnz; }
if (b.size > max_size) max_size = b.size; if (b.used) {
#endif continue;
}
if (b.size >= size) { if (b.size >= size) {
size_t diff = b.size - size; // reuse the buffer if the size is enough
if (diff < best_diff) { const size_t margin = b.size - size;
best_diff = diff; if (margin <= max_reuse_margin) {
ibest = i;
if (!best_diff) {
void* ptr = b.ptr;
*actual_size = b.size; *actual_size = b.size;
b.ptr = nullptr; b.used = true;
b.size = 0; ptr = b.ptr;
return ptr;
}
}
}
}
}
if (ibest >= 0) {
ggml_cann_buffer& b = buffer_pool[ibest];
void* ptr = b.ptr;
*actual_size = b.size;
b.ptr = nullptr;
b.size = 0;
return ptr;
}
void* ptr;
ggml_cann_set_device(device);
ACL_CHECK(
aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
*actual_size = size;
pool_size += size;
#ifdef DEBUG_CANN_MALLOC #ifdef DEBUG_CANN_MALLOC
GGML_LOG_INFO( GGML_LOG_INFO(
"%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, " "cann pool[%d]: reused %p, "
"requested %u MB\n", "pool_size = %5u MB, "
__func__, device, nnz, (uint32_t)(max_size / 1024 / 1024), "size = %5u MB, "
(uint32_t)(pool_size / 1024 / 1024), "margin = %5u MB\n",
(uint32_t)(size / 1024 / 1024)); device, b.ptr,
(uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
(uint32_t)(GGML_PAD(size, 1048576) / 1048576),
(uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
#endif #endif
break;
}
}
bool should_clean = !disable_clean &&
b.size > min_free_margin &&
std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
if (should_clean) {
// free the buffer if the size is needed to be freed
ACL_CHECK(aclrtFree(b.ptr));
pool_size -= b.size;
#ifdef DEBUG_CANN_MALLOC
GGML_LOG_INFO(
"cann pool[%d]: clean %p, "
"pool_size = %5u MB, "
"size = %5u MB\n",
device, b.ptr,
(uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
(uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
#endif
b.ptr = nullptr;
}
}
if (ptr != nullptr) {
return ptr; return ptr;
} }
if (i < MAX_BUFFERS) {
// allocate a new buffer if no buffer can be reused
ggml_cann_buffer& b = buffer_pool[i];
ggml_cann_set_device(device);
ACL_CHECK(aclrtMalloc(&b.ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
pool_size += size;
*actual_size = size;
b.size = size;
b.used = true;
if (i >= MAX_BUFFERS - 8) {
GGML_LOG_WARN("cann pool[%d]: slots almost full\n", device);
}
#ifdef DEBUG_CANN_MALLOC
GGML_LOG_INFO(
"cann pool[%d]: allocate %p, "
"pool_size = %5u MB, "
"size = %5u MB\n",
device, b.ptr,
(uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
(uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
#endif
return b.ptr;
}
GGML_ABORT("cann pool[%d]: slots full\n", device);
}
/** /**
* @brief Free a buffer and return it to the pool. * @brief Free a buffer and return it to the pool.
* *
@ -280,16 +533,21 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
void free(void* ptr, size_t size) override { void free(void* ptr, size_t size) override {
for (int i = 0; i < MAX_BUFFERS; ++i) { for (int i = 0; i < MAX_BUFFERS; ++i) {
ggml_cann_buffer& b = buffer_pool[i]; ggml_cann_buffer& b = buffer_pool[i];
if (b.ptr == nullptr) { if (b.ptr != ptr) {
b.ptr = ptr; continue;
b.size = size; }
b.used = false;
b.last_used = std::chrono::steady_clock::now();
#ifdef DEBUG_CANN_MALLOC
GGML_LOG_INFO(
"cann pool[%d]: return %p, "
"pool_size = %5u MB\n",
device, b.ptr,
(uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
#endif
return; return;
} }
} GGML_ABORT("cann pool[%d]: slots full\n", device);
// memory should always buffered. these memory may still needed by
// tasks in stream.
// TODO, fix me.
GGML_ABORT("Cann buffer pool full, increase MAX_CANN_BUFFERS\n");
} }
}; };
@ -347,8 +605,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
* @param device The device ID to associate with this buffer pool. * @param device The device ID to associate with this buffer pool.
*/ */
explicit ggml_cann_pool_vmm(int device) explicit ggml_cann_pool_vmm(int device)
: device(device), : device(device) {
granularity(ggml_cann_info().devices[device].vmm_granularity) {
auto dev = ggml_cann_info().devices[device]; auto dev = ggml_cann_info().devices[device];
granularity = dev.vmm_granularity; granularity = dev.vmm_granularity;
max_size = dev.total_vram; max_size = dev.total_vram;
@ -471,8 +728,19 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
*/ */
std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device( std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
int device) { int device) {
// The pool type is chosen at run time from environment variables (only their
// presence is tested, not their value), in this order of precedence:
//   1. VMM pool, unless GGML_CANN_DISABLE_VMM_POOL is present or the device's
//      vmm flag in ggml_cann_info() is false.
//   2. Priority-queue buffer pool, if GGML_CANN_ENABLE_BUF_PRIO_POOL is present.
//   3. Plain buffer pool otherwise.
bool disable_vmm = (getenv("GGML_CANN_DISABLE_VMM_POOL") != nullptr);
if (!disable_vmm && ggml_cann_info().devices[device].vmm) {
GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device)); return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
} }
bool enable_buf_prio = (getenv("GGML_CANN_ENABLE_BUF_PRIO_POOL") != nullptr);
if (enable_buf_prio) {
GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf_prio(device));
}
// Fallback when neither of the pools above was selected.
GGML_LOG_INFO("%s: device %d use buffer pool\n", __func__, device);
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf(device));
}
// cann buffer // cann buffer
/** /**
@ -1020,8 +1288,11 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
ggml_cann_set_device(buft_ctx->device); ggml_cann_set_device(buft_ctx->device);
size = std::max(size, (size_t)1); const size_t alignment = 128;
size = GGML_PAD(size, alignment);
if (size == 0) {
size = alignment;
}
void* dev_ptr; void* dev_ptr;
aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST); aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
if (err != ACL_SUCCESS) { if (err != ACL_SUCCESS) {