diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 37d41179..f312a620 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -1783,7 +1783,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb, GGML_MAX_DIMS + 1); aclTensor* acl_scale_tensor = ggml_cann_create_tensor( - src0->data, ACL_FLOAT16, sizeof(float16_t), scale_ne, scale_nb, + src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb, GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset); aclTensor* dequant_tensor = ggml_cann_create_tensor( dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float_t), diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index cec36b36..8f8acaf9 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include "ggml-impl.h" #include "ggml-backend-impl.h" @@ -119,9 +121,10 @@ static ggml_cann_device_info ggml_cann_init() { prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE; prop.location.id = id; prop.reserve = 0; - ACL_CHECK(aclrtMemGetAllocationGranularity( + err = aclrtMemGetAllocationGranularity( &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED, - &info.devices[id].vmm_granularity)); + &info.devices[id].vmm_granularity); + info.devices[id].vmm = err == ACL_SUCCESS; size_t free, total; ggml_backend_cann_get_device_memory(id, &free, &total); @@ -148,11 +151,222 @@ const ggml_cann_device_info& ggml_cann_info() { //#define DEBUG_CANN_MALLOC /** - * @brief A pool of CANN buffers(legacy). + * @brief A pool of CANN buffers(priority segment buffer). * * This class manages a pool of CANN buffers for a specific device. */ -struct ggml_cann_pool_leg : public ggml_cann_pool { +struct ggml_cann_pool_buf_prio : public ggml_cann_pool { + /** + * @brief The maximum reuse margin for a buffer. + */ + static const size_t max_reuse_margin = 1ull << 22; // 4MB + + /** + * @brief The minimum free margin for a buffer. + */ + static const size_t min_free_margin = 1ull << 20; // 1MB + + /** + * @brief The alignment for buffer allocation. + */ + static const size_t alignment = 128; + + /** + * @brief The device ID associated with this buffer pool. + */ + int device; + + /** + * @brief Whether to disable clean during buffer allocation. + */ + bool disable_clean = false; + + /** + * @brief Structure representing a CANN buffer. + */ + struct ggml_cann_buffer { + void* ptr = nullptr; ///< Pointer to the buffer. + size_t size = 0; ///< Size of the buffer. + std::chrono::steady_clock::time_point last_used; ///< Last used time. + + bool operator>(const ggml_cann_buffer& other) const { + return size > other.size; + } + }; + + /** + * @brief Array of CANN buffers in the pool. + */ + std::unordered_map buffer_pool; + std::priority_queue, + std::greater<>> free_buffers ; + + /** + * @brief Total size of all buffers in the pool. + */ + size_t pool_size = 0; + + /** + * @brief Constructor to initialize the buffer pool for a specific device. + * + * @param device The device ID to associate with this buffer pool. + */ + explicit ggml_cann_pool_buf_prio(int device) : device(device) { + disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr; + } + + /** + * @brief Destructor to free all buffers in the pool. + */ + ~ggml_cann_pool_buf_prio() { + ggml_cann_set_device(device); + for (auto& [b_ptr, b_size] : buffer_pool) { + aclrtFree(b_ptr); + pool_size -= b_size; + } + buffer_pool.clear(); + GGML_ASSERT(pool_size == 0); + } + + /** + * @brief Allocate a buffer of the given size. + * + * @param size The size of the buffer to allocate. + * @param actual_size A pointer to a variable to receive the actual size of + * the allocated buffer. + * @return A pointer to the allocated buffer. + */ + void* alloc(size_t size, size_t* actual_size) override { + size = GGML_PAD(size, alignment); + if (size == 0) { + size = alignment; + } + + void* ptr = nullptr; + auto now = std::chrono::steady_clock::now(); + + std::vector free_buffers_rest; + free_buffers_rest.reserve(free_buffers.size()); + while (!free_buffers.empty()) { + auto b = free_buffers.top(); + free_buffers.pop(); + + if (b.size >= size) { + // reuse the buffer if the size is enough + const size_t margin = b.size - size; + if (margin <= max_reuse_margin) { + *actual_size = b.size; + ptr = b.ptr; + #ifdef DEBUG_CANN_MALLOC + GGML_LOG_INFO( + "cann pool[%d]: reused %p, " + "pool_size = %5u MB, " + "size = %5u MB, " + "margin = %5u MB\n", + device, b.ptr, + (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576), + (uint32_t)(GGML_PAD(size, 1048576) / 1048576), + (uint32_t)(GGML_PAD(margin, 1048576) / 1048576)); + #endif + break; + } + } + + bool should_clean = !disable_clean && + b.size > min_free_margin && + std::chrono::duration_cast(now - b.last_used).count() > 100; + if (should_clean) { + // free the buffer if the size is needed to be freed + ACL_CHECK(aclrtFree(b.ptr)); + pool_size -= b.size; + buffer_pool.erase(b.ptr); + #ifdef DEBUG_CANN_MALLOC + GGML_LOG_INFO( + "cann pool[%d]: clean %p, " + "pool_size = %5u MB, " + "size = %5u MB\n", + device, b.ptr, + (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576), + (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576)); + #endif + continue; + } + free_buffers_rest.push_back(b); + } + for (ggml_cann_buffer &b : free_buffers_rest) { + free_buffers.push(std::move(b)); + } + + #ifdef DEBUG_CANN_MALLOC + GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576)); + #endif + if (ptr != nullptr) { + return ptr; + } + + // allocate a new buffer if no buffer can be reused + ggml_cann_set_device(device); + ACL_CHECK(aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST)); + *actual_size = size; + pool_size += size; + #ifdef DEBUG_CANN_MALLOC + GGML_LOG_INFO( + "cann pool[%d]: allocate %p, " + "pool_size = %5u MB, " + "size = %5u MB\n", + device, ptr, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576), + (uint32_t)(GGML_PAD(size, 1048576) / 1048576)); + #endif + buffer_pool.emplace(ptr, size); + return ptr; + } + + /** + * @brief Free a buffer and return it to the pool. + * + * @param ptr Pointer to the buffer to free. + * @param size Size of the buffer to free. + */ + void free(void* ptr, size_t size) override { + auto it = buffer_pool.find(ptr); + if (it == buffer_pool.end()) { + GGML_ABORT("cann pool[%d]: buffer %p not found in pool\n", device, ptr); + } + + auto now = std::chrono::steady_clock::now(); + free_buffers.emplace(ggml_cann_buffer{ptr, it->second, now}); + #ifdef DEBUG_CANN_MALLOC + GGML_LOG_INFO( + "cann pool[%d]: return %p, " + "pool_size = %5u MB\n", + device, ptr, + (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576)); + #endif + } + }; + +/** + * @brief A pool of CANN buffers(segment buffer). + * + * This class manages a pool of CANN buffers for a specific device. + */ +struct ggml_cann_pool_buf : public ggml_cann_pool { + /** + * @brief The maximum reuse margin for a buffer. + */ + static const size_t max_reuse_margin = 1ull << 22; // 4MB + + /** + * @brief The minimum free margin for a buffer. + */ + static const size_t min_free_margin = 1ull << 20; // 1MB + + /** + * @brief The alignment for buffer allocation. + */ + static const size_t alignment = 128; + /** * @brief The maximum number of buffers in the pool. */ @@ -163,12 +377,19 @@ struct ggml_cann_pool_leg : public ggml_cann_pool { */ int device; + /** + * @brief Whether to disable clean during buffer allocation. + */ + bool disable_clean = false; + /** * @brief Structure representing a CANN buffer. */ struct ggml_cann_buffer { void* ptr = nullptr; ///< Pointer to the buffer memory. size_t size = 0; ///< Size of the buffer. + bool used = false; ///< Whether the buffer is currently in use. + std::chrono::steady_clock::time_point last_used; ///< Last used time. }; /** @@ -186,17 +407,19 @@ struct ggml_cann_pool_leg : public ggml_cann_pool { * * @param device The device ID to associate with this buffer pool. */ - explicit ggml_cann_pool_leg(int device) : device(device) {} + explicit ggml_cann_pool_buf(int device) : device(device) { + disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr; + } /** * @brief Destructor to free all buffers in the pool. */ - ~ggml_cann_pool_leg() { + ~ggml_cann_pool_buf() { ggml_cann_set_device(device); for (int i = 0; i < MAX_BUFFERS; ++i) { ggml_cann_buffer& b = buffer_pool[i]; if (b.ptr != nullptr) { - ACL_CHECK(aclrtFree(b.ptr)); + aclrtFree(b.ptr); pool_size -= b.size; } } @@ -212,63 +435,93 @@ struct ggml_cann_pool_leg : public ggml_cann_pool { * @return A pointer to the allocated buffer. */ void* alloc(size_t size, size_t* actual_size) override { - const size_t alignment = 128; size = GGML_PAD(size, alignment); if (size == 0) { size = alignment; } -#ifdef DEBUG_CANN_MALLOC - int nnz = 0; - size_t max_size = 0; -#endif - size_t best_diff = 1ull << 36; - int ibest = -1; - for (int i = 0; i < MAX_BUFFERS; ++i) { + + void* ptr = nullptr; + auto now = std::chrono::steady_clock::now(); + + int i = 0; + for (; i < MAX_BUFFERS; ++i) { ggml_cann_buffer& b = buffer_pool[i]; - if (b.ptr != nullptr) { + if (b.ptr == nullptr) { + break; + } + if (b.used) { + continue; + } + if (b.size >= size) { + // reuse the buffer if the size is enough + const size_t margin = b.size - size; + if (margin <= max_reuse_margin) { + *actual_size = b.size; + b.used = true; + ptr = b.ptr; #ifdef DEBUG_CANN_MALLOC - ++nnz; - if (b.size > max_size) max_size = b.size; + GGML_LOG_INFO( + "cann pool[%d]: reused %p, " + "pool_size = %5u MB, " + "size = %5u MB, " + "margin = %5u MB\n", + device, b.ptr, + (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576), + (uint32_t)(GGML_PAD(size, 1048576) / 1048576), + (uint32_t)(GGML_PAD(margin, 1048576) / 1048576)); #endif - if (b.size >= size) { - size_t diff = b.size - size; - if (diff < best_diff) { - best_diff = diff; - ibest = i; - if (!best_diff) { - void* ptr = b.ptr; - *actual_size = b.size; - b.ptr = nullptr; - b.size = 0; - return ptr; - } - } + break; } } + + bool should_clean = !disable_clean && + b.size > min_free_margin && + std::chrono::duration_cast(now - b.last_used).count() > 100; + if (should_clean) { + // free the buffer if the size is needed to be freed + ACL_CHECK(aclrtFree(b.ptr)); + pool_size -= b.size; +#ifdef DEBUG_CANN_MALLOC + GGML_LOG_INFO( + "cann pool[%d]: clean %p, " + "pool_size = %5u MB, " + "size = %5u MB\n", + device, b.ptr, + (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576), + (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576)); +#endif + b.ptr = nullptr; + } } - if (ibest >= 0) { - ggml_cann_buffer& b = buffer_pool[ibest]; - void* ptr = b.ptr; - *actual_size = b.size; - b.ptr = nullptr; - b.size = 0; + if (ptr != nullptr) { return ptr; } - void* ptr; - ggml_cann_set_device(device); - ACL_CHECK( - aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST)); - *actual_size = size; - pool_size += size; + + if (i < MAX_BUFFERS) { + // allocate a new buffer if no buffer can be reused + ggml_cann_buffer& b = buffer_pool[i]; + ggml_cann_set_device(device); + ACL_CHECK(aclrtMalloc(&b.ptr, size, ACL_MEM_MALLOC_HUGE_FIRST)); + pool_size += size; + *actual_size = size; + b.size = size; + b.used = true; + if (i >= MAX_BUFFERS - 8) { + GGML_LOG_WARN("cann pool[%d]: slots almost full\n", device); + } #ifdef DEBUG_CANN_MALLOC - GGML_LOG_INFO( - "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, " - "requested %u MB\n", - __func__, device, nnz, (uint32_t)(max_size / 1024 / 1024), - (uint32_t)(pool_size / 1024 / 1024), - (uint32_t)(size / 1024 / 1024)); + GGML_LOG_INFO( + "cann pool[%d]: allocate %p, " + "pool_size = %5u MB, " + "size = %5u MB\n", + device, b.ptr, + (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576), + (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576)); #endif - return ptr; + return b.ptr; + } + + GGML_ABORT("cann pool[%d]: slots full\n", device); } /** @@ -280,16 +533,21 @@ struct ggml_cann_pool_leg : public ggml_cann_pool { void free(void* ptr, size_t size) override { for (int i = 0; i < MAX_BUFFERS; ++i) { ggml_cann_buffer& b = buffer_pool[i]; - if (b.ptr == nullptr) { - b.ptr = ptr; - b.size = size; - return; + if (b.ptr != ptr) { + continue; } + b.used = false; + b.last_used = std::chrono::steady_clock::now(); +#ifdef DEBUG_CANN_MALLOC + GGML_LOG_INFO( + "cann pool[%d]: return %p, " + "pool_size = %5u MB\n", + device, b.ptr, + (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576)); +#endif + return; } - // memory should always buffered. these memory may still needed by - // tasks in stream. - // TODO, fix me. - GGML_ABORT("Cann buffer pool full, increase MAX_CANN_BUFFERS\n"); + GGML_ABORT("cann pool[%d]: slots full\n", device); } }; @@ -347,8 +605,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { * @param device The device ID to associate with this buffer pool. */ explicit ggml_cann_pool_vmm(int device) - : device(device), - granularity(ggml_cann_info().devices[device].vmm_granularity) { + : device(device) { auto dev = ggml_cann_info().devices[device]; granularity = dev.vmm_granularity; max_size = dev.total_vram; @@ -471,7 +728,18 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { */ std::unique_ptr ggml_backend_cann_context::new_pool_for_device( int device) { - return std::unique_ptr(new ggml_cann_pool_vmm(device)); + bool disable_vmm = (getenv("GGML_CANN_DISABLE_VMM_POOL") != nullptr); + if (!disable_vmm && ggml_cann_info().devices[device].vmm) { + GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device); + return std::unique_ptr(new ggml_cann_pool_vmm(device)); + } + bool enable_buf_prio = (getenv("GGML_CANN_ENABLE_BUF_PRIO_POOL") != nullptr); + if (enable_buf_prio) { + GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device); + return std::unique_ptr(new ggml_cann_pool_buf_prio(device)); + } + GGML_LOG_INFO("%s: device %d use buffer pool\n", __func__, device); + return std::unique_ptr(new ggml_cann_pool_buf(device)); } // cann buffer @@ -1020,8 +1288,11 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, ggml_cann_set_device(buft_ctx->device); - size = std::max(size, (size_t)1); - + const size_t alignment = 128; + size = GGML_PAD(size, alignment); + if (size == 0) { + size = alignment; + } void* dev_ptr; aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST); if (err != ACL_SUCCESS) {