Mirror of https://github.com/ggerganov/whisper.cpp.git (synced 2025-04-27 14:29:43 +00:00)
vulkan: Hybrid waitForFences/getFenceStatus to reduce fence latency (llama/12630)
There seems to be a bubble waking up from waitForFences, which costs a few percent of performance and also increases variance in performance. This change inserts an "almost_ready" fence when the graph is about 80% complete; we waitForFences on the almost_ready fence, then spin (with _mm_pause) waiting for the final fence to be signaled.
This commit is contained in:
parent 785437c253
commit 76231bda56
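For orientation before the diff: the snippet below is a condensed sketch of the hybrid wait pattern the patch introduces. It is a simplified, standalone illustration (the hybrid_wait function name and the device/fence parameters are made up for the example), not the exact ggml-vulkan code; the error handling mirrors the eNotReady check used in the patch.

#include <vulkan/vulkan.hpp>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

#if defined(__x86_64__) || defined(__i386__)
#  include <immintrin.h>
#  define YIELD() _mm_pause()
#else
#  define YIELD()
#endif

// Hybrid wait: sleep in waitForFences while most of the graph runs, then
// busy-poll the final fence with pause hints to avoid the wake-up bubble
// right at the end of the graph.
static void hybrid_wait(vk::Device device, vk::Fence almost_ready_fence, vk::Fence final_fence) {
    // Cheap, sleep-friendly wait until the graph is roughly 80% done.
    if (device.waitForFences({ almost_ready_fence }, true, UINT64_MAX) != vk::Result::eSuccess) {
        std::abort();
    }
    device.resetFences({ almost_ready_fence });

    // Spin (with pause hints) for the short remainder of the graph.
    vk::Result status;
    while ((status = device.getFenceStatus(final_fence)) != vk::Result::eSuccess) {
        if (status != vk::Result::eNotReady) {
            std::fprintf(stderr, "unexpected fence status\n");
            std::abort();
        }
        for (int i = 0; i < 100; ++i) {
            YIELD();
        }
    }
    device.resetFences({ final_fence });
}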
@@ -24,6 +24,28 @@
 #include <future>
 #include <thread>
 
+#if defined(_MSC_VER)
+# define NOMINMAX 1
+# include <windows.h>
+# define YIELD() YieldProcessor()
+#elif defined(__clang__) || defined(__GNUC__)
+# if defined(__x86_64__) ||defined(__i386__)
+# include <immintrin.h>
+# define YIELD() _mm_pause()
+# elif defined(__arm__) || defined(__aarch64__)
+# if defined(__clang__)
+# include <arm_acle.h>
+# define YIELD() __yield()
+# else
+# define YIELD() asm volatile("yield")
+# endif
+# endif
+#endif
+
+#if !defined(YIELD)
+#define YIELD()
+#endif
+
 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 
@@ -787,7 +809,8 @@ struct ggml_backend_vk_context {
     ggml_vk_garbage_collector gc;
     size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
     vk_buffer prealloc_x, prealloc_y, prealloc_split_k;
-    vk::Fence fence;
+    vk::Fence fence, almost_ready_fence;
+    bool almost_ready_fence_pending {};
 
     vk_buffer buffer_pool[MAX_VK_BUFFERS];
 
@@ -878,6 +901,39 @@ typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx
 
 static void ggml_backend_vk_free(ggml_backend_t backend);
 
+// Wait for ctx->fence to be signaled.
+static void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx) {
+    // Use waitForFences while most of the graph executes. Hopefully the CPU can sleep
+    // during this wait.
+    if (ctx->almost_ready_fence_pending) {
+        VK_CHECK(ctx->device->device.waitForFences({ ctx->almost_ready_fence }, true, UINT64_MAX), "almost_ready_fence");
+        ctx->device->device.resetFences({ ctx->almost_ready_fence });
+        ctx->almost_ready_fence_pending = false;
+    }
+
+    // Spin (w/pause) waiting for the graph to finish executing.
+    vk::Result result;
+    while ((result = ctx->device->device.getFenceStatus(ctx->fence)) != vk::Result::eSuccess) {
+        if (result != vk::Result::eNotReady) {
+            fprintf(stderr, "ggml_vulkan: error %s at %s:%d\n", to_string(result).c_str(), __FILE__, __LINE__);
+            exit(1);
+        }
+        for (uint32_t i = 0; i < 100; ++i) {
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+        }
+    }
+    ctx->device->device.resetFences({ ctx->fence });
+}
+
 // variables to track number of compiles in progress
 static uint32_t compile_count = 0;
 static std::mutex compile_count_mutex;
@@ -3355,6 +3411,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     ctx->prealloc_size_split_k = 0;
 
     ctx->fence = ctx->device->device.createFence({});
+    ctx->almost_ready_fence = ctx->device->device.createFence({});
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
     const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
@@ -7959,11 +8016,11 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     }
 }
 
-static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
+static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence, bool almost_ready);
 
 // Returns true if node has enqueued work into the queue, false otherwise
 // If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution.
-static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
+static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool almost_ready, bool submit){
     if (ggml_is_empty(node) || !node->buffer) {
         return false;
     }
@@ -8335,7 +8392,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 
         ctx->compute_ctx.reset();
 
-        bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
+        bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false, almost_ready);
         if (!ok) {
             if (node->op == GGML_OP_UNARY) {
                 std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
@@ -8349,7 +8406,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     return true;
 }
 
-static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true, bool almost_ready = false) {
     ggml_backend_buffer * buf = nullptr;
 
     switch (tensor->op) {
@@ -8452,12 +8509,15 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
         memcpy(cpy.dst, cpy.src, cpy.n);
     }
 
-    ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
+    if (almost_ready && !ctx->almost_ready_fence_pending && !use_fence) {
+        ggml_vk_submit(subctx, ctx->almost_ready_fence);
+        ctx->almost_ready_fence_pending = true;
+    } else {
+        ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
+    }
 
     if (use_fence) {
-        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
-
-        ctx->device->device.resetFences({ ctx->fence });
+        ggml_vk_wait_for_fence(ctx);
     }
 #ifdef GGML_VULKAN_CHECK_RESULTS
     ggml_vk_check_results_1(tensor);
@@ -8543,6 +8603,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
     ctx->gc.events.clear();
 
     ctx->device->device.destroyFence(ctx->fence);
+    ctx->device->device.destroyFence(ctx->almost_ready_fence);
 }
 
 static int ggml_vk_get_device_count() {
@@ -8889,8 +8950,7 @@ static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
     }
 
     ggml_vk_submit(transfer_ctx, ctx->fence);
-    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences");
-    ctx->device->device.resetFences({ ctx->fence });
+    ggml_vk_wait_for_fence(ctx);
 
     for (auto& cpy : transfer_ctx->out_memcpys) {
         memcpy(cpy.dst, cpy.src, cpy.n);
@@ -8909,7 +8969,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
 
     uint64_t total_mat_mul_bytes = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
+        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false, false);
         if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
             total_mat_mul_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
         }
@@ -8951,11 +9011,14 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
             mul_mat_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
         }
 
+        // Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
+        bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5;
         bool submit = (submitted_nodes >= nodes_per_submit) ||
                       (mul_mat_bytes >= mul_mat_bytes_per_submit) ||
-                      (i == last_node);
+                      (i == last_node) ||
+                      (almost_ready && !ctx->almost_ready_fence_pending);
 
-        bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
+        bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit);
 
         if (enqueued) {
             ++submitted_nodes;
@@ -8967,7 +9030,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
 #endif
         }
 
-        if (submit) {
+        if (submit && enqueued) {
             first_node_in_batch = true;
             submitted_nodes = 0;
             mul_mat_bytes = 0;
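A note on the threshold in the hunks above: almost_ready is computed as (cgraph->n_nodes - i) < cgraph->n_nodes / 5, i.e. it becomes true once fewer than 20% of the graph's nodes remain. As an illustrative example (the numbers are made up), with a 300-node graph the cutoff is 300 / 5 = 60, so almost_ready first becomes true at node index 241; the submit at that point attaches the almost_ready fence, and ggml_vk_wait_for_fence then sleeps on that fence and only spins for roughly the final 20% of the graph.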