From 3fec2119e6b52d1381b02a0fbf281b1b34728c25 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 12 Sep 2023 13:54:04 +0300 Subject: [PATCH] whisper : fix bench regression + fix performance when using CPU BLAS (#1275) * whisper : fix bench regression * ggml : use sched_yield when using BLAS + add comment --- ggml.c | 14 +++++++++++--- whisper.cpp | 33 +++++++++++++++++++++++++-------- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/ggml.c b/ggml.c index 3f72379c..dcdebd24 100644 --- a/ggml.c +++ b/ggml.c @@ -17283,10 +17283,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } else { // wait for other threads to finish const int last = node_n; - do { - //sched_yield(); + while (true) { + // TODO: this sched_yield can have significant impact on the performance - either positive or negative + // depending on the workload and the operating system. + // since it is not clear what is the best approach, it should potentially become user-configurable + // ref: https://github.com/ggerganov/ggml/issues/291 +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + sched_yield(); +#endif + node_n = atomic_load(&state->shared->node_n); - } while (node_n == last); + if (node_n != last) break; + }; } // check if we should stop diff --git a/whisper.cpp b/whisper.cpp index 5c14b43e..f5a9a71f 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -118,6 +118,21 @@ static void byteswap_tensor(ggml_tensor * tensor) { #define WHISPER_USE_SCRATCH #define WHISPER_MAX_SCRATCH_BUFFERS 16 +// +// ggml helpers +// + +static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + // available whisper models enum e_model { MODEL_UNKNOWN, @@ -666,6 +681,7 @@ struct whisper_state { // memory buffers used by encode / decode contexts std::vector buf_compute; + std::vector buf_work; std::vector buf_scratch[WHISPER_MAX_SCRATCH_BUFFERS]; int buf_last = 0; @@ -1830,8 +1846,8 @@ static bool whisper_encode_internal( { struct ggml_cgraph gf = {}; - ggml_build_forward_expand (&gf, cur); - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + ggml_build_forward_expand(&gf, cur); + ggml_graph_compute_helper(wstate.buf_work, &gf, n_threads); //ggml_graph_print(&gf); } @@ -1916,7 +1932,7 @@ static bool whisper_encode_internal( ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcross, v)); } - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + ggml_graph_compute_helper(wstate.buf_work, &gf, n_threads); //ggml_graph_print(&gf); } @@ -2329,8 +2345,8 @@ static bool whisper_decode_internal( // run the computation { - ggml_build_forward_expand (&gf, logits); - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + ggml_build_forward_expand(&gf, logits); + ggml_graph_compute_helper(wstate.buf_work, &gf, n_threads); } // extract logits for all N tokens @@ -5225,7 +5241,8 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) { // b: N*N*sizeof(float) // c: N*N*sizeof(float) // when F16 is used, there is an extra work buffer of size N*N*sizeof(float) - std::vector buf(4llu*N_max*N_max*sizeof(float) + 4*512); + std::vector buf (3llu*N_max*N_max*sizeof(float) + 3*ggml_tensor_overhead()); + std::vector work(1llu*N_max*N_max*sizeof(float) + 1*ggml_tensor_overhead()); // put a bunch of random data in the buffer for (size_t i = 0; i < buf.size(); i++) buf[i] = i; @@ -5280,12 +5297,12 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) { double tsum = 0.0; // heat-up - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + ggml_graph_compute_helper(work, &gf, n_threads); for (int i = 0; i < n_max; ++i) { const int64_t t0 = ggml_time_us(); - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + ggml_graph_compute_helper(work, &gf, n_threads); const int64_t t1 = ggml_time_us();