From 339a1cba5d4988e973190af8c814873fa3fb5c63 Mon Sep 17 00:00:00 2001
From: Diego Devesa
Date: Thu, 27 Feb 2025 13:35:07 +0100
Subject: [PATCH] whisper : support GGML_BACKEND_DL (#2843)

* whisper : support GGML_BACKEND_DL

* fix DTW crash

* whisper.objc : fix build - add ggml-cpp.h

---------

Co-authored-by: Georgi Gerganov
---
 examples/bench/bench.cpp                       |   4 +-
 .../whisper.objc.xcodeproj/project.pbxproj     |   2 +
 ggml/src/CMakeLists.txt                        |   3 +
 src/whisper.cpp                                | 174 ++++++++++++------
 4 files changed, 129 insertions(+), 54 deletions(-)

diff --git a/examples/bench/bench.cpp b/examples/bench/bench.cpp
index d19c8ac5..54f73110 100644
--- a/examples/bench/bench.cpp
+++ b/examples/bench/bench.cpp
@@ -50,11 +50,11 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
     fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
     fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what);
-    fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
-    fprintf(stderr, " -fa, --flash-attn [%-7s] enable flash attention\n", params.flash_attn ? "true" : "false");
     fprintf(stderr, " %-7s 0 - whisper\n", "");
     fprintf(stderr, " %-7s 1 - memcpy\n", "");
     fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", "");
+    fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
+    fprintf(stderr, " -fa, --flash-attn [%-7s] enable flash attention\n", params.flash_attn ? "true" : "false");
     fprintf(stderr, "\n");
 }

diff --git a/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj b/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
index d58223ef..855f01de 100644
--- a/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
+++ b/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
@@ -81,6 +81,7 @@
 		18ABE1572AF556340044A204 /* ggml-backend.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.cpp; fileEncoding = 4; name = "ggml-backend.cpp"; path = "../../../ggml/src/ggml-backend.cpp"; sourceTree = "<group>"; };
 		18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; };
 		18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; };
+		18B07DCB2D70411100B3B87C /* ggml-cpp.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpp.h"; path = "../../../ggml/include/ggml-cpp.h"; sourceTree = "<group>"; };
 		18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu.c"; sourceTree = "<group>"; };
 		18E864AA2CE73C580094B8B3 /* ggml-cpu.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu.h"; path = "../../../ggml/include/ggml-cpu.h"; sourceTree = "<group>"; };
 		18F8C0BA2CEDF4DC00CAD607 /* ggml-threading.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-threading.h"; path = "../../../ggml/src/ggml-threading.h"; sourceTree = "<group>"; };
@@ -135,6 +136,7 @@
 		18627C7829052BDF00BD2A04 /* whisper.objc */ = {
 			isa = PBXGroup;
 			children = (
+				18B07DCB2D70411100B3B87C /* ggml-cpp.h */,
 				433188B92D3A18A400E3FE79 /* gguf.h */,
 				433188B72D3A187C00E3FE79 /* gguf.cpp */,
 				18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */,

diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 0002ac18..fcb354e1 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -226,6 +226,9 @@ add_library(ggml-base
             gguf.cpp)

 target_include_directories(ggml-base PRIVATE .)
+if (GGML_BACKEND_DL)
+    target_compile_definitions(ggml-base PUBLIC GGML_BACKEND_DL)
+endif()

 add_library(ggml
             ggml-backend-reg.cpp)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index 447846c9..986a132c 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -1,8 +1,7 @@
 #include "whisper.h"

-#include "ggml-cpu.h"
-
 #include "ggml.h"
+#include "ggml-cpp.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"

@@ -19,19 +18,20 @@
 #include 
 #define _USE_MATH_DEFINES
 #include 
-#include 
+#include 
 #include 
+#include 
 #include 
 #include 
+#include 
 #include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
 #include 
-#include 
-#include 
-#include 
-#include 

 // dummy

@@ -149,21 +149,25 @@ static void whisper_log_callback_default(ggml_log_level level, const char * text
 static bool ggml_graph_compute_helper(
           struct ggml_cgraph * graph,
-       std::vector<uint8_t> & buf,
                          int   n_threads,
          ggml_abort_callback   abort_callback,
                         void * abort_callback_data) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
-    plan.abort_callback      = abort_callback;
-    plan.abort_callback_data = abort_callback_data;
+    ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };

-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
+    auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
+
+    auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
+    if (set_abort_callback_fn) {
+        set_abort_callback_fn(backend.get(), abort_callback, abort_callback_data);
     }

-    return ggml_graph_compute(graph, &plan);
+    auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+    if (ggml_backend_set_n_threads_fn) {
+        ggml_backend_set_n_threads_fn(backend.get(), n_threads);
+    }
+
+    return ggml_backend_graph_compute(backend.get(), graph) == GGML_STATUS_SUCCESS;
 }

 static bool ggml_graph_compute_helper(
@@ -187,6 +191,61 @@ static bool ggml_graph_compute_helper(
     return t;
 }

+static void whisper_load_backends() {
+#ifdef GGML_BACKEND_DL
+    static std::once_flag flag;
+    std::call_once(flag, []() {
+        ggml_backend_load_all();
+    });
+#endif
+}
+
+// TODO: move these functions to ggml-base with support for ggml-backend?
+
+static ggml_tensor * whisper_set_f32(struct ggml_tensor * t, float v) {
+    GGML_ASSERT(t->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(t));
+    size_t nels = ggml_nelements(t);
+    for (int64_t i = 0; i < nels; ++i) {
+        ((float *) t->data)[i] = v;
+    }
+    return t;
+}
+
+static ggml_tensor * whisper_set_i32(struct ggml_tensor * t, int32_t v) {
+    GGML_ASSERT(t->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_is_contiguous(t));
+    size_t nels = ggml_nelements(t);
+    for (int64_t i = 0; i < nels; ++i) {
+        ((int32_t *) t->data)[i] = v;
+    }
+    return t;
+}
+
+static float whisper_get_f32_nd(const struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+    GGML_ASSERT(t->type == GGML_TYPE_F32);
+    void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
+    return *(float *) data;
+}
+
+static void whisper_set_f32_nd(struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3, float v) {
+    GGML_ASSERT(t->type == GGML_TYPE_F32);
+    void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
+    *(float *) data = v;
+}
+
+static int32_t whisper_get_i32_nd(const struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+    GGML_ASSERT(t->type == GGML_TYPE_I32);
+    void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
+    return *(int32_t *) data;
+}
+
+static void whisper_set_i32_nd(struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3, int32_t v) {
+    GGML_ASSERT(t->type == GGML_TYPE_I32);
+    void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
+    *(int32_t *) data = v;
+}
+
 // faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
 // the idea is to represent the original matrix multiplication:
 //
@@ -1237,6 +1296,8 @@ static size_t aheads_masks_nbytes(struct whisper_aheads_masks & aheads_masks) {
 static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & params) {
     ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);

+    whisper_load_backends();
+
     ggml_backend_dev_t dev = nullptr;

     int cnt = 0;
@@ -1294,7 +1355,7 @@ static std::vector<ggml_backend_t> whisper_backend_init(const whisper_context_pa

     GGML_UNUSED(params);

-    result.push_back(ggml_backend_cpu_init());
+    result.push_back(ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr));

     return result;
 }
@@ -4206,22 +4267,28 @@ static int whisper_has_openvino(void) {
 const char * whisper_print_system_info(void) {
     static std::string s;

+    whisper_load_backends();
+
     s = "";
-    s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
-    s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
-    s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
-    s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
-    s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
-    s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
-    s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
-    s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-    s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
-    s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
-    s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
+    s += "WHISPER : ";
     s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
     s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";

+    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
+        auto * reg = ggml_backend_reg_get(i);
+        auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
+        if (get_features_fn) {
+            ggml_backend_feature * features = get_features_fn(reg);
+            s += ggml_backend_reg_name(reg);
+            s += " : ";
+            for (; features->name; features++) {
+                s += features->name;
+                s += " = ";
+                s += features->value;
+                s += " | ";
+            }
+        }
+    }
     return s.c_str();
 }

@@ -6653,6 +6720,8 @@ WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
 }

 WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
+    whisper_load_backends();
+
     static std::string s;
     s = "";
     char strbuf[256];
@@ -6672,7 +6741,6 @@
     // c: N*N*sizeof(float)
     // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
     std::vector<uint8_t> buf(3llu*N_max*N_max*sizeof(float) + 3*ggml_tensor_overhead() + ggml_graph_overhead());
-    std::vector<uint8_t> work;

     // put a bunch of random data in the buffer
     for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
@@ -6729,12 +6797,12 @@
         double tsum = 0.0;

         // heat-up
-        ggml_graph_compute_helper(gf, work, n_threads, nullptr, nullptr);
+        ggml_graph_compute_helper(gf, n_threads, nullptr, nullptr);

         for (int i = 0; i < n_max; ++i) {
             const int64_t t0 = ggml_time_us();

-            ggml_graph_compute_helper(gf, work, n_threads, nullptr, nullptr);
+            ggml_graph_compute_helper(gf, n_threads, nullptr, nullptr);

             const int64_t t1 = ggml_time_us();

@@ -7111,18 +7179,18 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
     struct ggml_tensor * cost = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, N + 1, M + 1);
     struct ggml_tensor * trace = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, N + 1, M + 1);

-    cost = ggml_set_f32(cost, INFINITY);
-    trace = ggml_set_f32(trace, -1);
-    ggml_set_f32_nd(cost, 0, 0, 0, 0, 0.0);
+    cost = whisper_set_f32(cost, INFINITY);
+    trace = whisper_set_i32(trace, -1);
+    whisper_set_f32_nd(cost, 0, 0, 0, 0, 0.0);

     // dtw
     // supposedly can be optmized by computing diagonals in parallel ?
     // Not sure it is worth it since x will be GENERATED_TOKENS*1500 size at most.
     for (int64_t j = 1; j < M + 1; ++j) {
         for (int64_t i = 1; i < N + 1; ++i) {
-            float c0 = ggml_get_f32_nd(cost, i - 1, j - 1, 0, 0);
-            float c1 = ggml_get_f32_nd(cost, i - 1, j, 0, 0);
-            float c2 = ggml_get_f32_nd(cost, i, j - 1, 0, 0);
+            float c0 = whisper_get_f32_nd(cost, i - 1, j - 1, 0, 0);
+            float c1 = whisper_get_f32_nd(cost, i - 1, j, 0, 0);
+            float c2 = whisper_get_f32_nd(cost, i, j - 1, 0, 0);

             float c;
             int32_t t;
@@ -7137,9 +7205,9 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
                 t = 2;
             }

-            c = ggml_get_f32_nd(x, i - 1, j - 1, 0, 0) + c;
-            ggml_set_f32_nd(cost, i, j, 0, 0, c);
-            ggml_set_i32_nd(trace, i, j, 0, 0, t);
+            c = whisper_get_f32_nd(x, i - 1, j - 1, 0, 0) + c;
+            whisper_set_f32_nd(cost, i, j, 0, 0, c);
+            whisper_set_i32_nd(trace, i, j, 0, 0, t);
         }
     }

@@ -7148,19 +7216,19 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
     struct ggml_tensor * bt = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, BT_MAX_ROWS, 2);

     // trace[0, :] = 2;
     for (int64_t i = 0; i < M + 1; ++i)
-        ggml_set_i32_nd(trace, 0, i, 0, 0, 2);
+        whisper_set_i32_nd(trace, 0, i, 0, 0, 2);

     //trace[:, 0] = 1;
     for (int64_t i = 0; i < N + 1; ++i)
-        ggml_set_i32_nd(trace, i, 0, 0, 0, 1);
+        whisper_set_i32_nd(trace, i, 0, 0, 0, 1);

     int bt_row_idx = BT_MAX_ROWS - 1;
     int64_t i = N;
     int64_t j = M;
     while (i > 0 || j > 0) {
-        ggml_set_i32_nd(bt, bt_row_idx, 0, 0, 0, i - 1);
-        ggml_set_i32_nd(bt, bt_row_idx, 1, 0, 0, j - 1);
+        whisper_set_i32_nd(bt, bt_row_idx, 0, 0, 0, i - 1);
+        whisper_set_i32_nd(bt, bt_row_idx, 1, 0, 0, j - 1);
         --bt_row_idx;

-        int32_t t = ggml_get_i32_nd(trace, i, j, 0, 0);
+        int32_t t = whisper_get_i32_nd(trace, i, j, 0, 0);
         if (t == 0) {
             --i;
             --j;
@@ -7181,8 +7249,8 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
     ggml_tensor * r = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, 2, result_n_cols);
     for (int64_t i = 0; i < 2; ++i) {
         for (int64_t j = 0; j < result_n_cols; ++j) {
-            int32_t v = ggml_get_i32_nd(bt, j+bt_row_idx+1, i, 0, 0);
-            ggml_set_i32_nd(r, i, j, 0, 0, v);
+            int32_t v = whisper_get_i32_nd(bt, j+bt_row_idx+1, i, 0, 0);
+            whisper_set_i32_nd(r, i, j, 0, 0, v);
         }
     }

@@ -7217,11 +7285,11 @@ static void median_filter(struct ggml_tensor * dst , const struct ggml_tensor *
                     idx = 2*(a->ne[2] - 1) - idx;
                 }

-                filter.push_back(ggml_get_f32_nd(a, i, j, idx, 0));
+                filter.push_back(whisper_get_f32_nd(a, i, j, idx, 0));
             }
             std::sort(filter.begin(), filter.end());
             const float v = filter[filter.size()/2];
-            ggml_set_f32_nd(dst, i, j, k, 0, v);
+            whisper_set_f32_nd(dst, i, j, k, 0, v);
             filter.clear();
         }
     }
@@ -7343,7 +7411,9 @@ static void whisper_exp_compute_token_level_timestamps_dtw(
     // Compute
     struct ggml_cgraph * gf = ggml_new_graph(gctx);
     ggml_build_forward_expand(gf, w);
-    ggml_graph_compute_with_ctx(gctx, gf, n_threads);
+
+    ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
+    ggml_backend_graph_compute(backend.get(), gf);

     ggml_tensor * alignment = dtw_and_backtrace(gctx, w);

@@ -7352,9 +7422,9 @@
     auto seg_i = state->result_all.begin() + i_segment;
     auto tok_i = seg_i->tokens.begin();
     for (int i = 0; i < alignment->ne[1]; ++i) {
-        int32_t v = ggml_get_i32_nd(alignment, 0, i, 0, 0);
+        int32_t v = whisper_get_i32_nd(alignment, 0, i, 0, 0);
         if (v != last_v) {
-            int32_t time_index = ggml_get_i32_nd(alignment, 1, i, 0, 0);
+            int32_t time_index = whisper_get_i32_nd(alignment, 1, i, 0, 0);
             int64_t timestamp = (time_index * 2) + seek; // Each index on DTW result = 20mS audio
             last_v = v;
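
Note: with GGML_BACKEND_DL, backend implementations are built as dynamic
libraries and registered at runtime instead of being linked into libggml,
so CPU-specific entry points such as ggml_backend_set_n_threads can no
longer be called directly and must be resolved through the backend
registry. Below is a minimal sketch of the initialization pattern this
patch relies on; every call in it appears in the diff above except
ggml_backend_name and ggml_backend_free (both part of the public
ggml-backend.h API), and the thread count of 4 is an arbitrary example
value. A build using it would be configured with something like
-DGGML_BACKEND_DL=ON.

    #include "ggml-backend.h"
    #include <cstdio>

    int main() {
        // with GGML_BACKEND_DL this scans for and loads the available
        // backend modules; in a statically linked build the backends
        // are already registered and this is effectively a no-op
        ggml_backend_load_all();

        // initialize whichever CPU backend was registered
        ggml_backend_t backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
        if (!backend) {
            fprintf(stderr, "no CPU backend available\n");
            return 1;
        }

        // resolve the CPU-specific n_threads setter through the registry,
        // since the symbol may live in a dynamically loaded module
        auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend));
        auto set_n_threads_fn = (ggml_backend_set_n_threads_t)
            ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
        if (set_n_threads_fn) {
            set_n_threads_fn(backend, 4);
        }

        printf("using backend: %s\n", ggml_backend_name(backend));
        ggml_backend_free(backend);
        return 0;
    }

This is the same lookup idiom the patch uses in ggml_graph_compute_helper
and whisper_print_system_info: optional backend features are discovered
via ggml_backend_reg_get_proc_address and skipped gracefully when the
loaded backend does not provide them.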