whisper : Metal and ggml-alloc support (#1270)

* metal : init * whisper : factor out graph builds * whisper : allocate encoder and decoder using ggml-alloc * whisper : ggml-alloc is now supported * whisper : CoreML support ggml-alloc * build : fix ggml-alloc * ios : update submodule * extra : update sync-ggml.sh script to also sync ggml-alloc * ci : see if this is causing the crash * whisper : refactor ggml-alloc init * whisper.android : try to fix build * whisper : initial Metal version * ci : try to debug vmem issue * metal : decoder works on GPU! * metal : add multi-decoder support * ggml : fix ggml_nbytes (probably temp solution) * metal : run "cross" step on the GPU * whisper : remove ggml_repeat in the encoder * whisper : offload the Encoder to Metal * ggml : use simpler ggml_bytes() implementation * ggml-alloc : try to make CI happy by reducing vram to 128GB * whisper : add whisper_allocr to wrap ggml_allocr * whisper : factor out alloc init in a function * cmake : update to support Metal build * whisper : add <functional> header * objc : fix build (no Metal yet) * ios : add Metal support * swiftui : fix build * metal : speed-up KQ multiplication * metal : sync latest llama.cpp kernels * readme : add Metal info * ios : update submodule * coreml : add code to toggle Core ML config (CPU, ANE, GPU) * bench : fix timings by running a pre-heat * bench : start benching the decoder * whisper : add ggml_mul_mat_pad * bench : fix uninitialized vars * whisper : add comment for disabling mul-mat padding * whisper : add description of ggml_mul_mat_pad * whisper : clean-up ggml_mul_mat_pad * metal : remove the "concurrent" flag * bench : variable n_past * ios : update SPM package
2025-06-23 00:48:59 +00:00 · 2023-09-15 12:18:18 +03:00
parent 3fec2119e6
commit 93935980f8
18 changed files with 1855 additions and 1252 deletions
--- a/examples/bench/bench.cpp
+++ b/examples/bench/bench.cpp
@ -44,13 +44,13 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -t N,     --threads N   [%-7d] number of threads to use during computation\n", params.n_threads);
    fprintf(stderr, "  -m FNAME, --model FNAME [%-7s] model path\n",                                  params.model.c_str());
    fprintf(stderr, "  -w N,     --what N      [%-7d] what to benchmark:\n",                          params.what);
-    fprintf(stderr, "                           %-7s  0 - whisper encoder\n",                         "");
+    fprintf(stderr, "                           %-7s  0 - whisper\n",                                 "");
    fprintf(stderr, "                           %-7s  1 - memcpy\n",                                  "");
    fprintf(stderr, "                           %-7s  2 - ggml_mul_mat\n",                            "");
    fprintf(stderr, "\n");
 }

-int whisper_bench_encoder(const whisper_params & params) {
+int whisper_bench_full(const whisper_params & params) {
    // whisper init

    struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
@ -69,12 +69,49 @@ int whisper_bench_encoder(const whisper_params & params) {
        fprintf(stderr, "error: failed to set mel: %d\n", ret);
        return 3;
    }
-
+    // heat encoder
    if (int ret = whisper_encode(ctx, 0, params.n_threads) != 0) {
        fprintf(stderr, "error: failed to encode model: %d\n", ret);
        return 4;
    }

+    whisper_token tokens[512];
+    memset(tokens, 0, sizeof(tokens));
+
+    // prompt heat
+    if (int ret = whisper_decode(ctx, tokens, 256, 0, params.n_threads) != 0) {
+        fprintf(stderr, "error: failed to encode model: %d\n", ret);
+        return 4;
+    }
+
+    // text-generation heat
+    if (int ret = whisper_decode(ctx, tokens, 1, 256, params.n_threads) != 0) {
+        fprintf(stderr, "error: failed to encode model: %d\n", ret);
+        return 4;
+    }
+
+    whisper_reset_timings(ctx);
+
+    // actual run
+    if (int ret = whisper_encode(ctx, 0, params.n_threads) != 0) {
+        fprintf(stderr, "error: failed to encode model: %d\n", ret);
+        return 4;
+    }
+
+    for (int i = 0; i < 16; i++) {
+        if (int ret = whisper_decode(ctx, tokens, 256, 0, params.n_threads) != 0) {
+            fprintf(stderr, "error: failed to encode model: %d\n", ret);
+            return 4;
+        }
+    }
+
+    for (int i = 0; i < 256; i++) {
+        if (int ret = whisper_decode(ctx, tokens, 1, i, params.n_threads) != 0) {
+            fprintf(stderr, "error: failed to encode model: %d\n", ret);
+            return 4;
+        }
+    }
+
    whisper_print_timings(ctx);
    whisper_free(ctx);

@ -103,7 +140,7 @@ int main(int argc, char ** argv) {
    int ret = -1;

    switch (params.what) {
-        case 0: ret = whisper_bench_encoder(params);                break;
+        case 0: ret = whisper_bench_full(params);                break;
        case 1: ret = whisper_bench_memcpy(params.n_threads);       break;
        case 2: ret = whisper_bench_ggml_mul_mat(params.n_threads); break;
        default: fprintf(stderr, "error: unknown benchmark: %d\n", params.what); break;
--- a/examples/talk-llama/CMakeLists.txt
+++ b/examples/talk-llama/CMakeLists.txt
@ -7,7 +7,7 @@ if (WHISPER_SDL2)

    # TODO: this is temporary
    #       need to export ggml symbols for MSVC, but too lazy ..
-    add_executable(${TARGET} talk-llama.cpp llama.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../whisper.cpp)
+    add_executable(${TARGET} talk-llama.cpp llama.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../ggml-alloc.c ../../whisper.cpp)

    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
    target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
--- a/examples/whisper.android/app/src/main/jni/whisper/CMakeLists.txt
+++ b/examples/whisper.android/app/src/main/jni/whisper/CMakeLists.txt
@ -8,6 +8,7 @@ set(WHISPER_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../../../../../)
 set(
        SOURCE_FILES
        ${WHISPER_LIB_DIR}/ggml.c
+        ${WHISPER_LIB_DIR}/ggml-alloc.c
        ${WHISPER_LIB_DIR}/whisper.cpp
        ${CMAKE_SOURCE_DIR}/jni.c
 )
@ -20,7 +21,7 @@ function(build_library target_name)
        SHARED
        ${SOURCE_FILES}
    )
-    
+
    target_link_libraries(${target_name} ${LOG_LIB} android)

    if (${target_name} STREQUAL "whisper_v8fp16_va")
--- a/examples/whisper.objc/README.md
+++ b/examples/whisper.objc/README.md
@ -28,6 +28,8 @@ This can significantly improve the performance of the transcription:

 <img width="1072" alt="image" src="https://user-images.githubusercontent.com/1991296/208511239-8d7cdbd1-aa48-41b5-becd-ca288d53cc07.png">

+## Core ML
+
 If you want to enable Core ML support, you can add the `-DWHISPER_USE_COREML -DWHISPER_COREML_ALLOW_FALLBACK` compiler flag for `whisper.cpp` in Build Phases:

 <img width="1072" alt="image" src="https://github.com/ggerganov/whisper.cpp/assets/3001525/103e8f57-6eb6-490d-a60c-f6cf6c319324">
@ -35,3 +37,13 @@ If you want to enable Core ML support, you can add the `-DWHISPER_USE_COREML -DW
 Then follow the [`Core ML support` section of readme](../../README.md#core-ml-support) for convert the model.

 In this project, it also added `-O3 -DNDEBUG` to `Other C Flags`, but adding flags to app proj is not ideal in real world (applies to all C/C++ files), consider splitting xcodeproj in workspace in your own project.
+
+## Metal
+
+You can also enable Metal to make the inference run on the GPU of your device. This might or might not be more efficient
+compared to Core ML depending on the model and device that you use.
+
+To enable Metal, just add `-DGGML_USE_METAL` instead off the `-DWHISPER_USE_COREML` flag and you are ready.
+This will make both the Encoder and the Decoder run on the GPU.
+
+If you want to run the Encoder with Core ML and the Decoder with Metal then simply add both `-DWHISPER_USE_COREML -DGGML_USE_METAL` flags. That's all!
--- a/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
+++ b/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
@ -7,6 +7,9 @@
 	objects = {

 /* Begin PBXBuildFile section */
+		1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 184447182AB211A2007D6BFE /* ggml-alloc.c */; };
+		1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 1844471B2AB21655007D6BFE /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-framework Foundation -framework Metal -framework MetalKit -fno-objc-arc"; }; };
+		184447212AB21B43007D6BFE /* ggml-metal.metal in CopyFiles */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
 		18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C7A29052BDF00BD2A04 /* AppDelegate.m */; };
 		18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C7D29052BDF00BD2A04 /* SceneDelegate.m */; };
 		18627C8129052BDF00BD2A04 /* ViewController.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C8029052BDF00BD2A04 /* ViewController.m */; };
@ -14,7 +17,7 @@
 		18627C8629052BE000BD2A04 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 18627C8529052BE000BD2A04 /* Assets.xcassets */; };
 		18627C8929052BE000BD2A04 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 18627C8729052BE000BD2A04 /* LaunchScreen.storyboard */; };
 		18627C8C29052BE000BD2A04 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C8B29052BE000BD2A04 /* main.m */; };
-		18627C9429052C4900BD2A04 /* whisper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9329052C4900BD2A04 /* whisper.cpp */; settings = {COMPILER_FLAGS = "-DWHISPER_USE_COREML -DWHISPER_COREML_ALLOW_FALLBACK"; }; };
+		18627C9429052C4900BD2A04 /* whisper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9329052C4900BD2A04 /* whisper.cpp */; settings = {COMPILER_FLAGS = "-DWHISPER_USE_COREML"; }; };
 		18627C9629052C5800BD2A04 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9529052C5800BD2A04 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE"; }; };
 		18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; };
 		7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; };
@ -23,7 +26,24 @@
 		7FE3424F2A0C418A0015A058 /* ggml-base.en-encoder.mlmodelc in Resources */ = {isa = PBXBuildFile; fileRef = 7FE3424E2A0C418A0015A058 /* ggml-base.en-encoder.mlmodelc */; };
 /* End PBXBuildFile section */

+/* Begin PBXCopyFilesBuildPhase section */
+		184447202AB21B25007D6BFE /* CopyFiles */ = {
+			isa = PBXCopyFilesBuildPhase;
+			buildActionMask = 2147483647;
+			dstPath = "";
+			dstSubfolderSpec = 7;
+			files = (
+				184447212AB21B43007D6BFE /* ggml-metal.metal in CopyFiles */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXCopyFilesBuildPhase section */
+
 /* Begin PBXFileReference section */
+		184447182AB211A2007D6BFE /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../../ggml-alloc.c"; sourceTree = "<group>"; };
+		184447192AB211A2007D6BFE /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../../ggml-alloc.h"; sourceTree = "<group>"; };
+		1844471B2AB21655007D6BFE /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../../ggml-metal.m"; sourceTree = "<group>"; };
+		1844471D2AB2195F007D6BFE /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../../ggml-metal.metal"; sourceTree = "<group>"; };
 		18627C7629052BDF00BD2A04 /* whisper.objc.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = whisper.objc.app; sourceTree = BUILT_PRODUCTS_DIR; };
 		18627C7929052BDF00BD2A04 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
 		18627C7A29052BDF00BD2A04 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = "<group>"; };
@ -80,6 +100,10 @@
 		18627C7829052BDF00BD2A04 /* whisper.objc */ = {
 			isa = PBXGroup;
 			children = (
+				1844471D2AB2195F007D6BFE /* ggml-metal.metal */,
+				1844471B2AB21655007D6BFE /* ggml-metal.m */,
+				184447182AB211A2007D6BFE /* ggml-alloc.c */,
+				184447192AB211A2007D6BFE /* ggml-alloc.h */,
 				7FE3424E2A0C418A0015A058 /* ggml-base.en-encoder.mlmodelc */,
 				7FE342442A0C3FA20015A058 /* coreml */,
 				18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */,
@ -126,6 +150,7 @@
 				18627C7229052BDF00BD2A04 /* Sources */,
 				18627C7329052BDF00BD2A04 /* Frameworks */,
 				18627C7429052BDF00BD2A04 /* Resources */,
+				184447202AB21B25007D6BFE /* CopyFiles */,
 			);
 			buildRules = (
 			);
@ -194,8 +219,10 @@
 				18627C9629052C5800BD2A04 /* ggml.c in Sources */,
 				18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */,
 				7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */,
+				1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */,
 				18627C8C29052BE000BD2A04 /* main.m in Sources */,
 				18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */,
+				1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */,
 				7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
--- a/examples/whisper.swiftui/whisper.swiftui.xcodeproj/project.pbxproj
+++ b/examples/whisper.swiftui/whisper.swiftui.xcodeproj/project.pbxproj
@ -20,6 +20,7 @@
 		0AAC5DCC29539EB1003032C3 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DC929539EB0003032C3 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -Wno-shorten-64-to-32"; }; };
 		0AAC5DCE2953A05C003032C3 /* WhisperState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DCD2953A05C003032C3 /* WhisperState.swift */; };
 		0AAC5DD12953A394003032C3 /* LibWhisper.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DD02953A394003032C3 /* LibWhisper.swift */; };
+		18AED4812AB21F2B009D854F /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 18AED47F2AB21F2B009D854F /* ggml-alloc.c */; };
 /* End PBXBuildFile section */

 /* Begin PBXFileReference section */
@ -41,6 +42,8 @@
 		0AAC5DCA29539EB0003032C3 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ggml.h; sourceTree = "<group>"; };
 		0AAC5DCD2953A05C003032C3 /* WhisperState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WhisperState.swift; sourceTree = "<group>"; };
 		0AAC5DD02953A394003032C3 /* LibWhisper.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibWhisper.swift; sourceTree = "<group>"; };
+		18AED47F2AB21F2B009D854F /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "ggml-alloc.c"; sourceTree = "<group>"; };
+		18AED4802AB21F2B009D854F /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "ggml-alloc.h"; sourceTree = "<group>"; };
 /* End PBXFileReference section */

 /* Begin PBXFrameworksBuildPhase section */
@ -124,6 +127,8 @@
 		0AAC5DC529539E89003032C3 /* whisper.cpp */ = {
 			isa = PBXGroup;
 			children = (
+				18AED47F2AB21F2B009D854F /* ggml-alloc.c */,
+				18AED4802AB21F2B009D854F /* ggml-alloc.h */,
 				0AAC5DC929539EB0003032C3 /* ggml.c */,
 				0AAC5DCA29539EB0003032C3 /* ggml.h */,
 				0AAC5DC729539EB0003032C3 /* whisper.cpp */,
@ -242,6 +247,7 @@
 				0AA7514C2953B569001EE061 /* RiffWaveUtils.swift in Sources */,
 				0AAC5DCB29539EB1003032C3 /* whisper.cpp in Sources */,
 				0AA7514E2953D958001EE061 /* Recorder.swift in Sources */,
+				18AED4812AB21F2B009D854F /* ggml-alloc.c in Sources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
@ -369,7 +375,7 @@
 				CODE_SIGN_STYLE = Automatic;
 				CURRENT_PROJECT_VERSION = 1;
 				DEVELOPMENT_ASSET_PATHS = "\"whisper.swiftui.demo/Supporting files/Preview Content\"";
-				DEVELOPMENT_TEAM = 3TZ9BM962G;
+				DEVELOPMENT_TEAM = P8JZH34X63;
 				ENABLE_HARDENED_RUNTIME = YES;
 				ENABLE_PREVIEWS = YES;
 				GENERATE_INFOPLIST_FILE = YES;
@ -410,7 +416,7 @@
 				CODE_SIGN_STYLE = Automatic;
 				CURRENT_PROJECT_VERSION = 1;
 				DEVELOPMENT_ASSET_PATHS = "\"whisper.swiftui.demo/Supporting files/Preview Content\"";
-				DEVELOPMENT_TEAM = 3TZ9BM962G;
+				DEVELOPMENT_TEAM = P8JZH34X63;
 				ENABLE_HARDENED_RUNTIME = YES;
 				ENABLE_PREVIEWS = YES;
 				GENERATE_INFOPLIST_FILE = YES;