From 7fd8d9c220a638eaf54925117de9fdcaf5f0263b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 Nov 2024 19:09:07 +0200 Subject: [PATCH] whisper : adapt to new ggml (wip) --- .gitignore | 4 + Makefile | 261 +++----- Package.swift | 16 +- .../lib/src/main/jni/whisper/CMakeLists.txt | 7 +- .../whisper.objc.xcodeproj/project.pbxproj | 38 +- .../whisper.cpp.swift/LibWhisper.swift | 2 - ggml/ggml_vk_generate_shaders.py | 220 ------- src/ggml-cpu-impl.h | 614 ------------------ src/whisper.cpp | 5 +- 9 files changed, 160 insertions(+), 1007 deletions(-) delete mode 100644 ggml/ggml_vk_generate_shaders.py delete mode 100644 src/ggml-cpu-impl.h diff --git a/.gitignore b/.gitignore index ebb73586..c1e584db 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *.o *.a +*.d .cache/ .coreml/ .test/ @@ -19,6 +20,9 @@ build-*/ .swiftpm *.metallib +ggml-metal-embed.metal +ggml-metal-embed.metal.tmp + /main /stream /command diff --git a/Makefile b/Makefile index a3835dd1..8a6c0282 100644 --- a/Makefile +++ b/Makefile @@ -444,17 +444,17 @@ endif else MK_CFLAGS += -march=rv64gcv -mabi=lp64d MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d -endif +endif # RISCV ifndef GGML_NO_ACCELERATE # Mac OS - include Accelerate framework. # `-framework Accelerate` works both with Apple Silicon and Mac Intel ifeq ($(UNAME_S),Darwin) - MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS + MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64 MK_LDFLAGS += -framework Accelerate - OBJ_GGML += ggml/src/ggml-blas.o + OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o endif endif # GGML_NO_ACCELERATE @@ -464,29 +464,38 @@ ifndef GGML_NO_OPENMP MK_CXXFLAGS += -fopenmp endif # GGML_NO_OPENMP +ifdef WHISPER_COREML + MK_CXXFLAGS += -DWHISPER_USE_COREML + LDFLAGS += -framework Foundation -framework CoreML + +ifdef WHISPER_COREML_ALLOW_FALLBACK + MK_CXXFLAGS += -DWHISPER_COREML_ALLOW_FALLBACK +endif +endif # WHISPER_COREML + ifdef GGML_OPENBLAS MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas) MK_LDFLAGS += $(shell pkg-config --libs openblas) - OBJ_GGML += ggml/src/ggml-blas.o + OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o endif # GGML_OPENBLAS ifdef GGML_OPENBLAS64 MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64) MK_LDFLAGS += $(shell pkg-config --libs openblas64) - OBJ_GGML += ggml/src/ggml-blas.o + OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o endif # GGML_OPENBLAS64 ifdef GGML_BLIS MK_CPPFLAGS += -DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis MK_LDFLAGS += -lblis -L/usr/local/lib - OBJ_GGML += ggml/src/ggml-blas.o + OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o endif # GGML_BLIS ifdef GGML_RPC MK_CPPFLAGS += -DGGML_USE_RPC - OBJ_GGML += ggml/src/ggml-rpc.o + OBJ_GGML += ggml/src/ggml-rpc/ggml-rpc.o endif # GGML_RPC OBJ_CUDA_TMPL = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu)) @@ -513,7 +522,7 @@ ifdef GGML_CUDA MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib MK_NVCCFLAGS += -use_fast_math - OBJ_GGML += ggml/src/ggml-cuda.o + OBJ_GGML += ggml/src/ggml-cuda/ggml-cuda.o OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) OBJ_GGML += $(OBJ_CUDA_TMPL) ifdef WHISPER_FATAL_WARNINGS @@ -615,11 +624,11 @@ ggml/src/ggml-cuda/%.o: \ ggml/src/ggml-cuda/common.cuh $(NVCC_COMPILE) -ggml/src/ggml-cuda.o: \ - ggml/src/ggml-cuda.cu \ +ggml/src/ggml-cuda/ggml-cuda.o: \ + ggml/src/ggml-cuda/ggml-cuda.cu \ + ggml/include/ggml-cuda.h \ ggml/include/ggml.h \ ggml/include/ggml-backend.h \ - ggml/include/ggml-cuda.h \ ggml/src/ggml-backend-impl.h \ ggml/src/ggml-common.h \ $(wildcard ggml/src/ggml-cuda/*.cuh) @@ -742,50 +751,43 @@ endif # GGML_HIPBLAS ifdef GGML_METAL MK_CPPFLAGS += -DGGML_USE_METAL MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit - OBJ_GGML += ggml/src/ggml-metal.o + OBJ_GGML += ggml/src/ggml-metal/ggml-metal.o ifdef GGML_METAL_NDEBUG MK_CPPFLAGS += -DGGML_METAL_NDEBUG endif ifdef GGML_METAL_EMBED_LIBRARY MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY - OBJ_GGML += ggml/src/ggml-metal-embed.o + OBJ_GGML += ggml/src/ggml-metal/ggml-metal-embed.o endif endif # GGML_METAL -ifdef WHISPER_COREML - MK_CXXFLAGS += -DWHISPER_USE_COREML - LDFLAGS += -framework Foundation -framework CoreML - -ifdef WHISPER_COREML_ALLOW_FALLBACK - MK_CXXFLAGS += -DWHISPER_COREML_ALLOW_FALLBACK -endif -endif - -# === - ifdef GGML_METAL -ggml/src/ggml-metal.o: \ - ggml/src/ggml-metal.m \ +ggml/src/ggml-metal/ggml-metal.o: \ + ggml/src/ggml-metal/ggml-metal.m \ + ggml/src/ggml-metal/ggml-metal-impl.h \ ggml/include/ggml-metal.h \ ggml/include/ggml.h $(CC) $(CFLAGS) -c $< -o $@ ifdef GGML_METAL_EMBED_LIBRARY -ggml/src/ggml-metal-embed.o: \ - ggml/src/ggml-metal.metal \ +ggml/src/ggml-metal/ggml-metal-embed.o: \ + ggml/src/ggml-metal/ggml-metal.metal \ + ggml/src/ggml-metal/ggml-metal-impl.h \ ggml/src/ggml-common.h @echo "Embedding Metal library" - @sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-embed.metal - $(eval TEMP_ASSEMBLY=$(shell mktemp)) - @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY) - @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY) - @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY) - @echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY) - @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY) - @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY) - @$(AS) $(TEMP_ASSEMBLY) -o $@ - @rm -f ${TEMP_ASSEMBLY} + @sed -e '/__embed_ggml-common.h__/r ggml/src/ggml-common.h' -e '/__embed_ggml-common.h__/d' < ggml/src/ggml-metal/ggml-metal.metal > ggml/src/ggml-metal/ggml-metal-embed.metal.tmp + @sed -e '/#include "ggml-metal-impl.h"/r ggml/src/ggml-metal/ggml-metal-impl.h' -e '/#include "ggml-metal-impl.h"/d' < ggml/src/ggml-metal/ggml-metal-embed.metal.tmp > ggml/src/ggml-metal/ggml-metal-embed.metal + $(eval TEMP_ASSEMBLY=$(shell mktemp -d)) + @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo ".incbin \"ggml/src/ggml-metal/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + $(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@ + @rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s + @rmdir ${TEMP_ASSEMBLY} endif endif # GGML_METAL @@ -801,11 +803,17 @@ endif OBJ_GGML += \ ggml/src/ggml.o \ - ggml/src/ggml-cpu.o \ + ggml/src/ggml-aarch64.o \ ggml/src/ggml-alloc.o \ ggml/src/ggml-backend.o \ + ggml/src/ggml-backend-reg.o \ + ggml/src/ggml-opt.o \ ggml/src/ggml-quants.o \ - ggml/src/ggml-aarch64.o + ggml/src/ggml-threading.o \ + ggml/src/ggml-cpu/ggml-cpu.o \ + ggml/src/ggml-cpu/ggml-cpu-cpp.o \ + ggml/src/ggml-cpu/ggml-cpu-aarch64.o \ + ggml/src/ggml-cpu/ggml-cpu-quants.o OBJ_WHISPER += \ src/whisper.o @@ -910,114 +918,64 @@ endif # Build libraries # -# ggml +LIB_GGML = libggml.so +LIB_GGML_S = libggml.a -ggml/src/ggml.o: \ - ggml/src/ggml.c \ - ggml/include/ggml.h - $(CC) $(CFLAGS) -c $< -o $@ +LIB_LLAMA = libllama.so +LIB_LLAMA_S = libllama.a -ggml/src/ggml-cpu.o: \ - ggml/src/ggml-cpu.c \ - ggml/include/ggml.h \ - ggml/src/ggml-common.h - $(CC) $(CFLAGS) -c $< -o $@ +LIB_COMMON = libcommon.so +LIB_COMMON_S = libcommon.a -ggml/src/ggml-alloc.o: \ - ggml/src/ggml-alloc.c \ - ggml/include/ggml.h \ - ggml/include/ggml-alloc.h - $(CC) $(CFLAGS) -c $< -o $@ +LIB_COMMON_SDL = libcommon-sdl.so +LIB_COMMON_SDL_S = libcommon-sdl.a -ggml/src/ggml-backend.o: \ - ggml/src/ggml-backend.cpp \ - ggml/include/ggml.h \ - ggml/include/ggml-backend.h - $(CXX) $(CXXFLAGS) -c $< -o $@ +# Targets +BUILD_TARGETS += $(LIB_GGML) $(LIB_GGML_S) $(LIB_LLAMA) $(LIB_LLAMA_S) $(LIB_COMMON) $(LIB_COMMON_S) -ggml/src/ggml-quants.o: \ - ggml/src/ggml-quants.c \ - ggml/include/ggml.h \ - ggml/src/ggml-quants.h \ - ggml/src/ggml-common.h - $(CC) $(CFLAGS) -c $< -o $@ +# Dependency files +DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d) -ggml/src/ggml-aarch64.o: \ - ggml/src/ggml-aarch64.c \ - ggml/include/ggml.h \ - ggml/src/ggml-aarch64.h \ - ggml/src/ggml-common.h - $(CC) $(CFLAGS) -c $< -o $@ +# Default target +all: $(BUILD_TARGETS) -ggml/src/ggml-blas.o: \ - ggml/src/ggml-blas.cpp \ - ggml/include/ggml-blas.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ifdef GGML_LLAMAFILE -ggml/src/sgemm.o: \ - ggml/src/sgemm.cpp \ - ggml/src/sgemm.h \ - ggml/include/ggml.h - $(CXX) $(CXXFLAGS) -c $< -o $@ -endif # GGML_LLAMAFILE - -ifdef GGML_RPC -ggml/src/ggml-rpc.o: \ - ggml/src/ggml-rpc.cpp \ - ggml/include/ggml-rpc.h - $(CXX) $(CXXFLAGS) -c $< -o $@ -endif # GGML_RPC - -$(LIB_GGML): \ - $(OBJ_GGML) - $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) - -$(LIB_GGML_S): \ - $(OBJ_GGML) - ar rcs $(LIB_GGML_S) $^ - -# whisper - -src/whisper.o: \ - src/whisper.cpp \ - include/whisper.h \ +# Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files +# g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml +ggml/src/ggml-cpu/ggml-cpu-cpp.o: \ + ggml/src/ggml-cpu/ggml-cpu.cpp \ + ggml/include/ggml-backend.h \ ggml/include/ggml.h \ ggml/include/ggml-alloc.h \ - ggml/include/ggml-backend.h \ - ggml/include/ggml-cuda.h \ - ggml/include/ggml-metal.h - $(CXX) $(CXXFLAGS) -c $< -o $@ + ggml/src/ggml-backend-impl.h \ + ggml/include/ggml-cpu.h \ + ggml/src/ggml-impl.h + $(CXX) $(CXXFLAGS) -c $< -o $@ -$(LIB_WHISPER): \ - $(OBJ_WHISPER) \ - $(LIB_GGML) +# Rules for building object files +ggml/%.o: ggml/%.c + $(CC) $(CFLAGS) -MMD -c $< -o $@ + +ggml/%.o: ggml/%.cpp + $(CXX) $(CXXFLAGS) -MMD -c $< -o $@ + +src/%.o: src/%.cpp + $(CXX) $(CXXFLAGS) -MMD -c $< -o $@ + +examples/%.o: examples/%.cpp + $(CXX) $(CXXFLAGS) -MMD -c $< -o $@ + +# Rules for building libraries +$(LIB_GGML): $(OBJ_GGML) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) -$(LIB_WHISPER_S): \ - $(OBJ_WHISPER) \ - $(OBJ_GGML) - ar rcs $(LIB_WHISPER_S) $^ +$(LIB_GGML_S): $(OBJ_GGML) + ar rcs $(LIB_GGML_S) $^ -# common - -examples/common.o: \ - examples/common.cpp \ - examples/common.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -examples/common-ggml.o: \ - examples/common-ggml.cpp \ - examples/common-ggml.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -$(LIB_COMMON): \ - $(OBJ_COMMON) +$(LIB_LLAMA): $(OBJ_LLAMA) $(LIB_GGML) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) -$(LIB_COMMON_S): \ - $(OBJ_COMMON) - ar rcs $(LIB_COMMON_S) $^ +$(LIB_LLAMA_S): $(OBJ_LLAMA) + ar rcs $(LIB_LLAMA_S) $^ # common-sdl @@ -1029,34 +987,21 @@ examples/common-sdl.o: \ examples/common-sdl.h $(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $@ -$(LIB_COMMON_SDL): \ - $(OBJ_SDL) - $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) $(LDFLAGS_SDL) +$(LIB_COMMON): $(OBJ_COMMON) $(LIB_LLAMA) $(LIB_GGML) + $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) -$(LIB_COMMON_SDL_S): \ - $(OBJ_SDL) - ar rcs $(LIB_COMMON_SDL_S) $^ +$(LIB_COMMON_S): $(OBJ_COMMON) + ar rcs $(LIB_COMMON_S) $^ +# Include dependency files +-include $(DEP_FILES) + +# Clean rule clean: - rm -vrf *.dot $(BUILD_TARGETS) $(TEST_TARGETS) - rm -rvf src/*.o - rm -rvf src/coreml/*.o - rm -rvf tests/*.o - rm -rvf examples/*.o - rm -rvf *.a - rm -rvf *.dll - rm -rvf *.so - rm -rvf *.dot - rm -rvf ggml/*.a - rm -rvf ggml/*.dll - rm -rvf ggml/*.so - rm -vrf ggml/src/*.o - rm -vrf ggml/src/ggml-metal-embed.metal - rm -vrf ggml/src/ggml-cuda/*.o - rm -vrf ggml/src/ggml-cuda/template-instances/*.o - rm -rvf $(BUILD_TARGETS) - rm -rvf $(TEST_TARGETS) - find examples -type f -name "*.o" -delete + rm -vrf $(BUILD_TARGETS) $(TEST_TARGETS) + rm -rvf *.a *.dll *.so *.dot + find ggml src tests examples -type f -name "*.o" -delete + find ggml src tests examples -type f -name "*.d" -delete # # Examples diff --git a/Package.swift b/Package.swift index 9d224940..367ce03b 100644 --- a/Package.swift +++ b/Package.swift @@ -28,7 +28,7 @@ let package = Package( "tests", "CMakeLists.txt", "Makefile", - "ggml/src/ggml-metal-embed.metal" + "ggml/src/ggml-metal/ggml-metal-embed.metal" ], sources: [ "ggml/src/ggml.c", @@ -36,16 +36,22 @@ let package = Package( "ggml/src/ggml-aarch64.c", "ggml/src/ggml-alloc.c", "ggml/src/ggml-backend.cpp", - "ggml/src/ggml-cpu.c", + "ggml/src/ggml-backend-reg.cpp", + "ggml/src/ggml-cpu/ggml-cpu.c", + "ggml/src/ggml-cpu/ggml-cpu.cpp", + "ggml/src/ggml-cpu/ggml-cpu-aarch64.c", + "ggml/src/ggml-cpu/ggml-cpu-quants.c", "ggml/src/ggml-quants.c", - "ggml/src/ggml-metal.m" + "ggml/src/ggml-threading.cpp", + "ggml/src/ggml-metal/ggml-metal.m" ], - resources: [.process("ggml/src/ggml-metal.metal")], + resources: [.process("ggml/src/ggml-metal/ggml-metal.metal")], publicHeadersPath: "spm-headers", cSettings: [ .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), - .define("GGML_USE_ACCELERATE"), .unsafeFlags(["-fno-objc-arc"]), + .headerSearchPath("ggml/src"), + .define("GGML_USE_ACCELERATE"), .define("GGML_USE_METAL") // NOTE: NEW_LAPACK will required iOS version 16.4+ // We should consider add this in the future when we drop support for iOS 14 diff --git a/examples/whisper.android/lib/src/main/jni/whisper/CMakeLists.txt b/examples/whisper.android/lib/src/main/jni/whisper/CMakeLists.txt index 2e44317c..1243d732 100644 --- a/examples/whisper.android/lib/src/main/jni/whisper/CMakeLists.txt +++ b/examples/whisper.android/lib/src/main/jni/whisper/CMakeLists.txt @@ -19,11 +19,16 @@ if (NOT GGML_HOME) SOURCE_FILES ${SOURCE_FILES} ${WHISPER_LIB_DIR}/ggml/src/ggml.c - ${WHISPER_LIB_DIR}/ggml/src/ggml-cpu.c ${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c ${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp + ${WHISPER_LIB_DIR}/ggml/src/ggml-backend-reg.cpp ${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c + ${WHISPER_LIB_DIR}/ggml/src/ggml-threading.cpp + ${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu.c + ${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu.cpp + ${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu-aarch64.c + ${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu-quants.c ) endif() diff --git a/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj b/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj index b1287f78..11447909 100644 --- a/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj +++ b/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj @@ -25,6 +25,11 @@ 18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.cpp */; }; 18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1592AF556340044A204 /* ggml-quants.c */; }; 18E864A92CE73C1E0094B8B3 /* ggml-cpu.c in Sources */ = {isa = PBXBuildFile; fileRef = 18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */; }; + 18F8C0BC2CEDF4DC00CAD607 /* ggml-threading.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0BB2CEDF4DC00CAD607 /* ggml-threading.cpp */; }; + 18F8C0BE2CEDF50700CAD607 /* ggml-cpu.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0BD2CEDF50700CAD607 /* ggml-cpu.cpp */; }; + 18F8C0C42CEDF52700CAD607 /* ggml-cpu-aarch64.c in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.c */; }; + 18F8C0C52CEDF52700CAD607 /* ggml-cpu-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C32CEDF52700CAD607 /* ggml-cpu-quants.c */; }; + 18F8C0C72CEDF7AB00CAD607 /* ggml-backend-reg.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */; }; 7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; }; 7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */; }; 7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE3424A2A0C3FA20015A058 /* whisper-decoder-impl.m */; }; @@ -50,8 +55,8 @@ 18133C7F2C64E342005CEAAC /* ggml-aarch64.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-aarch64.c"; path = "../../../ggml/src/ggml-aarch64.c"; sourceTree = ""; }; 184447182AB211A2007D6BFE /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../../ggml/src/ggml-alloc.c"; sourceTree = ""; }; 184447192AB211A2007D6BFE /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../../ggml/include/ggml-alloc.h"; sourceTree = ""; }; - 1844471B2AB21655007D6BFE /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../../ggml/src/ggml-metal.m"; sourceTree = ""; }; - 1844471D2AB2195F007D6BFE /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../../ggml/src/ggml-metal.metal"; sourceTree = ""; }; + 1844471B2AB21655007D6BFE /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../../ggml/src/ggml-metal/ggml-metal.m"; sourceTree = ""; }; + 1844471D2AB2195F007D6BFE /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../../ggml/src/ggml-metal/ggml-metal.metal"; sourceTree = ""; }; 18627C7629052BDF00BD2A04 /* whisper.objc.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = whisper.objc.app; sourceTree = BUILT_PRODUCTS_DIR; }; 18627C7929052BDF00BD2A04 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = ""; }; 18627C7A29052BDF00BD2A04 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = ""; }; @@ -77,8 +82,17 @@ 18ABE1572AF556340044A204 /* ggml-backend.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.cpp; fileEncoding = 4; name = "ggml-backend.cpp"; path = "../../../ggml/src/ggml-backend.cpp"; sourceTree = ""; }; 18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = ""; }; 18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = ""; }; - 18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu.c"; path = "../../../ggml/src/ggml-cpu.c"; sourceTree = ""; }; + 18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu.c"; sourceTree = ""; }; 18E864AA2CE73C580094B8B3 /* ggml-cpu.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu.h"; path = "../../../ggml/include/ggml-cpu.h"; sourceTree = ""; }; + 18F8C0BA2CEDF4DC00CAD607 /* ggml-threading.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-threading.h"; path = "../../../ggml/src/ggml-threading.h"; sourceTree = ""; }; + 18F8C0BB2CEDF4DC00CAD607 /* ggml-threading.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-threading.cpp"; path = "../../../ggml/src/ggml-threading.cpp"; sourceTree = ""; }; + 18F8C0BD2CEDF50700CAD607 /* ggml-cpu.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-cpu.cpp"; path = "../../../ggml/src/ggml-cpu/ggml-cpu.cpp"; sourceTree = ""; }; + 18F8C0BF2CEDF52700CAD607 /* ggml-cpu-aarch64.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-aarch64.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-aarch64.h"; sourceTree = ""; }; + 18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu-aarch64.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-aarch64.c"; sourceTree = ""; }; + 18F8C0C12CEDF52700CAD607 /* ggml-cpu-impl.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-impl.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-impl.h"; sourceTree = ""; }; + 18F8C0C22CEDF52700CAD607 /* ggml-cpu-quants.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-quants.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-quants.h"; sourceTree = ""; }; + 18F8C0C32CEDF52700CAD607 /* ggml-cpu-quants.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu-quants.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-quants.c"; sourceTree = ""; }; + 18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-backend-reg.cpp"; path = "../../../ggml/src/ggml-backend-reg.cpp"; sourceTree = ""; }; 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-encoder-impl.m"; sourceTree = ""; }; 7FE342462A0C3FA20015A058 /* whisper-encoder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "whisper-encoder.h"; sourceTree = ""; }; 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = "whisper-encoder.mm"; sourceTree = ""; }; @@ -118,6 +132,15 @@ 18627C7829052BDF00BD2A04 /* whisper.objc */ = { isa = PBXGroup; children = ( + 18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */, + 18F8C0BF2CEDF52700CAD607 /* ggml-cpu-aarch64.h */, + 18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.c */, + 18F8C0C12CEDF52700CAD607 /* ggml-cpu-impl.h */, + 18F8C0C22CEDF52700CAD607 /* ggml-cpu-quants.h */, + 18F8C0C32CEDF52700CAD607 /* ggml-cpu-quants.c */, + 18F8C0BD2CEDF50700CAD607 /* ggml-cpu.cpp */, + 18F8C0BA2CEDF4DC00CAD607 /* ggml-threading.h */, + 18F8C0BB2CEDF4DC00CAD607 /* ggml-threading.cpp */, 18E864AA2CE73C580094B8B3 /* ggml-cpu.h */, 18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */, 18133C7F2C64E342005CEAAC /* ggml-aarch64.c */, @@ -252,11 +275,16 @@ 18627C9629052C5800BD2A04 /* ggml.c in Sources */, 18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */, 7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */, + 18F8C0C72CEDF7AB00CAD607 /* ggml-backend-reg.cpp in Sources */, + 18F8C0BE2CEDF50700CAD607 /* ggml-cpu.cpp in Sources */, 1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */, + 18F8C0C42CEDF52700CAD607 /* ggml-cpu-aarch64.c in Sources */, + 18F8C0C52CEDF52700CAD607 /* ggml-cpu-quants.c in Sources */, 18E864A92CE73C1E0094B8B3 /* ggml-cpu.c in Sources */, 18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */, 18627C8C29052BE000BD2A04 /* main.m in Sources */, 18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */, + 18F8C0BC2CEDF4DC00CAD607 /* ggml-threading.cpp in Sources */, 1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */, 7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */, ); @@ -335,6 +363,7 @@ GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_VARIABLE = YES; + HEADER_SEARCH_PATHS = ""; IPHONEOS_DEPLOYMENT_TARGET = 16.0; MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; MTL_FAST_MATH = YES; @@ -388,6 +417,7 @@ GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_VARIABLE = YES; + HEADER_SEARCH_PATHS = ""; IPHONEOS_DEPLOYMENT_TARGET = 16.0; MTL_ENABLE_DEBUG_INFO = NO; MTL_FAST_MATH = YES; @@ -410,6 +440,7 @@ DEVELOPMENT_TEAM = P8JZH34X63; GCC_WARN_64_TO_32_BIT_CONVERSION = NO; GENERATE_INFOPLIST_FILE = YES; + HEADER_SEARCH_PATHS = ../../../ggml/src/; INFOPLIST_FILE = whisper.objc/Info.plist; INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen; @@ -439,6 +470,7 @@ DEVELOPMENT_TEAM = P8JZH34X63; GCC_WARN_64_TO_32_BIT_CONVERSION = NO; GENERATE_INFOPLIST_FILE = YES; + HEADER_SEARCH_PATHS = ../../../ggml/src/; INFOPLIST_FILE = whisper.objc/Info.plist; INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen; diff --git a/examples/whisper.swiftui/whisper.cpp.swift/LibWhisper.swift b/examples/whisper.swiftui/whisper.cpp.swift/LibWhisper.swift index 54cb3bd4..125be4f8 100644 --- a/examples/whisper.swiftui/whisper.cpp.swift/LibWhisper.swift +++ b/examples/whisper.swiftui/whisper.cpp.swift/LibWhisper.swift @@ -67,8 +67,6 @@ actor WhisperContext { private func systemInfo() -> String { var info = "" if (ggml_cpu_has_neon() != 0) { info += "NEON " } - if (ggml_cpu_has_metal() != 0) { info += "METAL " } - if (ggml_cpu_has_blas() != 0) { info += "BLAS " } return String(info.dropLast()) } diff --git a/ggml/ggml_vk_generate_shaders.py b/ggml/ggml_vk_generate_shaders.py deleted file mode 100644 index 38914eed..00000000 --- a/ggml/ggml_vk_generate_shaders.py +++ /dev/null @@ -1,220 +0,0 @@ -#!/usr/bin/env python - -import logging -import argparse -import asyncio -import os -from tempfile import gettempdir - -logger = logging.getLogger("ggml-vk-generate-shaders") - -GLSLC = "glslc" - -type_names = [ - "f32", - "f16", - "q4_0", - "q4_1", - "q5_0", - "q5_1", - "q8_0", - "q2_k", - "q3_k", - "q4_k", - "q5_k", - "q6_k", -] - -ASYNCIO_CONCURRENCY = 64 - -input_dir = "vulkan-shaders" -output_dir = gettempdir() - -lock = asyncio.Lock() -shader_fnames = [] - - -async def string_to_spv(name, in_fname, defines, fp16=True): - name = f"{name}{'_fp32' if not fp16 else ''}" - out_fname = os.path.join(output_dir, f"{name}.spv") - - in_path = os.path.join(input_dir, in_fname) - - cmd = [GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname] - - cmd.extend([f"-D{key}={value}" for key, value in defines.items()]) - - proc = await asyncio.create_subprocess_exec(*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE) - - stdout, stderr = await proc.communicate() - - stdout = stdout.decode() - error = stderr.decode() - - if proc.returncode: - cmd = " ".join(cmd) - logger.error(f"cannot compile {name}\n\n{cmd}\n\n{error}") - return - - async with lock: - shader_fnames.append((name, out_fname)) - - -def matmul_shaders(tasks, fp16, matmul_id): - if fp16: - load_vec = "8" - aligned_b_type_f32 = "mat2x4" - aligned_b_type_f16 = "f16mat2x4" - else: - load_vec = "4" - aligned_b_type_f32 = "vec4" - aligned_b_type_f16 = "f16vec4" - - base_dict = {"FLOAT_TYPE": "float" if not fp16 else "float16_t"} - shader_name = "matmul" - - if matmul_id: - base_dict["MUL_MAT_ID"] = "1" - shader_name = "matmul_id" - - if fp16: - base_dict["FLOAT16"] = "1" - - # Shaders with f16 B_TYPE - tasks.append(string_to_spv(f"{shader_name}_f32_f16", "mul_mm.comp", base_dict | {"DATA_A_F32": "1", "B_TYPE": "float16_t", "D_TYPE": "float"}, fp16)) - tasks.append(string_to_spv(f"{shader_name}_f32_f16_aligned", "mul_mm.comp", base_dict | {"DATA_A_F32": "1", "LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "B_TYPE": aligned_b_type_f16, "D_TYPE": "float"}, fp16)) - - tasks.append(string_to_spv(f"{shader_name}_f16", "mul_mm.comp", base_dict | {"DATA_A_F16": "1", "B_TYPE": "float16_t", "D_TYPE": "float"}, fp16)) - tasks.append(string_to_spv(f"{shader_name}_f16_aligned", "mul_mm.comp", base_dict | {"DATA_A_F16": "1", "LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "B_TYPE": aligned_b_type_f16, "D_TYPE": "float"}, fp16)) - - for tname in type_names: - data_a_key = f"DATA_A_{tname.upper()}" - load_vec_a = load_vec if tname in ("f32", "f16") else "2" - tasks.append(string_to_spv(f"{shader_name}_{tname}_f32", "mul_mm.comp", base_dict | {data_a_key: "1", "B_TYPE": "float", "D_TYPE": "float"}, fp16)) - tasks.append(string_to_spv(f"{shader_name}_{tname}_f32_aligned", "mul_mm.comp", base_dict | {data_a_key: "2", "LOAD_VEC_A": load_vec_a, "LOAD_VEC_B": load_vec, "B_TYPE": aligned_b_type_f32, "D_TYPE": "float"}, fp16)) - - -async def main(): - logger.info("ggml_vulkan: Generating and compiling shaders to SPIR-V") - - tasks = [] - - for fp16 in (False, True): - # MUL_MAT - matmul_shaders(tasks, fp16, False) - # MUL_MAT_ID - matmul_shaders(tasks, fp16, True) - - for tname in type_names: - base_dict = {"FLOAT_TYPE": "float"} - - # mul mat vec - data_a_key = f"DATA_A_{tname.upper()}" - shader = f"mul_mat_vec_{tname}.comp" if tname.endswith("_k") else "mul_mat_vec.comp" - - tasks.append(string_to_spv(f"mul_mat_vec_{tname}_f32_f32", shader, base_dict | {data_a_key: "1", "B_TYPE": "float", "D_TYPE": "float"})) - tasks.append(string_to_spv(f"mul_mat_vec_{tname}_f16_f32", shader, base_dict | {data_a_key: "1", "B_TYPE": "float16_t", "D_TYPE": "float"})) - - tasks.append(string_to_spv(f"mul_mat_vec_id_{tname}_f32", shader, base_dict | {"MUL_MAT_ID": "1", data_a_key: "1", "B_TYPE": "float", "D_TYPE": "float"})) - - # Dequant shaders - if tname != "f16": - tasks.append(string_to_spv(f"dequant_{tname}", f"dequant_{tname}.comp", base_dict | {data_a_key: "1", "D_TYPE": "float16_t"})) - - # get_rows - if not tname.endswith("_k"): - shader = "get_rows.comp" if tname in ("f32", "f16") else "get_rows_quant.comp" - - if tname == "f16": - tasks.append(string_to_spv(f"get_rows_{tname}", shader, {data_a_key: "1", "B_TYPE": "int", "D_TYPE": "float16_t", "OPTIMIZATION_ERROR_WORKAROUND": "1"})) - else: - tasks.append(string_to_spv(f"get_rows_{tname}", shader, {data_a_key: "1", "B_TYPE": "int", "D_TYPE": "float16_t"})) - tasks.append(string_to_spv(f"get_rows_{tname}_f32", shader, {data_a_key: "1", "B_TYPE": "int", "D_TYPE": "float"})) - - tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"})) - tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"})) - - # Norms - tasks.append(string_to_spv("norm_f32", "norm.comp", base_dict | {"A_TYPE": "float", "D_TYPE": "float"})) - tasks.append(string_to_spv("rms_norm_f32", "rms_norm.comp", base_dict | {"A_TYPE": "float", "D_TYPE": "float"})) - - tasks.append(string_to_spv("cpy_f32_f32", "copy.comp", {"A_TYPE": "float", "D_TYPE": "float"})) - tasks.append(string_to_spv("cpy_f32_f16", "copy.comp", {"A_TYPE": "float", "D_TYPE": "float16_t"})) - tasks.append(string_to_spv("cpy_f16_f16", "copy.comp", {"A_TYPE": "float16_t", "D_TYPE": "float16_t", "OPTIMIZATION_ERROR_WORKAROUND": "1"})) - - tasks.append(string_to_spv("add_f32", "add.comp", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"})) - - tasks.append(string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {})) - - tasks.append(string_to_spv("mul_f32", "mul.comp", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"})) - - tasks.append(string_to_spv("div_f32", "div.comp", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"})) - - tasks.append(string_to_spv("scale_f32", "scale.comp", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"})) - - tasks.append(string_to_spv("sqr_f32", "square.comp", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"})) - - tasks.append(string_to_spv("clamp_f32", "clamp.comp", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"})) - - tasks.append(string_to_spv("gelu_f32", "gelu.comp", {"A_TYPE": "float", "D_TYPE": "float"})) - tasks.append(string_to_spv("silu_f32", "silu.comp", {"A_TYPE": "float", "D_TYPE": "float"})) - tasks.append(string_to_spv("relu_f32", "relu.comp", {"A_TYPE": "float", "D_TYPE": "float"})) - - tasks.append(string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {"A_TYPE": "float", "D_TYPE": "float"})) - - tasks.append(string_to_spv("soft_max_f32", "soft_max.comp", base_dict | {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"})) - tasks.append(string_to_spv("soft_max_f32_f16", "soft_max.comp", base_dict | {"A_TYPE": "float", "B_TYPE": "float16_t", "D_TYPE": "float"})) - - tasks.append(string_to_spv("rope_norm_f32", "rope_norm.comp", {"A_TYPE": "float", "D_TYPE": "float"})) - tasks.append(string_to_spv("rope_norm_f16", "rope_norm.comp", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"})) - - tasks.append(string_to_spv("rope_neox_f32", "rope_neox.comp", {"A_TYPE": "float", "D_TYPE": "float"})) - tasks.append(string_to_spv("rope_neox_f16", "rope_neox.comp", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"})) - - tasks.append(string_to_spv("argsort_f32", "argsort.comp", {"A_TYPE": "float"})) - - tasks.append(string_to_spv("sum_rows_f32", "sum_rows.comp", base_dict | {"A_TYPE": "float", "D_TYPE": "float"})) - - # Helper to decorate tasks with semaphore acquisition. - async def withSemaphore(sem, task): - async with sem: - return await task - - # Run tasks concurrently guarded by a concurrency limit. - sem = asyncio.Semaphore(ASYNCIO_CONCURRENCY) - await asyncio.gather(*(withSemaphore(sem, task) for task in tasks)) - - with open("ggml-vulkan-shaders.hpp", "w") as f: - f.write("#include \n\n") - for name, path in sorted(shader_fnames): - - with open(path, "rb") as spv: - counter = 0 - newline_counter = 0 - f.write(f"unsigned char {name}_data[] = {{\n") - for val in spv.read(): - f.write(f"0x{val:02x},") - newline_counter += 1 - counter += 1 - if newline_counter >= 12: - newline_counter = 0 - f.write("\n") - f.write("\n};\n") - f.write(f"const uint64_t {name}_len = {counter};\n\n") - os.remove(path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="GGML Vulkan Shader Generator") - - parser.add_argument("--glslc", help="Path to glslc") - parser.add_argument("--verbose", action="store_true", help="increase output verbosity") - - args = parser.parse_args() - - logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) - - if args.glslc: - GLSLC = args.glslc - - asyncio.run(main()) diff --git a/src/ggml-cpu-impl.h b/src/ggml-cpu-impl.h deleted file mode 100644 index 5b45155b..00000000 --- a/src/ggml-cpu-impl.h +++ /dev/null @@ -1,614 +0,0 @@ -#pragma once - -// GGML CPU internal header - -#include "ggml.h" -#include "ggml-impl.h" -#include // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/ -//#include -#include -#include // memcpy -#include // fabsf - - -#ifdef __cplusplus -extern "C" { -#endif - -#if defined(_MSC_VER) - -#define m512bh(p) p -#define m512i(p) p - -#else - -#define m512bh(p) (__m512bh)(p) -#define m512i(p) (__m512i)(p) - -#endif - -/** - * Converts brain16 to float32. - * - * The bfloat16 floating point format has the following structure: - * - * ┌sign - * │ - * │ ┌exponent - * │ │ - * │ │ ┌mantissa - * │ │ │ - * │┌──┴───┐┌─┴───┐ - * 0b0000000000000000 brain16 - * - * Since bf16 has the same number of exponent bits as a 32bit float, - * encoding and decoding numbers becomes relatively straightforward. - * - * ┌sign - * │ - * │ ┌exponent - * │ │ - * │ │ ┌mantissa - * │ │ │ - * │┌──┴───┐┌─┴───────────────────┐ - * 0b00000000000000000000000000000000 IEEE binary32 - * - * For comparison, the standard fp16 format has fewer exponent bits. - * - * ┌sign - * │ - * │ ┌exponent - * │ │ - * │ │ ┌mantissa - * │ │ │ - * │┌─┴─┐┌─┴──────┐ - * 0b0000000000000000 IEEE binary16 - * - * @see IEEE 754-2008 - */ -static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) { - union { - float f; - uint32_t i; - } u; - u.i = (uint32_t)h.bits << 16; - return u.f; -} - -/** - * Converts float32 to brain16. - * - * This is binary identical with Google Brain float conversion. - * Floats shall round to nearest even, and NANs shall be quiet. - * Subnormals aren't flushed to zero, except perhaps when used. - * This code should vectorize nicely if using modern compilers. - */ -static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { - ggml_bf16_t h; - union { - float f; - uint32_t i; - } u; - u.f = s; - if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */ - h.bits = (u.i >> 16) | 64; /* force to quiet */ - return h; - } - h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16; - return h; -} - -#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x) -#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x) - -// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 -#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) -#ifndef __FMA__ -#define __FMA__ -#endif -#ifndef __F16C__ -#define __F16C__ -#endif -#endif - -// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available -#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)) -#ifndef __SSE3__ -#define __SSE3__ -#endif -#ifndef __SSSE3__ -#define __SSSE3__ -#endif -#endif - -#if defined(__ARM_FEATURE_SVE) -#include -#include -#endif - -// 16-bit float -// on Arm, we use __fp16 -// on x86, we use uint16_t -#if defined(__ARM_NEON) - -// if YCM cannot find , make a symbolic link to it, for example: -// -// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ -// -#include - -#ifdef _MSC_VER - -typedef uint16_t ggml_fp16_internal_t; - -#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) } - -#else - -typedef __fp16 ggml_fp16_internal_t; - -#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) } - -#endif // _MSC_VER - -#if !defined(__aarch64__) - -// 32-bit ARM compatibility - -// vaddlvq_s16 -// vpaddq_s16 -// vpaddq_s32 -// vaddvq_s32 -// vaddvq_f32 -// vmaxvq_f32 -// vcvtnq_s32_f32 -// vzip1_u8 -// vzip2_u8 - -inline static int32_t vaddlvq_s16(int16x8_t v) { - int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v))); - return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2); -} - -inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { - int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); - int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); - return vcombine_s16(a0, b0); -} - -inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) { - int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a)); - int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b)); - return vcombine_s32(a0, b0); -} - -inline static int32_t vaddvq_s32(int32x4_t v) { - return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); -} - -inline static float vaddvq_f32(float32x4_t v) { - return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); -} - -inline static float vmaxvq_f32(float32x4_t v) { - return - MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), - MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); -} - -inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { - int32x4_t res; - - res[0] = roundf(vgetq_lane_f32(v, 0)); - res[1] = roundf(vgetq_lane_f32(v, 1)); - res[2] = roundf(vgetq_lane_f32(v, 2)); - res[3] = roundf(vgetq_lane_f32(v, 3)); - - return res; -} - -inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) { - uint8x8_t res; - - res[0] = a[0]; res[1] = b[0]; - res[2] = a[1]; res[3] = b[1]; - res[4] = a[2]; res[5] = b[2]; - res[6] = a[3]; res[7] = b[3]; - - return res; -} - -inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { - uint8x8_t res; - - res[0] = a[4]; res[1] = b[4]; - res[2] = a[5]; res[3] = b[5]; - res[4] = a[6]; res[5] = b[6]; - res[6] = a[7]; res[7] = b[7]; - - return res; -} - -// vld1q_s16_x2 -// vld1q_u8_x2 -// vld1q_u8_x4 -// vld1q_s8_x2 -// vld1q_s8_x4 -// TODO: double-check these work correctly - -typedef struct ggml_int16x8x2_t { - int16x8_t val[2]; -} ggml_int16x8x2_t; - -inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) { - ggml_int16x8x2_t res; - - res.val[0] = vld1q_s16(ptr + 0); - res.val[1] = vld1q_s16(ptr + 8); - - return res; -} - -typedef struct ggml_uint8x16x2_t { - uint8x16_t val[2]; -} ggml_uint8x16x2_t; - -inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) { - ggml_uint8x16x2_t res; - - res.val[0] = vld1q_u8(ptr + 0); - res.val[1] = vld1q_u8(ptr + 16); - - return res; -} - -typedef struct ggml_uint8x16x4_t { - uint8x16_t val[4]; -} ggml_uint8x16x4_t; - -inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) { - ggml_uint8x16x4_t res; - - res.val[0] = vld1q_u8(ptr + 0); - res.val[1] = vld1q_u8(ptr + 16); - res.val[2] = vld1q_u8(ptr + 32); - res.val[3] = vld1q_u8(ptr + 48); - - return res; -} - -typedef struct ggml_int8x16x2_t { - int8x16_t val[2]; -} ggml_int8x16x2_t; - -inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) { - ggml_int8x16x2_t res; - - res.val[0] = vld1q_s8(ptr + 0); - res.val[1] = vld1q_s8(ptr + 16); - - return res; -} - -typedef struct ggml_int8x16x4_t { - int8x16_t val[4]; -} ggml_int8x16x4_t; - -inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) { - ggml_int8x16x4_t res; - - res.val[0] = vld1q_s8(ptr + 0); - res.val[1] = vld1q_s8(ptr + 16); - res.val[2] = vld1q_s8(ptr + 32); - res.val[3] = vld1q_s8(ptr + 48); - - return res; -} - -// NOTE: not tested -inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) { - int8x16_t res; - - res[ 0] = a[b[ 0]]; - res[ 1] = a[b[ 1]]; - res[ 2] = a[b[ 2]]; - res[ 3] = a[b[ 3]]; - res[ 4] = a[b[ 4]]; - res[ 5] = a[b[ 5]]; - res[ 6] = a[b[ 6]]; - res[ 7] = a[b[ 7]]; - res[ 8] = a[b[ 8]]; - res[ 9] = a[b[ 9]]; - res[10] = a[b[10]]; - res[11] = a[b[11]]; - res[12] = a[b[12]]; - res[13] = a[b[13]]; - res[14] = a[b[14]]; - res[15] = a[b[15]]; - - return res; -} - -// NOTE: not tested -inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) { - uint8x16_t res; - - res[ 0] = a[b[ 0]]; - res[ 1] = a[b[ 1]]; - res[ 2] = a[b[ 2]]; - res[ 3] = a[b[ 3]]; - res[ 4] = a[b[ 4]]; - res[ 5] = a[b[ 5]]; - res[ 6] = a[b[ 6]]; - res[ 7] = a[b[ 7]]; - res[ 8] = a[b[ 8]]; - res[ 9] = a[b[ 9]]; - res[10] = a[b[10]]; - res[11] = a[b[11]]; - res[12] = a[b[12]]; - res[13] = a[b[13]]; - res[14] = a[b[14]]; - res[15] = a[b[15]]; - - return res; -} - -#else - -#define ggml_int16x8x2_t int16x8x2_t -#define ggml_uint8x16x2_t uint8x16x2_t -#define ggml_uint8x16x4_t uint8x16x4_t -#define ggml_int8x16x2_t int8x16x2_t -#define ggml_int8x16x4_t int8x16x4_t - -#define ggml_vld1q_s16_x2 vld1q_s16_x2 -#define ggml_vld1q_u8_x2 vld1q_u8_x2 -#define ggml_vld1q_u8_x4 vld1q_u8_x4 -#define ggml_vld1q_s8_x2 vld1q_s8_x2 -#define ggml_vld1q_s8_x4 vld1q_s8_x4 -#define ggml_vqtbl1q_s8 vqtbl1q_s8 -#define ggml_vqtbl1q_u8 vqtbl1q_u8 - -#endif // !defined(__aarch64__) - -#if !defined(__ARM_FEATURE_DOTPROD) - -inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) { - const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b)); - const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b)); - - return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))); -} - -#else - -#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c) - -#endif // !defined(__ARM_FEATURE_DOTPROD) - -#endif // defined(__ARM_NEON) - -#if defined(__ARM_NEON) && !defined(_MSC_VER) - -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - -#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - -static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - ggml_fp16_internal_t tmp; - memcpy(&tmp, &h, sizeof(ggml_fp16_t)); - return (float)tmp; -} - -static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - ggml_fp16_t res; - ggml_fp16_internal_t tmp = f; - memcpy(&res, &tmp, sizeof(ggml_fp16_t)); - return res; -} - -#else - -#ifdef __wasm_simd128__ -#include -#else -#ifdef __POWER9_VECTOR__ -#include -#undef bool -#define bool _Bool -#else -#if defined(_MSC_VER) || defined(__MINGW32__) -#include -#else -#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__) -#if !defined(__riscv) -#include -#endif -#endif -#endif -#endif -#endif - -#ifdef __riscv_v_intrinsic -#include -#endif - -#if defined(__loongarch64) -#if defined(__loongarch_asx) -#include -#endif -#if defined(__loongarch_sx) -#include -#endif -#endif - -#if defined(__loongarch_asx) - -typedef union { - int32_t i; - float f; -} ft_union; - -/* float type data load instructions */ -static __m128 __lsx_vreplfr2vr_s(float val) { - ft_union fi_tmpval = {.f = val}; - return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i); -} - -static __m256 __lasx_xvreplfr2vr_s(float val) { - ft_union fi_tmpval = {.f = val}; - return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i); -} -#endif - -#ifdef __F16C__ - -#ifdef _MSC_VER -#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) -#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) -#else -#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) -#endif - -#elif defined(__POWER9_VECTOR__) - -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) -/* the inline asm below is about 12% faster than the lookup method */ -#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) - -static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - register float f; - register double d; - __asm__( - "mtfprd %0,%2\n" - "xscvhpdp %0,%0\n" - "frsp %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=f"(f): - /* in */ "r"(h)); - return f; -} - -static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - register double d; - register ggml_fp16_t r; - __asm__( /* xscvdphp can work on double or single precision */ - "xscvdphp %0,%2\n" - "mffprd %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=r"(r): - /* in */ "f"(f)); - return r; -} - -#else - -// FP16 <-> FP32 -// ref: https://github.com/Maratyszcza/FP16 - -static inline float fp32_from_bits(uint32_t w) { - union { - uint32_t as_bits; - float as_value; - } fp32; - fp32.as_bits = w; - return fp32.as_value; -} - -static inline uint32_t fp32_to_bits(float f) { - union { - float as_value; - uint32_t as_bits; - } fp32; - fp32.as_value = f; - return fp32.as_bits; -} - -static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - const uint32_t w = (uint32_t) h << 16; - const uint32_t sign = w & UINT32_C(0x80000000); - const uint32_t two_w = w + w; - - const uint32_t exp_offset = UINT32_C(0xE0) << 23; -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) - const float exp_scale = 0x1.0p-112f; -#else - const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); -#endif - const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; - - const uint32_t magic_mask = UINT32_C(126) << 23; - const float magic_bias = 0.5f; - const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; - - const uint32_t denormalized_cutoff = UINT32_C(1) << 27; - const uint32_t result = sign | - (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); - return fp32_from_bits(result); -} - -static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) - const float scale_to_inf = 0x1.0p+112f; - const float scale_to_zero = 0x1.0p-110f; -#else - const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); - const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); -#endif - float base = (fabsf(f) * scale_to_inf) * scale_to_zero; - - const uint32_t w = fp32_to_bits(f); - const uint32_t shl1_w = w + w; - const uint32_t sign = w & UINT32_C(0x80000000); - uint32_t bias = shl1_w & UINT32_C(0xFF000000); - if (bias < UINT32_C(0x71000000)) { - bias = UINT32_C(0x71000000); - } - - base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; - const uint32_t bits = fp32_to_bits(base); - const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); - const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); - const uint32_t nonsign = exp_bits + mantissa_bits; - return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); -} - -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - -#endif // __F16C__ - -#endif // defined(__ARM_NEON) && (!defined(__MSC_VER) - -#ifdef __ARM_FEATURE_SVE -#include -#endif // __ARM_FEATURE_SVE - -// precomputed f32 table for f16 (256 KB) -// defined in ggml.c, initialized in ggml_init() -extern float ggml_table_f32_f16[1 << 16]; - -// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, -// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. -// This is also true for POWER9. -#if !defined(GGML_FP16_TO_FP32) -inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { - uint16_t s; - memcpy(&s, &f, sizeof(uint16_t)); - return ggml_table_f32_f16[s]; -} - -#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) -#endif - -#if !defined(GGML_FP32_TO_FP16) -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) -#endif - -#ifdef __cplusplus -} -#endif diff --git a/src/whisper.cpp b/src/whisper.cpp index 1c4965b6..e3b6025d 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -4268,18 +4268,15 @@ const char * whisper_print_system_info(void) { s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; - s += "METAL = " + std::to_string(ggml_cpu_has_metal()) + " | "; s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; - s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | "; s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; - s += "CUDA = " + std::to_string(ggml_cpu_has_cuda()) + " | "; s += "COREML = " + std::to_string(whisper_has_coreml()) + " | "; s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | "; - s += "CANN = " + std::to_string(ggml_cpu_has_cann()) ; + return s.c_str(); }