Compare commits

..

3 Commits

261 changed files with 46195 additions and 55071 deletions

View File

@ -3,7 +3,6 @@ on: [push, pull_request]
env:
ubuntu_image: "ubuntu:22.04"
VCPKG_BINARY_SOURCES: "clear;x-gha,readwrite"
jobs:
ubuntu-latest:
@ -309,11 +308,11 @@ jobs:
- name: Build using CMake w/ OpenBLAS
shell: msys2 {0}
run: |
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
cmake -B build -DGGML_OPENBLAS=ON
cmake --build build --config ${{ matrix.build }} -j $(nproc)
windows:
runs-on: windows-2019
runs-on: windows-latest
strategy:
matrix:
@ -373,7 +372,7 @@ jobs:
path: build/bin/${{ matrix.build }}
windows-blas:
runs-on: windows-2019
runs-on: windows-latest
strategy:
matrix:
@ -383,8 +382,10 @@ jobs:
sdl2: [ON]
include:
- arch: Win32
obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x86.zip
s2arc: x86
- arch: x64
obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x64.zip
s2arc: x64
- sdl2: ON
s2ver: 2.28.5
@ -393,21 +394,17 @@ jobs:
- name: Clone
uses: actions/checkout@v4
- name: Export GitHub Actions cache environment variables
uses: actions/github-script@v7
with:
script: |
core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || '');
core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
- name: Add msbuild to PATH
uses: microsoft/setup-msbuild@v2
- name: Install OpenBLAS and pkgconfiglite
- name: Fetch OpenBLAS
if: matrix.blas == 'ON'
run: |
vcpkg install --triplet=${{ matrix.s2arc }}-windows openblas
choco install pkgconfiglite
C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }}
7z x blas.zip -oblas -y
copy blas/include/cblas.h .
copy blas/include/openblas_config.h .
echo "OPENBLAS_PATH=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV
- name: Fetch SDL2 and set SDL2_DIR
if: matrix.sdl2 == 'ON'
@ -419,10 +416,9 @@ jobs:
- name: Configure
run: >
cmake -S . -B ./build -A ${{ matrix.arch }}
-DCMAKE_TOOLCHAIN_FILE="$env:VCPKG_INSTALLATION_ROOT/scripts/buildsystems/vcpkg.cmake"
-DCMAKE_BUILD_TYPE=${{ matrix.build }}
-DGGML_BLAS=${{ matrix.blas }}
-DGGML_BLAS_VENDOR=OpenBLAS
-DGGML_OPENBLAS=${{ matrix.blas }}
-DCMAKE_LIBRARY_PATH="$env:OPENBLAS_PATH/lib"
-DWHISPER_SDL2=${{ matrix.sdl2 }}
- name: Build
@ -430,9 +426,9 @@ jobs:
cd ./build
msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
- name: Copy openblas.dll
- name: Copy libopenblas.dll
if: matrix.blas == 'ON'
run: copy "C:/vcpkg/packages/openblas_${{ matrix.s2arc }}-windows/bin/openblas.dll" build/bin/${{ matrix.build }}
run: copy "$env:OPENBLAS_PATH/bin/libopenblas.dll" build/bin/${{ matrix.build }}
- name: Copy SDL2.dll
if: matrix.sdl2 == 'ON'
@ -564,6 +560,12 @@ jobs:
with:
path: whisper
- name: Clone
uses: actions/checkout@v4
with:
repository: ggerganov/ggml
path: ggml
- name: Install Java
uses: actions/setup-java@v4
with:
@ -582,7 +584,7 @@ jobs:
run: |
export PATH_TO_GGML=$PWD/ggml
cd whisper/examples/whisper.android
./gradlew assembleRelease --no-daemon
./gradlew assembleRelease --no-daemon -PGGML_HOME=$PATH_TO_GGML
# TODO: disable because of following fail: https://github.com/ggerganov/whisper.cpp/actions/runs/11019444420/job/30627193602
# android_java:

View File

@ -45,7 +45,7 @@ jobs:
with:
context: .
push: true
platforms: ${{ matrix.config.platform }}
platforms: ${{ matrix.config.platforms }}
tags: "ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
file: ${{ matrix.config.dockerfile }}
@ -54,6 +54,6 @@ jobs:
with:
context: .
push: ${{ github.event_name == 'push' }}
platforms: ${{ matrix.config.platform }}
platforms: ${{ matrix.config.platforms }}
tags: "ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}"
file: ${{ matrix.config.dockerfile }}

.gitignore (vendored): 4 changed lines
View File

@ -1,6 +1,5 @@
*.o
*.a
*.d
.cache/
.coreml/
.test/
@ -20,9 +19,6 @@ build-*/
.swiftpm
*.metallib
ggml-metal-embed.metal
ggml-metal-embed.metal.tmp
/main
/stream
/command

View File

@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.5) # for add_link_options and implicit target directories.
project("whisper.cpp" C CXX)
project("whisper.cpp" VERSION 1.7.2)
project("whisper.cpp" VERSION 1.7.1)
include(CheckIncludeFileCXX)
set(SOVERSION 1)

Makefile: 256 changed lines
View File

@ -444,17 +444,17 @@ endif
else
MK_CFLAGS += -march=rv64gcv -mabi=lp64d
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
endif # RISCV
endif
ifndef GGML_NO_ACCELERATE
# Mac OS - include Accelerate framework.
# `-framework Accelerate` works with both Apple Silicon and Intel Macs
ifeq ($(UNAME_S),Darwin)
MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE
MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS
MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
MK_LDFLAGS += -framework Accelerate
OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o
OBJ_GGML += ggml/src/ggml-blas.o
endif
endif # GGML_NO_ACCELERATE
@ -464,38 +464,29 @@ ifndef GGML_NO_OPENMP
MK_CXXFLAGS += -fopenmp
endif # GGML_NO_OPENMP
ifdef WHISPER_COREML
MK_CXXFLAGS += -DWHISPER_USE_COREML
LDFLAGS += -framework Foundation -framework CoreML
ifdef WHISPER_COREML_ALLOW_FALLBACK
MK_CXXFLAGS += -DWHISPER_COREML_ALLOW_FALLBACK
endif
endif # WHISPER_COREML
ifdef GGML_OPENBLAS
MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
MK_LDFLAGS += $(shell pkg-config --libs openblas)
OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o
OBJ_GGML += ggml/src/ggml-blas.o
endif # GGML_OPENBLAS
ifdef GGML_OPENBLAS64
MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64)
MK_LDFLAGS += $(shell pkg-config --libs openblas64)
OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o
OBJ_GGML += ggml/src/ggml-blas.o
endif # GGML_OPENBLAS64
ifdef GGML_BLIS
MK_CPPFLAGS += -DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis
MK_LDFLAGS += -lblis -L/usr/local/lib
OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o
OBJ_GGML += ggml/src/ggml-blas.o
endif # GGML_BLIS
ifdef GGML_RPC
MK_CPPFLAGS += -DGGML_USE_RPC
OBJ_GGML += ggml/src/ggml-rpc/ggml-rpc.o
OBJ_GGML += ggml/src/ggml-rpc.o
endif # GGML_RPC
OBJ_CUDA_TMPL = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu))
@ -522,7 +513,7 @@ ifdef GGML_CUDA
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
MK_NVCCFLAGS += -use_fast_math
OBJ_GGML += ggml/src/ggml-cuda/ggml-cuda.o
OBJ_GGML += ggml/src/ggml-cuda.o
OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
OBJ_GGML += $(OBJ_CUDA_TMPL)
ifdef WHISPER_FATAL_WARNINGS
@ -624,11 +615,11 @@ ggml/src/ggml-cuda/%.o: \
ggml/src/ggml-cuda/common.cuh
$(NVCC_COMPILE)
ggml/src/ggml-cuda/ggml-cuda.o: \
ggml/src/ggml-cuda/ggml-cuda.cu \
ggml/include/ggml-cuda.h \
ggml/src/ggml-cuda.o: \
ggml/src/ggml-cuda.cu \
ggml/include/ggml.h \
ggml/include/ggml-backend.h \
ggml/include/ggml-cuda.h \
ggml/src/ggml-backend-impl.h \
ggml/src/ggml-common.h \
$(wildcard ggml/src/ggml-cuda/*.cuh)
@ -751,43 +742,50 @@ endif # GGML_HIPBLAS
ifdef GGML_METAL
MK_CPPFLAGS += -DGGML_USE_METAL
MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
OBJ_GGML += ggml/src/ggml-metal/ggml-metal.o
OBJ_GGML += ggml/src/ggml-metal.o
ifdef GGML_METAL_NDEBUG
MK_CPPFLAGS += -DGGML_METAL_NDEBUG
endif
ifdef GGML_METAL_EMBED_LIBRARY
MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
OBJ_GGML += ggml/src/ggml-metal/ggml-metal-embed.o
OBJ_GGML += ggml/src/ggml-metal-embed.o
endif
endif # GGML_METAL
ifdef WHISPER_COREML
MK_CXXFLAGS += -DWHISPER_USE_COREML
LDFLAGS += -framework Foundation -framework CoreML
ifdef WHISPER_COREML_ALLOW_FALLBACK
MK_CXXFLAGS += -DWHISPER_COREML_ALLOW_FALLBACK
endif
endif
# ===
ifdef GGML_METAL
ggml/src/ggml-metal/ggml-metal.o: \
ggml/src/ggml-metal/ggml-metal.m \
ggml/src/ggml-metal/ggml-metal-impl.h \
ggml/src/ggml-metal.o: \
ggml/src/ggml-metal.m \
ggml/include/ggml-metal.h \
ggml/include/ggml.h
$(CC) $(CFLAGS) -c $< -o $@
ifdef GGML_METAL_EMBED_LIBRARY
ggml/src/ggml-metal/ggml-metal-embed.o: \
ggml/src/ggml-metal/ggml-metal.metal \
ggml/src/ggml-metal/ggml-metal-impl.h \
ggml/src/ggml-metal-embed.o: \
ggml/src/ggml-metal.metal \
ggml/src/ggml-common.h
@echo "Embedding Metal library"
@sed -e '/__embed_ggml-common.h__/r ggml/src/ggml-common.h' -e '/__embed_ggml-common.h__/d' < ggml/src/ggml-metal/ggml-metal.metal > ggml/src/ggml-metal/ggml-metal-embed.metal.tmp
@sed -e '/#include "ggml-metal-impl.h"/r ggml/src/ggml-metal/ggml-metal-impl.h' -e '/#include "ggml-metal-impl.h"/d' < ggml/src/ggml-metal/ggml-metal-embed.metal.tmp > ggml/src/ggml-metal/ggml-metal-embed.metal
$(eval TEMP_ASSEMBLY=$(shell mktemp -d))
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo ".incbin \"ggml/src/ggml-metal/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
$(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
@rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s
@rmdir ${TEMP_ASSEMBLY}
@sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-embed.metal
$(eval TEMP_ASSEMBLY=$(shell mktemp))
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
@echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
@$(AS) $(TEMP_ASSEMBLY) -o $@
@rm -f ${TEMP_ASSEMBLY}
endif
endif # GGML_METAL
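For context on the `GGML_METAL_EMBED_LIBRARY` rule above: the generated assembly stub places the preprocessed Metal source between the `_ggml_metallib_start` and `_ggml_metallib_end` labels via `.incbin`. Below is a minimal C++ sketch of how such embedded data is typically read back; the symbol names match the stub above (Mach-O prepends the leading underscore to C identifiers), but the actual loading code in `ggml-metal.m` may differ.

```cpp
#include <cstddef>
#include <string>

// Labels emitted by the .incbin stub; the C identifiers map to
// _ggml_metallib_start / _ggml_metallib_end at link time on Mach-O.
extern "C" const char ggml_metallib_start[];
extern "C" const char ggml_metallib_end[];

// Return the embedded Metal source as a string (sketch only).
static std::string load_embedded_metal_source() {
    const size_t len = static_cast<size_t>(ggml_metallib_end - ggml_metallib_start);
    return std::string(ggml_metallib_start, len);
}
```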
@ -803,17 +801,10 @@ endif
OBJ_GGML += \
ggml/src/ggml.o \
ggml/src/ggml-aarch64.o \
ggml/src/ggml-alloc.o \
ggml/src/ggml-backend.o \
ggml/src/ggml-backend-reg.o \
ggml/src/ggml-opt.o \
ggml/src/ggml-quants.o \
ggml/src/ggml-threading.o \
ggml/src/ggml-cpu/ggml-cpu.o \
ggml/src/ggml-cpu/ggml-cpu-cpp.o \
ggml/src/ggml-cpu/ggml-cpu-aarch64.o \
ggml/src/ggml-cpu/ggml-cpu-quants.o
ggml/src/ggml-aarch64.o
OBJ_WHISPER += \
src/whisper.o
@ -918,64 +909,108 @@ endif
# Build libraries
#
LIB_GGML = libggml.so
LIB_GGML_S = libggml.a
# ggml
LIB_LLAMA = libllama.so
LIB_LLAMA_S = libllama.a
ggml/src/ggml.o: \
ggml/src/ggml.c \
ggml/include/ggml.h
$(CC) $(CFLAGS) -c $< -o $@
LIB_COMMON = libcommon.so
LIB_COMMON_S = libcommon.a
LIB_COMMON_SDL = libcommon-sdl.so
LIB_COMMON_SDL_S = libcommon-sdl.a
# Targets
BUILD_TARGETS += $(LIB_GGML) $(LIB_GGML_S) $(LIB_LLAMA) $(LIB_LLAMA_S) $(LIB_COMMON) $(LIB_COMMON_S)
# Dependency files
DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)
# Default target
all: $(BUILD_TARGETS)
# Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
# g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml
ggml/src/ggml-cpu/ggml-cpu-cpp.o: \
ggml/src/ggml-cpu/ggml-cpu.cpp \
ggml/include/ggml-backend.h \
ggml/src/ggml-alloc.o: \
ggml/src/ggml-alloc.c \
ggml/include/ggml.h \
ggml/include/ggml-alloc.h \
ggml/src/ggml-backend-impl.h \
ggml/include/ggml-cpu.h \
ggml/src/ggml-impl.h
$(CXX) $(CXXFLAGS) -c $< -o $@
ggml/include/ggml-alloc.h
$(CC) $(CFLAGS) -c $< -o $@
# Rules for building object files
ggml/%.o: ggml/%.c
$(CC) $(CFLAGS) -MMD -c $< -o $@
ggml/src/ggml-backend.o: \
ggml/src/ggml-backend.cpp \
ggml/include/ggml.h \
ggml/include/ggml-backend.h
$(CXX) $(CXXFLAGS) -c $< -o $@
ggml/%.o: ggml/%.cpp
$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
ggml/src/ggml-quants.o: \
ggml/src/ggml-quants.c \
ggml/include/ggml.h \
ggml/src/ggml-quants.h \
ggml/src/ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@
src/%.o: src/%.cpp
$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
ggml/src/ggml-aarch64.o: \
ggml/src/ggml-aarch64.c \
ggml/include/ggml.h \
ggml/src/ggml-aarch64.h \
ggml/src/ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@
examples/%.o: examples/%.cpp
$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
ggml/src/ggml-blas.o: \
ggml/src/ggml-blas.cpp \
ggml/include/ggml-blas.h
$(CXX) $(CXXFLAGS) -c $< -o $@
# Rules for building libraries
$(LIB_GGML): $(OBJ_GGML)
ifdef GGML_LLAMAFILE
ggml/src/sgemm.o: \
ggml/src/sgemm.cpp \
ggml/src/sgemm.h \
ggml/include/ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif # GGML_LLAMAFILE
ifdef GGML_RPC
ggml/src/ggml-rpc.o: \
ggml/src/ggml-rpc.cpp \
ggml/include/ggml-rpc.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif # GGML_RPC
$(LIB_GGML): \
$(OBJ_GGML)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
$(LIB_GGML_S): $(OBJ_GGML)
$(LIB_GGML_S): \
$(OBJ_GGML)
ar rcs $(LIB_GGML_S) $^
$(LIB_LLAMA): $(OBJ_LLAMA) $(LIB_GGML)
# whisper
src/whisper.o: \
src/whisper.cpp \
include/whisper.h \
ggml/include/ggml.h \
ggml/include/ggml-alloc.h \
ggml/include/ggml-backend.h \
ggml/include/ggml-cuda.h \
ggml/include/ggml-metal.h
$(CXX) $(CXXFLAGS) -c $< -o $@
$(LIB_WHISPER): \
$(OBJ_WHISPER) \
$(LIB_GGML)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
$(LIB_LLAMA_S): $(OBJ_LLAMA)
ar rcs $(LIB_LLAMA_S) $^
$(LIB_WHISPER_S): \
$(OBJ_WHISPER) \
$(OBJ_GGML)
ar rcs $(LIB_WHISPER_S) $^
# common
examples/common.o: \
examples/common.cpp \
examples/common.h
$(CXX) $(CXXFLAGS) -c $< -o $@
examples/common-ggml.o: \
examples/common-ggml.cpp \
examples/common-ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@
$(LIB_COMMON): \
$(OBJ_COMMON)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
$(LIB_COMMON_S): \
$(OBJ_COMMON)
ar rcs $(LIB_COMMON_S) $^
# common-sdl
@ -987,21 +1022,34 @@ examples/common-sdl.o: \
examples/common-sdl.h
$(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $@
$(LIB_COMMON): $(OBJ_COMMON) $(LIB_LLAMA) $(LIB_GGML)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
$(LIB_COMMON_SDL): \
$(OBJ_SDL)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) $(LDFLAGS_SDL)
$(LIB_COMMON_S): $(OBJ_COMMON)
ar rcs $(LIB_COMMON_S) $^
$(LIB_COMMON_SDL_S): \
$(OBJ_SDL)
ar rcs $(LIB_COMMON_SDL_S) $^
# Include dependency files
-include $(DEP_FILES)
# Clean rule
clean:
rm -vrf $(BUILD_TARGETS) $(TEST_TARGETS)
rm -rvf *.a *.dll *.so *.dot
find ggml src tests examples -type f -name "*.o" -delete
find ggml src tests examples -type f -name "*.d" -delete
rm -vrf *.dot $(BUILD_TARGETS) $(TEST_TARGETS)
rm -rvf src/*.o
rm -rvf src/coreml/*.o
rm -rvf tests/*.o
rm -rvf examples/*.o
rm -rvf *.a
rm -rvf *.dll
rm -rvf *.so
rm -rvf *.dot
rm -rvf ggml/*.a
rm -rvf ggml/*.dll
rm -rvf ggml/*.so
rm -vrf ggml/src/*.o
rm -vrf ggml/src/ggml-metal-embed.metal
rm -vrf ggml/src/ggml-cuda/*.o
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
rm -rvf $(BUILD_TARGETS)
rm -rvf $(TEST_TARGETS)
find examples -type f -name "*.o" -delete
#
# Examples

View File

@ -18,17 +18,16 @@ let package = Package(
name: "whisper",
path: ".",
exclude: [
"build",
"bindings",
"cmake",
"coreml",
"examples",
"scripts",
"extra",
"models",
"samples",
"tests",
"CMakeLists.txt",
"Makefile",
"ggml/src/ggml-metal/ggml-metal-embed.metal"
"Makefile"
],
sources: [
"ggml/src/ggml.c",
@ -36,22 +35,15 @@ let package = Package(
"ggml/src/ggml-aarch64.c",
"ggml/src/ggml-alloc.c",
"ggml/src/ggml-backend.cpp",
"ggml/src/ggml-backend-reg.cpp",
"ggml/src/ggml-cpu/ggml-cpu.c",
"ggml/src/ggml-cpu/ggml-cpu.cpp",
"ggml/src/ggml-cpu/ggml-cpu-aarch64.c",
"ggml/src/ggml-cpu/ggml-cpu-quants.c",
"ggml/src/ggml-quants.c",
"ggml/src/ggml-threading.cpp",
"ggml/src/ggml-metal/ggml-metal.m"
"ggml/src/ggml-metal.m"
],
resources: [.process("ggml/src/ggml-metal/ggml-metal.metal")],
resources: [.process("ggml-metal.metal")],
publicHeadersPath: "spm-headers",
cSettings: [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
.unsafeFlags(["-fno-objc-arc"]),
.headerSearchPath("ggml/src"),
.define("GGML_USE_ACCELERATE"),
.unsafeFlags(["-fno-objc-arc"]),
.define("GGML_USE_METAL")
// NOTE: NEW_LAPACK will require iOS version 16.4+
// We should consider adding this in the future when we drop support for iOS 14

View File

@ -7,7 +7,7 @@
[![Conan Center](https://shields.io/conan/v/whisper-cpp)](https://conan.io/center/whisper-cpp)
[![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)
Stable: [v1.7.2](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.7.2) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
Stable: [v1.7.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.7.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
@ -16,7 +16,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
- AVX intrinsics support for x86 architectures
- VSX intrinsics support for POWER architectures
- Mixed F16 / F32 precision
- [Integer quantization support](#quantization)
- [4-bit and 5-bit integer quantization support](#quantization)
- Zero memory allocations at runtime
- [Vulkan support](#vulkan-gpu-support)
- Support for CPU-only inference

View File

@ -1,6 +1,6 @@
{
"name": "whisper.cpp",
"version": "1.7.2",
"version": "1.7.1",
"description": "Whisper speech recognition",
"main": "whisper.js",
"scripts": {

View File

@ -31,7 +31,7 @@ params.duration = 60_000
params.max_text_tokens = 300
params.translate = true
params.print_timestamps = false
params.initial_prompt = "Initial prompt here."
params.prompt = "Initial prompt here."
whisper.transcribe("path/to/audio.wav", params) do |whole_text|
puts whole_text
@ -107,63 +107,5 @@ whisper.transcribe("path/to/audio.wav", params)
```
You can see model information:
```ruby
whisper = Whisper::Context.new("path/to/model.bin")
model = whisper.model
model.n_vocab # => 51864
model.n_audio_ctx # => 1500
model.n_audio_state # => 512
model.n_audio_head # => 8
model.n_audio_layer # => 6
model.n_text_ctx # => 448
model.n_text_state # => 512
model.n_text_head # => 8
model.n_text_layer # => 6
model.n_mels # => 80
model.ftype # => 1
model.type # => "base"
```
You can set a log callback:
```ruby
prefix = "[MyApp] "
log_callback = ->(level, buffer, user_data) {
case level
when Whisper::LOG_LEVEL_NONE
puts "#{user_data}none: #{buffer}"
when Whisper::LOG_LEVEL_INFO
puts "#{user_data}info: #{buffer}"
when Whisper::LOG_LEVEL_WARN
puts "#{user_data}warn: #{buffer}"
when Whisper::LOG_LEVEL_ERROR
puts "#{user_data}error: #{buffer}"
when Whisper::LOG_LEVEL_DEBUG
puts "#{user_data}debug: #{buffer}"
when Whisper::LOG_LEVEL_CONT
puts "#{user_data}same to previous: #{buffer}"
end
}
Whisper.log_set log_callback, prefix
```
Using this feature, you can also suppress logging:
```ruby
Whisper.log_set ->(level, buffer, user_data) {
# do nothing
}, nil
Whisper::Context.new(MODEL)
```
License
-------
The same as [whisper.cpp][].
[whisper.cpp]: https://github.com/ggerganov/whisper.cpp
[models]: https://github.com/ggerganov/whisper.cpp/tree/master/models

View File

@ -23,39 +23,30 @@ CLEAN.include FileList[
"ext/depend"
]
task build: FileList[
"ext/Makefile",
"ext/ruby_whisper.h",
"ext/ruby_whisper.cpp",
"whispercpp.gemspec",
]
task build: SOURCES + FileList[
"ext/extconf.rb",
"ext/ruby_whisper.h",
"ext/ruby_whisper.cpp",
"whispercpp.gemspec",
]
directory "pkg"
CLOBBER.include "pkg"
TEST_MODEL = "../../models/ggml-base.en.bin"
LIB_NAME = "whisper".ext(RbConfig::CONFIG["DLEXT"])
SO_FILE = File.join("ext", LIB_NAME)
LIB_FILE = File.join("lib", LIB_NAME)
file "ext/Makefile" => ["ext/extconf.rb", "ext/ruby_whisper.h", "ext/ruby_whisper.cpp"] + SOURCES do |t|
Dir.chdir "ext" do
ruby "extconf.rb"
end
end
file SO_FILE => "ext/Makefile" do |t|
directory "lib"
task LIB_FILE => SOURCES + ["lib"] do |t|
Dir.chdir "ext" do
sh "ruby extconf.rb"
sh "make"
end
mv "ext/#{LIB_NAME}", t.name
end
CLEAN.include LIB_FILE
directory "lib"
file LIB_FILE => [SO_FILE, "lib"] do |t|
copy t.source, t.name
end
Rake::TestTask.new do |t|
t.test_files = FileList["tests/test_*.rb"]
end

View File

@ -2,9 +2,6 @@ require 'mkmf'
# need to use c++ compiler flags
$CXXFLAGS << ' -std=c++11'
$LDFLAGS << ' -lstdc++'
# Set to true when building binary gems
if enable_config('static-stdlib', false)
$LDFLAGS << ' -static-libgcc -static-libstdc++'
@ -15,6 +12,34 @@ if enable_config('march-tune-native', false)
$CXXFLAGS << ' -march=native -mtune=native'
end
def with_disabling_unsupported_files
disabled_files = []
unless $GGML_METAL
disabled_files << 'ggml-metal.h' << 'ggml-metal.m'
end
unless $GGML_METAL_EMBED_LIBRARY
disabled_files << 'ggml-metal.metal'
end
unless $OBJ_ALL&.include? 'ggml-blas.o'
disabled_files << 'ggml-blas.h' << 'ggml-blas.cpp'
end
disabled_files.filter! {|file| File.exist? file}
disabled_files.each do |file|
File.rename file, "#{file}.disabled"
end
yield
disabled_files.each do |file|
File.rename "#{file}.disabled", file
end
end
if ENV['WHISPER_METAL']
$GGML_METAL ||= true
$DEPRECATE_WARNING ||= true
@ -41,10 +66,10 @@ $MK_CXXFLAGS = '-std=c++11 -fPIC'
$MK_NVCCFLAGS = '-std=c++11'
$MK_LDFLAGS = ''
$OBJ_GGML = []
$OBJ_WHISPER = []
$OBJ_COMMON = []
$OBJ_SDL = []
$OBJ_GGML = ''
$OBJ_WHISPER = ''
$OBJ_COMMON = ''
$OBJ_SDL = ''
$MK_CPPFLAGS << ' -D_XOPEN_SOURCE=600'
@ -127,7 +152,7 @@ unless ENV['GGML_NO_ACCELERATE']
$MK_CPPFLAGS << ' -DACCELERATE_NEW_LAPACK'
$MK_CPPFLAGS << ' -DACCELERATE_LAPACK_ILP64'
$MK_LDFLAGS << ' -framework Accelerate'
$OBJ_GGML << 'ggml-blas.o'
$OBJ_GGML << ' ggml-blas.o'
end
end
@ -135,20 +160,20 @@ if ENV['GGML_OPENBLAS']
$MK_CPPFLAGS << " -DGGML_USE_BLAS #{`pkg-config --cflags-only-I openblas`.chomp}"
$MK_CFLAGS << " #{`pkg-config --cflags-only-other openblas)`.chomp}"
$MK_LDFLAGS << " #{`pkg-config --libs openblas`}"
$OBJ_GGML << 'ggml-blas.o'
$OBJ_GGML << ' ggml-blas.o'
end
if ENV['GGML_OPENBLAS64']
$MK_CPPFLAGS << " -DGGML_USE_BLAS #{`pkg-config --cflags-only-I openblas64`.chomp}"
$MK_CFLAGS << " #{`pkg-config --cflags-only-other openblas64)`.chomp}"
$MK_LDFLAGS << " #{`pkg-config --libs openblas64`}"
$OBJ_GGML << 'ggml-blas.o'
$OBJ_GGML << ' ggml-blas.o'
end
if $GGML_METAL
$MK_CPPFLAGS << ' -DGGML_USE_METAL'
$MK_LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
$OBJ_GGML << 'ggml-metal.o'
$OBJ_GGML << ' ggml-metal.o'
if ENV['GGML_METAL_NDEBUG']
$MK_CPPFLAGS << ' -DGGML_METAL_NDEBUG'
@ -156,23 +181,21 @@ if $GGML_METAL
if $GGML_METAL_EMBED_LIBRARY
$MK_CPPFLAGS << ' -DGGML_METAL_EMBED_LIBRARY'
$OBJ_GGML << 'ggml-metal-embed.o'
$OBJ_GGML << ' ggml-metal-embed.o'
end
end
$OBJ_GGML <<
'ggml.o' <<
'ggml-cpu.o' <<
'ggml-alloc.o' <<
'ggml-backend.o' <<
'ggml-quants.o' <<
'ggml-aarch64.o'
' ggml.o' <<
' ggml-alloc.o' <<
' ggml-backend.o' <<
' ggml-quants.o' <<
' ggml-aarch64.o'
$OBJ_WHISPER <<
'whisper.o'
' whisper.o'
$objs = $OBJ_GGML + $OBJ_WHISPER + $OBJ_COMMON + $OBJ_SDL
$objs << "ruby_whisper.o"
$OBJ_ALL = "#{$OBJ_GGML} #{$OBJ_WHISPER} #{$OBJ_COMMON} #{$OBJ_SDL}"
$CPPFLAGS = "#{$MK_CPPFLAGS} #{$CPPFLAGS}"
$CFLAGS = "#{$CPPFLAGS} #{$MK_CFLAGS} #{$GF_CFLAGS} #{$CFLAGS}"
@ -181,13 +204,26 @@ $CXXFLAGS = "#{$BASE_CXXFLAGS} #{$HOST_CXXFLAGS} #{$GF_CXXFLAGS} #{$CPPFLAGS}"
$NVCCFLAGS = "#{$MK_NVCCFLAGS} #{$NVCCFLAGS}"
$LDFLAGS = "#{$MK_LDFLAGS} #{$LDFLAGS}"
create_makefile('whisper')
if $GGML_METAL_EMBED_LIBRARY
File.write 'depend', "$(OBJS): $(OBJS) ggml-metal-embed.o\n"
end
with_disabling_unsupported_files do
create_makefile('whisper')
end
File.open 'Makefile', 'a' do |file|
file.puts 'include get-flags.mk'
if $GGML_METAL
if $GGML_METAL_EMBED_LIBRARY
# mkmf determines object files to compile depending on existing *.{c,cpp,m} files,
# but ggml-metal-embed.c doesn't exist when the Makefile is created.
file.puts "objs := $(OBJS)"
file.puts "OBJS = $(objs) 'ggml-metal-embed.o'"
file.puts 'include metal-embed.mk'
end
end

View File

@ -41,8 +41,6 @@ static ID id_call;
static ID id___method__;
static ID id_to_enum;
static bool is_log_callback_finalized = false;
/*
* call-seq:
* lang_max_id -> Integer
@ -90,39 +88,6 @@ static VALUE ruby_whisper_s_lang_str_full(VALUE self, VALUE id) {
return rb_str_new2(str_full);
}
static VALUE ruby_whisper_s_finalize_log_callback(VALUE self, VALUE id) {
is_log_callback_finalized = true;
return Qnil;
}
/*
* call-seq:
* log_set ->(level, buffer, user_data) { ... }, user_data -> nil
*/
static VALUE ruby_whisper_s_log_set(VALUE self, VALUE log_callback, VALUE user_data) {
VALUE old_callback = rb_iv_get(self, "@log_callback");
if (!NIL_P(old_callback)) {
rb_undefine_finalizer(old_callback);
}
rb_iv_set(self, "@log_callback", log_callback);
rb_iv_set(self, "@user_data", user_data);
VALUE finalize_log_callback = rb_funcall(mWhisper, rb_intern("method"), 1, rb_str_new2("finalize_log_callback"));
rb_define_finalizer(log_callback, finalize_log_callback);
whisper_log_set([](ggml_log_level level, const char * buffer, void * user_data) {
if (is_log_callback_finalized) {
return;
}
VALUE log_callback = rb_iv_get(mWhisper, "@log_callback");
VALUE udata = rb_iv_get(mWhisper, "@user_data");
rb_funcall(log_callback, id_call, 3, INT2NUM(level), rb_str_new2(buffer), udata);
}, nullptr);
return Qnil;
}
static void ruby_whisper_free(ruby_whisper *rw) {
if (rw->context) {
whisper_free(rw->context);
@ -424,126 +389,6 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
return self;
}
/*
* call-seq:
* model_n_vocab -> Integer
*/
VALUE ruby_whisper_model_n_vocab(VALUE self) {
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return INT2NUM(whisper_model_n_vocab(rw->context));
}
/*
* call-seq:
* model_n_audio_ctx -> Integer
*/
VALUE ruby_whisper_model_n_audio_ctx(VALUE self) {
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return INT2NUM(whisper_model_n_audio_ctx(rw->context));
}
/*
* call-seq:
* model_n_audio_state -> Integer
*/
VALUE ruby_whisper_model_n_audio_state(VALUE self) {
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return INT2NUM(whisper_model_n_audio_state(rw->context));
}
/*
* call-seq:
* model_n_audio_head -> Integer
*/
VALUE ruby_whisper_model_n_audio_head(VALUE self) {
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return INT2NUM(whisper_model_n_audio_head(rw->context));
}
/*
* call-seq:
* model_n_audio_layer -> Integer
*/
VALUE ruby_whisper_model_n_audio_layer(VALUE self) {
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return INT2NUM(whisper_model_n_audio_layer(rw->context));
}
/*
* call-seq:
* model_n_text_ctx -> Integer
*/
VALUE ruby_whisper_model_n_text_ctx(VALUE self) {
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return INT2NUM(whisper_model_n_text_ctx(rw->context));
}
/*
* call-seq:
* model_n_text_state -> Integer
*/
VALUE ruby_whisper_model_n_text_state(VALUE self) {
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return INT2NUM(whisper_model_n_text_state(rw->context));
}
/*
* call-seq:
* model_n_text_head -> Integer
*/
VALUE ruby_whisper_model_n_text_head(VALUE self) {
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return INT2NUM(whisper_model_n_text_head(rw->context));
}
/*
* call-seq:
* model_n_text_layer -> Integer
*/
VALUE ruby_whisper_model_n_text_layer(VALUE self) {
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return INT2NUM(whisper_model_n_text_layer(rw->context));
}
/*
* call-seq:
* model_n_mels -> Integer
*/
VALUE ruby_whisper_model_n_mels(VALUE self) {
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return INT2NUM(whisper_model_n_mels(rw->context));
}
/*
* call-seq:
* model_ftype -> Integer
*/
VALUE ruby_whisper_model_ftype(VALUE self) {
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return INT2NUM(whisper_model_ftype(rw->context));
}
/*
* call-seq:
* model_type -> String
*/
VALUE ruby_whisper_model_type(VALUE self) {
ruby_whisper *rw;
Data_Get_Struct(self, ruby_whisper, rw);
return rb_str_new2(whisper_model_type_readable(rw->context));
}
/*
* Number of segments.
*
@ -1170,12 +1015,7 @@ typedef struct {
int index;
} ruby_whisper_segment;
typedef struct {
VALUE context;
} ruby_whisper_model;
VALUE cSegment;
VALUE cModel;
static void rb_whisper_segment_mark(ruby_whisper_segment *rws) {
rb_gc_mark(rws->context);
@ -1348,176 +1188,6 @@ static VALUE ruby_whisper_segment_get_text(VALUE self) {
return rb_str_new2(text);
}
static void rb_whisper_model_mark(ruby_whisper_model *rwm) {
rb_gc_mark(rwm->context);
}
static VALUE ruby_whisper_model_allocate(VALUE klass) {
ruby_whisper_model *rwm;
rwm = ALLOC(ruby_whisper_model);
return Data_Wrap_Struct(klass, rb_whisper_model_mark, RUBY_DEFAULT_FREE, rwm);
}
static VALUE rb_whisper_model_initialize(VALUE context) {
ruby_whisper_model *rwm;
const VALUE model = ruby_whisper_model_allocate(cModel);
Data_Get_Struct(model, ruby_whisper_model, rwm);
rwm->context = context;
return model;
};
/*
* call-seq:
* model -> Whisper::Model
*/
static VALUE ruby_whisper_get_model(VALUE self) {
return rb_whisper_model_initialize(self);
}
/*
* call-seq:
* n_vocab -> Integer
*/
static VALUE ruby_whisper_c_model_n_vocab(VALUE self) {
ruby_whisper_model *rwm;
Data_Get_Struct(self, ruby_whisper_model, rwm);
ruby_whisper *rw;
Data_Get_Struct(rwm->context, ruby_whisper, rw);
return INT2NUM(whisper_model_n_vocab(rw->context));
}
/*
* call-seq:
* n_audio_ctx -> Integer
*/
static VALUE ruby_whisper_c_model_n_audio_ctx(VALUE self) {
ruby_whisper_model *rwm;
Data_Get_Struct(self, ruby_whisper_model, rwm);
ruby_whisper *rw;
Data_Get_Struct(rwm->context, ruby_whisper, rw);
return INT2NUM(whisper_model_n_audio_ctx(rw->context));
}
/*
* call-seq:
* n_audio_state -> Integer
*/
static VALUE ruby_whisper_c_model_n_audio_state(VALUE self) {
ruby_whisper_model *rwm;
Data_Get_Struct(self, ruby_whisper_model, rwm);
ruby_whisper *rw;
Data_Get_Struct(rwm->context, ruby_whisper, rw);
return INT2NUM(whisper_model_n_audio_state(rw->context));
}
/*
* call-seq:
* n_audio_head -> Integer
*/
static VALUE ruby_whisper_c_model_n_audio_head(VALUE self) {
ruby_whisper_model *rwm;
Data_Get_Struct(self, ruby_whisper_model, rwm);
ruby_whisper *rw;
Data_Get_Struct(rwm->context, ruby_whisper, rw);
return INT2NUM(whisper_model_n_audio_head(rw->context));
}
/*
* call-seq:
* n_audio_layer -> Integer
*/
static VALUE ruby_whisper_c_model_n_audio_layer(VALUE self) {
ruby_whisper_model *rwm;
Data_Get_Struct(self, ruby_whisper_model, rwm);
ruby_whisper *rw;
Data_Get_Struct(rwm->context, ruby_whisper, rw);
return INT2NUM(whisper_model_n_audio_layer(rw->context));
}
/*
* call-seq:
* n_text_ctx -> Integer
*/
static VALUE ruby_whisper_c_model_n_text_ctx(VALUE self) {
ruby_whisper_model *rwm;
Data_Get_Struct(self, ruby_whisper_model, rwm);
ruby_whisper *rw;
Data_Get_Struct(rwm->context, ruby_whisper, rw);
return INT2NUM(whisper_model_n_text_ctx(rw->context));
}
/*
* call-seq:
* n_text_state -> Integer
*/
static VALUE ruby_whisper_c_model_n_text_state(VALUE self) {
ruby_whisper_model *rwm;
Data_Get_Struct(self, ruby_whisper_model, rwm);
ruby_whisper *rw;
Data_Get_Struct(rwm->context, ruby_whisper, rw);
return INT2NUM(whisper_model_n_text_state(rw->context));
}
/*
* call-seq:
* n_text_head -> Integer
*/
static VALUE ruby_whisper_c_model_n_text_head(VALUE self) {
ruby_whisper_model *rwm;
Data_Get_Struct(self, ruby_whisper_model, rwm);
ruby_whisper *rw;
Data_Get_Struct(rwm->context, ruby_whisper, rw);
return INT2NUM(whisper_model_n_text_head(rw->context));
}
/*
* call-seq:
* n_text_layer -> Integer
*/
static VALUE ruby_whisper_c_model_n_text_layer(VALUE self) {
ruby_whisper_model *rwm;
Data_Get_Struct(self, ruby_whisper_model, rwm);
ruby_whisper *rw;
Data_Get_Struct(rwm->context, ruby_whisper, rw);
return INT2NUM(whisper_model_n_text_layer(rw->context));
}
/*
* call-seq:
* n_mels -> Integer
*/
static VALUE ruby_whisper_c_model_n_mels(VALUE self) {
ruby_whisper_model *rwm;
Data_Get_Struct(self, ruby_whisper_model, rwm);
ruby_whisper *rw;
Data_Get_Struct(rwm->context, ruby_whisper, rw);
return INT2NUM(whisper_model_n_mels(rw->context));
}
/*
* call-seq:
* ftype -> Integer
*/
static VALUE ruby_whisper_c_model_ftype(VALUE self) {
ruby_whisper_model *rwm;
Data_Get_Struct(self, ruby_whisper_model, rwm);
ruby_whisper *rw;
Data_Get_Struct(rwm->context, ruby_whisper, rw);
return INT2NUM(whisper_model_ftype(rw->context));
}
/*
* call-seq:
* type -> String
*/
static VALUE ruby_whisper_c_model_type(VALUE self) {
ruby_whisper_model *rwm;
Data_Get_Struct(self, ruby_whisper_model, rwm);
ruby_whisper *rw;
Data_Get_Struct(rwm->context, ruby_whisper, rw);
return rb_str_new2(whisper_model_type_readable(rw->context));
}
void Init_whisper() {
id_to_s = rb_intern("to_s");
id_call = rb_intern("call");
@ -1528,36 +1198,15 @@ void Init_whisper() {
cContext = rb_define_class_under(mWhisper, "Context", rb_cObject);
cParams = rb_define_class_under(mWhisper, "Params", rb_cObject);
rb_define_const(mWhisper, "LOG_LEVEL_NONE", INT2NUM(GGML_LOG_LEVEL_NONE));
rb_define_const(mWhisper, "LOG_LEVEL_INFO", INT2NUM(GGML_LOG_LEVEL_INFO));
rb_define_const(mWhisper, "LOG_LEVEL_WARN", INT2NUM(GGML_LOG_LEVEL_WARN));
rb_define_const(mWhisper, "LOG_LEVEL_ERROR", INT2NUM(GGML_LOG_LEVEL_ERROR));
rb_define_const(mWhisper, "LOG_LEVEL_DEBUG", INT2NUM(GGML_LOG_LEVEL_DEBUG));
rb_define_const(mWhisper, "LOG_LEVEL_CONT", INT2NUM(GGML_LOG_LEVEL_CONT));
rb_define_singleton_method(mWhisper, "lang_max_id", ruby_whisper_s_lang_max_id, 0);
rb_define_singleton_method(mWhisper, "lang_id", ruby_whisper_s_lang_id, 1);
rb_define_singleton_method(mWhisper, "lang_str", ruby_whisper_s_lang_str, 1);
rb_define_singleton_method(mWhisper, "lang_str_full", ruby_whisper_s_lang_str_full, 1);
rb_define_singleton_method(mWhisper, "log_set", ruby_whisper_s_log_set, 2);
rb_define_singleton_method(mWhisper, "finalize_log_callback", ruby_whisper_s_finalize_log_callback, 1);
rb_define_alloc_func(cContext, ruby_whisper_allocate);
rb_define_method(cContext, "initialize", ruby_whisper_initialize, -1);
rb_define_method(cContext, "transcribe", ruby_whisper_transcribe, -1);
rb_define_method(cContext, "model_n_vocab", ruby_whisper_model_n_vocab, 0);
rb_define_method(cContext, "model_n_audio_ctx", ruby_whisper_model_n_audio_ctx, 0);
rb_define_method(cContext, "model_n_audio_state", ruby_whisper_model_n_audio_state, 0);
rb_define_method(cContext, "model_n_audio_head", ruby_whisper_model_n_audio_head, 0);
rb_define_method(cContext, "model_n_audio_layer", ruby_whisper_model_n_audio_layer, 0);
rb_define_method(cContext, "model_n_text_ctx", ruby_whisper_model_n_text_ctx, 0);
rb_define_method(cContext, "model_n_text_state", ruby_whisper_model_n_text_state, 0);
rb_define_method(cContext, "model_n_text_head", ruby_whisper_model_n_text_head, 0);
rb_define_method(cContext, "model_n_text_layer", ruby_whisper_model_n_text_layer, 0);
rb_define_method(cContext, "model_n_mels", ruby_whisper_model_n_mels, 0);
rb_define_method(cContext, "model_ftype", ruby_whisper_model_ftype, 0);
rb_define_method(cContext, "model_type", ruby_whisper_model_type, 0);
rb_define_method(cContext, "full_n_segments", ruby_whisper_full_n_segments, 0);
rb_define_method(cContext, "full_lang_id", ruby_whisper_full_lang_id, 0);
rb_define_method(cContext, "full_get_segment_t0", ruby_whisper_full_get_segment_t0, 1);
@ -1635,22 +1284,6 @@ void Init_whisper() {
rb_define_method(cSegment, "end_time", ruby_whisper_segment_get_end_time, 0);
rb_define_method(cSegment, "speaker_next_turn?", ruby_whisper_segment_get_speaker_turn_next, 0);
rb_define_method(cSegment, "text", ruby_whisper_segment_get_text, 0);
cModel = rb_define_class_under(mWhisper, "Model", rb_cObject);
rb_define_alloc_func(cModel, ruby_whisper_model_allocate);
rb_define_method(cContext, "model", ruby_whisper_get_model, 0);
rb_define_method(cModel, "n_vocab", ruby_whisper_c_model_n_vocab, 0);
rb_define_method(cModel, "n_audio_ctx", ruby_whisper_c_model_n_audio_ctx, 0);
rb_define_method(cModel, "n_audio_state", ruby_whisper_c_model_n_audio_state, 0);
rb_define_method(cModel, "n_audio_head", ruby_whisper_c_model_n_audio_head, 0);
rb_define_method(cModel, "n_audio_layer", ruby_whisper_c_model_n_audio_layer, 0);
rb_define_method(cModel, "n_text_ctx", ruby_whisper_c_model_n_text_ctx, 0);
rb_define_method(cModel, "n_text_state", ruby_whisper_c_model_n_text_state, 0);
rb_define_method(cModel, "n_text_head", ruby_whisper_c_model_n_text_head, 0);
rb_define_method(cModel, "n_text_layer", ruby_whisper_c_model_n_text_layer, 0);
rb_define_method(cModel, "n_mels", ruby_whisper_c_model_n_mels, 0);
rb_define_method(cModel, "ftype", ruby_whisper_c_model_ftype, 0);
rb_define_method(cModel, "type", ruby_whisper_c_model_type, 0);
}
#ifdef __cplusplus
}

View File

@ -2,7 +2,6 @@
- ../../src/whisper.cpp
- ../../include/whisper.h
- ../../ggml/src/ggml.c
- ../../ggml/src/ggml-cpu.c
- ../../ggml/src/ggml-impl.h
- ../../ggml/src/ggml-aarch64.h
- ../../ggml/src/ggml-aarch64.c
@ -19,7 +18,6 @@
- ../../ggml/include/ggml.h
- ../../ggml/include/ggml-alloc.h
- ../../ggml/include/ggml-backend.h
- ../../ggml/include/ggml-cpu.h
- ../../ggml/include/ggml-cuda.h
- ../../ggml/include/ggml-kompute.h
- ../../ggml/include/ggml-metal.h

View File

@ -1,7 +0,0 @@
require "test/unit"
require "whisper"
class TestBase < Test::Unit::TestCase
MODEL = File.join(__dir__, "..", "..", "..", "models", "ggml-base.en.bin")
AUDIO = File.join(__dir__, "..", "..", "..", "samples", "jfk.wav")
end

View File

@ -1,44 +0,0 @@
require_relative "helper"
class TestModel < TestBase
def test_model
whisper = Whisper::Context.new(MODEL)
assert_instance_of Whisper::Model, whisper.model
end
def test_attributes
whisper = Whisper::Context.new(MODEL)
model = whisper.model
assert_equal 51864, model.n_vocab
assert_equal 1500, model.n_audio_ctx
assert_equal 512, model.n_audio_state
assert_equal 8, model.n_audio_head
assert_equal 6, model.n_audio_layer
assert_equal 448, model.n_text_ctx
assert_equal 512, model.n_text_state
assert_equal 8, model.n_text_head
assert_equal 6, model.n_text_layer
assert_equal 80, model.n_mels
assert_equal 1, model.ftype
assert_equal "base", model.type
end
def test_gc
model = Whisper::Context.new(MODEL).model
GC.start
assert_equal 51864, model.n_vocab
assert_equal 1500, model.n_audio_ctx
assert_equal 512, model.n_audio_state
assert_equal 8, model.n_audio_head
assert_equal 6, model.n_audio_layer
assert_equal 448, model.n_text_ctx
assert_equal 512, model.n_text_state
assert_equal 8, model.n_text_head
assert_equal 6, model.n_text_layer
assert_equal 80, model.n_mels
assert_equal 1, model.ftype
assert_equal "base", model.type
end
end

View File

@ -1,9 +1,9 @@
require_relative "helper"
require 'test/unit'
require 'tempfile'
require 'tmpdir'
require 'shellwords'
class TestPackage < TestBase
class TestPackage < Test::Unit::TestCase
def test_build
Tempfile.create do |file|
assert system("gem", "build", "whispercpp.gemspec", "--output", file.to_path.shellescape, exception: true)

View File

@ -1,6 +1,7 @@
require_relative "helper"
require 'test/unit'
require 'whisper'
class TestParams < TestBase
class TestParams < Test::Unit::TestCase
def setup
@params = Whisper::Params.new
end

View File

@ -1,14 +1,18 @@
require_relative "helper"
require "test/unit"
require "whisper"
class TestSegment < Test::Unit::TestCase
TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
class TestSegment < TestBase
class << self
attr_reader :whisper
def startup
@whisper = Whisper::Context.new(TestBase::MODEL)
@whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
params = Whisper::Params.new
params.print_timestamps = false
@whisper.transcribe(TestBase::AUDIO, params)
jfk = File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav')
@whisper.transcribe(jfk, params)
end
end
@ -56,7 +60,7 @@ class TestSegment < TestBase
end
index += 1
end
whisper.transcribe(AUDIO, params)
whisper.transcribe(File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav'), params)
assert_equal 0, seg.start_time
assert_match /ask not what your country can do for you, ask what you can do for your country/, seg.text
end
@ -72,7 +76,7 @@ class TestSegment < TestBase
assert_same seg, segment
return
end
whisper.transcribe(AUDIO, params)
whisper.transcribe(File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav'), params)
end
private

View File

@ -1,20 +1,20 @@
require_relative "helper"
require "stringio"
require 'whisper'
require 'test/unit'
# Exists to detect memory-related bug
Whisper.log_set ->(level, buffer, user_data) {}, nil
class TestWhisper < Test::Unit::TestCase
TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
class TestWhisper < TestBase
def setup
@params = Whisper::Params.new
end
def test_whisper
@whisper = Whisper::Context.new(MODEL)
@whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
params = Whisper::Params.new
params.print_timestamps = false
@whisper.transcribe(AUDIO, params) {|text|
jfk = File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav')
@whisper.transcribe(jfk, params) {|text|
assert_match /ask not what your country can do for you, ask what you can do for your country/, text
}
end
@ -24,10 +24,11 @@ class TestWhisper < TestBase
attr_reader :whisper
def startup
@whisper = Whisper::Context.new(TestBase::MODEL)
@whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
params = Whisper::Params.new
params.print_timestamps = false
@whisper.transcribe(TestBase::AUDIO, params)
jfk = File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav')
@whisper.transcribe(jfk, params)
end
end
@ -95,33 +96,4 @@ class TestWhisper < TestBase
Whisper.lang_str_full(Whisper.lang_max_id + 1)
end
end
def test_log_set
user_data = Object.new
logs = []
log_callback = ->(level, buffer, udata) {
logs << [level, buffer, udata]
}
Whisper.log_set log_callback, user_data
Whisper::Context.new(MODEL)
assert logs.length > 30
logs.each do |log|
assert_equal Whisper::LOG_LEVEL_INFO, log[0]
assert_same user_data, log[2]
end
end
def test_log_suppress
stderr = $stderr
Whisper.log_set ->(level, buffer, user_data) {
# do nothing
}, nil
dev = StringIO.new("")
$stderr = dev
Whisper::Context.new(MODEL)
assert_empty dev.string
ensure
$stderr = stderr
end
end

View File

@ -137,7 +137,7 @@ if (WHISPER_SDL2)
set_target_properties(lsp PROPERTIES FOLDER "examples")
if (GGML_SYCL)
add_subdirectory(sycl)
set_target_properties(ls-sycl-device PROPERTIES FOLDER "examples")
set_target_properties(sycl PROPERTIES FOLDER "examples")
endif()
endif (WHISPER_SDL2)
endif()

View File

@ -204,6 +204,8 @@ static int decode_audio(struct audio_buffer *audio_buf, s16 **data, int *size)
const size_t errbuffsize = 1024;
char errbuff[errbuffsize];
av_register_all(); // from avformat. Still a must-have call for ffmpeg v3! (can be skipped for later versions)
fmt_ctx = avformat_alloc_context();
avio_ctx_buffer = (u8*)av_malloc(AVIO_CTX_BUF_SZ);
LOG("Creating an avio context: AVIO_CTX_BUF_SZ=%d\n", AVIO_CTX_BUF_SZ);

View File

@ -5,5 +5,5 @@
set(TARGET ls-sycl-device)
add_executable(${TARGET} ls-sycl-device.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -7,16 +7,13 @@ cd build
source /opt/intel/oneapi/setvars.sh
#for FP16
#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DWHISPER_SYCL_F16=ON # faster for long-prompt inference
#cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DWHISPER_SYCL_F16=ON # faster for long-prompt inference
#for FP32
cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#for other features from the examples, e.g. stream and talk link with SDL2:
#cmake .. -DGGML_SYCL=ON -DWHISPER_SDL2=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build example/main only
#cmake --build . --config Release --target main
#build all binary
cmake --build . --config Release -v
cmake --build . --config Release -v

View File

@ -63,30 +63,6 @@ static void llama_log_softmax(float * array, size_t size) {
}
*/
static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp) {
if (temp <= 0.0f) {
// find the token with the highest logit and set the rest to -inf
size_t max_i = 0;
float max_l = cur_p->data[0].logit;
for (size_t i = 1; i < cur_p->size; ++i) {
if (cur_p->data[i ].logit > max_l) {
cur_p->data[max_i].logit = -INFINITY;
max_i = i;
max_l = cur_p->data[i].logit;
} else {
cur_p->data[i].logit = -INFINITY;
}
}
return;
}
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].logit /= temp;
}
}
static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
GGML_ASSERT(cur_p->size > 0);
@ -113,7 +89,7 @@ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
}
static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
// TODO: move bucket sort to separate function so that top_p/typical/softmax first is equally fast
// TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
// if (k >= (int32_t)cur_p->size) {
// return;
// }
@ -451,9 +427,6 @@ static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*
static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_dist *) smpl->ctx;
llama_sampler_softmax_impl(cur_p);
cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
}
@ -733,6 +706,101 @@ struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
};
}
// tail-free
struct llama_sampler_tail_free {
const float z;
const size_t min_keep;
};
static const char * llama_sampler_tail_free_name(const struct llama_sampler * /*smpl*/) {
return "tail-free";
}
static void llama_sampler_tail_free_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
const auto * ctx = (llama_sampler_tail_free *) smpl->ctx;
if (ctx->z >= 1.0f || cur_p->size <= 2) {
return;
}
llama_sampler_softmax_impl(cur_p);
// Compute the first and second derivatives
std::vector<float> first_derivatives(cur_p->size - 1);
std::vector<float> second_derivatives(cur_p->size - 2);
for (size_t i = 0; i < first_derivatives.size(); ++i) {
first_derivatives[i] = cur_p->data[i].p - cur_p->data[i + 1].p;
}
for (size_t i = 0; i < second_derivatives.size(); ++i) {
second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
}
// Calculate absolute value of second derivatives
for (size_t i = 0; i < second_derivatives.size(); ++i) {
second_derivatives[i] = std::abs(second_derivatives[i]);
}
// Normalize the second derivatives
{
const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
if (second_derivatives_sum > 1e-6f) {
for (float & value : second_derivatives) {
value /= second_derivatives_sum;
}
} else {
for (float & value : second_derivatives) {
value = 1.0f / second_derivatives.size();
}
}
}
float cum_sum = 0.0f;
size_t last_idx = cur_p->size;
for (size_t i = 0; i < second_derivatives.size(); ++i) {
cum_sum += second_derivatives[i];
// Check if the running sum is greater than z or if we have kept at least min_keep tokens
if (cum_sum > ctx->z && i >= ctx->min_keep) {
last_idx = i;
break;
}
}
// Resize the output vector to keep only the tokens above the tail location
cur_p->size = last_idx;
}
static struct llama_sampler * llama_sampler_tail_free_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_tail_free *) smpl->ctx;
return llama_sampler_init_tail_free(ctx->z, ctx->min_keep);
}
static void llama_sampler_tail_free_free(struct llama_sampler * smpl) {
delete (llama_sampler_tail_free *) smpl->ctx;
}
static struct llama_sampler_i llama_sampler_tail_free_i = {
/* .name = */ llama_sampler_tail_free_name,
/* .accept = */ nullptr,
/* .apply = */ llama_sampler_tail_free_apply,
/* .reset = */ nullptr,
/* .clone = */ llama_sampler_tail_free_clone,
/* .free = */ llama_sampler_tail_free_free,
};
struct llama_sampler * llama_sampler_init_tail_free(float z, size_t min_keep) {
return new llama_sampler {
/* .iface = */ &llama_sampler_tail_free_i,
/* .ctx = */ new llama_sampler_tail_free {
/* .z = */ z,
/* .min_keep = */ min_keep,
},
};
}
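To make the tail-free cutoff above concrete, here is a small hand-worked example (the distribution and the z / min_keep values are chosen purely for illustration):

probabilities (sorted):   0.30  0.25  0.20  0.15  0.07  0.03
first derivatives:        0.05  0.05  0.05  0.08  0.04
second derivatives:       0.00  0.00 -0.03  0.04
|second| (normalized):    0.00  0.00  0.43  0.57     (sum of |second| = 0.07)
cumulative sum:           0.00  0.00  0.43  1.00

With z = 0.4 and min_keep = 2, the cumulative sum first exceeds z at index i = 2 (and i >= min_keep holds), so last_idx = 2 and only the two most probable tokens (0.30 and 0.25) survive the cut.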
// typical
struct llama_sampler_typical {
@ -844,8 +912,9 @@ static const char * llama_sampler_temp_name(const struct llama_sampler * /*smpl*
static void llama_sampler_temp_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
const auto * ctx = (llama_sampler_temp *) smpl->ctx;
llama_sampler_temp_impl(cur_p, ctx->temp);
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].logit /= ctx->temp;
}
}
static struct llama_sampler * llama_sampler_temp_clone(const struct llama_sampler * smpl) {
@ -892,7 +961,6 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
if (ctx->delta > 0) {
const float min_temp = std::max(0.0f, ctx->temp - ctx->delta);
const float max_temp = ctx->temp + ctx->delta;
float exponent_val = ctx->exponent;
// no need to do anything if there is only one (or zero) candidates
@ -930,7 +998,9 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
#endif
// Apply the dynamically calculated temperature scaling
llama_sampler_temp_impl(cur_p, dyn_temp);
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].logit /= dyn_temp;
}
// Re-compute softmax probabilities after scaling logits with dynamic temperature
const double max_l_double = cur_p->data[0].logit;
@ -954,7 +1024,9 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
}
#endif
} else {
llama_sampler_temp_impl(cur_p, ctx->temp);
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].logit /= ctx->temp;
}
}
}
@ -987,101 +1059,6 @@ struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, floa
};
}
// xtc
struct llama_sampler_xtc {
const float probability;
const float threshold;
const size_t min_keep;
const uint32_t seed;
uint32_t seed_cur;
std::mt19937 rng;
};
static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
return "xtc";
}
static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_xtc *) smpl->ctx;
if (ctx->probability <= 0.0f
|| ctx->threshold > 0.5f
|| cur_p->size < 2) {
return;
}
std::uniform_real_distribution<float> distribution(0.0f, 1.0f);
float chance = distribution(ctx->rng);
if (chance > ctx->probability) return;
// in case it's not sorted/recalculated yet
llama_sampler_softmax_impl(cur_p);
int pos_last = 0;
for (size_t i = 0; i < cur_p->size; ++i) {
if (cur_p->data[i].p >= ctx->threshold) {
pos_last = i;
} else break;
}
if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) {
cur_p->data += pos_last;
cur_p->size -= pos_last;
}
}
static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_xtc *) smpl->ctx;
auto * result = llama_sampler_init_xtc(ctx->probability, ctx->threshold, ctx->min_keep, ctx->seed);
// copy the state
{
auto * result_ctx = (llama_sampler_xtc *) result->ctx;
result_ctx->rng = ctx->rng;
}
return result;
}
static void llama_sampler_xtc_free(struct llama_sampler * smpl) {
delete (llama_sampler_xtc *) smpl->ctx;
}
static void llama_sampler_xtc_reset(struct llama_sampler * smpl) {
auto * ctx = (llama_sampler_xtc *) smpl->ctx;
ctx->seed_cur = get_rng_seed(ctx->seed);
ctx->rng.seed(ctx->seed_cur);
}
static struct llama_sampler_i llama_sampler_xtc_i = {
/* .name = */ llama_sampler_xtc_name,
/* .accept = */ nullptr,
/* .apply = */ llama_sample_xtc_apply,
/* .reset = */ llama_sampler_xtc_reset,
/* .clone = */ llama_sampler_xtc_clone,
/* .free = */ llama_sampler_xtc_free,
};
struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
auto seed_cur = get_rng_seed(seed);
return new llama_sampler {
/* .iface = */ &llama_sampler_xtc_i,
/* .ctx = */ new llama_sampler_xtc {
/* .probability = */ p,
/* .threshold = */ t,
/* .min_keep = */ min_keep,
/* .seed = */ seed,
/* .seed_cur = */ seed_cur,
/* .rng = */ std::mt19937(seed_cur),
},
};
}
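For orientation, a minimal usage sketch of the XTC sampler defined above. `llama_sampler_init_xtc(p, t, min_keep, seed)` matches the definition in this file; the sampler-chain helpers are assumed from the surrounding llama.cpp sampling API, and all parameter values are illustrative only:

```cpp
#include "llama.h"

// Build a sampler chain that applies XTC before the final distribution sampler.
// Assumption: llama_sampler_chain_* and llama_sampler_init_dist() are available
// in this llama.cpp snapshot; values below are illustrative, not recommendations.
static struct llama_sampler * make_chain_with_xtc() {
    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    // With probability 0.5, drop every candidate at or above the 0.10 probability
    // threshold except the least likely of them (which is what the apply
    // function above implements), keeping at least 1 token.
    llama_sampler_chain_add(chain, llama_sampler_init_xtc(/*p=*/0.5f, /*t=*/0.10f, /*min_keep=*/1, /*seed=*/1234));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(/*seed=*/1234));

    return chain; // release later with llama_sampler_free(chain)
}
```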
// mirostat
struct llama_sampler_mirostat {
@ -1588,400 +1565,6 @@ struct llama_sampler * llama_sampler_init_penalties(
};
}
// DRY
struct llama_sampler_dry {
int32_t total_context_size;
const float dry_multiplier;
const float dry_base;
const int32_t dry_allowed_length;
const int32_t dry_penalty_last_n;
std::unordered_multimap<llama_token, std::vector<llama_token>> dry_processed_breakers;
std::vector<int> dry_repeat_count;
std::unordered_map<llama_token, int> dry_max_token_repeat;
ring_buffer<llama_token> last_tokens;
};
// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
static void get_overlapping_token_sequences(const llama_vocab & vocab, const std::string& str, std::unordered_multimap<llama_token, std::vector<llama_token>>& token_sequences, int max_tail_len = -1) {
for (llama_token token_id = 0; token_id < (llama_token)vocab.n_vocab; token_id++) {
std::string word = llama_detokenize(vocab, {token_id}, true);
if (word.find(str) != std::string::npos) {
token_sequences.emplace(token_id, std::vector<llama_token>());
} else {
size_t word_len = word.size(), str_len = str.size();
size_t pos = -1;
while ((pos = word.find(str[0], pos + 1)) != std::string::npos) {
bool match = true;
size_t i;
for (i = 1; i < str_len && i + pos < word_len; ++i) {
if (word[pos + i] != str[i]) {
match = false;
break;
}
}
if (match) {
std::vector<llama_token> tokenization = llama_tokenize_internal(vocab, str.substr(i), false, false);
if (max_tail_len >= 0 && tokenization.size() > (size_t)max_tail_len) {
tokenization.resize(max_tail_len);
}
// Ensure we don't already have a duplicate matching tokenization
auto its = token_sequences.equal_range(token_id);
bool found = false;
for (auto it = its.first; it != its.second; ++it) {
if (tokenization == it->second) {
found = true;
break;
}
}
if (!found) {
token_sequences.emplace(token_id, tokenization);
}
}
}
}
}
}
static const char * llama_sampler_dry_name(const struct llama_sampler * /*smpl*/) {
return "dry";
}
static void llama_sampler_dry_accept(struct llama_sampler * smpl, llama_token token) {
auto * ctx = (llama_sampler_dry *) smpl->ctx;
if (ctx->dry_multiplier == 0.0f || ctx->dry_base < 1.0f || ctx->dry_penalty_last_n == 0) {
return;
}
ctx->last_tokens.push_back(token);
}
// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_dry *) smpl->ctx;
if (ctx->dry_multiplier == 0.0f || ctx->dry_base < 1.0f || ctx->dry_penalty_last_n == 0) {
return;
}
int32_t effective_dry_penalty_last_n = (ctx->dry_penalty_last_n == -1) ? ctx->total_context_size : std::max(ctx->dry_penalty_last_n, 0);
int last_n_repeat = std::min(std::min((int)ctx->last_tokens.size(), effective_dry_penalty_last_n), ctx->total_context_size);
if (last_n_repeat <= ctx->dry_allowed_length) {
return;
}
ctx->dry_repeat_count.assign(last_n_repeat, 0);
ctx->dry_max_token_repeat.clear();
// Step 1: Look for restart sequences to limit the maximum repetition length.
// Work backwards through the context looking for any token that begins a restart sequence.
//
// The collection `restart_sequences` is a mapping from a "head" token to all "tail"
// sequences that together comprise a restart sequence. This allows us to quickly check
// whether each token is the head of a complete sequence. Most restart sequences are actually
// a single token, and for these the "tail" is an empty vector.
//
// If the token is a "head", test all restart sequences that begin with this token
// (there will often only be one sequence for each token, but if sequences like 'aaaq1' and
// 'aaa1' are used as restart strings, both could start with 'aaa' when tokenized). The
// longest matching sequence (if any) is used to limit the maximum repetition length.
//
// Note that in the case of a short sequence contained in a longer one, this might fail to
// find the smallest value for `rep_limit`. For example, if 'amniotic' and 'ni' are both used as
// restart sequences, 'ni' will be found first, and since it's shorter it will fail to suppress
// 'otic'. This is a minor issue since fully contained restart sequences are likely to be rare.
//
// This is theoretically worst-case O(N^2) for arbitrary restart sequences, which is why we
// have already clamped the maximum tail sequence length when generating `restart_sequences`.
// With clamping, this scan is O(N) in the context length.
int rep_limit = last_n_repeat;
for (int i = 0; i < last_n_repeat; ++i) {
llama_token token = ctx->last_tokens.rat(i);
auto its = ctx->dry_processed_breakers.equal_range(token);
if (its.first == ctx->dry_processed_breakers.end()) {
continue;
}
int longest_match = -1;
for (auto it = its.first; it != its.second; ++it) {
// Note that (*it) does not contain the head character, so seq_len will be
// the restart sequence length minus 1.
// In the common case of a single-token restart sequence, (*it) will be empty
// and we will trivially match.
int seq_len = (int)it->second.size();
if (seq_len > longest_match && seq_len <= (int)i) {
bool match = true;
for (int offset = 0; offset < seq_len; ++offset) {
// The -1 when indexing `last_tokens` is because we already matched the head.
if (it->second[offset] != ctx->last_tokens.rat(i - offset - 1)) {
match = false;
break;
}
}
if (match) {
longest_match = seq_len;
}
}
}
if (longest_match >= 0) {
// We found a restart sequence starting `i` tokens from the end and continuing for
// `longest_match` tokens.
rep_limit = i - longest_match;
break;
}
}
if (rep_limit < ctx->dry_allowed_length) {
return;
}
// Step 2: Iterate in reverse over the last N tokens of the context, using the "Z-algorithm" (in
// the reverse direction) to efficiently compute the positions and lengths of suffixes appearing
// elsewhere in the context. We limit the suffix length to `rep_limit` to respect restart sequences.
//
// This algorithm is not currently documented on Wikipedia, but there is a clear description here:
// https://ivanyu.me/blog/2014/10/15/z-algorithm/
//
// The code below is adapted from the public domain implementation by the same author here:
// https://github.com/ivanyu/string-algorithms/blob/master/z_algorithm.py
//
// Example:
// Last N tokens: a b c c b c y a b c
// Repeat counts: 0 0 3 1 0 2 0 0 0 0
// ^
// This `3` means that the last three tokens of the context (a b c) also appear here.
//
// This step is worst case O(N) since the Z-algorithm is linear, despite the appearance of nested
// for/while loops. This can be seen by observing that the `lt` and `rt` bounds are set after each
// repeated suffix is detected (i.e. after each while loop when n > 0). These bound variables
// ensure that the inner while loops only examine each token in the context once as the outer
// for loop iterates over the context.
{
const int last = last_n_repeat - 1;
int rt = 0, lt = 0;
for (int k = 1; k < last_n_repeat; ++k) {
if (k > rt) {
// If k is outside the current Z-box, do naive computation.
int n = 0;
while (n + k < last_n_repeat && ctx->last_tokens.rat(n) == ctx->last_tokens.rat(n+k)) {
++n;
}
ctx->dry_repeat_count[last - k] = std::min(n, rep_limit);
if (n > 0) {
lt = k;
rt = k+n-1;
}
} else {
// If k is inside the current Z-box, consider two cases.
int p = k - lt; // Pair index.
int right_part_len = rt - k + 1;
if (ctx->dry_repeat_count[last - p] < right_part_len) {
int n = std::min(ctx->dry_repeat_count[last - p], rep_limit);
ctx->dry_repeat_count[last - k] = n;
} else {
int i = rt + 1;
while (i < last_n_repeat && ctx->last_tokens.rat(i) == ctx->last_tokens.rat(i - k)) {
i += 1;
}
int n = std::min(i - k, rep_limit);
ctx->dry_repeat_count[last - k] = n;
lt = k;
rt = i - 1;
}
}
}
}
// Step 3: Iterate over dry_repeat_count and last_tokens, examining the maximum repeat length
// that would be generated by emitting each new token that would extend a sequence.
//
// Following the same example as above:
// Last N tokens: a b c c b c y a b c
// Repeat counts: 0 0 3 1 0 2 0 0 0 0
//
// For each non-zero, look ahead one token. This token, if emitted, would extend the repetition.
// c: 3 -> 4 (from `a b c` to `a b c c`)
// b: 1 -> 2 (from `c` to `c b`)
// y: 2 -> 3 (from `b c` to `b c y`)
for (int i = 0; i < last_n_repeat - 1; ++i) {
int repeat_len = ctx->dry_repeat_count[i];
if (repeat_len >= ctx->dry_allowed_length) {
// This token ends a repeat, so the next token would continue one.
// By convention, the value of `repeat_len` only includes the tokens currently
// in the context, not the new token that would be added.
llama_token token = ctx->last_tokens.rat(last_n_repeat - 2 - i);
// Track the maximum sequence ending in this token.
const auto& it = ctx->dry_max_token_repeat.find(token);
if (it == ctx->dry_max_token_repeat.end() || it->second < repeat_len) {
ctx->dry_max_token_repeat[token] = repeat_len;
}
}
}
// Step 4: Apply logit penalties based on the maximum repeat length for relevant tokens.
// Prevent floating point overflow in `pow(penalty_base, exponent)` by clamping to `max_exponent`.
// Compute it from `penalty_base` and the approximate log of `std::numeric_limits<float>::max()`
const float FLOAT_MAX_LOG = 88.7228391f;
int max_exponent = 0;
if (ctx->dry_base > 1.000001f) {
max_exponent = FLOAT_MAX_LOG / std::log(ctx->dry_base);
}
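// For example (illustrative), with dry_base = 1.75 this gives max_exponent ~ 88.72 / ln(1.75) ~ 158,
// so the exponent is capped well before std::pow(dry_base, exponent) could overflow a float.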
for (size_t i = 0; i < cur_p->size; ++i) {
const auto& af_kvp = ctx->dry_max_token_repeat.find(cur_p->data[i].id);
if (af_kvp != ctx->dry_max_token_repeat.end()) {
// Check all sequence breakers starting with this token
auto range = ctx->dry_processed_breakers.equal_range(cur_p->data[i].id);
bool is_single_token_breaker = false;
for (auto it = range.first; it != range.second; ++it) {
if (it->second.empty()) {
is_single_token_breaker = true;
break;
}
}
// Apply penalty only if it's not a single-token sequence breaker
if (!is_single_token_breaker) {
int repeat_exp = af_kvp->second - ctx->dry_allowed_length;
if (max_exponent > 0 && repeat_exp > max_exponent) {
repeat_exp = max_exponent;
}
float penalty = ctx->dry_multiplier * std::pow(ctx->dry_base, repeat_exp);
cur_p->data[i].logit -= penalty;
}
}
}
cur_p->sorted = false;
}
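// Worked example (illustrative values): with dry_multiplier = 0.8, dry_base = 1.75 and
// dry_allowed_length = 2, a candidate token that would extend a repetition already spanning
// 5 context tokens receives
//   penalty = 0.8 * 1.75^(5 - 2) ~ 4.29
// subtracted from its logit. A repetition of exactly dry_allowed_length tokens is penalized by
// just the multiplier (0.8), and shorter repetitions are not penalized at all.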
static void llama_sampler_dry_reset(struct llama_sampler * smpl) {
auto * ctx = (llama_sampler_dry *) smpl->ctx;
ctx->last_tokens.clear();
ctx->dry_repeat_count.clear();
ctx->dry_max_token_repeat.clear();
}
static struct llama_sampler * llama_sampler_dry_clone(const struct llama_sampler * smpl) {
const auto * ctx = (llama_sampler_dry *) smpl->ctx;
llama_vocab dummy_vocab;
// dummy vocab is passed because it is only needed for raw sequence breaker processing, which we have already done and will simply be copying
auto * result = llama_sampler_init_dry_impl(dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
// Copy the state, including the processed breakers
{
auto * result_ctx = (llama_sampler_dry *) result->ctx;
result_ctx->dry_processed_breakers = ctx->dry_processed_breakers;
result_ctx->dry_repeat_count = ctx->dry_repeat_count;
result_ctx->dry_max_token_repeat = ctx->dry_max_token_repeat;
result_ctx->last_tokens = ctx->last_tokens;
}
return result;
}
static void llama_sampler_dry_free(struct llama_sampler * smpl) {
delete (llama_sampler_dry *) smpl->ctx;
}
static struct llama_sampler_i llama_sampler_dry_i = {
/* .name = */ llama_sampler_dry_name,
/* .accept = */ llama_sampler_dry_accept,
/* .apply = */ llama_sampler_dry_apply,
/* .reset = */ llama_sampler_dry_reset,
/* .clone = */ llama_sampler_dry_clone,
/* .free = */ llama_sampler_dry_free,
};
struct llama_sampler * llama_sampler_init_dry_impl(const struct llama_vocab & vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? context_size : std::max(dry_penalty_last_n, 0);
std::unordered_multimap<llama_token, std::vector<llama_token>> processed_breakers;
const int MAX_CHAR_LEN = 40;
const int MAX_SEQ_LEN = 20;
const bool dry_enabled = (dry_multiplier != 0.0f && dry_base >= 1.0f && dry_penalty_last_n != 0);
if (dry_enabled && seq_breakers != nullptr && num_breakers > 0) {
// Process sequence breakers
for (size_t i = 0; i < num_breakers; ++i) {
if (seq_breakers[i] == nullptr || std::strlen(seq_breakers[i]) == 0) {
LLAMA_LOG_WARN("skipping null or empty DRY sequence breaker at index %zu\n", i);
continue;
}
std::string sequence_break(seq_breakers[i]);
if (sequence_break.empty()) {
LLAMA_LOG_WARN("skipping empty DRY sequence breaker\n");
continue;
}
if (sequence_break.size() > MAX_CHAR_LEN) {
LLAMA_LOG_WARN("truncating DRY sequence breaker to %d characters\n", MAX_CHAR_LEN);
sequence_break.resize(MAX_CHAR_LEN);
}
get_overlapping_token_sequences(vocab, sequence_break, processed_breakers, MAX_SEQ_LEN);
}
}
return new llama_sampler {
/* .iface = */ &llama_sampler_dry_i,
/* .ctx = */ new llama_sampler_dry {
/* .total_context_size = */ context_size,
/* .dry_multiplier = */ dry_multiplier,
/* .dry_base = */ dry_base,
/* .dry_allowed_length = */ dry_allowed_length,
/* .dry_penalty_last_n = */ dry_penalty_last_n,
/* .dry_processed_breakers = */ std::move(processed_breakers),
/* .dry_repeat_count = */ dry_enabled ? std::vector<int>(effective_dry_penalty_last_n, 0) : std::vector<int>{},
/* .dry_max_token_repeat = */ {},
/* .last_tokens = */ dry_enabled ? ring_buffer<llama_token>(effective_dry_penalty_last_n) : ring_buffer<llama_token>(0),
},
};
}
// wrapper for test-sampling.cpp
struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector<std::vector<llama_token>>& seq_breakers) {
llama_vocab dummy_vocab;
auto * result = llama_sampler_init_dry_impl(dummy_vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, NULL, 0);
auto * ctx = (llama_sampler_dry *) result->ctx;
// Process the token-based sequence breakers
ctx->dry_processed_breakers.clear();
if (seq_breakers.empty()) {
LLAMA_LOG_WARN("empty DRY sequence breakers list in llama_sampler_init_dry_testing\n");
} else {
for (const auto& breaker : seq_breakers) {
if (breaker.empty()) {
LLAMA_LOG_WARN("skipping DRY empty sequence breaker\n");
continue;
}
llama_token head_token = breaker[0];
std::vector<llama_token> tail_tokens(breaker.begin() + 1, breaker.end());
ctx->dry_processed_breakers.emplace(head_token, std::move(tail_tokens));
}
if (ctx->dry_processed_breakers.empty()) {
LLAMA_LOG_WARN("no valid DRY sequence breakers processed in llama_sampler_init_dry_testing\n");
}
}
return result;
}
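// Usage sketch (illustrative; the breaker token ids and parameter values are assumptions):
//
//   std::vector<std::vector<llama_token>> breakers = { { 42 }, { 10, 20 } }; // head + optional tail ids
//   llama_sampler * smpl = llama_sampler_init_dry_testing(
//       /*context_size=*/ 4096, /*dry_multiplier=*/ 0.8f, /*dry_base=*/ 1.75f,
//       /*dry_allowed_length=*/ 2, /*dry_penalty_last_n=*/ -1, breakers);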
// logit-bias
struct llama_sampler_logit_bias {
@ -2061,229 +1644,6 @@ struct llama_sampler * llama_sampler_init_logit_bias(
};
}
// infill
//#define GGML_DEBUG_SAMPLER_INFILL
struct llama_sampler_infill {
const struct llama_vocab * vocab;
std::vector<char> buf0;
std::vector<char> buf1;
};
static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) {
return "infill";
}
static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_infill *) smpl->ctx;
llama_sampler_softmax_impl(cur_p);
#if defined(GGML_DEBUG_SAMPLER_INFILL)
#define LOG_DBG_CUR LLAMA_LOG_DEBUG
#else
#define LOG_DBG_CUR(...)
#endif
for (size_t i = 0; i < cur_p->size; ++i) {
LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
}
float p_txt_sum = 0.0f;
float p_eog_sum = 0.0f;
for (size_t i = 0; i < cur_p->size; ++i) {
if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
p_eog_sum += cur_p->data[i].p;
} else {
p_txt_sum += cur_p->data[i].p;
}
}
const float rat = p_eog_sum == 0.0 ? INFINITY : p_txt_sum / p_eog_sum; GGML_UNUSED(rat);
LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size);
if (3*p_eog_sum*cur_p->size > p_txt_sum) {
LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum);
// keep just the EOG tokens
const auto size_org = cur_p->size;
cur_p->size = 0;
float p_sum = 0.0f;
for (size_t i = 0; i < size_org; ++i) {
if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
p_sum += cur_p->data[i].p;
cur_p->data[cur_p->size++] = cur_p->data[i];
}
}
// normalize probs
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].p /= p_sum;
}
return;
}
size_t n_combined = 0; GGML_UNUSED(n_combined);
// combine tokens with common prefix
for (size_t i0 = 0; i0 < cur_p->size; ++i0) {
for (size_t i1 = 0; i1 < cur_p->size; ++i1) {
if (cur_p->data[i0].logit == -INFINITY) {
break;
}
if (i0 == i1 || cur_p->data[i1].logit == -INFINITY) {
continue;
}
int len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
if (len0 < 0) {
ctx->buf0.resize(-len0); // a negative return value is the required buffer size
len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
assert(len0 > 0);
}
int len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
if (len1 < 0) {
ctx->buf1.resize(-len1); // a negative return value is the required buffer size
len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
assert(len1 > 0);
}
// token i0 is a prefix of token i1
if (len0 > 0 && len0 <= len1 && memcmp(ctx->buf0.data(), ctx->buf1.data(), len0) == 0) {
int dst = i0;
int src = i1;
// merge into the token with higher probability
if (cur_p->data[i1].p > cur_p->data[i0].p) {
std::swap(dst, src);
}
cur_p->data[dst].p += cur_p->data[src].p;
cur_p->data[src].logit = -INFINITY;
cur_p->data[src].p = 0.0f;
n_combined++;
}
}
}
size_t n_non_eog = 0;
size_t size_org = cur_p->size;
float p_sum = 0.0f;
float thold = 0.2f;
cur_p->size = 0;
LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold);
for (size_t i = 0; i < size_org; ++i) {
const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
if (cur_p->data[i].p < thold && !is_eog) {
continue;
}
if (!is_eog) {
++n_non_eog;
}
p_sum += cur_p->data[i].p;
// keep this token
cur_p->data[cur_p->size++] = cur_p->data[i];
}
LOG_DBG_CUR("%s: n_non_eog = %zu\n", __func__, n_non_eog);
// if no non-EOG tokens are left -> reduce cur_p to single EOT token
if (n_non_eog == 0) {
cur_p->size = 1;
cur_p->data[0].id = llama_token_eot_impl(*ctx->vocab);
cur_p->data[0].logit = 1.0f;
return;
}
// normalize probs
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].p /= p_sum;
LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
}
size_org = cur_p->size;
p_sum = 0.0f;
thold = 1.0/(n_non_eog + 1);
cur_p->size = 0;
LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold);
for (size_t i = 0; i < size_org; ++i) {
const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
if (cur_p->data[i].p < thold && !is_eog) {
continue;
}
p_sum += cur_p->data[i].p;
cur_p->data[cur_p->size++] = cur_p->data[i];
}
// normalize probs
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].p /= p_sum;
LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
}
#undef LOG_DBG_CUR
}
static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_infill *) smpl->ctx;
return llama_sampler_init_infill_impl(*ctx->vocab);
}
static void llama_sampler_infill_free(struct llama_sampler * smpl) {
delete (llama_sampler_infill *) smpl->ctx;
}
static struct llama_sampler_i llama_sampler_infill_i = {
/* .name = */ llama_sampler_infill_name,
/* .accept = */ nullptr,
/* .apply = */ llama_sampler_infill_apply,
/* .reset = */ nullptr,
/* .clone = */ llama_sampler_infill_clone,
/* .free = */ llama_sampler_infill_free,
};
struct llama_sampler * llama_sampler_init_infill_impl(
const struct llama_vocab & vocab) {
return new llama_sampler {
/* .iface = */ &llama_sampler_infill_i,
/* .ctx = */ new llama_sampler_infill {
/* .vocab = */ &vocab,
/* .buf0 = */ std::vector<char>(512),
/* .buf1 = */ std::vector<char>(512),
},
};
}
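// Usage sketch (illustrative; assumes an existing `model` and sampler `chain`): the public wrapper
// llama_sampler_init_infill(model) is meant to be added to a chain after top-k / top-p, e.g.
//
//   llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
//   llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
//   llama_sampler_chain_add(chain, llama_sampler_init_top_p(0.9f, 1));
//   llama_sampler_chain_add(chain, llama_sampler_init_infill(model));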
// utils
uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {

View File

@ -4,6 +4,8 @@
#include "llama-grammar.h"
#include <unordered_map>
struct llama_vocab;
struct llama_grammar;
@ -25,24 +27,3 @@ struct llama_sampler * llama_sampler_init_grammar_impl(
const struct llama_vocab & vocab,
const char * grammar_str,
const char * grammar_root);
struct llama_sampler * llama_sampler_init_infill_impl(
const struct llama_vocab & vocab);
struct llama_sampler * llama_sampler_init_dry_impl(
const struct llama_vocab & vocab,
int32_t context_size,
float dry_multiplier,
float dry_base,
int32_t dry_allowed_length,
int32_t dry_penalty_last_n,
const char ** seq_breakers,
size_t num_breakers);
struct llama_sampler * llama_sampler_init_dry_testing(
int32_t context_size,
float dry_multiplier,
float dry_base,
int32_t dry_allowed_length,
int32_t dry_penalty_last_n,
const std::vector<std::vector<llama_token>>& seq_breakers);

View File

@ -221,7 +221,7 @@ struct llm_tokenizer_spm_session {
}
// seed the work queue with all possible 2-character tokens.
for (int i = 1; i < (int) symbols.size(); ++i) {
for (size_t i = 1; i < symbols.size(); ++i) {
try_add_bigram(i - 1, i);
}
@ -563,7 +563,7 @@ struct llm_tokenizer_bpe_session {
index++;
symbols.emplace_back(sym);
}
for (int i = 1; i < (int) symbols.size(); ++i) {
for (size_t i = 1; i < symbols.size(); ++i) {
add_new_bigram(i - 1, i);
}
@ -1663,14 +1663,6 @@ llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {
return vocab.special_eos_id;
}
llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
return vocab.special_eot_id;
}
llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
return vocab.special_eom_id;
}
llama_token llama_token_cls_impl(const struct llama_vocab & vocab) {
return vocab.special_cls_id;
}
@ -1696,39 +1688,23 @@ bool llama_add_eos_token_impl(const struct llama_vocab & vocab) {
}
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_pre_id;
return vocab.special_prefix_id;
}
llama_token llama_token_middle_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_mid_id;
return vocab.special_middle_id;
}
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_suf_id;
return vocab.special_suffix_id;
}
llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_pre_id;
llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
return vocab.special_eot_id;
}
llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_suf_id;
}
llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_mid_id;
}
llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_pad_id;
}
llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_rep_id;
}
llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_sep_id;
llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
return vocab.special_eom_id;
}
int32_t llama_tokenize_impl(
@ -1966,19 +1942,3 @@ int32_t llama_detokenize_impl(
return total <= text_len_max ? total : -total;
}
std::string llama_detokenize(const struct llama_vocab & vocab, const std::vector<llama_token> & tokens, bool special) {
std::string text;
text.resize(std::max(text.capacity(), tokens.size()));
int32_t n_chars = llama_detokenize_impl(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
if (n_chars < 0) {
text.resize(-n_chars);
n_chars = llama_detokenize_impl(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
}
text.resize(n_chars);
// NOTE: the original tokenizer decodes bytes after collecting the pieces.
return text;
}
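// Usage sketch (illustrative; `tokens` is any token sequence, e.g. produced by llama_tokenize_internal above):
//   std::string text = llama_detokenize(vocab, tokens, /*special=*/false);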

View File

@ -37,26 +37,20 @@ struct llama_vocab {
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
// default LLaMA special tokens
// TODO: should we set all of these to LLAMA_TOKEN_NULL?
id special_bos_id = 1;
id special_eos_id = 2;
id special_eot_id = LLAMA_TOKEN_NULL;
id special_eom_id = LLAMA_TOKEN_NULL;
id special_unk_id = 0;
id special_sep_id = LLAMA_TOKEN_NULL;
id special_pad_id = LLAMA_TOKEN_NULL;
id special_cls_id = LLAMA_TOKEN_NULL;
id special_mask_id = LLAMA_TOKEN_NULL;
id special_sep_id = -1;
id special_pad_id = -1;
id special_cls_id = -1;
id special_mask_id = -1;
id linefeed_id = 13;
// fim tokens
id special_fim_pre_id = LLAMA_TOKEN_NULL;
id special_fim_suf_id = LLAMA_TOKEN_NULL;
id special_fim_mid_id = LLAMA_TOKEN_NULL;
id special_fim_pad_id = LLAMA_TOKEN_NULL;
id special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
id special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
id linefeed_id = 13;
id special_prefix_id = -1;
id special_suffix_id = -1;
id special_middle_id = -1;
id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
id special_eom_id = -1;
// set of all tokens that cause "end of generation"
std::set<id> special_eog_ids;
@ -110,26 +104,19 @@ bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token t
llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
llama_token llama_token_eot_impl(const struct llama_vocab & vocab);
llama_token llama_token_eom_impl(const struct llama_vocab & vocab);
llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab);
bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
llama_token llama_token_eot_impl (const struct llama_vocab & vocab);
llama_token llama_token_eom_impl (const struct llama_vocab & vocab);
int32_t llama_tokenize_impl(
const struct llama_vocab & vocab,
@ -149,12 +136,6 @@ int32_t llama_token_to_piece_impl(
int32_t lstrip,
bool special);
// check if token0 is contained as a prefix in token1
bool llama_token_is_prefix_impl(
const struct llama_vocab & vocab,
llama_token token0,
llama_token token1);
int32_t llama_detokenize_impl(
const struct llama_vocab & vocab,
const llama_token * tokens,
@ -163,8 +144,3 @@ int32_t llama_detokenize_impl(
int32_t text_len_max,
bool remove_special,
bool unparse_special);
std::string llama_detokenize(
const struct llama_vocab & vocab,
const std::vector<llama_token> & tokens,
bool special);

File diff suppressed because it is too large

View File

@ -2,7 +2,6 @@
#define LLAMA_H
#include "ggml.h"
#include "ggml-cpu.h"
#include "ggml-backend.h"
#include <stddef.h>
@ -206,7 +205,7 @@ extern "C" {
enum llama_split_mode {
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
};
// TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
@ -218,7 +217,6 @@ extern "C" {
typedef struct llama_token_data_array {
// TODO: consider SoA
// NOTE: this pointer can be modified by the samplers
llama_token_data * data;
size_t size;
int64_t selected; // this is the index in the data array (i.e. not the token id)
@ -234,11 +232,8 @@ extern "C" {
// - token : the token ids of the input (used when embd is NULL)
// - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
// - pos : the positions of the respective token in the sequence
// (if set to NULL, the token position will be tracked automatically by llama_decode)
// - seq_id : the sequence to which the respective token belongs
// (if set to NULL, the sequence ID will be assumed to be 0)
// - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
// (if set to NULL, only the logits for last token will be returned)
//
typedef struct llama_batch {
int32_t n_tokens;
@ -249,6 +244,15 @@ extern "C" {
int32_t * n_seq_id;
llama_seq_id ** seq_id;
int8_t * logits; // TODO: rename this to "output"
// NOTE: helpers for smooth API transition - can be deprecated in the future
// for future-proof code, use the above fields instead and ignore everything below
//
// pos[i] = all_pos_0 + i*all_pos_1
//
llama_pos all_pos_0; // used if pos == NULL
llama_pos all_pos_1; // used if pos == NULL
llama_seq_id all_seq_id; // used if seq_id == NULL
} llama_batch;
enum llama_model_kv_override_type {
@ -275,7 +279,10 @@ extern "C" {
int32_t n_gpu_layers; // number of layers to store in VRAM
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
// the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
// main_gpu interpretation depends on split_mode:
// LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model
// LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results
// LLAMA_SPLIT_MODE_LAYER: ignored
int32_t main_gpu;
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
@ -426,7 +433,6 @@ extern "C" {
LLAMA_API bool llama_supports_mmap (void);
LLAMA_API bool llama_supports_mlock (void);
LLAMA_API bool llama_supports_gpu_offload(void);
LLAMA_API bool llama_supports_rpc (void);
LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
@ -667,9 +673,6 @@ extern "C" {
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
// Check if the context supports KV cache shifting
LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
//
// State / sessions
//
@ -772,15 +775,15 @@ extern "C" {
// Decoding
//
// Return batch for single sequence of tokens
// The sequence ID will be fixed to 0
// The position of the tokens will be tracked automatically by llama_decode
// Return batch for single sequence of tokens starting at pos_0
//
// NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
//
LLAMA_API struct llama_batch llama_batch_get_one(
llama_token * tokens,
int32_t n_tokens);
int32_t n_tokens,
llama_pos pos_0,
llama_seq_id seq_id);
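// Usage sketch (illustrative; assumes `tokens` is a std::vector<llama_token> and `ctx` an initialized context):
// decode a whole prompt as sequence 0, starting at position 0
//
//   llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size(), 0, 0);
//   if (llama_decode(ctx, batch) != 0) { /* handle error */ }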
// Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
// Each token can be assigned up to n_seq_max sequence ids
@ -800,7 +803,7 @@ extern "C" {
// Processes a batch of tokens with the encoder part of the encoder-decoder model.
// Stores the encoder output internally for later use by the decoder cross-attention layers.
// 0 - success
// < 0 - error. the KV cache state is restored to the state before this call
// < 0 - error
LLAMA_API int32_t llama_encode(
struct llama_context * ctx,
struct llama_batch batch);
@ -808,7 +811,7 @@ extern "C" {
// Positive return values do not mean a fatal error, but rather a warning.
// 0 - success
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
// < 0 - error. the KV cache state is restored to the state before this call
// < 0 - error
LLAMA_API int32_t llama_decode(
struct llama_context * ctx,
struct llama_batch batch);
@ -893,7 +896,6 @@ extern "C" {
// Special tokens
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
@ -902,17 +904,11 @@ extern "C" {
LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
// infill tokens
DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");
LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);
// Codellama infill tokens
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle
//
// Tokenization
@ -1071,13 +1067,12 @@ extern "C" {
// available samplers:
LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void);
LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
/// @details Sorts candidate tokens by their logits in descending order and calculates probabilities based on logits.
/// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
"will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void);
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@ -1088,18 +1083,16 @@ extern "C" {
/// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep);
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
LLAMA_API struct llama_sampler * llama_sampler_init_tail_free (float z, size_t min_keep);
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep);
/// @details Updates the logits l_i' = l_i/t. When t <= 0.0f, the maximum logit is kept at its original value, the rest are set to -inf
LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t);
/// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent);
/// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@ -1139,43 +1132,11 @@ extern "C" {
bool penalize_nl, // consider newlines as a repeatable token
bool ignore_eos); // ignore the end-of-sequence token
/// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
LLAMA_API struct llama_sampler * llama_sampler_init_dry(
const struct llama_model * model,
float dry_multiplier,
float dry_base,
int32_t dry_allowed_length,
int32_t dry_penalty_last_n,
const char ** seq_breakers,
size_t num_breakers);
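/// Usage sketch (illustrative parameter values and breakers; assumes an existing `model` and sampler `chain`):
///   const char * breakers[] = { "\n", ":", "\"", "*" };
///   struct llama_sampler * dry = llama_sampler_init_dry(model, 0.8f, 1.75f, 2, -1, breakers, 4);
///   llama_sampler_chain_add(chain, dry);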
LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
int32_t n_vocab,
int32_t n_logit_bias,
const llama_logit_bias * logit_bias);
// this sampler is meant to be used for fill-in-the-middle infilling
// it's supposed to be used after top_k + top_p sampling
//
// 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
// 2. combine probs of tokens that have the same prefix
//
// example:
//
// - before:
// "hel": 0.5
// "hell": 0.2
// "hello": 0.1
// "dummy": 0.1
//
// - after:
// "hel": 0.8
// "dummy": 0.1
//
// 3. discard non-EOG tokens with low prob
// 4. if no tokens are left -> pick EOT
//
LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
// Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
@ -1247,6 +1208,8 @@ extern "C" {
LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
#ifdef __cplusplus
}
#endif

View File

@ -2311,7 +2311,7 @@ const std::unordered_set<uint32_t> unicode_set_whitespace = {
0x003000,
};
// list is always in ascending order, to enable binary search
// list is always in ascending order, to enable binary search
const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase = {
{0x000041, 0x000061},
{0x000042, 0x000062},
@ -3748,7 +3748,7 @@ const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase
{0x01E921, 0x01E943},
};
// list is always in ascending order, to enable binary search
// list is always in ascending order, to enable binary search
const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase = {
{0x000061, 0x000041},
{0x000062, 0x000042},

View File

@ -22,13 +22,7 @@ if (NOT GGML_HOME)
${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp
${WHISPER_LIB_DIR}/ggml/src/ggml-backend-reg.cpp
${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
${WHISPER_LIB_DIR}/ggml/src/ggml-threading.cpp
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu.c
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu.cpp
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu-quants.c
)
endif()

View File

@ -24,12 +24,6 @@
18A2760B2C2A9B43001C8D37 /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.cpp */; };
18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1592AF556340044A204 /* ggml-quants.c */; };
18E864A92CE73C1E0094B8B3 /* ggml-cpu.c in Sources */ = {isa = PBXBuildFile; fileRef = 18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */; };
18F8C0BC2CEDF4DC00CAD607 /* ggml-threading.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0BB2CEDF4DC00CAD607 /* ggml-threading.cpp */; };
18F8C0BE2CEDF50700CAD607 /* ggml-cpu.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0BD2CEDF50700CAD607 /* ggml-cpu.cpp */; };
18F8C0C42CEDF52700CAD607 /* ggml-cpu-aarch64.c in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.c */; };
18F8C0C52CEDF52700CAD607 /* ggml-cpu-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C32CEDF52700CAD607 /* ggml-cpu-quants.c */; };
18F8C0C72CEDF7AB00CAD607 /* ggml-backend-reg.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */; };
7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; };
7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */; };
7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE3424A2A0C3FA20015A058 /* whisper-decoder-impl.m */; };
@ -55,8 +49,8 @@
18133C7F2C64E342005CEAAC /* ggml-aarch64.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-aarch64.c"; path = "../../../ggml/src/ggml-aarch64.c"; sourceTree = "<group>"; };
184447182AB211A2007D6BFE /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../../ggml/src/ggml-alloc.c"; sourceTree = "<group>"; };
184447192AB211A2007D6BFE /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../../ggml/include/ggml-alloc.h"; sourceTree = "<group>"; };
1844471B2AB21655007D6BFE /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../../ggml/src/ggml-metal/ggml-metal.m"; sourceTree = "<group>"; };
1844471D2AB2195F007D6BFE /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../../ggml/src/ggml-metal/ggml-metal.metal"; sourceTree = "<group>"; };
1844471B2AB21655007D6BFE /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../../ggml/src/ggml-metal.m"; sourceTree = "<group>"; };
1844471D2AB2195F007D6BFE /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../../ggml/src/ggml-metal.metal"; sourceTree = "<group>"; };
18627C7629052BDF00BD2A04 /* whisper.objc.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = whisper.objc.app; sourceTree = BUILT_PRODUCTS_DIR; };
18627C7929052BDF00BD2A04 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
18627C7A29052BDF00BD2A04 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = "<group>"; };
@ -82,17 +76,6 @@
18ABE1572AF556340044A204 /* ggml-backend.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.cpp; fileEncoding = 4; name = "ggml-backend.cpp"; path = "../../../ggml/src/ggml-backend.cpp"; sourceTree = "<group>"; };
18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; };
18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; };
18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu.c"; sourceTree = "<group>"; };
18E864AA2CE73C580094B8B3 /* ggml-cpu.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu.h"; path = "../../../ggml/include/ggml-cpu.h"; sourceTree = "<group>"; };
18F8C0BA2CEDF4DC00CAD607 /* ggml-threading.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-threading.h"; path = "../../../ggml/src/ggml-threading.h"; sourceTree = "<group>"; };
18F8C0BB2CEDF4DC00CAD607 /* ggml-threading.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-threading.cpp"; path = "../../../ggml/src/ggml-threading.cpp"; sourceTree = "<group>"; };
18F8C0BD2CEDF50700CAD607 /* ggml-cpu.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-cpu.cpp"; path = "../../../ggml/src/ggml-cpu/ggml-cpu.cpp"; sourceTree = "<group>"; };
18F8C0BF2CEDF52700CAD607 /* ggml-cpu-aarch64.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-aarch64.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-aarch64.h"; sourceTree = "<group>"; };
18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu-aarch64.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-aarch64.c"; sourceTree = "<group>"; };
18F8C0C12CEDF52700CAD607 /* ggml-cpu-impl.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-impl.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-impl.h"; sourceTree = "<group>"; };
18F8C0C22CEDF52700CAD607 /* ggml-cpu-quants.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-quants.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-quants.h"; sourceTree = "<group>"; };
18F8C0C32CEDF52700CAD607 /* ggml-cpu-quants.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu-quants.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-quants.c"; sourceTree = "<group>"; };
18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-backend-reg.cpp"; path = "../../../ggml/src/ggml-backend-reg.cpp"; sourceTree = "<group>"; };
7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-encoder-impl.m"; sourceTree = "<group>"; };
7FE342462A0C3FA20015A058 /* whisper-encoder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "whisper-encoder.h"; sourceTree = "<group>"; };
7FE342472A0C3FA20015A058 /* whisper-encoder.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = "whisper-encoder.mm"; sourceTree = "<group>"; };
@ -132,17 +115,6 @@
18627C7829052BDF00BD2A04 /* whisper.objc */ = {
isa = PBXGroup;
children = (
18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */,
18F8C0BF2CEDF52700CAD607 /* ggml-cpu-aarch64.h */,
18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.c */,
18F8C0C12CEDF52700CAD607 /* ggml-cpu-impl.h */,
18F8C0C22CEDF52700CAD607 /* ggml-cpu-quants.h */,
18F8C0C32CEDF52700CAD607 /* ggml-cpu-quants.c */,
18F8C0BD2CEDF50700CAD607 /* ggml-cpu.cpp */,
18F8C0BA2CEDF4DC00CAD607 /* ggml-threading.h */,
18F8C0BB2CEDF4DC00CAD607 /* ggml-threading.cpp */,
18E864AA2CE73C580094B8B3 /* ggml-cpu.h */,
18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */,
18133C7F2C64E342005CEAAC /* ggml-aarch64.c */,
18133C7E2C64E342005CEAAC /* ggml-aarch64.h */,
18A275FF2C2A9563001C8D37 /* ggml-common.h */,
@ -275,16 +247,10 @@
18627C9629052C5800BD2A04 /* ggml.c in Sources */,
18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */,
7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */,
18F8C0C72CEDF7AB00CAD607 /* ggml-backend-reg.cpp in Sources */,
18F8C0BE2CEDF50700CAD607 /* ggml-cpu.cpp in Sources */,
1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */,
18F8C0C42CEDF52700CAD607 /* ggml-cpu-aarch64.c in Sources */,
18F8C0C52CEDF52700CAD607 /* ggml-cpu-quants.c in Sources */,
18E864A92CE73C1E0094B8B3 /* ggml-cpu.c in Sources */,
18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */,
18627C8C29052BE000BD2A04 /* main.m in Sources */,
18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */,
18F8C0BC2CEDF4DC00CAD607 /* ggml-threading.cpp in Sources */,
1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */,
7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */,
);
@ -363,7 +329,6 @@
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
HEADER_SEARCH_PATHS = "";
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
MTL_FAST_MATH = YES;
@ -417,7 +382,6 @@
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
HEADER_SEARCH_PATHS = "";
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
MTL_ENABLE_DEBUG_INFO = NO;
MTL_FAST_MATH = YES;
@ -440,7 +404,6 @@
DEVELOPMENT_TEAM = P8JZH34X63;
GCC_WARN_64_TO_32_BIT_CONVERSION = NO;
GENERATE_INFOPLIST_FILE = YES;
HEADER_SEARCH_PATHS = ../../../ggml/src/;
INFOPLIST_FILE = whisper.objc/Info.plist;
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen;
@ -470,7 +433,6 @@
DEVELOPMENT_TEAM = P8JZH34X63;
GCC_WARN_64_TO_32_BIT_CONVERSION = NO;
GENERATE_INFOPLIST_FILE = YES;
HEADER_SEARCH_PATHS = ../../../ggml/src/;
INFOPLIST_FILE = whisper.objc/Info.plist;
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen;

View File

@ -1,5 +1,4 @@
import Foundation
import UIKit
import whisper
enum WhisperError: Error {
@ -56,91 +55,11 @@ actor WhisperContext {
return transcription
}
static func benchMemcpy(nThreads: Int32) async -> String {
return String.init(cString: whisper_bench_memcpy_str(nThreads))
}
static func benchGgmlMulMat(nThreads: Int32) async -> String {
return String.init(cString: whisper_bench_ggml_mul_mat_str(nThreads))
}
private func systemInfo() -> String {
var info = ""
if (ggml_cpu_has_neon() != 0) { info += "NEON " }
return String(info.dropLast())
}
func benchFull(modelName: String, nThreads: Int32) async -> String {
let nMels = whisper_model_n_mels(context)
if (whisper_set_mel(context, nil, 0, nMels) != 0) {
return "error: failed to set mel"
}
// heat encoder
if (whisper_encode(context, 0, nThreads) != 0) {
return "error: failed to encode"
}
var tokens = [whisper_token](repeating: 0, count: 512)
// prompt heat
if (whisper_decode(context, &tokens, 256, 0, nThreads) != 0) {
return "error: failed to decode"
}
// text-generation heat
if (whisper_decode(context, &tokens, 1, 256, nThreads) != 0) {
return "error: failed to decode"
}
whisper_reset_timings(context)
// actual run
if (whisper_encode(context, 0, nThreads) != 0) {
return "error: failed to encode"
}
// text-generation
for i in 0..<256 {
if (whisper_decode(context, &tokens, 1, Int32(i), nThreads) != 0) {
return "error: failed to decode"
}
}
// batched decoding
for _ in 0..<64 {
if (whisper_decode(context, &tokens, 5, 0, nThreads) != 0) {
return "error: failed to decode"
}
}
// prompt processing
for _ in 0..<16 {
if (whisper_decode(context, &tokens, 256, 0, nThreads) != 0) {
return "error: failed to decode"
}
}
whisper_print_timings(context)
let deviceModel = await UIDevice.current.model
let systemName = await UIDevice.current.systemName
let systemInfo = self.systemInfo()
let timings: whisper_timings = whisper_get_timings(context).pointee
let encodeMs = String(format: "%.2f", timings.encode_ms)
let decodeMs = String(format: "%.2f", timings.decode_ms)
let batchdMs = String(format: "%.2f", timings.batchd_ms)
let promptMs = String(format: "%.2f", timings.prompt_ms)
return "| \(deviceModel) | \(systemName) | \(systemInfo) | \(modelName) | \(nThreads) | 1 | \(encodeMs) | \(decodeMs) | \(batchdMs) | \(promptMs) | <todo> |"
}
static func createContext(path: String) throws -> WhisperContext {
var params = whisper_context_default_params()
#if targetEnvironment(simulator)
params.use_gpu = false
print("Running on the simulator, using CPU")
#else
params.flash_attn = true // Enabled by default for Metal
#endif
let context = whisper_init_from_file_with_params(path, params)
if let context {

View File

@ -1,17 +0,0 @@
import Foundation
struct Model: Identifiable {
var id = UUID()
var name: String
var info: String
var url: String
var filename: String
var fileURL: URL {
FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0].appendingPathComponent(filename)
}
func fileExists() -> Bool {
FileManager.default.fileExists(atPath: fileURL.path)
}
}

View File

@ -14,7 +14,7 @@ class WhisperState: NSObject, ObservableObject, AVAudioRecorderDelegate {
private var recordedFile: URL? = nil
private var audioPlayer: AVAudioPlayer?
private var builtInModelUrl: URL? {
private var modelUrl: URL? {
Bundle.main.url(forResource: "ggml-base.en", withExtension: "bin", subdirectory: "models")
}
@ -28,59 +28,23 @@ class WhisperState: NSObject, ObservableObject, AVAudioRecorderDelegate {
override init() {
super.init()
loadModel()
}
func loadModel(path: URL? = nil, log: Bool = true) {
do {
whisperContext = nil
if (log) { messageLog += "Loading model...\n" }
let modelUrl = path ?? builtInModelUrl
if let modelUrl {
whisperContext = try WhisperContext.createContext(path: modelUrl.path())
if (log) { messageLog += "Loaded model \(modelUrl.lastPathComponent)\n" }
} else {
if (log) { messageLog += "Could not locate model\n" }
}
try loadModel()
canTranscribe = true
} catch {
print(error.localizedDescription)
if (log) { messageLog += "\(error.localizedDescription)\n" }
messageLog += "\(error.localizedDescription)\n"
}
}
func benchCurrentModel() async {
if whisperContext == nil {
messageLog += "Cannot bench without loaded model\n"
return
private func loadModel() throws {
messageLog += "Loading model...\n"
if let modelUrl {
whisperContext = try WhisperContext.createContext(path: modelUrl.path())
messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
} else {
messageLog += "Could not locate model\n"
}
messageLog += "Running benchmark for loaded model\n"
let result = await whisperContext?.benchFull(modelName: "<current>", nThreads: Int32(min(4, cpuCount())))
if (result != nil) { messageLog += result! + "\n" }
}
func bench(models: [Model]) async {
let nThreads = Int32(min(4, cpuCount()))
// messageLog += "Running memcpy benchmark\n"
// messageLog += await WhisperContext.benchMemcpy(nThreads: nThreads) + "\n"
//
// messageLog += "Running ggml_mul_mat benchmark with \(nThreads) threads\n"
// messageLog += await WhisperContext.benchGgmlMulMat(nThreads: nThreads) + "\n"
messageLog += "Running benchmark for all downloaded models\n"
messageLog += "| CPU | OS | Config | Model | Th | FA | Enc. | Dec. | Bch5 | PP | Commit |\n"
messageLog += "| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |\n"
for model in models {
loadModel(path: model.fileURL, log: false)
if whisperContext == nil {
messageLog += "Cannot bench without loaded model\n"
break
}
let result = await whisperContext?.benchFull(modelName: model.name, nThreads: nThreads)
if (result != nil) { messageLog += result! + "\n" }
}
messageLog += "Benchmarking completed\n"
}
func transcribeSample() async {
@ -196,8 +160,3 @@ class WhisperState: NSObject, ObservableObject, AVAudioRecorderDelegate {
isRecording = false
}
}
fileprivate func cpuCount() -> Int {
ProcessInfo.processInfo.processorCount
}

View File

@ -1,6 +1,5 @@
import SwiftUI
import AVFoundation
import Foundation
struct ContentView: View {
@StateObject var whisperState = WhisperState()
@ -30,125 +29,15 @@ struct ContentView: View {
Text(verbatim: whisperState.messageLog)
.frame(maxWidth: .infinity, alignment: .leading)
}
.font(.footnote)
.padding()
.background(Color.gray.opacity(0.1))
.cornerRadius(10)
HStack {
Button("Clear Logs", action: {
whisperState.messageLog = ""
})
.font(.footnote)
.buttonStyle(.bordered)
Button("Copy Logs", action: {
UIPasteboard.general.string = whisperState.messageLog
})
.font(.footnote)
.buttonStyle(.bordered)
Button("Bench", action: {
Task {
await whisperState.benchCurrentModel()
}
})
.font(.footnote)
.buttonStyle(.bordered)
.disabled(!whisperState.canTranscribe)
Button("Bench All", action: {
Task {
await whisperState.bench(models: ModelsView.getDownloadedModels())
}
})
.font(.footnote)
.buttonStyle(.bordered)
.disabled(!whisperState.canTranscribe)
}
NavigationLink(destination: ModelsView(whisperState: whisperState)) {
Text("View Models")
}
.font(.footnote)
.padding()
}
.navigationTitle("Whisper SwiftUI Demo")
.padding()
}
}
struct ModelsView: View {
@ObservedObject var whisperState: WhisperState
@Environment(\.dismiss) var dismiss
private static let models: [Model] = [
Model(name: "tiny", info: "(F16, 75 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin", filename: "tiny.bin"),
Model(name: "tiny-q5_1", info: "(31 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny-q5_1.bin", filename: "tiny-q5_1.bin"),
Model(name: "tiny-q8_0", info: "(42 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny-q8_0.bin", filename: "tiny-q8_0.bin"),
Model(name: "tiny.en", info: "(F16, 75 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en.bin", filename: "tiny.en.bin"),
Model(name: "tiny.en-q5_1", info: "(31 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en-q5_1.bin", filename: "tiny.en-q5_1.bin"),
Model(name: "tiny.en-q8_0", info: "(42 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en-q8_0.bin", filename: "tiny.en-q8_0.bin"),
Model(name: "base", info: "(F16, 142 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin", filename: "base.bin"),
Model(name: "base-q5_1", info: "(57 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base-q5_1.bin", filename: "base-q5_1.bin"),
Model(name: "base-q8_0", info: "(78 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base-q8_0.bin", filename: "base-q8_0.bin"),
Model(name: "base.en", info: "(F16, 142 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin", filename: "base.en.bin"),
Model(name: "base.en-q5_1", info: "(57 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en-q5_1.bin", filename: "base.en-q5_1.bin"),
Model(name: "base.en-q8_0", info: "(78 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en-q8_0.bin", filename: "base.en-q8_0.bin"),
Model(name: "small", info: "(F16, 466 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin", filename: "small.bin"),
Model(name: "small-q5_1", info: "(181 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small-q5_1.bin", filename: "small-q5_1.bin"),
Model(name: "small-q8_0", info: "(252 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small-q8_0.bin", filename: "small-q8_0.bin"),
Model(name: "small.en", info: "(F16, 466 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.en.bin", filename: "small.en.bin"),
Model(name: "small.en-q5_1", info: "(181 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.en-q5_1.bin", filename: "small.en-q5_1.bin"),
Model(name: "small.en-q8_0", info: "(252 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.en-q8_0.bin", filename: "small.en-q8_0.bin"),
Model(name: "medium", info: "(F16, 1.5 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.bin", filename: "medium.bin"),
Model(name: "medium-q5_0", info: "(514 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium-q5_0.bin", filename: "medium-q5_0.bin"),
Model(name: "medium-q8_0", info: "(785 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium-q8_0.bin", filename: "medium-q8_0.bin"),
Model(name: "medium.en", info: "(F16, 1.5 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.en.bin", filename: "medium.en.bin"),
Model(name: "medium.en-q5_0", info: "(514 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.en-q5_0.bin", filename: "medium.en-q5_0.bin"),
Model(name: "medium.en-q8_0", info: "(785 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.en-q8_0.bin", filename: "medium.en-q8_0.bin"),
Model(name: "large-v1", info: "(F16, 2.9 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large.bin", filename: "large.bin"),
Model(name: "large-v2", info: "(F16, 2.9 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v2.bin", filename: "large-v2.bin"),
Model(name: "large-v2-q5_0", info: "(1.1 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v2-q5_0.bin", filename: "large-v2-q5_0.bin"),
Model(name: "large-v2-q8_0", info: "(1.5 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v2-q8_0.bin", filename: "large-v2-q8_0.bin"),
Model(name: "large-v3", info: "(F16, 2.9 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3.bin", filename: "large-v3.bin"),
Model(name: "large-v3-q5_0", info: "(1.1 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-q5_0.bin", filename: "large-v3-q5_0.bin"),
Model(name: "large-v3-turbo", info: "(F16, 1.5 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo.bin", filename: "large-v3-turbo.bin"),
Model(name: "large-v3-turbo-q5_0", info: "(547 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo-q5_0.bin", filename: "large-v3-turbo-q5_0.bin"),
Model(name: "large-v3-turbo-q8_0", info: "(834 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo-q8_0.bin", filename: "large-v3-turbo-q8_0.bin"),
]
static func getDownloadedModels() -> [Model] {
// Filter models that have been downloaded
return models.filter {
FileManager.default.fileExists(atPath: $0.fileURL.path())
}
}
func loadModel(model: Model) {
Task {
dismiss()
whisperState.loadModel(path: model.fileURL)
}
}
var body: some View {
List {
Section(header: Text("Models")) {
ForEach(ModelsView.models) { model in
DownloadButton(model: model)
.onLoad(perform: loadModel)
}
}
}
.listStyle(GroupedListStyle())
.navigationBarTitle("Models", displayMode: .inline).toolbar {}
}
}
}
//struct ContentView_Previews: PreviewProvider {
// static var previews: some View {
// ContentView()
// }
//}
struct ContentView_Previews: PreviewProvider {
static var previews: some View {
ContentView()
}
}

View File

@ -1,102 +0,0 @@
import SwiftUI
struct DownloadButton: View {
private var model: Model
@State private var status: String
@State private var downloadTask: URLSessionDownloadTask?
@State private var progress = 0.0
@State private var observation: NSKeyValueObservation?
private var onLoad: ((_ model: Model) -> Void)?
init(model: Model) {
self.model = model
status = model.fileExists() ? "downloaded" : "download"
}
func onLoad(perform action: @escaping (_ model: Model) -> Void) -> DownloadButton {
var button = self
button.onLoad = action
return button
}
private func download() {
status = "downloading"
print("Downloading model \(model.name) from \(model.url)")
guard let url = URL(string: model.url) else { return }
downloadTask = URLSession.shared.downloadTask(with: url) { temporaryURL, response, error in
if let error = error {
print("Error: \(error.localizedDescription)")
return
}
guard let response = response as? HTTPURLResponse, (200...299).contains(response.statusCode) else {
print("Server error!")
return
}
do {
if let temporaryURL = temporaryURL {
try FileManager.default.copyItem(at: temporaryURL, to: model.fileURL)
print("Writing to \(model.filename) completed")
status = "downloaded"
}
} catch let err {
print("Error: \(err.localizedDescription)")
}
}
observation = downloadTask?.progress.observe(\.fractionCompleted) { progress, _ in
self.progress = progress.fractionCompleted
}
downloadTask?.resume()
}
var body: some View {
VStack {
Button(action: {
if (status == "download") {
download()
} else if (status == "downloading") {
downloadTask?.cancel()
status = "download"
} else if (status == "downloaded") {
if !model.fileExists() {
download()
}
onLoad?(model)
}
}) {
let title = "\(model.name) \(model.info)"
if (status == "download") {
Text("Download \(title)")
} else if (status == "downloading") {
Text("\(title) (Downloading \(Int(progress * 100))%)")
} else if (status == "downloaded") {
Text("Load \(title)")
} else {
Text("Unknown status")
}
}.swipeActions {
if (status == "downloaded") {
Button("Delete") {
do {
try FileManager.default.removeItem(at: model.fileURL)
} catch {
print("Error deleting file: \(error)")
}
status = "download"
}
.tint(.red)
}
}
}
.onDisappear() {
downloadTask?.cancel()
}
}
}

View File

@ -17,8 +17,6 @@
0AAC5D9F29539CD0003032C3 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 0AAC5D9E29539CD0003032C3 /* Assets.xcassets */; };
0AAC5DCE2953A05C003032C3 /* WhisperState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DCD2953A05C003032C3 /* WhisperState.swift */; };
0AAC5DD12953A394003032C3 /* LibWhisper.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DD02953A394003032C3 /* LibWhisper.swift */; };
7F79E0EE2CE0A78000ACD7BF /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7F79E0ED2CE0A78000ACD7BF /* DownloadButton.swift */; };
7F79E0F02CE0C6F700ACD7BF /* Model.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7F79E0EF2CE0C6F700ACD7BF /* Model.swift */; };
E3F92DC52AFA8E3800A6A9D4 /* whisper in Frameworks */ = {isa = PBXBuildFile; productRef = E3F92DC42AFA8E3800A6A9D4 /* whisper */; };
/* End PBXBuildFile section */
@ -35,8 +33,6 @@
0AAC5DA029539CD0003032C3 /* WhisperCppDemo.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = WhisperCppDemo.entitlements; sourceTree = "<group>"; };
0AAC5DCD2953A05C003032C3 /* WhisperState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WhisperState.swift; sourceTree = "<group>"; };
0AAC5DD02953A394003032C3 /* LibWhisper.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibWhisper.swift; sourceTree = "<group>"; };
7F79E0ED2CE0A78000ACD7BF /* DownloadButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DownloadButton.swift; sourceTree = "<group>"; };
7F79E0EF2CE0C6F700ACD7BF /* Model.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Model.swift; sourceTree = "<group>"; };
E3F92DC22AFA8DD800A6A9D4 /* whisper.cpp */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = whisper.cpp; path = ../..; sourceTree = "<group>"; };
/* End PBXFileReference section */
@ -56,7 +52,6 @@
isa = PBXGroup;
children = (
0AAC5DCD2953A05C003032C3 /* WhisperState.swift */,
7F79E0EF2CE0C6F700ACD7BF /* Model.swift */,
);
path = Models;
sourceTree = "<group>";
@ -124,7 +119,6 @@
isa = PBXGroup;
children = (
0AAC5D9C29539CCF003032C3 /* ContentView.swift */,
7F79E0ED2CE0A78000ACD7BF /* DownloadButton.swift */,
);
path = UI;
sourceTree = "<group>";
@ -226,9 +220,7 @@
0AAC5DCE2953A05C003032C3 /* WhisperState.swift in Sources */,
0AAC5DD12953A394003032C3 /* LibWhisper.swift in Sources */,
0AA7514C2953B569001EE061 /* RiffWaveUtils.swift in Sources */,
7F79E0EE2CE0A78000ACD7BF /* DownloadButton.swift in Sources */,
0AA7514E2953D958001EE061 /* Recorder.swift in Sources */,
7F79E0F02CE0C6F700ACD7BF /* Model.swift in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
@ -378,9 +370,7 @@
PRODUCT_BUNDLE_IDENTIFIER = com.whispercppdemo.WhisperCppDemo;
PRODUCT_NAME = "$(TARGET_NAME)";
SDKROOT = auto;
SUPPORTED_PLATFORMS = "iphoneos iphonesimulator";
SUPPORTS_MACCATALYST = NO;
SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = YES;
SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx";
SWIFT_EMIT_LOC_STRINGS = YES;
SWIFT_OPTIMIZATION_LEVEL = "-Onone";
SWIFT_VERSION = 5.0;
@ -425,9 +415,7 @@
PRODUCT_BUNDLE_IDENTIFIER = com.whispercppdemo.WhisperCppDemo;
PRODUCT_NAME = "$(TARGET_NAME)";
SDKROOT = auto;
SUPPORTED_PLATFORMS = "iphoneos iphonesimulator";
SUPPORTS_MACCATALYST = NO;
SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = YES;
SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx";
SWIFT_EMIT_LOC_STRINGS = YES;
SWIFT_VERSION = 5.0;
TARGETED_DEVICE_FAMILY = "1,2";

View File

@ -92,7 +92,6 @@ else()
endif()
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
@ -100,9 +99,6 @@ option(GGML_AVX512 "ggml: enable AVX512" OFF)
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF)
option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
if (NOT MSVC)
option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
@ -117,7 +113,6 @@ endif()
# ggml core
set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
option(GGML_CPU "ggml: enable CPU backend" ON)
# 3rd party libs / backends
option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
@ -128,9 +123,14 @@ option(GGML_LLAMAFILE "ggml: use LLAMAFILE"
option(GGML_CUDA "ggml: use CUDA" OFF)
option(GGML_MUSA "ggml: use MUSA" OFF)
option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
"ggml: iters./thread per block for Q2_K/Q6_K")
set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"ggml: max. batch size for using peer access")
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
@ -138,7 +138,7 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM"
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
option(GGML_HIP "ggml: use HIP" OFF)
option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
option(GGML_VULKAN "ggml: use Vulkan" OFF)
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
@ -150,7 +150,6 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation"
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
@ -159,13 +158,10 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
option(GGML_OPENMP "ggml: use OpenMP" ON)
option(GGML_RPC "ggml: use RPC" OFF)
option(GGML_AMX "ggml: use AMX" OFF)
option(GGML_SYCL "ggml: use SYCL" OFF)
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
"ggml: sycl target device")
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
"ggml: sycl device architecture")
# extra artifacts
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
@ -218,14 +214,13 @@ include(CMakePackageConfigHelpers)
# all public headers
set(GGML_PUBLIC_HEADERS
include/ggml.h
include/ggml-cpu.h
include/ggml-alloc.h
include/ggml-backend.h
include/ggml-blas.h
include/ggml-cann.h
include/ggml-cuda.h
include/ggml.h
include/ggml-kompute.h
include/ggml-opt.h
include/ggml-metal.h
include/ggml-rpc.h
include/ggml-sycl.h
@ -235,14 +230,15 @@ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
#if (GGML_METAL)
# set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
#endif()
install(TARGETS ggml LIBRARY PUBLIC_HEADER)
install(TARGETS ggml-base LIBRARY)
install(TARGETS ggml PUBLIC_HEADER)
if (BUILD_SHARED_LIBS)
install(TARGETS ggml LIBRARY)
endif()
# FIXME: this should be done in the backend cmake files
if (GGML_METAL)
# FIXME: does this need to be installed with GGML_METAL_EMBED_LIBRARY?
install(
FILES src/ggml-metal/ggml-metal.metal
FILES src/ggml-metal.metal
PERMISSIONS
OWNER_READ
OWNER_WRITE

View File

@ -0,0 +1,220 @@
#!/usr/bin/env python
import logging
import argparse
import asyncio
import os
from tempfile import gettempdir
logger = logging.getLogger("ggml-vk-generate-shaders")
GLSLC = "glslc"
type_names = [
"f32",
"f16",
"q4_0",
"q4_1",
"q5_0",
"q5_1",
"q8_0",
"q2_k",
"q3_k",
"q4_k",
"q5_k",
"q6_k",
]
ASYNCIO_CONCURRENCY = 64
input_dir = "vulkan-shaders"
output_dir = gettempdir()
lock = asyncio.Lock()
shader_fnames = []
async def string_to_spv(name, in_fname, defines, fp16=True):
name = f"{name}{'_fp32' if not fp16 else ''}"
out_fname = os.path.join(output_dir, f"{name}.spv")
in_path = os.path.join(input_dir, in_fname)
cmd = [GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname]
cmd.extend([f"-D{key}={value}" for key, value in defines.items()])
proc = await asyncio.create_subprocess_exec(*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
stdout, stderr = await proc.communicate()
stdout = stdout.decode()
error = stderr.decode()
if proc.returncode:
cmd = " ".join(cmd)
logger.error(f"cannot compile {name}\n\n{cmd}\n\n{error}")
return
async with lock:
shader_fnames.append((name, out_fname))
def matmul_shaders(tasks, fp16, matmul_id):
if fp16:
load_vec = "8"
aligned_b_type_f32 = "mat2x4"
aligned_b_type_f16 = "f16mat2x4"
else:
load_vec = "4"
aligned_b_type_f32 = "vec4"
aligned_b_type_f16 = "f16vec4"
base_dict = {"FLOAT_TYPE": "float" if not fp16 else "float16_t"}
shader_name = "matmul"
if matmul_id:
base_dict["MUL_MAT_ID"] = "1"
shader_name = "matmul_id"
if fp16:
base_dict["FLOAT16"] = "1"
# Shaders with f16 B_TYPE
tasks.append(string_to_spv(f"{shader_name}_f32_f16", "mul_mm.comp", base_dict | {"DATA_A_F32": "1", "B_TYPE": "float16_t", "D_TYPE": "float"}, fp16))
tasks.append(string_to_spv(f"{shader_name}_f32_f16_aligned", "mul_mm.comp", base_dict | {"DATA_A_F32": "1", "LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "B_TYPE": aligned_b_type_f16, "D_TYPE": "float"}, fp16))
tasks.append(string_to_spv(f"{shader_name}_f16", "mul_mm.comp", base_dict | {"DATA_A_F16": "1", "B_TYPE": "float16_t", "D_TYPE": "float"}, fp16))
tasks.append(string_to_spv(f"{shader_name}_f16_aligned", "mul_mm.comp", base_dict | {"DATA_A_F16": "1", "LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "B_TYPE": aligned_b_type_f16, "D_TYPE": "float"}, fp16))
for tname in type_names:
data_a_key = f"DATA_A_{tname.upper()}"
load_vec_a = load_vec if tname in ("f32", "f16") else "2"
tasks.append(string_to_spv(f"{shader_name}_{tname}_f32", "mul_mm.comp", base_dict | {data_a_key: "1", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
tasks.append(string_to_spv(f"{shader_name}_{tname}_f32_aligned", "mul_mm.comp", base_dict | {data_a_key: "2", "LOAD_VEC_A": load_vec_a, "LOAD_VEC_B": load_vec, "B_TYPE": aligned_b_type_f32, "D_TYPE": "float"}, fp16))
async def main():
logger.info("ggml_vulkan: Generating and compiling shaders to SPIR-V")
tasks = []
for fp16 in (False, True):
# MUL_MAT
matmul_shaders(tasks, fp16, False)
# MUL_MAT_ID
matmul_shaders(tasks, fp16, True)
for tname in type_names:
base_dict = {"FLOAT_TYPE": "float"}
# mul mat vec
data_a_key = f"DATA_A_{tname.upper()}"
shader = f"mul_mat_vec_{tname}.comp" if tname.endswith("_k") else "mul_mat_vec.comp"
tasks.append(string_to_spv(f"mul_mat_vec_{tname}_f32_f32", shader, base_dict | {data_a_key: "1", "B_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv(f"mul_mat_vec_{tname}_f16_f32", shader, base_dict | {data_a_key: "1", "B_TYPE": "float16_t", "D_TYPE": "float"}))
tasks.append(string_to_spv(f"mul_mat_vec_id_{tname}_f32", shader, base_dict | {"MUL_MAT_ID": "1", data_a_key: "1", "B_TYPE": "float", "D_TYPE": "float"}))
# Dequant shaders
if tname != "f16":
tasks.append(string_to_spv(f"dequant_{tname}", f"dequant_{tname}.comp", base_dict | {data_a_key: "1", "D_TYPE": "float16_t"}))
# get_rows
if not tname.endswith("_k"):
shader = "get_rows.comp" if tname in ("f32", "f16") else "get_rows_quant.comp"
if tname == "f16":
tasks.append(string_to_spv(f"get_rows_{tname}", shader, {data_a_key: "1", "B_TYPE": "int", "D_TYPE": "float16_t", "OPTIMIZATION_ERROR_WORKAROUND": "1"}))
else:
tasks.append(string_to_spv(f"get_rows_{tname}", shader, {data_a_key: "1", "B_TYPE": "int", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv(f"get_rows_{tname}_f32", shader, {data_a_key: "1", "B_TYPE": "int", "D_TYPE": "float"}))
tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
# Norms
tasks.append(string_to_spv("norm_f32", "norm.comp", base_dict | {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("rms_norm_f32", "rms_norm.comp", base_dict | {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("cpy_f32_f32", "copy.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("cpy_f32_f16", "copy.comp", {"A_TYPE": "float", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv("cpy_f16_f16", "copy.comp", {"A_TYPE": "float16_t", "D_TYPE": "float16_t", "OPTIMIZATION_ERROR_WORKAROUND": "1"}))
tasks.append(string_to_spv("add_f32", "add.comp", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {}))
tasks.append(string_to_spv("mul_f32", "mul.comp", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("div_f32", "div.comp", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("scale_f32", "scale.comp", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("sqr_f32", "square.comp", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("clamp_f32", "clamp.comp", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("gelu_f32", "gelu.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("silu_f32", "silu.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("relu_f32", "relu.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("soft_max_f32", "soft_max.comp", base_dict | {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("soft_max_f32_f16", "soft_max.comp", base_dict | {"A_TYPE": "float", "B_TYPE": "float16_t", "D_TYPE": "float"}))
tasks.append(string_to_spv("rope_norm_f32", "rope_norm.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("rope_norm_f16", "rope_norm.comp", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv("rope_neox_f32", "rope_neox.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("rope_neox_f16", "rope_neox.comp", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv("argsort_f32", "argsort.comp", {"A_TYPE": "float"}))
tasks.append(string_to_spv("sum_rows_f32", "sum_rows.comp", base_dict | {"A_TYPE": "float", "D_TYPE": "float"}))
# Helper to decorate tasks with semaphore acquisition.
async def withSemaphore(sem, task):
async with sem:
return await task
# Run tasks concurrently guarded by a concurrency limit.
sem = asyncio.Semaphore(ASYNCIO_CONCURRENCY)
await asyncio.gather(*(withSemaphore(sem, task) for task in tasks))
with open("ggml-vulkan-shaders.hpp", "w") as f:
f.write("#include <cstdint>\n\n")
for name, path in sorted(shader_fnames):
with open(path, "rb") as spv:
counter = 0
newline_counter = 0
f.write(f"unsigned char {name}_data[] = {{\n")
for val in spv.read():
f.write(f"0x{val:02x},")
newline_counter += 1
counter += 1
if newline_counter >= 12:
newline_counter = 0
f.write("\n")
f.write("\n};\n")
f.write(f"const uint64_t {name}_len = {counter};\n\n")
os.remove(path)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="GGML Vulkan Shader Generator")
parser.add_argument("--glslc", help="Path to glslc")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
if args.glslc:
GLSLC = args.glslc
asyncio.run(main())
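
As context for the generator above: the emitted ggml-vulkan-shaders.hpp defines one `unsigned char <name>_data[]` array and one `const uint64_t <name>_len` constant per compiled shader. Below is a minimal C++ sketch of how such an embedded SPIR-V blob could be wrapped into a Vulkan shader module. This is not the backend's actual loading code, and the symbol name matmul_f32_f32_data in the usage comment is just one of the names the script produces.

#include <cstring>
#include <vector>
#include <vulkan/vulkan.h>
#include "ggml-vulkan-shaders.hpp" // produced by the script above

// wrap an embedded SPIR-V blob in a VkShaderModule; `device` must be a valid VkDevice
static VkShaderModule load_embedded_shader(VkDevice device, const unsigned char * data, uint64_t len) {
    // copy into 4-byte aligned storage, since pCode expects uint32_t words
    std::vector<uint32_t> words((len + 3) / 4);
    std::memcpy(words.data(), data, len);

    VkShaderModuleCreateInfo info = {};
    info.sType    = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
    info.codeSize = (size_t) len;   // size in bytes
    info.pCode    = words.data();

    VkShaderModule module = VK_NULL_HANDLE;
    if (vkCreateShaderModule(device, &info, nullptr, &module) != VK_SUCCESS) {
        return VK_NULL_HANDLE;
    }
    return module;
}

// usage (name as generated by matmul_shaders() for the fp16 f32 x f32 variant):
//   VkShaderModule mm = load_embedded_shader(device, matmul_f32_f32_data, matmul_f32_f32_len);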

View File

@ -1,25 +0,0 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
// buffer_type API
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend);
// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void);
GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void);
#ifdef __cplusplus
}
#endif

View File

@ -3,20 +3,6 @@
#include "ggml.h"
#include "ggml-alloc.h"
#ifdef GGML_BACKEND_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef GGML_BACKEND_BUILD
# define GGML_BACKEND_API __declspec(dllexport) extern
# else
# define GGML_BACKEND_API __declspec(dllimport) extern
# endif
# else
# define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
# endif
#else
# define GGML_BACKEND_API extern
#endif
#ifdef __cplusplus
extern "C" {
#endif
@ -86,7 +72,7 @@ extern "C" {
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// "offset" refers to the offset in tensor->data for setting/getting data
// "offset" refers to the offset of the tensor data for setting/getting data
GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
@ -128,12 +114,11 @@ extern "C" {
//
enum ggml_backend_dev_type {
// CPU device using system memory
GGML_BACKEND_DEVICE_TYPE_CPU,
// GPU device using dedicated memory
GGML_BACKEND_DEVICE_TYPE_GPU,
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
GGML_BACKEND_DEVICE_TYPE_ACCEL
// devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication)
GGML_BACKEND_DEVICE_TYPE_CPU_FULL,
GGML_BACKEND_DEVICE_TYPE_GPU_FULL
};
// functionality supported by the device
@ -142,8 +127,6 @@ extern "C" {
bool async;
// pinned host buffer
bool host_buffer;
// creating buffers from host ptr
bool buffer_from_host_ptr;
// event synchronization
bool events;
};
@ -182,14 +165,9 @@ extern "C" {
GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
// Common functions that may be obtained using ggml_backend_reg_get_proc_address
// Split buffer type for tensor parallelism
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
// Set the number of threads for the backend
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
// Get additional buffer types provided by the device (returns a NULL-terminated array)
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
// Functions that may be obtained using ggml_backend_reg_get_proc_address
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
//
// Backend registry
@ -211,7 +189,7 @@ extern "C" {
GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
GGML_API ggml_backend_t ggml_backend_init_best(void);
//
@ -242,20 +220,14 @@ extern "C" {
ggml_backend_sched_reserve(sched, reserve_graph);
// compute
graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
for (int i = 0; i < 10; ++i) {
ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
}
graph = build_graph(sched);
ggml_backend_sched_graph_compute(sched, graph);
// if there are graph inputs:
graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
ggml_backend_sched_graph_compute(sched, graph); // execute the graph
// as an alternative to the above it is also possible to assign the inputs to a dedicated context and
// allocate them statically via ggml_backend_alloc_ctx_tensors
ggml_backend_sched_reset(sched);
ggml_backend_sched_alloc_graph(sched, graph);
ggml_backend_tensor_set(input_tensor, ...);
ggml_backend_sched_graph_compute(sched, graph);
}
*/
@ -270,7 +242,7 @@ extern "C" {
//
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
// Initialize a backend scheduler, backends with low index are given priority over backends with high index
// Initialize a backend scheduler
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
@ -295,9 +267,7 @@ extern "C" {
GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
// Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
// This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
// The correct way to use this API is to discard the deallocated tensors and create new ones.
// Reset all assignments and allocators - must be called before changing the node backends
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
// Set a callback to be called for each resulting node during graph compute
@ -327,10 +297,27 @@ extern "C" {
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
// CPU buffer types are always available
//
// CPU backend
//
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
// Create a backend buffer from an existing pointer
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
#ifdef GGML_USE_CPU_HBM
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif
#ifdef __cplusplus
}
#endif
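
To make the scheduler walkthrough in the comment block of this header concrete, here is a minimal CPU-only C++ sketch. It is an illustration rather than repository code: the shapes are arbitrary, only one backend is registered, and ggml_backend_cpu_init() is declared in ggml-backend.h or ggml-cpu.h depending on which side of this change you build against.

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

int main(void) {
    // a single backend is enough for a sketch; in practice GPU backends come first (higher priority)
    ggml_backend_t cpu = ggml_backend_cpu_init();
    ggml_backend_t backends[1] = { cpu };
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 1, GGML_DEFAULT_GRAPH_SIZE, false);

    // the context holds only tensor metadata (no_alloc = true); the scheduler allocates the data
    struct ggml_init_params ip = { ggml_tensor_overhead()*8 + ggml_graph_overhead(), NULL, true };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);

    ggml_backend_sched_alloc_graph(sched, gf);   // allocate the graph tensors across the backends
    // ggml_backend_tensor_set(a, ...); ggml_backend_tensor_set(b, ...);   // upload input data here
    ggml_backend_sched_graph_compute(sched, gf); // execute the graph

    ggml_backend_sched_free(sched);
    ggml_free(ctx);
    ggml_backend_free(cpu);
    return 0;
}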

View File

@ -9,15 +9,13 @@ extern "C" {
#endif
// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void);
GGML_API ggml_backend_t ggml_backend_blas_init(void);
GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend);
GGML_API bool ggml_backend_is_blas(ggml_backend_t backend);
// number of threads used for conversion to float
// for openblas and blis, this will also set the number of threads used for blas operations
GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void);
GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
#ifdef __cplusplus
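
A short hedged sketch of the BLAS backend entry points declared above; the thread count of 8 is arbitrary, and the export macro differs between the two sides of this diff while the signatures stay the same.

#include "ggml-blas.h"

int main(void) {
    ggml_backend_t blas = ggml_backend_blas_init();
    if (blas != NULL) {
        if (ggml_backend_is_blas(blas)) {
            // per the note above, with OpenBLAS/BLIS this also sets the BLAS thread count
            ggml_backend_blas_set_n_threads(blas, 8);
        }
        ggml_backend_free(blas);
    }
    return 0;
}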

View File

@ -34,8 +34,6 @@ extern "C" {
*/
#define GGML_CANN_MAX_DEVICES 16
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void);
/**
* @brief Initializes the CANN backend for a specified device.
*
@ -46,7 +44,7 @@ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void);
* @param device The index of the device to initialize.
* @return A pointer to the initialized backend instance, or nullptr on failure.
*/
GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device);
GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);
/**
* @brief Checks if a given backend is a CANN backend.
@ -57,7 +55,7 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device);
* @param backend The backend instance to check.
* @return True if the backend is a CANN backend, false otherwise.
*/
GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend);
GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);
/**
* @brief Retrieves the CANN buffer type for a specified device.
@ -69,7 +67,7 @@ GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend);
* @return A pointer to the buffer type interface for the specified device, or
* nullptr if the device index is out of range.
*/
GGML_BACKEND_API ggml_backend_buffer_type_t
GGML_API ggml_backend_buffer_type_t
ggml_backend_cann_buffer_type(int32_t device);
/**
@ -80,14 +78,14 @@ ggml_backend_cann_buffer_type(int32_t device);
*
* @return The number of CANN devices available.
*/
GGML_BACKEND_API int32_t ggml_backend_cann_get_device_count(void);
GGML_API int32_t ggml_backend_cann_get_device_count(void);
/**
* @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
*
* @return A pointer to the host buffer type interface.
*/
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
/**
* @brief Retrieves the description of a specific CANN device.
@ -99,7 +97,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(v
* @param description Pointer to a buffer where the description will be written.
* @param description_size Size of the description buffer.
*/
GGML_BACKEND_API void ggml_backend_cann_get_device_description(
GGML_API void ggml_backend_cann_get_device_description(
int32_t device, char* description, size_t description_size);
/**
@ -114,7 +112,7 @@ GGML_BACKEND_API void ggml_backend_cann_get_device_description(
* @param total Pointer to a variable where the total memory size will be
* stored.
*/
GGML_BACKEND_API void ggml_backend_cann_get_device_memory(int32_t device,
GGML_API void ggml_backend_cann_get_device_memory(int32_t device,
size_t* free,
size_t* total);

View File

@ -1,38 +0,0 @@
#pragma once
#ifndef __cplusplus
#error "This header is for C++ only"
#endif
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <memory>
// Smart pointers for ggml types
// ggml
struct ggml_context_deleter { void operator()(ggml_context * ctx) { ggml_free(ctx); } };
struct gguf_context_deleter { void operator()(gguf_context * ctx) { gguf_free(ctx); } };
typedef std::unique_ptr<ggml_context, ggml_context_deleter> ggml_context_ptr;
typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
// ggml-alloc
struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr;
// ggml-backend
struct ggml_backend_deleter { void operator()(ggml_backend_t backend) { ggml_backend_free(backend); } };
struct ggml_backend_buffer_deleter { void operator()(ggml_backend_buffer_t buffer) { ggml_backend_buffer_free(buffer); } };
struct ggml_backend_event_deleter { void operator()(ggml_backend_event_t event) { ggml_backend_event_free(event); } };
struct ggml_backend_sched_deleter { void operator()(ggml_backend_sched_t sched) { ggml_backend_sched_free(sched); } };
typedef std::unique_ptr<ggml_backend, ggml_backend_deleter> ggml_backend_ptr;
typedef std::unique_ptr<ggml_backend_buffer, ggml_backend_buffer_deleter> ggml_backend_buffer_ptr;
typedef std::unique_ptr<ggml_backend_event, ggml_backend_event_deleter> ggml_backend_event_ptr;
typedef std::unique_ptr<ggml_backend_sched, ggml_backend_sched_deleter> ggml_backend_sched_ptr;
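
A brief sketch of what these wrappers provide, assuming this header ships as ggml-cpp.h in the build. The 16 MiB pool size is arbitrary, and ggml_backend_cpu_init() comes from ggml-backend.h or ggml-cpu.h depending on which side of this change is used.

#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-cpp.h"

int main() {
    struct ggml_init_params params = { 16*1024*1024, NULL, false };

    // the deleters call ggml_free() / ggml_backend_free() automatically at scope exit
    ggml_context_ptr ctx     { ggml_init(params) };
    ggml_backend_ptr backend { ggml_backend_cpu_init() };

    struct ggml_tensor * t = ggml_new_tensor_1d(ctx.get(), GGML_TYPE_F32, 16);
    ((float *) t->data)[0] = 1.0f; // data is valid because no_alloc == false

    return 0; // no explicit ggml_free / ggml_backend_free needed
}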

View File

@ -1,177 +0,0 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
// Scheduling priorities
enum ggml_sched_priority {
GGML_SCHED_PRIO_NORMAL,
GGML_SCHED_PRIO_MEDIUM,
GGML_SCHED_PRIO_HIGH,
GGML_SCHED_PRIO_REALTIME
};
// Threadpool params
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
struct ggml_threadpool_params {
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
int n_threads; // number of threads
enum ggml_sched_priority prio; // thread priority
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
bool strict_cpu; // strict cpu placement
bool paused; // start in paused state
};
struct ggml_threadpool; // forward declaration, see ggml.c
typedef struct ggml_threadpool * ggml_threadpool_t;
// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan {
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
int n_threads;
struct ggml_threadpool * threadpool;
// abort ggml_graph_compute when true
ggml_abort_callback abort_callback;
void * abort_callback_data;
};
// numa strategies
enum ggml_numa_strategy {
GGML_NUMA_STRATEGY_DISABLED = 0,
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
GGML_NUMA_STRATEGY_ISOLATE = 2,
GGML_NUMA_STRATEGY_NUMACTL = 3,
GGML_NUMA_STRATEGY_MIRROR = 4,
GGML_NUMA_STRATEGY_COUNT
};
GGML_BACKEND_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
GGML_BACKEND_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
GGML_BACKEND_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_BACKEND_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
GGML_BACKEND_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
GGML_BACKEND_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_BACKEND_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
GGML_BACKEND_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_BACKEND_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_BACKEND_API struct ggml_cplan ggml_graph_plan(
const struct ggml_cgraph * cgraph,
int n_threads, /* = GGML_DEFAULT_N_THREADS */
struct ggml_threadpool * threadpool /* = NULL */ );
GGML_BACKEND_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_BACKEND_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
//
// system info
//
// x86
GGML_BACKEND_API int ggml_cpu_has_sse3 (void);
GGML_BACKEND_API int ggml_cpu_has_ssse3 (void);
GGML_BACKEND_API int ggml_cpu_has_avx (void);
GGML_BACKEND_API int ggml_cpu_has_avx2 (void);
GGML_BACKEND_API int ggml_cpu_has_f16c (void);
GGML_BACKEND_API int ggml_cpu_has_fma (void);
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
GGML_BACKEND_API int ggml_cpu_has_avx512 (void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void);
GGML_BACKEND_API int ggml_cpu_has_amx_int8 (void);
// ARM
GGML_BACKEND_API int ggml_cpu_has_neon (void);
GGML_BACKEND_API int ggml_cpu_has_arm_fma (void);
GGML_BACKEND_API int ggml_cpu_has_fp16_va (void);
GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
GGML_BACKEND_API int ggml_cpu_has_sve (void);
GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes
// other
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
// Internal types and functions exposed for tests and benchmarks
typedef void (*ggml_from_float_to_mat_t)
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
struct ggml_type_traits_cpu {
ggml_from_float_t from_float;
ggml_from_float_to_mat_t from_float_to_mat;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously
int64_t ncols; // number of columns to process simultaneously
ggml_gemv_t gemv;
ggml_gemm_t gemm;
};
GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
GGML_BACKEND_API void ggml_cpu_init(void);
//
// CPU backend
//
GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void);
GGML_BACKEND_API bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_BACKEND_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_BACKEND_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
#ifdef GGML_USE_CPU_HBM
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
GGML_BACKEND_API bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft);
#ifdef __cplusplus
}
#endif
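
The plan/compute contract spelled out above ("when plan.work_size > 0, caller must allocate memory for plan.work_data") can be wrapped in a small helper. This is an illustrative C++ sketch, not repository code; on the other side of this diff the same declarations live in ggml.h rather than ggml-cpu.h.

#include <stdint.h>
#include <stdlib.h>
#include "ggml.h"
#include "ggml-cpu.h"

// run a graph on the CPU, allocating the work buffer the plan asks for
static enum ggml_status compute_graph_cpu(struct ggml_cgraph * graph, int n_threads) {
    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, /*threadpool =*/ NULL);

    uint8_t * work = NULL;
    if (plan.work_size > 0) {
        work = (uint8_t *) malloc(plan.work_size); // caller-owned, as required above
        plan.work_data = work;
    }

    enum ggml_status status = ggml_graph_compute(graph, &plan);

    free(work);
    return status;
}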

View File

@ -7,7 +7,7 @@
extern "C" {
#endif
#ifdef GGML_USE_HIP
#ifdef GGML_USE_HIPBLAS
#define GGML_CUDA_NAME "ROCm"
#define GGML_CUBLAS_NAME "hipBLAS"
#elif defined(GGML_USE_MUSA)
@ -20,27 +20,27 @@ extern "C" {
#define GGML_CUDA_MAX_DEVICES 16
// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_cuda_init(int device);
GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
GGML_BACKEND_API bool ggml_backend_is_cuda(ggml_backend_t backend);
GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
// device buffer
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
// split tensor buffer that splits matrices by rows across multiple devices
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
GGML_BACKEND_API int ggml_backend_cuda_get_device_count(void);
GGML_BACKEND_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_BACKEND_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
GGML_API int ggml_backend_cuda_get_device_count(void);
GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
GGML_BACKEND_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
GGML_BACKEND_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
#ifdef __cplusplus
}
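
A small sketch of the device-query functions declared above, enumerating CUDA devices and skipping backend creation when none is present (illustrative only; error handling is minimal).

#include <stdio.h>
#include "ggml-cuda.h"

int main(void) {
    int n_devices = ggml_backend_cuda_get_device_count();
    for (int i = 0; i < n_devices; ++i) {
        char   desc[256];
        size_t free_mem  = 0;
        size_t total_mem = 0;
        ggml_backend_cuda_get_device_description(i, desc, sizeof(desc));
        ggml_backend_cuda_get_device_memory(i, &free_mem, &total_mem);
        printf("device %d: %s (%zu MiB free / %zu MiB total)\n",
               i, desc, free_mem >> 20, total_mem >> 20);
    }

    ggml_backend_t backend = n_devices > 0 ? ggml_backend_cuda_init(/*device =*/ 0) : NULL;
    if (backend != NULL) {
        // ... build and compute graphs with `backend` ...
        ggml_backend_free(backend);
    }
    return 0;
}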

View File

@ -11,8 +11,6 @@
extern "C" {
#endif
#define GGML_KOMPUTE_MAX_DEVICES 16
struct ggml_vk_device {
int index;
int type; // same as VkPhysicalDeviceType
@ -37,13 +35,11 @@ struct ggml_vk_device ggml_vk_current_device(void);
// forward declaration
typedef struct ggml_backend * ggml_backend_t;
GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device);
GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend);
GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
#ifdef __cplusplus
}

View File

@ -39,27 +39,23 @@ extern "C" {
// user-code should use only these functions
//
GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
GGML_API ggml_backend_t ggml_backend_metal_init(void);
GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
GGML_DEPRECATED(
GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
"obsoleted by the new device interface - https://github.com/ggerganov/llama.cpp/pull/9713");
GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
// helper to check if the device supports a specific family
// ideally, the user code should be doing these checks
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
GGML_BACKEND_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
GGML_BACKEND_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_metal_reg(void);
GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
#ifdef __cplusplus
}
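
A short sketch of the Metal entry points above. The family index follows Apple's feature-set tables referenced in the comment; interpreting 7 as MTLGPUFamilyApple7 is an assumption based on how downstream projects use this check.

#include <stdio.h>
#include "ggml-metal.h"

int main(void) {
    ggml_backend_t metal = ggml_backend_metal_init();
    if (metal == NULL) {
        fprintf(stderr, "Metal backend not available\n");
        return 1;
    }

    // assumed mapping: family 7 ~ MTLGPUFamilyApple7 (see the feature-set tables linked above)
    if (ggml_backend_metal_supports_family(metal, 7)) {
        printf("GPU supports the Apple7 family\n");
    }

    ggml_backend_free(metal);
    return 0;
}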

View File

@ -1,216 +0,0 @@
// This file contains functionality for training models using GGML.
// It is not strictly needed on top of vanilla GGML, but it provides a higher-level interface for common needs such as datasets.
// At the bottom of this file especially, there are relatively high-level functions that are suitable for use or adaptation in user code.
//
// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de)
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
struct ggml_opt_dataset;
struct ggml_opt_context;
struct ggml_opt_result;
typedef struct ggml_opt_dataset * ggml_opt_dataset_t;
typedef struct ggml_opt_context * ggml_opt_context_t;
typedef struct ggml_opt_result * ggml_opt_result_t;
// ====== Loss ======
// built-in loss types, i.e. the built-in quantities minimized by the optimizer
// custom loss types can be defined via mean or sum which simply reduce the outputs for all datapoints to a single value
enum ggml_opt_loss_type {
GGML_OPT_LOSS_TYPE_MEAN,
GGML_OPT_LOSS_TYPE_SUM,
GGML_OPT_LOSS_TYPE_CROSS_ENTROPY,
GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR,
};
// ====== Dataset ======
GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
int64_t ne_datapoint, // number of elements per datapoint
int64_t ne_label, // number of elements per label
int64_t ndata, // total number of datapoints/labels
int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);
// get underlying tensors that store the data
GGML_API struct ggml_tensor * ggml_opt_dataset_data (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [ne_label, ndata]
// shuffle idata first datapoints from dataset with RNG from opt_ctx, shuffle all datapoints if idata is negative
GGML_API void ggml_opt_dataset_shuffle(ggml_opt_context_t opt_ctx, ggml_opt_dataset_t dataset, int64_t idata);
// get batch at position ibatch from dataset and copy the data to data_batch and labels_batch
GGML_API void ggml_opt_dataset_get_batch(
ggml_opt_dataset_t dataset,
struct ggml_tensor * data_batch, // shape = [ne_datapoint, ndata_batch]
struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch]
int64_t ibatch);
// ====== Model / Context ======
enum ggml_opt_build_type {
GGML_OPT_BUILD_TYPE_FORWARD,
GGML_OPT_BUILD_TYPE_GRAD,
GGML_OPT_BUILD_TYPE_OPT,
};
// parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
struct ggml_opt_optimizer_params {
// AdamW optimizer parameters
struct {
float alpha; // learning rate
float beta1;
float beta2;
float eps; // epsilon for numerical stability
float wd; // weight decay for AdamW, use 0.0f to disable
} adamw;
};
// callback to calculate optimizer parameters prior to a backward pass
// userdata can be used to pass arbitrary data
typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);
// returns the default optimizer params (constant)
// userdata is not used
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);
// parameters for initializing a new optimization context
struct ggml_opt_params {
ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs
struct ggml_context * ctx_compute; // created in user code, holds non-static tensors
// the forward graph is defined by inputs and outputs
// those tensors and all tensors in between are not intended to be reusable between multiple optimization contexts
struct ggml_tensor * inputs;
struct ggml_tensor * outputs;
enum ggml_opt_loss_type loss_type;
enum ggml_opt_build_type build_type;
int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done
ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
void * get_opt_pars_ud; // userdata for calculating optimizer parameters
};
// get parameters for an optimization context with defaults set where possible
// parameters for which no sensible defaults exist are supplied as arguments to this function
GGML_API ggml_opt_params ggml_opt_default_params(
ggml_backend_sched_t backend_sched,
struct ggml_context * ctx_compute,
struct ggml_tensor * inputs,
struct ggml_tensor * outputs,
enum ggml_opt_loss_type loss_type);
GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
// set gradients to zero, initialize loss, and optionally reset the optimizer
GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
// get underlying tensors that store data
GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor
GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
GGML_API struct ggml_tensor * ggml_opt_labels( ggml_opt_context_t opt_ctx); // labels to compare outputs against
GGML_API struct ggml_tensor * ggml_opt_loss( ggml_opt_context_t opt_ctx); // scalar tensor that contains the loss
GGML_API struct ggml_tensor * ggml_opt_pred( ggml_opt_context_t opt_ctx); // predictions made by outputs
GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels
GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
// ====== Optimization Result ======
GGML_API ggml_opt_result_t ggml_opt_result_init();
GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);
// get data from result, uncertainties are optional and can be ignored by passing NULL
GGML_API void ggml_opt_result_ndata( ggml_opt_result_t result, int64_t * ndata); // writes 1 value, number of datapoints
GGML_API void ggml_opt_result_loss( ggml_opt_result_t result, double * loss, double * unc); // writes 1 value
GGML_API void ggml_opt_result_pred( ggml_opt_result_t result, int32_t * pred); // writes ndata values
GGML_API void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, double * unc); // writes 1 value
// ====== Computation ======
// do forward pass, increment result if not NULL
GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
// do forward pass, increment result if not NULL, do backward pass
GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
// ############################################################################
// ## The high-level functions start here. They do not depend on any private ##
// ## functions or structs and can be copied to and adapted for user code. ##
// ############################################################################
// ====== Intended Usage ======
//
// 1. Select the appropriate loss for your problem.
// 2. Create a dataset and set the data for the "data" tensor. Also set the "labels" tensor if your loss needs them.
// Setting the shard size to 1 will be fine, it's the granularity with which data is shuffled/loaded (bigger values are faster).
// 3. Create a GGML graph for your model with no_alloc == true. Use two separate contexts for the tensors.
// The first context should contain the model parameters and inputs and be allocated statically in user code.
// The second context should contain all other tensors and will be (re)allocated automatically.
// Due to this automated allocation the data of the second context is not defined when accessed in user code.
// Note that the second dimension of the inputs/outputs is interpreted as the number of datapoints in those tensors.
// 4. Call ggml_opt_fit. If you need more control you can use ggml_opt_epoch instead.
// signature for a callback while evaluating opt_ctx on dataset, called after an evaluation
typedef void (*ggml_opt_epoch_callback)(
bool train, // true after training evaluation, false after validation evaluation
ggml_opt_context_t opt_ctx,
ggml_opt_dataset_t dataset,
ggml_opt_result_t result, // result associated with the dataset subsection
int64_t ibatch, // number of batches that have been evaluated so far
int64_t ibatch_max, // total number of batches in this dataset subsection
int64_t t_start_us); // time at which the evaluation on the dataset subsection was started
// do training on front of dataset, do evaluation only on back of dataset
GGML_API void ggml_opt_epoch(
ggml_opt_context_t opt_ctx,
ggml_opt_dataset_t dataset,
ggml_opt_result_t result_train, // result to increment during training, ignored if NULL
ggml_opt_result_t result_eval, // result to increment during evaluation, ignored if NULL
int64_t idata_split, // data index at which to split training and evaluation
ggml_opt_epoch_callback callback_train,
ggml_opt_epoch_callback callback_eval);
// callback that prints a progress bar on stderr
GGML_API void ggml_opt_epoch_callback_progress_bar(
bool train,
ggml_opt_context_t opt_ctx,
ggml_opt_dataset_t dataset,
ggml_opt_result_t result,
int64_t ibatch,
int64_t ibatch_max,
int64_t t_start_us);
// fit model defined by inputs and outputs to dataset
GGML_API void ggml_opt_fit(
ggml_backend_sched_t backend_sched, // backend scheduler for constructing the compute graphs
ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs
ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch]
ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
ggml_opt_dataset_t dataset, // dataset with data and optionally also labels
enum ggml_opt_loss_type loss_type, // loss to minimize
ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
int64_t nepoch, // how many times the dataset should be iterated over
int64_t nbatch_logical, // datapoints per optimizer step, must be a multiple of ndata_batch in inputs/outputs
float val_split, // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
bool silent); // whether or not info prints to stderr should be suppressed
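// Minimal usage sketch for step 4 of the intended usage above (illustrative
// only, not part of the API): all arguments are assumed to have been prepared
// as described in steps 1-3, and my_get_opt_pars is a user-provided callback
// of type ggml_opt_get_optimizer_params.
static inline void example_fit(
        ggml_backend_sched_t          sched,
        struct ggml_context         * ctx_compute,
        struct ggml_tensor          * inputs,   // [ne_datapoint, ndata_batch]
        struct ggml_tensor          * outputs,  // [ne_label,     ndata_batch]
        ggml_opt_dataset_t            dataset,
        enum ggml_opt_loss_type       loss_type,
        ggml_opt_get_optimizer_params my_get_opt_pars) {
    ggml_opt_fit(sched, ctx_compute, inputs, outputs, dataset, loss_type, my_get_opt_pars,
                 /*nepoch         =*/ 4,
                 /*nbatch_logical =*/ 32,    // must be a multiple of ndata_batch
                 /*val_split      =*/ 0.1f,  // last 10% of the data is used for validation
                 /*silent         =*/ false);
}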
#ifdef __cplusplus
}
#endif

View File

@ -10,18 +10,14 @@ extern "C" {
#define GGML_RPC_MAX_SERVERS 16
// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
GGML_API void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
#ifdef __cplusplus
}

View File

@ -17,33 +17,26 @@ extern "C" {
#endif
// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_sycl_init(int device);
GGML_BACKEND_API bool ggml_backend_is_sycl(ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
// device buffer
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
// split tensor buffer that splits matrices by rows across multiple devices
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
GGML_BACKEND_API void ggml_backend_sycl_print_sycl_devices(void);
GGML_BACKEND_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len);
GGML_BACKEND_API void ggml_backend_sycl_get_device_description(int device,
char *description,
size_t description_size);
GGML_BACKEND_API int ggml_backend_sycl_get_device_count();
GGML_BACKEND_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
GGML_API void ggml_sycl_get_gpu_list(int *id_list, int max_len);
GGML_API void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
GGML_API int ggml_backend_sycl_get_device_count();
GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
// SYCL doesn't support registering host memory, keep here for reference
// GGML_BACKEND_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
// GGML_BACKEND_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_sycl_reg(void);
// GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
// GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
#ifdef __cplusplus
}
#endif

View File

@ -10,21 +10,19 @@ extern "C" {
#define GGML_VK_NAME "Vulkan"
#define GGML_VK_MAX_DEVICES 16
GGML_BACKEND_API void ggml_vk_instance_init(void);
GGML_API void ggml_vk_instance_init(void);
// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);
GGML_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);
GGML_BACKEND_API bool ggml_backend_is_vk(ggml_backend_t backend);
GGML_BACKEND_API int ggml_backend_vk_get_device_count(void);
GGML_BACKEND_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
GGML_BACKEND_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
GGML_API bool ggml_backend_is_vk(ggml_backend_t backend);
GGML_API int ggml_backend_vk_get_device_count(void);
GGML_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
GGML_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_vk_reg(void);
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
#ifdef __cplusplus
}

View File

@ -176,15 +176,15 @@
#ifdef GGML_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef GGML_BUILD
# define GGML_API __declspec(dllexport) extern
# define GGML_API __declspec(dllexport)
# else
# define GGML_API __declspec(dllimport) extern
# define GGML_API __declspec(dllimport)
# endif
# else
# define GGML_API __attribute__ ((visibility ("default"))) extern
# define GGML_API __attribute__ ((visibility ("default")))
# endif
#else
# define GGML_API extern
# define GGML_API
#endif
// TODO: support for clang
@ -509,7 +509,7 @@ extern "C" {
GGML_OP_WIN_UNPART,
GGML_OP_GET_REL_POS,
GGML_OP_ADD_REL_POS,
GGML_OP_RWKV_WKV6,
GGML_OP_RWKV_WKV,
GGML_OP_UNARY,
@ -558,10 +558,10 @@ extern "C" {
enum ggml_log_level {
GGML_LOG_LEVEL_NONE = 0,
GGML_LOG_LEVEL_DEBUG = 1,
GGML_LOG_LEVEL_INFO = 2,
GGML_LOG_LEVEL_WARN = 3,
GGML_LOG_LEVEL_ERROR = 4,
GGML_LOG_LEVEL_INFO = 1,
GGML_LOG_LEVEL_WARN = 2,
GGML_LOG_LEVEL_ERROR = 3,
GGML_LOG_LEVEL_DEBUG = 4,
GGML_LOG_LEVEL_CONT = 5, // continue previous log
};
@ -573,13 +573,6 @@ extern "C" {
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
};
struct ggml_init_params {
// memory pool
size_t mem_size; // bytes
void * mem_buffer; // if NULL, memory will be allocated internally
bool no_alloc; // don't allocate memory for the tensor data
};
// n-dimensional tensor
struct ggml_tensor {
enum ggml_type type;
@ -602,6 +595,7 @@ extern "C" {
int32_t flags;
struct ggml_tensor * grad;
struct ggml_tensor * src[GGML_MAX_SRC];
// source tensor and offset for views
@ -614,7 +608,7 @@ extern "C" {
void * extra; // extra things e.g. for ggml-cuda.cu
char padding[8];
// char padding[4];
};
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@ -624,6 +618,67 @@ extern "C" {
// If it returns true, the computation is aborted
typedef bool (*ggml_abort_callback)(void * data);
// Scheduling priorities
enum ggml_sched_priority {
GGML_SCHED_PRIO_NORMAL,
GGML_SCHED_PRIO_MEDIUM,
GGML_SCHED_PRIO_HIGH,
GGML_SCHED_PRIO_REALTIME
};
// Threadpool params
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
struct ggml_threadpool_params {
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
int n_threads; // number of threads
enum ggml_sched_priority prio; // thread priority
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
bool strict_cpu; // strict cpu placement
bool paused; // start in paused state
};
struct ggml_threadpool; // forward declaration, see ggml.c
typedef struct ggml_threadpool * ggml_threadpool_t;
// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan {
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
int n_threads;
struct ggml_threadpool * threadpool;
// abort ggml_graph_compute when true
ggml_abort_callback abort_callback;
void * abort_callback_data;
};
// scratch buffer
// TODO: deprecate and remove
struct ggml_scratch {
size_t offs;
size_t size;
void * data;
};
struct ggml_init_params {
// memory pool
size_t mem_size; // bytes
void * mem_buffer; // if NULL, memory will be allocated internally
bool no_alloc; // don't allocate memory for the tensor data
};
// numa strategies
enum ggml_numa_strategy {
GGML_NUMA_STRATEGY_DISABLED = 0,
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
GGML_NUMA_STRATEGY_ISOLATE = 2,
GGML_NUMA_STRATEGY_NUMACTL = 3,
GGML_NUMA_STRATEGY_MIRROR = 4,
GGML_NUMA_STRATEGY_COUNT
};
//
// GUID
@ -646,6 +701,9 @@ extern "C" {
// accepts a UTF-8 path, even on Windows
GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
GGML_API void ggml_print_object (const struct ggml_object * obj);
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
@ -708,6 +766,7 @@ extern "C" {
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
@ -747,7 +806,8 @@ extern "C" {
int64_t ne2,
int64_t ne3);
GGML_API void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes);
GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
@ -757,25 +817,35 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
// Converts a flat index into coordinates
GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
// Converts a flat index into coordinates
GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
GGML_ATTRIBUTE_FORMAT(2, 3)
GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
// Tensor flags
GGML_API void ggml_set_input(struct ggml_tensor * tensor);
GGML_API void ggml_set_output(struct ggml_tensor * tensor);
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
//
// operations on tensors with backpropagation
//
@ -1489,7 +1559,7 @@ extern "C" {
"use ggml_rope_ext_inplace instead");
// compute correction dims for YaRN RoPE scaling
GGML_API void ggml_rope_yarn_corr_dims(
void ggml_rope_yarn_corr_dims(
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
// rotary position embedding backward, i.e. compute dx from dy
@ -1745,9 +1815,6 @@ extern "C" {
struct ggml_tensor * a,
enum ggml_prec prec);
GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
const struct ggml_tensor * a);
// TODO: needs to be adapted to ggml_flash_attn_ext
GGML_API struct ggml_tensor * ggml_flash_attn_back(
struct ggml_context * ctx,
@ -1821,7 +1888,7 @@ extern "C" {
struct ggml_tensor * pw,
struct ggml_tensor * ph);
GGML_API struct ggml_tensor * ggml_rwkv_wkv6(
GGML_API struct ggml_tensor * ggml_rwkv_wkv(
struct ggml_context * ctx,
struct ggml_tensor * k,
struct ggml_tensor * v,
@ -1984,20 +2051,31 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * grad,
struct ggml_tensor * m,
struct ggml_tensor * v,
struct ggml_tensor * adamw_params); // parameters such as the learning rate
float alpha,
float beta1,
float beta2,
float eps,
float wd); // weight decay
//
// automatic differentiation
//
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API void ggml_build_backward_expand(
struct ggml_context * ctx_static, // context for static gradients (loss + gradient accumulation)
struct ggml_context * ctx_compute, // context for gradient computation
struct ggml_cgraph * cgraph,
bool accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate);
GGML_API void ggml_build_opt_adamw(
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb,
float alpha,
float beta1,
float beta2,
float eps,
float wd); // weight decay
// graph allocation in a context
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
@ -2017,9 +2095,28 @@ extern "C" {
GGML_API size_t ggml_graph_overhead(void);
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
GGML_API struct ggml_tensor * ggml_graph_get_tensor (const struct ggml_cgraph * cgraph, const char * name);
GGML_API struct ggml_tensor * ggml_graph_get_grad (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan(
const struct ggml_cgraph * cgraph,
int n_threads, /* = GGML_DEFAULT_N_THREADS */
struct ggml_threadpool * threadpool /* = NULL */ );
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
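// Minimal sketch (illustrative only, not part of the API): compute a graph with
// an explicit plan and a dedicated threadpool; `graph` is assumed to be a fully
// built ggml_cgraph and malloc/free come from <stdlib.h>.
static inline enum ggml_status example_graph_compute(struct ggml_cgraph * graph, int n_threads) {
    struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
    struct ggml_threadpool      * tp  = ggml_threadpool_new(&tpp);

    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, tp);
    uint8_t * work_data = NULL;
    if (plan.work_size > 0) {
        work_data = malloc(plan.work_size); // the work buffer is owned by the caller
    }
    plan.work_data = work_data;

    enum ggml_status status = ggml_graph_compute(graph, &plan);

    free(work_data);
    ggml_threadpool_free(tp);
    return status;
}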
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
@ -2030,14 +2127,201 @@ extern "C" {
// dump the graph into a file using the dot format
GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
// TODO these functions were sandwiched in the old optimization interface, is there a better place for them?
// build gradient checkpointing backward graph gb for gf using provided checkpoints
// gb_tmp will contain original backward graph with rewritten backward process nodes,
// but without the second forward pass nodes.
GGML_API void ggml_build_backward_gradient_checkpointing(
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb,
struct ggml_cgraph * gb_tmp,
struct ggml_tensor * * checkpoints,
int n_checkpoints);
//
// optimization
//
// optimization methods
enum ggml_opt_type {
GGML_OPT_TYPE_ADAM,
GGML_OPT_TYPE_LBFGS,
};
// linesearch methods
enum ggml_linesearch {
GGML_LINESEARCH_DEFAULT = 1,
GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
};
// optimization return values
enum ggml_opt_result {
GGML_OPT_RESULT_OK = 0,
GGML_OPT_RESULT_DID_NOT_CONVERGE,
GGML_OPT_RESULT_NO_CONTEXT,
GGML_OPT_RESULT_INVALID_WOLFE,
GGML_OPT_RESULT_FAIL,
GGML_OPT_RESULT_CANCEL,
GGML_LINESEARCH_FAIL = -128,
GGML_LINESEARCH_MINIMUM_STEP,
GGML_LINESEARCH_MAXIMUM_STEP,
GGML_LINESEARCH_MAXIMUM_ITERATIONS,
GGML_LINESEARCH_INVALID_PARAMETERS,
};
typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
// Set callback for all future logging events.
// If this is not called, or NULL is supplied, everything is output on stderr.
GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
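// Minimal sketch (illustrative only, not part of the API): forward only
// warnings and errors to stderr and silence everything else.
static inline void example_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
    if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR) {
        fputs(text, stderr);
    }
    (void) user_data;
}
// registered once at startup with: ggml_log_set(example_log_callback, NULL);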
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
// optimization parameters
//
// see ggml.c (ggml_opt_default_params) for default values
//
struct ggml_opt_params {
enum ggml_opt_type type;
size_t graph_size;
int n_threads;
// delta-based convergence test
//
// if past == 0 - disabled
// if past > 0:
// stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
//
int past;
float delta;
// maximum number of iterations without improvement
//
// if 0 - disabled
// if > 0:
// assume convergence if no cost improvement in this number of iterations
//
int max_no_improvement;
bool print_forward_graph;
bool print_backward_graph;
int n_gradient_accumulation;
// ADAM parameters
struct {
int n_iter;
float sched; // schedule multiplier (fixed, decay or warmup)
float decay; // weight decay for AdamW, use 0.0f to disable
int decay_min_ndim; // minimum number of tensor dimensions to apply weight decay
float alpha; // learning rate
float beta1;
float beta2;
float eps; // epsilon for numerical stability
float eps_f; // epsilon for convergence test
float eps_g; // epsilon for convergence test
float gclip; // gradient clipping
} adam;
// LBFGS parameters
struct {
int m; // number of corrections to approximate the inv. Hessian
int n_iter;
int max_linesearch;
float eps; // convergence tolerance
float ftol; // line search tolerance
float wolfe;
float min_step;
float max_step;
enum ggml_linesearch linesearch;
} lbfgs;
};
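// Illustrative helper (not part of the API): the delta-based convergence test
// described above, where fx_past is the cost from `past` iterations ago
// (fabsf/fmaxf come from <math.h>).
static inline bool example_converged_by_delta(float fx, float fx_past, float delta) {
    return fabsf(fx - fx_past) < delta * fmaxf(1.0f, fabsf(fx));
}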
struct ggml_opt_context {
struct ggml_context * ctx;
struct ggml_opt_params params;
int iter;
int64_t nx; // number of parameter elements
bool just_initialized;
float loss_before;
float loss_after;
struct {
struct ggml_tensor * g; // current gradient
struct ggml_tensor * m; // first moment
struct ggml_tensor * v; // second moment
struct ggml_tensor * pf; // past function values
float fx_best;
float fx_prev;
int n_no_improvement;
} adam;
struct {
struct ggml_tensor * x; // current parameters
struct ggml_tensor * xp; // previous parameters
struct ggml_tensor * g; // current gradient
struct ggml_tensor * gp; // previous gradient
struct ggml_tensor * d; // search direction
struct ggml_tensor * pf; // past function values
struct ggml_tensor * lmal; // the L-BFGS memory alpha
struct ggml_tensor * lmys; // the L-BFGS memory ys
struct ggml_tensor * lms; // the L-BFGS memory s
struct ggml_tensor * lmy; // the L-BFGS memory y
float fx_best;
float step;
int j;
int k;
int end;
int n_no_improvement;
} lbfgs;
};
GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
// optimize the function defined by the tensor f
GGML_API enum ggml_opt_result ggml_opt(
struct ggml_context * ctx,
struct ggml_opt_params params,
struct ggml_tensor * f);
// initialize optimizer context
GGML_API void ggml_opt_init(
struct ggml_context * ctx,
struct ggml_opt_context * opt,
struct ggml_opt_params params,
int64_t nx);
// continue optimizing the function defined by the tensor f
GGML_API enum ggml_opt_result ggml_opt_resume(
struct ggml_context * ctx,
struct ggml_opt_context * opt,
struct ggml_tensor * f);
// continue optimizing the function defined by the tensor f
GGML_API enum ggml_opt_result ggml_opt_resume_g(
struct ggml_context * ctx,
struct ggml_opt_context * opt,
struct ggml_tensor * f,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb,
ggml_opt_callback callback,
void * callback_data);
//
// tensor flags
//
GGML_API void ggml_set_input(struct ggml_tensor * tensor);
GGML_API void ggml_set_output(struct ggml_tensor * tensor);
//
// quantization
@ -2194,6 +2478,47 @@ extern "C" {
GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
//
// system info
//
GGML_API int ggml_cpu_has_avx (void);
GGML_API int ggml_cpu_has_avx_vnni (void);
GGML_API int ggml_cpu_has_avx2 (void);
GGML_API int ggml_cpu_has_avx512 (void);
GGML_API int ggml_cpu_has_avx512_vbmi(void);
GGML_API int ggml_cpu_has_avx512_vnni(void);
GGML_API int ggml_cpu_has_avx512_bf16(void);
GGML_API int ggml_cpu_has_fma (void);
GGML_API int ggml_cpu_has_neon (void);
GGML_API int ggml_cpu_has_sve (void);
GGML_API int ggml_cpu_has_arm_fma (void);
GGML_API int ggml_cpu_has_metal (void);
GGML_API int ggml_cpu_has_f16c (void);
GGML_API int ggml_cpu_has_fp16_va (void);
GGML_API int ggml_cpu_has_wasm_simd (void);
GGML_API int ggml_cpu_has_blas (void);
GGML_API int ggml_cpu_has_cuda (void);
GGML_API int ggml_cpu_has_vulkan (void);
GGML_API int ggml_cpu_has_kompute (void);
GGML_API int ggml_cpu_has_gpublas (void);
GGML_API int ggml_cpu_has_sse3 (void);
GGML_API int ggml_cpu_has_ssse3 (void);
GGML_API int ggml_cpu_has_riscv_v (void);
GGML_API int ggml_cpu_has_sycl (void);
GGML_API int ggml_cpu_has_rpc (void);
GGML_API int ggml_cpu_has_vsx (void);
GGML_API int ggml_cpu_has_matmul_int8(void);
GGML_API int ggml_cpu_has_cann (void);
GGML_API int ggml_cpu_has_llamafile (void);
// get the sve vector length in bytes
GGML_API int ggml_cpu_get_sve_cnt(void);
//
// Internal types and functions exposed for tests and benchmarks
//
#ifdef __cplusplus
// restrict not standard in C++
#define GGML_RESTRICT
@ -2202,18 +2527,34 @@ extern "C" {
#endif
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_from_float_to_mat_t)
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
struct ggml_type_traits {
typedef struct {
const char * type_name;
int64_t blck_size;
int64_t blck_size_interleave; // interleave elements in blocks
size_t type_size;
bool is_quantized;
ggml_to_float_t to_float;
ggml_from_float_t from_float;
ggml_from_float_t from_float_ref;
};
ggml_from_float_to_mat_t from_float_to_mat;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously
int64_t ncols; // number of columns to process simultaneously
ggml_gemv_t gemv;
ggml_gemm_t gemm;
} ggml_type_traits_t;
GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
#ifdef __cplusplus
}

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,5 +1,9 @@
// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
#pragma once
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "ggml.h"
// GGML internal header
@ -8,11 +12,27 @@
extern "C" {
#endif
// Quantization
void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
// GEMV
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
// GEMM
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
#ifdef __cplusplus
}
#endif

View File

@ -14,7 +14,7 @@
//#define GGML_ALLOCATOR_DEBUG
//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
#define AT_PRINTF(...)
@ -89,7 +89,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
size = GGML_PAD(size, talloc->alignment);
if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
GGML_ABORT("not enough space in the buffer");
}
@ -172,7 +172,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
best_fit_block = alloc->n_free_blocks - 1;
} else {
// this should never happen
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
__func__, size, max_avail);
GGML_ABORT("not enough space in the buffer");
}
@ -209,16 +209,16 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
}
}
}
GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i].tensor) {
GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
alloc->allocated_tensors[i].offset,
alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
}
}
GGML_LOG_DEBUG("\n");
fprintf(stderr, "\n");
}
#endif
@ -348,6 +348,7 @@ struct tensor_alloc {
};
struct leaf_alloc {
int buffer_id;
struct tensor_alloc leaf;
};
@ -466,12 +467,18 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
return ggml_gallocr_hash_get(galloc, t)->allocated;
}
static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
hn->buffer_id = buffer_id;
hn->offset = offset;
hn->allocated = true;
}
static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
}
static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
GGML_ASSERT(buffer_id >= 0);
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
@ -733,6 +740,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
for (int i = 0; i < graph->n_leafs; i++) {
struct ggml_tensor * leaf = graph->leafs[i];
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
if (leaf->view_src || leaf->data) {
galloc->leaf_allocs[i].leaf.buffer_id = -1;
galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
@ -760,13 +768,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
// even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
if (new_size > cur_size || galloc->buffers[i] == NULL) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif
ggml_backend_buffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
if (galloc->buffers[i] == NULL) {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
return false;
}
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
@ -810,25 +818,21 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
}
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
size_t node_size = 0;
if (!node->data && !node->view_src) {
GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
}
size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
return talloc->size_max >= node_size;
}
static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
if (galloc->n_nodes != graph->n_nodes) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
#endif
return true;
}
if (galloc->n_leafs != graph->n_leafs) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
#endif
return true;
}
@ -839,7 +843,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
#endif
return true;
}
@ -851,7 +855,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
}
if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
#endif
return true;
}
@ -865,14 +869,14 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
if (ggml_gallocr_needs_realloc(galloc, graph)) {
if (galloc->n_buffers == 1) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
#endif
if (!ggml_gallocr_reserve(galloc, graph)) {
return false;
}
} else {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
#endif
return false;
}
@ -936,7 +940,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
if (buffer == NULL) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
#endif
for (size_t i = 0; i < *n_buffers; i++) {
ggml_backend_buffer_free((*buffers)[i]);
@ -986,7 +990,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
}
if (this_size > max_size) {
GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
__func__, t->name,
ggml_backend_buft_name(buft),
this_size, max_size);
@ -1018,7 +1022,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
if (n_buffers == 0) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
#endif
return NULL;
}

View File

@ -1,107 +0,0 @@
if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$") AND
CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0)
message(STATUS "Using AMX")
file(GLOB GGML_HEADERS_AMX "*.h")
list(APPEND GGML_HEADERS_AMX "../../include/ggml-amx.h")
file(GLOB GGML_SOURCES_AMX "*.cpp")
add_library(ggml-amx
${GGML_HEADERS_AMX}
${GGML_SOURCES_AMX})
target_link_libraries(ggml-amx PRIVATE ggml-base)
target_include_directories(ggml-amx PRIVATE . ..)
# this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags
# TODO: integrate AMX backend into the CPU backend
if (MSVC)
# instruction set detection for MSVC only
if (GGML_NATIVE)
# TODO: improve, should not reference files from the parent folder
include(../ggml-cpu/cmake/FindSIMD.cmake)
endif ()
if (GGML_AVX512)
list(APPEND ARCH_FLAGS /arch:AVX512)
# MSVC has no compile-time flags enabling specific
# AVX512 extensions, neither it defines the
# macros corresponding to the extensions.
# Do it manually.
if (GGML_AVX512_VBMI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
endif()
if (GGML_AVX512_VNNI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
endif()
if (GGML_AVX512_BF16)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
endif()
if (GGML_AMX_TILE)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
endif()
if (GGML_AMX_INT8)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
endif()
if (GGML_AMX_BF16)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
endif()
elseif (GGML_AVX2)
list(APPEND ARCH_FLAGS /arch:AVX2)
elseif (GGML_AVX)
list(APPEND ARCH_FLAGS /arch:AVX)
endif()
else()
if (GGML_NATIVE)
list(APPEND ARCH_FLAGS -march=native)
endif()
if (GGML_F16C)
list(APPEND ARCH_FLAGS -mf16c)
endif()
if (GGML_FMA)
list(APPEND ARCH_FLAGS -mfma)
endif()
if (GGML_AVX)
list(APPEND ARCH_FLAGS -mavx)
endif()
if (GGML_AVX2)
list(APPEND ARCH_FLAGS -mavx2)
endif()
if (GGML_AVX512)
list(APPEND ARCH_FLAGS -mavx512f)
list(APPEND ARCH_FLAGS -mavx512dq)
list(APPEND ARCH_FLAGS -mavx512bw)
endif()
if (GGML_AVX512_VBMI)
list(APPEND ARCH_FLAGS -mavx512vbmi)
endif()
if (GGML_AVX512_VNNI)
list(APPEND ARCH_FLAGS -mavx512vnni)
endif()
if (GGML_AVX512_BF16)
list(APPEND ARCH_FLAGS -mavx512bf16)
endif()
if (GGML_AMX_TILE)
list(APPEND ARCH_FLAGS -mamx-tile)
endif()
if (GGML_AMX_INT8)
list(APPEND ARCH_FLAGS -mamx-int8)
endif()
if (GGML_AMX_BF16)
list(APPEND ARCH_FLAGS -mamx-bf16)
endif()
endif()
target_compile_options(ggml-amx PRIVATE ${ARCH_FLAGS})
else()
set(GGML_AMX OFF PARENT_SCOPE)
message(WARNING "AMX requires x86 and gcc version > 11.0. Turning off GGML_AMX.")
endif()

View File

@ -1,94 +0,0 @@
#pragma once
#include "ggml.h"
// hack until AMX is moved into the CPU backend
#include "../ggml-cpu/ggml-cpu-impl.h" // <immintrin.h>
#include <algorithm>
#include <memory>
#include <type_traits>
#if defined(_OPENMP)
#include <omp.h>
#endif
#define TILE_M 16
#define TILE_N 16
#define TILE_K 32
#define VNNI_BLK 4
#define AMX_BLK_SIZE 32
#define TMM0 0
#define TMM1 1
#define TMM2 2
#define TMM3 3
#define TMM4 4
#define TMM5 5
#define TMM6 6
#define TMM7 7
// parallel routines
template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
inline T div_up(T x, T y) { return (x + y - 1) / y; }
template <typename T>
inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
#if 0
// onednn partition pattern
T& n_my = n_end;
if (nth <= 1 || n == 0) {
n_start = 0;
n_my = n;
} else {
T n1 = div_up(n, nth);
T n2 = n1 - 1;
T T1 = n - n2 * nth;
n_my = ith < T1 ? n1 : n2;
n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2;
}
n_end += n_start;
#else
// pytorch aten partition pattern
T n_my = div_up(n, nth);
n_start = ith * n_my;
n_end = std::min(n_start + n_my, n);
#endif
}
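// for illustration: with n = 10 and nth = 4 the aten-style split above gives
// n_my = div_up(10, 4) = 3, so thread 0 covers [0, 3), thread 1 [3, 6),
// thread 2 [6, 9) and thread 3 the remainder [9, 10)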
template <typename func_t>
inline void parallel_for(int nth, int n, const func_t& f) {
#if defined(_OPENMP)
#pragma omp parallel num_threads(nth)
{
//int nth = omp_get_num_threads();
int ith = omp_get_thread_num();
int tbegin, tend;
balance211(n, nth, ith, tbegin, tend);
f(tbegin, tend);
}
#else
f(0, n);
GGML_UNUSED(nth);
#endif
}
// quantized types that have AMX support
inline bool qtype_has_amx_kernels(const enum ggml_type type) {
// TODO: fix padding for vnni format
return (type == GGML_TYPE_Q4_0) ||
(type == GGML_TYPE_Q4_1);
//(type == GGML_TYPE_Q8_0) ||
//(type == GGML_TYPE_Q4_K) ||
//(type == GGML_TYPE_Q5_K) ||
//(type == GGML_TYPE_Q6_K) ||
//(type == GGML_TYPE_IQ4_XS);
}
// ggml backend context
struct ggml_backend_amx_context {
int n_threads = GGML_DEFAULT_N_THREADS;
std::unique_ptr<char[]> work_data;
size_t work_size = 0;
};

View File

@ -1,446 +0,0 @@
#include "ggml-amx.h"
#include "ggml-amx/common.h"
#include "ggml-amx/mmq.h"
#include "ggml-backend-impl.h"
#include "ggml-impl.h"
#if defined(__gnu_linux__)
#include <sys/syscall.h>
#include <unistd.h>
#endif
#include <cstdlib>
#include <cstring>
#include <memory>
#if defined(__AMX_INT8__)
// AMX buffer interface
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free(buffer->context);
}
static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
return (void *)(buffer->context);
}
static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
memset((char *)tensor->data + offset, value, size);
GGML_UNUSED(buffer);
}
static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
if (qtype_has_amx_kernels(tensor->type)) {
ggml_backend_amx_convert_weight(tensor, data, offset, size);
} else {
memcpy((char *)tensor->data + offset, data, size);
}
GGML_UNUSED(buffer);
}
static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
memcpy(data, (const char *)tensor->data + offset, size);
GGML_UNUSED(buffer);
}
static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
if (ggml_backend_buffer_is_host(src->buffer)) {
if (qtype_has_amx_kernels(src->type)) {
ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_backend_amx_get_alloc_size(dst));
} else {
memcpy(dst->data, src->data, ggml_nbytes(src));
}
return true;
}
return false;
GGML_UNUSED(buffer);
}
static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
memset(buffer->context, value, buffer->size);
}
static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
/* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
/* .get_base = */ ggml_backend_amx_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_amx_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_amx_buffer_get_tensor,
/* .cpy_tensor = */ ggml_backend_amx_buffer_cpy_tensor,
/* .clear = */ ggml_backend_amx_buffer_clear,
/* .reset = */ NULL,
};
static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return "AMX";
GGML_UNUSED(buft);
}
static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
void * data = aligned_alloc(TENSOR_ALIGNMENT, size);
if (data == NULL) {
fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
return NULL;
}
return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
}
static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
return TENSOR_ALIGNMENT;
GGML_UNUSED(buft);
}
static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
return ggml_backend_amx_get_alloc_size(tensor);
GGML_UNUSED(buft);
}
static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return false;
GGML_UNUSED(buft);
}
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
/* .iface = */ {
/* .get_name = */ ggml_backend_amx_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
/* .is_host = */ ggml_backend_amx_buffer_type_is_host,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
/* .context = */ NULL,
};
return &ggml_backend_buffer_type_amx;
}
// backend interface
static const char * ggml_backend_amx_name(ggml_backend_t backend) {
return "AMX";
GGML_UNUSED(backend);
}
static void ggml_backend_amx_free(ggml_backend_t backend) {
ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
delete ctx;
delete backend;
}
static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i];
switch (node->op) {
case GGML_OP_MUL_MAT:
ggml_backend_amx_mul_mat(ctx, node);
break;
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
break;
default:
fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node));
GGML_ASSERT(false);
}
}
return GGML_STATUS_SUCCESS;
GGML_UNUSED(backend);
}
static struct ggml_backend_i ggml_backend_amx_i = {
/* .get_name = */ ggml_backend_amx_name,
/* .free = */ ggml_backend_amx_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_amx_graph_compute,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
};
static ggml_guid_t ggml_backend_amx_guid() {
static ggml_guid guid = { 0x13, 0xb8, 0xa4, 0xc4, 0xba, 0xfe, 0x51, 0x67, 0x87, 0x44, 0x55, 0x15, 0xb2, 0x35, 0x62, 0x3e };
return &guid;
}
#define ARCH_GET_XCOMP_PERM 0x1022
#define ARCH_REQ_XCOMP_PERM 0x1023
#define XFEATURE_XTILECFG 17
#define XFEATURE_XTILEDATA 18
static bool ggml_amx_init() {
#if defined(__gnu_linux__)
if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
fprintf(stderr, "AMX is not ready to be used!\n");
return false;
}
return true;
#elif defined(_WIN32)
return true;
#endif
}
ggml_backend_t ggml_backend_amx_init() {
// invoke a Linux system call to request access to AMX features
ggml_amx_init();
// backend context
ggml_backend_amx_context * ctx = new ggml_backend_amx_context;
// ggml amx backend
ggml_backend_t backend = new ggml_backend {
/* .guid = */ ggml_backend_amx_guid(),
/* .interface = */ ggml_backend_amx_i,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
/* .context = */ ctx,
};
return backend;
}
bool ggml_backend_is_amx(ggml_backend_t backend) {
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_amx_guid());
}
void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
GGML_ASSERT(ggml_backend_is_amx(backend_amx));
ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend_amx->context;
ctx->n_threads = n_threads;
}
// device interface
static const char * ggml_backend_amx_device_get_name(ggml_backend_dev_t dev) {
return "AMX";
GGML_UNUSED(dev);
}
static const char * ggml_backend_amx_device_get_description(ggml_backend_dev_t dev) {
return "Intel Advanced Matrix Extensions";
GGML_UNUSED(dev);
}
static void ggml_backend_amx_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
// TODO
*free = 0;
*total = 0;
GGML_UNUSED(dev);
}
static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) {
return GGML_BACKEND_DEVICE_TYPE_ACCEL;
GGML_UNUSED(dev);
}
static void ggml_backend_amx_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
props->name = ggml_backend_amx_device_get_name(dev);
props->description = ggml_backend_amx_device_get_description(dev);
props->type = ggml_backend_amx_device_get_type(dev);
ggml_backend_amx_device_get_memory(dev, &props->memory_free, &props->memory_total);
// `buffer_from_host_ptr` is intended to be used with mmap, when the memory layout is unchanged
props->caps = {
/* .async = */ false,
/* .host_buffer = */ false,
/* .buffer_from_host_ptr = */ false,
/* .events = */ false,
};
}
static ggml_backend_t ggml_backend_amx_device_init(ggml_backend_dev_t dev, const char * params) {
return ggml_backend_amx_init();
GGML_UNUSED(dev);
GGML_UNUSED(params);
}
static ggml_backend_buffer_type_t ggml_backend_amx_device_get_buffer_type(ggml_backend_dev_t dev) {
return ggml_backend_amx_buffer_type();
GGML_UNUSED(dev);
}
static bool ggml_backend_amx_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
// handle only 2d gemm for now
auto is_contiguous_2d = [](const struct ggml_tensor * t) {
return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
};
switch (op->op) {
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
return true;
case GGML_OP_MUL_MAT: {
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * src1 = op->src[1];
const enum ggml_type type = src0->type;
const int64_t ne0 = op->ne[0];
// amx kernels enabled for Q4_0, Q4_1, Q8_0, F16
// Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);
bool can_use_amx =
is_contiguous_2d(src0) && // src0 must be contiguous
is_contiguous_2d(src1) && // src1 must be contiguous
src1->type == GGML_TYPE_F32 && // src1 must be float32
has_amx_kernels && // with amx kernel impls
ne0 % (TILE_N * 2) == 0; // out_features must be a multiple of 32
return can_use_amx;
}
default:
return false;
}
GGML_UNUSED(dev);
}
static bool ggml_backend_amx_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name;
GGML_UNUSED(dev);
}
static const struct ggml_backend_device_i ggml_backend_amx_device_i = {
/* .get_name = */ ggml_backend_amx_device_get_name,
/* .get_description = */ ggml_backend_amx_device_get_description,
/* .get_memory = */ ggml_backend_amx_device_get_memory,
/* .get_type = */ ggml_backend_amx_device_get_type,
/* .get_props = */ ggml_backend_amx_device_get_props,
/* .init_backend = */ ggml_backend_amx_device_init,
/* .get_buffer_type = */ ggml_backend_amx_device_get_buffer_type,
/* .get_host_buffer_type = */ NULL,
/* .buffer_from_host_ptr = */ NULL,
/* .supports_op = */ ggml_backend_amx_device_supports_op,
/* .supports_buft = */ ggml_backend_amx_device_supports_buft,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_synchronize = */ NULL,
};
// backend reg interface
static const char * ggml_backend_amx_reg_get_name(ggml_backend_reg_t reg) {
return "AMX";
GGML_UNUSED(reg);
}
static size_t ggml_backend_amx_reg_get_device_count(ggml_backend_reg_t reg) {
return 1;
GGML_UNUSED(reg);
}
static ggml_backend_dev_t ggml_backend_amx_reg_get_device(ggml_backend_reg_t reg, size_t index) {
GGML_ASSERT(index == 0);
static ggml_backend_device ggml_backend_amx_device = {
/* .iface = */ ggml_backend_amx_device_i,
/* .reg = */ reg,
/* .context = */ nullptr,
};
return &ggml_backend_amx_device;
GGML_UNUSED(reg);
GGML_UNUSED(index);
}
static void * ggml_backend_amx_get_proc_address(ggml_backend_reg_t reg, const char * name) {
if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) {
return (void *)ggml_backend_amx_set_n_threads;
}
return NULL;
GGML_UNUSED(reg);
GGML_UNUSED(name);
}
static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = {
/* .get_name = */ ggml_backend_amx_reg_get_name,
/* .get_device_count = */ ggml_backend_amx_reg_get_device_count,
/* .get_device = */ ggml_backend_amx_reg_get_device,
/* .get_proc_address = */ ggml_backend_amx_get_proc_address,
};
ggml_backend_reg_t ggml_backend_amx_reg(void) {
static struct ggml_backend_reg ggml_backend_amx_reg = {
/* .iface = */ ggml_backend_amx_reg_i,
/* .context = */ NULL,
};
return &ggml_backend_amx_reg;
}
#else // if defined(__AMX_INT8__)
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void) {
return nullptr;
}
bool ggml_backend_is_amx(ggml_backend_t backend) {
GGML_UNUSED(backend);
return false;
}
ggml_backend_t ggml_backend_amx_init(void) {
fprintf(stderr, "GGML is not compiled with AMX support!\n");
return nullptr;
}
void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
fprintf(stderr, "GGML is not compiled with AMX support!\n");
GGML_UNUSED(backend_amx);
GGML_UNUSED(n_threads);
}
ggml_backend_reg_t ggml_backend_amx_reg(void) {
return nullptr;
}
#endif

File diff suppressed because it is too large

View File

@ -1,17 +0,0 @@
#pragma once
#include "common.h"
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);
void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst);
#ifdef __cplusplus
}
#endif

View File

@ -22,7 +22,7 @@ extern "C" {
size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
// (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
// (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
// (optional) check if tensor data is in host memory (defaults to false)
bool (*is_host) (ggml_backend_buffer_type_t buft);
};
@ -37,6 +37,7 @@ extern "C" {
//
struct ggml_backend_buffer_i {
const char * (*get_name) (ggml_backend_buffer_t buffer);
// (optional) free the buffer
void (*free_buffer) (ggml_backend_buffer_t buffer);
// base address of the buffer
@ -87,16 +88,18 @@ extern "C" {
void (*free)(ggml_backend_t backend);
// buffer allocation
ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
// (optional) asynchronous tensor data access
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
// (optional) complete all pending operations (required if the backend supports async operations)
// (optional) complete all pending operations
void (*synchronize)(ggml_backend_t backend);
// (optional) graph plans (not used currently)
// compute graph with a plan
// (optional) compute graph with a plan (not used currently)
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
@ -107,6 +110,21 @@ extern "C" {
// compute graph (always async if supported by the backend)
enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
// IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
// new backends should implement the device interface instead
// These functions are being moved to the device interface
// check if the backend can compute an operation
bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op);
// check if the backend can use tensors allocated in a buffer type
bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
// these should be expensive operations with large batch sizes that may benefit from running on this backend
// even if the weight has to be copied from the CPU temporarily
bool (*offload_op) (ggml_backend_t backend, const struct ggml_tensor * op);
// (optional) event synchronization
// record an event on this stream
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
@ -166,8 +184,9 @@ extern "C" {
// check if the backend can use tensors allocated in a buffer type
bool (*supports_buft)(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft);
// (optional) check if the backend wants to run an operation, even if the weights are allocated in an incompatible buffer
// these should be expensive operations that may benefit from running on this backend instead of the CPU backend
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
// these should be expensive operations with large batch sizes that may benefit from running on this backend
// even if the weight has to be copied from the CPU temporarily
bool (*offload_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
// (optional) event synchronization

View File

@ -1,195 +0,0 @@
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"
#include "ggml-impl.h"
#include <cstring>
#include <vector>
// Backend registry
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"
#endif
#ifdef GGML_USE_VULKAN
#include "ggml-vulkan.h"
#endif
#ifdef GGML_USE_BLAS
#include "ggml-blas.h"
#endif
#ifdef GGML_USE_RPC
#include "ggml-rpc.h"
#endif
#ifdef GGML_USE_AMX
# include "ggml-amx.h"
#endif
#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif
#ifdef GGML_USE_KOMPUTE
#include "ggml-kompute.h"
#endif
struct ggml_backend_registry {
std::vector<ggml_backend_reg_t> backends;
std::vector<ggml_backend_dev_t> devices;
ggml_backend_registry() {
#ifdef GGML_USE_CUDA
register_backend(ggml_backend_cuda_reg());
#endif
#ifdef GGML_USE_METAL
register_backend(ggml_backend_metal_reg());
#endif
#ifdef GGML_USE_SYCL
register_backend(ggml_backend_sycl_reg());
#endif
#ifdef GGML_USE_VULKAN
register_backend(ggml_backend_vk_reg());
#endif
#ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg());
#endif
#ifdef GGML_USE_BLAS
register_backend(ggml_backend_blas_reg());
#endif
#ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg());
#endif
#ifdef GGML_USE_AMX
register_backend(ggml_backend_amx_reg());
#endif
#ifdef GGML_USE_KOMPUTE
register_backend(ggml_backend_kompute_reg());
#endif
register_backend(ggml_backend_cpu_reg());
}
void register_backend(ggml_backend_reg_t reg) {
if (!reg) {
return;
}
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
#endif
backends.push_back(reg);
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
register_device(ggml_backend_reg_dev_get(reg, i));
}
}
void register_device(ggml_backend_dev_t device) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
#endif
devices.push_back(device);
}
};
static ggml_backend_registry & get_reg() {
static ggml_backend_registry reg;
return reg;
}
// Internal API
void ggml_backend_register(ggml_backend_reg_t reg) {
get_reg().register_backend(reg);
}
void ggml_backend_device_register(ggml_backend_dev_t device) {
get_reg().register_device(device);
}
// Backend (reg) enumeration
size_t ggml_backend_reg_count() {
return get_reg().backends.size();
}
ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
GGML_ASSERT(index < ggml_backend_reg_count());
return get_reg().backends[index];
}
ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
ggml_backend_reg_t reg = ggml_backend_reg_get(i);
if (std::strcmp(ggml_backend_reg_name(reg), name) == 0) {
return reg;
}
}
return NULL;
}
// Device enumeration
size_t ggml_backend_dev_count() {
return get_reg().devices.size();
}
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
GGML_ASSERT(index < ggml_backend_dev_count());
return get_reg().devices[index];
}
ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
if (strcmp(ggml_backend_dev_name(dev), name) == 0) {
return dev;
}
}
return NULL;
}
ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
if (ggml_backend_dev_type(dev) == type) {
return dev;
}
}
return NULL;
}
// Convenience functions
ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
if (!dev) {
return NULL;
}
return ggml_backend_dev_init(dev, params);
}
ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
if (!dev) {
return NULL;
}
return ggml_backend_dev_init(dev, params);
}
ggml_backend_t ggml_backend_init_best(void) {
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
if (!dev) {
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
}
if (!dev) {
return NULL;
}
return ggml_backend_dev_init(dev, NULL);
}
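
Taken together, the registration and enumeration functions above give callers a uniform way to discover whatever backends were compiled in. A short usage sketch, hedged: it relies only on functions defined in this file plus the ggml_backend_dev_name/ggml_backend_dev_description accessors already used here and ggml_backend_free, which is assumed to come from ggml-backend.h:

#include "ggml-backend.h"
#include <cstdio>

// Sketch: list every registered device, then initialize the most capable backend.
int main() {
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        std::printf("device %zu: %s (%s)\n", i,
                    ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
    }
    ggml_backend_t backend = ggml_backend_init_best(); // GPU device if available, else CPU
    if (backend) {
        // ... build and compute a graph on `backend` ...
        ggml_backend_free(backend);
    }
    return 0;
}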

File diff suppressed because it is too large

View File

@ -4,9 +4,8 @@
#include <future>
#include <vector>
#include <cstring>
#if defined(GGML_BLAS_USE_ACCELERATE)
#if defined(GGML_USE_ACCELERATE)
# include <Accelerate/Accelerate.h>
#elif defined(GGML_BLAS_USE_MKL)
# include <mkl.h>
@ -27,6 +26,30 @@ struct ggml_backend_blas_context {
#endif
};
// helper function to determine if it is better to use BLAS or not
// for large matrices, BLAS is faster
static bool ggml_backend_blas_use_blas(const struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
const int64_t ne10 = src1->ne[0];
const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];
// TODO: find the optimal values for these
if (ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) &&
src1->type == GGML_TYPE_F32 &&
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
/*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
return true;
}
return false;
}
static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
@ -65,8 +88,8 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
// convert src0 to float
if (type != GGML_TYPE_F32) {
const auto * type_traits = ggml_get_type_traits(type);
ggml_to_float_t const to_float = type_traits->to_float;
ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type);
ggml_to_float_t const to_float = type_traits.to_float;
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
@ -212,7 +235,7 @@ static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct g
// backend interface
static const char * ggml_backend_blas_get_name(ggml_backend_t backend) {
static const char * ggml_backend_blas_name(ggml_backend_t backend) {
return "BLAS";
GGML_UNUSED(backend);
@ -224,6 +247,12 @@ static void ggml_backend_blas_free(ggml_backend_t backend) {
delete backend;
}
static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
return ggml_backend_cpu_buffer_type();
GGML_UNUSED(backend);
}
static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
@ -256,9 +285,31 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
GGML_UNUSED(backend);
}
static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * src1 = op->src[1];
return (op->op == GGML_OP_MUL_MAT && ggml_backend_blas_use_blas(op)) ||
(op->op == GGML_OP_OUT_PROD && op->src[0]->type == GGML_TYPE_F32 &&
op->src[1]->type == GGML_TYPE_F32 &&
ggml_is_matrix(src0) &&
ggml_is_matrix(src1) &&
ggml_is_contiguous(src0) &&
(ggml_is_contiguous(src1) || ggml_is_transposed(src1)));
GGML_UNUSED(backend);
}
static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
return ggml_backend_buft_is_host(buft);
GGML_UNUSED(backend);
}
static struct ggml_backend_i blas_backend_i = {
/* .get_name = */ ggml_backend_blas_get_name,
/* .get_name = */ ggml_backend_blas_name,
/* .free = */ ggml_backend_blas_free,
/* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
@ -268,6 +319,9 @@ static struct ggml_backend_i blas_backend_i = {
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_blas_graph_compute,
/* .supports_op = */ ggml_backend_blas_supports_op,
/* .supports_buft = */ ggml_backend_blas_supports_buft,
/* .offload_op = */ NULL,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
};
@ -283,18 +337,18 @@ ggml_backend_t ggml_backend_blas_init(void) {
ggml_backend_t backend = new ggml_backend {
/* .guid = */ ggml_backend_blas_guid(),
/* .interface = */ blas_backend_i,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
/* .device = */ nullptr,
/* .context = */ ctx,
};
#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
#if !defined(NDEBUG) && defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
if (openblas_get_parallel() != OPENBLAS_OPENMP) {
GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
fprintf(stderr, "%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
}
#endif
#if defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
#if !defined(NDEBUG) && defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
fprintf(stderr, "%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
#endif
return backend;
@ -310,205 +364,3 @@ void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads)
ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context;
ctx->n_threads = n_threads;
}
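
A short usage sketch for the two public entry points above (hedged; graph construction and computation are elided, and the thread count only sets the worker count stored in the backend context):

#include "ggml-blas.h"

// Sketch: create the BLAS backend and bound the number of worker threads it uses.
static ggml_backend_t make_blas_backend(int n_threads) {
    ggml_backend_t backend = ggml_backend_blas_init();
    if (backend != nullptr) {
        ggml_backend_blas_set_n_threads(backend, n_threads);
    }
    return backend;
}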
// device interface
static const char * ggml_backend_blas_device_get_name(ggml_backend_dev_t dev) {
return "BLAS";
GGML_UNUSED(dev);
}
static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t dev) {
#if defined(GGML_BLAS_USE_ACCELERATE)
return "Accelerate";
#elif defined(GGML_BLAS_USE_MKL)
return "MKL";
#elif defined(GGML_BLAS_USE_BLIS)
return "BLIS";
#elif defined(GGML_BLAS_USE_NVPL)
return "NVPL";
#elif defined(OPENBLAS_VERSION)
return "OpenBLAS";
#else
return "BLAS";
#endif
GGML_UNUSED(dev);
}
static void ggml_backend_blas_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
// TODO
*free = 0;
*total = 0;
GGML_UNUSED(dev);
}
static enum ggml_backend_dev_type ggml_backend_blas_device_get_type(ggml_backend_dev_t dev) {
return GGML_BACKEND_DEVICE_TYPE_ACCEL;
GGML_UNUSED(dev);
}
static void ggml_backend_blas_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
props->name = ggml_backend_blas_device_get_name(dev);
props->description = ggml_backend_blas_device_get_description(dev);
props->type = ggml_backend_blas_device_get_type(dev);
ggml_backend_blas_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = {
/* .async = */ false,
/* .host_buffer = */ false,
/* .buffer_from_host_ptr = */ true,
/* .events = */ false,
};
}
static ggml_backend_t ggml_backend_blas_device_init_backend(ggml_backend_dev_t dev, const char * params) {
return ggml_backend_blas_init();
GGML_UNUSED(dev);
GGML_UNUSED(params);
}
static ggml_backend_buffer_type_t ggml_backend_blas_device_get_buffer_type(ggml_backend_dev_t dev) {
return ggml_backend_cpu_buffer_type();
GGML_UNUSED(dev);
}
static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
GGML_UNUSED(dev);
GGML_UNUSED(max_tensor_size);
}
static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * src1 = op->src[1];
switch (op->op) {
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
return true;
case GGML_OP_MUL_MAT:
{
// BLAS usually is only faster for large matrices
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * src1 = op->src[1];
const int64_t ne10 = src1->ne[0];
const int64_t ne0 = op->ne[0];
const int64_t ne1 = op->ne[1];
// TODO: find the optimal value
const int64_t min_batch = 32;
return ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) &&
src1->type == GGML_TYPE_F32 &&
(ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch) &&
(src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL);
}
case GGML_OP_OUT_PROD:
return op->src[0]->type == GGML_TYPE_F32 &&
op->src[1]->type == GGML_TYPE_F32 &&
ggml_is_matrix(src0) &&
ggml_is_matrix(src1) &&
ggml_is_contiguous(src0) &&
(ggml_is_contiguous(src1) || ggml_is_transposed(src1)) &&
(src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL);
default:
return false;
}
GGML_UNUSED(dev);
}
static bool ggml_backend_blas_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
return ggml_backend_buft_is_host(buft);
GGML_UNUSED(dev);
}
static const struct ggml_backend_device_i ggml_backend_blas_device_i = {
/* .get_name = */ ggml_backend_blas_device_get_name,
/* .get_description = */ ggml_backend_blas_device_get_description,
/* .get_memory = */ ggml_backend_blas_device_get_memory,
/* .get_type = */ ggml_backend_blas_device_get_type,
/* .get_props = */ ggml_backend_blas_device_get_props,
/* .init_backend = */ ggml_backend_blas_device_init_backend,
/* .get_buffer_type = */ ggml_backend_blas_device_get_buffer_type,
/* .get_host_buffer_type = */ NULL,
/* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_host_ptr,
/* .supports_op = */ ggml_backend_blas_device_supports_op,
/* .supports_buft = */ ggml_backend_blas_device_supports_buft,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_synchronize = */ NULL,
};
// backend reg interface
static const char * ggml_backend_blas_reg_get_name(ggml_backend_reg_t reg) {
return "BLAS";
GGML_UNUSED(reg);
}
static size_t ggml_backend_blas_reg_get_device_count(ggml_backend_reg_t reg) {
return 1;
GGML_UNUSED(reg);
}
static ggml_backend_dev_t ggml_backend_blas_reg_get_device(ggml_backend_reg_t reg, size_t index) {
GGML_ASSERT(index == 0);
static ggml_backend_device ggml_backend_blas_device = {
/* .iface = */ ggml_backend_blas_device_i,
/* .reg = */ reg,
/* .context = */ nullptr,
};
return &ggml_backend_blas_device;
GGML_UNUSED(reg);
GGML_UNUSED(index);
}
static void * ggml_backend_blas_get_proc_address(ggml_backend_reg_t reg, const char * name) {
if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) {
return (void *)ggml_backend_blas_set_n_threads;
}
return NULL;
GGML_UNUSED(reg);
GGML_UNUSED(name);
}
static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
/* .get_name = */ ggml_backend_blas_reg_get_name,
/* .get_device_count = */ ggml_backend_blas_reg_get_device_count,
/* .get_device = */ ggml_backend_blas_reg_get_device,
/* .get_proc_address = */ ggml_backend_blas_get_proc_address,
};
ggml_backend_reg_t ggml_backend_blas_reg(void) {
static struct ggml_backend_reg ggml_backend_blas_reg = {
/* .iface = */ ggml_backend_blas_reg_i,
/* .context = */ NULL,
};
return &ggml_backend_blas_reg;
}

View File

@ -1,90 +0,0 @@
if (GGML_STATIC)
set(BLA_STATIC ON)
endif()
#if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
# set(BLA_SIZEOF_INTEGER 8)
#endif()
set(BLA_VENDOR ${GGML_BLAS_VENDOR})
find_package(BLAS)
if (BLAS_FOUND)
message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
add_library(ggml-blas
ggml-blas.cpp
)
target_link_libraries(ggml-blas PRIVATE ggml-base)
target_include_directories(ggml-blas PRIVATE . ..)
if (${GGML_BLAS_VENDOR} MATCHES "Apple")
add_compile_definitions(ACCELERATE_NEW_LAPACK)
add_compile_definitions(ACCELERATE_LAPACK_ILP64)
add_compile_definitions(GGML_BLAS_USE_ACCELERATE)
elseif ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
# BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
# see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
find_package(PkgConfig REQUIRED)
if (${GGML_BLAS_VENDOR} MATCHES "Generic")
pkg_check_modules(DepBLAS blas)
elseif (${GGML_BLAS_VENDOR} MATCHES "OpenBLAS")
# As of OpenBLAS v0.3.22, the 64-bit build is named openblas64.pc
pkg_check_modules(DepBLAS openblas64)
if (NOT DepBLAS_FOUND)
pkg_check_modules(DepBLAS openblas)
endif()
elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME")
add_compile_definitions(GGML_BLAS_USE_BLIS)
pkg_check_modules(DepBLAS blis)
elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS")
pkg_check_modules(DepBLAS blas-atlas)
elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS")
pkg_check_modules(DepBLAS flexiblas_api)
elseif (${GGML_BLAS_VENDOR} MATCHES "Intel")
add_compile_definitions(GGML_BLAS_USE_MKL)
# all Intel* libraries share the same include path
pkg_check_modules(DepBLAS mkl-sdl)
elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC")
# NVHPC does not ship pkg-config files;
# consider setting BLAS_INCLUDE_DIRS manually
if ("${NVHPC_VERSION}" STREQUAL "")
message(WARNING "It is recommended to set NVHPC_VERSION")
else()
set(DepBLAS_FOUND ON)
set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include")
endif()
endif()
if (DepBLAS_FOUND)
set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS})
else()
message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically"
" detected by pkgconfig, trying to find cblas.h from possible paths...")
find_path(BLAS_INCLUDE_DIRS
NAMES cblas.h
HINTS
/usr/include
/usr/local/include
/usr/include/openblas
/opt/homebrew/opt/openblas/include
/usr/local/opt/openblas/include
/usr/include/x86_64-linux-gnu/openblas/include
)
endif()
endif()
message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS})
if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
add_compile_definitions(GGML_BLAS_USE_MKL)
endif()
target_link_libraries (ggml-blas PRIVATE ${BLAS_LIBRARIES})
target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
else()
message(FATAL_ERROR "BLAS not found, please refer to "
"https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
" to set correct GGML_BLAS_VENDOR")
endif()

View File

@ -39,8 +39,6 @@
#include "ggml-common.h"
#define GGML_CANN_NAME "CANN"
/**
* @brief Handles CANN errors by printing an error message and aborting.
*
@ -489,6 +487,23 @@ struct ggml_backend_cann_buffer_context {
~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
};
/**
* @brief Retrieve the name associated with a CANN buffer.
*
* This function returns the name of a CANN buffer, which is stored in the
* context of the buffer.
*
* @param buffer The CANN buffer whose name is to be retrieved.
* @return A pointer to a C-string containing the name of the buffer.
*/
static const char* ggml_backend_cann_buffer_get_name(
ggml_backend_buffer_t buffer) {
return "CANN";
GGML_UNUSED(buffer);
}
/**
* @brief Check if a buffer is a CANN buffer.
*
@ -498,10 +513,9 @@ struct ggml_backend_cann_buffer_context {
* @param buffer The buffer to check.
* @return true if the buffer is a CANN buffer, false otherwise.
*/
static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft);
static bool ggml_backend_buffer_is_cann(
ggml_backend_buffer_t buffer) {
return ggml_backend_buft_is_cann(buffer->buft);
return buffer->iface.get_name == ggml_backend_cann_buffer_get_name;
}
/**
@ -837,6 +851,13 @@ static void ggml_backend_cann_buffer_set_tensor(
void *transform_buffer = malloc(size);
ggml_backend_cann_transform(tensor, data, transform_buffer);
#ifndef NDEBUG
void *check_buffer = malloc(size);
ggml_backend_cann_transform_back(tensor, transform_buffer,
check_buffer);
GGML_ASSERT(memcmp(data, check_buffer, size) == 0);
free(check_buffer);
#endif
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
transform_buffer, size,
ACL_MEMCPY_HOST_TO_DEVICE));
@ -948,7 +969,8 @@ static void ggml_backend_cann_buffer_clear(
* This structure defines function pointers to operations that can be performed
* on a CANN buffer within the backend.
*/
static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
static ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
/* .get_name = */ ggml_backend_cann_buffer_get_name,
/* .free_buffer = */ ggml_backend_cann_buffer_free_buffer,
/* .get_base = */ ggml_backend_cann_buffer_get_base,
/* .init_tensor = */ ggml_backend_cann_buffer_init_tensor,
@ -982,10 +1004,9 @@ struct ggml_backend_cann_buffer_type_context {
*/
static const char* ggml_backend_cann_buffer_type_name(
ggml_backend_buffer_type_t buft) {
ggml_backend_cann_buffer_type_context* buft_ctx =
(ggml_backend_cann_buffer_type_context*)buft->context;
return "CANN";
return buft_ctx->name.c_str();
GGML_UNUSED(buft);
}
/**
@ -1084,25 +1105,19 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
GGML_UNUSED(buft);
}
static bool ggml_backend_cann_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return false;
GGML_UNUSED(buft);
}
/**
* @brief Interface for managing CANN buffer types in the GGML backend.
*
* Provides function pointers for allocating, querying properties, and managing
* memory for CANN buffer types in the GGML backend.
*/
static const ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = {
static ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = {
/* .get_name = */ ggml_backend_cann_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_cann_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cann_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cann_buffer_type_get_alloc_size,
/* .is_host = */ ggml_backend_cann_buffer_type_is_host,
/* .is_host = */ NULL,
};
/**
@ -1133,7 +1148,6 @@ ggml_backend_cann_buffer_type(int32_t device) {
for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) {
ggml_backend_cann_buffer_types[i] = {
/* .iface = */ ggml_backend_cann_buffer_type_interface,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
/* .context = */
new ggml_backend_cann_buffer_type_context{
i, "CANN" + std::to_string(i)},
@ -1227,6 +1241,7 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
buffer->buft = buft;
buffer->iface.get_name = ggml_backend_cann_host_buffer_name;
buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
return buffer;
@ -1248,7 +1263,7 @@ ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0),
/* .device = */ nullptr,
/* .context = */ nullptr,
};
@ -1448,6 +1463,24 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
delete backend;
}
/**
* @brief Retrieves the default buffer type associated with the CANN backend.
*
* This function returns the buffer type specific to the device associated
* with the CANN backend. It is used to allocate buffers for computations
* performed by the backend.
*
* @param backend Pointer to the CANN backend structure.
* @return Pointer to the buffer type structure for the CANN backend.
*/
static ggml_backend_buffer_type_t
ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
ggml_backend_cann_context* cann_ctx =
(ggml_backend_cann_context*)backend->context;
return ggml_backend_cann_buffer_type(cann_ctx->device);
}
/**
* @brief Sets tensor data asynchronously in the CANN backend.
*
@ -1477,6 +1510,13 @@ static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
void *transform_buffer = malloc(size);
ggml_backend_cann_transform(tensor, data, transform_buffer);
#ifndef NDEBUG
void *check_buffer = malloc(size);
ggml_backend_cann_transform_back(tensor, transform_buffer,
check_buffer);
GGML_ASSERT(memcmp(data, check_buffer, size) == 0);
free(check_buffer);
#endif
ACL_CHECK(aclrtMemcpyAsync(
(char *)tensor->data + offset, size, transform_buffer, size,
ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream()));
@ -1651,7 +1691,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(
* @return bool Returns true if the operation is supported by the backend,
* otherwise false.
*/
static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
const ggml_tensor* op) {
switch (op->op) {
case GGML_OP_UNARY:
@ -1742,7 +1782,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
return false;
}
GGML_UNUSED(dev);
GGML_UNUSED(backend);
}
/**
@ -1760,6 +1800,31 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
}
/**
* @brief Checks if the CANN backend supports a specific backend buffer type.
*
* This function determines whether the CANN backend supports the given backend
* buffer type by comparing the device context of the backend and buffer type.
* It returns true if the device of the backend context is the same as the
* device of the buffer type context.
*
* @param backend Pointer to the CANN backend.
* @param buft Pointer to the backend buffer type to check.
* @return bool Returns true if the CANN backend supports the buffer type,
* otherwise false.
*/
static bool ggml_backend_cann_supports_buft(
ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
if (ggml_backend_buft_is_cann(buft)) {
ggml_backend_cann_context * cann_ctx =
(ggml_backend_cann_context *)backend->context;
ggml_backend_cann_buffer_type_context * buft_ctx =
(ggml_backend_cann_buffer_type_context *)buft->context;
return buft_ctx->device == cann_ctx->device;
}
return false;
}
/**
* @brief Determines if a tensor operation should be offloaded to the CANN
* backend.
@ -1774,14 +1839,54 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
* @return bool Returns true if the operation should be offloaded, otherwise
* false.
*/
static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev,
static bool ggml_backend_cann_offload_op(ggml_backend_t backend,
const ggml_tensor* op) {
const int min_batch_size = 32;
GGML_UNUSED(dev);
GGML_UNUSED(backend);
return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
}
/**
* @brief Creates a new event for the CANN backend.
*
* This function initializes a new event for the CANN backend by setting the
* device and creating an ACL runtime event. The created event is then wrapped
* in a ggml_backend_event structure and returned.
*
* @param backend Pointer to the CANN backend.
* @return ggml_backend_event_t Returns a pointer to the new event structure.
*/
static ggml_backend_event_t ggml_backend_cann_event_new(
ggml_backend_t backend) {
ggml_backend_cann_context* cann_ctx =
(ggml_backend_cann_context*)backend->context;
ggml_cann_set_device(cann_ctx->device);
aclrtEvent event;
ACL_CHECK(aclrtCreateEvent(&event));
return new ggml_backend_event{
/* .backend = */ backend,
/* .context = */ event,
};
}
/**
* @brief Frees a CANN backend event.
*
* This function destroys the ACL runtime event associated with the given CANN
* backend event and then deletes the event structure itself.
*
* @param event Pointer to the event structure to be freed.
*/
static void ggml_backend_cann_event_free(ggml_backend_event_t event) {
ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context));
delete event;
}
/**
* @brief Records an event on the CANN backend stream.
*
@ -1790,9 +1895,10 @@ static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev,
*
* @param event Pointer to the event structure to be recorded.
*/
static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
static void ggml_backend_cann_event_record(ggml_backend_event_t event) {
ggml_backend_cann_context* cann_ctx =
(ggml_backend_cann_context*)backend->context;
(ggml_backend_cann_context*)event->backend->context;
ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream()));
}
@ -1810,7 +1916,8 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
ggml_backend_event_t event) {
ggml_backend_cann_context* cann_ctx =
(ggml_backend_cann_context*)backend->context;
if (ggml_backend_is_cann(backend)) {
if (ggml_backend_is_cann(event->backend)) {
ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
(aclrtEvent)event->context));
} else {
@ -1818,6 +1925,17 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
}
}
/**
* @brief Synchronizes the given event on the CANN backend.
*
* This function waits for the specified event to complete on the ACL runtime.
*
* @param event Pointer to the event structure to be synchronized.
*/
static void ggml_backend_cann_event_synchronize(ggml_backend_event_t event) {
ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context));
}
/**
* @brief Structure defining the interface for the CANN backend.
*
@ -1825,9 +1943,10 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
* supported by the CANN backend, including name retrieval, memory
* management, tensor operations, synchronization, and event handling.
*/
static const ggml_backend_i ggml_backend_cann_interface = {
static ggml_backend_i ggml_backend_cann_interface = {
/* .get_name = */ ggml_backend_cann_name,
/* .free = */ ggml_backend_cann_free,
/* .get_default_buffer_type = */ ggml_backend_cann_get_default_buffer_type,
/* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
/* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
@ -1837,6 +1956,9 @@ static const ggml_backend_i ggml_backend_cann_interface = {
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_cann_graph_compute,
/* .supports_op = */ ggml_backend_cann_supports_op,
/* .supports_buft = */ ggml_backend_cann_supports_buft,
/* .offload_op = */ ggml_backend_cann_offload_op,
/* .event_record = */ ggml_backend_cann_event_record,
/* .event_wait = */ ggml_backend_cann_event_wait,
};
@ -1855,234 +1977,6 @@ static ggml_guid_t ggml_backend_cann_guid() {
return &guid;
}
// backend device
struct ggml_backend_cann_device_context {
int device;
std::string name;
std::string description;
};
static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
return ctx->name.c_str();
}
static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
return ctx->description.c_str();
}
static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
ggml_backend_cann_get_device_memory(ctx->device, free, total);
}
static enum ggml_backend_dev_type ggml_backend_cann_device_get_type(ggml_backend_dev_t dev) {
GGML_UNUSED(dev);
return GGML_BACKEND_DEVICE_TYPE_GPU;
}
static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_cann_device_get_name(dev);
props->description = ggml_backend_cann_device_get_description(dev);
props->type = ggml_backend_cann_device_get_type(dev);
ggml_backend_cann_device_get_memory(dev, &props->memory_free, &props->memory_total);
bool host_buffer = getenv("GGML_CANN_NO_PINNED") == nullptr;
props->caps = {
/* .async = */ false,
/* .host_buffer = */ host_buffer,
/* .buffer_from_host_ptr = */ false,
/* .events = */ true,
};
}
static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) {
GGML_UNUSED(params);
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
return ggml_backend_cann_init(ctx->device);
}
/**
* @brief Checks if the CANN backend supports a specific backend buffer type.
*
* This function determines whether the CANN backend supports the given backend
* buffer type by comparing the device context of the backend and buffer type.
* It returns true if the device of the backend context is the same as the
* device of the buffer type context.
*
* @param backend Pointer to the CANN backend.
* @param buft Pointer to the backend buffer type to check.
* @return bool Returns true if the CANN backend supports the buffer type,
* otherwise false.
*/
static bool ggml_backend_cann_supports_buft(
ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
if (ggml_backend_buft_is_cann(buft)) {
ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
ggml_backend_cann_buffer_type_context * buft_ctx =
(ggml_backend_cann_buffer_type_context *)buft->context;
return buft_ctx->device == dev_ctx->device;
}
return false;
}
static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) {
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
return ggml_backend_cann_buffer_type(ctx->device);
}
static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(ggml_backend_dev_t dev) {
GGML_UNUSED(dev);
return ggml_backend_cann_host_buffer_type();
}
/**
* @brief Creates a new event for the CANN backend device.
*
* This function initializes a new event for the CANN backend by setting the
* device and creating an ACL runtime event. The created event is then wrapped
* in a ggml_backend_event structure and returned.
*
* @param backend Pointer to the CANN backend.
* @return ggml_backend_event_t Returns a pointer to the new event structure.
*/
static ggml_backend_event_t ggml_backend_cann_device_event_new(
ggml_backend_dev_t dev) {
ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
ggml_cann_set_device(dev_ctx->device);
aclrtEvent event;
ACL_CHECK(aclrtCreateEvent(&event));
return new ggml_backend_event{
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), dev_ctx->device),
/* .context = */ event,
};
}
/**
* @brief Frees a CANN backend event.
*
* This function destroys the ACL runtime event associated with the given CANN
* backend event and then deletes the event structure itself.
*
* @param event Pointer to the event structure to be freed.
*/
static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context));
delete event;
GGML_UNUSED(dev);
}
/**
* @brief Synchronizes the given event on the CANN backend.
*
* This function waits for the specified event to complete on the ACL runtime.
*
* @param event Pointer to the event structure to be synchronized.
*/
static void ggml_backend_cann_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context));
GGML_UNUSED(dev);
}
static const ggml_backend_device_i ggml_backend_cann_device_interface = {
/* .get_name = */ ggml_backend_cann_device_get_name,
/* .get_description = */ ggml_backend_cann_device_get_description,
/* .get_memory = */ ggml_backend_cann_device_get_memory,
/* .get_type = */ ggml_backend_cann_device_get_type,
/* .get_props = */ ggml_backend_cann_device_get_props,
/* .init_backend = */ ggml_backend_cann_device_init, // called for every card
/* .get_buffer_type = */ ggml_backend_cann_device_get_buffer_type,
/* .get_host_buffer_type = */ ggml_backend_cann_device_get_host_buffer_type,
/* .buffer_from_host_ptr = */ NULL, // not supported for CANN
/* .supports_op = */ ggml_backend_cann_supports_op,
/* .supports_buft = */ ggml_backend_cann_supports_buft,
/* .offload_op = */ ggml_backend_cann_offload_op,
/* .event_new = */ ggml_backend_cann_device_event_new,
/* .event_free = */ ggml_backend_cann_device_event_free,
/* .event_synchronize = */ ggml_backend_cann_device_event_synchronize,
};
// backend reg
struct ggml_backend_cann_reg_context {
std::vector<ggml_backend_dev_t> devices;
};
static const char * ggml_backend_cann_reg_get_name(ggml_backend_reg_t reg) {
GGML_UNUSED(reg);
return GGML_CANN_NAME;
}
static size_t ggml_backend_cann_reg_get_device_count(ggml_backend_reg_t reg) {
ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
return ctx->devices.size();
}
static ggml_backend_dev_t ggml_backend_cann_reg_get_device(ggml_backend_reg_t reg, size_t index) {
ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
GGML_ASSERT(index < ctx->devices.size());
return ctx->devices[index];
}
static void * ggml_backend_cann_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
GGML_UNUSED(reg);
GGML_UNUSED(name);
// reserved for future use
return nullptr;
}
static const ggml_backend_reg_i ggml_backend_cann_reg_interface = {
/* .get_name = */ ggml_backend_cann_reg_get_name,
/* .get_device_count = */ ggml_backend_cann_reg_get_device_count,
/* .get_device = */ ggml_backend_cann_reg_get_device,
/* .get_proc_address = */ ggml_backend_cann_reg_get_proc_address,
};
// backend registry, called only once for cann backend
ggml_backend_reg_t ggml_backend_cann_reg() {
static ggml_backend_reg reg;
static bool initialized = false;
{
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
aclInit(nullptr);
ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
for (int i = 0; i < ggml_cann_info().device_count; i++) {
ggml_backend_cann_device_context* dev_ctx = new ggml_backend_cann_device_context();
dev_ctx->description = aclrtGetSocName();
dev_ctx->device = i;
dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
ggml_cann_set_device(i);
ggml_backend_dev_t dev = new ggml_backend_device {
/* .interface = */ ggml_backend_cann_device_interface,
/* .reg = */ &reg,
/* .context = */ dev_ctx
};
ctx->devices.push_back(dev);
}
reg = ggml_backend_reg {
/* .interface = */ ggml_backend_cann_reg_interface,
/* .context = */ ctx
};
}
initialized = true;
}
return &reg;
}
ggml_backend_t ggml_backend_cann_init(int32_t device) {
aclInit(nullptr);
if (device < 0 || device >= ggml_backend_cann_get_device_count()) {
@ -2099,7 +1993,7 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) {
ggml_backend_t cann_backend =
new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
/* .interface = */ ggml_backend_cann_interface,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
/* .device = */ nullptr,
/* .context = */ ctx};
return cann_backend;

View File

@ -1,46 +0,0 @@
if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME})
set(CANN_INSTALL_DIR $ENV{ASCEND_TOOLKIT_HOME})
message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
endif()
if (CANN_INSTALL_DIR)
# Only Linux is supported.
if (NOT UNIX)
message(FATAL_ERROR "CANN: the CANN toolkit supports Unix, but not ${CMAKE_SYSTEM_NAME}")
endif()
# Supported platforms: x86-64, arm64
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64")
else()
message(FATAL_ERROR "CANN: CANN toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}")
endif()
# Set header and libs
set(CANN_INCLUDE_DIRS
${CANN_INSTALL_DIR}/include
${CANN_INSTALL_DIR}/include/aclnn
${CANN_INSTALL_DIR}/acllib/include
)
add_subdirectory(kernels)
list(APPEND CANN_LIBRARIES
ascendcl
nnopbase
opapi
acl_op_compiler
ascendc_kernels
)
file(GLOB GGML_SOURCES_CANN "*.cpp")
add_library(ggml-cann ${GGML_SOURCES_CANN})
target_link_libraries(ggml-cann PRIVATE ggml-base ${CANN_LIBRARIES})
target_include_directories(ggml-cann PRIVATE . .. ${CANN_INCLUDE_DIRS})
target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64)
message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
else()
message(FATAL_ERROR "CANN: Can't find CANN_INSTALL_DIR, did you forget to source set_var.sh?")
endif()

View File

@ -27,6 +27,80 @@ extern "C" {
#endif
/**
* Converts brain16 to float32.
*
* The bfloat16 ("brain16") floating point format packs, from the most
* significant bit down: 1 sign bit, 8 exponent bits and 7 mantissa bits
* into 16 bits.
*
* Since bf16 has the same number of exponent bits as a 32-bit float
* (IEEE binary32: 1 sign bit, 8 exponent bits, 23 mantissa bits),
* encoding and decoding are relatively straightforward: decoding shifts
* the 16 stored bits into the upper half of a binary32, and encoding
* keeps the upper 16 bits of the binary32 value (with rounding).
*
* For comparison, the standard fp16 format (IEEE binary16: 1 sign bit,
* 5 exponent bits, 10 mantissa bits) has fewer exponent bits.
*
* @see IEEE 754-2008
*/
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
union {
float f;
uint32_t i;
} u;
u.i = (uint32_t)h.bits << 16;
return u.f;
}
/**
* Converts float32 to brain16.
*
* This is binary identical with Google Brain float conversion.
* Floats shall round to nearest even, and NANs shall be quiet.
* Subnormals aren't flushed to zero, except perhaps when used.
* This code should vectorize nicely if using modern compilers.
*/
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
ggml_bf16_t h;
union {
float f;
uint32_t i;
} u;
u.f = s;
if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
h.bits = (u.i >> 16) | 64; /* force to quiet */
return h;
}
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
return h;
}
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
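
Because only 7 mantissa bits survive, nearby float values collapse onto the same bf16 code. Below is a standalone, self-contained sketch of the same truncation/rounding logic, duplicated here so it compiles on its own, independent of the ggml types above:

#include <cstdint>
#include <cstdio>
#include <cstring>

// bf16 is the upper 16 bits of an IEEE binary32 value (1 sign, 8 exponent, 7 mantissa bits).
static uint16_t fp32_to_bf16_bits(float f) {
    uint32_t u;
    std::memcpy(&u, &f, sizeof(u));
    if ((u & 0x7fffffff) > 0x7f800000) {                        // NaN: force to quiet, as above
        return (uint16_t)((u >> 16) | 64);
    }
    return (uint16_t)((u + (0x7fff + ((u >> 16) & 1))) >> 16);  // round to nearest even
}
static float bf16_bits_to_fp32(uint16_t h) {
    uint32_t u = (uint32_t)h << 16;
    float f;
    std::memcpy(&f, &u, sizeof(f));
    return f;
}
int main() {
    float x = 1.0f + 1.0f/256;                      // needs 8 mantissa bits
    float y = bf16_bits_to_fp32(fp32_to_bf16_bits(x));
    std::printf("%.8f -> %.8f\n", x, y);            // 1.00390625 -> 1.00000000 (ties to even)
    return 0;
}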
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
#ifndef __FMA__
@ -314,6 +388,28 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
#endif // defined(__ARM_NEON)
#if defined(__ARM_NEON) && !defined(_MSC_VER)
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
ggml_fp16_internal_t tmp;
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
return (float)tmp;
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
ggml_fp16_t res;
ggml_fp16_internal_t tmp = f;
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
return res;
}
#else
#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#else
@ -366,6 +462,153 @@ static __m256 __lasx_xvreplfr2vr_s(float val) {
}
#endif
#ifdef __F16C__
#ifdef _MSC_VER
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
#else
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
#endif
#elif defined(__POWER9_VECTOR__)
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
/* the inline asm below is about 12% faster than the lookup method */
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
register float f;
register double d;
__asm__(
"mtfprd %0,%2\n"
"xscvhpdp %0,%0\n"
"frsp %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=f"(f):
/* in */ "r"(h));
return f;
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
register double d;
register ggml_fp16_t r;
__asm__( /* xscvdphp can work on double or single precision */
"xscvdphp %0,%2\n"
"mffprd %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=r"(r):
/* in */ "f"(f));
return r;
}
#else
// FP16 <-> FP32
// ref: https://github.com/Maratyszcza/FP16
static inline float fp32_from_bits(uint32_t w) {
union {
uint32_t as_bits;
float as_value;
} fp32;
fp32.as_bits = w;
return fp32.as_value;
}
static inline uint32_t fp32_to_bits(float f) {
union {
float as_value;
uint32_t as_bits;
} fp32;
fp32.as_value = f;
return fp32.as_bits;
}
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
const uint32_t w = (uint32_t) h << 16;
const uint32_t sign = w & UINT32_C(0x80000000);
const uint32_t two_w = w + w;
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float exp_scale = 0x1.0p-112f;
#else
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
#endif
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
const uint32_t magic_mask = UINT32_C(126) << 23;
const float magic_bias = 0.5f;
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
const uint32_t result = sign |
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
return fp32_from_bits(result);
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float scale_to_inf = 0x1.0p+112f;
const float scale_to_zero = 0x1.0p-110f;
#else
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
const uint32_t w = fp32_to_bits(f);
const uint32_t shl1_w = w + w;
const uint32_t sign = w & UINT32_C(0x80000000);
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
if (bias < UINT32_C(0x71000000)) {
bias = UINT32_C(0x71000000);
}
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
const uint32_t bits = fp32_to_bits(base);
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
const uint32_t nonsign = exp_bits + mantissa_bits;
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#endif // __F16C__
#endif // defined(__ARM_NEON) && !defined(_MSC_VER)
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif // __ARM_FEATURE_SVE
// precomputed f32 table for f16 (256 KB)
// defined in ggml.c, initialized in ggml_init()
extern float ggml_table_f32_f16[1 << 16];
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
#if !defined(GGML_FP16_TO_FP32)
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
uint16_t s;
memcpy(&s, &f, sizeof(uint16_t));
return ggml_table_f32_f16[s];
}
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#endif
#if !defined(GGML_FP32_TO_FP16)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif
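
The 256 KB table above trades memory for conversion speed on targets without a fast hardware f16 path. A hedged sketch of how such a table can be filled once at startup; the actual initialization lives in ggml_init(), as the comment above notes, and the loop below only restates the idea using the names declared in this header:

// Sketch: populate the 65536-entry f16 -> f32 lookup table once at startup.
static void ggml_fp16_table_init_sketch(void) {
    for (uint32_t i = 0; i < (1u << 16); ++i) {
        uint16_t bits = (uint16_t) i;
        ggml_fp16_t h;
        memcpy(&h, &bits, sizeof(h));             // reinterpret the raw bit pattern
        ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(h);
    }
}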
#ifdef __cplusplus
}
#endif

View File

@ -1,261 +0,0 @@
add_library(ggml-cpu
ggml-cpu.c
ggml-cpu.cpp
ggml-cpu-aarch64.c
ggml-cpu-aarch64.h
ggml-cpu-quants.c
ggml-cpu-quants.h
)
target_link_libraries(ggml-cpu PRIVATE ggml-base)
target_include_directories(ggml-cpu PRIVATE . ..)
if (APPLE AND GGML_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate)
if (ACCELERATE_FRAMEWORK)
message(STATUS "Accelerate framework found")
add_compile_definitions(GGML_USE_ACCELERATE)
add_compile_definitions(ACCELERATE_NEW_LAPACK)
add_compile_definitions(ACCELERATE_LAPACK_ILP64)
target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK})
else()
message(WARNING "Accelerate framework not found")
endif()
endif()
if (GGML_OPENMP)
find_package(OpenMP)
if (OpenMP_FOUND)
message(STATUS "OpenMP found")
add_compile_definitions(GGML_USE_OPENMP)
target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
# FIXME: should be replaced with a compiler id check
#if (GGML_MUSA)
# list(APPEND GGML_CPU_EXTRA_INCLUDES "/usr/lib/llvm-14/lib/clang/14.0.0/include")
# list(APPEND GGML_CPU_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so")
#endif()
else()
message(WARNING "OpenMP not found")
endif()
endif()
if (GGML_LLAMAFILE)
message(STATUS "Using llamafile")
add_compile_definitions(GGML_USE_LLAMAFILE)
target_sources(ggml-cpu PRIVATE
llamafile/sgemm.cpp
llamafile/sgemm.h)
endif()
if (GGML_CPU_HBM)
find_library(memkind memkind REQUIRED)
message(STATUS "Using memkind for CPU HBM")
add_compile_definitions(GGML_USE_CPU_HBM)
target_link_libraries(ggml-cpu PUBLIC memkind)
endif()
if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
(NOT CMAKE_OSX_ARCHITECTURES AND
NOT CMAKE_GENERATOR_PLATFORM_LWR AND
CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
message(STATUS "ARM detected")
if (MSVC)
add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
add_compile_definitions(__ARM_NEON)
add_compile_definitions(__ARM_FEATURE_FMA)
set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
if (GGML_COMPILER_SUPPORT_DOTPROD)
add_compile_definitions(__ARM_FEATURE_DOTPROD)
endif ()
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
endif ()
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
endif ()
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
else()
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
list(APPEND ARCH_FLAGS -mfp16-format=ieee)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
# Raspberry Pi 1, Zero
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
# Android armeabi-v7a
list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
else()
# Raspberry Pi 2
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
endif()
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
# Android arm64-v8a
# Raspberry Pi 3, 4, Zero 2 (32-bit)
list(APPEND ARCH_FLAGS -mno-unaligned-access)
endif()
if (GGML_SVE)
list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
endif()
endif()
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
message(STATUS "x86 detected")
if (MSVC)
# instruction set detection for MSVC only
if (GGML_NATIVE)
# TODO: improve, should not reference files from the parent folder
include(cmake/FindSIMD.cmake)
endif ()
if (GGML_AVX512)
list(APPEND ARCH_FLAGS /arch:AVX512)
# MSVC has no compile-time flags enabling specific
# AVX512 extensions, nor does it define the
# macros corresponding to the extensions.
# Do it manually.
if (GGML_AVX512_VBMI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
list(APPEND ARCH_FLAGS -mavx512vbmi)
endif()
endif()
if (GGML_AVX512_VNNI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
list(APPEND ARCH_FLAGS -mavx512vnni)
endif()
endif()
if (GGML_AVX512_BF16)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
list(APPEND ARCH_FLAGS -mavx512bf16)
endif()
endif()
if (GGML_AMX_TILE)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
endif()
if (GGML_AMX_INT8)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
endif()
if (GGML_AMX_BF16)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
endif()
elseif (GGML_AVX2)
list(APPEND ARCH_FLAGS /arch:AVX2)
elseif (GGML_AVX)
list(APPEND ARCH_FLAGS /arch:AVX)
endif()
else()
if (GGML_NATIVE)
list(APPEND ARCH_FLAGS -march=native)
endif()
if (GGML_F16C)
list(APPEND ARCH_FLAGS -mf16c)
endif()
if (GGML_FMA)
list(APPEND ARCH_FLAGS -mfma)
endif()
if (GGML_AVX)
list(APPEND ARCH_FLAGS -mavx)
endif()
if (GGML_AVX2)
list(APPEND ARCH_FLAGS -mavx2)
endif()
if (GGML_AVX512)
list(APPEND ARCH_FLAGS -mavx512f)
list(APPEND ARCH_FLAGS -mavx512dq)
list(APPEND ARCH_FLAGS -mavx512bw)
endif()
if (GGML_AVX512_VBMI)
list(APPEND ARCH_FLAGS -mavx512vbmi)
endif()
if (GGML_AVX512_VNNI)
list(APPEND ARCH_FLAGS -mavx512vnni)
endif()
if (GGML_AVX512_BF16)
list(APPEND ARCH_FLAGS -mavx512bf16)
endif()
if (GGML_AMX_TILE)
list(APPEND ARCH_FLAGS -mamx-tile)
endif()
if (GGML_AMX_INT8)
list(APPEND ARCH_FLAGS -mamx-int8)
endif()
if (GGML_AMX_BF16)
list(APPEND ARCH_FLAGS -mamx-bf16)
endif()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PowerPC detected")
execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
string(FIND "${POWER10_M}" "POWER10" substring_index)
if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
set(substring_index -1)
endif()
if (${substring_index} GREATER_EQUAL 0)
list(APPEND ARCH_FLAGS -mcpu=power10)
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
else()
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10 (MMA), and query for big-endian systems (ppc64/le/be)
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
message(STATUS "loongarch64 detected")
list(APPEND ARCH_FLAGS -march=loongarch64)
if (GGML_LASX)
list(APPEND ARCH_FLAGS -mlasx)
endif()
if (GGML_LSX)
list(APPEND ARCH_FLAGS -mlsx)
endif()
else()
message(STATUS "Unknown architecture")
endif()
if (GGML_CPU_AARCH64)
message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels")
add_compile_definitions(GGML_USE_CPU_AARCH64)
endif()
target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
if (EMSCRIPTEN)
set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")
endif()

File diff suppressed because it is too large

View File

@ -1,30 +0,0 @@
#pragma once
#include "ggml.h"
// GGML internal header
#ifdef __cplusplus
extern "C" {
#endif
// Quantization
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
// GEMV
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
// GEMM
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * data, size_t data_size);
enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur);
#ifdef __cplusplus
}
#endif
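// --- hedged usage sketch (not part of the diff) ---------------------------------
// The two helpers above drive the runtime repacking enabled by the GGML_CPU_AARCH64
// option in the CMake section earlier; the call order below mirrors the CPU backend's
// aarch64 set_tensor callback further down. 'tensor' and 'data' are assumed to be a
// loaded Q4_0 weight tensor and its raw bytes; ggml_nbytes() comes from ggml.h.
enum ggml_type repack_type = ggml_aarch64_get_optimal_repack_type(tensor);
if (repack_type != tensor->type) { // returns GGML_TYPE_Q4_0 when no interleaved layout applies
    ggml_aarch64_repack_tensor(tensor, repack_type, data, ggml_nbytes(tensor));
}
// ---------------------------------------------------------------------------------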

File diff suppressed because it is too large

View File

@ -1,63 +0,0 @@
#pragma once
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "ggml.h"
// GGML CPU internal header
#ifdef __cplusplus
extern "C" {
#endif
// Quantization
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
// Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
#ifdef __cplusplus
}
#endif
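// --- hedged usage sketch (not part of the diff) ---------------------------------
// The declarations above pair up by convention: weights quantized with quantize_row_X
// are consumed by ggml_vec_dot_X_Y against activations quantized to type Y. A minimal
// Q4_0/Q8_0 example; ggml_row_size() and the GGML_TYPE_* enums are assumed from the
// public ggml.h, <vector> from the standard library, and k must be a multiple of 32.
static float q4_0_q8_0_dot_example(const float * w, const float * x, int64_t k) {
    std::vector<uint8_t> wq(ggml_row_size(GGML_TYPE_Q4_0, k)); // packed Q4_0 weight row
    std::vector<uint8_t> xq(ggml_row_size(GGML_TYPE_Q8_0, k)); // packed Q8_0 activation row
    quantize_row_q4_0(w, wq.data(), k);
    quantize_row_q8_0(x, xq.data(), k);
    float dot = 0.0f;
    ggml_vec_dot_q4_0_q8_0((int) k, &dot, 0, wq.data(), 0, xq.data(), 0, 1); // nrc == 1: single result
    return dot;
}
// ---------------------------------------------------------------------------------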

File diff suppressed because it is too large

View File

@ -1,663 +0,0 @@
#include "ggml-backend.h"
#include "ggml-backend-impl.h"
#include "ggml-cpu.h"
#include "ggml-cpu-aarch64.h"
#include "ggml-impl.h"
#include <cctype>
#include <string>
#include <vector>
#if defined(__APPLE__)
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#endif
// ggml-backend interface
#ifdef GGML_USE_CPU_HBM
// buffer type HBM
#include <hbwmalloc.h>
static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return "CPU_HBM";
GGML_UNUSED(buft);
}
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
hbw_free(buffer->context);
}
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
void * ptr;
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
if (result != 0) {
GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
return NULL;
}
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft;
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
return buffer;
}
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
/* .iface = */ {
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
},
/* .context = */ NULL,
};
return &ggml_backend_cpu_buffer_type_hbm;
}
#endif
// buffer type AARCH64
static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
tensor->extra = (void *)ggml_aarch64_get_optimal_repack_type(tensor); // NOLINT
GGML_UNUSED(buffer);
}
static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor));
enum ggml_type repack_type = (enum ggml_type)(intptr_t)tensor->extra;
ggml_aarch64_repack_tensor(tensor, repack_type, data, size);
GGML_UNUSED(buffer);
}
static const char * ggml_backend_cpu_aarch64_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return "CPU_AARCH64";
GGML_UNUSED(buft);
}
static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
auto * buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
if (buffer == NULL) {
return NULL;
}
buffer->buft = buft;
buffer->iface.init_tensor = ggml_backend_cpu_aarch64_buffer_init_tensor;
buffer->iface.set_tensor = ggml_backend_cpu_aarch64_buffer_set_tensor;
return buffer;
}
ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) {
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_aarch64 = {
/* .iface = */ {
/* .get_name = */ ggml_backend_cpu_aarch64_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cpu_aarch64_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .is_host = */ NULL,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
/* .context = */ NULL,
};
return &ggml_backend_cpu_buffer_type_aarch64;
}
bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft) {
return buft == ggml_backend_cpu_aarch64_buffer_type();
}
static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
static std::vector<ggml_backend_buffer_type_t> bufts = []() {
std::vector<ggml_backend_buffer_type_t> bufts;
#ifdef GGML_USE_CPU_HBM
bufts.push_back(ggml_backend_cpu_hbm_buffer_type());
#endif
#ifdef GGML_USE_CPU_AARCH64
bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
#endif
bufts.push_back(NULL);
return bufts;
}();
return bufts.data();
GGML_UNUSED(device);
}
// CPU backend - backend (stream)
struct ggml_backend_cpu_context {
int n_threads;
ggml_threadpool_t threadpool;
uint8_t * work_data;
size_t work_size;
ggml_abort_callback abort_callback;
void * abort_callback_data;
};
static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
return "CPU";
GGML_UNUSED(backend);
}
static void ggml_backend_cpu_free(ggml_backend_t backend) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
delete[] cpu_ctx->work_data;
delete cpu_ctx;
delete backend;
}
struct ggml_backend_plan_cpu {
struct ggml_cplan cplan;
struct ggml_cgraph cgraph;
};
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu;
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
if (cpu_plan->cplan.work_size > 0) {
cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
if (cpu_plan->cplan.work_data == NULL) {
delete cpu_plan;
return NULL;
}
}
cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
return cpu_plan;
}
static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
delete[] cpu_plan->cplan.work_data;
delete cpu_plan;
GGML_UNUSED(backend);
}
static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
GGML_UNUSED(backend);
}
static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
if (cpu_ctx->work_size < cplan.work_size) {
delete[] cpu_ctx->work_data;
cpu_ctx->work_data = new uint8_t[cplan.work_size];
if (cpu_ctx->work_data == NULL) {
cpu_ctx->work_size = 0;
return GGML_STATUS_ALLOC_FAILED;
}
cpu_ctx->work_size = cplan.work_size;
}
cplan.work_data = (uint8_t *)cpu_ctx->work_data;
cplan.abort_callback = cpu_ctx->abort_callback;
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
return ggml_graph_compute(cgraph, &cplan);
}
static const struct ggml_backend_i ggml_backend_cpu_i = {
/* .get_name = */ ggml_backend_cpu_get_name,
/* .free = */ ggml_backend_cpu_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
};
static ggml_guid_t ggml_backend_cpu_guid(void) {
static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
return &guid;
}
ggml_backend_t ggml_backend_cpu_init(void) {
// initialize CPU backend now to avoid slowing the first graph computation
ggml_cpu_init();
struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context;
if (ctx == NULL) {
return NULL;
}
ctx->n_threads = GGML_DEFAULT_N_THREADS;
ctx->threadpool = NULL;
ctx->work_data = NULL;
ctx->work_size = 0;
ctx->abort_callback = NULL;
ctx->abort_callback_data = NULL;
ggml_backend_t cpu_backend = new ggml_backend {
/* .guid = */ ggml_backend_cpu_guid(),
/* .interface = */ ggml_backend_cpu_i,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
/* .context = */ ctx,
};
if (cpu_backend == NULL) {
delete ctx;
return NULL;
}
return cpu_backend;
}
bool ggml_backend_is_cpu(ggml_backend_t backend) {
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
}
void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
ctx->n_threads = n_threads;
}
void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
if (ctx->threadpool && ctx->threadpool != threadpool) {
// already had a different threadpool, pause/suspend it before switching
ggml_threadpool_pause(ctx->threadpool);
}
ctx->threadpool = threadpool;
}
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
ctx->abort_callback = abort_callback;
ctx->abort_callback_data = abort_callback_data;
}
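// --- hedged usage sketch (not part of the diff) ---------------------------------
// The three setters above are the whole user-facing tuning surface of a CPU backend
// stream; ggml_backend_graph_compute() and ggml_backend_free() are assumed from the
// public ggml-backend.h header.
ggml_backend_t backend = ggml_backend_cpu_init();
if (backend != NULL && ggml_backend_is_cpu(backend)) {
    ggml_backend_cpu_set_n_threads(backend, 8);               // cap the worker thread count
    ggml_backend_cpu_set_abort_callback(backend, NULL, NULL); // no cancellation hook
    // ... build a ggml_cgraph and run it: ggml_backend_graph_compute(backend, graph);
}
ggml_backend_free(backend); // assumed counterpart of ggml_backend_cpu_init()
// ---------------------------------------------------------------------------------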
// CPU backend - device
struct ggml_backend_cpu_device_context {
std::string description = "CPU";
ggml_backend_cpu_device_context() {
#ifdef __APPLE__
size_t len = 0;
if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
description.resize(len);
sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
}
#elif defined(__linux__)
FILE * f = fopen("/proc/cpuinfo", "r");
if (f) {
char buf[1024];
while (fgets(buf, sizeof(buf), f)) {
if (strncmp(buf, "model name", 10) == 0) {
char * p = strchr(buf, ':');
if (p) {
p++;
while (std::isspace(*p)) {
p++;
}
while (std::isspace(p[strlen(p) - 1])) {
p[strlen(p) - 1] = '\0';
}
description = p;
break;
}
}
}
fclose(f);
}
#elif defined(_WIN32)
HKEY hKey;
if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
0,
KEY_READ,
&hKey) == ERROR_SUCCESS) {
DWORD cpu_brand_size = 0;
if (RegQueryValueExA(hKey,
TEXT("ProcessorNameString"),
NULL,
NULL,
NULL,
&cpu_brand_size) == ERROR_SUCCESS) {
description.resize(cpu_brand_size);
if (RegQueryValueExA(hKey,
TEXT("ProcessorNameString"),
NULL,
NULL,
(LPBYTE)&description[0], // NOLINT
&cpu_brand_size) == ERROR_SUCCESS) {
if (description.find('\0') != std::string::npos) {
description.resize(description.find('\0'));
}
}
}
RegCloseKey(hKey);
}
#endif
}
};
static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
return "CPU";
GGML_UNUSED(dev);
}
static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context;
return ctx->description.c_str();
}
static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
// TODO
*free = 0;
*total = 0;
GGML_UNUSED(dev);
}
static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
return GGML_BACKEND_DEVICE_TYPE_CPU;
GGML_UNUSED(dev);
}
static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
props->name = ggml_backend_cpu_device_get_name(dev);
props->description = ggml_backend_cpu_device_get_description(dev);
props->type = ggml_backend_cpu_device_get_type(dev);
ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = {
/* .async = */ false,
/* .host_buffer = */ false,
/* .buffer_from_host_ptr = */ true,
/* .events = */ false,
};
}
static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) {
return ggml_backend_cpu_init();
GGML_UNUSED(dev);
GGML_UNUSED(params);
}
static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) {
return ggml_backend_cpu_buffer_type();
GGML_UNUSED(dev);
}
static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
GGML_UNUSED(dev);
GGML_UNUSED(max_tensor_size);
}
static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * src1 = op->src[1];
if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
if (op->op != GGML_OP_MUL_MAT || src0->type != GGML_TYPE_Q4_0 || ggml_aarch64_get_optimal_repack_type(src0) == GGML_TYPE_Q4_0) {
return false;
}
}
for (int i = 1; i < GGML_MAX_SRC; i++) {
if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) {
return false;
}
}
switch (op->op) {
case GGML_OP_CPY:
return
op->type != GGML_TYPE_IQ2_XXS &&
op->type != GGML_TYPE_IQ2_XS &&
op->type != GGML_TYPE_IQ1_S &&
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
case GGML_OP_MUL_MAT:
return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
case GGML_OP_ROPE_BACK:
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
case GGML_OP_IM2COL_BACK:
return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
case GGML_OP_OUT_PROD:
return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32;
default:
return true;
}
GGML_UNUSED(dev);
}
static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);
GGML_UNUSED(dev);
}
static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
/* .get_name = */ ggml_backend_cpu_device_get_name,
/* .get_description = */ ggml_backend_cpu_device_get_description,
/* .get_memory = */ ggml_backend_cpu_device_get_memory,
/* .get_type = */ ggml_backend_cpu_device_get_type,
/* .get_props = */ ggml_backend_cpu_device_get_props,
/* .init_backend = */ ggml_backend_cpu_device_init_backend,
/* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type,
/* .get_host_buffer_type = */ NULL,
/* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
/* .supports_op = */ ggml_backend_cpu_device_supports_op,
/* .supports_buft = */ ggml_backend_cpu_device_supports_buft,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_synchronize = */ NULL,
};
// CPU backend - backend (reg)
static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
return "CPU";
GGML_UNUSED(reg);
}
static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
return 1;
GGML_UNUSED(reg);
}
static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
GGML_ASSERT(index == 0);
static ggml_backend_cpu_device_context ctx;
static ggml_backend_device ggml_backend_cpu_device = {
/* .iface = */ ggml_backend_cpu_device_i,
/* .reg = */ reg,
/* .context = */ &ctx,
};
return &ggml_backend_cpu_device;
}
struct ggml_backend_feature {
const char * name;
const char * value;
};
// Not used yet
// This is intended to replace the ggml_cpu_has_* functions when loading the CPU backend dynamically,
// and additionally to allow other backends to expose their own list of features that applications can query using the same API.
static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) {
static std::vector<ggml_backend_feature> features = []() {
std::vector<ggml_backend_feature> features;
if (ggml_cpu_has_sse3()) {
features.push_back({ "SSE3", "1" });
}
if (ggml_cpu_has_ssse3()) {
features.push_back({ "SSSE3", "1" });
}
if (ggml_cpu_has_avx()) {
features.push_back({ "AVX", "1" });
}
if (ggml_cpu_has_avx2()) {
features.push_back({ "AVX2", "1" });
}
if (ggml_cpu_has_f16c()) {
features.push_back({ "F16C", "1" });
}
if (ggml_cpu_has_fma()) {
features.push_back({ "FMA", "1" });
}
if (ggml_cpu_has_avx_vnni()) {
features.push_back({ "AVX_VNNI", "1" });
}
if (ggml_cpu_has_avx512()) {
features.push_back({ "AVX512", "1" });
}
if (ggml_cpu_has_avx512_vbmi()) {
features.push_back({ "AVX512_VBMI", "1" });
}
if (ggml_cpu_has_avx512_vnni()) {
features.push_back({ "AVX512_VNNI", "1" });
}
if (ggml_cpu_has_avx512_bf16()) {
features.push_back({ "AVX512_BF16", "1" });
}
if (ggml_cpu_has_amx_int8()) {
features.push_back({ "AMX_INT8", "1" });
}
if (ggml_cpu_has_neon()) {
features.push_back({ "NEON", "1" });
}
if (ggml_cpu_has_arm_fma()) {
features.push_back({ "ARM_FMA", "1" });
}
if (ggml_cpu_has_fp16_va()) {
features.push_back({ "FP16_VA", "1" });
}
if (ggml_cpu_has_matmul_int8()) {
features.push_back({ "MATMUL_INT8", "1" });
}
if (ggml_cpu_has_sve()) {
features.push_back({ "SVE", "1" });
}
if (ggml_cpu_get_sve_cnt() > 0) {
static std::string sve_cnt = std::to_string(ggml_cpu_get_sve_cnt());
features.push_back({ "SVE_CNT", sve_cnt.c_str() });
}
if (ggml_cpu_has_riscv_v()) {
features.push_back({ "RISCV_V", "1" });
}
if (ggml_cpu_has_vsx()) {
features.push_back({ "VSX", "1" });
}
if (ggml_cpu_has_wasm_simd()) {
features.push_back({ "WASM_SIMD", "1" });
}
if (ggml_cpu_has_llamafile()) {
features.push_back({ "LLAMAFILE", "1" });
}
features.push_back({ nullptr, nullptr });
return features;
}();
return features.data();
GGML_UNUSED(reg);
}
static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
return (void *)ggml_backend_cpu_set_n_threads;
}
if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
return (void *)ggml_backend_cpu_get_extra_bufts;
}
return NULL;
GGML_UNUSED(reg);
}
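// --- hedged usage sketch (not part of the diff) ---------------------------------
// get_proc_address is how a caller that loads the CPU backend dynamically reaches
// ggml_backend_cpu_set_n_threads without linking against it; the registry wrapper
// ggml_backend_reg_get_proc_address() is assumed from the public ggml-backend.h.
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
ggml_backend_set_n_threads_t set_n_threads = (ggml_backend_set_n_threads_t)
    ggml_backend_reg_get_proc_address(ggml_backend_cpu_reg(), "ggml_backend_set_n_threads");
if (set_n_threads != NULL) {
    set_n_threads(backend, 4); // 'backend' previously obtained from ggml_backend_cpu_init()
}
// ---------------------------------------------------------------------------------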
static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
/* .get_name = */ ggml_backend_cpu_reg_get_name,
/* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
/* .get_device = */ ggml_backend_cpu_reg_get_device,
/* .get_proc_address = */ ggml_backend_cpu_get_proc_address,
};
ggml_backend_reg_t ggml_backend_cpu_reg(void) {
// init CPU feature detection
ggml_cpu_init();
static struct ggml_backend_reg ggml_backend_cpu_reg = {
/* .iface = */ ggml_backend_cpu_reg_i,
/* .context = */ NULL,
};
return &ggml_backend_cpu_reg;
}

File diff suppressed because it is too large

View File

@ -16,11 +16,11 @@
#include "ggml-cuda/cpy.cuh"
#include "ggml-cuda/cross-entropy-loss.cuh"
#include "ggml-cuda/diagmask.cuh"
#include "ggml-cuda/dmmv.cuh"
#include "ggml-cuda/fattn.cuh"
#include "ggml-cuda/getrows.cuh"
#include "ggml-cuda/im2col.cuh"
#include "ggml-cuda/mmq.cuh"
#include "ggml-cuda/mmv.cuh"
#include "ggml-cuda/mmvq.cuh"
#include "ggml-cuda/norm.cuh"
#include "ggml-cuda/opt-step-adamw.cuh"
@ -36,7 +36,7 @@
#include "ggml-cuda/tsembd.cuh"
#include "ggml-cuda/unary.cuh"
#include "ggml-cuda/upscale.cuh"
#include "ggml-cuda/wkv6.cuh"
#include "ggml-cuda/rwkv-wkv.cuh"
#include <algorithm>
#include <array>
@ -91,7 +91,7 @@ int ggml_cuda_get_device() {
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
ggml_cuda_set_device(device);
#if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
#if defined(GGML_USE_HIPBLAS) && defined(GGML_HIP_UMA)
auto res = hipMallocManaged(ptr, size);
if (res == hipSuccess) {
// if error we "need" to know why...
@ -100,7 +100,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
return res;
#else
#if !defined(GGML_USE_HIP)
#if !defined(GGML_USE_HIPBLAS)
cudaError_t err;
if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
{
@ -113,7 +113,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
return err;
#else
return cudaMalloc(ptr, size);
#endif // !defined(GGML_USE_HIP)
#endif // !defined(GGML_USE_HIPBLAS)
#endif
}
@ -151,7 +151,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0;
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
CUdevice device;
CU_CHECK(cuDeviceGet(&device, id));
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@ -163,7 +163,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
alloc_prop.location.id = id;
CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
}
#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
info.devices[id].vmm = !!device_vmm;
cudaDeviceProp prop;
@ -175,13 +175,13 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].nsm = prop.multiProcessorCount;
info.devices[id].smpb = prop.sharedMemPerBlock;
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
info.devices[id].smpbo = prop.sharedMemPerBlock;
info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
#else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor;
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
}
for (int id = 0; id < info.device_count; ++id) {
@ -291,7 +291,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
return;
}
}
GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
GGML_LOG_WARN(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
ggml_cuda_set_device(device);
CUDA_CHECK(cudaFree(ptr));
pool_size -= size;
@ -299,7 +299,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
};
// pool with virtual memory
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
@ -393,14 +393,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
}
};
#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
if (ggml_cuda_info().devices[device].vmm) {
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
}
#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
}
@ -421,13 +421,18 @@ struct ggml_backend_cuda_buffer_context {
}
};
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
delete ctx;
return ctx->name.c_str();
}
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
return buffer->iface.free_buffer == ggml_backend_cuda_buffer_free_buffer;
return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
}
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
delete ctx;
}
static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
@ -510,6 +515,7 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
}
static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
/* .get_name = */ ggml_backend_cuda_buffer_get_name,
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
/* .get_base = */ ggml_backend_cuda_buffer_get_base,
/* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
@ -542,6 +548,8 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
ggml_cuda_set_device(buft_ctx->device);
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
void * dev_ptr;
cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
if (err != cudaSuccess) {
@ -649,9 +657,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
}
struct ggml_backend_cuda_split_buffer_type_context {
int main_device;
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
std::string name;
};
struct ggml_backend_cuda_split_buffer_context {
@ -674,6 +680,16 @@ struct ggml_backend_cuda_split_buffer_context {
std::vector<ggml_tensor_extra_gpu *> tensor_extras;
};
static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
return GGML_CUDA_NAME "_Split";
GGML_UNUSED(buffer);
}
static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
}
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
@ -817,6 +833,7 @@ static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, u
}
static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
/* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
/* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
/* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
/* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
@ -831,9 +848,9 @@ static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
// cuda split buffer type
static const char * ggml_backend_cuda_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
return GGML_CUDA_NAME "_Split";
return ctx->name.c_str();
GGML_UNUSED(buft);
}
static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
@ -898,11 +915,11 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
};
ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
static std::map<std::pair<int, std::array<float, GGML_CUDA_MAX_DEVICES>>, struct ggml_backend_buffer_type> buft_map;
static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
@ -920,23 +937,18 @@ ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device,
}
}
auto it = buft_map.find({main_device, tensor_split_arr});
auto it = buft_map.find(tensor_split_arr);
if (it != buft_map.end()) {
return &it->second;
}
auto * ctx = new ggml_backend_cuda_split_buffer_type_context{
main_device,
tensor_split_arr,
GGML_CUDA_NAME + std::to_string(main_device) + "_Split",
};
struct ggml_backend_buffer_type buft {
/* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), main_device),
/* .context = */ ctx,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0),
/* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
};
auto result = buft_map.emplace(std::make_pair(main_device, tensor_split_arr), buft);
auto result = buft_map.emplace(tensor_split_arr, buft);
return &result.first->second;
}
@ -948,6 +960,12 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
GGML_UNUSED(buft);
}
static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
return GGML_CUDA_NAME "_Host";
GGML_UNUSED(buffer);
}
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
CUDA_CHECK(cudaFreeHost(buffer->context));
}
@ -962,7 +980,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
if (err != cudaSuccess) {
// clear the error
cudaGetLastError();
GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
size / 1024.0 / 1024.0, cudaGetErrorString(err));
return nullptr;
}
@ -980,6 +998,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft;
buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
return buffer;
@ -1020,12 +1039,120 @@ typedef void (*ggml_cuda_op_mul_mat_t)(
#define MUL_MAT_SRC1_COL_STRIDE 128
static __global__ void mul_mat_p021_f16_f32(
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
const half * x = (const half *) vx;
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
const int channel_x = channel / (nchannels_y / nchannels_x);
const int nrows_y = ncols_x;
const int nrows_dst = nrows_x;
const int row_dst = row_x;
float tmp = 0.0f;
for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
const int col_x = col_x0 + threadIdx.x;
if (col_x >= ncols_x) {
break;
}
// x is transposed and permuted
const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
const float xi = __half2float(x[ix]);
const int row_y = col_x;
// y is not transposed but permuted
const int iy = channel*nrows_y + row_y;
tmp += xi * y[iy];
}
// dst is not transposed and not permuted
const int idst = channel*nrows_dst + row_dst;
// sum up partial sums and write back result
tmp = warp_reduce_sum(tmp);
if (threadIdx.x == 0) {
dst[idst] = tmp;
}
}
static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {
const half * x = (const half *) vx;
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
const int channel_x = channel / channel_x_divisor;
const int nrows_y = ncols_x;
const int nrows_dst = nrows_x;
const int row_dst = row_x;
const int idst = channel*nrows_dst + row_dst;
float tmp = 0.0f;
for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
const int col_x = col_x0 + threadIdx.x;
if (col_x >= ncols_x) {
break;
}
const int row_y = col_x;
const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
const int iy = channel*nrows_y + row_y;
const float xi = __half2float(x[ix]);
tmp += xi * y[iy];
}
// sum up partial sums and write back result
tmp = warp_reduce_sum(tmp);
if (threadIdx.x == 0) {
dst[idst] = tmp;
}
}
static void ggml_mul_mat_p021_f16_f32_cuda(
const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
const dim3 block_nums(1, nrows_x, nchannels_y);
const dim3 block_dims(WARP_SIZE, 1, 1);
mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
}
static void ggml_mul_mat_vec_nc_f16_f32_cuda(
const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {
const dim3 block_nums(1, nrows_x, nchannels_y);
const dim3 block_dims(WARP_SIZE, 1, 1);
mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
(vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
}
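// --- illustrative launch geometry (not part of the diff) ------------------------
// Both launchers above use one thread block per (output row, channel) pair and a
// single warp across the columns. With hypothetical shapes ncols_x = 128,
// nrows_x = 4096, nchannels_y = 32:
//   block_nums = dim3(1, 4096, 32)      -> 131072 blocks, one per (row, channel)
//   block_dims = dim3(WARP_SIZE, 1, 1)  -> 32 threads per block
// thread t handles columns t, t+32, t+64, t+96; the 32 partial sums are combined with
// warp_reduce_sum() and lane 0 writes dst[channel*nrows_dst + row].
// ---------------------------------------------------------------------------------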
static cudaError_t ggml_cuda_cpy_tensor_2d(
void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
GGML_ASSERT(ggml_backend_buffer_is_cuda(src->buffer));
const char * src_ptr = (const char *) src->data;
char * dst_ptr = (char *) dst;
char * src_ptr = (char *) src->data;
char * dst_ptr = (char *) dst;
const int64_t ne0 = src->ne[0];
const int64_t nb0 = src->nb[0];
@ -1035,7 +1162,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
const enum ggml_type type = src->type;
const int64_t ts = ggml_type_size(type);
const int64_t bs = ggml_blck_size(type);
const int64_t i1_diff = i1_high - i1_low;
int64_t i1_diff = i1_high - i1_low;
const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
if (nb0 == ts && nb1 == ts*ne0/bs) {
@ -1189,17 +1316,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
if (err != cudaErrorPeerAccessAlreadyEnabled) {
CUDA_CHECK(err);
} else {
// reset the error
cudaGetLastError();
}
} else {
cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
if (err != cudaErrorPeerAccessNotEnabled) {
CUDA_CHECK(err);
} else {
// reset the error
cudaGetLastError();
}
}
}
@ -1217,7 +1338,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
cudaMemcpy3DPeerParms p = {};
p.dstDevice = dstDevice;
@ -1231,7 +1352,7 @@ static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
GGML_UNUSED(dstDevice);
GGML_UNUSED(srcDevice);
return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
}
static void ggml_cuda_op_mul_mat(
@ -1279,7 +1400,7 @@ static void ggml_cuda_op_mul_mat(
const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
GGML_ASSERT(!(split && ne02 > 1));
GGML_ASSERT(!(split && ne03 > 1));
GGML_ASSERT(!(split && ne02 < ne12));
@ -1358,24 +1479,14 @@ static void ggml_cuda_op_mul_mat(
if (src0_is_contiguous) {
dev[id].src0_dd = split ? (char *) src0_extra->data_device[id] : (char *) src0->data;
} else {
// If src0 is not contiguous it will be copied to a temporary buffer.
// This buffer needs to be cleared entirely because multiple regions will function as padding.
const size_t nbytes_data = ggml_nbytes(src0);
const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), nbytes_data + nbytes_padding);
// TODO: remove this for MUSA once the Guilty Lockup issue is resolved
#ifndef GGML_USE_MUSA
CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd, 0, nbytes_data + nbytes_padding, stream));
#else // GGML_USE_MUSA
CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
#endif // !GGML_USE_MUSA
dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0));
}
// If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
// If src0 is on a temporary compute buffers (partial offloading) there may be some padding that needs to be cleared:
if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
const size_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
const int64_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
const int64_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data , 0, nbytes_padding, stream));
}
if (src1_on_device && src1_is_contiguous) {
@ -1546,6 +1657,58 @@ static void ggml_cuda_op_mul_mat(
}
}
static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne12 = src1->ne[2];
cudaStream_t main_stream = ctx.stream();
void * src0_ddq = src0->data;
float * src1_ddf = (float *) src1->data;
float * dst_ddf = (float *) dst->data;
ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
}
static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(!ggml_is_permuted(src0));
GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t nb01 = src0->nb[1];
const int64_t nb02 = src0->nb[2];
const int64_t ne12 = src1->ne[2];
cudaStream_t main_stream = ctx.stream();
void * src0_ddq = src0->data;
float * src1_ddf = (float *) src1->data;
float * dst_ddf = (float *) dst->data;
const int64_t row_stride_x = nb01 / sizeof(half);
const int64_t channel_stride_x = nb02 / sizeof(half);
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
}
static __global__ void k_compute_batched_ptrs(
const half * src0_as_f16, const half * src1_as_f16, char * dst,
const void ** ptrs_src, void ** ptrs_dst,
@ -1717,19 +1880,23 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
}
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
bool use_mul_mat_vec = src0->type == GGML_TYPE_F16
bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
&& src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1;
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
bool use_mul_mat_q = ggml_is_quantized(src0->type)
bool use_mul_mat_q = ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
bool any_gpus_with_slow_fp16 = false;
bool any_gpus_without_fp16_mma = false;
// if mmvq is available it's a better choice than dmmv:
#ifndef GGML_CUDA_FORCE_DMMV
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
#endif // GGML_CUDA_FORCE_DMMV
bool any_gpus_with_slow_fp16 = false;
if (split) {
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
@ -1740,16 +1907,14 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
continue;
}
const int cc = ggml_cuda_info().devices[id].cc;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_available(cc);
const int cc = ggml_cuda_info().devices[id].cc;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
}
} else {
const int cc = ggml_cuda_info().devices[ctx.device].cc;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_available(cc);
const int cc = ggml_cuda_info().devices[ctx.device].cc;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
}
// debug helpers
@ -1760,16 +1925,18 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
if (!split && use_mul_mat_vec && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
// the custom F16 vector kernel can be used over batched cuBLAS GEMM
// but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
// FP32 precision KQ single-batch for batch size 1 without FlashAttention
ggml_cuda_mul_mat_vec_p021(ctx, src0, src1, dst);
} else if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
// FP32 precision KQV single-batch for batch size 1 without FlashAttention
ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
&& !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
// general KQ + KQV multi-batch without FlashAttention
// KQ + KQV multi-batch without FlashAttention
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
} else if (use_mul_mat_vec) {
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr);
} else if (use_dequantize_mul_mat_vec) {
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
} else if (use_mul_mat_vec_q) {
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
} else if (use_mul_mat_q) {
@ -1840,7 +2007,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");
GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers");
cudaStream_t stream = ctx.stream();
@ -1973,7 +2140,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
// why is this here instead of mul_mat?
if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) {
if (dst->src[0] != nullptr && ggml_backend_buffer_is_cuda_split(dst->src[0]->buffer)) {
ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
}
@ -2155,8 +2322,8 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_CROSS_ENTROPY_LOSS:
ggml_cuda_cross_entropy_loss(ctx, dst);
break;
case GGML_OP_RWKV_WKV6:
ggml_cuda_op_rwkv_wkv6(ctx, dst);
case GGML_OP_RWKV_WKV:
ggml_cuda_op_rwkv_wkv(ctx, dst);
break;
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
ggml_cuda_cross_entropy_loss_back(ctx, dst);
@ -2194,6 +2361,12 @@ static void ggml_backend_cuda_free(ggml_backend_t backend) {
delete backend;
}
static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
return ggml_backend_cuda_buffer_type(cuda_ctx->device);
}
static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
@ -2233,7 +2406,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
GGML_LOG_WARN("%s: backend and buffer devices do not match\n", __func__);
#endif
return false;
}
@ -2351,7 +2524,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
GGML_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
#endif
}
}
@ -2399,17 +2572,17 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
continue;
}
if (node->src[0] && node->src[0]->buffer && ggml_backend_buft_is_cuda_split(node->src[0]->buffer->buft)) {
if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
GGML_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
#endif
}
if (node->op == GGML_OP_MUL_MAT_ID) {
use_cuda_graph = false; // This node type is not supported by CUDA graph capture
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
GGML_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
#endif
}
@ -2418,7 +2591,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
// Changes in batch size or context size can cause changes to the grid size of some kernels.
use_cuda_graph = false;
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
GGML_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
#endif
}
@ -2430,7 +2603,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
if (!ptr) {
use_cuda_graph = false;
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
GGML_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
#endif
} else {
if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
@ -2454,7 +2627,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
GGML_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
#endif
}
}
@ -2486,8 +2659,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
for (int j = 0; j < GGML_MAX_SRC; j++) {
if (node->src[j] != nullptr) {
assert(node->src[j]->buffer);
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft));
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
}
}
#endif
@ -2513,7 +2685,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
use_cuda_graph = false;
cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
GGML_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
#endif
} else {
graph_evaluated_or_captured = true; // CUDA graph has been captured
@ -2580,7 +2752,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
if (stat == cudaErrorGraphExecUpdateFailure) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
GGML_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
#endif
// The pre-existing graph exec cannot be updated due to violated constraints
// so instead clear error and re-instantiate
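The capture/replay machinery that these hunks repeatedly enable or disable works as follows: the backend records a stream of kernel launches into a CUDA graph once, instantiates it, and then replays the instantiation with much lower launch overhead; when node parameters change it first tries cudaGraphExecUpdate and, as the comment above notes, falls back to re-instantiating on cudaErrorGraphExecUpdateFailure. A minimal, self-contained sketch of that pattern (dummy kernel, error checking omitted, not the ggml code path):

#include <cuda_runtime.h>
#include <cstdio>

__global__ void dummy_kernel(float * x) { x[threadIdx.x] += 1.0f; }

int main() {
    float * d = nullptr;
    cudaMalloc(&d, 32*sizeof(float));
    cudaMemset(d, 0, 32*sizeof(float));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // 1. Record the work into a graph instead of executing it immediately.
    cudaGraph_t graph;
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
    dummy_kernel<<<1, 32, 0, stream>>>(d);
    cudaStreamEndCapture(stream, &graph);

    // 2. Instantiate once (long-standing 5-parameter form), then replay cheaply.
    cudaGraphExec_t instance;
    cudaGraphInstantiate(&instance, graph, nullptr, nullptr, 0);
    for (int i = 0; i < 10; ++i) {
        cudaGraphLaunch(instance, stream);
    }
    cudaStreamSynchronize(stream);

    // 3. When parameters change, cudaGraphExecUpdate can patch the instantiation
    //    in place; if it reports cudaErrorGraphExecUpdateFailure, the caller
    //    clears the error and re-instantiates, which is the fallback above.

    cudaGraphExecDestroy(instance);
    cudaGraphDestroy(graph);
    cudaStreamDestroy(stream);
    cudaFree(d);
    return 0;
}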
@ -2629,6 +2801,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
static const ggml_backend_i ggml_backend_cuda_interface = {
/* .get_name = */ ggml_backend_cuda_get_name,
/* .free = */ ggml_backend_cuda_free,
/* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
/* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
@ -2638,6 +2811,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
/* .supports_op = */ NULL, // moved to device
/* .supports_buft = */ NULL, // moved to device
/* .offload_op = */ NULL, // moved to device
/* .event_record = */ ggml_backend_cuda_event_record,
/* .event_wait = */ ggml_backend_cuda_event_wait,
};
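For readers following the interface-table edits above: a ggml backend is a struct of function pointers that callers dispatch through, and entries left as NULL are treated as unimplemented hooks, which is what allows .supports_op / .supports_buft / .offload_op to move between the backend table and the device table. A reduced, hypothetical illustration of that pattern (toy names, not the real ggml_backend_i layout):

#include <cstdio>

// Toy vtable in the same designated-comment style as ggml_backend_cuda_interface.
struct toy_backend_i {
    const char * (*get_name)(void * ctx);
    void         (*free_ctx)(void * ctx);
    bool         (*graph_compute)(void * ctx); // NULL here would mean "not supported"
};

static const char * toy_cuda_get_name(void * ctx)     { (void) ctx; return "TOY_CUDA"; }
static void         toy_cuda_free(void * ctx)          { (void) ctx; }
static bool         toy_cuda_graph_compute(void * ctx) { (void) ctx; return true; }

static const toy_backend_i toy_cuda_interface = {
    /* .get_name      = */ toy_cuda_get_name,
    /* .free_ctx      = */ toy_cuda_free,
    /* .graph_compute = */ toy_cuda_graph_compute,
};

int main() {
    void * ctx = nullptr;
    // Callers always go through the table and check optional hooks for NULL.
    const bool ok = toy_cuda_interface.graph_compute ? toy_cuda_interface.graph_compute(ctx) : false;
    printf("%s: %d\n", toy_cuda_interface.get_name(ctx), ok);
    return 0;
}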
@ -2678,7 +2854,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
// clear the error
cudaGetLastError();
GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
GGML_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
size / 1024.0 / 1024.0, cudaGetErrorString(err));
return false;
}
@ -2727,7 +2903,7 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
GGML_UNUSED(dev);
return GGML_BACKEND_DEVICE_TYPE_GPU;
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
}
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
@ -2744,14 +2920,13 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
#endif
props->caps = {
/* .async = */ true,
/* .host_buffer = */ host_buffer,
/* .buffer_from_host_ptr = */ false,
/* .events = */ events,
/* async */ true,
/* host_buffer */ host_buffer,
/* events */ events,
};
}
static ggml_backend_t ggml_backend_cuda_device_init_backend(ggml_backend_dev_t dev, const char * params) {
static ggml_backend_t ggml_backend_cuda_device_init(ggml_backend_dev_t dev, const char * params) {
GGML_UNUSED(params);
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
return ggml_backend_cuda_init(ctx->device);
@ -2767,29 +2942,18 @@ static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type(
return ggml_backend_cuda_host_buffer_type();
}
static ggml_backend_buffer_t ggml_backend_cuda_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
GGML_UNUSED(dev);
GGML_UNUSED(ptr);
GGML_UNUSED(size);
GGML_UNUSED(max_tensor_size);
return nullptr;
}
// TODO: move these functions here
static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
// split buffers can only be used with GGML_OP_MUL_MAT
if (op->op != GGML_OP_MUL_MAT) {
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda_split(op->src[i]->buffer->buft)) {
return false;
}
}
}
// check if all the sources are allocated on this device
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda(op->src[i]->buffer->buft)) {
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)op->src[i]->buffer->buft->context;
if (buft_ctx->device != dev_ctx->device) {
return false;
}
}
}
switch (op->op) {
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
@ -2814,17 +2978,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
{
struct ggml_tensor * a = op->src[0];
struct ggml_tensor * b = op->src[1];
// for small weight matrices the active device can end up without any rows, don't use row split in those cases
// this avoids some edge cases (and the performance would not be good anyways)
if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context;
int64_t row_low;
int64_t row_high;
get_row_split(&row_low, &row_high, a, buft_ctx->tensor_split, dev_ctx->device);
if (row_low == row_high) {
return false;
}
}
if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
return false;
}
@ -2960,20 +3113,18 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
}
return false;
} break;
case GGML_OP_NORM:
case GGML_OP_RMS_NORM:
return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0;
break;
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_NORM:
case GGML_OP_ADD:
case GGML_OP_ADD1:
case GGML_OP_SUB:
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_RMS_NORM:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_SQRT:
@ -2989,6 +3140,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_ROPE:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_IM2COL:
return op->src[0]->type == GGML_TYPE_F16;
case GGML_OP_POOL_2D:
case GGML_OP_SUM:
case GGML_OP_SUM_ROWS:
@ -3000,15 +3152,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU:
case GGML_OP_RWKV_WKV6:
case GGML_OP_RWKV_WKV:
return true;
case GGML_OP_FLASH_ATTN_EXT: {
#ifndef FLASH_ATTN_AVAILABLE
return false;
#endif
if (op->src[1]->type == GGML_TYPE_BF16 || op->src[2]->type == GGML_TYPE_BF16) {
return false;
}
if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) {
return true;
}
@ -3031,27 +3180,24 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
}
static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev;
}
static int64_t get_op_batch_size(const ggml_tensor * op) {
switch (op->op) {
case GGML_OP_GET_ROWS:
return 0;
case GGML_OP_MUL_MAT:
return op->ne[1];
case GGML_OP_MUL_MAT_ID:
case GGML_OP_ROPE:
return op->ne[2];
default:
return ggml_nrows(op);
if (ggml_backend_buft_is_cuda_split(buft)) {
return true;
}
if (ggml_backend_buft_is_cuda(buft)) {
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
return buft_ctx->device == dev_ctx->device;
}
return false;
}
static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
const int min_batch_size = 32;
return get_op_batch_size(op) >= min_batch_size;
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
(op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
GGML_UNUSED(dev);
}
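Both variants of the check above encode the same rule of thumb: only offload an op once its batch dimension reaches min_batch_size, and never offload GGML_OP_GET_ROWS. A standalone restatement with example values (the helper name and shapes are hypothetical, not the ggml API):

#include <cstdint>
#include <cstdio>

// Hypothetical helper restating the rule of thumb above.
static bool offload_worthwhile(int64_t batch_dim, bool is_get_rows) {
    const int64_t min_batch_size = 32;
    return !is_get_rows && batch_dim >= min_batch_size;
}

int main() {
    printf("%d\n", offload_worthwhile(512, false)); // prompt-sized batch  -> 1 (offload)
    printf("%d\n", offload_worthwhile(  1, false)); // single-token decode -> 0 (keep local)
    printf("%d\n", offload_worthwhile(512, true));  // GET_ROWS            -> 0 (never offloaded)
    return 0;
}

For GGML_OP_MUL_MAT_ID the batch dimension in question is ne[2] rather than ne[1], as both variants show.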
@ -3092,10 +3238,10 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .get_memory = */ ggml_backend_cuda_device_get_memory,
/* .get_type = */ ggml_backend_cuda_device_get_type,
/* .get_props = */ ggml_backend_cuda_device_get_props,
/* .init_backend = */ ggml_backend_cuda_device_init_backend,
/* .init_backend = */ ggml_backend_cuda_device_init,
/* .get_buffer_type = */ ggml_backend_cuda_device_get_buffer_type,
/* .get_host_buffer_type = */ ggml_backend_cuda_device_get_host_buffer_type,
/* .buffer_from_host_ptr = */ NULL,
/* .buffer_from_host_ptr = */ ggml_backend_cuda_device_buffer_from_host_ptr,
/* .supports_op = */ ggml_backend_cuda_device_supports_op,
/* .supports_buft = */ ggml_backend_cuda_device_supports_buft,
/* .offload_op = */ ggml_backend_cuda_device_offload_op,

View File

@ -1,155 +0,0 @@
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "CUDA Toolkit found")
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# native == GPUs available at build time
# 52 == Maxwell, lowest CUDA 12 standard
# 60 == P100, FP16 CUDA intrinsics
# 61 == Pascal, __dp4a instruction (per-byte integer dot product)
# 70 == V100, FP16 tensor cores
# 75 == Turing, int8 tensor cores
if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
set(CMAKE_CUDA_ARCHITECTURES "native")
elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75")
else()
set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75")
endif()
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
enable_language(CUDA)
file(GLOB GGML_HEADERS_CUDA "*.cuh")
list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h")
file(GLOB GGML_SOURCES_CUDA "*.cu")
file(GLOB SRCS "template-instances/fattn-wmma*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "template-instances/mmq*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
if (GGML_CUDA_FA_ALL_QUANTS)
file(GLOB SRCS "template-instances/fattn-vec*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
else()
file(GLOB SRCS "template-instances/fattn-vec*q4_0-q4_0.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "template-instances/fattn-vec*q8_0-q8_0.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "template-instances/fattn-vec*f16-f16.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
endif()
add_library(ggml-cuda
${GGML_HEADERS_CUDA}
${GGML_SOURCES_CUDA}
)
target_link_libraries(ggml-cuda PRIVATE ggml-base)
target_include_directories(ggml-cuda PRIVATE . ..)
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
if (GGML_CUDA_GRAPHS)
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
endif()
if (GGML_CUDA_FORCE_MMQ)
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
endif()
if (GGML_CUDA_FORCE_CUBLAS)
add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
endif()
if (GGML_CUDA_NO_VMM)
add_compile_definitions(GGML_CUDA_NO_VMM)
endif()
if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
add_compile_definitions(GGML_CUDA_F16)
endif()
if (GGML_CUDA_NO_PEER_COPY)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()
if (GGML_STATIC)
if (WIN32)
# As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
else ()
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
endif()
else()
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
if (GGML_CUDA_NO_VMM)
# No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
else()
target_link_libraries(ggml-cuda PRIVATE CUDA::cuda_driver)
endif()
set(CUDA_CXX_FLAGS "")
set(CUDA_FLAGS -use_fast_math)
if (GGML_FATAL_WARNINGS)
list(APPEND CUDA_FLAGS -Werror all-warnings)
endif()
if (GGML_ALL_WARNINGS AND NOT MSVC)
set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
endif()
execute_process(
COMMAND ${NVCC_CMD} -Xcompiler --version
OUTPUT_VARIABLE CUDA_CCFULLVER
ERROR_QUIET
)
if (NOT CUDA_CCFULLVER MATCHES clang)
set(CUDA_CCID "GNU")
execute_process(
COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
OUTPUT_VARIABLE CUDA_CCVER
ERROR_QUIET
)
else()
if (CUDA_CCFULLVER MATCHES Apple)
set(CUDA_CCID "AppleClang")
else()
set(CUDA_CCID "Clang")
endif()
string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
endif()
message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
get_flags(${CUDA_CCID} ${CUDA_CCVER})
list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
endif()
if (NOT MSVC)
list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
endif()
list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
endif()
target_compile_options(ggml-cuda PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
else()
message(FATAL_ERROR "CUDA Toolkit not found")
endif()

View File

@ -6,7 +6,7 @@
#include <cstdint>
#include <memory>
#if defined(GGML_USE_HIP)
#if defined(GGML_USE_HIPBLAS)
#define GGML_COMMON_DECL_HIP
#define GGML_COMMON_IMPL_HIP
#else
@ -26,13 +26,13 @@
#include <string>
#include <vector>
#if defined(GGML_USE_HIP)
#if defined(GGML_USE_HIPBLAS)
#include "vendors/hip.h"
#elif defined(GGML_USE_MUSA)
#include "vendors/musa.h"
#else
#include "vendors/cuda.h"
#endif // defined(GGML_USE_HIP)
#endif // defined(GGML_USE_HIPBLAS)
#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
@ -97,7 +97,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
#if !defined(GGML_USE_HIP)
#if !defined(GGML_USE_HIPBLAS)
static const char * cu_get_error_str(CUresult err) {
const char * err_str;
cuGetErrorString(err, &err_str);
@ -120,21 +120,21 @@ typedef float dfloat; // dequantize float
typedef float2 dfloat2;
#endif // GGML_CUDA_F16
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
#define FP16_AVAILABLE
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#define FAST_FP16_AVAILABLE
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
#define FP16_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
#define INT8_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
#define FLASH_ATTN_AVAILABLE
@ -156,14 +156,14 @@ static constexpr bool int8_mma_available(const int cc) {
static __device__ void no_device_code(
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
file_name, line, function_name, arch);
GGML_UNUSED(arch_list);
#else
printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
file_name, line, function_name, arch, arch_list);
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
__trap();
GGML_UNUSED(no_device_code); // suppress unused function warning
@ -176,7 +176,7 @@ static __device__ void no_device_code(
#endif // __CUDA_ARCH__
static __device__ __forceinline__ int warp_reduce_sum(int x) {
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
return __reduce_add_sync(0xffffffff, x);
#else
#pragma unroll
@ -184,7 +184,7 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {
x += __shfl_xor_sync(0xffffffff, x, mask, 32);
}
return x;
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
}
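The shuffle loop above is the classic XOR-butterfly warp reduction: after log2(32) = 5 steps every lane holds the full warp sum, which is why no shared memory is needed on this path. A host-side trace of the same access pattern (simulating __shfl_xor_sync over one 32-lane warp, illustrative only):

#include <cstdio>

int main() {
    int lane_val[32];
    for (int i = 0; i < 32; ++i) {
        lane_val[i] = i; // each lane starts with its lane id; the warp sum is 496
    }
    for (int mask = 16; mask > 0; mask >>= 1) {
        int next[32];
        for (int lane = 0; lane < 32; ++lane) {
            next[lane] = lane_val[lane] + lane_val[lane ^ mask]; // add the XOR partner's value
        }
        for (int lane = 0; lane < 32; ++lane) {
            lane_val[lane] = next[lane];
        }
    }
    printf("%d\n", lane_val[0]); // prints 496; in fact every lane now holds the full sum
    return 0;
}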
static __device__ __forceinline__ float warp_reduce_sum(float x) {
@ -207,7 +207,7 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#ifdef FP16_AVAILABLE
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
@ -221,7 +221,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
}
return a;
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#else
NO_DEVICE_CODE;
@ -240,11 +240,11 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
#ifdef FP16_AVAILABLE
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
return __float2half(fmaxf(__half2float(a), __half2float(b)));
#else
return __hmax(a, b);
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
#else
NO_DEVICE_CODE;
@ -254,7 +254,7 @@ static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b
}
static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
#if CUDART_VERSION >= CUDART_HMAX
return __hmax2(a, b);
@ -269,11 +269,11 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
GGML_UNUSED(a);
GGML_UNUSED(b);
NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
}
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
@ -282,7 +282,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#else
GGML_UNUSED(x);
NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
}
#if CUDART_VERSION < CUDART_HMASK
@ -294,7 +294,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
#endif // CUDART_VERSION < CUDART_HMASK
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(RDNA3)
@ -320,7 +320,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
#endif
return c;
#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if __CUDA_ARCH__ >= MIN_CC_DP4A
return __dp4a(a, b, c);
@ -330,7 +330,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
}
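ggml_cuda_dp4a above boils down to one fused per-byte dot product: interpret each 32-bit operand as four signed 8-bit lanes, multiply lane-wise, and accumulate into c. A host-side reference equivalent to the generic fallback shown above, with a small worked example:

#include <cstdio>
#include <cstring>

static int dp4a_ref(int a, int b, int c) {
    signed char a8[4], b8[4];
    memcpy(a8, &a, 4); // reinterpret the 32-bit ints as 4 signed bytes each
    memcpy(b8, &b, 4);
    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
}

int main() {
    const signed char av[4] = {1, 2, 3, 4};
    const signed char bv[4] = {5, 6, 7, 8};
    int a, b;
    memcpy(&a, av, 4);
    memcpy(&b, bv, 4);
    printf("%d\n", dp4a_ref(a, b, 0)); // 1*5 + 2*6 + 3*7 + 4*8 = 70
    return 0;
}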
// TODO: move to ggml-common.h

View File

@ -44,7 +44,7 @@ void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne < (1 << 30) && "atomicAdd implementation only supports int");
const int64_t dne = GGML_PAD((ne + 4*nsm - 1) / (4*nsm), CUDA_COUNT_EQUAL_CHUNK_SIZE);
const int64_t dne = GGML_PAD(ne / (4*nsm), CUDA_COUNT_EQUAL_CHUNK_SIZE);
CUDA_CHECK(cudaMemsetAsync(dst_d, 0, ggml_nbytes(dst), stream));

View File

@ -1,6 +1,6 @@
#include "common.cuh"
#define CUDA_CPY_BLOCK_SIZE 64
#define CUDA_CPY_BLOCK_SIZE 32
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1);

View File

@ -416,11 +416,10 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
const half * x = (const half *) vx;
// load 2 halfs into register in a single instruction
const half2 x_reg = *((half2 *) &(x[ib + iqs]));
// automatic half -> float type cast if dfloat == float
v.x = __low2float(x_reg);
v.y = __high2float(x_reg);
v.x = x[ib + iqs + 0];
v.y = x[ib + iqs + 1];
}
static constexpr __device__ dequantize_kernel_t get_dequantize_kernel(ggml_type type) {
@ -477,28 +476,13 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
// matrix multiplication
// for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
#ifdef GGML_CUDA_F16
if ( y_offset == 1 ) {
// load 2 dfloats into register in a single instruction
const dfloat2 y_reg = *((dfloat2 *) &(y[iybs + iqs + j/qr]));
tmp += __hmul2(v, y_reg);
}
else {
tmp += __hmul2(v, {
y[iybs + iqs + j/qr + 0],
y[iybs + iqs + j/qr + y_offset]
});
}
tmp += __hmul2(v, {
y[iybs + iqs + j/qr + 0],
y[iybs + iqs + j/qr + y_offset]
});
#else
if ( y_offset == 1 ) {
// load 2 dfloats into register in a single instruction
const dfloat2 y_reg = *((dfloat2 *) &(y[iybs + iqs + j/qr]));
tmp += v.x * y_reg.x;
tmp += v.y * y_reg.y;
}
else {
tmp += v.x * y[iybs + iqs + j/qr + 0];
tmp += v.y * y[iybs + iqs + j/qr + y_offset];
}
tmp += v.x * y[iybs + iqs + j/qr + 0];
tmp += v.y * y[iybs + iqs + j/qr + y_offset];
#endif // GGML_CUDA_F16
}
}

View File

@ -517,9 +517,9 @@ constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) {
}
template<int D, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_combine_results(
const float * __restrict__ VKQ_parts,
const float2 * __restrict__ VKQ_meta,

View File

@ -5,9 +5,9 @@
#define FATTN_KQ_STRIDE_TILE_F16 64
template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(nwarps*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_tile_ext_f16(
const char * __restrict__ Q,
const char * __restrict__ K,

View File

@ -5,9 +5,9 @@
#define FATTN_KQ_STRIDE_TILE_F32 32
template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(nwarps*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_tile_ext_f32(
const char * __restrict__ Q,
const char * __restrict__ K,

View File

@ -2,9 +2,9 @@
#include "fattn-common.cuh"
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_vec_ext_f16(
const char * __restrict__ Q,
const char * __restrict__ K,

View File

@ -2,9 +2,9 @@
#include "fattn-common.cuh"
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_vec_ext_f32(
const char * __restrict__ Q,
const char * __restrict__ K,

View File

@ -7,9 +7,9 @@
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t, bool use_logit_softcap>
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(nwarps*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_ext_f16(
const char * __restrict__ Q,
const char * __restrict__ K,

View File

@ -13,9 +13,9 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0];
const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
const int32_t precision = KQV->op_params[3];
if (prec != GGML_PREC_DEFAULT) {
if (precision != GGML_PREC_DEFAULT) {
if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
constexpr int cols_per_block = 16;
switch (Q->ne[0]) {
@ -301,11 +301,11 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
ggml_cuda_set_device(ctx.device);
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
const int32_t precision = KQV->op_params[3];
// On AMD the tile kernels perform poorly, use the vec kernel instead:
if (cc >= CC_OFFSET_AMD) {
if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
if (precision == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
} else {
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
@ -332,7 +332,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
}
if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
if (prec == GGML_PREC_DEFAULT) {
if (precision == GGML_PREC_DEFAULT) {
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
return;
} else if(Q->ne[0] <= 128) {

View File

@ -1,155 +0,0 @@
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "CUDA Toolkit found")
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# native == GPUs available at build time
# 52 == Maxwell, lowest CUDA 12 standard
# 60 == P100, FP16 CUDA intrinsics
# 61 == Pascal, __dp4a instruction (per-byte integer dot product)
# 70 == V100, FP16 tensor cores
# 75 == Turing, int8 tensor cores
if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
set(CMAKE_CUDA_ARCHITECTURES "native")
elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75")
else()
set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75")
endif()
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
enable_language(CUDA)
file(GLOB GGML_HEADERS_CUDA "*.cuh")
list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h")
file(GLOB GGML_SOURCES_CUDA "*.cu")
file(GLOB SRCS "template-instances/fattn-wmma*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "template-instances/mmq*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
if (GGML_CUDA_FA_ALL_QUANTS)
file(GLOB SRCS "template-instances/fattn-vec*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
else()
file(GLOB SRCS "template-instances/fattn-vec*q4_0-q4_0.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "template-instances/fattn-vec*q8_0-q8_0.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "template-instances/fattn-vec*f16-f16.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
endif()
add_library(ggml-cuda
${GGML_HEADERS_CUDA}
${GGML_SOURCES_CUDA}
)
target_link_libraries(ggml-cuda PRIVATE ggml-base)
target_include_directories(ggml-cuda PRIVATE . ..)
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
if (GGML_CUDA_GRAPHS)
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
endif()
if (GGML_CUDA_FORCE_MMQ)
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
endif()
if (GGML_CUDA_FORCE_CUBLAS)
add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
endif()
if (GGML_CUDA_NO_VMM)
add_compile_definitions(GGML_CUDA_NO_VMM)
endif()
if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
add_compile_definitions(GGML_CUDA_F16)
endif()
if (GGML_CUDA_NO_PEER_COPY)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()
if (GGML_STATIC)
if (WIN32)
# As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
else ()
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
endif()
else()
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
if (GGML_CUDA_NO_VMM)
# No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
else()
target_link_libraries(ggml-cuda PRIVATE CUDA::cuda_driver)
endif()
set(CUDA_CXX_FLAGS "")
set(CUDA_FLAGS -use_fast_math)
if (GGML_FATAL_WARNINGS)
list(APPEND CUDA_FLAGS -Werror all-warnings)
endif()
if (GGML_ALL_WARNINGS AND NOT MSVC)
set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
endif()
execute_process(
COMMAND ${NVCC_CMD} -Xcompiler --version
OUTPUT_VARIABLE CUDA_CCFULLVER
ERROR_QUIET
)
if (NOT CUDA_CCFULLVER MATCHES clang)
set(CUDA_CCID "GNU")
execute_process(
COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
OUTPUT_VARIABLE CUDA_CCVER
ERROR_QUIET
)
else()
if (CUDA_CCFULLVER MATCHES Apple)
set(CUDA_CCID "AppleClang")
else()
set(CUDA_CCID "Clang")
endif()
string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
endif()
message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
get_flags(${CUDA_CCID} ${CUDA_CCVER})
list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
endif()
if (NOT MSVC)
list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
endif()
list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
endif()
target_compile_options(ggml-cuda PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
else()
message(FATAL_ERROR "CUDA Toolkit not found")
endif()

View File

@ -91,9 +91,9 @@ void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int64_t OH = is_2D ? dst->ne[2] : 1;
const int64_t OW = dst->ne[1];
const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
const int64_t batch = src1->ne[is_2D ? 3 : 2];
const size_t batch_offset = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
const int64_t batch = src1->ne[3];
const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
if(dst->type == GGML_TYPE_F16) {
im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);

View File

@ -8,6 +8,8 @@ void ggml_cuda_op_mul_mat_q(
const int64_t ne00 = src0->ne[0];
const int64_t nb01 = src0->nb[1];
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
GGML_ASSERT(ne10 % QK8_1 == 0);
@ -15,7 +17,7 @@ void ggml_cuda_op_mul_mat_q(
const int64_t ne0 = dst->ne[0];
const int64_t row_diff = row_high - row_low;
const int64_t stride00 = ne00 / ggml_blck_size(src0->type);
const int64_t stride00 = nb01 / ggml_type_size(src0->type);
int id = ggml_cuda_get_device();
const int compute_capability = ggml_cuda_info().devices[id].cc;

View File

@ -100,9 +100,9 @@ static constexpr __device__ int get_mmq_x_max_device() {
return 128;
#else // INT8_MMA_AVAILABLE
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
return 128;
#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if __CUDA_ARCH__ >= CC_VOLTA
#ifdef GGML_CUDA_FORCE_MMQ
@ -115,7 +115,7 @@ static constexpr __device__ int get_mmq_x_max_device() {
return 64;
#endif // __CUDA_ARCH__ >= CC_VOLTA
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#endif // INT8_MMA_AVAILABLE
}
@ -124,7 +124,7 @@ static constexpr int get_mmq_y_host(const int cc) {
}
static constexpr __device__ int get_mmq_y_device() {
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA1)
return 64;
#else
@ -136,7 +136,7 @@ static constexpr __device__ int get_mmq_y_device() {
#else
return 64;
#endif // __CUDA_ARCH__ >= CC_VOLTA
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
}
#define MMQ_DP4A_TXS_Q4_0 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_0 + mmq_y/QI4_0, 0}
@ -2569,7 +2569,7 @@ static __device__ void mul_mat_q_process_tile(
// The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
__launch_bounds__(WARP_SIZE*nwarps, 2)
#endif // defined(RDNA3) || defined(RDNA2)
@ -2579,7 +2579,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
#else
__launch_bounds__(WARP_SIZE*nwarps, 2)
#endif // __CUDA_ARCH__ >= CC_VOLTA
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
static __global__ void mul_mat_q(
const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup,
const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) {
@ -2594,7 +2594,7 @@ static __global__ void mul_mat_q(
constexpr int mmq_y = get_mmq_y_device();
// On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
{
constexpr bool fixup = false;
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
@ -2602,7 +2602,7 @@ static __global__ void mul_mat_q(
blockIdx.x, blockIdx.y, 0, ne00/qk);
return;
}
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
const int64_t blocks_per_ne00 = ne00 / qk;
constexpr int blocks_per_iter = MMQ_ITER_K / qk;
@ -2765,14 +2765,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
const int shmem = mmq_get_shmem<type>(mmq_x, mmq_y, cc);
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
if (!shmem_limit_raised[id]) {
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
shmem_limit_raised[id] = true;
}
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
const int nty = (args.ne01 + mmq_y - 1) / mmq_y;
const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
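The stream-k scheme referenced in the comment above splits the combined (output tile x K-iteration) work space evenly across a fixed grid of persistent blocks, so no block idles at the tail of the grid; tiles whose K-range straddles two blocks are finished by the fix-up pass that writes into tmp_fixup. A toy CPU partitioning sketch under made-up shapes (not the mmq kernel itself):

#include <cstdio>

int main() {
    const int ntiles         = 10; // output tiles (made-up)
    const int iters_per_tile = 8;  // K-iterations each tile needs (made-up)
    const int nblocks        = 6;  // persistent blocks in the grid (made-up)
    const int total          = ntiles * iters_per_tile;

    for (int b = 0; b < nblocks; ++b) {
        const int begin = (total *  b)      / nblocks; // even split of the combined work space
        const int end   = (total * (b + 1)) / nblocks;
        const int first_tile = begin / iters_per_tile;
        const int last_tile  = (end - 1) / iters_per_tile;
        printf("block %d: work [%2d,%2d) -> tiles %d..%d%s\n", b, begin, end, first_tile, last_tile,
               (begin % iters_per_tile) ? " (starts mid-tile -> needs fixup)" : "");
    }
    return 0;
}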

View File

@ -1,223 +0,0 @@
#include "common.cuh"
#include "mmv.cuh"
template <typename type_acc, int block_size>
static __global__ void mul_mat_vec(
const half * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst) {
const int64_t row = blockIdx.x;
const int64_t channel = blockIdx.z;
const int tid = threadIdx.x;
x += (channel/channel_ratio)*stride_channel_x + row*stride_row;
y += channel *stride_channel_y;
dst += channel *stride_channel_dst;
const half2 * x2 = (const half2 *) x;
const float2 * y2 = (const float2 *) y;
extern __shared__ char data_mmv[];
float * buf_iw = (float *) data_mmv;
if (block_size > WARP_SIZE) {
if (tid < WARP_SIZE) {
buf_iw[tid] = 0.0f;
}
__syncthreads();
}
float sumf;
if (std::is_same<type_acc, float>::value) {
sumf = 0.0f;
for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
const float2 tmpx = __half22float2(x2[col2]);
const float2 tmpy = y2[col2];
sumf += tmpx.x * tmpy.x;
sumf += tmpx.y * tmpy.y;
}
} else {
#ifdef FP16_AVAILABLE
half2 sumh2 = make_half2(0.0f, 0.0f);
for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
const float2 tmp = y2[col2];
sumh2 += x2[col2] * make_half2(tmp.x, tmp.y);
}
sumf = __low2float(sumh2) + __high2float(sumh2);
#else
NO_DEVICE_CODE;
#endif // FP16_AVAILABLE
}
sumf = warp_reduce_sum(sumf);
if (block_size > WARP_SIZE) {
buf_iw[tid/WARP_SIZE] = sumf;
__syncthreads();
if (tid >= WARP_SIZE) {
return;
}
sumf = buf_iw[tid];
sumf = warp_reduce_sum(sumf);
}
if (tid != 0) {
return;
}
dst[row] = sumf;
}
template <typename type_acc>
static void launch_mul_mat_vec_cuda(
const half * x, const float * y, float * dst,
const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
cudaStream_t stream) {
GGML_ASSERT(ncols % 2 == 0);
GGML_ASSERT(stride_row % 2 == 0);
GGML_ASSERT(nchannels_y % nchannels_x == 0);
const int64_t channel_ratio = nchannels_y / nchannels_x;
int64_t block_size_best = WARP_SIZE;
int64_t niter_best = (ncols + 2*WARP_SIZE - 1) / (2*WARP_SIZE);
for (int64_t block_size = 2*WARP_SIZE; block_size <= 256; block_size += WARP_SIZE) {
const int64_t niter = (ncols + 2*block_size - 1) / (2*block_size);
if (niter < niter_best) {
niter_best = niter;
block_size_best = block_size;
}
}
const int smem = WARP_SIZE*sizeof(float);
const dim3 block_nums(nrows, 1, nchannels_y);
const dim3 block_dims(block_size_best, 1, 1);
switch (block_size_best) {
case 32: {
mul_mat_vec<type_acc, 32><<<block_nums, block_dims, smem, stream>>>
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
} break;
case 64: {
mul_mat_vec<type_acc, 64><<<block_nums, block_dims, smem, stream>>>
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
} break;
case 96: {
mul_mat_vec<type_acc, 96><<<block_nums, block_dims, smem, stream>>>
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
} break;
case 128: {
mul_mat_vec<type_acc, 128><<<block_nums, block_dims, smem, stream>>>
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
} break;
case 160: {
mul_mat_vec<type_acc, 160><<<block_nums, block_dims, smem, stream>>>
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
} break;
case 192: {
mul_mat_vec<type_acc, 192><<<block_nums, block_dims, smem, stream>>>
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
} break;
case 224: {
mul_mat_vec<type_acc, 224><<<block_nums, block_dims, smem, stream>>>
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
} break;
case 256: {
mul_mat_vec<type_acc, 256><<<block_nums, block_dims, smem, stream>>>
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
} break;
default: {
GGML_ABORT("fatal error");
} break;
}
}
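The loop above simply picks the block size from {32, 64, ..., 256} that minimizes the number of half2 iterations each thread performs over the row. A standalone rerun of that search with an example column count (value assumed for illustration):

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t WARP_SIZE = 32;   // as in common.cuh
    const int64_t ncols     = 4096; // example column count

    int64_t block_size_best = WARP_SIZE;
    int64_t niter_best      = (ncols + 2*WARP_SIZE - 1) / (2*WARP_SIZE);
    for (int64_t block_size = 2*WARP_SIZE; block_size <= 256; block_size += WARP_SIZE) {
        const int64_t niter = (ncols + 2*block_size - 1) / (2*block_size);
        if (niter < niter_best) {
            niter_best      = niter;
            block_size_best = block_size;
        }
    }
    // For ncols = 4096: 256 threads per block, 8 half2 iterations per thread.
    printf("best block size %lld, %lld iterations per thread\n",
           (long long) block_size_best, (long long) niter_best);
    return 0;
}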
static void mul_mat_vec_cuda(
const half * x, const float * y, float * dst,
const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
enum ggml_prec prec, cudaStream_t stream) {
switch (prec) {
case GGML_PREC_DEFAULT: {
launch_mul_mat_vec_cuda<half>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
stride_channel_x, stride_channel_y, stride_channel_dst, stream);
} break;
case GGML_PREC_F32: {
launch_mul_mat_vec_cuda<float>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
stride_channel_x, stride_channel_y, stride_channel_dst, stream);
} break;
}
}
void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
GGML_ASSERT(src1->ne[1] == 1);
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
const half * src0_d = (const half *) src0->data;
const float * src1_d = (const float *) src1->data;
float * dst_d = (float *) dst->data;
const int64_t ne02 = src0->ne[2];
const int64_t ne12 = src1->ne[2];
GGML_ASSERT(dst->ne[2] == ne12);
GGML_ASSERT(src0->ne[3] == 1);
GGML_ASSERT(src1->ne[3] == 1);
GGML_ASSERT( dst->ne[3] == 1);
const int64_t stride_row = src0->nb[1] / ggml_type_size(src0->type);
const int64_t channel_stride_x = src0->nb[2] / ggml_type_size(src0->type);
const int64_t channel_stride_y = src1->nb[2] / ggml_type_size(src1->type);
const int64_t channel_stride_dst = dst->nb[2] / ggml_type_size( dst->type);
mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12, channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream());
}
void ggml_cuda_op_mul_mat_vec(
ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, cudaStream_t stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
const int64_t ne00 = src0->ne[0];
const int64_t row_diff = row_high - row_low;
GGML_ASSERT(src1_ncols == 1);
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
// ggml_cuda_op provides single, contiguous matrices
const int64_t stride_row = ne00;
const int64_t nchannels_x = 1;
const int64_t nchannels_y = 1;
const int64_t channel_stride_x = 0;
const int64_t channel_stride_y = 0;
const int64_t channel_stride_dst = 0;
mul_mat_vec_cuda((const half *) src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
nchannels_x, nchannels_y, channel_stride_x, channel_stride_y, channel_stride_dst, prec, stream);
GGML_UNUSED(ctx);
GGML_UNUSED(src1);
GGML_UNUSED(dst);
GGML_UNUSED(src1_ddq_i);
GGML_UNUSED(src1_ncols);
GGML_UNUSED(src1_padded_row_size);
}

View File

@ -1,12 +0,0 @@
#include "common.cuh"
// maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available
#define MMV_MAX_ROWS 512
void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
void ggml_cuda_op_mul_mat_vec(
ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, cudaStream_t stream);

View File

@ -48,10 +48,10 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
}
template <ggml_type type, int ncols_y>
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
// tell the compiler to use as many registers as it wants, see nwarps definition below
__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void mul_mat_vec_q(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
@ -62,13 +62,13 @@ static __global__ void mul_mat_vec_q(
constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
constexpr int nwarps = 1;
constexpr int rows_per_cuda_block = 1;
#else
constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
const int row0 = rows_per_cuda_block*blockIdx.x;

Some files were not shown because too many files have changed in this diff.