whisper : remove ggml source tree

This commit removes the ggml source tree from the repository; ggml is to be provided by a git submodule of https://github.com/ggml-org/ggml instead. Refs: https://github.com/ggerganov/whisper.cpp/issues/2785
2025-05-02 08:43:02 +00:00 · 2025-04-03 14:47:38 +02:00 · 2025-04-03 14:47:38 +02:00 · 6e142c3247
commit 6e142c3247
parent eac1bc9c47
527 changed files with 0 additions and 174485 deletions
--- a/ggml/.gitignore
+++ b/ggml/.gitignore
@@ -1 +0,0 @@
-src/ggml-metal-embed.metal
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -1,362 +0,0 @@
-cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
-project("ggml" C CXX)
-include(CheckIncludeFileCXX)
-
-set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
-
-if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
-    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
-endif()
-
-if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
-    set(GGML_STANDALONE ON)
-
-    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
-
-    # configure project version
-    # TODO
-else()
-    set(GGML_STANDALONE OFF)
-endif()
-
-if (EMSCRIPTEN)
-    set(BUILD_SHARED_LIBS_DEFAULT OFF)
-
-    option(GGML_WASM_SINGLE_FILE "ggml: embed WASM inside the generated ggml.js" ON)
-else()
-    if (MINGW)
-        set(BUILD_SHARED_LIBS_DEFAULT OFF)
-    else()
-        set(BUILD_SHARED_LIBS_DEFAULT ON)
-    endif()
-endif()
-
-# remove the lib prefix on win32 mingw
-if (WIN32)
-    set(CMAKE_STATIC_LIBRARY_PREFIX "")
-    set(CMAKE_SHARED_LIBRARY_PREFIX "")
-    set(CMAKE_SHARED_MODULE_PREFIX  "")
-endif()
-
-option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
-option(GGML_BACKEND_DL   "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
-
-#
-# option list
-#
-
-# TODO: mark all options as advanced when not GGML_STANDALONE
-
-if (APPLE)
-    set(GGML_METAL_DEFAULT ON)
-    set(GGML_BLAS_DEFAULT ON)
-    set(GGML_BLAS_VENDOR_DEFAULT "Apple")
-else()
-    set(GGML_METAL_DEFAULT OFF)
-    set(GGML_BLAS_DEFAULT OFF)
-    set(GGML_BLAS_VENDOR_DEFAULT "Generic")
-endif()
-
-if (CMAKE_CROSSCOMPILING OR DEFINED ENV{SOURCE_DATE_EPOCH})
-    message(STATUS "Setting GGML_NATIVE_DEFAULT to OFF")
-    set(GGML_NATIVE_DEFAULT OFF)
-else()
-    set(GGML_NATIVE_DEFAULT ON)
-endif()
-
-# defaults
-if (NOT GGML_LLAMAFILE_DEFAULT)
-    set(GGML_LLAMAFILE_DEFAULT OFF)
-endif()
-
-if (NOT GGML_CUDA_GRAPHS_DEFAULT)
-    set(GGML_CUDA_GRAPHS_DEFAULT OFF)
-endif()
-
-# general
-option(GGML_STATIC "ggml: static link libraries"                     OFF)
-option(GGML_NATIVE "ggml: optimize the build for the current system" ${GGML_NATIVE_DEFAULT})
-option(GGML_LTO    "ggml: enable link time optimization"             OFF)
-option(GGML_CCACHE "ggml: use ccache if available"                   ON)
-
-# debug
-option(GGML_ALL_WARNINGS           "ggml: enable all compiler warnings"                   ON)
-option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF)
-option(GGML_GPROF                  "ggml: enable gprof"                                   OFF)
-
-# build
-option(GGML_FATAL_WARNINGS    "ggml: enable -Werror flag"    OFF)
-
-# sanitizers
-option(GGML_SANITIZE_THREAD    "ggml: enable thread sanitizer"    OFF)
-option(GGML_SANITIZE_ADDRESS   "ggml: enable address sanitizer"   OFF)
-option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
-
-# instruction set specific
-if (GGML_NATIVE OR NOT GGML_NATIVE_DEFAULT)
-    set(INS_ENB OFF)
-else()
-    set(INS_ENB ON)
-endif()
-
-message(DEBUG "GGML_NATIVE         : ${GGML_NATIVE}")
-message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
-message(DEBUG "INS_ENB             : ${INS_ENB}")
-
-option(GGML_CPU_HBM          "ggml: use memkind for CPU HBM" OFF)
-option(GGML_CPU_AARCH64      "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
-option(GGML_CPU_KLEIDIAI     "ggml: use KleidiAI optimized kernels if applicable" OFF)
-option(GGML_AVX              "ggml: enable AVX"              ${INS_ENB})
-option(GGML_AVX_VNNI         "ggml: enable AVX-VNNI"         OFF)
-option(GGML_AVX2             "ggml: enable AVX2"             ${INS_ENB})
-option(GGML_BMI2             "ggml: enable BMI2"             ${INS_ENB})
-option(GGML_AVX512           "ggml: enable AVX512F"          OFF)
-option(GGML_AVX512_VBMI      "ggml: enable AVX512-VBMI"      OFF)
-option(GGML_AVX512_VNNI      "ggml: enable AVX512-VNNI"      OFF)
-option(GGML_AVX512_BF16      "ggml: enable AVX512-BF16"      OFF)
-if (NOT MSVC)
-    # in MSVC F16C and FMA is implied with AVX2/AVX512
-    option(GGML_FMA          "ggml: enable FMA"              ${INS_ENB})
-    option(GGML_F16C         "ggml: enable F16C"             ${INS_ENB})
-    # MSVC does not seem to support AMX
-    option(GGML_AMX_TILE     "ggml: enable AMX-TILE"         OFF)
-    option(GGML_AMX_INT8     "ggml: enable AMX-INT8"         OFF)
-    option(GGML_AMX_BF16     "ggml: enable AMX-BF16"         OFF)
-endif()
-option(GGML_LASX             "ggml: enable lasx"             ON)
-option(GGML_LSX              "ggml: enable lsx"              ON)
-option(GGML_RVV              "ggml: enable rvv"              ON)
-option(GGML_RV_ZFH           "ggml: enable riscv zfh"        OFF)
-option(GGML_VXE              "ggml: enable vxe"              ON)
-
-option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
-set(GGML_CPU_ARM_ARCH        "" CACHE STRING "ggml: CPU architecture for ARM")
-set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
-
-
-if (WIN32)
-    set(GGML_WIN_VER "0x602" CACHE STRING   "ggml: Windows version")
-endif()
-
-# ggml core
-set(GGML_SCHED_MAX_COPIES  "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
-option(GGML_CPU                             "ggml: enable CPU backend"                        ON)
-
-# 3rd party libs / backends
-option(GGML_ACCELERATE                      "ggml: enable Accelerate framework"               ON)
-option(GGML_BLAS                            "ggml: use BLAS"                                  ${GGML_BLAS_DEFAULT})
-set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
-                                            "ggml: BLAS library vendor")
-option(GGML_LLAMAFILE                       "ggml: use LLAMAFILE"                             ${GGML_LLAMAFILE_DEFAULT})
-
-option(GGML_CUDA                            "ggml: use CUDA"                                  OFF)
-option(GGML_MUSA                            "ggml: use MUSA"                                  OFF)
-option(GGML_CUDA_FORCE_MMQ                  "ggml: use mmq kernels instead of cuBLAS"         OFF)
-option(GGML_CUDA_FORCE_CUBLAS               "ggml: always use cuBLAS instead of mmq kernels"  OFF)
-option(GGML_CUDA_F16                        "ggml: use 16 bit floats for some calculations"   OFF)
-set   (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
-                                            "ggml: max. batch size for using peer access")
-option(GGML_CUDA_NO_PEER_COPY               "ggml: do not use peer to peer copies"            OFF)
-option(GGML_CUDA_NO_VMM                     "ggml: do not try to use CUDA VMM"                OFF)
-option(GGML_CUDA_FA                         "ggml: compile ggml FlashAttention CUDA kernels"  ON)
-option(GGML_CUDA_FA_ALL_QUANTS              "ggml: compile all quants for FlashAttention"     OFF)
-option(GGML_CUDA_GRAPHS                     "ggml: use CUDA graphs (llama.cpp only)"          ${GGML_CUDA_GRAPHS_DEFAULT})
-set   (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING
-                                            "ggml: cuda link binary compression mode; requires cuda 12.8+")
-set_property(CACHE GGML_CUDA_COMPRESSION_MODE PROPERTY STRINGS "none;speed;balance;size")
-
-option(GGML_HIP                             "ggml: use HIP"                                   OFF)
-option(GGML_HIP_GRAPHS                      "ggml: use HIP graph, experimental, slow"         OFF)
-option(GGML_HIP_NO_VMM                      "ggml: do not try to use HIP VMM"                 ON)
-option(GGML_HIP_ROCWMMA_FATTN               "ggml: enable rocWMMA for FlashAttention"         OFF)
-option(GGML_HIP_UMA                         "ggml: use HIP unified memory architecture"       OFF)
-option(GGML_VULKAN                          "ggml: use Vulkan"                                OFF)
-option(GGML_VULKAN_CHECK_RESULTS            "ggml: run Vulkan op checks"                      OFF)
-option(GGML_VULKAN_DEBUG                    "ggml: enable Vulkan debug output"                OFF)
-option(GGML_VULKAN_MEMORY_DEBUG             "ggml: enable Vulkan memory debug output"         OFF)
-option(GGML_VULKAN_SHADER_DEBUG_INFO        "ggml: enable Vulkan shader debug info"           OFF)
-option(GGML_VULKAN_PERF                     "ggml: enable Vulkan perf output"                 OFF)
-option(GGML_VULKAN_VALIDATE                 "ggml: enable Vulkan validation"                  OFF)
-option(GGML_VULKAN_RUN_TESTS                "ggml: run Vulkan tests"                          OFF)
-option(GGML_KOMPUTE                         "ggml: use Kompute"                               OFF)
-option(GGML_METAL                           "ggml: use Metal"                                 ${GGML_METAL_DEFAULT})
-option(GGML_METAL_USE_BF16                  "ggml: use bfloat if available"                   OFF)
-option(GGML_METAL_NDEBUG                    "ggml: disable Metal debugging"                   OFF)
-option(GGML_METAL_SHADER_DEBUG              "ggml: compile Metal with -fno-fast-math"         OFF)
-option(GGML_METAL_EMBED_LIBRARY             "ggml: embed Metal library"                       ${GGML_METAL})
-set   (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
-                                            "ggml: metal minimum macOS version")
-set   (GGML_METAL_STD "" CACHE STRING       "ggml: metal standard version (-std flag)")
-option(GGML_OPENMP                          "ggml: use OpenMP"                                ON)
-option(GGML_RPC                             "ggml: use RPC"                                   OFF)
-option(GGML_SYCL                            "ggml: use SYCL"                                  OFF)
-option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
-option(GGML_SYCL_GRAPH                      "ggml: enable graphs in the SYCL backend"         ON)
-set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
-                                            "ggml: sycl target device")
-set   (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
-                                            "ggml: sycl device architecture")
-
-option(GGML_OPENCL                          "ggml: use OpenCL"                                OFF)
-option(GGML_OPENCL_PROFILING                "ggml: use OpenCL profiling (increases overhead)" OFF)
-option(GGML_OPENCL_EMBED_KERNELS            "ggml: embed kernels"                             ON)
-option(GGML_OPENCL_USE_ADRENO_KERNELS       "ggml: use optimized kernels for Adreno"          ON)
-set   (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
-                                            "gmml: OpenCL API version to target")
-
-# toolchain for vulkan-shaders-gen
-set   (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
-
-# extra artifacts
-option(GGML_BUILD_TESTS    "ggml: build tests"    ${GGML_STANDALONE})
-option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
-
-#
-# dependencies
-#
-
-set(CMAKE_C_STANDARD 11)
-set(CMAKE_C_STANDARD_REQUIRED true)
-
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_STANDARD_REQUIRED true)
-
-set(THREADS_PREFER_PTHREAD_FLAG ON)
-
-find_package(Threads REQUIRED)
-
-include(GNUInstallDirs)
-
-#
-# build the library
-#
-
-add_subdirectory(src)
-
-#
-# tests and examples
-#
-
-if (GGML_BUILD_TESTS)
-    enable_testing()
-    add_subdirectory(tests)
-endif ()
-
-if (GGML_BUILD_EXAMPLES)
-    add_subdirectory(examples)
-endif ()
-
-#
-# install
-#
-
-include(CMakePackageConfigHelpers)
-
-# all public headers
-set(GGML_PUBLIC_HEADERS
-    include/ggml.h
-    include/ggml-cpu.h
-    include/ggml-alloc.h
-    include/ggml-backend.h
-    include/ggml-blas.h
-    include/ggml-cann.h
-    include/ggml-cpp.h
-    include/ggml-cuda.h
-    include/ggml-kompute.h
-    include/ggml-opt.h
-    include/ggml-metal.h
-    include/ggml-rpc.h
-    include/ggml-sycl.h
-    include/ggml-vulkan.h
-    include/gguf.h)
-
-set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
-#if (GGML_METAL)
-#    set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
-#endif()
-install(TARGETS ggml LIBRARY PUBLIC_HEADER)
-install(TARGETS ggml-base LIBRARY)
-
-if (GGML_STANDALONE)
-    configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in
-        ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
-        @ONLY)
-
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
-        DESTINATION share/pkgconfig)
-endif()
-
-#
-# Create CMake package
-#
-
-# Generate version info based on git commit.
-
-if(NOT DEFINED GGML_BUILD_NUMBER)
-    find_program(GIT_EXE NAMES git git.exe REQUIRED NO_CMAKE_FIND_ROOT_PATH)
-    execute_process(COMMAND ${GIT_EXE} rev-list --count HEAD
-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-        OUTPUT_VARIABLE GGML_BUILD_NUMBER
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-    )
-
-    if(GGML_BUILD_NUMBER EQUAL 1)
-        message(WARNING "GGML build version fixed at 1 likely due to a shallow clone.")
-    endif()
-
-    execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-        OUTPUT_VARIABLE GGML_BUILD_COMMIT
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-    )
-endif()
-
-
-# Capture variables prefixed with GGML_.
-
-set(variable_set_statements
-"
-####### Expanded from @GGML_VARIABLES_EXPANED@ by configure_package_config_file() #######
-####### Any changes to this file will be overwritten by the next CMake run        #######
-
-")
-
-set(GGML_SHARED_LIB ${BUILD_SHARED_LIBS})
-
-get_cmake_property(all_variables VARIABLES)
-foreach(variable_name IN LISTS all_variables)
-    if(variable_name MATCHES "^GGML_")
-        string(REPLACE ";" "\\;"
-               variable_value "${${variable_name}}")
-
-        set(variable_set_statements
-            "${variable_set_statements}set(${variable_name} \"${variable_value}\")\n")
-    endif()
-endforeach()
-
-set(GGML_VARIABLES_EXPANDED ${variable_set_statements})
-
-# Create the CMake package and set install location.
-
-set(GGML_INSTALL_VERSION 0.0.${GGML_BUILD_NUMBER})
-set(GGML_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header  files")
-set(GGML_LIB_INSTALL_DIR     ${CMAKE_INSTALL_LIBDIR}     CACHE PATH "Location of library files")
-set(GGML_BIN_INSTALL_DIR     ${CMAKE_INSTALL_BINDIR}     CACHE PATH "Location of binary  files")
-
-configure_package_config_file(
-        ${CMAKE_CURRENT_SOURCE_DIR}/cmake/ggml-config.cmake.in
-        ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
-    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml
-    PATH_VARS GGML_INCLUDE_INSTALL_DIR
-              GGML_LIB_INSTALL_DIR
-              GGML_BIN_INSTALL_DIR)
-
-write_basic_package_version_file(
-        ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
-    VERSION ${GGML_INSTALL_VERSION}
-    COMPATIBILITY SameMajorVersion)
-
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
-              ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
-        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
--- a/ggml/cmake/BuildTypes.cmake
+++ b/ggml/cmake/BuildTypes.cmake
@@ -1,54 +0,0 @@
-# Add new build types
-
-# ReleaseGG - Release with enabled asserts
-
-SET(CMAKE_CXX_FLAGS_RELEASEGG
-    "-O3"
-    CACHE STRING "Flags used by the c++ compiler during release builds with enabled asserts."
-    FORCE )
-SET(CMAKE_C_FLAGS_RELEASEGG
-    "-O3"
-    CACHE STRING "Flags used by the compiler during release builds with enabled asserts."
-    FORCE )
-SET(CMAKE_EXE_LINKER_FLAGS_RELEASEGG
-    ""
-    CACHE STRING "Flags used for linking binaries during release builds with enabled asserts."
-    FORCE )
-SET(CMAKE_SHARED_LINKER_FLAGS_RELEASEGG
-    ""
-    CACHE STRING "Flags used by the shared libraries linker during release builds with enabled asserts."
-    FORCE )
-MARK_AS_ADVANCED(
-    CMAKE_CXX_FLAGS_RELEASEGG
-    CMAKE_C_FLAGS_RELEASEGG
-    CMAKE_EXE_LINKER_FLAGS_RELEASEGG
-    CMAKE_SHARED_LINKER_FLAGS_RELEASEGG )
-
-# RelWithDebInfoGG - RelWithDebInfo with enabled asserts
-
-SET(CMAKE_CXX_FLAGS_RELWITHDEBINFOGG
-    "-O2 -g"
-    CACHE STRING "Flags used by the c++ compiler during release builds with debug symbols and enabled asserts."
-    FORCE )
-SET(CMAKE_C_FLAGS_RELWITHDEBINFOGG
-    "-O2 -g"
-    CACHE STRING "Flags used by the compiler during release builds with debug symbols and enabled asserts."
-    FORCE )
-SET(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFOGG
-    ""
-    CACHE STRING "Flags used for linking binaries during release builds with debug symbols and enabled asserts."
-    FORCE )
-SET(CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFOGG
-    ""
-    CACHE STRING "Flags used by the shared libraries linker during release builds with debug symbols and enabled asserts."
-    FORCE )
-MARK_AS_ADVANCED(
-    CMAKE_CXX_FLAGS_RELWITHDEBINFOGG
-    CMAKE_C_FLAGS_RELWITHDEBINFOGG
-    CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFOGG
-    CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFOGG )
-
-if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
-    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo" "ReleaseGG" "RelWithDebInfoGG")
-endif()
--- a/ggml/cmake/GitVars.cmake
+++ b/ggml/cmake/GitVars.cmake
@@ -1,22 +0,0 @@
-find_package(Git)
-
-# the commit's SHA1
-execute_process(COMMAND
-    "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
-    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
-    OUTPUT_VARIABLE GIT_SHA1
-    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-# the date of the commit
-execute_process(COMMAND
-    "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
-    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
-    OUTPUT_VARIABLE GIT_DATE
-    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-# the subject of the commit
-execute_process(COMMAND
-    "${GIT_EXECUTABLE}" log -1 --format=%s
-    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
-    OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
-    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
--- a/ggml/cmake/common.cmake
+++ b/ggml/cmake/common.cmake
@@ -1,26 +0,0 @@
-function(ggml_get_flags CCID CCVER)
-    set(C_FLAGS "")
-    set(CXX_FLAGS "")
-
-    if (CCID MATCHES "Clang")
-        set(C_FLAGS   -Wunreachable-code-break -Wunreachable-code-return)
-        set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
-
-        if (
-            (CCID STREQUAL "Clang"      AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
-            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
-        )
-            list(APPEND C_FLAGS -Wdouble-promotion)
-        endif()
-    elseif (CCID STREQUAL "GNU")
-        set(C_FLAGS   -Wdouble-promotion)
-        set(CXX_FLAGS -Wno-array-bounds)
-
-        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
-            list(APPEND CXX_FLAGS -Wextra-semi)
-        endif()
-    endif()
-
-    set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
-    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
-endfunction()
--- a/ggml/cmake/ggml-config.cmake.in
+++ b/ggml/cmake/ggml-config.cmake.in
@@ -1,152 +0,0 @@
-
-@GGML_VARIABLES_EXPANDED@
-
-@PACKAGE_INIT@
-
-set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@")
-set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
-#set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
-
-find_package(Threads REQUIRED)
-
-find_library(GGML_LIBRARY ggml
-    REQUIRED
-    HINTS ${GGML_LIB_DIR}
-    NO_CMAKE_FIND_ROOT_PATH)
-
-add_library(ggml::ggml UNKNOWN IMPORTED)
-set_target_properties(ggml::ggml
-    PROPERTIES
-        IMPORTED_LOCATION "${GGML_LIBRARY}")
-
-find_library(GGML_BASE_LIBRARY ggml-base
-    REQUIRED
-    HINTS ${GGML_LIB_DIR}
-    NO_CMAKE_FIND_ROOT_PATH)
-
-add_library(ggml::ggml-base UNKNOWN IMPORTED)
-set_target_properties(ggml::ggml-base
-    PROPERTIES
-        IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
-
-if (NOT GGML_SHARED_LIB)
-    if (APPLE AND GGML_ACCELERATE)
-        find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
-        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${ACCELERATE_FRAMEWORK})
-    endif()
-
-    if (GGML_OPENMP)
-        find_package(OpenMP REQUIRED)
-        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
-    endif()
-
-    if (GGML_CPU_HBM)
-        find_library(memkind memkind REQUIRED)
-        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES memkind)
-    endif()
-
-    if (GGML_BLAS)
-        find_package(BLAS REQUIRED)
-        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES})
-        list(APPEND GGML_CPU_INTERFACE_LINK_OPTIONS   ${BLAS_LINKER_FLAGS})
-    endif()
-
-    if (GGML_CUDA)
-        find_package(CUDAToolkit REQUIRED)
-    endif()
-
-    if (GGML_METAL)
-        find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-        find_library(METAL_FRAMEWORK    Metal REQUIRED)
-        find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
-
-        list(APPEND GGML_METAL_INTERFACE_LINK_LIBRARIES
-                    ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
-    endif()
-
-    if (GGML_VULKAN)
-        find_package(Vulkan REQUIRED)
-        list(APPEND GGML_VULKAN_INTERFACE_LINK_LIBRARIES Vulkan::Vulkan)
-    endif()
-
-    if (GGML_HIP)
-        find_package(hip     REQUIRED)
-        find_package(hipblas REQUIRED)
-        find_package(rocblas REQUIRED)
-        list(APPEND GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host roc::rocblas roc::hipblas)
-    endif()
-
-    if (GGML_SYCL)
-        find_package(DNNL)
-        if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
-            list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES DNNL::dnnl)
-        endif()
-        if (WIN32)
-            find_package(IntelSYCL REQUIRED)
-            find_package(MKL       REQUIRED)
-            list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
-        endif()
-    endif()
-endif()
-
-set(_ggml_all_targets "")
-foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
-    string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
-    string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx)
-
-    find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend}
-        REQUIRED
-        HINTS ${GGML_LIB_DIR}
-        NO_CMAKE_FIND_ROOT_PATH)
-
-    message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}")
-
-    add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED)
-    set_target_properties(ggml::${_ggml_backend}
-        PROPERTIES
-            INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}"
-            IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
-            IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}"
-            INTERFACE_COMPILE_FEATURES c_std_90
-            POSITION_INDEPENDENT_CODE ON)
-
-    string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}")
-    if(is_cpu_variant)
-        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
-        set_target_properties(ggml::${_ggml_backend}
-           PROPERTIES
-               INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}")
-
-        if(GGML_CPU_INTERFACE_LINK_OPTIONS)
-            set_target_properties(ggml::${_ggml_backend}
-                PROPERTIES
-                    INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}")
-        endif()
-
-    else()
-        list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
-        set_target_properties(ggml::${_ggml_backend}
-            PROPERTIES
-                INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}")
-
-        if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS)
-            set_target_properties(ggml::${_ggml_backend}
-                PROPERTIES
-                    INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}")
-        endif()
-    endif()
-
-    list(APPEND _ggml_all_targets ggml::${_ggml_backend})
-endforeach()
-
-list(APPEND GGML_INTERFACE_LINK_LIBRARIES ggml::ggml-base "${_ggml_all_targets}")
-set_target_properties(ggml::ggml
-    PROPERTIES
-        INTERFACE_LINK_LIBRARIES "${GGML_INTERFACE_LINK_LIBRARIES}")
-
-add_library(ggml::all INTERFACE IMPORTED)
-set_target_properties(ggml::all
-    PROPERTIES
-        INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}")
-
-check_required_components(ggml)
--- a/ggml/include/ggml-alloc.h
+++ b/ggml/include/ggml-alloc.h
@@ -1,76 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
-typedef struct      ggml_backend_buffer * ggml_backend_buffer_t;
-typedef struct             ggml_backend * ggml_backend_t;
-
-// Tensor allocator
-struct ggml_tallocr {
-    ggml_backend_buffer_t buffer;
-    void * base;
-    size_t alignment;
-    size_t offset;
-};
-
-GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
-GGML_API enum ggml_status    ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
-
-// Graph allocator
-/*
-  Example usage:
-    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
-
-    // optional: create a worst-case graph and reserve the buffers to avoid reallocations
-    ggml_gallocr_reserve(galloc, build_graph(max_batch));
-
-    // allocate the graph
-    struct ggml_cgraph * graph = build_graph(batch);
-    ggml_gallocr_alloc_graph(galloc, graph);
-
-    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
-
-    // evaluate the graph
-    ggml_backend_graph_compute(backend, graph);
-*/
-
-// special tensor flags for use with the graph allocator:
-//   ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
-//   ggml_set_output(): output tensors are never freed and never overwritten
-
-typedef struct ggml_gallocr * ggml_gallocr_t;
-
-GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
-GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
-GGML_API void           ggml_gallocr_free(ggml_gallocr_t galloc);
-
-// pre-allocate buffers from a measure graph - does not allocate or modify the graph
-// call with a worst-case graph to avoid buffer reallocations
-// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
-// returns false if the buffer allocation failed
-GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
-GGML_API bool ggml_gallocr_reserve_n(
-    ggml_gallocr_t galloc,
-    struct ggml_cgraph * graph,
-    const int * node_buffer_ids,
-    const int * leaf_buffer_ids);
-
-// automatic reallocation if the topology changes when using a single buffer
-// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
-GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
-
-GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
-
-// Utils
-// Create a buffer and allocate all the tensors in a ggml_context
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
-
-#ifdef  __cplusplus
-}
-#endif
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@ -1,354 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-alloc.h"
-
-#ifdef GGML_BACKEND_SHARED
-#    if defined(_WIN32) && !defined(__MINGW32__)
-#        ifdef GGML_BACKEND_BUILD
-#            define GGML_BACKEND_API __declspec(dllexport) extern
-#        else
-#            define GGML_BACKEND_API __declspec(dllimport) extern
-#        endif
-#    else
-#        define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
-#    endif
-#else
-#    define GGML_BACKEND_API extern
-#endif
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-    typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
-    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
-    typedef struct ggml_backend_event * ggml_backend_event_t;
-    typedef struct ggml_backend * ggml_backend_t;
-    typedef void * ggml_backend_graph_plan_t;
-    typedef struct ggml_backend_reg * ggml_backend_reg_t;
-    typedef struct ggml_backend_device * ggml_backend_dev_t;
-
-
-    //
-    // Backend buffer type
-    //
-
-    GGML_API const char *          ggml_backend_buft_name          (ggml_backend_buffer_type_t buft);
-    GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer  (ggml_backend_buffer_type_t buft, size_t size);
-    GGML_API size_t                ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
-    GGML_API size_t                ggml_backend_buft_get_max_size  (ggml_backend_buffer_type_t buft);
-    GGML_API size_t                ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
-    GGML_API bool                  ggml_backend_buft_is_host       (ggml_backend_buffer_type_t buft);
-    GGML_API ggml_backend_dev_t    ggml_backend_buft_get_device    (ggml_backend_buffer_type_t buft);
-
-    //
-    // Backend buffer
-    //
-
-    enum ggml_backend_buffer_usage {
-        GGML_BACKEND_BUFFER_USAGE_ANY = 0,
-        GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
-        GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
-    };
-
-    GGML_API const char *                   ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
-    GGML_API void                           ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
-    GGML_API void *                         ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
-    GGML_API size_t                         ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
-    GGML_API enum ggml_status               ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API size_t                         ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
-    GGML_API size_t                         ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
-    GGML_API size_t                         ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API void                           ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
-    GGML_API bool                           ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
-    GGML_API void                           ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
-    GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage     (ggml_backend_buffer_t buffer);
-    GGML_API ggml_backend_buffer_type_t     ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
-    GGML_API void                           ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);
-
-    // tensor copy between different backends
-    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
-
-    //
-    // Backend (stream)
-    //
-
-    GGML_API ggml_guid_t  ggml_backend_guid(ggml_backend_t backend);
-    GGML_API const char * ggml_backend_name(ggml_backend_t backend);
-    GGML_API void         ggml_backend_free(ggml_backend_t backend);
-
-    GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
-    GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
-    GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);
-    GGML_API size_t                     ggml_backend_get_max_size(ggml_backend_t backend);
-
-    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-
-    // "offset" refers to the offset in tensor->data for setting/getting data
-    GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-    GGML_API void ggml_backend_tensor_memset(   struct ggml_tensor * tensor,     uint8_t value, size_t offset, size_t size);
-
-    GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
-
-    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
-    GGML_API void                      ggml_backend_graph_plan_free  (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-
-    GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-    GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-    // NOTE: will be removed, use device version instead
-    GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
-    GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
-    GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
-
-    // asynchronous copy
-    // the copy is performed after all the currently queued operations in backend_src
-    // backend_dst will wait for the copy to complete before performing other operations
-    // automatic fallback to sync copy if async is not supported
-    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
-
-    GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);
-
-    //
-    // Events
-    //
-
-    GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device);
-    GGML_API void                 ggml_backend_event_free(ggml_backend_event_t event);
-    GGML_API void                 ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend);
-    GGML_API void                 ggml_backend_event_synchronize(ggml_backend_event_t event);
-    GGML_API void                 ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event);
-
-    //
-    // Backend device
-    //
-
-    enum ggml_backend_dev_type {
-        // CPU device using system memory
-        GGML_BACKEND_DEVICE_TYPE_CPU,
-        // GPU device using dedicated memory
-        GGML_BACKEND_DEVICE_TYPE_GPU,
-        // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
-        GGML_BACKEND_DEVICE_TYPE_ACCEL
-    };
-
-    // functionality supported by the device
-    struct ggml_backend_dev_caps {
-        // asynchronous operations
-        bool async;
-        // pinned host buffer
-        bool host_buffer;
-        // creating buffers from host ptr
-        bool buffer_from_host_ptr;
-        // event synchronization
-        bool events;
-    };
-
-    // all the device properties
-    struct ggml_backend_dev_props {
-        const char * name;
-        const char * description;
-        size_t memory_free;
-        size_t memory_total;
-        enum ggml_backend_dev_type type;
-        struct ggml_backend_dev_caps caps;
-    };
-
-    GGML_API const char *                  ggml_backend_dev_name(ggml_backend_dev_t device);
-    GGML_API const char *                  ggml_backend_dev_description(ggml_backend_dev_t device);
-    GGML_API void                          ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total);
-    GGML_API enum ggml_backend_dev_type    ggml_backend_dev_type(ggml_backend_dev_t device);
-    GGML_API void                          ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
-    GGML_API ggml_backend_reg_t            ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
-    GGML_API ggml_backend_t                ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
-    GGML_API ggml_backend_buffer_type_t    ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
-    GGML_API ggml_backend_buffer_type_t    ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
-    GGML_API ggml_backend_buffer_t         ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
-
-    GGML_API bool                          ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
-    GGML_API bool                          ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft);
-    GGML_API bool                          ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
-
-    //
-    // Backend (reg)
-    //
-
-    GGML_API const char *       ggml_backend_reg_name(ggml_backend_reg_t reg);
-    GGML_API size_t             ggml_backend_reg_dev_count(ggml_backend_reg_t reg);
-    GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
-    GGML_API void *             ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
-
-    // Common functions that may be obtained using ggml_backend_reg_get_proc_address
-
-    // Split buffer type for tensor parallelism
-    typedef ggml_backend_buffer_type_t   (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
-    // Set the number of threads for the backend
-    typedef void                         (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
-    // Get additional buffer types provided by the device (returns a NULL-terminated array)
-    typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
-    // Set the abort callback for the backend
-    typedef void                         (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
-    // Get a list of feature flags supported by the backend (returns a NULL-terminated array)
-    struct ggml_backend_feature {
-        const char * name;
-        const char * value;
-    };
-    typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);
-
-    //
-    // Backend registry
-    //
-
-    GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
-
-    // Backend (reg) enumeration
-    GGML_API size_t             ggml_backend_reg_count(void);
-    GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);
-    GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name);
-
-    // Device enumeration
-    GGML_API size_t             ggml_backend_dev_count(void);
-    GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
-    GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
-    GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);
-
-    // Direct backend (stream) initialization
-    // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
-    GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
-    // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
-    GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
-    // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
-    GGML_API ggml_backend_t ggml_backend_init_best(void);
-
-    // Load a backend from a dynamic library and register it
-    GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
-    // Unload a backend if loaded dynamically and unregister it
-    GGML_API void               ggml_backend_unload(ggml_backend_reg_t reg);
-    // Load all known backends from dynamic libraries
-    GGML_API void               ggml_backend_load_all(void);
-    GGML_API void               ggml_backend_load_all_from_path(const char * dir_path);
-
-    //
-    // Backend scheduler
-    //
-
-    // The backend scheduler allows for multiple backend devices to be used together
-    // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
-    // The backends are selected based on:
-    // - the backend that supports the operation
-    // - the location of the pre-allocated tensors (e.g. the weights)
-    /*
-      Example usage:
-
-        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
-        // preferably to run on the same backend as the buffer
-        ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
-
-        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
-
-        // initialize buffers from a max size graph (optional)
-        reserve_graph = build_graph(sched, max_batch_size);
-
-        // manually assign nodes to a backend (optional, should not be needed in most cases)
-        struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
-        ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
-
-        ggml_backend_sched_reserve(sched, reserve_graph);
-
-        // compute
-        graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
-        for (int i = 0; i < 10; ++i) {
-            ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
-        }
-
-        // if there are graph inputs:
-        graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
-        ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
-        ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
-        ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
-        ggml_backend_sched_graph_compute(sched, graph); // execute the graph
-
-        // as an alternative to the above it is also possible to assign the inputs to a dedicated context and
-        // allocate them statically via ggml_backend_alloc_ctx_tensors
-
-    */
-
-    typedef struct ggml_backend_sched * ggml_backend_sched_t;
-
-    // Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback)
-    // when ask == true, the scheduler wants to know if the user wants to observe this node
-    // this allows the scheduler to batch nodes together in order to evaluate them in a single call
-    //
-    // when ask == false, the scheduler is passing the node tensor to the user for observation
-    // if the user returns false, the scheduler will cancel the graph compute
-    //
-    typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
-
-    // Initialize a backend scheduler, backends with low index are given priority over backends with high index
-    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
-    GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
-
-    // Initialize backend buffers from a measure graph
-    GGML_API bool                 ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
-
-    GGML_API int                  ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
-    GGML_API ggml_backend_t       ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
-
-    // Get the number of splits of the last graph
-    GGML_API int                  ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
-    GGML_API int                  ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
-
-    GGML_API size_t               ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
-
-    GGML_API void                 ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
-    GGML_API ggml_backend_t       ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
-
-    // Allocate and compute graph on the backend scheduler
-    GGML_API bool                 ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
-    GGML_API enum ggml_status     ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
-    GGML_API enum ggml_status     ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
-    GGML_API void                 ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
-
-    // Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
-    // This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
-    // The correct way to use this API is to discard the deallocated tensors and create new ones.
-    GGML_API void                 ggml_backend_sched_reset(ggml_backend_sched_t sched);
-
-    // Set a callback to be called for each resulting node during graph compute
-    GGML_API void                 ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
-
-    //
-    // Utils
-    //
-
-    struct ggml_backend_graph_copy {
-        ggml_backend_buffer_t buffer;
-        struct ggml_context * ctx_allocated;
-        struct ggml_context * ctx_unallocated;
-        struct ggml_cgraph * graph;
-    };
-
-    // Copy a graph to a different backend
-    GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
-    GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
-
-    typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
-
-    // Compare the output of two backends
-    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
-
-    // Tensor initialization
-    GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
-    GGML_API enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor);
-
-    // CPU buffer types are always available
-    GGML_API ggml_backend_buffer_t      ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
-    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
-
-#ifdef  __cplusplus
-}
-#endif
--- a/ggml/include/ggml-blas.h
+++ b/ggml/include/ggml-blas.h
@ -1,25 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-// backend API
-GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void);
-
-GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend);
-
-// number of threads used for conversion to float
-// for openblas and blis, this will also set the number of threads used for blas operations
-GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
-
-GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void);
-
-
-#ifdef  __cplusplus
-}
-#endif
--- a/ggml/include/ggml-cann.h
+++ b/ggml/include/ggml-cann.h
@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#pragma once
-
-#include "ggml-backend.h"
-#include "ggml.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/**
- * @brief Maximum number of CANN devices supported.
- */
-#define GGML_CANN_MAX_DEVICES 16
-
-GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void);
-
-/**
- * @brief Initializes the CANN backend for a specified device.
- *
- * This function initializes the CANN backend for the given device.
- * It verifies the device index, allocates a context, and creates a backend
- * instance.
- *
- * @param device The index of the device to initialize.
- * @return A pointer to the initialized backend instance, or nullptr on failure.
- */
-GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device);
-
-/**
- * @brief Checks if a given backend is a CANN backend.
- *
- * This function verifies if the provided backend is a CANN backend by comparing
- * its GUID with the CANN backend's GUID.
- *
- * @param backend The backend instance to check.
- * @return True if the backend is a CANN backend, false otherwise.
- */
-GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend);
-
-/**
- * @brief Retrieves the CANN buffer type for a specified device.
- *
- * This function initializes and returns the buffer type interface associated
- * with the given device. It ensures thread-safe access using a mutex.
- *
- * @param device The device index for which to retrieve the buffer type.
- * @return A pointer to the buffer type interface for the specified device, or
- * nullptr if the device index is out of range.
- */
-GGML_BACKEND_API ggml_backend_buffer_type_t
-ggml_backend_cann_buffer_type(int32_t device);
-
-/**
- * @brief Retrieves the number of CANN devices available.
- *
- * This function returns the number of CANN devices available based on
- * information obtained from `ggml_cann_info()`.
- *
- * @return The number of CANN devices available.
- */
-GGML_BACKEND_API int32_t ggml_backend_cann_get_device_count(void);
-
-/**
- * @brief Retrieves the pinned host buffer type for use with the CPU backend for faster copies between CPU and NPU.
- *
- * @return A pointer to the host buffer type interface.
- */
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
-
-/**
- * @brief Retrieves the description of a specific CANN device.
- *
- * This function sets the specified device, retrieves the SoC name,
- * and writes it into the provided description buffer.
- *
- * @param device The device index to retrieve the description for.
- * @param description Pointer to a buffer where the description will be written.
- * @param description_size Size of the description buffer.
- */
-GGML_BACKEND_API void ggml_backend_cann_get_device_description(
-    int32_t device, char* description, size_t description_size);
-
-/**
- * @brief Retrieves the memory information of a specific CANN device.
- *
- * This function sets the specified device, retrieves the free and total
- * memory information of the specified type (ACL_HBM_MEM), and stores them
- * in the provided pointers.
- *
- * @param device The device index to retrieve memory information for.
- * @param free Pointer to a variable where the free memory size will be stored.
- * @param total Pointer to a variable where the total memory size will be
- * stored.
- */
-GGML_BACKEND_API void ggml_backend_cann_get_device_memory(int32_t device,
-                                                  size_t* free,
-                                                  size_t* total);
-
-#ifdef __cplusplus
-}
-#endif
--- a/ggml/include/ggml-cpp.h
+++ b/ggml/include/ggml-cpp.h
@ -1,39 +0,0 @@
-#pragma once
-
-#ifndef __cplusplus
-#error "This header is for C++ only"
-#endif
-
-#include "ggml.h"
-#include "ggml-alloc.h"
-#include "ggml-backend.h"
-#include "gguf.h"
-#include <memory>
-
-// Smart pointers for ggml types
-
-// ggml
-
-struct ggml_context_deleter { void operator()(ggml_context * ctx) { ggml_free(ctx); } };
-struct gguf_context_deleter { void operator()(gguf_context * ctx) { gguf_free(ctx); } };
-
-typedef std::unique_ptr<ggml_context, ggml_context_deleter> ggml_context_ptr;
-typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
-
-// ggml-alloc
-
-struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
-// element type must be the struct itself: ggml_gallocr_t is already a pointer, so using it here would store a ggml_gallocr **
-typedef std::unique_ptr<ggml_gallocr, ggml_gallocr_deleter> ggml_gallocr_ptr;
-
-// ggml-backend
-
-struct ggml_backend_deleter        { void operator()(ggml_backend_t backend)       { ggml_backend_free(backend); } };
-struct ggml_backend_buffer_deleter { void operator()(ggml_backend_buffer_t buffer) { ggml_backend_buffer_free(buffer); } };
-struct ggml_backend_event_deleter  { void operator()(ggml_backend_event_t event)   { ggml_backend_event_free(event); } };
-struct ggml_backend_sched_deleter  { void operator()(ggml_backend_sched_t sched)   { ggml_backend_sched_free(sched); } };
-
-typedef std::unique_ptr<ggml_backend,        ggml_backend_deleter>        ggml_backend_ptr;
-typedef std::unique_ptr<ggml_backend_buffer, ggml_backend_buffer_deleter> ggml_backend_buffer_ptr;
-typedef std::unique_ptr<ggml_backend_event,  ggml_backend_event_deleter>  ggml_backend_event_ptr;
-typedef std::unique_ptr<ggml_backend_sched,  ggml_backend_sched_deleter>  ggml_backend_sched_ptr;
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@ -1,138 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-    // the compute plan that needs to be prepared for ggml_graph_compute()
-    // since https://github.com/ggml-org/ggml/issues/287
-    struct ggml_cplan {
-        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
-        uint8_t * work_data; // work buffer, must be allocated by the caller before calling `ggml_graph_compute()` when work_size > 0
-
-        int n_threads;
-        struct ggml_threadpool * threadpool;
-
-        // abort ggml_graph_compute when true
-        ggml_abort_callback abort_callback;
-        void *              abort_callback_data;
-    };
-
-    // numa strategies
-    enum ggml_numa_strategy {
-        GGML_NUMA_STRATEGY_DISABLED   = 0,
-        GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
-        GGML_NUMA_STRATEGY_ISOLATE    = 2,
-        GGML_NUMA_STRATEGY_NUMACTL    = 3,
-        GGML_NUMA_STRATEGY_MIRROR     = 4,
-        GGML_NUMA_STRATEGY_COUNT
-    };
-
-    GGML_BACKEND_API void    ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
-    GGML_BACKEND_API bool    ggml_is_numa(void); // true if init detected that system has >1 NUMA node
-
-    GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
-    GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
-
-    GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
-    GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
-
-    GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
-    GGML_BACKEND_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
-
-    GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
-    GGML_BACKEND_API void    ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
-
-    GGML_BACKEND_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
-    GGML_BACKEND_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
-
-    GGML_BACKEND_API float   ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
-    GGML_BACKEND_API void    ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
-
-    GGML_BACKEND_API struct ggml_threadpool *      ggml_threadpool_new           (struct ggml_threadpool_params  * params);
-    GGML_BACKEND_API void                          ggml_threadpool_free          (struct ggml_threadpool * threadpool);
-    GGML_BACKEND_API int                           ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
-    GGML_BACKEND_API void                          ggml_threadpool_pause         (struct ggml_threadpool * threadpool);
-    GGML_BACKEND_API void                          ggml_threadpool_resume        (struct ggml_threadpool * threadpool);
-
-    // ggml_graph_plan() has to be called before ggml_graph_compute()
-    // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_BACKEND_API struct ggml_cplan ggml_graph_plan(
-                  const struct ggml_cgraph * cgraph,
-                                       int   n_threads, /* = GGML_DEFAULT_N_THREADS */
-                    struct ggml_threadpool * threadpool /* = NULL */ );
-    GGML_BACKEND_API enum ggml_status  ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
-
-    // same as ggml_graph_compute() but the work data is allocated as a part of the context
-    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
-    GGML_BACKEND_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
-
-    //
-    // system info
-    //
-
-    // x86
-    GGML_BACKEND_API int ggml_cpu_has_sse3       (void);
-    GGML_BACKEND_API int ggml_cpu_has_ssse3      (void);
-    GGML_BACKEND_API int ggml_cpu_has_avx        (void);
-    GGML_BACKEND_API int ggml_cpu_has_avx_vnni   (void);
-    GGML_BACKEND_API int ggml_cpu_has_avx2       (void);
-    GGML_BACKEND_API int ggml_cpu_has_bmi2       (void);
-    GGML_BACKEND_API int ggml_cpu_has_f16c       (void);
-    GGML_BACKEND_API int ggml_cpu_has_fma        (void);
-    GGML_BACKEND_API int ggml_cpu_has_avx512     (void);
-    GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
-    GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
-    GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void);
-    GGML_BACKEND_API int ggml_cpu_has_amx_int8   (void);
-    // ARM
-    GGML_BACKEND_API int ggml_cpu_has_neon       (void);
-    GGML_BACKEND_API int ggml_cpu_has_arm_fma    (void);
-    GGML_BACKEND_API int ggml_cpu_has_fp16_va    (void);
-    GGML_BACKEND_API int ggml_cpu_has_dotprod    (void);
-    GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
-    GGML_BACKEND_API int ggml_cpu_has_sve        (void);
-    GGML_BACKEND_API int ggml_cpu_get_sve_cnt    (void);  // sve vector length in bytes
-    GGML_BACKEND_API int ggml_cpu_has_sme        (void);
-    // other
-    GGML_BACKEND_API int ggml_cpu_has_riscv_v    (void);
-    GGML_BACKEND_API int ggml_cpu_has_vsx        (void);
-    GGML_BACKEND_API int ggml_cpu_has_vxe        (void);
-    GGML_BACKEND_API int ggml_cpu_has_wasm_simd  (void);
-    GGML_BACKEND_API int ggml_cpu_has_llamafile  (void);
-
-    // Internal types and functions exposed for tests and benchmarks
-
-    typedef void (*ggml_vec_dot_t)  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
-                                       const void * GGML_RESTRICT y, size_t by, int nrc);
-
-    struct ggml_type_traits_cpu {
-        ggml_from_float_t        from_float;
-        ggml_vec_dot_t           vec_dot;
-        enum ggml_type           vec_dot_type;
-        int64_t                  nrows; // number of rows to process simultaneously
-    };
-
-    GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
-
-    GGML_BACKEND_API void ggml_cpu_init(void);
-
-    //
-    // CPU backend
-    //
-
-    GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void);
-
-    GGML_BACKEND_API bool ggml_backend_is_cpu                (ggml_backend_t backend);
-    GGML_BACKEND_API void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
-    GGML_BACKEND_API void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
-    GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
-
-    GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
-
-#ifdef __cplusplus
-}
-#endif
--- a/ggml/include/ggml-cuda.h
+++ b/ggml/include/ggml-cuda.h
@ -1,47 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-#ifdef GGML_USE_HIP
-#define GGML_CUDA_NAME "ROCm"
-#define GGML_CUBLAS_NAME "hipBLAS"
-#elif defined(GGML_USE_MUSA)
-#define GGML_CUDA_NAME "MUSA"
-#define GGML_CUBLAS_NAME "muBLAS"
-#else
-#define GGML_CUDA_NAME "CUDA"
-#define GGML_CUBLAS_NAME "cuBLAS"
-#endif
-#define GGML_CUDA_MAX_DEVICES       16
-
-// backend API
-GGML_BACKEND_API ggml_backend_t ggml_backend_cuda_init(int device);
-
-GGML_BACKEND_API bool ggml_backend_is_cuda(ggml_backend_t backend);
-
-// device buffer
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
-
-// split tensor buffer that splits matrices by rows across multiple devices
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
-
-// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
-
-GGML_BACKEND_API int  ggml_backend_cuda_get_device_count(void);
-GGML_BACKEND_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
-GGML_BACKEND_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
-
-GGML_BACKEND_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
-GGML_BACKEND_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
-
-GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
-
-#ifdef  __cplusplus
-}
-#endif
--- a/ggml/include/ggml-kompute.h
+++ b/ggml/include/ggml-kompute.h
@ -1,50 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#include <stdbool.h>
-#include <stddef.h>
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define GGML_KOMPUTE_MAX_DEVICES 16
-
-struct ggml_vk_device { // device description used by the ggml_vk_* device query functions declared in this header
-    int index;
-    int type; // same as VkPhysicalDeviceType
-    size_t heapSize; // presumably in bytes - TODO confirm
-    const char * name;
-    const char * vendor;
-    int subgroupSize;
-    uint64_t bufferAlignment;
-    uint64_t maxAlloc;
-};
-
-struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
-bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
-bool ggml_vk_has_vulkan(void);
-bool ggml_vk_has_device(void);
-struct ggml_vk_device ggml_vk_current_device(void);
-
-//
-// backend API
-//
-
-// forward declaration
-typedef struct ggml_backend * ggml_backend_t;
-
-GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device);
-
-GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend);
-
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
-
-GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
-
-#ifdef __cplusplus
-}
-#endif
--- a/ggml/include/ggml-metal.h
+++ b/ggml/include/ggml-metal.h
@ -1,66 +0,0 @@
-// Note: this description is outdated
-//
-// An interface allowing to compute ggml_cgraph with Metal
-//
-// This is a fully functional interface that extends ggml with GPU support for Apple devices.
-// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, etc.)
-//
-// How it works?
-//
-// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
-// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
-// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
-//
-// You only need to make sure that all memory buffers that you used during the graph creation
-// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
-// used during the graph evaluation to determine the arguments of the compute kernels.
-//
-// Synchronization between device and host memory (for example for input and output tensors)
-// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
-//
-
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#include <stddef.h>
-#include <stdbool.h>
-
-struct ggml_tensor;
-struct ggml_cgraph;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-//
-// backend API
-// user-code should use only these functions
-//
-
-GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
-
-GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
-
-GGML_DEPRECATED(
-        GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
-        "obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713");
-
-GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
-
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
-
-// helper to check if the device supports a specific family
-// ideally, the user code should be doing these checks
-// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
-GGML_BACKEND_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
-
-// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
-GGML_BACKEND_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
-
-GGML_BACKEND_API ggml_backend_reg_t ggml_backend_metal_reg(void);
-
-#ifdef __cplusplus
-}
-#endif
--- a/ggml/include/ggml-opencl.h
+++ b/ggml/include/ggml-opencl.h
@ -1,26 +0,0 @@
-#ifndef GGML_OPENCL_H
-#define GGML_OPENCL_H
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-//
-// backend API
-//
-GGML_BACKEND_API ggml_backend_t ggml_backend_opencl_init(void);
-GGML_BACKEND_API bool ggml_backend_is_opencl(ggml_backend_t backend);
-
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
-
-GGML_BACKEND_API ggml_backend_reg_t ggml_backend_opencl_reg(void);
-
-#ifdef  __cplusplus
-}
-#endif
-
-#endif // GGML_OPENCL_H
--- a/ggml/include/ggml-opt.h
+++ b/ggml/include/ggml-opt.h
@ -1,216 +0,0 @@
-// This file contains functionality for training models using GGML.
-// It is not strictly needed vs. just vanilla GGML but it provides a more high-level interface for common needs such as datasets.
-// At the bottom of this file especially there are relatively high-level functions that are suitable for use or adaptation in user code.
-//
-// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de)
-
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#include <stdint.h>
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-    struct ggml_opt_dataset;
-    struct ggml_opt_context;
-    struct ggml_opt_result;
-
-    typedef struct ggml_opt_dataset * ggml_opt_dataset_t;
-    typedef struct ggml_opt_context * ggml_opt_context_t;
-    typedef struct ggml_opt_result  * ggml_opt_result_t;
-
-    // ====== Loss ======
-
-    // built-in loss types, i.e. the built-in quantities minimized by the optimizer
-    // custom loss types can be defined via mean or sum which simply reduce the outputs for all datapoints to a single value
-    enum ggml_opt_loss_type {
-        GGML_OPT_LOSS_TYPE_MEAN,
-        GGML_OPT_LOSS_TYPE_SUM,
-        GGML_OPT_LOSS_TYPE_CROSS_ENTROPY,
-        GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR,
-    };
-
-    // ====== Dataset ======
-
-    GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
-            int64_t ne_datapoint, // number of elements per datapoint
-            int64_t ne_label,     // number of elements per label
-            int64_t ndata,        // total number of datapoints/labels
-            int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
-    GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);
-
-    // get underlying tensors that store the data
-    GGML_API struct ggml_tensor * ggml_opt_dataset_data  (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
-    GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [ne_label,     ndata]
-
-    // shuffle idata first datapoints from dataset with RNG from opt_ctx, shuffle all datapoints if idata is negative
-    GGML_API void ggml_opt_dataset_shuffle(ggml_opt_context_t opt_ctx, ggml_opt_dataset_t dataset, int64_t idata);
-
-    // get batch at position ibatch from dataset and copy the data to data_batch and labels_batch
-    GGML_API void ggml_opt_dataset_get_batch(
-            ggml_opt_dataset_t   dataset,
-            struct ggml_tensor * data_batch,   // shape = [ne_datapoint, ndata_batch]
-            struct ggml_tensor * labels_batch, // shape = [ne_label,     ndata_batch]
-            int64_t              ibatch);
-
-    // ====== Model / Context ======
-
-    enum ggml_opt_build_type {
-        GGML_OPT_BUILD_TYPE_FORWARD,
-        GGML_OPT_BUILD_TYPE_GRAD,
-        GGML_OPT_BUILD_TYPE_OPT,
-    };
-
-    // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
-    struct ggml_opt_optimizer_params {
-        // AdamW optimizer parameters
-        struct {
-            float alpha; // learning rate
-            float beta1;
-            float beta2;
-            float eps;   // epsilon for numerical stability
-            float wd;    // weight decay for AdamW, use 0.0f to disable
-        } adamw;
-    };
-
-    // callback to calculate optimizer parameters prior to a backward pass
-    // userdata can be used to pass arbitrary data
-    typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);
-
-    // returns the default optimizer params (constant)
-    // userdata is not used
-    GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);
-
-    // parameters for initializing a new optimization context
-    struct ggml_opt_params {
-        ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs
-
-        struct ggml_context * ctx_compute; // created in user code, holds non-static tensors
-
-        // the forward graph is defined by inputs and outputs
-        // those tensors and all tensors in between are not intended to be reusable between multiple optimization contexts
-        struct ggml_tensor * inputs;
-        struct ggml_tensor * outputs;
-
-        enum ggml_opt_loss_type  loss_type;  // one of the GGML_OPT_LOSS_TYPE_* values
-        enum ggml_opt_build_type build_type; // one of the GGML_OPT_BUILD_TYPE_* values
-
-        int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done
-
-        ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
-        void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
-    };
-
-    // get parameters for an optimization context with defaults set where possible
-    // parameters for which no sensible defaults exist are supplied as arguments to this function
-    // (the return type is spelled with the struct keyword: ggml_opt_params is only a struct tag, so the bare name is not valid C)
-    GGML_API struct ggml_opt_params ggml_opt_default_params(
-            ggml_backend_sched_t      backend_sched,
-            struct ggml_context     * ctx_compute,
-            struct ggml_tensor      * inputs,
-            struct ggml_tensor      * outputs,
-            enum ggml_opt_loss_type   loss_type);
-
-    GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
-    GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
-
-    // set gradients to zero, initialize loss, and optionally reset the optimizer
-    GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
-
-    // get underlying tensors that store data
-    GGML_API struct ggml_tensor * ggml_opt_inputs(  ggml_opt_context_t opt_ctx); // forward graph input tensor
-    GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
-    GGML_API struct ggml_tensor * ggml_opt_labels(  ggml_opt_context_t opt_ctx); // labels to compare outputs against
-    GGML_API struct ggml_tensor * ggml_opt_loss(    ggml_opt_context_t opt_ctx); // scalar tensor that contains the loss
-    GGML_API struct ggml_tensor * ggml_opt_pred(    ggml_opt_context_t opt_ctx); // predictions made by outputs
-    GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels
-
-    GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
-
-    // ====== Optimization Result ======
-
-    GGML_API ggml_opt_result_t ggml_opt_result_init(void); // takes no arguments; (void) makes this a real prototype in C
-    GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
-    GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);
-
-    // get data from result, uncertainties are optional and can be ignored by passing NULL
-    GGML_API void ggml_opt_result_ndata(   ggml_opt_result_t result, int64_t * ndata);                  // writes 1 value, number of datapoints
-    GGML_API void ggml_opt_result_loss(    ggml_opt_result_t result, double  * loss,     double * unc); // writes 1 value
-    GGML_API void ggml_opt_result_pred(    ggml_opt_result_t result, int32_t * pred);                   // writes ndata values
-    GGML_API void ggml_opt_result_accuracy(ggml_opt_result_t result, double  * accuracy, double * unc); // writes 1 value
-
-    // ====== Computation ======
-
-    // do forward pass, increment result if not NULL
-    GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
-
-    // do forward pass, increment result if not NULL, do backward pass
-    GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
-
-    // ############################################################################
-    // ## The high-level functions start here. They do not depend on any private ##
-    // ## functions or structs and can be copied to and adapted for user code.   ##
-    // ############################################################################
-
-    // ====== Intended Usage ======
-    //
-    // 1. Select the appropriate loss for your problem.
-    // 2. Create a dataset and set the data for the "data" tensor. Also set the "labels" tensor if your loss needs them.
-    //    Setting the shard size to 1 will be fine, it's the granularity with which data is shuffled/loaded (bigger values are faster).
-    // 3. Create a GGML graph for your model with no_alloc == true. Use two separate contexts for the tensors.
-    //    The first context should contain the model parameters and inputs and be allocated statically in user code.
-    //    The second context should contain all other tensors and will be (re)allocated automatically.
-    //    Due to this automated allocation the data of the second context is not defined when accessed in user code.
-    //    Note that the second dimension of the inputs/outputs are interpreted as the number of datapoints in those tensors.
-    // 4. Call ggml_opt_fit. If you need more control you can use ggml_opt_epoch instead.
-
-    // signature for a callback while evaluating opt_ctx on dataset, called after an evaluation
-    typedef void (*ggml_opt_epoch_callback)(
-            bool               train,       // true after training evaluation, false after validation evaluation
-            ggml_opt_context_t opt_ctx,
-            ggml_opt_dataset_t dataset,
-            ggml_opt_result_t  result,      // result associated with the dataset subsection
-            int64_t            ibatch,      // number of batches that have been evaluated so far
-            int64_t            ibatch_max,  // total number of batches in this dataset subsection
-            int64_t            t_start_us); // time at which the evaluation on the dataset subsection was started
-
-    // do training on front of dataset, do evaluation only on back of dataset
-    GGML_API void ggml_opt_epoch(
-            ggml_opt_context_t      opt_ctx,
-            ggml_opt_dataset_t      dataset,
-            ggml_opt_result_t       result_train,   // result to increment during training, ignored if NULL
-            ggml_opt_result_t       result_eval,    // result to increment during evaluation, ignored if NULL
-            int64_t                 idata_split,    // data index at which to split training and evaluation
-            ggml_opt_epoch_callback callback_train,
-            ggml_opt_epoch_callback callback_eval);
-
-    // callback that prints a progress bar on stderr
-    GGML_API void ggml_opt_epoch_callback_progress_bar(
-            bool               train,
-            ggml_opt_context_t opt_ctx,
-            ggml_opt_dataset_t dataset,
-            ggml_opt_result_t  result,
-            int64_t            ibatch,
-            int64_t            ibatch_max,
-            int64_t            t_start_us);
-
-    // fit model defined by inputs and outputs to dataset
-    GGML_API void ggml_opt_fit(
-            ggml_backend_sched_t            backend_sched,  // backend scheduler for constructing the compute graphs
-            struct ggml_context           * ctx_compute,    // context with temporarily allocated tensors to calculate the outputs
-            struct ggml_tensor            * inputs,         // input tensor with shape [ne_datapoint, ndata_batch]
-            struct ggml_tensor            * outputs,        // output tensor, must have shape [ne_label, ndata_batch] if labels are used
-            ggml_opt_dataset_t              dataset,        // dataset with data and optionally also labels
-            enum ggml_opt_loss_type         loss_type,      // loss to minimize
-            ggml_opt_get_optimizer_params   get_opt_pars,   // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
-            int64_t                         nepoch,         // how many times the dataset should be iterated over
-            int64_t                         nbatch_logical, // datapoints per optimizer step, must be a multiple of ndata_batch in inputs/outputs
-            float                           val_split,      // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
-            bool                            silent);        // whether or not info prints to stderr should be suppressed
-
-#ifdef  __cplusplus
-}
-#endif
--- a/ggml/include/ggml-rpc.h
+++ b/ggml/include/ggml-rpc.h
@ -1,30 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-#define GGML_RPC_MAX_SERVERS       16
-
-// backend API
-GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
-GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend);
-
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
-
-GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
-
-GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
-                                                    const char * cache_dir,
-                                                    size_t free_mem, size_t total_mem);
-
-GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
-
-GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
-
-#ifdef  __cplusplus
-}
-#endif
--- a/ggml/include/ggml-sycl.h
+++ b/ggml/include/ggml-sycl.h
@ -1,49 +0,0 @@
-//
-//  MIT license
-//  Copyright (C) 2024 Intel Corporation
-//  SPDX-License-Identifier: MIT
-//
-
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#define GGML_SYCL_NAME "SYCL"
-#define GGML_SYCL_MAX_DEVICES 48
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-// backend API
-GGML_BACKEND_API ggml_backend_t ggml_backend_sycl_init(int device);
-
-GGML_BACKEND_API bool ggml_backend_is_sycl(ggml_backend_t backend);
-
-// device buffer
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
-
-// split tensor buffer that splits matrices by rows across multiple devices
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
-
-// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
-
-GGML_BACKEND_API void ggml_backend_sycl_print_sycl_devices(void);
-GGML_BACKEND_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len);
-GGML_BACKEND_API void ggml_backend_sycl_get_device_description(int device,
-                                                       char *description,
-                                                       size_t description_size);
-GGML_BACKEND_API int  ggml_backend_sycl_get_device_count(void); // (void): explicit empty parameter list, like the other no-argument declarations in this header
-GGML_BACKEND_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
-
-// SYCL doesn't support registering host memory, keep here for reference
-// GGML_BACKEND_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
-// GGML_BACKEND_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
-
-GGML_BACKEND_API ggml_backend_reg_t ggml_backend_sycl_reg(void);
-
-#ifdef  __cplusplus
-}
-#endif
--- a/ggml/include/ggml-vulkan.h
+++ b/ggml/include/ggml-vulkan.h
@ -1,29 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-#define GGML_VK_NAME "Vulkan"
-#define GGML_VK_MAX_DEVICES 16
-
-// backend API
-GGML_BACKEND_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);
-
-GGML_BACKEND_API bool ggml_backend_is_vk(ggml_backend_t backend);
-GGML_BACKEND_API int  ggml_backend_vk_get_device_count(void);
-GGML_BACKEND_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
-GGML_BACKEND_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
-
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
-// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
-
-GGML_BACKEND_API ggml_backend_reg_t ggml_backend_vk_reg(void);
-
-#ifdef  __cplusplus
-}
-#endif
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
--- a/ggml/include/gguf.h
+++ b/ggml/include/gguf.h
@ -1,202 +0,0 @@
-// This file contains functionality related to "GGUF" files, the binary file format used by ggml.
-// GGUF files have the following structure:
-//
-// 1. File magic "GGUF" (4 bytes).
-// 2. File version (uint32_t).
-// 3. Number of ggml tensors in file (int64_t).
-// 4. Number of key-value-pairs in file (int64_t).
-// 5. For each KV pair:
-//   1. The key (string).
-//   2. The value type (gguf_type).
-//   3a. If the value type is GGUF_TYPE_ARRAY:
-//     1. The type of the array (gguf_type).
-//     2. The number of elements in the array (uint64_t).
-//     3. The binary representation of each element in the array.
-//   3b. Otherwise:
-//     1. The binary representation of the value.
-// 6. For each ggml tensor:
-//   1. The tensor name (string).
-//   2. The number of dimensions of the tensor (uint32_t).
-//   3. For each dimension:
-//     1. The size of the tensor in the dimension (int64_t).
-//   4. The tensor data type (ggml_type).
-//   5. The tensor data offset in the tensor data binary blob (uint64_t).
-// 7. The tensor data binary blob (optional, aligned).
-//
-// Strings are serialized as the string length (uint64_t) followed by the C string without the null terminator.
-// All enums are stored as int32_t.
-// All bool values are stored as int8_t.
-// If the special key "general.alignment" (uint32_t) is defined it is used for alignment,
-//   otherwise GGUF_DEFAULT_ALIGNMENT is used.
-//
-// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de)
-
-#pragma once
-
-#include "ggml.h"
-
-#include <stdbool.h>
-#include <stdint.h>
-
-#define GGUF_MAGIC   "GGUF"
-#define GGUF_VERSION 3
-
-#define GGUF_KEY_GENERAL_ALIGNMENT "general.alignment"
-
-#define GGUF_DEFAULT_ALIGNMENT 32
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-    // types that can be stored as GGUF KV data
-    enum gguf_type {
-        GGUF_TYPE_UINT8   = 0,
-        GGUF_TYPE_INT8    = 1,
-        GGUF_TYPE_UINT16  = 2,
-        GGUF_TYPE_INT16   = 3,
-        GGUF_TYPE_UINT32  = 4,
-        GGUF_TYPE_INT32   = 5,
-        GGUF_TYPE_FLOAT32 = 6,
-        GGUF_TYPE_BOOL    = 7,
-        GGUF_TYPE_STRING  = 8,
-        GGUF_TYPE_ARRAY   = 9,
-        GGUF_TYPE_UINT64  = 10,
-        GGUF_TYPE_INT64   = 11,
-        GGUF_TYPE_FLOAT64 = 12,
-        GGUF_TYPE_COUNT,       // marks the end of the enum
-    };
-
-    struct gguf_context;
-
-    struct gguf_init_params {
-        bool no_alloc;
-
-        // if not NULL, create a ggml_context and allocate the tensor data in it
-        struct ggml_context ** ctx;
-    };
-
-    GGML_API struct gguf_context * gguf_init_empty(void);
-    GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
-    //GGML_API struct gguf_context * gguf_init_from_buffer(..);
-
-    GGML_API void gguf_free(struct gguf_context * ctx);
-
-    GGML_API const char * gguf_type_name(enum gguf_type type);
-
-    GGML_API uint32_t gguf_get_version    (const struct gguf_context * ctx);
-    GGML_API size_t   gguf_get_alignment  (const struct gguf_context * ctx);
-    GGML_API size_t   gguf_get_data_offset(const struct gguf_context * ctx);
-
-    GGML_API int64_t      gguf_get_n_kv(const struct gguf_context * ctx);
-    GGML_API int64_t      gguf_find_key(const struct gguf_context * ctx, const char * key); // returns -1 if key is not found
-    GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int64_t key_id);
-
-    GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int64_t key_id);
-    GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id);
-
-    // will abort if the wrong type is used for the key
-    GGML_API uint8_t      gguf_get_val_u8  (const struct gguf_context * ctx, int64_t key_id);
-    GGML_API int8_t       gguf_get_val_i8  (const struct gguf_context * ctx, int64_t key_id);
-    GGML_API uint16_t     gguf_get_val_u16 (const struct gguf_context * ctx, int64_t key_id);
-    GGML_API int16_t      gguf_get_val_i16 (const struct gguf_context * ctx, int64_t key_id);
-    GGML_API uint32_t     gguf_get_val_u32 (const struct gguf_context * ctx, int64_t key_id);
-    GGML_API int32_t      gguf_get_val_i32 (const struct gguf_context * ctx, int64_t key_id);
-    GGML_API float        gguf_get_val_f32 (const struct gguf_context * ctx, int64_t key_id);
-    GGML_API uint64_t     gguf_get_val_u64 (const struct gguf_context * ctx, int64_t key_id);
-    GGML_API int64_t      gguf_get_val_i64 (const struct gguf_context * ctx, int64_t key_id);
-    GGML_API double       gguf_get_val_f64 (const struct gguf_context * ctx, int64_t key_id);
-    GGML_API bool         gguf_get_val_bool(const struct gguf_context * ctx, int64_t key_id);
-    GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int64_t key_id);
-    GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int64_t key_id);
-    GGML_API size_t       gguf_get_arr_n   (const struct gguf_context * ctx, int64_t key_id);
-
-    // get raw pointer to the first element of the array with the given key_id
-    // for bool arrays, note that they are always stored as int8 on all platforms (usually this makes no difference)
-    GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int64_t key_id);
-
-    // get ith C string from array with given key_id
-    GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
-
-    GGML_API int64_t        gguf_get_n_tensors    (const struct gguf_context * ctx);
-    GGML_API int64_t        gguf_find_tensor      (const struct gguf_context * ctx, const char * name); // returns -1 if the tensor is not found
-    GGML_API size_t         gguf_get_tensor_offset(const struct gguf_context * ctx, int64_t tensor_id);
-    GGML_API const char *   gguf_get_tensor_name  (const struct gguf_context * ctx, int64_t tensor_id);
-    GGML_API enum ggml_type gguf_get_tensor_type  (const struct gguf_context * ctx, int64_t tensor_id);
-    GGML_API size_t         gguf_get_tensor_size  (const struct gguf_context * ctx, int64_t tensor_id);
-
-    // removes key if it exists, returns id that the key had prior to removal (-1 if it didn't exist)
-    GGML_API int64_t gguf_remove_key(struct gguf_context * ctx, const char * key);
-
-    // overrides an existing KV pair or adds a new one, the new KV pair is always at the back
-    GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t      val);
-    GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t       val);
-    GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t     val);
-    GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t      val);
-    GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t     val);
-    GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t      val);
-    GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float        val);
-    GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t     val);
-    GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t      val);
-    GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double       val);
-    GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool         val);
-    GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
-
-    // creates a new array with n elements of the given type and copies the corresponding number of bytes from data
-    GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, size_t n);
-
-    // creates a new array with n strings and copies the corresponding strings from data
-    GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, size_t n);
-
-    // set or add KV pairs from another context
-    GGML_API void gguf_set_kv(struct gguf_context * ctx, const struct gguf_context * src);
-
-    // add tensor to GGUF context, tensor name must be unique
-    GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
-
-    // after changing a tensor's type, the offsets of all tensors with higher indices are immediately recalculated
-    //   in such a way that the tensor data remains as one contiguous block (except for padding)
-    GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
-
-    // assumes that at least gguf_get_tensor_size bytes can be read from data
-    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data);
-
-    // writing gguf files can be done in 3 ways:
-    //
-    // - write the entire gguf_context to a binary file in a single pass:
-    //
-    //   gguf_write_to_file(ctx, fname, /*only_meta =*/ false);
-    //
-    // - write only the meta data to a file, then re-open the file and append the tensor data:
-    //
-    //   gguf_write_to_file(ctx, fname, /*only_meta =*/ true);
-    //   FILE * f = fopen(fname, "ab");
-    //   fwrite(..., f); // write tensor data
-    //   fclose(f);
-    //
-    // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
-    //
-    //   FILE * f = fopen(fname, "wb");
-    //   const size_t size_meta = gguf_get_meta_size(ctx);
-    //   fseek(f, size_meta, SEEK_SET);
-    //   fwrite(..., f); // write tensor data
-    //   void * data = malloc(size_meta);
-    //   gguf_get_meta_data(ctx, data);
-    //   rewind(f);
-    //   fwrite(data, 1, size_meta, f);
-    //   free(data);
-    //   fclose(f);
-    //
-
-    // write the entire context to a binary file
-    GGML_API bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
-
-    // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
-    GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
-
-    // writes the meta data to pointer "data"
-    GGML_API void   gguf_get_meta_data(const struct gguf_context * ctx, void * data);
-
-#ifdef  __cplusplus
-}
-#endif
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@ -1,342 +0,0 @@
-include(CheckCXXCompilerFlag)
-include("../cmake/common.cmake")
-
-add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES})
-
-# enable libstdc++ assertions for debug builds
-if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-    add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
-endif()
-
-if (NOT MSVC)
-    if (GGML_SANITIZE_THREAD)
-        add_compile_options(-fsanitize=thread)
-        link_libraries     (-fsanitize=thread)
-    endif()
-
-    if (GGML_SANITIZE_ADDRESS)
-        add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
-        link_libraries     (-fsanitize=address)
-    endif()
-
-    if (GGML_SANITIZE_UNDEFINED)
-        add_compile_options(-fsanitize=undefined)
-        link_libraries     (-fsanitize=undefined)
-    endif()
-endif()
-
-# GGML_FATAL_WARNINGS: treat compiler warnings as errors.
-# For GNU/Clang the flag is only appended to the C_FLAGS/CXX_FLAGS lists here; MSVC gets /WX directly.
-# NOTE(review): in this file the two lists are only passed to add_compile_options() inside the
-#               GGML_ALL_WARNINGS block below, so -Werror looks ineffective when that option is OFF - confirm
-if (GGML_FATAL_WARNINGS)
-    if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-        list(APPEND C_FLAGS   -Werror)
-        list(APPEND CXX_FLAGS -Werror)
-    elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-        add_compile_options(/WX)
-    endif()
-endif()
-
-if (GGML_ALL_WARNINGS)
-    if (NOT MSVC)
-        list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
-        list(APPEND C_FLAGS       -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
-                                  -Werror=implicit-int -Werror=implicit-function-declaration)
-        list(APPEND CXX_FLAGS     -Wmissing-declarations -Wmissing-noreturn)
-
-        list(APPEND C_FLAGS   ${WARNING_FLAGS})
-        list(APPEND CXX_FLAGS ${WARNING_FLAGS})
-
-        ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
-
-        add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
-                            "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
-    else()
-        # todo : msvc
-        set(C_FLAGS   "")
-        set(CXX_FLAGS "")
-    endif()
-endif()
-
-if (GGML_LTO)
-    include(CheckIPOSupported)
-    check_ipo_supported(RESULT result OUTPUT output)
-    if (result)
-        set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
-    else()
-        message(WARNING "IPO is not supported: ${output}")
-    endif()
-endif()
-
-# Use ccache (preferred) or sccache as compile launcher, unless the user already configured
-# a compiler launcher for C or C++.
-if (GGML_CCACHE AND NOT CMAKE_C_COMPILER_LAUNCHER AND NOT CMAKE_CXX_COMPILER_LAUNCHER)
-    find_program(GGML_CCACHE_FOUND ccache)
-    find_program(GGML_SCCACHE_FOUND sccache)
-
-    if (GGML_CCACHE_FOUND OR GGML_SCCACHE_FOUND)
-        # ccache wins when both tools are installed
-        if(GGML_CCACHE_FOUND)
-            set(GGML_CCACHE_VARIANT ccache)
-        else()
-            set(GGML_CCACHE_VARIANT sccache)
-        endif()
-        # TODO: should not be set globally
-        if (GGML_SYCL AND GGML_CCACHE_FOUND AND WIN32)
-            # SYCL on Windows: pass the compiler type to ccache explicitly
-            set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "ccache compiler_type=icl")
-        else ()
-            set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${GGML_CCACHE_VARIANT}")
-        endif ()
-        # NOTE(review): set(ENV{...}) only changes the environment of the CMake configure process;
-        #               it is not inherited by the compiler launcher at build time - confirm this has the intended effect
-        set(ENV{CCACHE_SLOPPINESS} time_macros)
-        message(STATUS "${GGML_CCACHE_VARIANT} found, compilation results will be cached. Disable with GGML_CCACHE=OFF.")
-    else()
-        message(STATUS "Warning: ccache not found - consider installing it for faster compilation or disable this warning with GGML_CCACHE=OFF")
-    endif ()
-endif()
-
-# this version of Apple ld64 is buggy
-# Ask the compiler driver to print the linker version (it is written to stderr) and define
-# HAVE_BUGGY_APPLE_LINKER when the affected release (dyld-1015.7) is detected.
-execute_process(
-    COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
-    ERROR_VARIABLE output
-    OUTPUT_QUIET
-)
-
-# "\\." reaches the regex engine as "\." (a literal dot); a single backslash is consumed by the
-# CMake string parser, which would leave a bare "." that matches any character
-if (output MATCHES "dyld-1015\\.7")
-    add_compile_definitions(HAVE_BUGGY_APPLE_LINKER)
-endif()
-
-# architecture specific
-# TODO: probably these flags need to be tweaked on some architectures
-#       feel free to update the Makefile for your architecture and send a pull request or issue
-message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
-if (MSVC)
-    string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
-    message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
-else ()
-    set(CMAKE_GENERATOR_PLATFORM_LWR "")
-endif ()
-
-if (NOT MSVC)
-    if (GGML_STATIC)
-        add_link_options(-static)
-        if (MINGW)
-            add_link_options(-static-libgcc -static-libstdc++)
-        endif()
-    endif()
-    if (GGML_GPROF)
-        add_compile_options(-pg)
-    endif()
-endif()
-
-if (MINGW)
-    # Target Windows 8 for PrefetchVirtualMemory
-    add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
-endif()
-
-#
-# POSIX conformance
-#
-
-# clock_gettime came in POSIX.1b (1993)
-# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
-# posix_memalign came in POSIX.1-2001 / SUSv3
-# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
-
-# Somehow in OpenBSD whenever POSIX conformance is specified
-# some string functions rely on locale_t availability,
-# which was introduced in POSIX.1-2008, forcing us to go higher
-if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
-    add_compile_definitions(_XOPEN_SOURCE=700)
-else()
-    add_compile_definitions(_XOPEN_SOURCE=600)
-endif()
-
-# Data types, macros and functions related to controlling CPU affinity and
-# some memory allocation are available on Linux through GNU extensions in libc
-if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android")
-    add_compile_definitions(_GNU_SOURCE)
-endif()
-
-# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
-# and on macOS its availability depends on enabling Darwin extensions
-# similarly on DragonFly, enabling BSD extensions is necessary
-if (
-    CMAKE_SYSTEM_NAME MATCHES "Darwin" OR
-    CMAKE_SYSTEM_NAME MATCHES "iOS"    OR
-    CMAKE_SYSTEM_NAME MATCHES "tvOS"   OR
-    CMAKE_SYSTEM_NAME MATCHES "DragonFly"
-)
-    add_compile_definitions(_DARWIN_C_SOURCE)
-endif()
-
-# alloca is a non-standard interface that is not visible on BSDs when
-# POSIX conformance is specified, but not all of them provide a clean way
-# to enable it in such cases
-if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
-    add_compile_definitions(__BSD_VISIBLE)
-endif()
-if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
-    add_compile_definitions(_NETBSD_SOURCE)
-endif()
-if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
-    add_compile_definitions(_BSD_SOURCE)
-endif()
-
-if (WIN32)
-    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
-endif()
-
-# ggml
-
-if (GGML_BACKEND_DL AND NOT BUILD_SHARED_LIBS)
-    message(FATAL_ERROR "GGML_BACKEND_DL requires BUILD_SHARED_LIBS")
-endif()
-
-add_library(ggml-base
-            ../include/ggml.h
-            ../include/ggml-alloc.h
-            ../include/ggml-backend.h
-            ../include/ggml-cpp.h
-            ../include/ggml-opt.h
-            ../include/gguf.h
-            ggml.c
-            ggml-alloc.c
-            ggml-backend.cpp
-            ggml-opt.cpp
-            ggml-threading.cpp
-            ggml-threading.h
-            ggml-quants.c
-            ggml-quants.h
-            gguf.cpp)
-
-target_include_directories(ggml-base PRIVATE .)
-if (GGML_BACKEND_DL)
-    target_compile_definitions(ggml-base PUBLIC GGML_BACKEND_DL)
-endif()
-
-add_library(ggml
-            ggml-backend-reg.cpp)
-
-target_link_libraries(ggml PUBLIC ggml-base)
-
-if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-    target_link_libraries(ggml PRIVATE dl stdc++fs)
-endif()
-
-# ggml_add_backend_library(<backend> <sources>...)
-#
-# Creates the library target <backend> from the given sources and connects it to ggml / ggml-base:
-#   - GGML_BACKEND_DL=ON : built as a MODULE written to CMAKE_RUNTIME_OUTPUT_DIRECTORY; ggml only
-#                          gets a build-order dependency on it (it is not linked)
-#   - otherwise          : regular library that ggml links PUBLICly, plus an install rule
-# The target name is also recorded (once) in the GGML_AVAILABLE_BACKENDS cache list.
-function(ggml_add_backend_library backend)
-    if (GGML_BACKEND_DL)
-        add_library(${backend} MODULE ${ARGN})
-        # write the shared library to the output directory
-        set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
-        target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
-        add_dependencies(ggml ${backend})
-    else()
-        add_library(${backend} ${ARGN})
-        target_link_libraries(ggml PUBLIC ${backend})
-        install(TARGETS ${backend} LIBRARY)
-    endif()
-
-    target_link_libraries(${backend} PRIVATE ggml-base)
-    target_include_directories(${backend} PRIVATE ..)
-
-    # test the variable by name (as the BUILD_SHARED_LIBS check at the end of this file does)
-    # instead of expanding its value into the condition first
-    if (BUILD_SHARED_LIBS)
-        target_compile_definitions(${backend} PRIVATE GGML_BACKEND_BUILD)
-        target_compile_definitions(${backend} PUBLIC  GGML_BACKEND_SHARED)
-    endif()
-
-    # remember the backend for the cmake package, without adding duplicates on re-configure
-    if(NOT GGML_AVAILABLE_BACKENDS)
-        set(GGML_AVAILABLE_BACKENDS "${backend}"
-            CACHE INTERNAL "List of backends for cmake package")
-    else()
-        list(FIND GGML_AVAILABLE_BACKENDS "${backend}" has_backend)
-        if(has_backend EQUAL -1)
-            set(GGML_AVAILABLE_BACKENDS "${GGML_AVAILABLE_BACKENDS};${backend}"
-                CACHE INTERNAL "List of backends for cmake package")
-        endif()
-    endif()
-endfunction()
-
-# ggml_add_backend(<backend>)
-#
-# When the option GGML_<BACKEND> (upper-cased) is enabled, adds the subdirectory ggml-<backend>
-# (lower-cased). Unless backends are built as dynamic modules (GGML_BACKEND_DL), ggml also
-# receives the PUBLIC compile definition GGML_USE_<BACKEND>.
-function(ggml_add_backend backend)
-    string(TOUPPER "GGML_${backend}" backend_id)
-    if (NOT ${backend_id})
-        # backend not requested
-        return()
-    endif()
-
-    string(TOLOWER "ggml-${backend}" backend_target)
-    add_subdirectory(${backend_target})
-    message(STATUS "Including ${backend} backend")
-
-    if (GGML_BACKEND_DL)
-        # dynamically loaded backends are not announced through a compile definition
-        return()
-    endif()
-
-    string(TOUPPER "GGML_USE_${backend}" use_definition)
-    target_compile_definitions(ggml PUBLIC ${use_definition})
-endfunction()
-
-# ggml_add_cpu_backend_variant(<tag_name> <feature>...)
-#
-# Sets GGML_CPU_TAG_NAME, switches every per-variant GGML_<feature> flag OFF, re-enables only the
-# features listed by the caller and then calls ggml_add_cpu_backend_variant_impl(<tag_name>).
-function(ggml_add_cpu_backend_variant tag_name)
-    set(GGML_CPU_TAG_NAME ${tag_name})
-
-    # features that are reset for each variant
-    # other: OPENMP LLAMAFILE CPU_HBM
-    set(variant_features
-        NATIVE
-        AVX AVX2 BMI2 AVX_VNNI FMA F16C
-        AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
-        AMX_TILE AMX_INT8 AMX_BF16)
-
-    foreach (feature ${variant_features})
-        set(GGML_${feature} OFF)
-    endforeach()
-
-    # enable exactly what this variant asked for
-    foreach (feature ${ARGN})
-        set(GGML_${feature} ON)
-    endforeach()
-
-    ggml_add_cpu_backend_variant_impl(${tag_name})
-endfunction()
-
-ggml_add_backend(CPU)
-
-if (GGML_CPU_ALL_VARIANTS)
-    if (NOT GGML_BACKEND_DL)
-        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
-    endif()
-    ggml_add_cpu_backend_variant(sandybridge    AVX)
-    ggml_add_cpu_backend_variant(haswell        AVX F16C AVX2 BMI2 FMA)
-    ggml_add_cpu_backend_variant(skylakex       AVX F16C AVX2 BMI2 FMA AVX512)
-    ggml_add_cpu_backend_variant(icelake        AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
-    ggml_add_cpu_backend_variant(alderlake      AVX F16C AVX2 BMI2 FMA AVX_VNNI)
-    if (NOT MSVC)
-        # MSVC doesn't support AMX
-        ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
-    endif()
-elseif (GGML_CPU)
-    ggml_add_cpu_backend_variant_impl("")
-endif()
-
-ggml_add_backend(BLAS)
-ggml_add_backend(CANN)
-ggml_add_backend(CUDA)
-ggml_add_backend(HIP)
-ggml_add_backend(Kompute)
-ggml_add_backend(METAL)
-ggml_add_backend(MUSA)
-ggml_add_backend(RPC)
-ggml_add_backend(SYCL)
-ggml_add_backend(Vulkan)
-ggml_add_backend(OpenCL)
-
-foreach (target ggml-base ggml)
-    target_include_directories(${target} PUBLIC    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
-    target_compile_features   (${target} PRIVATE c_std_11 cxx_std_17) # don't bump
-endforeach()
-
-target_link_libraries(ggml-base PRIVATE Threads::Threads)
-
-find_library(MATH_LIBRARY m)
-if (MATH_LIBRARY)
-    if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT})
-        target_link_libraries(ggml-base PRIVATE m)
-    endif()
-endif()
-
-if (CMAKE_SYSTEM_NAME MATCHES "Android")
-    target_link_libraries(ggml-base PRIVATE dl)
-endif()
-
-if(CMAKE_SYSTEM_NAME MATCHES "visionOS")
-    target_compile_definitions(ggml-base PUBLIC _DARWIN_C_SOURCE)
-endif()
-
-if (BUILD_SHARED_LIBS)
-    foreach (target ggml-base ggml)
-        set_target_properties(${target} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-        target_compile_definitions(${target} PRIVATE GGML_BUILD)
-        target_compile_definitions(${target} PUBLIC  GGML_SHARED)
-    endforeach()
-endif()
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
--- a/ggml/src/ggml-amx/CMakeLists.txt
+++ b/ggml/src/ggml-amx/CMakeLists.txt
@ -1,107 +0,0 @@
-# Build the AMX backend only for x86 targets compiled with GCC newer than 11.0 (see the warning
-# in the else() branch). The x86 alternatives are grouped explicitly so that the compiler
-# requirement applies to all of them and the result does not depend on AND/OR evaluation order.
-# NOTE(review): with the GCC requirement the MSVC branch below looks unreachable - confirm
-if ((CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR
-     CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
-     (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
-      CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$")) AND
-    CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0)
-    message(STATUS "Using AMX")
-
-    file(GLOB   GGML_HEADERS_AMX "*.h")
-    list(APPEND GGML_HEADERS_AMX "../../include/ggml-amx.h")
-
-    file(GLOB   GGML_SOURCES_AMX "*.cpp")
-
-    add_library(ggml-amx
-                ${GGML_HEADERS_AMX}
-                ${GGML_SOURCES_AMX})
-
-    target_link_libraries(ggml-amx PRIVATE ggml-base)
-    target_include_directories(ggml-amx PRIVATE . ..)
-
-    # this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags
-    # TODO: integrate AMX backend into the CPU backend
-    if (MSVC)
-        # instruction set detection for MSVC only
-        if (GGML_NATIVE)
-            # TODO: improve, should not reference files from the parent folder
-            include(../ggml-cpu/cmake/FindSIMD.cmake)
-        endif ()
-        if (GGML_AVX512)
-            list(APPEND ARCH_FLAGS /arch:AVX512)
-            # MSVC has no compile-time flags enabling specific
-            # AVX512 extensions, neither it defines the
-            # macros corresponding to the extensions.
-            # Do it manually.
-            if (GGML_AVX512_VBMI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
-            endif()
-            if (GGML_AVX512_VNNI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
-            endif()
-            if (GGML_AVX512_BF16)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
-            endif()
-            if (GGML_AMX_TILE)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
-            endif()
-            if (GGML_AMX_INT8)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
-            endif()
-            if (GGML_AMX_BF16)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
-            endif()
-        elseif (GGML_AVX2)
-            list(APPEND ARCH_FLAGS /arch:AVX2)
-        elseif (GGML_AVX)
-            list(APPEND ARCH_FLAGS /arch:AVX)
-        endif()
-    else()
-        # GCC/Clang style: one -m<feature> flag per enabled GGML_<feature> option
-        if (GGML_NATIVE)
-            list(APPEND ARCH_FLAGS -march=native)
-        endif()
-        if (GGML_F16C)
-            list(APPEND ARCH_FLAGS -mf16c)
-        endif()
-        if (GGML_FMA)
-            list(APPEND ARCH_FLAGS -mfma)
-        endif()
-        if (GGML_AVX)
-            list(APPEND ARCH_FLAGS -mavx)
-        endif()
-        if (GGML_AVX2)
-            list(APPEND ARCH_FLAGS -mavx2)
-        endif()
-        if (GGML_AVX512)
-            list(APPEND ARCH_FLAGS -mavx512f)
-            list(APPEND ARCH_FLAGS -mavx512dq)
-            list(APPEND ARCH_FLAGS -mavx512bw)
-        endif()
-        if (GGML_AVX512_VBMI)
-            list(APPEND ARCH_FLAGS -mavx512vbmi)
-        endif()
-        if (GGML_AVX512_VNNI)
-            list(APPEND ARCH_FLAGS -mavx512vnni)
-        endif()
-        if (GGML_AVX512_BF16)
-            list(APPEND ARCH_FLAGS -mavx512bf16)
-        endif()
-        if (GGML_AMX_TILE)
-            list(APPEND ARCH_FLAGS -mamx-tile)
-        endif()
-        if (GGML_AMX_INT8)
-            list(APPEND ARCH_FLAGS -mamx-int8)
-        endif()
-        if (GGML_AMX_BF16)
-            list(APPEND ARCH_FLAGS -mamx-bf16)
-        endif()
-    endif()
-
-    target_compile_options(ggml-amx PRIVATE ${ARCH_FLAGS})
-else()
-    # propagate the decision to the including directory so AMX is treated as disabled there
-    set(GGML_AMX OFF PARENT_SCOPE)
-    message(WARNING "AMX requires x86 and gcc version > 11.0. Turning off GGML_AMX.")
-endif()
--- a/ggml/src/ggml-amx/common.h
+++ b/ggml/src/ggml-amx/common.h
@ -1,94 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-// hack until AMX is moved into the CPU backend
-#include "../ggml-cpu/ggml-cpu-impl.h" // <immintrin.h>
-
-#include <algorithm>
-#include <memory>
-#include <type_traits>
-
-#if defined(_OPENMP)
-#include <omp.h>
-#endif
-
-#define TILE_M 16
-#define TILE_N 16
-#define TILE_K 32
-#define VNNI_BLK 4
-
-#define AMX_BLK_SIZE 32
-
-#define TMM0 0
-#define TMM1 1
-#define TMM2 2
-#define TMM3 3
-#define TMM4 4
-#define TMM5 5
-#define TMM6 6
-#define TMM7 7
-
-// parallel routines
-// integer division rounding up: for x >= 0 and y > 0 returns ceil(x / y)
-// only participates in overload resolution for integral T; x + y - 1 must not overflow T
-template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
-inline T div_up(T x, T y) { return (x + y - 1) / y; }
-
-// Computes the half-open range [n_start, n_end) of the n work items assigned to worker `ith`
-// out of `nth` workers. Every worker gets a chunk of ceil(n / nth) items; the end is clamped
-// to n, so trailing workers may receive a short or empty range (n_end <= n_start).
-template <typename T>
-inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
-#if 0
-    // onednn partition pattern
-    T& n_my = n_end;
-    if (nth <= 1 || n == 0) {
-        n_start = 0;
-        n_my = n;
-    } else {
-        T n1 = div_up(n, nth);
-        T n2 = n1 - 1;
-        T T1 = n - n2 * nth;
-        n_my = ith < T1 ? n1 : n2;
-        n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2;
-    }
-    n_end += n_start;
-#else
-    // pytorch aten partition pattern
-    const T chunk = div_up(n, nth);
-    const T first = ith * chunk;
-    n_start = first;
-    n_end   = std::min(first + chunk, n);
-#endif
-}
-
-// Calls f(begin, end) so that the calls together cover the index range [0, n).
-//   nth : requested number of threads (ignored when built without OpenMP)
-//   n   : number of work items
-//   f   : callable taking (int begin, int end); invoked once per thread with that thread's range
-// Without OpenMP, f(0, n) is invoked once on the calling thread.
-template <typename func_t>
-inline void parallel_for(int nth, int n, const func_t& f) {
-#if defined(_OPENMP)
-#pragma omp parallel num_threads(nth)
-{
-    // num_threads() is only a request: partition by the size of the team that was actually
-    // created, otherwise the ranges of the threads that were not granted would never run
-    const int team_size = omp_get_num_threads();
-    const int ith       = omp_get_thread_num();
-    int tbegin, tend;
-    balance211(n, team_size, ith, tbegin, tend);
-    f(tbegin, tend);
-}
-#else
-    f(0, n);
-
-    GGML_UNUSED(nth);
-#endif
-}
-
-// quantized types that have AMX support
-// true for the quantized tensor types that currently have AMX kernels enabled
-inline bool qtype_has_amx_kernels(const enum ggml_type type) {
-    // TODO: fix padding for vnni format
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-            return true;
-        // currently disabled:
-        //   GGML_TYPE_Q8_0, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_IQ4_XS
-        default:
-            return false;
-    }
-}
-
-// ggml backend context
-// per-backend state of the AMX backend; created in ggml_backend_amx_init and
-// released in ggml_backend_amx_free
-struct ggml_backend_amx_context {
-    // number of threads to use; changed through ggml_backend_amx_set_n_threads
-    int n_threads = GGML_DEFAULT_N_THREADS;
-    // NOTE(review): presumably a scratch buffer for the matmul kernels and its size in bytes - confirm in mmq.cpp
-    std::unique_ptr<char[]> work_data;
-    size_t work_size = 0;
-};
--- a/ggml/src/ggml-amx/ggml-amx.cpp
+++ b/ggml/src/ggml-amx/ggml-amx.cpp
@ -1,446 +0,0 @@
-#include "ggml-amx.h"
-#include "ggml-amx/common.h"
-#include "ggml-amx/mmq.h"
-#include "ggml-backend-impl.h"
-#include "ggml-impl.h"
-
-#if defined(__gnu_linux__)
-#include <sys/syscall.h>
-#include <unistd.h>
-#endif
-
-#include <cstdlib>
-#include <cstring>
-#include <memory>
-
-#if defined(__AMX_INT8__)
-
-// AMX buffer interface
-static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    free(buffer->context);
-}
-
-static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
-    return (void *)(buffer->context);
-}
-
-static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
-    memset((char *)tensor->data + offset, value, size);
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    if (qtype_has_amx_kernels(tensor->type)) {
-        ggml_backend_amx_convert_weight(tensor, data, offset, size);
-    } else {
-        memcpy((char *)tensor->data + offset, data, size);
-    }
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
-    memcpy(data, (const char *)tensor->data + offset, size);
-
-    GGML_UNUSED(buffer);
-}
-
-// Copies `src` into `dst` when `src` is stored in a host buffer and returns true;
-// returns false (nothing copied) for any other source buffer.
-// Types with AMX kernels go through the weight conversion, everything else is a plain memcpy.
-static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
-    GGML_UNUSED(buffer);
-
-    if (!ggml_backend_buffer_is_host(src->buffer)) {
-        return false;
-    }
-
-    if (qtype_has_amx_kernels(src->type)) {
-        ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_backend_amx_get_alloc_size(dst));
-    } else {
-        memcpy(dst->data, src->data, ggml_nbytes(src));
-    }
-
-    return true;
-}
-
-static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    memset(buffer->context, value, buffer->size);
-}
-
-static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
-    /* .free_buffer     = */ ggml_backend_amx_buffer_free_buffer,
-    /* .get_base        = */ ggml_backend_amx_buffer_get_base,
-    /* .init_tensor     = */ NULL, // no initialization required
-    /* .memset_tensor   = */ ggml_backend_amx_buffer_memset_tensor,
-    /* .set_tensor      = */ ggml_backend_amx_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_amx_buffer_get_tensor,
-    /* .cpy_tensor      = */ ggml_backend_amx_buffer_cpy_tensor,
-    /* .clear           = */ ggml_backend_amx_buffer_clear,
-    /* .reset           = */ NULL,
-};
-
-static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    return "AMX";
-
-    GGML_UNUSED(buft);
-}
-
-// Allocates a TENSOR_ALIGNMENT-aligned host block and wraps it in a backend buffer of `size` bytes.
-// Returns NULL (after printing a message to stderr) when the allocation fails.
-static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    // aligned_alloc expects the size to be a multiple of the alignment: round the request up,
-    // the buffer itself still reports the size that was asked for
-    const size_t alloc_size = (size + TENSOR_ALIGNMENT - 1) / TENSOR_ALIGNMENT * TENSOR_ALIGNMENT;
-
-    void * data = aligned_alloc(TENSOR_ALIGNMENT, alloc_size);
-    if (data == NULL) {
-        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
-        return NULL;
-    }
-
-    return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
-}
-
-static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return TENSOR_ALIGNMENT;
-
-    GGML_UNUSED(buft);
-}
-
-static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
-    return ggml_backend_amx_get_alloc_size(tensor);
-
-    GGML_UNUSED(buft);
-}
-
-static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
-    return false;
-
-    GGML_UNUSED(buft);
-}
-
-ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
-    static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
-        /* .iface = */ {
-            /* .get_name         = */ ggml_backend_amx_buffer_type_get_name,
-            /* .alloc_buffer     = */ ggml_backend_amx_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_amx_buffer_type_get_alignment,
-            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
-            /* .get_alloc_size   = */ ggml_backend_amx_buffer_type_get_alloc_size,
-            /* .is_host          = */ ggml_backend_amx_buffer_type_is_host,
-        },
-        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
-        /* .context = */ NULL,
-    };
-
-    return &ggml_backend_buffer_type_amx;
-}
-
-// backend interface
-
-static const char * ggml_backend_amx_name(ggml_backend_t backend) {
-    return "AMX";
-
-    GGML_UNUSED(backend);
-}
-
-static void ggml_backend_amx_free(ggml_backend_t backend) {
-    ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
-    delete ctx;
-    delete backend;
-}
-
-// Executes the graph node by node: MUL_MAT is computed with the AMX kernels, the listed
-// no-op/view operations are skipped, and any other operation aborts via GGML_ASSERT.
-static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    ggml_backend_amx_context * amx_ctx = (ggml_backend_amx_context *)backend->context;
-
-    const int n_nodes = cgraph->n_nodes;
-    for (int idx = 0; idx < n_nodes; ++idx) {
-        struct ggml_tensor * cur = cgraph->nodes[idx];
-
-        if (cur->op == GGML_OP_MUL_MAT) {
-            ggml_backend_amx_mul_mat(amx_ctx, cur);
-            continue;
-        }
-
-        // operations that need no computation
-        const bool nothing_to_do =
-            cur->op == GGML_OP_NONE    ||
-            cur->op == GGML_OP_RESHAPE ||
-            cur->op == GGML_OP_VIEW    ||
-            cur->op == GGML_OP_PERMUTE ||
-            cur->op == GGML_OP_TRANSPOSE;
-
-        if (!nothing_to_do) {
-            fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(cur));
-            GGML_ASSERT(false);
-        }
-    }
-
-    return GGML_STATUS_SUCCESS;
-}
-
-static struct ggml_backend_i ggml_backend_amx_i = {
-    /* .get_name                = */ ggml_backend_amx_name,
-    /* .free                    = */ ggml_backend_amx_free,
-    /* .set_tensor_async        = */ NULL,
-    /* .get_tensor_async        = */ NULL,
-    /* .cpy_tensor_async        = */ NULL,
-    /* .synchronize             = */ NULL,
-    /* .graph_plan_create       = */ NULL,
-    /* .graph_plan_free         = */ NULL,
-    /* .graph_plan_update       = */ NULL,
-    /* .graph_plan_compute      = */ NULL,
-    /* .graph_compute           = */ ggml_backend_amx_graph_compute,
-    /* .event_record            = */ NULL,
-    /* .event_wait              = */ NULL,
-};
-
-static ggml_guid_t ggml_backend_amx_guid() {
-    static ggml_guid guid = { 0x13, 0xb8, 0xa4, 0xc4, 0xba, 0xfe, 0x51, 0x67, 0x87, 0x44, 0x55, 0x15, 0xb2, 0x35, 0x62, 0x3e };
-    return &guid;
-}
-
-#define ARCH_GET_XCOMP_PERM     0x1022
-#define ARCH_REQ_XCOMP_PERM     0x1023
-#define XFEATURE_XTILECFG       17
-#define XFEATURE_XTILEDATA      18
-
-// Requests permission to use the AMX tile data state.
-// Linux  : asks the kernel via arch_prctl(ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA); returns false
-//          (after printing a message to stderr) when the request is rejected
-// Windows: no request is made, returns true
-// other  : returns false
-static bool ggml_amx_init() {
-#if defined(__gnu_linux__)
-    if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
-        fprintf(stderr, "AMX is not ready to be used!\n");
-        return false;
-    }
-    return true;
-#elif defined(_WIN32)
-    return true;
-#else
-    // every path of a non-void function must return a value; no request mechanism is known here
-    return false;
-#endif
-}
-
-// Creates an AMX backend instance.
-// Returns nullptr when access to the AMX features could not be obtained, mirroring the stub
-// used when the library is built without AMX support; otherwise the caller owns the returned
-// backend, which is released through ggml_backend_amx_free.
-ggml_backend_t ggml_backend_amx_init() {
-
-    // invoke a Linux system call to request access to AMX features;
-    // do not hand out a backend whose kernels are not allowed to run
-    if (!ggml_amx_init()) {
-        return nullptr;
-    }
-
-    // backend context
-    ggml_backend_amx_context * ctx = new ggml_backend_amx_context;
-
-    // ggml amx backend
-    ggml_backend_t backend = new ggml_backend {
-        /* .guid      = */ ggml_backend_amx_guid(),
-        /* .interface = */ ggml_backend_amx_i,
-        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
-        /* .context   = */ ctx,
-    };
-
-    return backend;
-}
-
-bool ggml_backend_is_amx(ggml_backend_t backend) {
-    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_amx_guid());
-}
-
-void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
-    GGML_ASSERT(ggml_backend_is_amx(backend_amx));
-
-    ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend_amx->context;
-    ctx->n_threads = n_threads;
-}
-
-// device interface
-
-static const char * ggml_backend_amx_device_get_name(ggml_backend_dev_t dev) {
-    return "AMX";
-
-    GGML_UNUSED(dev);
-}
-
-static const char * ggml_backend_amx_device_get_description(ggml_backend_dev_t dev) {
-    return "Intel Advanced Matrix Extensions";
-
-    GGML_UNUSED(dev);
-}
-
-static void ggml_backend_amx_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    // TODO
-    *free = 0;
-    *total = 0;
-
-    GGML_UNUSED(dev);
-}
-
-static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) {
-    return GGML_BACKEND_DEVICE_TYPE_ACCEL;
-
-    GGML_UNUSED(dev);
-}
-
-static void ggml_backend_amx_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
-    props->name        = ggml_backend_amx_device_get_name(dev);
-    props->description = ggml_backend_amx_device_get_description(dev);
-    props->type        = ggml_backend_amx_device_get_type(dev);
-    ggml_backend_amx_device_get_memory(dev, &props->memory_free, &props->memory_total);
-
-    // `buffer_from_host_ptr` is intended to be used in mmap, when memory layout unchanged
-    props->caps = {
-        /* .async                 = */ false,
-        /* .host_buffer           = */ false,
-        /* .buffer_from_host_ptr  = */ false,
-        /* .events                = */ false,
-    };
-}
-
-static ggml_backend_t ggml_backend_amx_device_init(ggml_backend_dev_t dev, const char * params) {
-    return ggml_backend_amx_init();
-
-    GGML_UNUSED(dev);
-    GGML_UNUSED(params);
-}
-
-static ggml_backend_buffer_type_t ggml_backend_amx_device_get_buffer_type(ggml_backend_dev_t dev) {
-    return ggml_backend_amx_buffer_type();
-
-    GGML_UNUSED(dev);
-}
-
-// Reports whether the AMX device can execute `op`.
-// The no-op/view operations are always accepted; MUL_MAT is accepted only for contiguous 2d
-// operands with F32 activations, a weight type that has AMX kernels (or F16) and an output row
-// size that is a multiple of 2 * TILE_N. Everything else is rejected.
-static bool ggml_backend_amx_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
-    GGML_UNUSED(dev);
-
-    switch (op->op) {
-        case GGML_OP_NONE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_VIEW:
-        case GGML_OP_PERMUTE:
-        case GGML_OP_TRANSPOSE:
-            return true;
-
-        case GGML_OP_MUL_MAT:
-            break; // checked in detail below
-
-        default:
-            return false;
-    }
-
-    // handle only 2d gemm for now
-    const auto is_contiguous_2d = [](const struct ggml_tensor * t) {
-        return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
-    };
-
-    const struct ggml_tensor * weights     = op->src[0];
-    const struct ggml_tensor * activations = op->src[1];
-
-    // weight types with an AMX path: the quantized types reported by qtype_has_amx_kernels, plus F16
-    const enum ggml_type wtype = weights->type;
-    const bool has_amx_kernels = qtype_has_amx_kernels(wtype) || (wtype == GGML_TYPE_F16);
-
-    return is_contiguous_2d(weights) &&            // src0 must be contiguous
-           is_contiguous_2d(activations) &&        // src1 must be contiguous
-           activations->type == GGML_TYPE_F32 &&   // src1 must be float32
-           has_amx_kernels &&                      // with amx kernel impls
-           op->ne[0] % (TILE_N * 2) == 0;          // out_features is 32x
-}
-
-static bool ggml_backend_amx_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name;
-
-    GGML_UNUSED(dev);
-}
-
-static const struct ggml_backend_device_i ggml_backend_amx_device_i = {
-    /* .get_name             = */ ggml_backend_amx_device_get_name,
-    /* .get_description      = */ ggml_backend_amx_device_get_description,
-    /* .get_memory           = */ ggml_backend_amx_device_get_memory,
-    /* .get_type             = */ ggml_backend_amx_device_get_type,
-    /* .get_props            = */ ggml_backend_amx_device_get_props,
-    /* .init_backend         = */ ggml_backend_amx_device_init,
-    /* .get_buffer_type      = */ ggml_backend_amx_device_get_buffer_type,
-    /* .get_host_buffer_type = */ NULL,
-    /* .buffer_from_host_ptr = */ NULL,
-    /* .supports_op          = */ ggml_backend_amx_device_supports_op,
-    /* .supports_buft        = */ ggml_backend_amx_device_supports_buft,
-    /* .offload_op           = */ NULL,
-    /* .event_new            = */ NULL,
-    /* .event_free           = */ NULL,
-    /* .event_synchronize    = */ NULL,
-};
-
-// backend reg interface
-
-static const char * ggml_backend_amx_reg_get_name(ggml_backend_reg_t reg) {
-    return "AMX";
-
-    GGML_UNUSED(reg);
-}
-
-static size_t ggml_backend_amx_reg_get_device_count(ggml_backend_reg_t reg) {
-    return 1;
-
-    GGML_UNUSED(reg);
-}
-
-static ggml_backend_dev_t ggml_backend_amx_reg_get_device(ggml_backend_reg_t reg, size_t index) {
-    GGML_ASSERT(index == 0);
-
-    static ggml_backend_device ggml_backend_amx_device = {
-        /* .iface   = */ ggml_backend_amx_device_i,
-        /* .reg     = */ reg,
-        /* .context = */ nullptr,
-    };
-
-    return &ggml_backend_amx_device;
-
-    GGML_UNUSED(reg);
-    GGML_UNUSED(index);
-}
-
-// Looks up an optional backend entry point by name.
-// Only "ggml_backend_set_n_threads" is exported; any other name yields NULL.
-static void * ggml_backend_amx_get_proc_address(ggml_backend_reg_t reg, const char * name) {
-    GGML_UNUSED(reg);
-
-    if (std::strcmp(name, "ggml_backend_set_n_threads") != 0) {
-        return NULL;
-    }
-
-    return (void *)ggml_backend_amx_set_n_threads;
-}
-
-static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = {
-    /* .get_name         = */ ggml_backend_amx_reg_get_name,
-    /* .get_device_count = */ ggml_backend_amx_reg_get_device_count,
-    /* .get_device       = */ ggml_backend_amx_reg_get_device,
-    /* .get_proc_address = */ ggml_backend_amx_get_proc_address,
-};
-
-ggml_backend_reg_t ggml_backend_amx_reg(void) {
-    static struct ggml_backend_reg ggml_backend_amx_reg = {
-        /* .iface   = */ ggml_backend_amx_reg_i,
-        /* .context = */ NULL,
-    };
-
-    return &ggml_backend_amx_reg;
-}
-
-#else // if defined(__AMX_INT8__)
-
-ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void) {
-    return nullptr;
-}
-
-bool ggml_backend_is_amx(ggml_backend_t backend) {
-    GGML_UNUSED(backend);
-    return false;
-}
-
-ggml_backend_t ggml_backend_amx_init(void) {
-    fprintf(stderr, "GGML is not compiled with AMX support!\n");
-    return nullptr;
-}
-
-void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
-    fprintf(stderr, "GGML is not compiled with AMX support!\n");
-
-    GGML_UNUSED(backend_amx);
-    GGML_UNUSED(n_threads);
-}
-
-ggml_backend_reg_t ggml_backend_amx_reg(void) {
-    return nullptr;
-}
-
-#endif
--- a/ggml/src/ggml-amx/mmq.cpp
+++ b/ggml/src/ggml-amx/mmq.cpp
--- a/ggml/src/ggml-amx/mmq.h
+++ b/ggml/src/ggml-amx/mmq.h
@ -1,17 +0,0 @@
-#pragma once
-#include "common.h"
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// NOTE(review): the implementations live in mmq.cpp, which is not visible here; the
-// descriptions below are inferred from the signatures and should be confirmed.
-
-// presumably the number of bytes to allocate for `tensor` in the AMX buffer - TODO confirm
-size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);
-
-// presumably stores `size` bytes from `data` into `tensor` at `offset`, converting to the AMX weight layout - TODO confirm
-void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-
-// presumably computes the matrix multiplication whose result tensor is `dst` - TODO confirm
-void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst);
-
-#ifdef __cplusplus
-}
-#endif
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@ -1,255 +0,0 @@
-#pragma once
-
-// ggml-backend internal header
-
-#include "ggml-backend.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-    #define GGML_BACKEND_API_VERSION 1
-
-    //
-    // Backend buffer type
-    //
-
-    // Function table describing a buffer type.
-    // Entries tagged "(optional)" state the default that applies when they are not provided.
-    struct ggml_backend_buffer_type_i {
-        // name of the buffer type
-        const char *          (*get_name)      (ggml_backend_buffer_type_t buft);
-        // allocate a buffer of this type
-        ggml_backend_buffer_t (*alloc_buffer)  (ggml_backend_buffer_type_t buft, size_t size);
-        // tensor alignment
-        size_t                (*get_alignment) (ggml_backend_buffer_type_t buft);
-        // (optional) max buffer size that can be allocated (defaults to SIZE_MAX)
-        size_t                (*get_max_size)  (ggml_backend_buffer_type_t buft);
-        // (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
-        size_t                (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
-        // (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
-        bool                  (*is_host)       (ggml_backend_buffer_type_t buft);
-    };
-
-    // A buffer type object: its function table, a device handle and an opaque context pointer.
-    struct ggml_backend_buffer_type {
-        struct ggml_backend_buffer_type_i  iface;
-        ggml_backend_dev_t device;
-        void * context; // opaque, implementation-defined
-    };
-
-    //
-    // Backend buffer
-    //
-
-    // Function table describing the operations available on a buffer.
-    // Entries tagged "(optional)" do not have to be provided by every implementation.
-    struct ggml_backend_buffer_i {
-        // (optional) free the buffer
-        void         (*free_buffer)  (ggml_backend_buffer_t buffer);
-        // base address of the buffer
-        void *       (*get_base)     (ggml_backend_buffer_t buffer);
-        // (optional) initialize a tensor in the buffer (eg. add tensor extras)
-        enum ggml_status (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        // tensor data access
-        void         (*memset_tensor)(ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor,     uint8_t value, size_t offset, size_t size);
-        void         (*set_tensor)   (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void         (*get_tensor)   (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        // (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported)
-        bool         (*cpy_tensor)   (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
-        // clear the entire buffer
-        void         (*clear)        (ggml_backend_buffer_t buffer, uint8_t value);
-        // (optional) reset any internal state due to tensor initialization, such as tensor extras
-        void         (*reset)        (ggml_backend_buffer_t buffer);
-    };
-
-    // A buffer object: its function table, its buffer type, an opaque context pointer,
-    // a size and a usage tag.
-    struct ggml_backend_buffer {
-        struct ggml_backend_buffer_i  iface;
-        ggml_backend_buffer_type_t    buft;
-        void * context; // opaque, implementation-defined
-        size_t size;    // presumably in bytes - TODO confirm
-        enum ggml_backend_buffer_usage usage;
-    };
-
-    // presumably creates a buffer object from the given type, function table, context and size;
-    // the implementation is not in this header - TODO confirm
-    GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
-                   ggml_backend_buffer_type_t buft,
-            struct ggml_backend_buffer_i      iface,
-                   void *                     context,
-                   size_t                     size);
-
-    // do not use directly, use ggml_backend_tensor_copy instead
-    GGML_API bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
-
-    // multi-buffer
-    // buffer that contains a collection of buffers
-    GGML_API ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
-    GGML_API bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
-    GGML_API void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
-
-    //
-    // Backend (stream)
-    //
-
-    // Function table describing a backend (stream).
-    // Groups tagged "(optional)" do not have to be provided by every backend.
-    struct ggml_backend_i {
-        const char * (*get_name)(ggml_backend_t backend);
-
-        void (*free)(ggml_backend_t backend);
-
-        // (optional) asynchronous tensor data access
-        void (*set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
-
-        // (optional) complete all pending operations (required if the backend supports async operations)
-        void (*synchronize)(ggml_backend_t backend);
-
-        // (optional) graph plans (not used currently)
-        // compute graph with a plan
-        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
-        void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
-        void                      (*graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
-        // compute the graph with the plan
-        enum ggml_status          (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-
-        // compute graph (always async if supported by the backend)
-        enum ggml_status          (*graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-        // (optional) event synchronization
-        // record an event on this stream
-        void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
-        // wait for an event on a different stream
-        void (*event_wait)  (ggml_backend_t backend, ggml_backend_event_t event);
-    };
-
-    // A backend (stream) object: a GUID, its function table, a device handle and an
-    // opaque context pointer.
-    struct ggml_backend {
-        ggml_guid_t guid;
-        struct ggml_backend_i iface;
-        ggml_backend_dev_t device;
-        void * context; // opaque, implementation-defined
-    };
-
-    // An event object: a device pointer plus an opaque context pointer.
-    struct ggml_backend_event {
-        struct ggml_backend_device * device;
-        void * context; // opaque, implementation-defined
-    };
-
-    //
-    // Backend device
-    //
-
-    // Note: if additional properties are needed, we should add a struct with all of them
-    //       the current functions to obtain the properties can remain, since they are more convenient for often used properties
-    // Function table describing a backend device.
-    // Entries tagged "(optional)" do not have to be provided by every device.
-    struct ggml_backend_device_i {
-        // device name: short identifier for this device, such as "CPU" or "CUDA0"
-        const char * (*get_name)(ggml_backend_dev_t dev);
-
-        // device description: short informative description of the device, could be the model name
-        const char * (*get_description)(ggml_backend_dev_t dev);
-
-        // device memory in bytes
-        void         (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);
-
-        // device type
-        enum ggml_backend_dev_type (*get_type)(ggml_backend_dev_t dev);
-
-        // device properties
-        void (*get_props)(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props);
-
-        // backend (stream) initialization
-        ggml_backend_t (*init_backend)(ggml_backend_dev_t dev, const char * params);
-
-        // preferred buffer type
-        ggml_backend_buffer_type_t (*get_buffer_type)(ggml_backend_dev_t dev);
-
-        // (optional) host buffer type (in system memory, typically this is a pinned memory buffer for faster transfers between host and device)
-        ggml_backend_buffer_type_t (*get_host_buffer_type)(ggml_backend_dev_t dev);
-
-        // (optional) buffer from pointer: create a buffer from a host pointer (useful for memory mapped models and importing data from other libraries)
-        ggml_backend_buffer_t (*buffer_from_host_ptr)(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size);
-
-        // check if the backend can compute an operation
-        bool (*supports_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
-
-        // check if the backend can use tensors allocated in a buffer type
-        bool (*supports_buft)(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft);
-
-        // (optional) check if the backend wants to run an operation, even if the weights are allocated in an incompatible buffer
-        // these should be expensive operations that may benefit from running on this backend instead of the CPU backend
-        bool (*offload_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
-
-        // (optional) event synchronization
-        // create / destroy / block on an event (inferred from the member names - TODO confirm)
-        ggml_backend_event_t (*event_new)         (ggml_backend_dev_t dev);
-        void                 (*event_free)        (ggml_backend_dev_t dev, ggml_backend_event_t event);
-        void                 (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
-    };
-
-    // A device object: its function table, a backend-registry handle and an opaque context pointer.
-    struct ggml_backend_device {
-        struct ggml_backend_device_i iface;
-        ggml_backend_reg_t reg;
-        void * context; // opaque, implementation-defined
-    };
-
-    //
-    // Backend (reg)
-    //
-
-    // Function table describing a backend registration (reg).
-    struct ggml_backend_reg_i {
-        // name of the backend
-        const char * (*get_name)(ggml_backend_reg_t reg);
-
-        // enumerate available devices
-        size_t             (*get_device_count)(ggml_backend_reg_t reg);
-        ggml_backend_dev_t (*get_device)(ggml_backend_reg_t reg, size_t index);
-
-        // (optional) get a pointer to a function in the backend
-        // backends can add custom functions that are not part of the standard ggml-backend interface
-        void * (*get_proc_address)(ggml_backend_reg_t reg, const char * name);
-    };
-
-    // A backend registration object. `api_version` is the first member, so positional
-    // aggregate initializers must list it before `iface`.
-    struct ggml_backend_reg {
-        int api_version; // initialize to GGML_BACKEND_API_VERSION
-        struct ggml_backend_reg_i iface;
-        void * context; // opaque, implementation-defined
-    };
-
-    // Internal backend registry API
-    // adds `reg` to the backend registry
-    GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
-
-    // Add backend dynamic loading support to the backend
-
-    // Initialize the backend
-    // (signature of the function emitted under the name ggml_backend_init by GGML_BACKEND_DL_IMPL)
-    typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
-    // Optional: obtain a score for the backend based on the system configuration
-    // Higher scores are preferred, 0 means the backend is not supported in the current system
-    // (signature of the function emitted under the name ggml_backend_score by GGML_BACKEND_DL_SCORE_IMPL)
-    typedef int                (*ggml_backend_score_t)(void);
-
-// GGML_BACKEND_DL_IMPL(reg_fn):         defines ggml_backend_init(), which returns reg_fn()
-// GGML_BACKEND_DL_SCORE_IMPL(score_fn): defines ggml_backend_score(), which returns score_fn()
-// Both functions are declared with GGML_BACKEND_API, and wrapped in extern "C" when compiled as C++.
-// Without GGML_BACKEND_DL both macros expand to nothing.
-#ifdef GGML_BACKEND_DL
-#    ifdef __cplusplus
-#        define GGML_BACKEND_DL_IMPL(reg_fn)                             \
-            extern "C" {                                                 \
-            GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
-            }                                                            \
-            ggml_backend_reg_t ggml_backend_init(void) {                 \
-                return reg_fn();                                         \
-            }
-#        define GGML_BACKEND_DL_SCORE_IMPL(score_fn)       \
-            extern "C" {                                   \
-            GGML_BACKEND_API int ggml_backend_score(void); \
-            }                                              \
-            int ggml_backend_score(void) {                 \
-                return score_fn();                         \
-            }
-#    else
-#        define GGML_BACKEND_DL_IMPL(reg_fn)                              \
-            GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void);  \
-            ggml_backend_reg_t                  ggml_backend_init(void) { \
-                return reg_fn();                                          \
-            }
-#        define GGML_BACKEND_DL_SCORE_IMPL(score_fn)        \
-            GGML_BACKEND_API int ggml_backend_score(void);  \
-            int                  ggml_backend_score(void) { \
-                return score_fn();                          \
-            }
-#    endif
-#else
-#    define GGML_BACKEND_DL_IMPL(reg_fn)
-#    define GGML_BACKEND_DL_SCORE_IMPL(score_fn)
-#endif
-
-#ifdef  __cplusplus
-}
-#endif
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@ -1,586 +0,0 @@
-#include "ggml-backend-impl.h"
-#include "ggml-backend.h"
-#include "ggml-impl.h"
-#include <algorithm>
-#include <cstring>
-#include <filesystem>
-#include <memory>
-#include <string>
-#include <type_traits>
-#include <vector>
-#include <cctype>
-
-#ifdef _WIN32
-#    define WIN32_LEAN_AND_MEAN
-#    ifndef NOMINMAX
-#        define NOMINMAX
-#    endif
-#    include <windows.h>
-#elif defined(__APPLE__)
-#    include <mach-o/dyld.h>
-#    include <dlfcn.h>
-#else
-#    include <dlfcn.h>
-#    include <unistd.h>
-#endif
-
-// Backend registry
-#ifdef GGML_USE_CPU
-#include "ggml-cpu.h"
-#endif
-
-#ifdef GGML_USE_CUDA
-#include "ggml-cuda.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
-
-#ifdef GGML_USE_SYCL
-#include "ggml-sycl.h"
-#endif
-
-#ifdef GGML_USE_VULKAN
-#include "ggml-vulkan.h"
-#endif
-
-#ifdef GGML_USE_OPENCL
-#include "ggml-opencl.h"
-#endif
-
-#ifdef GGML_USE_BLAS
-#include "ggml-blas.h"
-#endif
-
-#ifdef GGML_USE_RPC
-#include "ggml-rpc.h"
-#endif
-
-#ifdef GGML_USE_CANN
-#include "ggml-cann.h"
-#endif
-
-#ifdef GGML_USE_KOMPUTE
-#include "ggml-kompute.h"
-#endif
-
-// disable C++17 deprecation warning for std::codecvt_utf8
-#if defined(__clang__)
-#    pragma clang diagnostic push
-#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
-namespace fs = std::filesystem;
-
-// Converts a filesystem path to a UTF-8 encoded std::string.
-// Any exception thrown by the conversion is swallowed and an empty string is returned instead.
-static std::string path_str(const fs::path & path) {
-    try {
-#if defined(__cpp_lib_char8_t)
-        // C++20 and later: u8string() returns std::u8string, so view its bytes as plain char
-        const std::u8string utf8 = path.u8string();
-        return std::string(reinterpret_cast<const char*>(utf8.c_str()));
-#else
-        // C++17: u8string() already returns std::string
-        return path.u8string();
-#endif
-    } catch (...) {
-        // best effort: fall through to the empty result below
-    }
-    return std::string();
-}
-
-#if defined(__clang__)
-#    pragma clang diagnostic pop
-#endif
-
-#ifdef _WIN32
-
-// Windows: HMODULE is a pointer type, so dl_handle is its pointee and `dl_handle *` is HMODULE.
-using dl_handle = std::remove_pointer_t<HMODULE>;
-
-// Deleter for dl_handle_ptr: unloads the library with FreeLibrary.
-struct dl_handle_deleter {
-    void operator()(HMODULE handle) {
-        FreeLibrary(handle);
-    }
-};
-
-// Windows: loads the library at `path` via LoadLibraryW and returns its handle
-// (whatever LoadLibraryW returned, including a null handle on failure).
-static dl_handle * dl_load_library(const fs::path & path) {
-    // suppress error dialogs for missing DLLs
-    const DWORD prev_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
-    SetErrorMode(prev_mode | SEM_FAILCRITICALERRORS);
-
-    const std::wstring wide_path = path.wstring();
-    HMODULE lib = LoadLibraryW(wide_path.c_str());
-
-    // put the previous error mode back
-    SetErrorMode(prev_mode);
-
-    return lib;
-}
-
-// Windows: looks up the exported symbol `name` in `handle` via GetProcAddress.
-// The error mode is adjusted around the call and restored afterwards.
-static void * dl_get_sym(dl_handle * handle, const char * name) {
-    const DWORD prev_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
-    SetErrorMode(prev_mode | SEM_FAILCRITICALERRORS);
-
-    void * sym = (void *) GetProcAddress(handle, name);
-
-    SetErrorMode(prev_mode);
-
-    return sym;
-}
-
-#else
-
-// Non-Windows: library handles are the plain `void *` values used by dlopen/dlclose.
-using dl_handle = void;
-
-// Deleter for dl_handle_ptr: unloads the library with dlclose.
-struct dl_handle_deleter {
-    void operator()(void * handle) {
-        dlclose(handle);
-    }
-};
-
-// Non-Windows: loads the library at `path` with dlopen(RTLD_NOW | RTLD_LOCAL) and returns
-// whatever dlopen returned.
-static void * dl_load_library(const fs::path & path) {
-    const std::string native_path = path.string();
-    return dlopen(native_path.c_str(), RTLD_NOW | RTLD_LOCAL);
-}
-
-// Non-Windows: looks up the symbol `name` in `handle`; returns whatever dlsym returned.
-static void * dl_get_sym(dl_handle * handle, const char * name) {
-    return dlsym(handle, name);
-}
-
-#endif
-
-// Owning library handle: the library is unloaded by dl_handle_deleter when the pointer is destroyed.
-using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
-
-// One registered backend together with the handle of the library it was loaded from.
-// `handle` is null for backends registered without a library handle.
-struct ggml_backend_reg_entry {
-    ggml_backend_reg_t reg;
-    dl_handle_ptr handle;
-};
-
-// Registry of backends and of the devices they expose.
-// The constructor registers every backend enabled at compile time through a GGML_USE_* macro;
-// further backends can be added with register_backend() or loaded from a library with load_backend().
-struct ggml_backend_registry {
-    std::vector<ggml_backend_reg_entry> backends; // in registration order
-    std::vector<ggml_backend_dev_t> devices;      // in registration order
-
-    // Registers the compiled-in backends; the CPU backend is registered last.
-    ggml_backend_registry() {
-#ifdef GGML_USE_CUDA
-        register_backend(ggml_backend_cuda_reg());
-#endif
-#ifdef GGML_USE_METAL
-        register_backend(ggml_backend_metal_reg());
-#endif
-#ifdef GGML_USE_SYCL
-        register_backend(ggml_backend_sycl_reg());
-#endif
-#ifdef GGML_USE_VULKAN
-        register_backend(ggml_backend_vk_reg());
-#endif
-#ifdef GGML_USE_OPENCL
-        register_backend(ggml_backend_opencl_reg());
-#endif
-#ifdef GGML_USE_CANN
-        register_backend(ggml_backend_cann_reg());
-#endif
-#ifdef GGML_USE_BLAS
-        register_backend(ggml_backend_blas_reg());
-#endif
-#ifdef GGML_USE_RPC
-        register_backend(ggml_backend_rpc_reg());
-#endif
-#ifdef GGML_USE_KOMPUTE
-        register_backend(ggml_backend_kompute_reg());
-#endif
-#ifdef GGML_USE_CPU
-        register_backend(ggml_backend_cpu_reg());
-#endif
-    }
-
-    // Intentionally gives up ownership of every library handle instead of unloading it.
-    ~ggml_backend_registry() {
-        // FIXME: backends cannot be safely unloaded without a function to destroy all the backend resources,
-        // since backend threads may still be running and accessing resources from the dynamic library
-        for (auto & entry : backends) {
-            if (entry.handle) {
-                entry.handle.release(); // NOLINT
-            }
-        }
-    }
-
-    // Adds `reg` (and, if given, the library handle that owns it) to the registry and registers
-    // each of its devices. A null `reg` is ignored.
-    void register_backend(ggml_backend_reg_t reg, dl_handle_ptr handle = nullptr) {
-        if (!reg) {
-            return;
-        }
-
-#ifndef NDEBUG
-        GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
-            __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
-#endif
-        backends.push_back({ reg, std::move(handle) });
-        for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
-            register_device(ggml_backend_reg_dev_get(reg, i));
-        }
-    }
-
-    // Appends `device` to the device list.
-    void register_device(ggml_backend_dev_t device) {
-#ifndef NDEBUG
-        GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
-#endif
-        devices.push_back(device);
-    }
-
-    // Loads the backend library at `path`, validates it and registers it.
-    // Returns the backend registration, or nullptr on any failure; on every nullptr return the
-    // local `handle` goes out of scope, which unloads the library again.
-    // `silent` suppresses the failure messages (the final "loaded" message is always logged).
-    ggml_backend_reg_t load_backend(const fs::path & path, bool silent) {
-        dl_handle_ptr handle { dl_load_library(path) };
-        if (!handle) {
-            if (!silent) {
-                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(path).c_str());
-            }
-            return nullptr;
-        }
-
-        // the score function is optional; when present, a score of 0 rejects the backend
-        auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
-        if (score_fn && score_fn() == 0) {
-            if (!silent) {
-                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path_str(path).c_str());
-            }
-            return nullptr;
-        }
-
-        // the init function is mandatory
-        auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
-        if (!backend_init_fn) {
-            if (!silent) {
-                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path_str(path).c_str());
-            }
-            return nullptr;
-        }
-
-        // reject backends that fail to initialize or report a different API version
-        ggml_backend_reg_t reg = backend_init_fn();
-        if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
-            if (!silent) {
-                if (!reg) {
-                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n",
-                        __func__, path_str(path).c_str());
-                } else {
-                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
-                        __func__, path_str(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
-                }
-            }
-            return nullptr;
-        }
-
-        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_str(path).c_str());
-
-        // ownership of the library handle moves into the registry entry
-        register_backend(reg, std::move(handle));
-
-        return reg;
-    }
-
-    // Removes `reg` and all of its devices from the registry. Erasing the entry destroys its
-    // library handle, which unloads the library if the entry owned one.
-    // `silent` suppresses the log messages.
-    void unload_backend(ggml_backend_reg_t reg, bool silent) {
-        auto it = std::find_if(backends.begin(), backends.end(),
-                               [reg](const ggml_backend_reg_entry & entry) { return entry.reg == reg; });
-
-        if (it == backends.end()) {
-            if (!silent) {
-                GGML_LOG_ERROR("%s: backend not found\n", __func__);
-            }
-            return;
-        }
-
-        if (!silent) {
-            GGML_LOG_DEBUG("%s: unloading %s backend\n", __func__, ggml_backend_reg_name(reg));
-        }
-
-        // remove devices
-        devices.erase(
-            std::remove_if(devices.begin(), devices.end(),
-                            [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
-            devices.end());
-
-        // remove backend
-        backends.erase(it);
-    }
-};
-
-// Accessor for the single registry instance, a function-local static that is
-// constructed on the first call.
-static ggml_backend_registry & get_reg() {
-    static ggml_backend_registry registry;
-    return registry;
-}
-
-// Internal API
-// Registers `reg` and its devices with the registry (no library handle is attached).
-void ggml_backend_register(ggml_backend_reg_t reg) {
-    get_reg().register_backend(reg);
-}
-
-// Adds a single device to the registry's device list.
-void ggml_backend_device_register(ggml_backend_dev_t device) {
-    get_reg().register_device(device);
-}
-
-// Backend (reg) enumeration
-// Case-insensitive equality of two NUL-terminated strings, compared byte by byte
-// through std::tolower. Returns true only when both strings have the same length
-// and every pair of bytes matches after lower-casing.
-static bool striequals(const char * a, const char * b) {
-    for (; *a && *b; a++, b++) {
-        // std::tolower is only defined for values representable as unsigned char (or EOF);
-        // a plain char may be negative for non-ASCII bytes, so convert first
-        const int ca = std::tolower(static_cast<unsigned char>(*a));
-        const int cb = std::tolower(static_cast<unsigned char>(*b));
-        if (ca != cb) {
-            return false;
-        }
-    }
-    // the loop stops at the first terminator: equal only if both strings ended together
-    return *a == *b;
-}
-
-// Number of backends currently in the registry.
-size_t ggml_backend_reg_count() {
-    return get_reg().backends.size();
-}
-
-// Backend at position `index` in registration order; asserts that `index` is in range.
-ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
-    GGML_ASSERT(index < ggml_backend_reg_count());
-    return get_reg().backends[index].reg;
-}
-
-// Finds a registered backend whose name matches `name` case-insensitively (see striequals).
-// Returns the first match in registration order, or nullptr if there is none.
-ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
-    size_t idx = 0;
-    while (idx < ggml_backend_reg_count()) {
-        ggml_backend_reg_t candidate = ggml_backend_reg_get(idx);
-        idx++;
-        if (striequals(ggml_backend_reg_name(candidate), name)) {
-            return candidate;
-        }
-    }
-    return nullptr;
-}
-
-// Device enumeration
-// Number of devices currently in the registry.
-size_t ggml_backend_dev_count() {
-    return get_reg().devices.size();
-}
-
-// Device at position `index` in registration order; asserts that `index` is in range.
-ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
-    GGML_ASSERT(index < ggml_backend_dev_count());
-    return get_reg().devices[index];
-}
-
-// Finds a registered device whose name matches `name` case-insensitively (see striequals).
-// Returns the first match in registration order, or nullptr if there is none.
-ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
-    size_t idx = 0;
-    while (idx < ggml_backend_dev_count()) {
-        ggml_backend_dev_t candidate = ggml_backend_dev_get(idx);
-        idx++;
-        if (striequals(ggml_backend_dev_name(candidate), name)) {
-            return candidate;
-        }
-    }
-    return nullptr;
-}
-
-// Finds the first registered device (in registration order) whose type equals `type`.
-// Returns nullptr if no device has that type.
-ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
-    size_t idx = 0;
-    while (idx < ggml_backend_dev_count()) {
-        ggml_backend_dev_t candidate = ggml_backend_dev_get(idx);
-        idx++;
-        if (ggml_backend_dev_type(candidate) == type) {
-            return candidate;
-        }
-    }
-    return nullptr;
-}
-
-// Convenience functions
-// Initializes a backend on the device called `name`, forwarding `params` to the device.
-// Returns nullptr when no such device is registered.
-ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
-    ggml_backend_dev_t found = ggml_backend_dev_by_name(name);
-    if (found) {
-        return ggml_backend_dev_init(found, params);
-    }
-    return nullptr;
-}
-
-// Initializes a backend on the first device of the given `type`, forwarding `params` to the device.
-// Returns nullptr when no device of that type is registered.
-ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
-    ggml_backend_dev_t found = ggml_backend_dev_by_type(type);
-    if (found) {
-        return ggml_backend_dev_init(found, params);
-    }
-    return nullptr;
-}
-
-// Initializes a backend on the first GPU device, falling back to the first CPU device.
-// Returns nullptr when neither kind of device is registered. No parameters are passed to the device.
-ggml_backend_t ggml_backend_init_best(void) {
-    ggml_backend_dev_t chosen = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
-    if (chosen == nullptr) {
-        // no GPU device available: try the CPU instead
-        chosen = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    }
-    if (chosen != nullptr) {
-        return ggml_backend_dev_init(chosen, nullptr);
-    }
-    return nullptr;
-}
-
-// Dynamic loading
-// Loads and registers the backend library at `path`, with failure messages enabled (silent = false).
-// Returns the backend registration, or nullptr on failure.
-ggml_backend_reg_t ggml_backend_load(const char * path) {
-    return get_reg().load_backend(path, false);
-}
-
-// Removes `reg` and its devices from the registry without logging (silent = true),
-// so an unknown `reg` is ignored quietly.
-void ggml_backend_unload(ggml_backend_reg_t reg) {
-    get_reg().unload_backend(reg, true);
-}
-
-// Returns the directory that contains the running executable, with a trailing path
-// separator. Supported on macOS, Linux, FreeBSD and Windows; on other platforms, or when
-// the Windows query fails, an empty path is returned. On Linux/FreeBSD a failed readlink
-// leaves the "./" fallback in place.
-static fs::path get_executable_path() {
-#if defined(__APPLE__)
-    // get executable path
-    std::vector<char> path;
-    uint32_t size;
-    while (true) {
-        size = path.size();
-        // on failure the call stores the required buffer size in `size`
-        if (_NSGetExecutablePath(path.data(), &size) == 0) {
-            break;
-        }
-        path.resize(size);
-    }
-    std::string base_path(path.data(), size);
-    // remove executable name
-    auto last_slash = base_path.find_last_of('/');
-    if (last_slash != std::string::npos) {
-        base_path = base_path.substr(0, last_slash);
-    }
-    return base_path + "/";
-#elif defined(__linux__) || defined(__FreeBSD__)
-    std::string base_path = ".";
-    std::vector<char> path(1024);
-    while (true) {
-        // get executable path
-#    if defined(__linux__)
-        ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
-#    elif defined(__FreeBSD__)
-        ssize_t len = readlink("/proc/curproc/file", path.data(), path.size());
-#    endif
-        if (len == -1) {
-            break;
-        }
-        // readlink does not report truncation: a result that fills the buffer may be cut off
-        if (len < (ssize_t) path.size()) {
-            base_path = std::string(path.data(), len);
-            // remove executable name
-            auto last_slash = base_path.find_last_of('/');
-            if (last_slash != std::string::npos) {
-                base_path = base_path.substr(0, last_slash);
-            }
-            break;
-        }
-        path.resize(path.size() * 2);
-    }
-
-    return base_path + "/";
-#elif defined(_WIN32)
-    std::vector<wchar_t> path(MAX_PATH);
-    DWORD len = 0;
-    while (true) {
-        len = GetModuleFileNameW(NULL, path.data(), (DWORD) path.size());
-        if (len == 0) {
-            return {};
-        }
-        if (len < path.size()) {
-            break;
-        }
-        // a return value equal to the buffer size means the path was truncated:
-        // retry with a larger buffer instead of using the cut-off path
-        path.resize(path.size() * 2);
-    }
-    std::wstring base_path(path.data(), len);
-    // remove executable name
-    auto last_slash = base_path.find_last_of('\\');
-    if (last_slash != std::wstring::npos) {
-        base_path = base_path.substr(0, last_slash);
-    }
-    return base_path + L"\\";
-#else
-    return {};
-#endif
-}
-
-// File-name prefix of backend libraries: "ggml-" on Windows, "libggml-" elsewhere.
-static fs::path backend_filename_prefix() {
-#ifdef _WIN32
-    const char * prefix = "ggml-";
-#else
-    const char * prefix = "libggml-";
-#endif
-    return fs::u8path(prefix);
-}
-
-// File-name extension of backend libraries: ".dll" on Windows, ".so" elsewhere.
-static fs::path backend_filename_extension() {
-#ifdef _WIN32
-    const char * extension = ".dll";
-#else
-    const char * extension = ".so";
-#endif
-    return fs::u8path(extension);
-}
-
-// Picks and loads the best available variant of the backend called `name`.
-//
-// Every regular file named [lib]ggml-<name>-*.[so|dll] in the search paths is opened and
-// asked for its score through ggml_backend_score; the highest-scoring file is loaded
-// with load_backend. If no variant scores above 0, the base library [lib]ggml-<name>.[so|dll]
-// is loaded from the first search path that contains it.
-//
-//   name             - backend name used to build the file names
-//   silent           - suppresses the load-failure and missing-score messages
-//   user_search_path - directory to search; nullptr means the executable directory
-//                      followed by the current directory
-//
-// Returns the registration of the loaded backend, or nullptr if nothing was loaded.
-static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
-    // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
-    const fs::path name_path = fs::u8path(name);
-    const fs::path file_prefix = backend_filename_prefix().native() + name_path.native() + fs::u8path("-").native();
-    const fs::path file_extension = backend_filename_extension();
-
-    std::vector<fs::path> search_paths;
-    if (user_search_path == nullptr) {
-        // default search paths: executable directory, current directory
-        search_paths.push_back(get_executable_path());
-        search_paths.push_back(fs::current_path());
-    } else {
-        search_paths.push_back(fs::u8path(user_search_path));
-    }
-
-    // a score of 0 means "not supported", so only strictly positive scores can win
-    int best_score = 0;
-    fs::path best_path;
-
-    for (const auto & search_path : search_paths) {
-        if (!fs::exists(search_path)) {
-            GGML_LOG_DEBUG("%s: search path %s does not exist\n", __func__, path_str(search_path).c_str());
-            continue;
-        }
-        fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
-        for (const auto & entry : dir_it) {
-            if (entry.is_regular_file()) {
-                auto filename = entry.path().filename();
-                auto ext = entry.path().extension();
-                if (filename.native().find(file_prefix) == 0 && ext == file_extension) {
-                    // the candidate is only opened to query its score; `handle` closes it
-                    // again at the end of this scope
-                    dl_handle_ptr handle { dl_load_library(entry) };
-                    if (!handle && !silent) {
-                        GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(entry.path()).c_str());
-                    }
-                    if (handle) {
-                        auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
-                        if (score_fn) {
-                            int s = score_fn();
-#ifndef NDEBUG
-                            GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, path_str(entry.path()).c_str(), s);
-#endif
-                            // strict comparison: on a tie the candidate found first is kept
-                            if (s > best_score) {
-                                best_score = s;
-                                best_path = entry.path();
-                            }
-                        } else {
-                            if (!silent) {
-                                GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, path_str(entry.path()).c_str());
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    if (best_score == 0) {
-        // try to load the base backend
-        for (const auto & search_path : search_paths) {
-            fs::path filename = backend_filename_prefix().native() + name_path.native() + backend_filename_extension().native();
-            fs::path path = search_path / filename;
-            if (fs::exists(path)) {
-                return get_reg().load_backend(path, silent);
-            }
-        }
-        return nullptr;
-    }
-
-    return get_reg().load_backend(best_path, silent);
-}
-
-// Loads all known backends using the default search paths (no explicit directory).
-void ggml_backend_load_all() {
-    ggml_backend_load_all_from_path(nullptr);
-}
-
-// Tries to load the best variant of every known backend from `dir_path` (nullptr selects
-// the default search paths), then loads the library named by the GGML_BACKEND_PATH
-// environment variable if it is set.
-void ggml_backend_load_all_from_path(const char * dir_path) {
-    // builds with NDEBUG defined run the per-backend search silently
-#ifdef NDEBUG
-    const bool silent = true;
-#else
-    const bool silent = false;
-#endif
-
-    // known backends, tried in exactly this order
-    static const char * const known_backends[] = {
-        "blas", "cann", "cuda", "hip", "kompute", "metal",
-        "rpc", "sycl", "vulkan", "opencl", "musa", "cpu",
-    };
-    for (const char * backend_name : known_backends) {
-        ggml_backend_load_best(backend_name, silent, dir_path);
-    }
-
-    // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
-    if (const char * backend_path = std::getenv("GGML_BACKEND_PATH")) {
-        ggml_backend_load(backend_path);
-    }
-}
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
--- a/ggml/src/ggml-blas/CMakeLists.txt
+++ b/ggml/src/ggml-blas/CMakeLists.txt
@ -1,87 +0,0 @@
-# Build script for the ggml BLAS backend (target: ggml-blas).
-#
-# Variables read:
-#   GGML_STATIC       - when true, FindBLAS is asked for static libraries (BLA_STATIC)
-#   GGML_BLAS_VENDOR  - forwarded to FindBLAS as BLA_VENDOR; also selects the
-#                       pkg-config module and vendor compile definitions below
-#   BLAS_INCLUDE_DIRS - optional; when empty it is derived via pkg-config or find_path
-#   NVHPC_VERSION     - only used for the NVHPC vendor to build the include path
-if (GGML_STATIC)
-    set(BLA_STATIC ON)
-endif()
-#if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
-#    set(BLA_SIZEOF_INTEGER 8)
-#endif()
-
-set(BLA_VENDOR ${GGML_BLAS_VENDOR})
-find_package(BLAS)
-
-if (BLAS_FOUND)
-    message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
-
-    ggml_add_backend_library(ggml-blas
-                             ggml-blas.cpp
-                            )
-
-    # Vendor checks quote their operands so that an empty or list-valued variable
-    # cannot turn the condition into a syntax error.
-    if ("${GGML_BLAS_VENDOR}" MATCHES "Apple")
-        target_compile_definitions(ggml-blas PRIVATE
-            ACCELERATE_NEW_LAPACK
-            ACCELERATE_LAPACK_ILP64
-            GGML_BLAS_USE_ACCELERATE)
-    elseif ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
-        # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
-        # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
-        find_package(PkgConfig REQUIRED)
-        if ("${GGML_BLAS_VENDOR}" MATCHES "Generic")
-            pkg_check_modules(DepBLAS blas)
-        elseif ("${GGML_BLAS_VENDOR}" MATCHES "OpenBLAS")
-            # As of openblas v0.3.22, the 64-bit is named openblas64.pc
-            pkg_check_modules(DepBLAS openblas64)
-            if (NOT DepBLAS_FOUND)
-                pkg_check_modules(DepBLAS openblas)
-            endif()
-        elseif ("${GGML_BLAS_VENDOR}" MATCHES "FLAME")
-            target_compile_definitions(ggml-blas PRIVATE GGML_BLAS_USE_BLIS)
-            pkg_check_modules(DepBLAS blis)
-        elseif ("${GGML_BLAS_VENDOR}" MATCHES "ATLAS")
-            pkg_check_modules(DepBLAS blas-atlas)
-        elseif ("${GGML_BLAS_VENDOR}" MATCHES "FlexiBLAS")
-            pkg_check_modules(DepBLAS flexiblas_api)
-        elseif ("${GGML_BLAS_VENDOR}" MATCHES "Intel")
-            target_compile_definitions(ggml-blas PRIVATE GGML_BLAS_USE_MKL)
-            # all Intel* libraries share the same include path
-            pkg_check_modules(DepBLAS mkl-sdl)
-        elseif ("${GGML_BLAS_VENDOR}" MATCHES "NVHPC")
-            # this doesn't provide pkg-config
-            # suggest to assign BLAS_INCLUDE_DIRS on your own
-            if ("${NVHPC_VERSION}" STREQUAL "")
-                message(WARNING "Better to set NVHPC_VERSION")
-            else()
-                set(DepBLAS_FOUND ON)
-                set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include")
-            endif()
-        endif()
-        if (DepBLAS_FOUND)
-            set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS})
-        else()
-            message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically"
-            " detected by pkgconfig, trying to find cblas.h from possible paths...")
-            find_path(BLAS_INCLUDE_DIRS
-                NAMES cblas.h
-                HINTS
-                    /usr/include
-                    /usr/local/include
-                    /usr/include/openblas
-                    /opt/homebrew/opt/openblas/include
-                    /usr/local/opt/openblas/include
-                    /usr/include/x86_64-linux-gnu/openblas/include
-            )
-        endif()
-    endif()
-
-    message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
-
-    # NOTE(review): BLAS_LINKER_FLAGS are linker flags reported by FindBLAS but are
-    # applied as compile options here; kept as-is to avoid changing the build line.
-    target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS})
-
-    # MKL can also be picked up through the Generic/Intel vendors; detect it from
-    # the include path so the source includes <mkl.h>.
-    if ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND ("${GGML_BLAS_VENDOR}" MATCHES "Generic" OR "${GGML_BLAS_VENDOR}" MATCHES "Intel"))
-        target_compile_definitions(ggml-blas PRIVATE GGML_BLAS_USE_MKL)
-    endif()
-
-    target_link_libraries     (ggml-blas PRIVATE ${BLAS_LIBRARIES})
-    target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
-else()
-    # "ERROR" is not a message() mode (it would be printed as plain text and the
-    # configure step would continue without the backend); fail explicitly instead.
-    message(FATAL_ERROR "BLAS not found, please refer to "
-                        "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
-                        " to set correct GGML_BLAS_VENDOR")
-endif()
--- a/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
@ -1,517 +0,0 @@
-#include "ggml-impl.h"
-#include "ggml-blas.h"
-#include "ggml-backend-impl.h"
-
-#include <future>
-#include <vector>
-#include <cstring>
-
-#if defined(GGML_BLAS_USE_ACCELERATE)
-#   include <Accelerate/Accelerate.h>
-#elif defined(GGML_BLAS_USE_MKL)
-#   include <mkl.h>
-#elif defined(GGML_BLAS_USE_BLIS)
-#   include <blis.h>
-#elif defined(GGML_BLAS_USE_NVPL)
-#   include <nvpl_blas.h>
-#else
-#   include <cblas.h>
-#endif
-
-// Per-backend state of the BLAS backend; one instance is created in
-// ggml_backend_blas_init() and destroyed in ggml_backend_blas_free().
-struct ggml_backend_blas_context {
-    // thread count used for the src0 conversion and passed to the BLAS library's
-    // set-num-threads call where one is available
-    int n_threads = GGML_DEFAULT_N_THREADS;
-    // scratch buffer that receives src0 converted to float in mul_mat
-    std::unique_ptr<char[]> work_data;
-    // size of work_data in bytes
-    size_t work_size = 0;
-#ifndef GGML_USE_OPENMP
-    // futures of the std::async conversion tasks (only without OpenMP)
-    std::vector<std::future<void>> tasks;
-#endif
-};
-
-// Matrix multiplication through BLAS.
-//
-// For every batch plane this calls cblas_sgemm (row-major, A = src1 plane not
-// transposed, B = src0 plane transposed) and writes the result into dst. When
-// src0 is not F32 it is first converted to float into the context's scratch
-// buffer using the type's to_float function.
-//
-// ctx: provides n_threads, the reusable scratch buffer and, without OpenMP, the
-//      list of pending conversion tasks.
-// dst: output tensor; the operands are read from dst->src[0] and dst->src[1].
-static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) {
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    // NOTE(review): this macro is defined elsewhere; it presumably declares the
-    // ne*/nb* locals used below (ne0x/nb0x for src0, ne1x/nb1x for src1, ne*/nb* for dst)
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const enum ggml_type type = src0->type;
-
-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne12);
-    GGML_ASSERT(ne3 == ne13);
-
-    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == ggml_type_size(type));
-    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
-
-    // scratch space for a float copy of all src0 planes; none is needed for F32
-    const int64_t ne_plane      = ne01*ne00;
-    const size_t  desired_wsize = type == GGML_TYPE_F32 ? 0 : ne03*ne02*ne_plane*sizeof(float);
-
-    // the buffer only grows: it is reallocated when the current one is too small
-    if (ctx->work_size < desired_wsize) {
-        ctx->work_data.reset(new char[desired_wsize]);
-        ctx->work_size = desired_wsize;
-    }
-    void * wdata = ctx->work_data.get();
-
-    // convert src0 to float
-    if (type != GGML_TYPE_F32) {
-        const auto * type_traits = ggml_get_type_traits(type);
-        ggml_to_float_t const to_float = type_traits->to_float;
-
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                const void  *       x      = (char *)  src0->data + i02*nb02          + i03*nb03;
-                      float * const wplane = (float *) wdata      + i02*ne_plane      + i03*ne02*ne_plane;
-
-                // cap the thread count so that each thread converts at least
-                // min_rows_per_thread rows (about min_cols_per_thread elements)
-                const int min_cols_per_thread = 4096;
-                const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1);
-                const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1);
-
-#ifdef GGML_USE_OPENMP
-                #pragma omp parallel for num_threads(n_threads)
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
-                }
-#else
-                // row slices 1..n_threads-1 are converted asynchronously; their
-                // futures are collected in ctx->tasks and awaited after the plane loops
-                for (int i = 1; i < n_threads; i++) {
-                    const int64_t start =       i*ne01/n_threads;
-                    const int64_t end   = (i + 1)*ne01/n_threads;
-                    if (start < end) {
-                        ctx->tasks.push_back(std::async(std::launch::async, [=]() {
-                            for (int64_t i01 = start; i01 < end; i01++) {
-                                to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
-                            }
-                        }));
-                    }
-                }
-                {
-                    // reuse the current thread for the first task
-                    const int64_t start = 0;
-                    const int64_t end   = ne01/n_threads;
-                    for (int64_t i01 = start; i01 < end; i01++) {
-                        to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
-                    }
-                }
-#endif
-            }
-        }
-
-#ifndef GGML_USE_OPENMP
-        // wait for all tasks to finish
-        for (auto & task : ctx->tasks) {
-            task.get();
-        }
-        ctx->tasks.clear();
-#endif
-    }
-
-    // forward the configured thread count to the BLAS implementation in use
-#if defined(OPENBLAS_VERSION)
-    openblas_set_num_threads(ctx->n_threads);
-#endif
-
-#if defined(GGML_BLAS_USE_BLIS)
-    bli_thread_set_num_threads(ctx->n_threads);
-#endif
-
-#if defined(GGML_BLAS_USE_NVPL)
-    nvpl_blas_set_num_threads(ctx->n_threads);
-#endif
-
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
-        for (int64_t i12 = 0; i12 < ne12; i12++) {
-            // map the src1 batch index to the (possibly broadcast) src0 plane
-            const int64_t i03 = i13/r3;
-            const int64_t i02 = i12/r2;
-
-            const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
-            const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
-                  float * d = (float *) ((char *)  dst->data + i12*nb2  + i13*nb3);
-
-            // use the converted float copy when src0 is not F32
-            if (type != GGML_TYPE_F32) {
-                x = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane;
-            }
-
-            cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                        ne1, ne01, ne10,
-                        1.0f,   y, ne10,
-                                x, ne00,
-                        0.0f,   d, ne01);
-        }
-    }
-}
-
-// Outer product through a single cblas_sgemm call on the raw data of
-// dst->src[0] and dst->src[1]; the result is written to dst->data.
-//
-// ctx: unused.
-// dst: output tensor holding its two operands in dst->src[0..1].
-static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) {
-    GGML_UNUSED(ctx);
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    // shape requirements
-    GGML_ASSERT(ne0  == ne00);
-    GGML_ASSERT(ne1  == ne10);
-    GGML_ASSERT(ne2  == ne02);
-    GGML_ASSERT(ne02 == ne12);
-    GGML_ASSERT(ne3  == ne13);
-    GGML_ASSERT(ne03 == ne13);
-
-    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    // GGML_ASSERT(nb0 <= nb1);
-    // GGML_ASSERT(nb1 <= nb2);
-    // GGML_ASSERT(nb2 <= nb3);
-
-    // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
-    // src0: (k,n)
-    // src1: (k,m)
-    // dst:  (m,n)
-    //
-    // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
-    // Also expressed as (major,minor)
-    // a: (m,k): so src1 transposed
-    // b: (k,n): so src0
-    // c: (m,n)
-    //
-    // However, if ggml_is_transposed(src1) is true, then
-    // src1->data already contains a transposed version, so sgemm mustn't
-    // transpose it further.
-
-    const int n = src0->ne[0];
-    const int k = src0->ne[1];
-    const int m = src1->ne[0];
-
-    // pick the transpose flag and leading dimension of A from src1's layout
-    const bool src1_is_transposed = ggml_is_transposed(src1);
-    const CBLAS_TRANSPOSE transposeA = src1_is_transposed ? CblasNoTrans : CblasTrans;
-    const int             lda        = src1_is_transposed ? k            : m;
-
-    float * a = (float *) src1->data;
-    float * b = (float *) src0->data;
-    float * c = (float *) dst->data;
-
-    cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans,
-                m, n, k,
-                1.0, a, lda,
-                     b, n,
-                0.0, c, n);
-}
-
-// backend interface
-
-// Backend interface: returns the constant name "BLAS"; the argument is unused.
-static const char * ggml_backend_blas_get_name(ggml_backend_t backend) {
-    return "BLAS";
-
-    GGML_UNUSED(backend);
-}
-
-// Backend interface: destroys the BLAS context stored in backend->context and
-// then the backend object itself.
-static void ggml_backend_blas_free(ggml_backend_t backend) {
-    delete static_cast<ggml_backend_blas_context *>(backend->context);
-    delete backend;
-}
-
-// Backend interface: walks the graph nodes in order and executes each one.
-// MUL_MAT and OUT_PROD are dispatched to the BLAS implementations, the
-// NONE/RESHAPE/VIEW/PERMUTE/TRANSPOSE ops do no work, and any other op aborts.
-// Returns GGML_STATUS_SUCCESS once every node has been processed.
-static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    ggml_backend_blas_context * blas_ctx = static_cast<ggml_backend_blas_context *>(backend->context);
-
-    for (int idx = 0; idx < cgraph->n_nodes; ++idx) {
-        struct ggml_tensor * node = cgraph->nodes[idx];
-
-        switch (node->op) {
-            case GGML_OP_MUL_MAT:
-                ggml_backend_blas_mul_mat(blas_ctx, node);
-                break;
-
-            case GGML_OP_OUT_PROD:
-                ggml_backend_blas_out_prod(blas_ctx, node);
-                break;
-
-            // nothing to compute for these ops
-            case GGML_OP_NONE:
-            case GGML_OP_RESHAPE:
-            case GGML_OP_VIEW:
-            case GGML_OP_PERMUTE:
-            case GGML_OP_TRANSPOSE:
-                break;
-
-            default:
-                GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node));
-        }
-    }
-
-    return GGML_STATUS_SUCCESS;
-}
-
-// Backend interface table of the BLAS backend. Only get_name, free and
-// graph_compute are implemented; every other entry is NULL.
-static struct ggml_backend_i blas_backend_i = {
-    /* .get_name                = */ ggml_backend_blas_get_name,
-    /* .free                    = */ ggml_backend_blas_free,
-    /* .set_tensor_async        = */ NULL,
-    /* .get_tensor_async        = */ NULL,
-    /* .cpy_tensor_async        = */ NULL,
-    /* .synchronize             = */ NULL,
-    /* .graph_plan_create       = */ NULL,
-    /* .graph_plan_free         = */ NULL,
-    /* .graph_plan_update       = */ NULL,
-    /* .graph_plan_compute      = */ NULL,
-    /* .graph_compute           = */ ggml_backend_blas_graph_compute,
-    /* .event_record            = */ NULL,
-    /* .event_wait              = */ NULL,
-};
-
-// Returns the address of a function-local static GUID; ggml_backend_is_blas()
-// compares a backend's guid against it.
-static ggml_guid_t ggml_backend_blas_guid(void) {
-    static ggml_guid guid = { 0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d };
-    return &guid;
-}
-
-// Creates a new BLAS backend instance with a freshly allocated context.
-// Both allocations are released again by ggml_backend_blas_free().
-// Returns the new backend.
-ggml_backend_t ggml_backend_blas_init(void) {
-    ggml_backend_blas_context * ctx = new ggml_backend_blas_context;
-
-    ggml_backend_t backend = new ggml_backend {
-        /* .guid      = */ ggml_backend_blas_guid(),
-        /* .interface = */ blas_backend_i,
-        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
-        /* .context   = */ ctx,
-    };
-
-    // debug-level diagnostics when ggml uses OpenMP but the BLAS library does not
-#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
-    if (openblas_get_parallel() != OPENBLAS_OPENMP) {
-        GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
-    }
-#endif
-
-#if defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
-    GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
-#endif
-
-    return backend;
-}
-
-// Returns true when backend is non-NULL and its guid matches the BLAS backend guid.
-bool ggml_backend_is_blas(ggml_backend_t backend) {
-    if (backend == NULL) {
-        return false;
-    }
-    return ggml_guid_matches(backend->guid, ggml_backend_blas_guid());
-}
-
-// Stores n_threads in the context of a BLAS backend.
-// Asserts that backend_blas really is a BLAS backend.
-void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads) {
-    GGML_ASSERT(ggml_backend_is_blas(backend_blas));
-
-    static_cast<ggml_backend_blas_context *>(backend_blas->context)->n_threads = n_threads;
-}
-
-// device interface
-
-// Device interface: returns the constant device name "BLAS"; dev is unused.
-static const char * ggml_backend_blas_device_get_name(ggml_backend_dev_t dev) {
-    return "BLAS";
-
-    GGML_UNUSED(dev);
-}
-
-// Device interface: returns a description string chosen at compile time from the
-// BLAS macros that are defined, checked in the order Accelerate, MKL, BLIS,
-// NVPL, OpenBLAS, with "BLAS" as the fallback. dev is unused.
-static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t dev) {
-    #if defined(GGML_BLAS_USE_ACCELERATE)
-        return "Accelerate";
-    #elif defined(GGML_BLAS_USE_MKL)
-        return "MKL";
-    #elif defined(GGML_BLAS_USE_BLIS)
-        return "BLIS";
-    #elif defined(GGML_BLAS_USE_NVPL)
-        return "NVPL";
-    #elif defined(OPENBLAS_VERSION)
-        return "OpenBLAS";
-    #else
-        return "BLAS";
-    #endif
-
-    GGML_UNUSED(dev);
-}
-
-// Device interface: memory reporting is not implemented yet; both *free and
-// *total are set to 0. dev is unused.
-static void ggml_backend_blas_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    // TODO
-    *free = 0;
-    *total = 0;
-
-    GGML_UNUSED(dev);
-}
-
-// Device interface: always reports GGML_BACKEND_DEVICE_TYPE_ACCEL; dev is unused.
-static enum ggml_backend_dev_type ggml_backend_blas_device_get_type(ggml_backend_dev_t dev) {
-    return GGML_BACKEND_DEVICE_TYPE_ACCEL;
-
-    GGML_UNUSED(dev);
-}
-
-// Device interface: fills *props with the device's name, description, type and
-// memory figures (taken from the getters in this file) and a fixed capability
-// set in which only buffer_from_host_ptr is enabled.
-static void ggml_backend_blas_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
-    props->name        = ggml_backend_blas_device_get_name(dev);
-    props->description = ggml_backend_blas_device_get_description(dev);
-    props->type        = ggml_backend_blas_device_get_type(dev);
-    ggml_backend_blas_device_get_memory(dev, &props->memory_free, &props->memory_total);
-    props->caps = {
-        /* .async                 = */ false,
-        /* .host_buffer           = */ false,
-        /* .buffer_from_host_ptr  = */ true,
-        /* .events                = */ false,
-    };
-}
-
-// Device interface: creates a backend via ggml_backend_blas_init(); both dev and
-// params are ignored.
-static ggml_backend_t ggml_backend_blas_device_init_backend(ggml_backend_dev_t dev, const char * params) {
-    return ggml_backend_blas_init();
-
-    GGML_UNUSED(dev);
-    GGML_UNUSED(params);
-}
-
-// Device interface: the BLAS device uses the CPU buffer type; dev is unused.
-static ggml_backend_buffer_type_t ggml_backend_blas_device_get_buffer_type(ggml_backend_dev_t dev) {
-    return ggml_backend_cpu_buffer_type();
-
-    GGML_UNUSED(dev);
-}
-
-// Device interface: forwards ptr and size to ggml_backend_cpu_buffer_from_ptr();
-// dev and max_tensor_size are ignored.
-static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
-    return ggml_backend_cpu_buffer_from_ptr(ptr, size);
-
-    GGML_UNUSED(dev);
-    GGML_UNUSED(max_tensor_size);
-}
-
-// Device interface: reports whether this backend can execute op.
-//
-// - NONE/RESHAPE/VIEW/PERMUTE/TRANSPOSE are always supported.
-// - MUL_MAT needs contiguous operands, an F32 src1, a src0 that is F32 or has a
-//   to_float conversion, and ne0, ne1 and ne10 of at least min_batch.
-// - OUT_PROD needs two F32 matrices, src0 contiguous and src1 contiguous or
-//   transposed.
-// - Everything else is unsupported.
-//
-// dev is unused.
-static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
-    GGML_UNUSED(dev);
-
-    const struct ggml_tensor * src0 = op->src[0];
-    const struct ggml_tensor * src1 = op->src[1];
-
-    switch (op->op) {
-        case GGML_OP_NONE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_VIEW:
-        case GGML_OP_PERMUTE:
-        case GGML_OP_TRANSPOSE:
-            return true;
-
-        case GGML_OP_MUL_MAT:
-        {
-            // BLAS usually is only faster for large matrices
-            const int64_t ne10 = src1->ne[0];
-
-            const int64_t ne0 = op->ne[0];
-            const int64_t ne1 = op->ne[1];
-
-            // TODO: find the optimal value
-            const int64_t min_batch = 32;
-
-            return ggml_is_contiguous(src0) &&
-                   ggml_is_contiguous(src1) &&
-                   src1->type == GGML_TYPE_F32 &&
-                   (ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch) &&
-                   (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL);
-        }
-
-        case GGML_OP_OUT_PROD:
-            // src0 must be F32 here, so no to_float fallback check is needed
-            return src0->type == GGML_TYPE_F32 &&
-                   src1->type == GGML_TYPE_F32 &&
-                   ggml_is_matrix(src0) &&
-                   ggml_is_matrix(src1) &&
-                   ggml_is_contiguous(src0) &&
-                   (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
-
-        default:
-            return false;
-    }
-}
-
-// Device interface: a buffer type is supported when ggml_backend_buft_is_host()
-// returns true for it; dev is unused.
-static bool ggml_backend_blas_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return ggml_backend_buft_is_host(buft);
-
-    GGML_UNUSED(dev);
-}
-
-// Device interface table of the BLAS device. get_host_buffer_type, offload_op
-// and the event_* entries are not provided (NULL).
-static const struct ggml_backend_device_i ggml_backend_blas_device_i = {
-    /* .get_name             = */ ggml_backend_blas_device_get_name,
-    /* .get_description      = */ ggml_backend_blas_device_get_description,
-    /* .get_memory           = */ ggml_backend_blas_device_get_memory,
-    /* .get_type             = */ ggml_backend_blas_device_get_type,
-    /* .get_props            = */ ggml_backend_blas_device_get_props,
-    /* .init_backend         = */ ggml_backend_blas_device_init_backend,
-    /* .get_buffer_type      = */ ggml_backend_blas_device_get_buffer_type,
-    /* .get_host_buffer_type = */ NULL,
-    /* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_host_ptr,
-    /* .supports_op          = */ ggml_backend_blas_device_supports_op,
-    /* .supports_buft        = */ ggml_backend_blas_device_supports_buft,
-    /* .offload_op           = */ NULL,
-    /* .event_new            = */ NULL,
-    /* .event_free           = */ NULL,
-    /* .event_synchronize    = */ NULL,
-};
-
-// backend reg interface
-
-// Registry interface: returns the constant registry name "BLAS"; reg is unused.
-static const char * ggml_backend_blas_reg_get_name(ggml_backend_reg_t reg) {
-    return "BLAS";
-
-    GGML_UNUSED(reg);
-}
-
-// Registry interface: the BLAS registry always reports exactly one device.
-static size_t ggml_backend_blas_reg_get_device_count(ggml_backend_reg_t reg) {
-    return 1;
-
-    GGML_UNUSED(reg);
-}
-
-// Registry interface: returns the single BLAS device. index must be 0
-// (asserted). The device is a function-local static, so it is initialized once
-// and captures the reg value passed on the first call.
-static ggml_backend_dev_t ggml_backend_blas_reg_get_device(ggml_backend_reg_t reg, size_t index) {
-    GGML_ASSERT(index == 0);
-
-    static ggml_backend_device ggml_backend_blas_device = {
-        /* .iface   = */ ggml_backend_blas_device_i,
-        /* .reg     = */ reg,
-        /* .context = */ nullptr,
-    };
-
-    return &ggml_backend_blas_device;
-
-    GGML_UNUSED(reg);
-    GGML_UNUSED(index);
-}
-
-// Registry interface: looks up an optional entry point by name. Only
-// "ggml_backend_set_n_threads" is known and maps to
-// ggml_backend_blas_set_n_threads; any other name yields NULL. reg is unused.
-static void * ggml_backend_blas_get_proc_address(ggml_backend_reg_t reg, const char * name) {
-    GGML_UNUSED(reg);
-
-    if (std::strcmp(name, "ggml_backend_set_n_threads") != 0) {
-        return NULL;
-    }
-    return (void *)ggml_backend_blas_set_n_threads;
-}
-
-// Registry interface table of the BLAS backend.
-static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
-    /* .get_name         = */ ggml_backend_blas_reg_get_name,
-    /* .get_device_count = */ ggml_backend_blas_reg_get_device_count,
-    /* .get_device       = */ ggml_backend_blas_reg_get_device,
-    /* .get_proc_address = */ ggml_backend_blas_get_proc_address,
-};
-
-// Returns the BLAS backend registry, a function-local static with no context.
-ggml_backend_reg_t ggml_backend_blas_reg(void) {
-    static struct ggml_backend_reg ggml_backend_blas_reg = {
-        /* .api_version = */ GGML_BACKEND_API_VERSION,
-        /* .iface       = */ ggml_backend_blas_reg_i,
-        /* .context     = */ NULL,
-    };
-
-    return &ggml_backend_blas_reg;
-}
-
-// NOTE(review): macro defined elsewhere; presumably exports the registry entry
-// point when the backend is built as a dynamically loadable library - confirm.
-GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg)
--- a/ggml/src/ggml-cann/CMakeLists.txt
+++ b/ggml/src/ggml-cann/CMakeLists.txt
@ -1,74 +0,0 @@
-# Build script for the ggml CANN (Ascend NPU) backend (target: ggml-cann).
-#
-# Variables read:
-#   CANN_INSTALL_DIR - CANN toolkit root; defaults to $ENV{ASCEND_TOOLKIT_HOME}
-#   SOC_TYPE         - Ascend SoC type; auto-detected through npu-smi when unset
-if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME})
-    set(CANN_INSTALL_DIR $ENV{ASCEND_TOOLKIT_HOME})
-    message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
-endif()
-
-# Auto-detect Soc type and Soc version, if detect failed, will abort build
-set(SOC_VERSION "")
-
-# detect_ascend_soc_type(<out_var>)
-# Runs `npu-smi info`, takes the third field of its 7th line and stores
-# "Ascend<field>" in <out_var> in the caller's scope. Aborts the configure step
-# when the command fails or prints nothing.
-function(detect_ascend_soc_type out_var)
-    execute_process(
-        COMMAND bash -c "npu-smi info|awk -F' ' 'NF > 0 && NR==7 {print $3}'"
-        OUTPUT_VARIABLE npu_info
-        RESULT_VARIABLE npu_result
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-    )
-    # RESULT_VARIABLE may hold an error string instead of an exit code, so
-    # compare it as a string rather than expanding it unquoted.
-    if("${npu_info}" STREQUAL "" OR NOT "${npu_result}" STREQUAL "0")
-        message(FATAL_ERROR "Auto-detect ascend soc type failed, please specify manually or check ascend device working normally.")
-    endif()
-    set(${out_var} "Ascend${npu_info}" PARENT_SCOPE)
-endfunction()
-
-if(NOT SOC_TYPE)
-    detect_ascend_soc_type(SOC_VERSION)
-    set(SOC_TYPE "${SOC_VERSION}")
-    message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}")
-endif()
-
-string(TOLOWER "${SOC_TYPE}" SOC_VERSION) # SOC_VERSION need lower
-
-# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND_310P.
-string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
-set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
-string(TOUPPER "${SOC_TYPE_COMPILE_OPTION}" SOC_TYPE_COMPILE_OPTION)
-
-if (CANN_INSTALL_DIR)
-    # Only Support Linux.
-    if (NOT UNIX)
-        message(FATAL_ERROR "CANN: CANN toolkit supports unix but not ${CMAKE_SYSTEM_NAME}")
-    endif()
-
-    # Supported platforms: x86-64, arm64
-    if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
-    elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64")
-    else()
-        message(FATAL_ERROR "CANN: CANN toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}")
-    endif()
-
-    # Set header and libs
-    set(CANN_INCLUDE_DIRS
-        ${CANN_INSTALL_DIR}/include
-        ${CANN_INSTALL_DIR}/include/aclnn
-        ${CANN_INSTALL_DIR}/acllib/include
-    )
-
-    list(APPEND CANN_LIBRARIES
-        ascendcl
-        nnopbase
-        opapi
-        acl_op_compiler
-    )
-
-    file(GLOB GGML_SOURCES_CANN "*.cpp")
-
-    ggml_add_backend_library(ggml-cann ${GGML_SOURCES_CANN})
-    target_link_libraries(ggml-cann PRIVATE ${CANN_LIBRARIES})
-    target_include_directories(ggml-cann PRIVATE ${CANN_INCLUDE_DIRS})
-    target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64)
-
-    # target_compile_definitions adds the -D itself
-    target_compile_definitions(ggml-cann PRIVATE "${SOC_TYPE_COMPILE_OPTION}")
-
-    message(STATUS "CANN: CANN_INCLUDE_DIRS =  ${CANN_INCLUDE_DIRS}")
-    message(STATUS "CANN: CANN_LIBRARIES =  ${CANN_LIBRARIES}")
-else()
-    message(FATAL_ERROR "CANN: Can't find CANN_INSTALL_DIR, did you forget to source set_var.sh?")
-endif()
--- a/ggml/src/ggml-cann/Doxyfile
+++ b/ggml/src/ggml-cann/Doxyfile
--- a/ggml/src/ggml-cann/acl_tensor.cpp
+++ b/ggml/src/ggml-cann/acl_tensor.cpp
@ -1,175 +0,0 @@
-/*
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "acl_tensor.h"
-
-#include <algorithm>
-#include <cstring>
-
-// Translates a ggml_type into the matching aclDataType.
-// Types without a mapping yield ACL_DT_UNDEFINED.
-aclDataType ggml_cann_type_mapping(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_F32:  return ACL_FLOAT;
-        case GGML_TYPE_F16:  return ACL_FLOAT16;
-        case GGML_TYPE_I16:  return ACL_INT16;
-        case GGML_TYPE_I32:  return ACL_INT32;
-        case GGML_TYPE_Q4_0: return ACL_INT4;
-        // both the plain and the quantized 8-bit type map to ACL_INT8
-        case GGML_TYPE_I8:
-        case GGML_TYPE_Q8_0: return ACL_INT8;
-        default:
-            break;
-    }
-    return ACL_DT_UNDEFINED;
-}
-
-// Creates an ACL tensor describing the data of a ggml tensor.
-//
-// tensor: source of the data pointer, element type and, when ne is null, of the
-//         shape and strides.
-// ne/nb:  optional explicit shape and byte strides (dims entries each); when ne
-//         is null the tensor's own ne/nb are used.
-// dims:   number of dimensions; 0 selects GGML_MAX_DIMS.
-// format: passed through to aclCreateTensor.
-// offset: byte offset, converted to elements before it is passed on.
-// Returns the tensor produced by aclCreateTensor.
-aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
-                                   size_t* nb, int64_t dims, aclFormat format,
-                                   size_t offset) {
-    // If the tensor is broadcast, up to GGML_MAX_DIMS additional dimensions
-    // will be added.
-    int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2];
-
-    int64_t acl_storage_len = 0;
-    if (ne == nullptr) {
-        acl_storage_len = ggml_nbytes(tensor);
-        for (int i = 0; i < GGML_MAX_DIMS; i++) {
-            acl_ne[i] = tensor->ne[i];
-            // The step size of acl is in elements.
-            acl_stride[i] = tensor->nb[i] / ggml_element_size(tensor);
-        }
-    } else {
-        // With bcast
-        for (int i = 0; i < dims; i++) {
-            acl_storage_len += (ne[i] - 1) * nb[i];
-            acl_ne[i] = ne[i];
-            acl_stride[i] = nb[i] / ggml_element_size(tensor);
-        }
-    }
-
-    // Reverse ne and stride.
-    int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
-    std::reverse(acl_ne, acl_ne + final_dims);
-    std::reverse(acl_stride, acl_stride + final_dims);
-
-    aclTensor* acl_tensor = aclCreateTensor(
-        acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride,
-        offset / ggml_element_size(tensor), format, &acl_storage_len, 1,
-        tensor->data);
-
-    return acl_tensor;
-}
-
-// Returns true as soon as some dimension of t1 differs from the same dimension
-// of t0 and is not 1; returns false when no dimension does.
-bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
-    for (int dim = 0; dim < GGML_MAX_DIMS; ++dim) {
-        const bool same_extent = (t1->ne[dim] == t0->ne[dim]);
-        const bool unit_extent = (t1->ne[dim] == 1);
-        if (!same_extent && !unit_extent) {
-            return true;
-        }
-    }
-    return false;
-}
-
-// Builds the shapes and byte strides used to broadcast src1 against src0.
-//
-// Every ggml dimension produces one output dimension; a dimension in which src0
-// is a multiple (nr != 1) of src1 produces a second one holding the repeat
-// count for src0 and 1 for src1. Requires ggml_can_repeat(src1, src0).
-//
-// bcast_src{0,1}_ne / bcast_src{0,1}_nb: output arrays receiving the shapes and
-// strides; the caller must provide room for every written dimension.
-// Returns the number of dimensions written.
-int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
-                                  const ggml_tensor* src1,
-                                  int64_t* bcast_src0_ne,
-                                  int64_t* bcast_src1_ne, size_t* bcast_src0_nb,
-                                  size_t* bcast_src1_nb) {
-    GGML_ASSERT(ggml_can_repeat(src1, src0));
-
-    int n_out = 0;
-    for (int dim = 0; dim < GGML_MAX_DIMS; ++dim) {
-        const int64_t nr = src0->ne[dim] / src1->ne[dim];
-
-        bcast_src0_ne[n_out] = src0->ne[dim] / nr;
-        bcast_src1_ne[n_out] = src1->ne[dim];
-        bcast_src0_nb[n_out] = src0->nb[dim];
-        bcast_src1_nb[n_out] = src1->nb[dim];
-        ++n_out;
-
-        if (nr == 1) {
-            continue;
-        }
-
-        // Need to add an extra dim.
-        const int prev = n_out - 1;
-        bcast_src0_ne[n_out] = nr;
-        bcast_src1_ne[n_out] = 1;
-        bcast_src0_nb[n_out] = bcast_src0_nb[prev] * bcast_src0_ne[prev];
-        bcast_src1_nb[n_out] = bcast_src1_nb[prev] * bcast_src1_ne[prev];
-        ++n_out;
-    }
-    return n_out;
-}
-
-// Builds the broadcast shapes and byte strides of input, weight and dst for a
-// matrix multiplication.
-//
-// The first two dimensions are copied unchanged. For a later dimension in which
-// input is a multiple (nr != 1) of weight, two output dimensions are written:
-// first the repeat count (1 for weight), then the per-repeat extent.
-//
-// input_*/weight_*/dst_*: shapes (ne) and byte strides (nb) with GGML_MAX_DIMS
-// entries each. bcast_*: output arrays; the caller must provide room for every
-// written dimension.
-// Returns the number of dimensions written.
-int64_t ggml_cann_get_mulmat_bcast_shape(
-    const int64_t* input_ne, const int64_t* weight_ne, const int64_t* dst_ne,
-    const size_t* input_nb, const size_t* weight_nb, const size_t* dst_nb,
-    int64_t* bcast_input_ne, int64_t* bcast_weight_ne, int64_t* bcast_dst_ne,
-    size_t* bcast_input_nb, size_t* bcast_weight_nb, size_t* bcast_dst_nb) {
-    // input and dst should have the same shape, except for the first two dims.
-    GGML_ASSERT(input_ne[2] == dst_ne[2]);
-    GGML_ASSERT(input_ne[3] == dst_ne[3]);
-
-    int bcast_dim_cnt = 0;
-
-    // For mul_mat, a dimension needs to be added before the dimension that
-    // weight needs to be expanded to satisfy the bcast rule of matrix
-    // multiplication.
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        int64_t nr = input_ne[i] / weight_ne[i];
-        // Do not use bcast in the first two dimensions because we only support
-        // the bcast batch dimension. Just copy them.
-        if (i < 2 || nr == 1) {
-            bcast_input_ne[bcast_dim_cnt] = input_ne[i];
-            bcast_weight_ne[bcast_dim_cnt] = weight_ne[i];
-            bcast_dst_ne[bcast_dim_cnt] = dst_ne[i];
-
-            bcast_input_nb[bcast_dim_cnt] = input_nb[i];
-            bcast_weight_nb[bcast_dim_cnt] = weight_nb[i];
-            bcast_dst_nb[bcast_dim_cnt] = dst_nb[i];
-            bcast_dim_cnt++;
-        } else {
-            // Need to add an extra dim.
-            bcast_input_ne[bcast_dim_cnt] = nr;
-            bcast_dst_ne[bcast_dim_cnt] = nr;
-            bcast_weight_ne[bcast_dim_cnt] = 1;
-            bcast_input_nb[bcast_dim_cnt] = input_nb[i];
-            bcast_dst_nb[bcast_dim_cnt] = dst_nb[i];
-            bcast_weight_nb[bcast_dim_cnt] = weight_nb[i];
-            bcast_dim_cnt++;
-
-            // second dim: per-repeat extent, strides derived from the dim just written
-            bcast_input_ne[bcast_dim_cnt] = input_ne[i] / nr;
-            bcast_dst_ne[bcast_dim_cnt] = dst_ne[i] / nr;
-            bcast_weight_ne[bcast_dim_cnt] = weight_ne[i];
-            bcast_input_nb[bcast_dim_cnt] = bcast_input_nb[bcast_dim_cnt - 1] *
-                                            bcast_input_ne[bcast_dim_cnt - 1];
-            bcast_dst_nb[bcast_dim_cnt] = bcast_dst_nb[bcast_dim_cnt - 1] *
-                                          bcast_dst_ne[bcast_dim_cnt - 1];
-            bcast_weight_nb[bcast_dim_cnt] =
-                bcast_weight_nb[bcast_dim_cnt - 1] *
-                bcast_weight_ne[bcast_dim_cnt - 1];
-            bcast_dim_cnt++;
-        }
-    }
-    return bcast_dim_cnt;
-}
--- a/ggml/src/ggml-cann/acl_tensor.h
+++ b/ggml/src/ggml-cann/acl_tensor.h
@ -1,258 +0,0 @@
-/*
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#ifndef CANN_ACL_TENSOR_H
-#define CANN_ACL_TENSOR_H
-
-#include <algorithm>
-#include <cstring>
-
-#include <aclnn/aclnn_base.h>
-#include "common.h"
-
-/**
- * @brief	Maps a ggml_type to its corresponding aclDataType.
- *
- * @details	This function takes a ggml_type as input and returns the corresponding
- *			aclDataType. It supports mapping for various ggml_types. If the input type
- *			does not match any of the predefined ggml_types, the function returns
- *          ACL_DT_UNDEFINED.
- *
- * @param	type    The ggml_type to be mapped.
- * @return	The corresponding aclDataType. If the input type is not recognized,
- *			ACL_DT_UNDEFINED is returned.
- */
-aclDataType ggml_cann_type_mapping(ggml_type type);
-
-/**
- * @brief   Creates an ACL tensor from a ggml_tensor with optional shape.
- *
- * @details This function creates an ACL tensor based on the properties of the
- *          provided ggml_tensor. It supports a custom shape by adjusting dimensions
- *          and strides accordingly. If a custom shape is applied, additional
- *          dimensions and strides are calculated based on the provided parameters.
- *
- * @param   tensor      Pointer to the ggml_tensor to be converted to ACL tensor.
- * @param   ne          Pointer to an array containing dimensions. Defaults to nullptr
- *                      if no custom shape is applied.
- * @param   nb          Pointer to an array containing strides. Defaults to nullptr
- *                      if no custom shape is applied.
- * @param   dims        Number of dimensions in the tensor. Defaults to 0 if no custom
- *                      shape is applied.
- * @param   format      ACL tensor format. Defaults to ACL_FORMAT_ND.
- * @param   offset      Offset in bytes for the ACL tensor data. Defaults to 0.
- * @return  Pointer to the created ACL tensor.
- */
-aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = nullptr,
-                             size_t* nb = nullptr, int64_t dims = 0,
-                             aclFormat format = ACL_FORMAT_ND,
-                             size_t offset = 0);
-
-/**
- * @brief   Template for creating an ACL tensor from provided parameters. typename TYPE
- *          should be size_t or float.
- *
- * @details This function creates an ACL tensor using the provided data pointer,
- *          data type, dimensions, strides, format, offset, and additional parameters.
- *          The first `dims` entries of `ne` are copied and each stride in `nb` is
- *          divided by `type_size`; both temporary arrays are then reversed before
- *          being passed to aclCreateTensor. The ACL storage length is also
- *          calculated based on the provided dimensions and strides.
- *
- * @param   data_ptr    Pointer to the data buffer for the ACL tensor.
- * @param   dtype       ACL data type of the tensor.
- * @param   type_size   Size of each element in the tensor data buffer.
- * @param   ne          Pointer to an array containing tensor dimensions.
- * @param   nb          Pointer to an array containing tensor strides.
- * @param   dims        Number of dimensions of the tensor. Must be in the range
- *                      [0, GGML_MAX_DIMS * 2]; this is asserted.
- * @param   format      ACL tensor format. Defaults to ACL_FORMAT_ND.
- * @param   offset      Offset in bytes for the ACL tensor data. Defaults to 0.
- * @return  Pointer to the created ACL tensor.
- */
-template<typename TYPE>
-aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
-                                   TYPE type_size, int64_t* ne, TYPE* nb,
-                                   int64_t dims,
-                                   aclFormat format = ACL_FORMAT_ND,
-                                   size_t offset = 0) {
-    // tmp_ne / tmp_stride hold at most GGML_MAX_DIMS * 2 entries; refuse any
-    // dims outside that range instead of writing past the stack arrays.
-    GGML_ASSERT(dims >= 0 && dims <= GGML_MAX_DIMS * 2);
-
-    int64_t tmp_ne[GGML_MAX_DIMS * 2];
-    int64_t tmp_stride[GGML_MAX_DIMS * 2];
-
-    memcpy(tmp_ne, ne, dims * sizeof(int64_t));
-    for (int i = 0; i < dims; i++) {
-        // Strides are divided by the element size before being handed to ACL.
-        tmp_stride[i] = nb[i] / type_size;
-    }
-
-    // Reverse the dimension order of both arrays before the ACL call.
-    std::reverse(tmp_ne, tmp_ne + dims);
-    std::reverse(tmp_stride, tmp_stride + dims);
-
-    // One-dimensional storage length accumulated from the caller's ne/nb.
-    // NOTE(review): this sum uses nb[i] as passed in, whereas the strides
-    // given to aclCreateTensor were divided by type_size - confirm which unit
-    // aclCreateTensor expects for the storage length.
-    int64_t acl_storage_len = 0;
-    for (int i = 0; i < dims; i++) {
-        acl_storage_len += (ne[i] - 1) * nb[i];
-    }
-
-    // The byte offset is converted with the same type_size divisor as the
-    // strides.
-    aclTensor* acl_tensor =
-        aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
-                        format, &acl_storage_len, 1, data_ptr);
-
-    return acl_tensor;
-}
-
-/**
- * @brief   Checks if tensors require broadcasting based on their shapes.
- *
- * @details This function determines if two ggml_tensors need to be broadcasted for
- *          element-wise operations. Broadcasting is necessary if the shapes of the
- *          tensors are not identical and no dimension in either tensor equals 1.
- *
- * @param   t0      Pointer to the first ggml_tensor.
- * @param   t1      Pointer to the second ggml_tensor.
- * @return  True if broadcasting is needed, False otherwise.
- *
- * @remarks This function iterates over the dimensions of t0 and t1. It checks if each
- *          dimension in t1 differs from t0's corresponding dimension and is not equal
- *          to 1. If such a dimension is found, broadcasting is required to align t1
- *          with t0 for element-wise operations.
- */
-bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1);
-
-/**
- * @brief   Computes broadcast shapes and strides for two ggml_tensors.
- *
- * @details This function calculates the broadcast shapes and strides for two ggml_tensors,
- *          following the broadcasting rules similar to numpy. It adjusts dimensions and
- *          strides to ensure compatibility for element-wise operations where one tensor
- *          can be broadcasted to match the shape of another tensor.
- *
- * @param   src0                Pointer to the first ggml_tensor.
- * @param   src1                Pointer to the second ggml_tensor.
- * @param   bcast_ne_src0       Output array to store broadcasted dimensions for src0.
- * @param   bcast_ne_src1       Output array to store broadcasted dimensions for src1.
- * @param   bcast_nb_src0       Output array to store broadcasted strides for src0.
- * @param   bcast_nb_src1       Output array to store broadcasted strides for src1.
- * @return  Number of dimensions in the broadcasted shape.
- *
- * @pre     ggml_can_repeat(src1, src0) must return true, indicating src1 can be broadcasted
- *          to match src0.
- *
- * @remarks This function iterates over the dimensions of src0 and src1, calculating the
- *          necessary broadcast dimensions and strides. If a dimension requires broadcasting
- *          (i.e., its size in src1 is smaller than in src0), an additional dimension is
- *          added with size calculated to match src0's dimension. This adjustment ensures
- *          that src1 can be element-wise broadcasted to src0's shape.
- *
- *  How it works:
- *
- *  \code
- *  if dim0 has padding.
- *  a -> (2, 2) padding = 2
- *   a: [[1, 2, *, *]
- *       [2, 3, *, *]]
- *  nb = (8, 4, 2)
- *
- *  if a should bcast with b -> (2, 4)
- *  b' -> (2, 2, 2)
- *  b : [[1, 2, 3, 4, *, *]
- *       [5, 6, 7, 8, *, *]]
- *  nb = (12, 6, 1)
- *
- *  after bcast:
- *  a' -> (2, 1, 2)
- *  a': [[[1, 2], *, *]
- *       [[2, 3], *, *]]
- *  nb = (8, 4, 2, 1)
- *
- *  b' : [[[1, 2], [3, 4], *, *]
- *        [[5, 6], [7, 8], *, *]]
- *  nb = (12, 6, 2, 1)
- *  \endcode
- *
- *  dim1 in a' is an inserted dim, so an nb entry must be added for dim1,
- *  and all other nb entries move to the next position in order.
- */
-int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1,
-                        int64_t* bcast_ne_src0, int64_t* bcast_ne_src1,
-                        size_t* bcast_nb_src0, size_t* bcast_nb_src1);
-
-// Bcast macro to avoid duplicate code.
-// Declares, in the current scope, the arrays bcast_<src0>_ne, bcast_<src1>_ne,
-// bcast_<src0>_nb and bcast_<src1>_nb (GGML_MAX_DIMS * 2 entries each) and the
-// variable `bcast_dims`, which is initialized with the value returned by
-// ggml_cann_get_bcast_shape(src0, src1, ...). The src0 / src1 arguments are
-// token-pasted into the array names, so they must be plain identifiers.
-#define BCAST_SHAPE(src0, src1)                                              \
-    int64_t bcast_##src0##_ne[GGML_MAX_DIMS * 2];                            \
-    int64_t bcast_##src1##_ne[GGML_MAX_DIMS * 2];                            \
-    size_t bcast_##src0##_nb[GGML_MAX_DIMS * 2];                             \
-    size_t bcast_##src1##_nb[GGML_MAX_DIMS * 2];                             \
-    int64_t bcast_dims = ggml_cann_get_bcast_shape(                          \
-        src0, src1, bcast_##src0##_ne, bcast_##src1##_ne, bcast_##src0##_nb, \
-        bcast_##src1##_nb);
-
-// Expands to the three comma-separated arguments bcast_<tensor>_ne,
-// bcast_<tensor>_nb and bcast_dims, i.e. the names that BCAST_SHAPE declares
-// for that tensor.
-#define BCAST_PARAM(tensor) bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims
-
-/**
- * @brief Calculates broadcast shapes for matrix multiplication.
- *
- * @details This function computes the broadcast shapes required for matrix multiplication
- *          based on the input, weight, and destination tensor shapes. It ensures that the
- *          dimensions of weight tensors are expanded appropriately to satisfy matrix
- *          multiplication broadcast rules.
- *
- * @param input_ne      Array containing the dimensions of the input tensor.
- * @param weight_ne     Array containing the dimensions of the weight tensor.
- * @param dst_ne        Array containing the dimensions of the destination tensor.
- * @param input_nb      Array containing the strides of the input tensor.
- * @param weight_nb     Array containing the strides of the weight tensor.
- * @param dst_nb        Array containing the strides of the destination tensor.
- * @param bcast_input_ne    Output array for broadcasted input tensor dimensions.
- * @param bcast_weight_ne   Output array for broadcasted weight tensor dimensions.
- * @param bcast_dst_ne      Output array for broadcasted destination tensor dimensions.
- * @param bcast_input_nb    Output array for broadcasted input tensor strides.
- * @param bcast_weight_nb   Output array for broadcasted weight tensor strides.
- * @param bcast_dst_nb      Output array for broadcasted destination tensor strides.
- * @return The number of dimensions in the broadcasted tensors.
- *
- * @remarks This function iterates over the tensor dimensions and calculates the broadcast
- *          shapes needed for matrix multiplication. It ensures that dimensions where
- *          weight tensor requires expansion are appropriately handled to conform with
- *          broadcasting rules.
- * @note Compared with ggml_cann_get_bcast_shape, the mul_mat broadcast needs to
- *       add the new dim before the broadcast dim.
- * @sa ggml_cann_get_bcast_shape
- */
-int64_t ggml_cann_get_mulmat_bcast_shape(
-    const int64_t* input_ne, const int64_t* weight_ne, const int64_t* dst_ne,
-    const size_t* input_nb, const size_t* weight_nb, const size_t* dst_nb,
-    int64_t* bcast_input_ne, int64_t* bcast_weight_ne, int64_t* bcast_dst_ne,
-    size_t* bcast_input_nb, size_t* bcast_weight_nb, size_t* bcast_dst_nb);
-
-// Bcast macro to avoid duplicate code.
-// Declares, in the current scope, the arrays bcast_<input>_ne,
-// bcast_<weight>_ne, bcast_<dst>_ne and the matching *_nb arrays
-// (GGML_MAX_DIMS * 2 entries each) and the variable `bcast_dims`, which is
-// initialized with the value returned by ggml_cann_get_mulmat_bcast_shape.
-// The arguments are dereferenced with `->ne` / `->nb` and token-pasted into
-// the array names, so they must be plain identifiers naming tensor pointers.
-#define BCAST_MUL_MAT_SHAPE(input, weight, dst)                         \
-    int64_t bcast_##input##_ne[GGML_MAX_DIMS * 2];                      \
-    int64_t bcast_##weight##_ne[GGML_MAX_DIMS * 2];                     \
-    int64_t bcast_##dst##_ne[GGML_MAX_DIMS * 2];                        \
-    size_t bcast_##input##_nb[GGML_MAX_DIMS * 2];                       \
-    size_t bcast_##weight##_nb[GGML_MAX_DIMS * 2];                      \
-    size_t bcast_##dst##_nb[GGML_MAX_DIMS * 2];                         \
-    int64_t bcast_dims = ggml_cann_get_mulmat_bcast_shape(              \
-        input->ne, weight->ne, dst->ne, input->nb, weight->nb, dst->nb, \
-        bcast_##input##_ne, bcast_##weight##_ne, bcast_##dst##_ne,      \
-        bcast_##input##_nb, bcast_##weight##_nb, bcast_##dst##_nb);
-
-// Expands to the three comma-separated arguments bcast_<tensor>_ne,
-// bcast_<tensor>_nb and bcast_dims, i.e. the names that BCAST_MUL_MAT_SHAPE
-// declares for that tensor.
-#define BCAST_MUL_MAT_PARAM(tensor) \
-    bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims
-
-#endif  // CANN_ACL_TENSOR_H
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
--- a/ggml/src/ggml-cann/aclnn_ops.h
+++ b/ggml/src/ggml-cann/aclnn_ops.h
@ -1,592 +0,0 @@
-#ifndef CANN_ACLNN_OPS
-#define CANN_ACLNN_OPS
-
-/**
- * @file    aclnn_ops.h
- * @brief   This file declares the ggml operators implemented for the CANN
- *          backend, such as repeat, add, concat, normalization, pooling and
- *          softmax.
- * @author  hipudding <huafengchun@gmail.com>
- * @author  wangshuai09 <391746016@qq.com>
- * @date    July 15, 2024
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <aclnnop/aclnn_add.h>
-#include <aclnnop/aclnn_arange.h>
-#include <aclnnop/aclnn_argsort.h>
-#include <aclnnop/aclnn_cat.h>
-#include <aclnnop/aclnn_clamp.h>
-#include <aclnnop/aclnn_div.h>
-#include <aclnnop/aclnn_gelu.h>
-#include <aclnnop/aclnn_hardsigmoid.h>
-#include <aclnnop/aclnn_hardswish.h>
-#include <aclnnop/aclnn_leaky_relu.h>
-#include <aclnnop/aclnn_mul.h>
-#include <aclnnop/aclnn_relu.h>
-#include <aclnnop/aclnn_silu.h>
-#include <aclnnop/aclnn_tanh.h>
-#include "acl_tensor.h"
-#include "common.h"
-
-/**
- * @brief   Repeats a ggml tensor along each dimension to match the dimensions
- *          of another tensor.
- *
- * @details This function repeats the elements of a source ggml tensor along
- *          each dimension to create a destination tensor with the specified
- *          dimensions. The operation is performed using the ACL backend and
- *          executed asynchronously on the device.
- *
- * @param   ctx The CANN context used for operations.
- * @param   dst The ggml tensor representing the destination, which op is
- *              GGML_OP_REPEAT and specifies the desired dimensions.
- */
-void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-/**
- * @brief   Adds two ggml tensors using the CANN backend.
- *
- * @details This function performs an element-wise addition of two tensors. In
- *          case the tensors do not have the same shape, one or both tensors
- *          will be broadcasted to match the shape of the other before the
- *          addition is performed. The formula for the operation is given by:
- *          \f[
- *              \text{dst} = \text{acl_src0} + \alpha \cdot \text{acl_src1}
- *          \f]
- *
- * @param ctx The CANN context used for operations.
- * @param dst The ggml tensor representing the destination, result of the
- *            addition is stored at dst->data, and dst->op is `GGML_OP_ADD`
- */
-void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-/**
- * @brief   Applies the Leaky ReLU activation function to a tensor using the CANN
- *          backend.
- *
- * @details This function computes the Leaky ReLU activation for each element of
- *          the input tensor. The Leaky ReLU function allows a small gradient
- *          when the unit is not active (i.e., when the input is negative). The
- *          Leaky ReLU function is defined as:
- *          \f[
- *              \text{dst} = \max(0, src) + \text{negativeSlope} \cdot \min(0,
- *               src)
- *          \f]
- *          `negativeSlope` is in dst->op_params.
- *
- * @param ctx The CANN context used for operations.
- * @param dst The destination tensor where the result of the Leaky ReLU
- *            activation is stored, which op is `GGML_OP_LEAKY_RELU`
- */
-void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-/**
- * @brief    Concatenates multiple tensors along a specified dimension using the
- *           CANN backend.
- *
- * @param ctx        The CANN context used for operations.
- * @param dst        The destination tensor where the result of the
- *                   concatenation is stored. dst->op is `GGML_OP_CONCAT`.
- *
- * @attention The list of tensors to be concatenated should have length 2 and
- *            the dimension along which they are concatenated defaults to 1.
- *            Neither is a parameter of this function.
- */
-void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-/**
- * @brief   Generates a sequence of evenly spaced values within a specified
- *          interval for a ggml tensor using the CANN backend.
- *
- * @details This function creates a sequence of numbers over a specified
- *          interval, starting from `start`, ending before `stop`, and
- *          incrementing by `step`. The sequence is stored in the destination
- *          tensor `dst`.
- *
- * @param ctx The CANN context used for operations.
- * @param dst The destination tensor where the generated sequence will be stored.
- *            `start`, `stop` and `step` are in dst->op_params and dst->op is
- *            `GGML_OP_ARANGE`.
- */
-void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-/**
- * @brief   Computes the square of the elements of a ggml tensor using the CANN
- *          backend.
- * @details The function sets the second source tensor of the destination
- *          tensor `dst` to be equal to the first source tensor. This is
- *          effectively squaring the elements since the multiplication becomes
- *          `element * element`.
- * @param ctx The CANN context used for operations.
- * @param dst The destination tensor where the squared values will be stored;
- *            dst->op is `GGML_OP_SQR`.
- */
-void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-/**
- * @brief   Applies a clamp operation to the elements of a ggml tensor using the
- *          CANN backend.
- *
- * @details This function clamps the elements of the input tensor `src` to a
- *          specified range defined by `min` and `max` values. The result is
- *          stored in the destination tensor `dst`. The operation is defined as:
- *          \f[
- *              y = \max(\min(x, max\_value), min\_value)
- *           \f]
- *          where `x` is an element of the input tensor, and `y` is the
- *          corresponding element in the output tensor.
- * @param ctx The CANN context used for operations.
- * @param dst The destination tensor where the clamped values will be stored.
- *            dst->op is `GGML_OP_CLAMP`; the `min` and `max` values are in
- *            dst->op_params.
- */
-void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-/**
- * @brief   Scales the elements of a ggml tensor by a constant factor using the
- *          CANN backend.
- *
- * @details This function multiplies each element of the input tensor `src` by
- *          a scaling factor `scale`, storing the result in the destination
- *          tensor `dst`. The operation is defined as:
- *          \f[
- *             dst = src \times scale
- *          \f]
- *
- * @param ctx The CANN context used for operations.
- * @param dst The destination tensor where the scaled values will be stored.
- *            dst->op is `GGML_OP_SCALE` and the `scale` value is in
- *            dst->op_params.
- */
-void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-/**
- * @brief   Sorts the elements of a ggml tensor and returns the indices that
- *          would sort the tensor using the CANN backend.
- *
- * @details This function performs an argsort operation on the input tensor
- *          `src`. It sorts the elements of `src` in either ascending or
- *          descending order, depending on the `GGML_SORT_ORDER_DESC`,
- *          and returns the indices that would sort the original tensor.
- *
- * @param ctx The CANN context used for operations.
- * @param dst The destination tensor where the sorted indices will be stored.
- *            dst->op is `GGML_OP_ARGSORT`.
- */
-void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-/**
- * @brief   Computes the Layer Normalization for a ggml tensor using the CANN
- *          backend.
- *
- * @details This function applies the Layer Normalization operation on the
- *          input tensor `src` and stores the result in the destination tensor
- *          `dst`. Layer Normalization normalizes the features at each sample in
- *          a mini-batch independently. It is commonly used in neural networks
- *          to normalize the activations of a layer by adjusting and scaling
- *          the outputs.
- *          The operation is defined as:
- *          \f[
- *              \text { out }=\frac{x-\mathrm{E}[x]}{\sqrt{\text{Var}[x]+eps}}
- *          \f]
- *          `Var` defaults to dst->ne[0]. `eps` is in dst->op_params.
- *
- * @param ctx The CANN context used for operations.
- * @param dst The destination tensor where the normalized values will be stored.
- * @attention `Var` defaults to dst->ne[0].
- */
-void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-/**
- * @brief  Computes the Group Normalization for a ggml tensor using the CANN
- *         backend.
- *
- * @details This function applies the Group Normalization operation on the
- *          input tensor `src` and stores the result in the destination tensor
- *          `dst`. Group Normalization divides the channels into groups and
- *          normalizes the features within each group across spatial locations.
- *          It is commonly used in convolutional neural networks to improve
- *          training stability and performance.
- *          The operation is defined as:
- *          \f[
- *              \text { out }=\frac{x-\mathrm{E}[x]}{\sqrt{\text{Var}[x]+eps}}
- *          \f]
- *
- * @param ctx The CANN context used for operations.
- * @param dst The destination tensor where the normalized values will be stored.
- *            `n_groups` is in dst->op_params and splits the C channels into
- *            `n_groups` groups. dst->op is `GGML_OP_GROUP_NORM`.
- *
- * @attention eps defaults to 1e-6f.
- */
-void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-/**
- * @brief   Computes the accumulation of tensors using the CANN backend.
- *
- * @details This function performs an accumulation operation on two tensors.
- *          Depending on the `inplace` flag, it either updates the destination
- *          tensor `dst` in place by adding `alpha * src1` to it, or it creates
- *          a new tensor as the result of `src0 + alpha * src1` and stores it in
- *          `dst`.
- *          The operation is defined as:
- *          \f[
- *               dst = src0 + alpha \times src1
- *          \f]
- *          if `inplace` is `true`, `src0` is equal to 'dst'.
- * @param ctx The CANN context used for operations.
- * @param dst The destination tensor where the accumulated values will be stored.
- *            `inplace` is in dst->op_params, and dst->op is `GGML_OP_ACC`.
- */
-void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-/**
- * @brief   Computes the sum of elements along the last dimension of a ggml tensor
- *          using the CANN backend.
- *
- * @details This function performs a reduction sum operation along the last
- *          dimension of the input tensor `src`. The result of the sum is stored
- *          in the destination tensor `dst`.
- *
- * @param ctx The CANN context used for operations.
- * @param dst The destination tensor where the reduced values will be stored.
- *            dst->op is `GGML_OP_SUM_ROWS`.
- *
- * @attention `reduce_dims` defaults to 3, which means the last dimension.
- */
-void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-/**
- * @brief   Upsamples a ggml tensor using nearest neighbor interpolation using
- *          the CANN backend.
- *
- * @details This function performs upsampling of the input tensor `src` using
- *          nearest neighbor interpolation. The upsampling is applied to the
- *          height and width dimensions (last two dimensions) of the tensor. The
- *          result is stored in the destination tensor `dst`, which must have
- *          the appropriate dimensions for the upsampled output.
- *
- * @param ctx The CANN context used for operations.
- * @param dst The destination tensor where the upsampled values will be stored.
- *            dst->op is `GGML_OP_UPSCALE`.
- */
-void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
-                                  ggml_tensor* dst);
-
-/**
- * @brief   Pads a ggml tensor to match the dimensions of the destination tensor
- *          using the CANN backend.
- *
- * @details This function pads the input tensor `src` so that it matches the
- *          dimensions of the destination tensor `dst`. The amount of padding
- *          is calculated based on the difference in sizes between `src` and
- *          `dst` along each dimension. The padded tensor is stored in `dst`.
- *
- * @param ctx The CANN context used for operations.
- * @param dst The destination tensor, which specifies the target dimensions for
- *            padding. dst->op is `GGML_OP_PAD`.
- */
-void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-/**
- * @brief   Executes a 2D pooling operation on a ggml tensor using the CANN
- *          backend.
- *
- * @details This function dispatches the execution of a 2D pooling operation on
- *          the input tensor `dst`. The type of pooling (average or max) is
- *          determined by the `op` parameter, which is read from the operation
- *          parameters of `dst`. The function supports average pooling
- *          (`GGML_OP_POOL_AVG`) and max pooling (`GGML_OP_POOL_MAX`). If an
- *          invalid operation is encountered, the function asserts a failure.
- *
- * @param ctx The CANN context used for operations.
- * @param dst The destination tensor on which the pooling operation is to be
- *            performed. dst->op is `GGML_OP_POOL_2D`.
- */
-void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-/**
- * @brief   Duplicates a ggml tensor using the CANN backend.
- *
- * @details This function duplicates the contents of the source tensor `src` to
- *          the destination tensor `dst`. The function supports various tensor
- *          types and configurations, including handling of extra data, type
- *          conversions, and special cases for contiguous and non-contiguous
- *          tensors.
- *
- * @param ctx The CANN context used for operations.
- * @param dst The destination tensor where the duplicated data will be stored.
- *            dst->op is `GGML_OP_DUP`
- *
- * @attention Only supports FP16/FP32. Not supported when src and dst have
- *            different shapes and dst is non-contiguous.
- * @note      This function needs to be simplified.
- */
-void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-/**
- * @brief   Computes the Root Mean Square (RMS) normalization of a ggml tensor
- *          using the CANN backend.
- *
- * @details This function applies RMS normalization to the input tensor `src`
- *          and stores the result in the destination tensor `dst`. RMS
- *          normalization involves computing the root mean square of the input
- *          tensor along a specified dimension and then dividing each element of
- *          the tensor by this value, adjusted by a small epsilon value to
- *          prevent division by zero.
- *          The operation is defined as:
- *          \f[
- *               \text{RmsNorm}\left(x_i\right)=\frac{x_i}{\text{Rms}(\mathbf{x})} g_i,
- *               \quad \text { where } \text{Rms}(\mathbf{x})=\sqrt{\frac{1}{n} \sum_{i=1}^n x_i^2+eps}
- *          \f]
- *          `eps` is in dst->op_params.
- * @param ctx The CANN context used for operations.
- * @param dst The destination tensor where the normalized values will be stored.
- *            dst->op is `GGML_OP_RMS_NORM`.
- */
-void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-/**
- * @brief   Applies a diagonal mask to the tensor with a specified value.
- *
- * @details This function creates a mask tensor filled with ones, then applies
- *          an upper triangular and lower triangular operation to it based on
- *          the number of past elements specified. Afterward, it adds the masked
- *          tensor to the destination tensor in-place.
- *
- * @param ctx The backend CANN context used for operations.
- * @param dst The destination tensor where the result will be stored. dst->op is
- *            `GGML_OP_DIAG_MASK`
- * @param value The value to use for masking.
- */
-void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, float value);
-
-/**
- * @brief   Performs an image-to-column transformation on the input tensor.
- *
- * @details This function takes an input tensor and applies an image-to-column
- *          operation, converting spatial dimensions into column-like
- *          structures suitable for convolutional operations. It supports both
- *          half-precision (F16) and single-precision (F32) floating-point data
- *          types.
- *
- * @param ctx The backend CANN context for executing operations.
- * @param dst The destination tensor that stores the result of the operation.
- *            dst->op is `GGML_OP_IM2COL`.
- */
-void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-/**
- * @brief   Computes time step embeddings using sine and cosine functions.
- *
- * @details This function calculates time step embeddings by applying sine and
- *          cosine transformations to a given input tensor, which is typically
- *          used in temporal models like diffusion models or transformers to
- *          encode time information effectively.
- *
- * @param ctx The backend CANN context for executing operations.
- * @param dst The destination tensor where the result of the embedding operation
- *            will be stored. dst->op is `GGML_OP_TIMESTEP_EMBEDDING`.
- */
-void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-// @see ggml_cann_dup.
-void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-/**
- * @brief   Computes the softmax activation with optional masking.
- *
- * @details This function computes the softmax activation over the input tensor,
- *          optionally applying a mask and scaling factor. It supports both FP16
- *          and FP32 data types and can handle masking by broadcasting the mask
- *          across rows if necessary.
- *          The function performs the following steps:
- *          1. Multiplies the input tensor by a scale factor.
- *          2. Optionally casts the mask tensor to FP32 if it is in FP16 format.
- *          3. Broadcasts the mask tensor if its dimensions do not match the
- *             input tensor's dimensions.
- *          4. Adds the mask to the scaled input tensor.
- *          5. Applies the softmax activation function along the specified
- *             dimension.
- *
- * @param ctx The backend CANN context for executing operations.
- * @param dst The destination tensor where the result will be stored. dst->op is
- *            `GGML_OP_SOFT_MAX`.
- */
-void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-/**
- * @brief   Extracts specific rows from a tensor based on indices.
- *
- * @details This function retrieves rows from a source tensor src0 according to
- *          the indices provided in another tensor src1 and stores the result in
- *          a destination tensor (\p dst). It supports different data types
- *          including F32, F16, Q4_0, and Q8_0.
- *
- * @param ctx The backend CANN context for executing operations.
- * @param dst The destination tensor where the extracted rows will be stored.
- *            dst->op is `GGML_OP_GET_ROWS`.
- */
-void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-/**
- * @brief   Executes matrix multiplication for the given tensor.
- *
- * @details This function performs matrix multiplication on the source tensors
- *          associated with the destination tensor. It supports matrix
- *          multiplication for F32, F16, and Q8_0 data.
- *
- * @param ctx The backend CANN context for executing operations.
- * @param dst The destination tensor for storing the result of the matrix
- *            multiplication. dst->op is `GGML_OP_MUL_MAT`.
- */
-void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-/**
- * @brief Applies Rotary Positional Embedding (RoPE) to the input tensor.
- *
- * @details This function implements the RoPE mechanism, which is a method to
- *          encode positional information into sequence data, particularly
- *          useful in transformer models. It supports both F32 and F16 data
- *          types.
- *
- * @param ctx The backend CANN context for executing operations.
- * @param dst The destination tensor where the RoPE-transformed data will be
- *            stored. dst->op is `GGML_OP_ROPE`.
- *
- * @note The function currently does not support cases where the n_dims is less
- *       than the input tensor's first dimension.
- * @note The function currently does not support cases where the freq_factors is
- *       not NULL.
- * @note The function currently does not support cases where the ext_factor is
- *       not equal to 0.
- * @note The function currently does not support cases where the freq_scale is
- *       not equal to 1.
- */
-void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-/**
- * @brief   Runs a two-input aclnn operator selected by the template arguments.
- *
- * @details The operands are dst->src[0] and dst->src[1]. When their shapes
- *          differ and ggml_cann_need_bcast() reports that a broadcast is
- *          needed, all three ACL tensors are created with broadcast shapes;
- *          otherwise they are created directly from the ggml tensors. The
- *          operator is then planned with `getWorkspaceSize` and enqueued on
- *          the context's default stream with `execute`.
- *
- * @tparam getWorkspaceSize Planning function: fills in the workspace size and
- *                          the executor for (src0, src1, dst).
- * @tparam execute          Launch function: runs the executor on a stream.
- * @param ctx The backend CANN context for executing operations.
- * @param dst The destination tensor. src1 must be repeatable to src0 and src0
- *            must have the same shape as dst (asserted).
- */
-template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
-                                       aclTensor*, uint64_t*, aclOpExecutor**),
-          aclnnStatus execute(void*, uint64_t, aclOpExecutor*, aclrtStream)>
-void ggml_cann_mul_div(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src0 = dst->src[0];
-    ggml_tensor* src1 = dst->src[1];
-    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
-
-    aclTensor* acl_src0;
-    aclTensor* acl_src1;
-    aclTensor* acl_dst;
-
-    // Need bcast
-    if (!ggml_are_same_shape(src0, src1) && ggml_cann_need_bcast(src0, src1)) {
-        BCAST_SHAPE(src0, src1)
-        acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0));
-        acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1));
-        acl_dst = ggml_cann_create_tensor(dst, BCAST_PARAM(src0));
-    } else {
-        acl_src0 = ggml_cann_create_tensor(src0);
-        acl_src1 = ggml_cann_create_tensor(src1);
-        acl_dst = ggml_cann_create_tensor(dst);
-    }
-
-    uint64_t workspaceSize = 0;
-    aclOpExecutor* executor = nullptr;
-    void* workspaceAddr = nullptr;
-
-    ACL_CHECK(getWorkspaceSize(acl_src0, acl_src1, acl_dst, &workspaceSize,
-                               &executor));
-
-    // The allocator lives at function scope so the workspace is still owned
-    // when execute() is called; declaring it inside the `if` would hand the
-    // block back to the pool before the operator is even enqueued.
-    ggml_cann_pool_alloc workspace_allocator;
-    if (workspaceSize > 0) {
-        workspaceAddr = workspace_allocator.alloc(ctx.pool(), workspaceSize);
-    }
-
-    aclrtStream main_stream = ctx.stream();
-    ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream));
-
-    ACL_CHECK(aclDestroyTensor(acl_src0));
-    ACL_CHECK(aclDestroyTensor(acl_src1));
-    ACL_CHECK(aclDestroyTensor(acl_dst));
-}
-
-/**
- * @brief   Activation functions template (aclnn operators taking a mutable
- *          output tensor).
- *
- * @details Applies the unary aclnn operator given by the template arguments
- *          to dst->src[0] and writes the result to dst. Both tensors must be
- *          F32 (asserted). The operator is planned with `getWorkspaceSize`
- *          and enqueued on the context's default stream with `execute`.
- *
- * @tparam getWorkspaceSize Planning function: fills in the workspace size and
- *                          the executor for (src, dst).
- * @tparam execute          Launch function: runs the executor on a stream.
- * @param ctx The backend CANN context for executing operations.
- * @param dst The destination tensor; the input is dst->src[0].
- */
-template <aclnnStatus getWorkspaceSize(const aclTensor*, aclTensor*, uint64_t*,
-                                       aclOpExecutor**),
-          aclnnStatus execute(void*, uint64_t, aclOpExecutor*,
-                              const aclrtStream)>
-void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src = dst->src[0];
-
-    GGML_ASSERT(src->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    aclTensor* acl_src = ggml_cann_create_tensor(src);
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
-
-    uint64_t workspaceSize = 0;
-    aclOpExecutor* executor = nullptr;
-    void* workspaceAddr = nullptr;
-
-    ACL_CHECK(getWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
-
-    // Keep the allocator at function scope: scoping it to the `if` would
-    // return the workspace to the pool before execute() is called.
-    ggml_cann_pool_alloc workspace_allocator;
-    if (workspaceSize > 0) {
-        workspaceAddr = workspace_allocator.alloc(ctx.pool(), workspaceSize);
-    }
-
-    aclrtStream main_stream = ctx.stream();
-    ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream));
-
-    ACL_CHECK(aclDestroyTensor(acl_src));
-    ACL_CHECK(aclDestroyTensor(acl_dst));
-}
-
-/**
- * @brief   Activation functions template for aclnn operators whose planning
- *          function takes both tensors as `const aclTensor*`.
- *
- * @details Applies the unary aclnn operator given by the template arguments
- *          to dst->src[0] and writes the result to dst. Both tensors must be
- *          F32 (asserted). The operator is planned with `getWorkspaceSize`
- *          and enqueued on the context's default stream with `execute`.
- *
- * @tparam getWorkspaceSize Planning function: fills in the workspace size and
- *                          the executor for (src, dst).
- * @tparam execute          Launch function: runs the executor on a stream.
- * @param ctx The backend CANN context for executing operations.
- * @param dst The destination tensor; the input is dst->src[0].
- */
-template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
-                                       uint64_t*, aclOpExecutor**),
-          aclnnStatus execute(void*, uint64_t, aclOpExecutor*,
-                              const aclrtStream)>
-void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src = dst->src[0];
-
-    GGML_ASSERT(src->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    aclTensor* acl_src = ggml_cann_create_tensor(src);
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
-
-    uint64_t workspaceSize = 0;
-    aclOpExecutor* executor = nullptr;
-    void* workspaceAddr = nullptr;
-
-    ACL_CHECK(getWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
-
-    // Keep the allocator at function scope: scoping it to the `if` would
-    // return the workspace to the pool before execute() is called.
-    ggml_cann_pool_alloc workspace_allocator;
-    if (workspaceSize > 0) {
-        workspaceAddr = workspace_allocator.alloc(ctx.pool(), workspaceSize);
-    }
-
-    aclrtStream main_stream = ctx.stream();
-    ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream));
-
-    ACL_CHECK(aclDestroyTensor(acl_src));
-    ACL_CHECK(aclDestroyTensor(acl_dst));
-}
-
-#endif  // CANN_ACLNN_OPS
--- a/ggml/src/ggml-cann/common.h
+++ b/ggml/src/ggml-cann/common.h
@ -1,286 +0,0 @@
-/*
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#ifndef CANN_COMMON_H
-#define CANN_COMMON_H
-
-#include <acl/acl.h>
-
-#include <cstdio>
-#include <iostream>
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "../include/ggml-cann.h"
-#include "../include/ggml.h"
-
-#define MATRIX_ROW_PADDING 512
-#define GGML_CANN_MAX_STREAMS 8
-
-/**
- * @brief Handles CANN-related errors by printing an error message and
- *        terminating the program.
- * @param stmt The statement that caused the error.
- * @param func The function in which the error occurred.
- * @param file The file in which the error occurred.
- * @param line The line number at which the error occurred.
- * @param msg The error message.
- */
-[[noreturn]] void ggml_cann_error(const char* stmt, const char* func,
-                                  const char* file, int line, const char* msg);
-
-/**
- * @brief Checks the result of a CANN function call and invokes the error
- *        handler if the call fails.
- * @param stmt The CANN function call to check.
- * @param success The success code that indicates the call was successful.
- * @param error_fn The function to call to retrieve the error message.
- *
- * @note `stmt` is evaluated exactly once and its result is stored in an
- *       `int`. On mismatch ggml_cann_error() is called, which is declared
- *       [[noreturn]].
- * @note NOTE(review): the expansion already ends in `while (0);`, so a call
- *       site written as `ACL_CHECK(x);` yields an extra empty statement. That
- *       breaks an unbraced `if (...) ACL_CHECK(x); else ...`; always brace
- *       such call sites.
- */
-#define ACL_CHECK_GEN(stmt, success, error_fn)                                \
-    do {                                                                      \
-        int err_code = (stmt);                                                \
-        if (err_code != (success)) {                                          \
-            ggml_cann_error(#stmt, __func__, __FILE__, __LINE__, error_fn()); \
-        }                                                                     \
-    } while (0);
-
-/**
- * @brief Shorthand for ACL_CHECK_GEN with success code 0 and
- *        aclGetRecentErrMsg as the error-message source.
- */
-#define ACL_CHECK(stmt) ACL_CHECK_GEN(stmt, 0, aclGetRecentErrMsg)
-
-/**
- * @brief Aggregated description of the CANN devices known to the backend.
- */
-struct ggml_cann_device_info {
-    /** Number of CANN devices available. */
-    int32_t device_count;
-
-    /**
-     * @brief Properties recorded for one CANN device.
-     */
-    struct cann_device_info {
-        /** Compute capability. */
-        int cc;
-        /** Maximum shared memory per block. */
-        size_t smpb;
-        /** Virtual memory support. */
-        bool vmm;
-        /** Granularity of virtual memory. */
-        size_t vmm_granularity;
-        /** Total video RAM available on the device. */
-        size_t total_vram;
-    };
-
-    /** Array of CANN device information, value-initialized. */
-    cann_device_info devices[GGML_CANN_MAX_DEVICES] = {};
-};
-
-/**
- * @brief Returns a reference to the CANN device information table.
- *        NOTE(review): only declared here; how and when the table is
- *        populated is defined elsewhere — confirm before relying on it.
- */
-const ggml_cann_device_info& ggml_cann_info();
-
-/**
- * @brief Presumably makes the given device the current one for subsequent
- *        ACL calls (implementation elsewhere — confirm).
- * @param device Device ID.
- */
-void ggml_cann_set_device(int32_t device);
-
-/**
- * @brief Returns a CANN device ID — presumably the currently selected one;
- *        confirm against the implementation.
- */
-int32_t ggml_cann_get_device();
-
-/**
- * @brief Abstract base class for memory pools used by CANN.
- *
- * Implementations provide alloc()/free(); ggml_cann_pool_alloc in this file
- * is the scoped wrapper that pairs the two calls.
- */
-struct ggml_cann_pool {
-    /**
-     * @brief Virtual destructor for the memory pool.
-     */
-    virtual ~ggml_cann_pool() = default;
-
-    /**
-     * @brief Allocates memory from the pool.
-     *
-     * @param size         The size of the memory block to allocate.
-     * @param actual_size  Pointer to a variable where the actual allocated size
-     *                     will be stored.
-     * @return             Pointer to the allocated memory block.
-     */
-    virtual void* alloc(size_t size, size_t* actual_size) = 0;
-
-    /**
-     * @brief Frees a previously allocated memory block.
-     *
-     * @param ptr   Pointer to the memory block to free.
-     * @param size  Size of the memory block to free.
-     * @note All CANN operators run asynchronously. Make sure the memory stays
-     *       available until the operators that use it have finished.
-     */
-    virtual void free(void* ptr, size_t size) = 0;
-};
-
-/**
- * @brief Scoped owner of a single block obtained from a ggml_cann_pool.
- *
- * At most one block is held at a time; it is handed back to the pool when the
- * object is destroyed. The type can be neither copied nor moved.
- */
-struct ggml_cann_pool_alloc {
-    ggml_cann_pool* pool = nullptr; /**< Pool the block is taken from. */
-    void* ptr = nullptr;            /**< Held block, or nullptr if none. */
-    size_t actual_size = 0;         /**< Size reported by the pool for the block. */
-
-    /** Creates an empty wrapper with no pool attached. */
-    ggml_cann_pool_alloc() = default;
-
-    /**
-     * @brief Attaches the wrapper to a pool without allocating.
-     * @param pool Reference to the memory pool.
-     */
-    explicit ggml_cann_pool_alloc(ggml_cann_pool& pool) : pool(&pool) {}
-
-    /**
-     * @brief Attaches the wrapper to a pool and allocates immediately.
-     * @param pool Reference to the memory pool.
-     * @param size Size of the memory block to allocate.
-     */
-    ggml_cann_pool_alloc(ggml_cann_pool& pool, size_t size) : pool(&pool) {
-        alloc(size);
-    }
-
-    // Ownership is unique: copying and moving are both disabled.
-    ggml_cann_pool_alloc(const ggml_cann_pool_alloc&) = delete;
-    ggml_cann_pool_alloc(ggml_cann_pool_alloc&&) = delete;
-    ggml_cann_pool_alloc& operator=(const ggml_cann_pool_alloc&) = delete;
-    ggml_cann_pool_alloc& operator=(ggml_cann_pool_alloc&&) = delete;
-
-    /**
-     * @brief Returns the held block, if any, to its pool.
-     */
-    ~ggml_cann_pool_alloc() {
-        if (ptr == nullptr) {
-            return;
-        }
-        pool->free(ptr, actual_size);
-    }
-
-    /**
-     * @brief Allocates a block from the attached pool.
-     *
-     * Asserts that a pool is attached and that no block is currently held.
-     *
-     * @param size Size of the memory block to allocate.
-     * @return Pointer to the allocated memory block.
-     */
-    void* alloc(size_t size) {
-        GGML_ASSERT(pool != nullptr);
-        GGML_ASSERT(ptr == nullptr);
-        void* block = pool->alloc(size, &this->actual_size);
-        ptr = block;
-        return block;
-    }
-
-    /**
-     * @brief Attaches the wrapper to `pool`, then allocates from it.
-     * @param pool Reference to the memory pool.
-     * @param size Size of the memory block to allocate.
-     * @return Pointer to the allocated memory block.
-     */
-    void* alloc(ggml_cann_pool& pool, size_t size) {
-        this->pool = &pool;
-        return alloc(size);
-    }
-
-    /**
-     * @brief Gets the pointer to the held block.
-     * @return The block pointer, or nullptr if nothing has been allocated.
-     */
-    void* get() { return ptr; }
-};
-
-/**
- * @brief Context for managing CANN backend operations.
- *
- * Streams and the memory pool are created lazily by stream() and pool(). The
- * destructor destroys `copy_event` if it is set and every stream that was
- * created.
- */
-struct ggml_backend_cann_context {
-    int32_t device;                  /**< Device ID. */
-    std::string name;                /**< Name of the device. */
-    std::string description;         /**< Description of the device. */
-    aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
-
-    aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
-
-    /**
-     * @brief Constructor for initializing the context with a given device.
-     *
-     * Sets `name` to "CANN" followed by the device ID, selects the device and
-     * stores the string returned by aclrtGetSocName() as the description.
-     *
-     * @param device Device ID.
-     */
-    explicit ggml_backend_cann_context(int device)
-        : device(device), name("CANN" + std::to_string(device)) {
-        ggml_cann_set_device(device);
-        description = aclrtGetSocName();
-    }
-
-    /**
-     * @brief Destructor for cleaning up resources.
-     */
-    ~ggml_backend_cann_context() {
-        ggml_cann_set_device(device);
-        if (copy_event != nullptr) {
-            ACL_CHECK(aclrtDestroyEvent(copy_event));
-        }
-        for (int i = 0; i < GGML_CANN_MAX_STREAMS; ++i) {
-            if (streams[i] != nullptr) {
-                ACL_CHECK(aclrtDestroyStream(streams[i]));
-            }
-        }
-    }
-
-    /**
-     * @brief Get or create a stream for a given index.
-     * @param stream Index of the stream; must be in
-     *               [0, GGML_CANN_MAX_STREAMS) (asserted).
-     * @return The stream corresponding to the given index.
-     */
-    aclrtStream stream(int stream) {
-        // Reject out-of-range indices instead of indexing past `streams`.
-        GGML_ASSERT(stream >= 0 && stream < GGML_CANN_MAX_STREAMS);
-        if (streams[stream] == nullptr) {
-            ggml_cann_set_device(device);
-            ACL_CHECK(aclrtCreateStream(&streams[stream]));
-        }
-        return streams[stream];
-    }
-
-    /**
-     * @brief Get or create the default stream (index 0).
-     * @return The default stream.
-     */
-    aclrtStream stream() { return stream(0); }
-
-    // TODO: each stream should have a memory pool.
-    std::unique_ptr<ggml_cann_pool>
-        mem_pool; /**< Memory pool for the device. */
-
-    /**
-     * @brief Create a new memory pool for a given device.
-     * @param device Device ID.
-     * @return A unique pointer to the new memory pool.
-     */
-    static std::unique_ptr<ggml_cann_pool> new_pool_for_device(int device);
-
-    /**
-     * @brief Get or create the memory pool for the context.
-     * @return Reference to the memory pool.
-     */
-    ggml_cann_pool& pool() {
-        if (mem_pool == nullptr) {
-            mem_pool = new_pool_for_device(device);
-        }
-        return *mem_pool;
-    }
-};
-
-#endif  // CANN_COMMON_H
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
--- a/ggml/src/ggml-cann/kernels/CMakeLists.txt
+++ b/ggml/src/ggml-cann/kernels/CMakeLists.txt
@ -1,30 +0,0 @@
-# AscendC device kernels for the CANN backend, built as the static library
-# `ascendc_kernels`.
-#
-# Variables read here but not set in this file: CANN_INSTALL_DIR, SOC_TYPE,
-# SOC_VERSION, SOC_TYPE_COMPILE_OPTION.
-
-# file(GLOB) is given literal file names, so a listed file that is missing is
-# dropped from SRC_FILES instead of raising an error.
-file(GLOB SRC_FILES
-    get_row_f32.cpp
-    get_row_f16.cpp
-    get_row_q4_0.cpp
-    get_row_q8_0.cpp
-    quantize_f32_q8_0.cpp
-    quantize_f16_q8_0.cpp
-    quantize_float_to_q4_0.cpp
-    dup.cpp
-)
-
-set(ASCEND_CANN_PACKAGE_PATH "${CANN_INSTALL_DIR}")
-set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
-
-# Locate the AscendC CMake helpers. Two install layouts are probed in order;
-# the paths are quoted so an install directory containing spaces or
-# semicolons is still treated as a single argument.
-if(EXISTS "${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake")
-    set(ASCENDC_CMAKE_DIR "${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake")
-elseif(EXISTS "${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake")
-    set(ASCENDC_CMAKE_DIR "${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake")
-else()
-    message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
-endif()
-include("${ASCENDC_CMAKE_DIR}/ascendc.cmake")
-
-ascendc_library(ascendc_kernels STATIC
-    ${SRC_FILES}
-)
-
-message(STATUS "CANN: compile ascend kernels with SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.")
-ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
-# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
--- a/ggml/src/ggml-cann/kernels/ascendc_kernels.h
+++ b/ggml/src/ggml-cann/kernels/ascendc_kernels.h
@ -1,19 +0,0 @@
-#ifndef ASCENDC_KERNELS_H
-#define ASCENDC_KERNELS_H
-
-#include "aclrtlaunch_ascendc_get_row_f32.h"
-#include "aclrtlaunch_ascendc_get_row_f16.h"
-#include "aclrtlaunch_ascendc_get_row_q8_0.h"
-#include "aclrtlaunch_ascendc_get_row_q4_0.h"
-
-#include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
-#include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
-#include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
-#include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"
-
-#include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
-#include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"
-#include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h"
-#include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h"
-
-#endif  // ASCENDC_KERNELS_H
--- a/ggml/src/ggml-cann/kernels/dup.cpp
+++ b/ggml/src/ggml-cann/kernels/dup.cpp
@ -1,234 +0,0 @@
-#include "kernel_operator.h"
-
-using namespace AscendC;
-
-#define BUFFER_NUM 2
-const int64_t SUPPORTED_MAX_DIM = 65535;  // current upper limit on the block dim supported by the dup kernel
-
-/**
- * @brief Row-wise duplicate (copy) helper; each block handles the single row
- *        selected by its block index.
- *
- * @tparam SRC_T Element type read from the source buffer.
- * @tparam DST_T Element type written to the destination buffer.
- */
-template <typename SRC_T, typename DST_T>
-class DupByRows {
-   public:
-    __aicore__ inline DupByRows() {}
-
-    /**
-     * @brief Computes this block's source/destination offsets and sets up the
-     *        input and output queues.
-     *
-     * @param src          Base address of the source data.
-     * @param dst          Base address of the destination data.
-     * @param input_ne_ub  Four source extents; index 0 is the row length in
-     *                     elements.
-     * @param input_nb_ub  Four source strides, added directly to `src` —
-     *                     presumably in bytes; confirm against the caller.
-     */
-    __aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub,
-                                size_t *input_nb_ub) {
-        /* Dup by rows when src is contiguous on first dimension and dst is
-        contiguous, each kernel instance processes one row.
-        */
-
-        // Input has four dims.
-        // NOTE(review): op_block_num is read but never used in this function.
-        int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
-
-        // param
-        // NOTE(review): num_rows is stored but not read anywhere in this class.
-        num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
-        num_elem = input_ne_ub[0];
-
-        // index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3)
-        // i.e. the flat block index is decomposed with ne[1] varying fastest.
-        idx_ne3 = op_block_idx / (input_ne_ub[1] * input_ne_ub[2]);
-        idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]))
-                  / (input_ne_ub[1]);
-        idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])
-                - idx_ne2 * input_ne_ub[1];
-
-        // src may not be contiguous in dims [1,2,3], so the offset is derived
-        // from ne & nb
-        src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2
-                     + input_nb_ub[1] * idx_ne1;
-
-        // dst is contiguous
-        dst_stride = op_block_idx * (input_ne_ub[0] * sizeof(DST_T));
-
-        src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src +
-                                                                src_stride));
-        dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst +
-                                                                dst_stride));
-
-        // Queue buffers hold one row, rounded up to a multiple of 32 bytes.
-        pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem +
-                                                32 - 1) / 32 * 32);
-        pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem +
-                                                32 - 1) / 32 * 32);
-    }
-
-    /**
-     * @brief Loads the row from `src_gm` into a local tensor and enqueues it.
-     *
-     * NOTE(review): when the row is not a whole number of 32-byte blocks, one
-     * extra element (not a full block) is requested — presumably relying on
-     * DataCopy to round up; confirm against the DataCopy documentation.
-     */
-    __aicore__ inline void copy_in() {
-        LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
-        const size_t elem_per_block = 32 / sizeof(SRC_T);
-        size_t tail = num_elem % elem_per_block;
-        size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
-        DataCopy(src_local, src_gm, cpy_elements_len);
-        src_queue.EnQue(src_local);
-    }
-
-    /**
-     * @brief Dequeues the output tensor, writes the row to `dst_gm` and frees
-     *        the tensor.
-     *
-     * With ASCEND_310P defined, the part of the row that fills whole 32-byte
-     * blocks is copied directly; for a remaining tail, the elements after the
-     * tail in the last block are set to 0 and that block is written with
-     * atomic add enabled. Otherwise DataCopyPad writes exactly
-     * num_elem * sizeof(DST_T) bytes.
-     */
-    __aicore__ inline void copy_out() {
-        LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
-#ifdef ASCEND_310P
-        const size_t elem_per_block = 32 / sizeof(DST_T);
-        size_t tail = num_elem % elem_per_block;
-        size_t len = num_elem & ~(elem_per_block - 1);
-        if (len > 0) {
-            DataCopy(dst_gm, dst_local, len);
-        }
-        if(tail != 0) {
-            for (size_t i = tail; i < elem_per_block; i++) {
-                dst_local[len + i].SetValue(0, 0);
-            }
-            // NOTE(review): atomic add is enabled as <float> regardless of
-            // DST_T; presumably the zeroed padding keeps the block-sized write
-            // from altering memory past the row — confirm for DST_T = half.
-            SetAtomicAdd<float>();
-            DataCopy(dst_gm[len], dst_local[len], elem_per_block);
-            SetAtomicNone();
-        }
-#else
-        DataCopyExtParams dataCopyParams;
-        dataCopyParams.blockCount = 1;
-        dataCopyParams.blockLen = num_elem * sizeof(DST_T);
-        DataCopyPad(dst_gm, dst_local, dataCopyParams);
-#endif
-        dst_queue.FreeTensor(dst_local);
-    }
-
-    /**
-     * @brief Copies one row from src to dst without changing the element type.
-     */
-    __aicore__ inline void dup() {
-        // main process, copy one row data from src to dst.
-        copy_in();
-
-        LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
-        LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
-
-        // Element count is rounded up to a whole number of 32-byte blocks.
-        int32_t BLOCK_NUM = 32 / sizeof(DST_T);
-        DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1)
-                                        / BLOCK_NUM * BLOCK_NUM);
-        dst_queue.EnQue<DST_T>(dst_local);
-
-        src_queue.FreeTensor(src_local);
-        copy_out();
-    }
-
-    /**
-     * @brief Copies one row from src to dst, converting SRC_T to DST_T with
-     *        Cast() in RoundMode::CAST_NONE.
-     */
-    __aicore__ inline void dup_with_cast() {
-        // main process, copy one row data from src to dst.
-        // cast dtype from src to dst.
-        copy_in();
-
-        LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
-        LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
-
-        Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem);
-        dst_queue.EnQue<DST_T>(dst_local);
-
-        src_queue.FreeTensor(src_local);
-        copy_out();
-    }
-
-   private:
-
-    TPipe pipe;
-    GlobalTensor<SRC_T> src_gm;  // source row, already offset by src_stride
-    GlobalTensor<DST_T> dst_gm;  // destination row, already offset by dst_stride
-
-    int64_t num_rows;    // ne[1] * ne[2] * ne[3]
-    int64_t num_elem;    // ne[0], elements per row
-    int64_t idx_ne3;     // this block's index along ne[3]
-    int64_t idx_ne2;     // this block's index along ne[2]
-    int64_t idx_ne1;     // this block's index along ne[1]
-    int64_t src_stride;  // offset added to the src base address
-    int64_t dst_stride;  // byte offset added to the dst base address
-
-    TQue<QuePosition::VECIN, BUFFER_NUM> src_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> dst_queue;
-};
-
-/**
- * @brief Copies `size` bytes, one byte at a time, from a global-memory
- *        address into a local array.
- *
- * @tparam T   Element type of the destination array; the copy itself is
- *             byte-wise.
- * @param gm   Source address.
- * @param ub   Destination array; must have room for `size` bytes.
- * @param size Number of bytes to copy.
- */
-template <typename T>
-__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
-    auto gm_ptr = (__gm__ uint8_t *)gm;
-    auto ub_ptr = (uint8_t *)(ub);
-    // The counter has the same type as `size`, which avoids a signed/unsigned
-    // comparison and a wrap-around for sizes above INT32_MAX.
-    for (size_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
-        *ub_ptr = *gm_ptr;
-    }
-}
-
-/**
- * @brief Kernel entry point: row-wise duplicate with half source and half
- *        destination elements.
- *
- * The four descriptor arrays are staged from global memory (32 bytes each);
- * only the input extents and strides are passed on to DupByRows.
- */
-extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16(
-    GM_ADDR src_gm, GM_ADDR dst_gm, GM_ADDR input_ne_gm, GM_ADDR input_nb_gm,
-    GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
-    int64_t src_ne[4];
-    size_t src_nb[4];
-    int64_t dst_ne[4];
-    size_t dst_nb[4];
-
-    copy_to_ub(input_ne_gm, src_ne, 32);
-    copy_to_ub(input_nb_gm, src_nb, 32);
-    copy_to_ub(output_ne_gm, dst_ne, 32);
-    copy_to_ub(output_nb_gm, dst_nb, 32);
-
-    DupByRows<half, half> row_dup;
-    row_dup.init(src_gm, dst_gm, src_ne, src_nb);
-    row_dup.dup();
-}
-
-/**
- * @brief Kernel entry point: row-wise duplicate with float source and float
- *        destination elements.
- *
- * The four descriptor arrays are staged from global memory (32 bytes each);
- * only the input extents and strides are passed on to DupByRows.
- */
-extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(
-    GM_ADDR src_gm, GM_ADDR dst_gm, GM_ADDR input_ne_gm, GM_ADDR input_nb_gm,
-    GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
-    int64_t src_ne[4];
-    size_t src_nb[4];
-    int64_t dst_ne[4];
-    size_t dst_nb[4];
-
-    copy_to_ub(input_ne_gm, src_ne, 32);
-    copy_to_ub(input_nb_gm, src_nb, 32);
-    copy_to_ub(output_ne_gm, dst_ne, 32);
-    copy_to_ub(output_nb_gm, dst_nb, 32);
-
-    DupByRows<float, float> row_dup;
-    row_dup.init(src_gm, dst_gm, src_ne, src_nb);
-    row_dup.dup();
-}
-
-/**
- * @brief Kernel entry point: row-wise duplicate that converts float source
- *        elements to half destination elements.
- *
- * The four descriptor arrays are staged from global memory (32 bytes each);
- * only the input extents and strides are passed on to DupByRows.
- */
-extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16(
-    GM_ADDR src_gm, GM_ADDR dst_gm, GM_ADDR input_ne_gm, GM_ADDR input_nb_gm,
-    GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
-    int64_t src_ne[4];
-    size_t src_nb[4];
-    int64_t dst_ne[4];
-    size_t dst_nb[4];
-
-    copy_to_ub(input_ne_gm, src_ne, 32);
-    copy_to_ub(input_nb_gm, src_nb, 32);
-    copy_to_ub(output_ne_gm, dst_ne, 32);
-    copy_to_ub(output_nb_gm, dst_nb, 32);
-
-    DupByRows<float, half> row_dup;
-    row_dup.init(src_gm, dst_gm, src_ne, src_nb);
-    row_dup.dup_with_cast();
-}
-
-/**
- * @brief Kernel entry point: row-wise duplicate that converts half source
- *        elements to float destination elements.
- *
- * The four descriptor arrays are staged from global memory (32 bytes each);
- * only the input extents and strides are passed on to DupByRows.
- */
-extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32(
-    GM_ADDR src_gm, GM_ADDR dst_gm, GM_ADDR input_ne_gm, GM_ADDR input_nb_gm,
-    GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
-    // Local copies of the parameters that arrive in global memory.
-    int64_t src_ne[4];
-    size_t src_nb[4];
-    int64_t dst_ne[4];
-    size_t dst_nb[4];
-
-    copy_to_ub(input_ne_gm, src_ne, 32);
-    copy_to_ub(input_nb_gm, src_nb, 32);
-    copy_to_ub(output_ne_gm, dst_ne, 32);
-    copy_to_ub(output_nb_gm, dst_nb, 32);
-
-    DupByRows<half, float> row_dup;
-    row_dup.init(src_gm, dst_gm, src_ne, src_nb);
-    row_dup.dup_with_cast();
-}
--- a/ggml/src/ggml-cann/kernels/get_row_f16.cpp
+++ b/ggml/src/ggml-cann/kernels/get_row_f16.cpp
@ -1,197 +0,0 @@
-#include "kernel_operator.h"
-
-// TODO: optimize. Use a template to avoid duplicating this code per data type.
-using namespace AscendC;
-
-#define BUFFER_NUM 2
-
-// Gathers rows of a half-precision input tensor at the positions given by an
-// int32 index tensor and writes them as float to the output tensor.
-// init() splits the index entries across blocks; calculate() then processes
-// this block's share one row at a time.
-class GET_ROW_F16 {
-   public:
-    __aicore__ inline GET_ROW_F16() {}
-
-    // Stores shapes and strides, selects this block's row range
-    // [ir, ir + dr), binds the global tensors and sets up the local queues.
-    // Each *_nb_ub entry is divided by entry 0 of the same array, so the
-    // stored strides are in units of the innermost stride.
-    __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
-                                int64_t *input_ne_ub, size_t *input_nb_ub,
-                                int64_t *indices_ne_ub, size_t *indices_nb_ub,
-                                int64_t *output_ne_ub, size_t *output_nb_ub) {
-        // TODO, use template for F16/f32
-        int64_t op_block_num = GetBlockNum();
-        op_block_idx = GetBlockIdx();
-
-        for (int i = 0; i < 4; i++) {
-            input_ne[i] = input_ne_ub[i];
-            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
-
-            indices_ne[i] = indices_ne_ub[i];
-            indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
-
-            output_ne[i] = output_ne_ub[i];
-            output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
-        }
-
-        // n_elements: number of index entries, i.e. rows to gather in total.
-        // dr: number of rows handled by this block; ir: its first row.
-        uint64_t n_elements =
-            indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
-        dr = n_elements / op_block_num;
-
-        // The first `tails` blocks each take one extra row.
-        uint64_t tails = n_elements % op_block_num;
-        if (op_block_idx < tails) {
-            dr += 1;
-            ir = dr * op_block_idx;
-        } else {
-            ir = dr * op_block_idx + tails;
-        }
-
-        input_gm.SetGlobalBuffer((__gm__ half *)input);
-        indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
-        output_gm.SetGlobalBuffer((__gm__ float *)output);
-
-        // One input row, rounded up to a multiple of 32 bytes.
-        uint64_t input_local_buffer_size = ((input_ne[0] * sizeof(half) + 31)
-                                             & ~31);
-        local_buffer_elems = input_local_buffer_size / sizeof(half);
-
-        // calculate_row() casts local_buffer_elems elements, so the output
-        // buffer has to hold that many floats. Sizing it from input_ne[0]
-        // rounded up to 32 bytes is 32 bytes too small whenever
-        // input_ne[0] % 16 is in [1, 8].
-        uint64_t output_local_buffer_size = local_buffer_elems * sizeof(float);
-
-        // TODO, consider long row that can't put in UB.
-        // Both buffer sizes are multiples of 32 bytes.
-        pipe.InitBuffer(input_queue, BUFFER_NUM, input_local_buffer_size);
-        pipe.InitBuffer(output_queue, BUFFER_NUM, output_local_buffer_size);
-    }
-
-    // Copies `len` half elements starting at element `offset` of input_gm
-    // into a freshly allocated local tensor and enqueues it. `len` is rounded
-    // up to a whole number of 32-byte blocks before the copy.
-    // `offset` is 64-bit so that the int64_t offsets computed in
-    // calculate_row() are not truncated.
-    __aicore__ inline void copy_in(uint64_t offset, size_t len) {
-        LocalTensor<half> input_local = input_queue.AllocTensor<half>();
-        const size_t elem_per_block = 32 / sizeof(half);
-        size_t tail = len % elem_per_block;
-        len = len & ~(elem_per_block - 1);
-        if(tail != 0) {
-            len += elem_per_block;
-        }
-        DataCopy(input_local, input_gm[offset], len);
-        input_queue.EnQue(input_local);
-    }
-
-    // Dequeues one output tensor and writes `len` floats to output_gm at
-    // element `offset`. Whole 32-byte blocks go out with DataCopy; a partial
-    // last block is handled separately (see below).
-    __aicore__ inline void copy_out(uint64_t offset, size_t len) {
-        LocalTensor<float> output_local = output_queue.DeQue<float>();
-        const size_t elem_per_block = 32 / sizeof(float);
-        size_t tail = len % elem_per_block;
-        len = len & ~(elem_per_block - 1);
-        if (len > 0) {
-            DataCopy(output_gm[offset], output_local, len);
-        }
-
-        if(tail != 0) {
-#ifdef ASCEND_310P
-            // Zero the lanes past the tail, then add a full block atomically
-            // so the lanes past the row contribute 0 to the destination.
-            // NOTE(review): the tail values are added too, so this presumably
-            // relies on the destination being zero beforehand - confirm.
-            for (size_t i = tail; i < elem_per_block; i++) {
-                output_local[len + i].SetValue(0, 0);
-            }
-            SetAtomicAdd<float>();
-            DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
-            SetAtomicNone();
-#else
-            // Copy exactly `tail` floats.
-            DataCopyExtParams dataCopyParams;
-            dataCopyParams.blockCount = 1;
-            dataCopyParams.blockLen = tail * sizeof(float);
-            DataCopyPad(output_gm[offset + len], output_local[len],
-                        dataCopyParams);
-#endif
-        }
-        output_queue.FreeTensor(output_local);
-    }
-
-    // Processes the idx-th index entry: decomposes idx into coordinates of
-    // the index tensor, reads the selected row number, copies that input row
-    // in, casts it half -> float and copies it out.
-    __aicore__ inline void calculate_row(int64_t idx) {
-        const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
-        const int64_t indices_ne1_idx =
-            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
-            indices_ne[0];
-        const int64_t indices_ne0_idx =
-            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
-             indices_ne1_idx * indices_ne[0]);
-
-        const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
-                                       indices_ne1_idx * indices_stride[1] +
-                                       indices_ne2_idx * indices_stride[2];
-        const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
-
-        const int64_t input_offset = selected_row_idx * input_stride[1] +
-                                     indices_ne1_idx * input_stride[2] +
-                                     indices_ne2_idx * input_stride[3];
-
-        const int64_t output_offset = indices_ne0_idx * output_stride[1] +
-                                      indices_ne1_idx * output_stride[2] +
-                                      indices_ne2_idx * output_stride[3];
-
-        copy_in(input_offset, input_ne[0]);
-        LocalTensor<half> input_local = input_queue.DeQue<half>();
-        LocalTensor<float> output_local = output_queue.AllocTensor<float>();
-
-        Cast(output_local, input_local, RoundMode::CAST_NONE,
-             local_buffer_elems);
-        output_queue.EnQue(output_local);
-        copy_out(output_offset, input_ne[0]);
-
-        input_queue.FreeTensor(input_local);
-    }
-
-    // Processes every row assigned to this block.
-    __aicore__ inline void calculate() {
-        for (int64_t i = ir; i < ir + dr; i++) {
-            calculate_row(i);
-        }
-    }
-
-   private:
-    int64_t input_ne[4];
-    size_t input_stride[4];
-
-    int64_t indices_ne[4];
-    size_t indices_stride[4];
-
-    int64_t output_ne[4];
-    size_t output_stride[4];
-
-    // Number of elements in one local buffer (input row rounded up).
-    size_t local_buffer_elems;
-
-    // First row and row count handled by this block.
-    int64_t ir;
-    int64_t dr;
-
-    TPipe pipe;
-    GlobalTensor<half> input_gm;
-    GlobalTensor<int32_t> indices_gm;
-    GlobalTensor<float> output_gm;
-    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
-    int64_t op_block_idx;
-};
-
-// Copies `size` bytes from the __gm__ address `gm` into the object(s) at
-// `ub`, one byte at a time.
-template <typename T>
-__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
-    auto gm_ptr = (__gm__ uint8_t *)gm;
-    auto ub_ptr = (uint8_t *)(ub);
-    // The counter has the same type as `size`, which avoids a
-    // signed/unsigned comparison and int32 overflow for large sizes.
-    for (size_t i = 0; i < size; ++i) {
-        ub_ptr[i] = gm_ptr[i];
-    }
-}
-
-// Kernel entry point: copies the six shape/stride arrays out of global
-// memory, then initialises a GET_ROW_F16 operator with them and runs it.
-extern "C" __global__ __aicore__ void ascendc_get_row_f16(
-    GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
-    GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
-    GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
-    // 32 bytes are copied into each 4-entry parameter array.
-    const size_t param_bytes = 32;
-
-    int64_t input_ne_ub[4];
-    copy_to_ub(input_ne_gm, input_ne_ub, param_bytes);
-
-    size_t input_nb_ub[4];
-    copy_to_ub(input_nb_gm, input_nb_ub, param_bytes);
-
-    int64_t indices_ne_ub[4];
-    copy_to_ub(indices_ne_gm, indices_ne_ub, param_bytes);
-
-    size_t indices_nb_ub[4];
-    copy_to_ub(indices_nb_gm, indices_nb_ub, param_bytes);
-
-    int64_t output_ne_ub[4];
-    copy_to_ub(output_ne_gm, output_ne_ub, param_bytes);
-
-    size_t output_nb_ub[4];
-    copy_to_ub(output_nb_gm, output_nb_ub, param_bytes);
-
-    GET_ROW_F16 get_row;
-    get_row.init(input_gm, indices_gm, output_gm,
-                 input_ne_ub, input_nb_ub,
-                 indices_ne_ub, indices_nb_ub,
-                 output_ne_ub, output_nb_ub);
-    get_row.calculate();
-}
--- a/ggml/src/ggml-cann/kernels/get_row_f32.cpp
+++ b/ggml/src/ggml-cann/kernels/get_row_f32.cpp
@ -1,190 +0,0 @@
-#include "kernel_operator.h"
-
-// TODO: optimize. Use a template to avoid duplicating this code per data type.
-using namespace AscendC;
-
-#define BUFFER_NUM 2
-
-// Gathers rows of a float input tensor at the positions given by an int32
-// index tensor and writes them to a float output tensor.
-// init() splits the index entries across blocks; calculate() then processes
-// this block's share one row at a time.
-class GET_ROW_F32 {
-   public:
-    __aicore__ inline GET_ROW_F32() {}
-
-    // Stores shapes and strides, selects this block's row range
-    // [ir, ir + dr), binds the global tensors and sets up the local queues.
-    // Each *_nb_ub entry is divided by entry 0 of the same array, so the
-    // stored strides are in units of the innermost stride.
-    __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
-                                int64_t *input_ne_ub, size_t *input_nb_ub,
-                                int64_t *indices_ne_ub, size_t *indices_nb_ub,
-                                int64_t *output_ne_ub, size_t *output_nb_ub) {
-        int64_t op_block_num = GetBlockNum();
-        op_block_idx = GetBlockIdx();
-
-        for (int i = 0; i < 4; i++) {
-            input_ne[i] = input_ne_ub[i];
-            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
-
-            indices_ne[i] = indices_ne_ub[i];
-            indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
-
-            output_ne[i] = output_ne_ub[i];
-            output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
-        }
-
-        // n_elements: number of index entries, i.e. rows to gather in total.
-        // dr: number of rows handled by this block; ir: its first row.
-        uint64_t n_elements =
-            indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
-        dr = n_elements / op_block_num;
-
-        // The first `tails` blocks each take one extra row.
-        uint64_t tails = n_elements % op_block_num;
-        if (op_block_idx < tails) {
-            dr += 1;
-            ir = dr * op_block_idx;
-        } else {
-            ir = dr * op_block_idx + tails;
-        }
-
-        input_gm.SetGlobalBuffer((__gm__ float *)input);
-        indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
-        output_gm.SetGlobalBuffer((__gm__ float *)output);
-
-        // One row, rounded up to a multiple of 32 bytes.
-        uint64_t local_buffer_size = ((input_ne[0] * sizeof(float) + 31) & ~31);
-        local_buffer_elems = local_buffer_size / sizeof(float);
-
-        // TODO, consider long row that can't put in UB.
-        // The buffer size is a multiple of 32 bytes.
-        pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size);
-        pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size);
-    }
-
-    // Copies `len` floats starting at element `offset` of input_gm into a
-    // freshly allocated local tensor and enqueues it. `len` is rounded up to
-    // a whole number of 32-byte blocks before the copy.
-    // `offset` is 64-bit so that the int64_t offsets computed in
-    // calculate_row() are not truncated.
-    __aicore__ inline void copy_in(uint64_t offset, size_t len) {
-        LocalTensor<float> input_local = input_queue.AllocTensor<float>();
-        const size_t elem_per_block = 32 / sizeof(float);
-        size_t tail = len % elem_per_block;
-        len = len & ~(elem_per_block - 1);
-        if(tail != 0) {
-            len += elem_per_block;
-        }
-        DataCopy(input_local, input_gm[offset], len);
-        input_queue.EnQue(input_local);
-    }
-
-    // Dequeues one output tensor and writes `len` floats to output_gm at
-    // element `offset`. Whole 32-byte blocks go out with DataCopy; a partial
-    // last block is handled separately (see below).
-    __aicore__ inline void copy_out(uint64_t offset, size_t len) {
-        LocalTensor<float> output_local = output_queue.DeQue<float>();
-        const size_t elem_per_block = 32 / sizeof(float);
-        size_t tail = len % elem_per_block;
-        len = len & ~(elem_per_block - 1);
-        if (len > 0) {
-            DataCopy(output_gm[offset], output_local, len);
-        }
-
-        if(tail != 0) {
-#ifdef ASCEND_310P
-            // Zero the lanes past the tail, then add a full block atomically
-            // so the lanes past the row contribute 0 to the destination.
-            // NOTE(review): the tail values are added too, so this presumably
-            // relies on the destination being zero beforehand - confirm.
-            for (size_t i = tail; i < elem_per_block; i++) {
-                output_local[len + i].SetValue(0, 0);
-            }
-            SetAtomicAdd<float>();
-            DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
-            SetAtomicNone();
-#else
-            // Copy exactly `tail` floats.
-            DataCopyExtParams dataCopyParams;
-            dataCopyParams.blockCount = 1;
-            dataCopyParams.blockLen = tail * sizeof(float);
-            DataCopyPad(output_gm[offset + len], output_local[len],
-                        dataCopyParams);
-#endif
-        }
-        output_queue.FreeTensor(output_local);
-    }
-
-    // Processes the idx-th index entry: decomposes idx into coordinates of
-    // the index tensor, reads the selected row number, copies that input row
-    // in, moves it to an output tensor and copies it out.
-    __aicore__ inline void calculate_row(int64_t idx) {
-        const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
-        const int64_t indices_ne1_idx =
-            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
-            indices_ne[0];
-        const int64_t indices_ne0_idx =
-            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
-             indices_ne1_idx * indices_ne[0]);
-
-        const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
-                                       indices_ne1_idx * indices_stride[1] +
-                                       indices_ne2_idx * indices_stride[2];
-        const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
-
-        const int64_t input_offset = selected_row_idx * input_stride[1] +
-                                     indices_ne1_idx * input_stride[2] +
-                                     indices_ne2_idx * input_stride[3];
-
-        const int64_t output_offset = indices_ne0_idx * output_stride[1] +
-                                      indices_ne1_idx * output_stride[2] +
-                                      indices_ne2_idx * output_stride[3];
-
-        copy_in(input_offset, input_ne[0]);
-        LocalTensor<float> input_local = input_queue.DeQue<float>();
-        LocalTensor<float> output_local = output_queue.AllocTensor<float>();
-
-        DataCopy(output_local, input_local, local_buffer_elems);
-        output_queue.EnQue(output_local);
-        copy_out(output_offset, input_ne[0]);
-
-        input_queue.FreeTensor(input_local);
-    }
-
-    // Processes every row assigned to this block.
-    __aicore__ inline void calculate() {
-        for (int64_t i = ir; i < ir + dr; i++) {
-            calculate_row(i);
-        }
-    }
-
-   private:
-    int64_t input_ne[4];
-    size_t input_stride[4];
-
-    int64_t indices_ne[4];
-    size_t indices_stride[4];
-
-    int64_t output_ne[4];
-    size_t output_stride[4];
-
-    // Number of elements in one local buffer (row rounded up).
-    size_t local_buffer_elems;
-
-    // First row and row count handled by this block.
-    int64_t ir;
-    int64_t dr;
-
-    TPipe pipe;
-    GlobalTensor<float> input_gm;
-    GlobalTensor<int32_t> indices_gm;
-    GlobalTensor<float> output_gm;
-    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
-    int64_t op_block_idx;
-};
-
-// Copies `size` bytes from the __gm__ address `gm` into the object(s) at
-// `ub`, one byte at a time.
-template <typename T>
-__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
-    auto gm_ptr = (__gm__ uint8_t *)gm;
-    auto ub_ptr = (uint8_t *)(ub);
-    // The counter has the same type as `size`, which avoids a
-    // signed/unsigned comparison and int32 overflow for large sizes.
-    for (size_t i = 0; i < size; ++i) {
-        ub_ptr[i] = gm_ptr[i];
-    }
-}
-
-// Kernel entry point: copies the six shape/stride arrays out of global
-// memory, then initialises a GET_ROW_F32 operator with them and runs it.
-extern "C" __global__ __aicore__ void ascendc_get_row_f32(
-    GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
-    GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
-    GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
-    // 32 bytes are copied into each 4-entry parameter array.
-    const size_t param_bytes = 32;
-
-    int64_t input_ne_ub[4];
-    copy_to_ub(input_ne_gm, input_ne_ub, param_bytes);
-
-    size_t input_nb_ub[4];
-    copy_to_ub(input_nb_gm, input_nb_ub, param_bytes);
-
-    int64_t indices_ne_ub[4];
-    copy_to_ub(indices_ne_gm, indices_ne_ub, param_bytes);
-
-    size_t indices_nb_ub[4];
-    copy_to_ub(indices_nb_gm, indices_nb_ub, param_bytes);
-
-    int64_t output_ne_ub[4];
-    copy_to_ub(output_ne_gm, output_ne_ub, param_bytes);
-
-    size_t output_nb_ub[4];
-    copy_to_ub(output_nb_gm, output_nb_ub, param_bytes);
-
-    GET_ROW_F32 get_row;
-    get_row.init(input_gm, indices_gm, output_gm,
-                 input_ne_ub, input_nb_ub,
-                 indices_ne_ub, indices_nb_ub,
-                 output_ne_ub, output_nb_ub);
-    get_row.calculate();
-}
--- a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
+++ b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
@ -1,204 +0,0 @@
-#include "kernel_operator.h"
-
-// TODO: optimize. Use a template to avoid duplicating this code per data type.
-using namespace AscendC;
-#ifdef ASCEND_310P // 310P not support 4bit get row
-    // Stub used when ASCEND_310P is defined: it performs no work and only
-    // prints an error message.
-    extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
-        GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
-        GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
-        GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
-        // Only print an error so that the following test cases can keep running. Of course, the test case that calls this operator still fails.
-        printf("Ascend310P not support 4bit get row.\n");
-    }
-#else
-
-#define BUFFER_NUM 2
-
-#define QK4_0 32
-
-// Gathers rows of a 4-bit quantised input tensor at the positions given by
-// an int32 index tensor and writes them, dequantised, to a float output
-// tensor. Each group of QK4_0 values shares one half-precision scale; the
-// scales are read from the same buffer as the values, at `scale_offset`.
-class GET_ROW_Q4_0 {
-   public:
-    __aicore__ inline GET_ROW_Q4_0() {}
-
-    // Stores shapes, derives element strides, selects this block's row range
-    // [ir, ir + dr), binds the global tensors and sets up the local queues.
-    __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
-                                int64_t *input_ne_ub, int64_t *indices_ne_ub,
-                                size_t *indices_nb_ub, int64_t *output_ne_ub,
-                                size_t *output_nb_ub) {
-        int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
-
-        for (int i = 0; i < 4; i++) {
-            input_ne[i] = input_ne_ub[i];
-            indices_ne[i] = indices_ne_ub[i];
-            indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
-            scale_ne[i] = input_ne_ub[i];
-            output_ne[i] = output_ne_ub[i];
-            output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
-        }
-
-        // one scale for a group.
-        scale_ne[0] /= QK4_0;
-
-        // Input and scale strides are derived from the shapes, i.e. both
-        // are treated as densely packed.
-        input_stride[0] = 1;
-        scale_stride[0] = 1;
-        output_stride[0] = 1;
-        for (int i = 1; i < 4; i++) {
-            input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
-            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
-        }
-
-        group_size_in_row = input_ne[0] / QK4_0;
-        // Offset of the scales from `input`: half the total element count
-        // (two 4-bit values per byte, presumably - GM_ADDR arithmetic is
-        // assumed to be in bytes).
-        int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
-                               input_ne[3] / 2;
-
-        // n_elements: number of index entries, i.e. rows to gather in total.
-        // dr: number of rows handled by this block; ir: its first row.
-        uint64_t n_elements =
-            indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
-        dr = n_elements / op_block_num;
-
-        // The first `tails` blocks each take one extra row.
-        uint64_t tails = n_elements % op_block_num;
-        if (op_block_idx < tails) {
-            dr += 1;
-            ir = dr * op_block_idx;
-        } else {
-            ir = dr * op_block_idx + tails;
-        }
-
-        input_gm.SetGlobalBuffer((__gm__ int4b_t *)input);
-        scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
-        indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
-        output_gm.SetGlobalBuffer((__gm__ float *)output);
-
-        pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t));
-        pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half));
-        pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float));
-    }
-
-    // Copies one group of QK4_0 values starting at element `offset` of
-    // input_gm into a local tensor and enqueues it.
-    // `offset` is 64-bit so that the int64_t offsets computed in
-    // calculate_group() are not truncated.
-    __aicore__ inline void copy_in(uint64_t offset) {
-        LocalTensor<int4b_t> input_local = input_queue.AllocTensor<int4b_t>();
-        // 32 * sizeof(int4b_t) = 16, which is not aligned to 32, why no error?
-        DataCopy(input_local, input_gm[offset], QK4_0);
-        input_queue.EnQue(input_local);
-    }
-
-    // Dequeues one output tensor and writes QK4_0 floats to output_gm at
-    // element `offset`.
-    __aicore__ inline void copy_out(uint64_t offset) {
-        LocalTensor<float> output_local = output_queue.DeQue<float>();
-        DataCopy(output_gm[offset], output_local, QK4_0);
-        output_queue.FreeTensor(output_local);
-    }
-
-    // Dequantises group `group` of the row selected by the idx-th index
-    // entry: int4 -> half -> float, then multiplies by the group's scale.
-    __aicore__ inline void calculate_group(int64_t idx, int64_t group) {
-        const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
-        const int64_t indices_ne1_idx =
-            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
-            indices_ne[0];
-        const int64_t indices_ne0_idx =
-            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
-             indices_ne1_idx * indices_ne[0]);
-
-        const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
-                                       indices_ne1_idx * indices_stride[1] +
-                                       indices_ne2_idx * indices_stride[2];
-        const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
-
-        const int64_t input_offset = selected_row_idx * input_stride[1] +
-                                     indices_ne1_idx * input_stride[2] +
-                                     indices_ne2_idx * input_stride[3] +
-                                     group * QK4_0;
-        const int64_t scale_offset = selected_row_idx * scale_stride[1] +
-                                     indices_ne1_idx * scale_stride[2] +
-                                     indices_ne2_idx * scale_stride[3] + group;
-        const int64_t output_offset = indices_ne0_idx * output_stride[1] +
-                                      indices_ne1_idx * output_stride[2] +
-                                      indices_ne2_idx * output_stride[3] +
-                                      group * QK4_0;
-
-        copy_in(input_offset);
-        LocalTensor<int4b_t> input_local = input_queue.DeQue<int4b_t>();
-        LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
-        LocalTensor<float> output_local = output_queue.AllocTensor<float>();
-
-        // TODO: cast more data to speed up.
-        Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
-        Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
-
-        // The scale differs per group, so the multiply is done group by
-        // group.
-        half scale = scale_gm.GetValue(scale_offset);
-
-        Muls(output_local, output_local, (float)scale, QK4_0);
-
-        input_queue.FreeTensor(input_local);
-        cast_queue.FreeTensor(cast_local);
-        output_queue.EnQue(output_local);
-
-        copy_out(output_offset);
-    }
-
-    // Processes every group of every row assigned to this block.
-    __aicore__ inline void calculate() {
-        for (int64_t i = ir; i < ir + dr; i++) {
-            for (int64_t j = 0; j < group_size_in_row; j++) {
-                calculate_group(i, j);
-            }
-        }
-    }
-
-   private:
-    int64_t input_ne[4];
-    size_t input_stride[4];
-
-    int64_t scale_ne[4];
-    size_t scale_stride[4];
-
-    int64_t indices_ne[4];
-    size_t indices_stride[4];
-
-    int64_t output_ne[4];
-    size_t output_stride[4];
-
-    // First row and row count handled by this block.
-    int64_t ir;
-    int64_t dr;
-
-    // Number of QK4_0-sized groups in one row.
-    int64_t group_size_in_row;
-
-    TPipe pipe;
-    GlobalTensor<int4b_t> input_gm;
-    GlobalTensor<half> scale_gm;
-    GlobalTensor<int32_t> indices_gm;
-    GlobalTensor<float> output_gm;
-    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
-    TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
-};
-
-// Copies `size` bytes from the __gm__ address `gm` into the object(s) at
-// `ub`, one byte at a time.
-template <typename T>
-__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
-    auto gm_ptr = (__gm__ uint8_t *)gm;
-    auto ub_ptr = (uint8_t *)(ub);
-    // The counter has the same type as `size`, which avoids a
-    // signed/unsigned comparison and int32 overflow for large sizes.
-    for (size_t i = 0; i < size; ++i) {
-        ub_ptr[i] = gm_ptr[i];
-    }
-}
-
-// Kernel entry point: copies the five shape/stride arrays out of global
-// memory, then initialises a GET_ROW_Q4_0 operator with them and runs it.
-extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
-    GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
-    GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
-    GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
-    // 32 bytes are copied into each 4-entry parameter array.
-    const size_t param_bytes = 32;
-
-    int64_t input_ne_ub[4];
-    copy_to_ub(input_ne_gm, input_ne_ub, param_bytes);
-
-    int64_t indices_ne_ub[4];
-    copy_to_ub(indices_ne_gm, indices_ne_ub, param_bytes);
-
-    size_t indices_nb_ub[4];
-    copy_to_ub(indices_nb_gm, indices_nb_ub, param_bytes);
-
-    int64_t output_ne_ub[4];
-    copy_to_ub(output_ne_gm, output_ne_ub, param_bytes);
-
-    size_t output_nb_ub[4];
-    copy_to_ub(output_nb_gm, output_nb_ub, param_bytes);
-
-    GET_ROW_Q4_0 get_row;
-    get_row.init(input_gm, indices_gm, output_gm,
-                 input_ne_ub,
-                 indices_ne_ub, indices_nb_ub,
-                 output_ne_ub, output_nb_ub);
-    get_row.calculate();
-}
-
-#endif // #ifdef ASCEND_310P
--- a/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp
+++ b/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp
@ -1,191 +0,0 @@
-#include "kernel_operator.h"
-
-// TODO: optimize. Use a template to avoid duplicating this code per data type.
-using namespace AscendC;
-
-#define BUFFER_NUM 2
-
-#define QK8_0 32
-
-// Gathers rows of an 8-bit quantised input tensor at the positions given by
-// an int32 index tensor and writes them, dequantised, to a float output
-// tensor. Each group of QK8_0 values shares one half-precision scale; the
-// scales are read from the same buffer as the values, at `scale_offset`.
-class GET_ROW_Q8_0 {
-   public:
-    __aicore__ inline GET_ROW_Q8_0() {}
-
-    // Stores shapes, derives element strides, selects this block's row range
-    // [ir, ir + dr), binds the global tensors and sets up the local queues.
-    __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
-                                int64_t *input_ne_ub, int64_t *indices_ne_ub,
-                                size_t *indices_nb_ub, int64_t *output_ne_ub,
-                                size_t *output_nb_ub) {
-        int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
-
-        for (int i = 0; i < 4; i++) {
-            input_ne[i] = input_ne_ub[i];
-            indices_ne[i] = indices_ne_ub[i];
-            indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
-            scale_ne[i] = input_ne_ub[i];
-            output_ne[i] = output_ne_ub[i];
-            output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
-        }
-
-        // one scale for a group.
-        scale_ne[0] /= QK8_0;
-
-        // Input and scale strides are derived from the shapes, i.e. both
-        // are treated as densely packed.
-        input_stride[0] = 1;
-        scale_stride[0] = 1;
-        output_stride[0] = 1;
-        for (int i = 1; i < 4; i++) {
-            input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
-            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
-        }
-
-        group_size_in_row = input_ne[0] / QK8_0;
-        // Offset of the scales from `input`: one int8_t per element
-        // (GM_ADDR arithmetic is assumed to be in bytes).
-        int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
-                               input_ne[3] * sizeof(int8_t);
-
-        // n_elements: number of index entries, i.e. rows to gather in total.
-        // dr: number of rows handled by this block; ir: its first row.
-        uint64_t n_elements =
-            indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
-        dr = n_elements / op_block_num;
-
-        // The first `tails` blocks each take one extra row.
-        uint64_t tails = n_elements % op_block_num;
-        if (op_block_idx < tails) {
-            dr += 1;
-            ir = dr * op_block_idx;
-        } else {
-            ir = dr * op_block_idx + tails;
-        }
-
-        input_gm.SetGlobalBuffer((__gm__ int8_t *)input);
-        scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
-        indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
-        output_gm.SetGlobalBuffer((__gm__ float *)output);
-
-        pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
-        pipe.InitBuffer(cast_queue, BUFFER_NUM, QK8_0 * sizeof(half));
-        pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(float));
-    }
-
-    // Copies one group of QK8_0 values starting at element `offset` of
-    // input_gm into a local tensor and enqueues it.
-    // `offset` is 64-bit so that the int64_t offsets computed in
-    // calculate_group() are not truncated.
-    __aicore__ inline void copy_in(uint64_t offset) {
-        LocalTensor<int8_t> input_local = input_queue.AllocTensor<int8_t>();
-        DataCopy(input_local, input_gm[offset], QK8_0);
-        input_queue.EnQue(input_local);
-    }
-
-    // Dequeues one output tensor and writes QK8_0 floats to output_gm at
-    // element `offset`.
-    __aicore__ inline void copy_out(uint64_t offset) {
-        LocalTensor<float> output_local = output_queue.DeQue<float>();
-        DataCopy(output_gm[offset], output_local, QK8_0);
-        output_queue.FreeTensor(output_local);
-    }
-
-    // Dequantises group `group` of the row selected by the idx-th index
-    // entry: int8 -> half -> float, then multiplies by the group's scale.
-    __aicore__ inline void calculate_group(int64_t idx, int64_t group) {
-        const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
-        const int64_t indices_ne1_idx =
-            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
-            indices_ne[0];
-        const int64_t indices_ne0_idx =
-            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
-             indices_ne1_idx * indices_ne[0]);
-
-        const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
-                                       indices_ne1_idx * indices_stride[1] +
-                                       indices_ne2_idx * indices_stride[2];
-        const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
-
-        const int64_t input_offset = selected_row_idx * input_stride[1] +
-                                     indices_ne1_idx * input_stride[2] +
-                                     indices_ne2_idx * input_stride[3] +
-                                     group * QK8_0;
-        const int64_t scale_offset = selected_row_idx * scale_stride[1] +
-                                     indices_ne1_idx * scale_stride[2] +
-                                     indices_ne2_idx * scale_stride[3] + group;
-        const int64_t output_offset = indices_ne0_idx * output_stride[1] +
-                                      indices_ne1_idx * output_stride[2] +
-                                      indices_ne2_idx * output_stride[3] +
-                                      group * QK8_0;
-
-        copy_in(input_offset);
-        LocalTensor<int8_t> input_local = input_queue.DeQue<int8_t>();
-        LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
-        LocalTensor<float> output_local = output_queue.AllocTensor<float>();
-
-        // TODO: cast more data to speed up.
-        Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
-        Cast(output_local, cast_local, RoundMode::CAST_NONE, QK8_0);
-
-        // The scale differs per group, so the multiply is done group by
-        // group.
-        half scale = scale_gm.GetValue(scale_offset);
-        Muls(output_local, output_local, (float)scale, QK8_0);
-
-        input_queue.FreeTensor(input_local);
-        cast_queue.FreeTensor(cast_local);
-        output_queue.EnQue(output_local);
-
-        copy_out(output_offset);
-    }
-
-    // Processes every group of every row assigned to this block.
-    __aicore__ inline void calculate() {
-        for (int64_t i = ir; i < ir + dr; i++) {
-            for (int64_t j = 0; j < group_size_in_row; j++) {
-                calculate_group(i, j);
-            }
-        }
-    }
-
-   private:
-    int64_t input_ne[4];
-    size_t input_stride[4];
-
-    int64_t scale_ne[4];
-    size_t scale_stride[4];
-
-    int64_t indices_ne[4];
-    size_t indices_stride[4];
-
-    int64_t output_ne[4];
-    size_t output_stride[4];
-
-    // First row and row count handled by this block.
-    int64_t ir;
-    int64_t dr;
-
-    // Number of QK8_0-sized groups in one row.
-    int64_t group_size_in_row;
-
-    TPipe pipe;
-    GlobalTensor<int8_t> input_gm;
-    GlobalTensor<half> scale_gm;
-    GlobalTensor<int32_t> indices_gm;
-    GlobalTensor<float> output_gm;
-    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
-    TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
-};
-
-// Copies `size` bytes from the __gm__ address `gm` into the object(s) at
-// `ub`, one byte at a time.
-template <typename T>
-__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
-    auto gm_ptr = (__gm__ uint8_t *)gm;
-    auto ub_ptr = (uint8_t *)(ub);
-    // The counter has the same type as `size`, which avoids a
-    // signed/unsigned comparison and int32 overflow for large sizes.
-    for (size_t i = 0; i < size; ++i) {
-        ub_ptr[i] = gm_ptr[i];
-    }
-}
-
-// Kernel entry point: copies the five shape/stride arrays out of global
-// memory, then initialises a GET_ROW_Q8_0 operator with them and runs it.
-extern "C" __global__ __aicore__ void ascendc_get_row_q8_0(
-    GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
-    GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
-    GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
-    // 32 bytes are copied into each 4-entry parameter array.
-    const size_t param_bytes = 32;
-
-    int64_t input_ne_ub[4];
-    copy_to_ub(input_ne_gm, input_ne_ub, param_bytes);
-
-    int64_t indices_ne_ub[4];
-    copy_to_ub(indices_ne_gm, indices_ne_ub, param_bytes);
-
-    size_t indices_nb_ub[4];
-    copy_to_ub(indices_nb_gm, indices_nb_ub, param_bytes);
-
-    int64_t output_ne_ub[4];
-    copy_to_ub(output_ne_gm, output_ne_ub, param_bytes);
-
-    size_t output_nb_ub[4];
-    copy_to_ub(output_nb_gm, output_nb_ub, param_bytes);
-
-    GET_ROW_Q8_0 get_row;
-    get_row.init(input_gm, indices_gm, output_gm,
-                 input_ne_ub,
-                 indices_ne_ub, indices_nb_ub,
-                 output_ne_ub, output_nb_ub);
-    get_row.calculate();
-}
--- a/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp
+++ b/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp
@ -1,218 +0,0 @@
-#include "kernel_operator.h"
-
-using namespace AscendC;
-#ifdef ASCEND_310P
-    extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
-        GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
-        GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
-        // Stub that only prints an error so later test cases can keep running; a test case that calls this operator still fails.
-        printf("Ascend310P not support f16->8bit quantization.\n");
-    }
-#else
-
-#define BUFFER_NUM 2
-#define QK8_0 32
-
-// Quantizes an f16 tensor to Q8_0. Each group of QK8_0 consecutive input
-// values is converted to QK8_0 int8 values plus one half-precision scale.
-// The int8 values are written from the start of the output buffer; the scales
-// are written after them (see scale_gm in init()).
-class QUANTIZE_F16_Q8_0 {
-   public:
-    __aicore__ inline QUANTIZE_F16_Q8_0() {}
-
-    // Caches shapes and strides, selects the row range [ir, ir + dr) handled
-    // by the current block, and sets up the global tensors and local queues.
-    //
-    // input:        global address of the f16 source data.
-    // output:       global address of the destination buffer.
-    // input_ne_ub:  4 source dimensions.
-    // input_nb_ub:  4 source strides; divided by input_nb_ub[0] to obtain
-    //               strides counted in elements.
-    // output_ne_ub: 4 destination dimensions.
-    __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
-                                int64_t *input_ne_ub, size_t *input_nb_ub,
-                                int64_t *output_ne_ub) {
-        int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
-
-        for (int i = 0; i < 4; i++) {
-            input_ne[i] = input_ne_ub[i];
-            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
-
-            output_ne[i] = output_ne_ub[i];
-        }
-
-        // The output strides are derived from the output dimensions only,
-        // i.e. the int8 data is laid out densely.
-        output_stride[0] = 1;
-        for (int i = 1; i < 4; i++) {
-            output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
-        }
-
-        // One scale per group of QK8_0 values; scale_stride[1] is therefore
-        // the number of groups in a row.
-        scale_ne = input_ne;
-        scale_stride[0] = 1;
-        scale_stride[1] = input_ne[0] / QK8_0;
-        for (int i = 2; i < 4; i++) {
-            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
-        }
-
-        // split input tensor by rows.
-        uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
-        dr = nr / op_block_num;
-
-        // The first `tails` blocks each take one extra row.
-        uint64_t tails = nr % op_block_num;
-        if (op_block_idx < tails) {
-            dr += 1;
-            ir = dr * op_block_idx;
-        } else {
-            ir = dr * op_block_idx + tails;
-        }
-
-        group_size_in_row = scale_stride[1];
-        int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
-                              output_ne[3] * sizeof(uint8_t);
-
-        input_gm.SetGlobalBuffer((__gm__ half *)input);
-        output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
-        // Scales start `output_size` bytes into the output buffer; this block
-        // begins at the scale of its first row.
-        scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + ir *
-                                                 group_size_in_row *
-                                                 sizeof(half)));
-
-        pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(half));
-        pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
-        pipe.InitBuffer(work_queue, 1, 32);
-        pipe.InitBuffer(max_queue, 1, 32);
-        pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
-        pipe.InitBuffer(scale_queue, 1, 32);
-        pipe.InitBuffer(cast_queue ,1 ,QK8_0 * sizeof(float));
-    }
-
-    // Loads one group of QK8_0 f16 values starting at element `offset` of the
-    // input and enqueues it. The offset is 64-bit so that the int64_t offsets
-    // computed in calculate_group() are not truncated.
-    __aicore__ inline void copy_in(uint64_t offset) {
-        LocalTensor<half> input_local = input_queue.AllocTensor<half>();
-        DataCopy(input_local, input_gm[offset], QK8_0);
-        input_queue.EnQue(input_local);
-    }
-
-    // Dequeues one quantized group and stores its QK8_0 int8 values starting
-    // at element `offset` of the output.
-    __aicore__ inline void copy_out(uint64_t offset) {
-        LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
-        DataCopy(output_gm[offset], output_local, QK8_0);
-        output_queue.FreeTensor(output_local);
-    }
-
-    // Quantizes group `group` of flattened row `row` and writes the int8
-    // values to the output. Returns the scale of the group as half.
-    __aicore__ inline half calculate_group(int64_t row, int64_t group) {
-        // Decompose the flattened row index into (i1, i2, i3).
-        const int64_t i3 = row / (input_ne[1] * input_ne[2]);
-        const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
-        const int64_t i1 =
-            row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
-
-        const int64_t input_offset = i1 * input_stride[1] +
-                                     i2 * input_stride[2] +
-                                     i3 * input_stride[3] + QK8_0 * group;
-
-        const int64_t output_offset = i1 * output_stride[1] +
-                                      i2 * output_stride[2] +
-                                      i3 * output_stride[3] + QK8_0 * group;
-
-        copy_in(input_offset);
-        LocalTensor<half> input_local = input_queue.DeQue<half>();
-        LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
-        LocalTensor<float> work_local = work_queue.AllocTensor<float>();
-        LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
-        LocalTensor<float> max_local = max_queue.AllocTensor<float>();
-        LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
-
-        // Widen to float, then find the largest absolute value of the group.
-        Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
-        Abs(abs_local, cast_local, QK8_0);
-        ReduceMax(max_local, abs_local, work_local, QK8_0);
-
-        // Wait for the reduction before reading its result as a scalar.
-        pipe_barrier(PIPE_ALL);
-        float d = max_local.GetValue(0);
-        // Scale = max(|x|) / 127. An all-zero group is left unscaled.
-        d = d / ((1 << 7) - 1);
-        if (d != 0) {
-            Muls(cast_local, cast_local, 1.0f / d, QK8_0);
-        }
-
-        // Round in float, then narrow float -> half -> int8. input_local is
-        // reused as the half staging buffer.
-        Cast(cast_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
-        Cast(input_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
-        Cast(output_local, input_local, RoundMode::CAST_ROUND, QK8_0);
-        output_queue.EnQue(output_local);
-        copy_out(output_offset);
-
-        input_queue.FreeTensor(input_local);
-        work_queue.FreeTensor(work_local);
-        abs_queue.FreeTensor(abs_local);
-        max_queue.FreeTensor(max_local);
-        cast_queue.FreeTensor(cast_local);
-        return (half)d;
-    }
-
-    // Quantizes every group of the rows owned by this block. Scales are
-    // collected in a local tensor and flushed to scale_gm 16 at a time; a
-    // final partial batch is written with DataCopyPad.
-    __aicore__ inline void calculate() {
-        LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
-        uint32_t scale_local_offset = 0;
-        uint64_t scale_global_offset = 0;
-        for (int64_t i = ir; i < ir + dr; i++) {
-            for (int64_t j = 0; j < group_size_in_row; j++) {
-                half scale = calculate_group(i, j);
-                scale_local.SetValue(scale_local_offset++, scale);
-                if (scale_local_offset == 16) {
-                    scale_local_offset = 0;
-                    // TODO: OPTIMIZE ME
-                    pipe_barrier(PIPE_ALL);
-                    DataCopy(scale_gm[scale_global_offset], scale_local, 16);
-                    pipe_barrier(PIPE_ALL);
-                    scale_global_offset += 16;
-                }
-            }
-        }
-
-        if (scale_local_offset != 0) {
-            pipe_barrier(PIPE_ALL);
-            DataCopyExtParams dataCopyParams;
-            dataCopyParams.blockCount = 1;
-            dataCopyParams.blockLen = scale_local_offset * sizeof(half);
-            DataCopyPad(scale_gm[scale_global_offset], scale_local,
-                        dataCopyParams);
-            pipe_barrier(PIPE_ALL);
-        }
-        // Return the staging tensor allocated at the top of this function.
-        scale_queue.FreeTensor(scale_local);
-    }
-
-   private:
-    // Source dimensions and strides (in elements).
-    int64_t input_ne[4];
-    size_t input_stride[4];
-
-    // Points at input_ne (set in init()); strides of the scale area.
-    int64_t *scale_ne;
-    size_t scale_stride[4];
-
-    // Destination dimensions and strides (in elements).
-    int64_t output_ne[4];
-    size_t output_stride[4];
-
-    // Number of QK8_0-sized groups in one row.
-    int64_t group_size_in_row;
-
-    // First row and number of rows processed by this block.
-    int64_t ir;
-    int64_t dr;
-
-    TPipe pipe;
-    GlobalTensor<half> input_gm;
-    GlobalTensor<half> scale_gm;
-    GlobalTensor<int8_t> output_gm;
-    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
-    TQue<QuePosition::VECIN, 1> work_queue;
-    TQue<QuePosition::VECOUT, 1> max_queue;
-    TQue<QuePosition::VECIN, 1> abs_queue;
-    TQue<QuePosition::VECOUT, 1> scale_queue;
-    TQue<QuePosition::VECOUT, 1> cast_queue;
-
-};
-
-// Copies `size` bytes, one byte at a time, from the global-memory address
-// `gm` into the buffer pointed to by `ub`.
-//
-// gm:   source address (accessed through a `__gm__ uint8_t *`).
-// ub:   destination buffer; must hold at least `size` bytes.
-// size: number of bytes to copy.
-template <typename T>
-__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
-    auto gm_ptr = (__gm__ uint8_t *)gm;
-    auto ub_ptr = (uint8_t *)(ub);
-    // The index has the same type as `size`, so the loop condition is not a
-    // mixed signed/unsigned comparison and cannot wrap for large byte counts.
-    for (size_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
-        *ub_ptr = *gm_ptr;
-    }
-}
-
-// Kernel entry point for f16 -> Q8_0 quantization: copies 32 bytes of each
-// shape/stride array into a 4-element local array, then runs the operator.
-extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
-    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
-    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
-    // Source dimensions.
-    int64_t src_ne[4];
-    copy_to_ub(input_ne_gm, src_ne, 32);
-
-    // Source strides.
-    size_t src_nb[4];
-    copy_to_ub(input_nb_gm, src_nb, 32);
-
-    // Destination dimensions.
-    int64_t dst_ne[4];
-    copy_to_ub(output_ne_gm, dst_ne, 32);
-
-    QUANTIZE_F16_Q8_0 quantizer;
-    quantizer.init(input_gm, output_gm, src_ne, src_nb, dst_ne);
-    quantizer.calculate();
-}
-
-#endif // #ifdef ASCEND_310P
--- a/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp
+++ b/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp
@ -1,216 +0,0 @@
-#include "kernel_operator.h"
-
-using namespace AscendC;
-#ifdef ASCEND_310P // 310P not support f32->8bit quantization
-    extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
-        GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
-        GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
-        // Stub that only prints an error so later test cases can keep running; a test case that calls this operator still fails.
-        printf("Ascend310P not support f32->8bit quantization.\n");
-    }
-#else
-
-#define BUFFER_NUM 2
-#define QK8_0 32
-
-// Quantizes an f32 tensor to Q8_0. Each group of QK8_0 consecutive input
-// values is converted to QK8_0 int8 values plus one half-precision scale.
-// The int8 values are written from the start of the output buffer; the scales
-// are written after them (see scale_gm in init()).
-class QUANTIZE_F32_Q8_0 {
-   public:
-    __aicore__ inline QUANTIZE_F32_Q8_0() {}
-
-    // Caches shapes and strides, selects the row range [ir, ir + dr) handled
-    // by the current block, and sets up the global tensors and local queues.
-    //
-    // input:        global address of the f32 source data.
-    // output:       global address of the destination buffer.
-    // input_ne_ub:  4 source dimensions.
-    // input_nb_ub:  4 source strides; divided by input_nb_ub[0] to obtain
-    //               strides counted in elements.
-    // output_ne_ub: 4 destination dimensions.
-    __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
-                                int64_t *input_ne_ub, size_t *input_nb_ub,
-                                int64_t *output_ne_ub) {
-        int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
-
-        for (int i = 0; i < 4; i++) {
-            input_ne[i] = input_ne_ub[i];
-            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
-
-            output_ne[i] = output_ne_ub[i];
-        }
-
-        // The output strides are derived from the output dimensions only,
-        // i.e. the int8 data is laid out densely.
-        output_stride[0] = 1;
-        for (int i = 1; i < 4; i++) {
-            output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
-        }
-
-        // One scale per group of QK8_0 values; scale_stride[1] is therefore
-        // the number of groups in a row.
-        scale_ne = input_ne;
-        scale_stride[0] = 1;
-        scale_stride[1] = input_ne[0] / QK8_0;
-        for (int i = 2; i < 4; i++) {
-            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
-        }
-
-        // split input tensor by rows.
-        uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
-        dr = nr / op_block_num;
-
-        // The first `tails` blocks each take one extra row.
-        uint64_t tails = nr % op_block_num;
-        if (op_block_idx < tails) {
-            dr += 1;
-            ir = dr * op_block_idx;
-        } else {
-            ir = dr * op_block_idx + tails;
-        }
-
-        group_size_in_row = scale_stride[1];
-        int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
-                              output_ne[3] * sizeof(uint8_t);
-
-        input_gm.SetGlobalBuffer((__gm__ float *)input);
-        output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
-        // Scales start `output_size` bytes into the output buffer; this block
-        // begins at the scale of its first row.
-        scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size +
-                                                 ir * group_size_in_row *
-                                                 sizeof(half)));
-
-        pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(float));
-        pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
-        pipe.InitBuffer(work_queue, 1, 32);
-        pipe.InitBuffer(max_queue, 1, 32);
-        pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
-        pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(half));
-        pipe.InitBuffer(scale_queue, 1, 32);
-    }
-
-    // Loads one group of QK8_0 f32 values starting at element `offset` of the
-    // input and enqueues it. The offset is 64-bit so that the int64_t offsets
-    // computed in calculate_group() are not truncated.
-    __aicore__ inline void copy_in(uint64_t offset) {
-        LocalTensor<float> input_local = input_queue.AllocTensor<float>();
-        DataCopy(input_local, input_gm[offset], QK8_0);
-        input_queue.EnQue(input_local);
-    }
-
-    // Dequeues one quantized group and stores its QK8_0 int8 values starting
-    // at element `offset` of the output.
-    __aicore__ inline void copy_out(uint64_t offset) {
-        LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
-        DataCopy(output_gm[offset], output_local, QK8_0);
-        output_queue.FreeTensor(output_local);
-    }
-
-    // Quantizes group `group` of flattened row `row` and writes the int8
-    // values to the output. Returns the scale of the group as half.
-    __aicore__ inline half calculate_group(int64_t row, int64_t group) {
-        // Decompose the flattened row index into (i1, i2, i3).
-        const int64_t i3 = row / (input_ne[1] * input_ne[2]);
-        const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
-        const int64_t i1 =
-            row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
-
-        const int64_t input_offset = i1 * input_stride[1] +
-                                     i2 * input_stride[2] +
-                                     i3 * input_stride[3] + QK8_0 * group;
-
-        const int64_t output_offset = i1 * output_stride[1] +
-                                      i2 * output_stride[2] +
-                                      i3 * output_stride[3] + QK8_0 * group;
-
-        copy_in(input_offset);
-        LocalTensor<float> input_local = input_queue.DeQue<float>();
-        LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
-        LocalTensor<float> work_local = work_queue.AllocTensor<float>();
-        LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
-        LocalTensor<float> max_local = max_queue.AllocTensor<float>();
-        LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
-
-        // Find the largest absolute value of the group.
-        Abs(abs_local, input_local, QK8_0);
-        ReduceMax(max_local, abs_local, work_local, QK8_0);
-        // Wait for the reduction before reading its result as a scalar.
-        pipe_barrier(PIPE_ALL);
-        float d = max_local.GetValue(0);
-        // Scale = max(|x|) / 127. An all-zero group is left unscaled.
-        d = d / ((1 << 7) - 1);
-        if (d != 0) {
-            Muls(input_local, input_local, 1.0f / d, QK8_0);
-        }
-
-        // Round in float (in place), then narrow float -> half -> int8.
-        Cast(input_local, input_local, RoundMode::CAST_ROUND, QK8_0);
-        Cast(cast_local, input_local, RoundMode::CAST_ROUND, QK8_0);
-        Cast(output_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
-        output_queue.EnQue(output_local);
-        copy_out(output_offset);
-
-        input_queue.FreeTensor(input_local);
-        work_queue.FreeTensor(work_local);
-        abs_queue.FreeTensor(abs_local);
-        max_queue.FreeTensor(max_local);
-        cast_queue.FreeTensor(cast_local);
-
-        return (half)d;
-    }
-
-    // Quantizes every group of the rows owned by this block. Scales are
-    // collected in a local tensor and flushed to scale_gm 16 at a time; a
-    // final partial batch is written with DataCopyPad.
-    __aicore__ inline void calculate() {
-        LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
-        uint32_t scale_local_offset = 0;
-        uint64_t scale_global_offset = 0;
-        for (int64_t i = ir; i < ir + dr; i++) {
-            for (int64_t j = 0; j < group_size_in_row; j++) {
-                half scale = calculate_group(i, j);
-                scale_local.SetValue(scale_local_offset++, scale);
-                if (scale_local_offset == 16) {
-                    scale_local_offset = 0;
-                    // TODO: OPTIMIZE ME
-                    pipe_barrier(PIPE_ALL);
-                    DataCopy(scale_gm[scale_global_offset], scale_local, 16);
-                    pipe_barrier(PIPE_ALL);
-                    scale_global_offset += 16;
-                }
-            }
-        }
-
-        if (scale_local_offset != 0) {
-            pipe_barrier(PIPE_ALL);
-            DataCopyExtParams dataCopyParams;
-            dataCopyParams.blockCount = 1;
-            dataCopyParams.blockLen = scale_local_offset * sizeof(half);
-            DataCopyPad(scale_gm[scale_global_offset], scale_local,
-                        dataCopyParams);
-            pipe_barrier(PIPE_ALL);
-        }
-        // Return the staging tensor allocated at the top of this function.
-        scale_queue.FreeTensor(scale_local);
-    }
-
-   private:
-    // Source dimensions and strides (in elements).
-    int64_t input_ne[4];
-    size_t input_stride[4];
-
-    // Points at input_ne (set in init()); strides of the scale area.
-    int64_t *scale_ne;
-    size_t scale_stride[4];
-
-    // Destination dimensions and strides (in elements).
-    int64_t output_ne[4];
-    size_t output_stride[4];
-
-    // Number of QK8_0-sized groups in one row.
-    int64_t group_size_in_row;
-
-    // First row and number of rows processed by this block.
-    int64_t ir;
-    int64_t dr;
-
-    TPipe pipe;
-    GlobalTensor<float> input_gm;
-    GlobalTensor<half> scale_gm;
-    GlobalTensor<int8_t> output_gm;
-    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
-    TQue<QuePosition::VECIN, 1> work_queue;
-    TQue<QuePosition::VECOUT, 1> max_queue;
-    TQue<QuePosition::VECIN, 1> abs_queue;
-    TQue<QuePosition::VECIN, 1> cast_queue;
-    TQue<QuePosition::VECOUT, 1> scale_queue;
-};
-
-// Copies `size` bytes, one byte at a time, from the global-memory address
-// `gm` into the buffer pointed to by `ub`.
-//
-// gm:   source address (accessed through a `__gm__ uint8_t *`).
-// ub:   destination buffer; must hold at least `size` bytes.
-// size: number of bytes to copy.
-template <typename T>
-__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
-    auto gm_ptr = (__gm__ uint8_t *)gm;
-    auto ub_ptr = (uint8_t *)(ub);
-    // The index has the same type as `size`, so the loop condition is not a
-    // mixed signed/unsigned comparison and cannot wrap for large byte counts.
-    for (size_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
-        *ub_ptr = *gm_ptr;
-    }
-}
-
-// Kernel entry point for f32 -> Q8_0 quantization: copies 32 bytes of each
-// shape/stride array into a 4-element local array, then runs the operator.
-extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
-    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
-    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
-    // Source dimensions.
-    int64_t src_ne[4];
-    copy_to_ub(input_ne_gm, src_ne, 32);
-
-    // Source strides.
-    size_t src_nb[4];
-    copy_to_ub(input_nb_gm, src_nb, 32);
-
-    // Destination dimensions.
-    int64_t dst_ne[4];
-    copy_to_ub(output_ne_gm, dst_ne, 32);
-
-    QUANTIZE_F32_Q8_0 quantizer;
-    quantizer.init(input_gm, output_gm, src_ne, src_nb, dst_ne);
-    quantizer.calculate();
-}
-
-#endif // #ifdef ASCEND_310P
--- a/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp
+++ b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp
@ -1,295 +0,0 @@
-#include "kernel_operator.h"
-
-using namespace AscendC;
-#ifdef ASCEND_310P // 310P not support float->4bit quantization
-    extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
-        GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
-        GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
-        // Stub that only prints an error so later test cases can keep running; a test case that calls this operator still fails.
-        printf("Ascend310P not support f32->4bit quantization.\n");
-    }
-
-    extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
-        GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
-        GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
-        // Stub that only prints an error so later test cases can keep running; a test case that calls this operator still fails.
-        printf("Ascend310P not support f16->4bit quantization.\n");
-    }
-#else
-
-#define BUFFER_NUM 2
-#define Group_Size 32
-
-// Quantizes a tensor of SRC_T (instantiated in this file with half and
-// float) to Q4_0. Each group of Group_Size consecutive input values is
-// converted to Group_Size 4-bit values (Group_Size / 2 bytes) plus one
-// half-precision scale. The packed values are written from the start of the
-// output buffer; the scales are written after them (see scale_gm in init()).
-template <typename SRC_T>
-class QUANTIZE_FLOAT_TO_Q4_0 {
-   public:
-    __aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {}
-
-    // Caches shapes and strides, selects the row range [ir, ir + dr) handled
-    // by the current block, and sets up the global tensors and local queues.
-    //
-    // input:        global address of the source data.
-    // output:       global address of the destination buffer.
-    // input_ne_ub:  4 source dimensions.
-    // input_nb_ub:  4 source strides; divided by input_nb_ub[0] to obtain
-    //               strides counted in elements.
-    // output_ne_ub: 4 destination dimensions.
-    __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
-                                int64_t *input_ne_ub, size_t *input_nb_ub,
-                                int64_t *output_ne_ub) {
-        // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4],
-        //                         permute=[0,0,0,0]):
-        // [CPY] NMSE = 0.000008343 > 0.000001000 FAIL
-        int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
-
-        // input stride of data elements
-        for (int i = 0; i < 4; i++) {
-            input_ne[i] = input_ne_ub[i];
-            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
-            output_ne[i] = output_ne_ub[i];
-        }
-
-        // output stride of data elements
-        output_stride[0] = 1;
-        for (int i = 1; i < 4; i++) {
-            output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
-        }
-
-        // scale saved one by one after data:. [group1_scale, group2_scale, ...]
-        scale_ne = input_ne;
-        scale_stride[0] = 1;
-        scale_stride[1] = input_ne[0] / Group_Size;
-        for (int i = 2; i < 4; i++) {
-            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
-        }
-
-        // split input tensor by rows.
-        uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
-        dr = nr / op_block_num;
-
-        // The first `tails` blocks each take one extra row.
-        uint64_t tails = nr % op_block_num;
-        if (op_block_idx < tails) {
-            dr += 1;
-            ir = dr * op_block_idx;
-        } else {
-            ir = dr * op_block_idx + tails;
-        }
-
-        group_size_in_row = scale_stride[1];
-        // Two 4-bit values share one byte, hence the division by 2.
-        int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] *
-                              output_ne[3] * sizeof(uint8_t) / 2;
-
-        input_gm.SetGlobalBuffer((__gm__ SRC_T *)input);
-        output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
-        // Scales start `scale_offset` bytes into the output buffer; this
-        // block begins at the scale of its first row.
-        scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir *
-                                                 group_size_in_row *
-                                                 sizeof(half)));
-
-        pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T));
-        pipe.InitBuffer(output_queue, BUFFER_NUM,
-                            Group_Size * sizeof(int8_t) / 2);
-        pipe.InitBuffer(cast_queue , 1, Group_Size * sizeof(float));
-        pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float));
-        pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float));
-        pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float));
-        pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half));
-        pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half));
-    }
-
-    // Loads one group of Group_Size source values starting at element
-    // `offset` of the input and enqueues it. The offset is 64-bit so that the
-    // int64_t offsets computed in calculate_group() are not truncated.
-    __aicore__ inline void copy_in(uint64_t offset) {
-        LocalTensor<SRC_T> input_local = input_queue.AllocTensor<SRC_T>();
-        DataCopy(input_local, input_gm[offset], Group_Size);
-        input_queue.EnQue(input_local);
-    }
-
-    // Dequeues one quantized group and stores its Group_Size / 2 bytes
-    // starting at byte `offset` of the output.
-    __aicore__ inline void copy_out(uint64_t offset) {
-        // reinterpretcast Group_Size(32) * int4b_t to Group_Size / 2 * int8_t,
-        // and using DataCopyPad to avoid 32 bits align.
-        LocalTensor<int4b_t> output_local = output_queue.DeQue<int4b_t>();
-        LocalTensor<int8_t> output_int8_local =
-                                    output_local.ReinterpretCast<int8_t>();
-
-        DataCopyExtParams dataCopyParams;
-        dataCopyParams.blockCount = 1;
-        dataCopyParams.blockLen = Group_Size / 2  * sizeof(int8_t);
-        DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams);
-
-        output_queue.FreeTensor(output_local);
-    }
-
-    // float source: plain copy into the float working tensor.
-    __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
-                                         LocalTensor<float> input_local) {
-        DataCopy(cast_local, input_local, Group_Size);
-    }
-
-    // half source: widen into the float working tensor.
-    __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
-                                         LocalTensor<half> input_local) {
-        Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size);
-    }
-
-    // Quantizes group `group` of flattened row `row` and writes the packed
-    // 4-bit values to the output. Returns the scale of the group as half.
-    __aicore__ inline half calculate_group(int64_t row, int64_t group) {
-        // Decompose the flattened row index into (i1, i2, i3).
-        const int64_t i3 = row / (input_ne[1] * input_ne[2]);
-        const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
-        const int64_t i1 =
-            row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
-
-        const int64_t input_offset = i1 * input_stride[1] +
-                                     i2 * input_stride[2] +
-                                     i3 * input_stride[3] + Group_Size * group;
-
-        // output_offset is stride for output_gm which datatype is int8_t and
-        // divided by 2 is needed for int4b_t.
-        const int64_t output_offset = (i1 * output_stride[1] +
-                                       i2 * output_stride[2] +
-                                       i3 * output_stride[3] +
-                                       Group_Size * group) / 2;
-        copy_in(input_offset);
-
-        LocalTensor<SRC_T> input_local = input_queue.DeQue<SRC_T>();
-        LocalTensor<int4b_t> output_local = output_queue.AllocTensor<int4b_t>();
-        LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
-        LocalTensor<float> work_local = work_queue.AllocTensor<float>();
-        LocalTensor<float> max_local = max_queue.AllocTensor<float>();
-        LocalTensor<float> min_local = min_queue.AllocTensor<float>();
-        LocalTensor<half> half_local = half_queue.AllocTensor<half>();
-
-        input_to_cast(cast_local, input_local);
-
-        ReduceMax(max_local, cast_local, work_local, Group_Size);
-        ReduceMin(min_local, cast_local, work_local, Group_Size);
-        // Wait for both reductions before reading their results as scalars,
-        // as the Q8_0 quantize kernels do before their GetValue call.
-        pipe_barrier(PIPE_ALL);
-        const float max_value = max_local.GetValue(0);
-        const float min_value = min_local.GetValue(0);
-        // Pick whichever extreme has the larger magnitude, keeping its sign.
-        float d = max_value;
-        if (min_value < 0 && (-1 * min_value) > max_value) {
-            d = min_value;
-        }
-
-        // Scale = extreme / -8. An all-zero group is left unscaled.
-        d = d / (-8);
-        if (d != 0) {
-            Muls(cast_local, cast_local, 1.0f / d, Group_Size);
-        }
-
-        // range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7]
-        float scalar = 8.5f;
-        Adds(cast_local, cast_local, scalar, Group_Size);
-        Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size);
-        scalar = 15.0f;
-        Mins(cast_local, cast_local, scalar, Group_Size);
-        scalar = -8.0f;
-        Adds(cast_local, cast_local, scalar, Group_Size);
-
-        // float->half->int4b
-        Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size);
-        Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size);
-
-        output_queue.EnQue(output_local);
-        copy_out(output_offset);
-
-        input_queue.FreeTensor(input_local);
-        work_queue.FreeTensor(work_local);
-        max_queue.FreeTensor(max_local);
-        min_queue.FreeTensor(min_local);
-        half_queue.FreeTensor(half_local);
-        cast_queue.FreeTensor(cast_local);
-        return (half)d;
-    }
-
-    // Quantizes every group of the rows owned by this block. Scales are
-    // collected in a local tensor and flushed to scale_gm Group_Size / 2 at a
-    // time; a final partial batch is written with DataCopyPad.
-    __aicore__ inline void calculate() {
-        LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
-        uint32_t scale_local_offset = 0;
-        uint64_t scale_global_offset = 0;
-        for (int64_t i = ir; i < ir + dr; i++) {
-            for (int64_t j = 0; j < group_size_in_row; j++) {
-                half scale = calculate_group(i, j);
-                scale_local.SetValue(scale_local_offset++, scale);
-                // Copy Group_Size/2 length data each time.
-                if (scale_local_offset == Group_Size / 2) {
-                    scale_local_offset = 0;
-                    // TODO: OPTIMIZE ME
-                    pipe_barrier(PIPE_ALL);
-                    DataCopy(scale_gm[scale_global_offset], scale_local,
-                                      Group_Size / 2);
-                    pipe_barrier(PIPE_ALL);
-                    scale_global_offset += Group_Size / 2;
-                }
-            }
-        }
-
-        if (scale_local_offset != 0) {
-            pipe_barrier(PIPE_ALL);
-            DataCopyExtParams dataCopyParams;
-            dataCopyParams.blockCount = 1;
-            dataCopyParams.blockLen = scale_local_offset * sizeof(half);
-            DataCopyPad(scale_gm[scale_global_offset], scale_local,
-                        dataCopyParams);
-            pipe_barrier(PIPE_ALL);
-        }
-        scale_queue.FreeTensor(scale_local);
-    }
-
-   private:
-    // Source dimensions and strides (in elements).
-    int64_t input_ne[4];
-    size_t input_stride[4];
-
-    // Points at input_ne (set in init()); strides of the scale area.
-    int64_t *scale_ne;
-    size_t scale_stride[4];
-
-    // Destination dimensions and strides (in elements).
-    int64_t output_ne[4];
-    size_t output_stride[4];
-
-    // Number of Group_Size-sized groups in one row.
-    int64_t group_size_in_row;
-
-    // First row and number of rows processed by this block.
-    int64_t ir;
-    int64_t dr;
-
-    TPipe pipe;
-    GlobalTensor<SRC_T> input_gm;
-    GlobalTensor<half> scale_gm;
-    GlobalTensor<int8_t> output_gm;
-    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
-    TQue<QuePosition::VECIN, BUFFER_NUM> work_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> max_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> min_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> scale_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> cast_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> half_queue;
-};
-
-// Copies `size` bytes, one byte at a time, from the global-memory address
-// `gm` into the buffer pointed to by `ub`.
-//
-// gm:   source address (accessed through a `__gm__ uint8_t *`).
-// ub:   destination buffer; must hold at least `size` bytes.
-// size: number of bytes to copy.
-template <typename T>
-__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
-    auto gm_ptr = (__gm__ uint8_t *)gm;
-    auto ub_ptr = (uint8_t *)(ub);
-    // The index has the same type as `size`, so the loop condition is not a
-    // mixed signed/unsigned comparison and cannot wrap for large byte counts.
-    for (size_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
-        *ub_ptr = *gm_ptr;
-    }
-}
-
-// Kernel entry point for f16 -> Q4_0 quantization: copies 32 bytes of each
-// shape/stride array into a 4-element local array, then runs the operator.
-extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
-    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
-    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
-    // Source dimensions.
-    int64_t src_ne[4];
-    copy_to_ub(input_ne_gm, src_ne, 32);
-
-    // Source strides.
-    size_t src_nb[4];
-    copy_to_ub(input_nb_gm, src_nb, 32);
-
-    // Destination dimensions.
-    int64_t dst_ne[4];
-    copy_to_ub(output_ne_gm, dst_ne, 32);
-
-    QUANTIZE_FLOAT_TO_Q4_0<half> quantizer;
-    quantizer.init(input_gm, output_gm, src_ne, src_nb, dst_ne);
-    quantizer.calculate();
-}
-
-// Kernel entry point for f32 -> Q4_0 quantization: copies 32 bytes of each
-// shape/stride array into a 4-element local array, then runs the operator.
-extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
-    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
-    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
-    // Source dimensions.
-    int64_t src_ne[4];
-    copy_to_ub(input_ne_gm, src_ne, 32);
-
-    // Source strides.
-    size_t src_nb[4];
-    copy_to_ub(input_nb_gm, src_nb, 32);
-
-    // Destination dimensions.
-    int64_t dst_ne[4];
-    copy_to_ub(output_ne_gm, dst_ne, 32);
-
-    QUANTIZE_FLOAT_TO_Q4_0<float> quantizer;
-    quantizer.init(input_gm, output_gm, src_ne, src_nb, dst_ne);
-    quantizer.calculate();
-}
-
-#endif // #ifdef ASCEND_310P
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@ -1,489 +0,0 @@
-function(ggml_add_cpu_backend_variant_impl tag_name)
-    # Target name: "ggml-cpu" by default, "ggml-cpu-<tag_name>" when a tag is given.
-    set(GGML_CPU_NAME ggml-cpu)
-    if (tag_name)
-        set(GGML_CPU_NAME ggml-cpu-${tag_name})
-    endif()
-
-    ggml_add_backend_library(${GGML_CPU_NAME})
-
-    # Sources and headers compiled into every variant; optional components
-    # append to this list further down.
-    list (APPEND GGML_CPU_SOURCES
-        ggml-cpu/ggml-cpu.c           ggml-cpu/ggml-cpu.cpp
-        ggml-cpu/ggml-cpu-aarch64.cpp ggml-cpu/ggml-cpu-aarch64.h
-        ggml-cpu/ggml-cpu-hbm.cpp     ggml-cpu/ggml-cpu-hbm.h
-        ggml-cpu/ggml-cpu-quants.c    ggml-cpu/ggml-cpu-quants.h
-        ggml-cpu/ggml-cpu-traits.cpp  ggml-cpu/ggml-cpu-traits.h
-        ggml-cpu/amx/amx.cpp          ggml-cpu/amx/amx.h
-        ggml-cpu/amx/mmq.cpp          ggml-cpu/amx/mmq.h
-        ggml-cpu/ggml-cpu-impl.h
-        ggml-cpu/common.h
-        ggml-cpu/binary-ops.h         ggml-cpu/binary-ops.cpp
-        ggml-cpu/unary-ops.h          ggml-cpu/unary-ops.cpp
-        ggml-cpu/simd-mappings.h
-        ggml-cpu/vec.h                ggml-cpu/vec.cpp
-        ggml-cpu/ops.h                ggml-cpu/ops.cpp
-        )
-
-    # C11 / C++17 are required to build the backend itself (not propagated to consumers).
-    target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17)
-    target_include_directories(${GGML_CPU_NAME} PRIVATE . ggml-cpu)
-
-    # Optional Accelerate framework support (Apple only, opt-in via GGML_ACCELERATE).
-    # A missing framework only produces a warning; the build continues without it.
-    if (APPLE AND GGML_ACCELERATE)
-        find_library(ACCELERATE_FRAMEWORK Accelerate)
-        if (NOT ACCELERATE_FRAMEWORK)
-            message(WARNING "Accelerate framework not found")
-        else()
-            message(STATUS "Accelerate framework found")
-
-            target_compile_definitions(${GGML_CPU_NAME} PRIVATE
-                GGML_USE_ACCELERATE
-                ACCELERATE_NEW_LAPACK
-                ACCELERATE_LAPACK_ILP64)
-
-            target_link_libraries(${GGML_CPU_NAME} PRIVATE ${ACCELERATE_FRAMEWORK})
-        endif()
-    endif()
-
-    # Optional OpenMP support: enabled only when the package is found,
-    # otherwise a warning is printed and the backend is built without it.
-    if (GGML_OPENMP)
-        find_package(OpenMP)
-        if (NOT OpenMP_FOUND)
-            message(WARNING "OpenMP not found")
-        else()
-            target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)
-
-            target_link_libraries(${GGML_CPU_NAME} PRIVATE
-                OpenMP::OpenMP_C
-                OpenMP::OpenMP_CXX)
-        endif()
-    endif()
-
-    # Optional llamafile sgemm kernels: adds the sources and the matching define.
-    if (GGML_LLAMAFILE)
-        list(APPEND GGML_CPU_SOURCES
-            ggml-cpu/llamafile/sgemm.cpp
-            ggml-cpu/llamafile/sgemm.h)
-
-        target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_LLAMAFILE)
-    endif()
-
-    # Optional CPU HBM support; configuration fails if memkind cannot be found.
-    if (GGML_CPU_HBM)
-        find_library(memkind memkind REQUIRED)
-        message(STATUS "Using memkind for CPU HBM")
-
-        target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_HBM)
-        # PUBLIC: the memkind link requirement is propagated to consumers of the backend.
-        target_link_libraries(${GGML_CPU_NAME} PUBLIC memkind)
-    endif()
-
-    if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
-        CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
-        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
-            CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
-
-        message(STATUS "ARM detected")
-
-        # ARM flag selection. MSVC's own compiler is rejected; clang (including
-        # clang in MSVC mode) and other compilers go through the probes below.
-        if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
-            message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
-        else()
-            check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
-            if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
-                list(APPEND ARCH_FLAGS -mfp16-format=ieee)
-            endif()
-
-            # Empty input for the compiler probes below. It is chosen once, by host
-            # system, so that both probes also work on Windows hosts where
-            # /dev/null does not exist.
-            if (CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
-                set(FEAT_INPUT_FILE "NUL")
-            else()
-                set(FEAT_INPUT_FILE "/dev/null")
-            endif()
-
-            if (GGML_NATIVE)
-                # -mcpu=native does not always enable all the features in some compilers,
-                # so we check for them manually and enable them if available
-
-                # Ask the compiler which -mcpu value "native" resolves to; the verbose
-                # output is captured from stderr.
-                execute_process(
-                    COMMAND ${CMAKE_C_COMPILER} -mcpu=native -E -v -
-                    INPUT_FILE ${FEAT_INPUT_FILE}
-                    OUTPUT_QUIET
-                    ERROR_VARIABLE ARM_MCPU
-                    RESULT_VARIABLE ARM_MCPU_RESULT
-                )
-                if (NOT ARM_MCPU_RESULT)
-                    string(REGEX MATCH "-mcpu=[^ ']+" ARM_MCPU_FLAG "${ARM_MCPU}")
-                endif()
-                # Fall back to plain -mcpu=native when the probe failed or printed no -mcpu.
-                if ("${ARM_MCPU_FLAG}" STREQUAL "")
-                    set(ARM_MCPU_FLAG -mcpu=native)
-                    message(STATUS "ARM -mcpu not found, -mcpu=native will be used")
-                endif()
-
-                include(CheckCXXSourceRuns)
-
-                # check_arm_feature(<tag> <code>)
-                # Appends "+<tag>" to ARM_MCPU_FLAG_FIX (in the caller's scope) when
-                # <code> compiles and runs with "${ARM_MCPU_FLAG}+<tag>"; otherwise
-                # appends "+no<tag>" if the compiler accepts that spelling.
-                function(check_arm_feature tag code)
-                    set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
-                    set(CMAKE_REQUIRED_FLAGS "${ARM_MCPU_FLAG}+${tag}")
-                    check_cxx_source_runs("${code}" GGML_MACHINE_SUPPORTS_${tag})
-                    if (GGML_MACHINE_SUPPORTS_${tag})
-                        set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+${tag}" PARENT_SCOPE)
-                    else()
-                        set(CMAKE_REQUIRED_FLAGS "${ARM_MCPU_FLAG}+no${tag}")
-                        check_cxx_source_compiles("int main() { return 0; }" GGML_MACHINE_SUPPORTS_no${tag})
-                        if (GGML_MACHINE_SUPPORTS_no${tag})
-                            set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+no${tag}" PARENT_SCOPE)
-                        endif()
-                    endif()
-                    set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
-                endfunction()
-
-                check_arm_feature(dotprod "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }")
-                check_arm_feature(i8mm    "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }")
-                check_arm_feature(sve     "#include <arm_sve.h>\nint main()  { svfloat32_t _a, _b; volatile svfloat32_t _c = svadd_f32_z(svptrue_b8(), _a, _b); return 0; }")
-                check_arm_feature(sme     "#include <arm_sme.h>\n__arm_locally_streaming int main() { __asm__ volatile(\"smstart; smstop;\"); return 0; }")
-
-                list(APPEND ARCH_FLAGS "${ARM_MCPU_FLAG}${ARM_MCPU_FLAG_FIX}")
-            else()
-                if (GGML_CPU_ARM_ARCH)
-                    list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
-                endif()
-            endif()
-
-            # show enabled features: dump the predefined macros for the chosen
-            # flags and report the __ARM_FEATURE_* ones that are set to 1
-            execute_process(
-                COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E -
-                INPUT_FILE ${FEAT_INPUT_FILE}
-                OUTPUT_VARIABLE ARM_FEATURE
-                RESULT_VARIABLE ARM_FEATURE_RESULT
-            )
-            if (ARM_FEATURE_RESULT)
-                message(WARNING "Failed to get ARM features")
-            else()
-                foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
-                    string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
-                    if (NOT ${feature_pos} EQUAL -1)
-                        message(STATUS "ARM feature ${feature} enabled")
-                    endif()
-                endforeach()
-            endif()
-        endif()
-    elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
-            (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
-            CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
-
-        message(STATUS "x86 detected")
-
-        if (MSVC)
-            # MSVC: one /arch: level is selected (AVX512 > AVX2 > AVX > SSE4.2) and
-            # the feature macros MSVC does not provide are defined by hand.
-            if (GGML_NATIVE)
-                # instruction set detection for MSVC only
-                include(ggml-cpu/cmake/FindSIMD.cmake)
-            endif()
-
-            if (GGML_AVX512)
-                # /arch:AVX512 includes: __AVX512F__, __AVX512CD__, __AVX512BW__, __AVX512DQ__, and __AVX512VL__.
-                # MSVC has neither compile-time flags for the individual AVX512
-                # extensions nor the macros that correspond to them, so the
-                # macros are added manually (plus the -m flag under clang).
-                list(APPEND ARCH_FLAGS /arch:AVX512)
-                list(APPEND ARCH_DEFINITIONS GGML_AVX512)
-
-                if (GGML_AVX512_VBMI)
-                    # NOTE(review): unlike VNNI/BF16 below, no GGML_AVX512_VBMI define is added here.
-                    list(APPEND ARCH_DEFINITIONS __AVX512VBMI__)
-                    if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
-                        list(APPEND ARCH_FLAGS -mavx512vbmi)
-                    endif()
-                endif()
-                if (GGML_AVX512_VNNI)
-                    list(APPEND ARCH_DEFINITIONS __AVX512VNNI__ GGML_AVX512_VNNI)
-                    if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
-                        list(APPEND ARCH_FLAGS -mavx512vnni)
-                    endif()
-                endif()
-                if (GGML_AVX512_BF16)
-                    list(APPEND ARCH_DEFINITIONS __AVX512BF16__ GGML_AVX512_BF16)
-                    if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
-                        list(APPEND ARCH_FLAGS -mavx512bf16)
-                    endif()
-                endif()
-
-                # AMX options only contribute definitions: __AMX_<X>__ and GGML_AMX_<X>.
-                foreach(amx_kind TILE INT8 BF16)
-                    if (GGML_AMX_${amx_kind})
-                        list(APPEND ARCH_DEFINITIONS __AMX_${amx_kind}__ GGML_AMX_${amx_kind})
-                    endif()
-                endforeach()
-            elseif (GGML_AVX2)
-                list(APPEND ARCH_FLAGS /arch:AVX2)
-                list(APPEND ARCH_DEFINITIONS GGML_AVX2 GGML_FMA GGML_F16C)
-            elseif (GGML_AVX)
-                list(APPEND ARCH_FLAGS /arch:AVX)
-                list(APPEND ARCH_DEFINITIONS GGML_AVX)
-            else()
-                list(APPEND ARCH_FLAGS /arch:SSE4.2)
-                list(APPEND ARCH_DEFINITIONS GGML_SSE42)
-            endif()
-
-            if (GGML_AVX_VNNI)
-                list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI)
-            endif()
-            if (GGML_BMI2)
-                # MSVC does not define macro __BMI2__
-                list(APPEND ARCH_DEFINITIONS __BMI2__ GGML_BMI2)
-            endif()
-        elseif (GGML_NATIVE)
-            list(APPEND ARCH_FLAGS -march=native)
-        else()
-            # Explicit feature selection: SSE4.2 is always on, every other
-            # GGML_<feature> option adds its flag(s) and a GGML_<feature> define.
-            list(APPEND ARCH_FLAGS -msse4.2)
-            list(APPEND ARCH_DEFINITIONS GGML_SSE42)
-
-            # Compiler flag(s) per option.
-            set(x86_flags_F16C        -mf16c)
-            set(x86_flags_FMA         -mfma)
-            set(x86_flags_BMI2        -mbmi2)
-            set(x86_flags_AVX         -mavx)
-            set(x86_flags_AVX2        -mavx2)
-            set(x86_flags_AVX_VNNI    -mavxvnni)
-            set(x86_flags_AVX512      -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw)
-            set(x86_flags_AVX512_VBMI -mavx512vbmi)
-            set(x86_flags_AVX512_VNNI -mavx512vnni)
-            set(x86_flags_AVX512_BF16 -mavx512bf16)
-            set(x86_flags_AMX_TILE    -mamx-tile)
-            set(x86_flags_AMX_INT8    -mamx-int8)
-            set(x86_flags_AMX_BF16    -mamx-bf16)
-
-            # The iteration order fixes the order of the emitted flags and definitions.
-            foreach(x86_feat
-                    F16C FMA BMI2 AVX AVX2 AVX_VNNI
-                    AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
-                    AMX_TILE AMX_INT8 AMX_BF16)
-                if (GGML_${x86_feat})
-                    list(APPEND ARCH_FLAGS ${x86_flags_${x86_feat}})
-                    list(APPEND ARCH_DEFINITIONS GGML_${x86_feat})
-                endif()
-            endforeach()
-        endif()
-    elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
-        message(STATUS "PowerPC detected")
-        if (GGML_NATIVE)
-            # Obtain a textual description of the host CPU: /proc/cpuinfo when the
-            # processor name contains "ppc64", otherwise the first
-            # "Implementation" line printed by prtconf.
-            if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "ppc64")
-                file(READ "/proc/cpuinfo" POWER10_M)
-            elseif ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "powerpc")
-                execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
-            endif()
-
-            # Extract the generation number from the first "POWER<n>" occurrence.
-            # Only the first match is used so that EXTRACTED_NUMBER is a single
-            # number (or empty), not a ';'-list with one entry per CPU record.
-            set(EXTRACTED_NUMBER "")
-            if ("${POWER10_M}" MATCHES "POWER *([0-9]+)")
-                set(EXTRACTED_NUMBER "${CMAKE_MATCH_1}")
-            endif()
-
-            if (EXTRACTED_NUMBER GREATER_EQUAL 10)
-                list(APPEND ARCH_FLAGS -mcpu=power10 -mpowerpc64)
-            elseif (EXTRACTED_NUMBER EQUAL 9)
-                list(APPEND ARCH_FLAGS -mcpu=power9 -mpowerpc64)
-            elseif ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "ppc64le")
-                list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
-            else()
-                list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64)
-            endif()
-        else()
-            # Non-native build: only honor an explicitly requested CPU type.
-            if (GGML_CPU_POWERPC_CPUTYPE)
-                list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
-            endif()
-        endif()
-    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
-        message(STATUS "loongarch64 detected")
-
-        # Base architecture flag, then -mlasx / -mlsx for the enabled extensions.
-        list(APPEND ARCH_FLAGS -march=loongarch64)
-        foreach(loongarch_ext LASX LSX)
-            if (GGML_${loongarch_ext})
-                string(TOLOWER "-m${loongarch_ext}" loongarch_flag)
-                list(APPEND ARCH_FLAGS ${loongarch_flag})
-            endif()
-        endforeach()
-    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
-        message(STATUS "RISC-V detected")
-        # Flags are only added when GGML_RVV is enabled; GGML_RV_ZFH switches to
-        # the zfhmin -march string and adds the GGML_RV_ZFH define.
-        if (GGML_RVV)
-            set(riscv_flags -march=rv64gcv -mabi=lp64d)
-            if (GGML_RV_ZFH)
-                set(riscv_flags -march=rv64gcv_zfhmin -DGGML_RV_ZFH -mabi=lp64d)
-            endif()
-            list(APPEND ARCH_FLAGS ${riscv_flags})
-        endif()
-    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
-        message(STATUS "s390x detected")
-        file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
-
-        # Extract only the digits of the first "machine = <n>" entry. The contents
-        # are quoted so that a ';' in the file cannot split them into several
-        # arguments, and S390X_M holds just the machine number (or is empty)
-        # instead of the whole file text.
-        set(S390X_M "")
-        if ("${CPUINFO_CONTENTS}" MATCHES "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)")
-            set(S390X_M "${CMAKE_MATCH_1}")
-        endif()
-
-        # TODO: Separation to determine activation of VX/VXE/VXE2
-        if ("${S390X_M}" MATCHES "8561|8562")
-            message(STATUS "z15 target")
-            list(APPEND ARCH_FLAGS -march=z15 -mtune=z15)
-        elseif ("${S390X_M}" MATCHES "3931")
-            message(STATUS "z16 target")
-            list(APPEND ARCH_FLAGS -march=z16 -mtune=z16)
-        else()
-            message(STATUS "Unknown target")
-            message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
-            list(APPEND ARCH_FLAGS -march=native -mtune=native)
-        endif()
-
-        if (GGML_VXE)
-            list(APPEND ARCH_FLAGS -mvx -mzvector)
-        endif()
-    else()
-        message(STATUS "Unknown architecture")
-    endif()
-
-    # GGML_USE_CPU_AARCH64 is defined only when the GGML_CPU_AARCH64 option is true.
-    target_compile_definitions(${GGML_CPU_NAME} PRIVATE
-        $<$<BOOL:${GGML_CPU_AARCH64}>:GGML_USE_CPU_AARCH64>)
-
-    # Optional KleidiAI kernels: downloads a pinned KleidiAI release and compiles
-    # the micro-kernels that match the enabled ARM features into this backend.
-    if (GGML_CPU_KLEIDIAI)
-        message(STATUS "Using KleidiAI optimized kernels if applicable")
-
-        # Disable the KleidiAI tests
-        set(KLEIDIAI_BUILD_TESTS  OFF)
-
-        # Fetch KleidiAI sources:
-        include(FetchContent)
-        set(KLEIDIAI_COMMIT_TAG "v1.5.0")
-        set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
-        set(KLEIDIAI_ARCHIVE_MD5  "ea22e1aefb800e9bc8c74d91633cc58e")
-
-        if (POLICY CMP0135)
-            cmake_policy(SET CMP0135 NEW)
-        endif()
-
-        FetchContent_Declare(KleidiAI_Download
-            URL ${KLEIDIAI_DOWNLOAD_URL}
-            DOWNLOAD_EXTRACT_TIMESTAMP NEW
-            URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5})
-
-        FetchContent_MakeAvailable(KleidiAI_Download)
-        FetchContent_GetProperties(KleidiAI_Download
-            SOURCE_DIR  KLEIDIAI_SRC
-            POPULATED   KLEIDIAI_POPULATED)
-
-        if (NOT KLEIDIAI_POPULATED)
-            message(FATAL_ERROR "KleidiAI source downloaded failed.")
-        endif()
-
-        # Scoped to the backend target so the define does not leak into
-        # unrelated targets of this directory.
-        target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_KLEIDIAI)
-
-        # Remove kleidiai target after fetching it
-        if (TARGET kleidiai)
-            set_target_properties(kleidiai PROPERTIES EXCLUDE_FROM_ALL TRUE)
-        endif()
-
-        list(APPEND GGML_CPU_SOURCES
-            ggml-cpu/kleidiai/kleidiai.cpp
-            ggml-cpu/kleidiai/kernels.cpp
-            ggml-cpu/kleidiai/kleidiai.h
-            ggml-cpu/kleidiai/kernels.h
-            )
-
-        # KleidiAI headers, needed only by the sources compiled into this target
-        target_include_directories(${GGML_CPU_NAME} PRIVATE
-            ${KLEIDIAI_SRC}/
-            ${KLEIDIAI_SRC}/kai/
-            ${KLEIDIAI_SRC}/kai/ukernels/
-            ${KLEIDIAI_SRC}/kai/ukernels/matmul/
-            ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
-            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)
-
-        # Feature detection works on the textual arch flags: ARCH_FLAGS when set,
-        # otherwise a -march=... found in CMAKE_C_FLAGS. string(FIND) yields -1
-        # when the "+<feature>" suffix is absent.
-        set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}")
-        if (NOT ARCH_FLAGS_TEMP)
-            string(REGEX MATCH "-march=[^ ]+" ARCH_FLAGS_TEMP "${CMAKE_C_FLAGS}")
-        endif()
-        string(FIND "${ARCH_FLAGS_TEMP}" "+dotprod" DOTPROD_ENABLED)
-        string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED)
-        string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED)
-
-        set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS})
-
-        # Packing routines are always built.
-        list(APPEND GGML_KLEIDIAI_SOURCES
-            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c
-            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
-            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
-            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
-
-        # The FIND results are integers, so they are compared numerically with -1.
-        if (NOT DOTPROD_ENABLED EQUAL -1)
-            list(APPEND GGML_KLEIDIAI_SOURCES
-                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
-                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c
-                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
-        endif()
-
-        if (NOT I8MM_ENABLED EQUAL -1)
-            list(APPEND GGML_KLEIDIAI_SOURCES
-                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.c)
-        endif()
-
-        if (NOT SME_ENABLED EQUAL -1)
-            list(APPEND GGML_KLEIDIAI_SOURCES
-                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
-                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c)
-            # The SME kernels are compiled with "+sve+sve2" appended to the arch flags.
-            set(PRIVATE_ARCH_FLAGS "${PRIVATE_ARCH_FLAGS}+sve+sve2")
-        endif()
-
-        # The KleidiAI sources get their own (possibly extended) arch flags.
-        set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
-        list(APPEND GGML_CPU_SOURCES ${GGML_KLEIDIAI_SOURCES})
-    endif()
-
-    # Apply everything collected above to the backend target.
-    message(STATUS "Adding CPU backend variant ${GGML_CPU_NAME}: ${ARCH_FLAGS} ${ARCH_DEFINITIONS}")
-    target_sources(${GGML_CPU_NAME}             PRIVATE ${GGML_CPU_SOURCES})
-    target_compile_options(${GGML_CPU_NAME}     PRIVATE ${ARCH_FLAGS})
-    target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
-
-    if (GGML_BACKEND_DL)
-        # the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
-        if (GGML_NATIVE)
-            message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
-        endif()
-
-        # The feature detection code lives in its own OBJECT library so that it is
-        # built without the architecture flags. set_source_files_properties()
-        # cannot be used for this because several variants of the CPU backend
-        # may be part of the same build.
-        set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
-        add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
-        set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-        target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
-        target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE
-            ${ARCH_DEFINITIONS}
-            GGML_BACKEND_DL
-            GGML_BACKEND_BUILD
-            GGML_BACKEND_SHARED)
-        target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
-    endif()
-
-    # Emscripten builds compile the backend with -msimd128.
-    if (EMSCRIPTEN)
-        set_property(TARGET ${GGML_CPU_NAME} PROPERTY COMPILE_FLAGS "-msimd128")
-    endif()
-endfunction()
--- a/ggml/src/ggml-cpu/amx/amx.cpp
+++ b/ggml/src/ggml-cpu/amx/amx.cpp
@ -1,221 +0,0 @@
-#include "amx.h"
-#include "common.h"
-#include "mmq.h"
-#include "ggml-backend-impl.h"
-#include "ggml-backend.h"
-#include "ggml-impl.h"
-#include "ggml-cpu.h"
-#include "ggml-cpu-traits.h"
-
-#if defined(__gnu_linux__)
-#include <sys/syscall.h>
-#include <unistd.h>
-#endif
-
-#include <cstdlib>
-#include <cstring>
-#include <memory>
-
-#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
-
-// AMX tensor traits
-namespace ggml::cpu::amx {
-// Traits implementation that routes GGML_OP_MUL_MAT to the AMX kernels.
-class tensor_traits : public ggml::cpu::tensor_traits {
-    // Stores the work-buffer size reported by ggml_backend_amx_desired_wsize()
-    // in `size`; always returns true.
-    bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
-        size = ggml_backend_amx_desired_wsize(op);
-        return true;
-    }
-
-    // Returns true after running the AMX matmul; any other op is declined.
-    bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
-        if (op->op != GGML_OP_MUL_MAT) {
-            return false;
-        }
-        ggml_backend_amx_mul_mat(params, op);
-        return true;
-    }
-};
-
-// Returns the single shared traits instance; both arguments are ignored.
-static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struct ggml_tensor *) {
-    static tensor_traits shared_traits;
-    return &shared_traits;
-}
-}  // namespace ggml::cpu::amx
-
-// AMX buffer interface
-// Releases the storage behind an AMX buffer.
-// The storage is obtained with ggml_aligned_malloc() in
-// ggml_backend_amx_buffer_type_alloc_buffer(), so it is returned through the
-// matching ggml_aligned_free(); plain free() is not a valid counterpart for
-// every aligned allocator.
-// NOTE(review): assumes ggml_aligned_free(void *, size_t) is declared by the
-// same header that provides ggml_aligned_malloc() - confirm.
-static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_aligned_free(buffer->context, buffer->size);
-}
-
-// The buffer context pointer doubles as the base address of the buffer.
-static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
-    void * base = buffer->context;
-    return base;
-}
-
-// Attaches the AMX tensor traits to the tensor through its `extra` pointer.
-static enum ggml_status ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
-    ggml::cpu::tensor_traits * traits = ggml::cpu::amx::get_tensor_traits(buffer, tensor);
-    tensor->extra = (void *) traits;
-
-    GGML_UNUSED(buffer);
-    return GGML_STATUS_SUCCESS;
-}
-
-// Fills `size` bytes of the tensor data, starting at byte `offset`, with `value`.
-static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
-                                                  uint8_t value, size_t offset, size_t size) {
-    GGML_UNUSED(buffer);
-
-    char * dst = (char *) tensor->data + offset;
-    memset(dst, value, size);
-}
-
-// Writes `data` into the tensor: types without AMX kernels are copied verbatim,
-// all others go through ggml_backend_amx_convert_weight().
-static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
-                                               const void * data, size_t offset, size_t size) {
-    GGML_UNUSED(buffer);
-
-    if (!qtype_has_amx_kernels(tensor->type)) {
-        memcpy((char *) tensor->data + offset, data, size);
-        return;
-    }
-
-    GGML_LOG_DEBUG("%s: amx repack tensor %s of type %s\n", __func__, tensor->name, ggml_type_name(tensor->type));
-    ggml_backend_amx_convert_weight(tensor, data, offset, size);
-}
-
-/*
-// TODO: figure out what needs to be done with buffer->extra.
-static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
-    memcpy(data, (const char *)tensor->data + offset, size);
-
-    GGML_UNUSED(buffer);
-}
-
-static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
-    if (ggml_backend_buffer_is_host(src->buffer)) {
-        if (qtype_has_amx_kernels(src->type)) {
-            ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_nbytes(dst));
-        } else {
-            memcpy(dst->data, src->data, ggml_nbytes(src));
-        }
-        return true;
-    }
-    return false;
-
-    GGML_UNUSED(buffer);
-}
-*/
-
-// Sets every byte of the buffer (buffer->size bytes from buffer->context) to `value`.
-static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    void * const  base   = buffer->context;
-    const size_t  nbytes = buffer->size;
-    memset(base, value, nbytes);
-}
-
-static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {  // function table used for buffers created by the AMX buffer type
-    /* .free_buffer     = */ ggml_backend_amx_buffer_free_buffer,
-    /* .get_base        = */ ggml_backend_amx_buffer_get_base,
-    /* .init_tensor     = */ ggml_backend_amx_buffer_init_tensor,
-    /* .memset_tensor   = */ ggml_backend_amx_buffer_memset_tensor,
-    /* .set_tensor      = */ ggml_backend_amx_buffer_set_tensor,
-    /* .get_tensor      = */ nullptr,  // no AMX implementation is provided here
-    /* .cpy_tensor      = */ nullptr,  // no AMX implementation is provided here
-    /* .clear           = */ ggml_backend_amx_buffer_clear,
-    /* .reset           = */ nullptr,
-};
-
-// Name reported for this buffer type.
-static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    GGML_UNUSED(buft);
-
-    return "AMX";
-}
-
-// Allocates `size` bytes with ggml_aligned_malloc() and wraps them in a backend
-// buffer that uses the AMX buffer interface. Returns NULL (after logging to
-// stderr) when the allocation fails.
-static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * storage = ggml_aligned_malloc(size);
-    if (storage != NULL) {
-        return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, storage, size);
-    }
-
-    fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
-    return NULL;
-}
-
-// Buffers of this type are aligned to TENSOR_ALIGNMENT.
-static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    GGML_UNUSED(buft);
-
-    return TENSOR_ALIGNMENT;
-}
-
-namespace ggml::cpu::amx {
-// Extra buffer type hooks: decide which ops the AMX path handles and hand out
-// the traits stored on AMX-resident weights.
-class extra_buffer_type : ggml::cpu::extra_buffer_type {
-    bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
-        // handle only 2d gemm for now
-        if (op->op != GGML_OP_MUL_MAT) {
-            return false;
-        }
-
-        auto is_contiguous_2d = [](const struct ggml_tensor * t) {
-            return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
-        };
-
-        const struct ggml_tensor * src0 = op->src[0];
-        const struct ggml_tensor * src1 = op->src[1];
-
-        // both operands must be contiguous
-        if (!is_contiguous_2d(src0) || !is_contiguous_2d(src1)) {
-            return false;
-        }
-        // src0 must live in an AMX buffer
-        if (!src0->buffer || src0->buffer->buft != ggml_backend_amx_buffer_type()) {
-            return false;
-        }
-        // out_features is 32x
-        if (op->ne[0] % (TILE_N * 2) != 0) {
-            return false;
-        }
-        // src0 must be F16 or a quantized type with AMX kernels
-        if (!qtype_has_amx_kernels(src0->type) && src0->type != GGML_TYPE_F16) {
-            return false;
-        }
-        // src1 must be host buffer
-        if (src1->buffer && !ggml_backend_buft_is_host(src1->buffer->buft)) {
-            return false;
-        }
-        // src1 must be float32
-        return src1->type == GGML_TYPE_F32;
-    }
-
-    ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
-        const bool is_amx_mul_mat =
-            op->op == GGML_OP_MUL_MAT && op->src[0]->buffer &&
-            op->src[0]->buffer->buft == ggml_backend_amx_buffer_type();
-
-        // the traits were stored in `extra` when the tensor was initialized
-        return is_amx_mul_mat ? (ggml::cpu::tensor_traits *) op->src[0]->extra : nullptr;
-    }
-};
-}  // namespace ggml::cpu::amx
-
-// Allocation size for `tensor`, as computed by ggml_backend_amx_get_alloc_size().
-static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
-    GGML_UNUSED(buft);
-
-    const size_t alloc_size = ggml_backend_amx_get_alloc_size(tensor);
-    return alloc_size;
-}
-
-#define ARCH_GET_XCOMP_PERM     0x1022
-#define ARCH_REQ_XCOMP_PERM     0x1023
-#define XFEATURE_XTILECFG       17
-#define XFEATURE_XTILEDATA      18
-
-// Requests permission to use the AMX tile data state.
-// Returns true when AMX may be used, false otherwise.
-static bool ggml_amx_init() {
-#if defined(__gnu_linux__)
-    // arch_prctl(ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA): a non-zero result
-    // means the request was refused.
-    if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
-        fprintf(stderr, "AMX is not ready to be used!\n");
-        return false;
-    }
-    return true;
-#elif defined(_WIN32)
-    return true;
-#else
-    // No known way to enable AMX on this platform: report it as unavailable
-    // instead of falling off the end of a non-void function.
-    return false;
-#endif
-}
-
-// Returns the AMX buffer type, or nullptr when ggml_amx_init() fails.
-ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
-    // Function-local static: constructed once, before the init check below.
-    static struct ggml_backend_buffer_type amx_buft = {
-        /* .iface = */ {
-                        /* .get_name         = */ ggml_backend_amx_buffer_type_get_name,
-                        /* .alloc_buffer     = */ ggml_backend_amx_buffer_type_alloc_buffer,
-                        /* .get_alignment    = */ ggml_backend_amx_buffer_type_get_alignment,
-                        /* .get_max_size     = */ nullptr,  // defaults to SIZE_MAX
-                        /* .get_alloc_size   = */ ggml_backend_amx_buffer_type_get_alloc_size,
-                        /* .is_host          = */ nullptr,
-                        },
-        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
-        /* .context = */ new ggml::cpu::amx::extra_buffer_type(),
-    };
-
-    // The init check is evaluated on every call.
-    return ggml_amx_init() ? &amx_buft : nullptr;
-}
-
-#endif  // defined(__AMX_INT8__) && defined(__AVX512VNNI__)
--- a/ggml/src/ggml-cpu/amx/amx.h
+++ b/ggml/src/ggml-cpu/amx/amx.h
@ -1,8 +0,0 @@
-#pragma once
-
-#include "ggml-backend.h"
-#include "ggml-cpu-impl.h"
-
-// GGML internal header
-
-// The AMX buffer type is only declared when the compiler targets both
-// AMX-INT8 and AVX512-VNNI.
-#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
-ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
-#endif
--- a/ggml/src/ggml-cpu/amx/common.h
+++ b/ggml/src/ggml-cpu/amx/common.h
@ -1,91 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-cpu-impl.h"
-
-#include <algorithm>
-#include <memory>
-#include <type_traits>
-
-#if defined(GGML_USE_OPENMP)
-#include <omp.h>
-#endif
-
-#define TILE_M 16
-#define TILE_N 16
-#define TILE_K 32
-#define VNNI_BLK 4
-
-#define AMX_BLK_SIZE 32
-
-#define TMM0 0
-#define TMM1 1
-#define TMM2 2
-#define TMM3 3
-#define TMM4 4
-#define TMM5 5
-#define TMM6 6
-#define TMM7 7
-
-// parallel routines
-// Ceiling division for integral types: the quotient x / y rounded up (integer arithmetic).
-template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
-inline T div_up(T x, T y) {
-    // keep the promoted type of the sum so the division happens exactly as in (x + y - 1) / y
-    const auto biased_numerator = x + y - 1;
-    return biased_numerator / y;
-}
-
-// Computes the half-open work range [n_start, n_end) of worker `ith` out of `nth` for `n` items.
-//
-// Active variant (the #else branch): every worker is assigned div_up(n, nth) consecutive items and
-// the end is clamped to n. Note that the start is not clamped, so for a worker whose start lies
-// beyond n the result has n_start > n_end; callers are expected to treat that as an empty range.
-// The #if 0 branch is a disabled alternative partitioning kept for reference.
-template <typename T>
-inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
-#if 0
-    // onednn partition pattern
-    T& n_my = n_end;
-    if (nth <= 1 || n == 0) {
-        n_start = 0;
-        n_my = n;
-    } else {
-        T n1 = div_up(n, nth);
-        T n2 = n1 - 1;
-        T T1 = n - n2 * nth;
-        n_my = ith < T1 ? n1 : n2;
-        n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2;
-    }
-    n_end += n_start;
-#else
-    // pytorch aten partition pattern
-    // chunk size per worker, rounded up so that nth chunks cover all n items
-    T n_my = div_up(n, nth);
-    n_start = ith * n_my;
-    // the last non-empty chunk may be shorter: clamp its end to n
-    n_end = std::min(n_start + n_my, n);
-#endif
-}
-
-// Runs f(begin, end) over the index range [0, n).
-//
-// With GGML_USE_OPENMP, an OpenMP parallel region is opened and every thread of the region calls f
-// once with its own sub-range as computed by balance211(). Without OpenMP, f is called exactly
-// once with the full range (0, n).
-template <typename func_t>
-inline void parallel_for(int n, const func_t& f) {
-#if defined(GGML_USE_OPENMP)
-#pragma omp parallel
-{
-    // thread count and thread index of the enclosing parallel region
-    int nth = omp_get_num_threads();
-    int ith = omp_get_thread_num();
-    int tbegin, tend;
-    balance211(n, nth, ith, tbegin, tend);
-    f(tbegin, tend);
-}
-#else
-    f(0, n);
-#endif
-}
-
-// Calls f(begin, end) for the share of [0, n) that belongs to the calling ggml worker, where the
-// worker is identified by params->ith out of params->nth and the share is computed by balance211().
-template <typename func_t>
-inline void parallel_for_ggml(const ggml_compute_params * params, int n, const func_t & f) {
-    int range_begin;
-    int range_end;
-
-    balance211(n, params->nth, params->ith, range_begin, range_end);
-
-    f(range_begin, range_end);
-}
-
-// Tells whether the given tensor type is one of the quantized types that have AMX kernels.
-inline bool qtype_has_amx_kernels(const enum ggml_type type) {
-    // TODO: fix padding for vnni format
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ4_XS:
-            return true;
-        default:
-            return false;
-    }
-}
--- a/ggml/src/ggml-cpu/amx/mmq.cpp
+++ b/ggml/src/ggml-cpu/amx/mmq.cpp
--- a/ggml/src/ggml-cpu/amx/mmq.h
+++ b/ggml/src/ggml-cpu/amx/mmq.h
@ -1,10 +0,0 @@
-#pragma once
-#include "common.h"
-
-size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);
-
-size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);
-
-void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-
-void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
--- a/ggml/src/ggml-cpu/binary-ops.cpp
+++ b/ggml/src/ggml-cpu/binary-ops.cpp
@ -1,158 +0,0 @@
-#include "binary-ops.h"
-
-#if defined(GGML_USE_ACCELERATE)
-#include <Accelerate/Accelerate.h>
-
-using vDSP_fn_t = void (*)(const float *, vDSP_Stride, const float *, vDSP_Stride, float *, vDSP_Stride, vDSP_Length);
-#endif
-
-// Scalar f32 kernels. They are passed as the `op` non-type template argument of the binary-op
-// helpers below and are also compared by address to select a matching vDSP routine.
-
-// returns a + b
-static inline float op_add(float a, float b) {
-    return a + b;
-}
-
-// returns a - b
-static inline float op_sub(float a, float b) {
-    return a - b;
-}
-
-// returns a * b
-static inline float op_mul(float a, float b) {
-    return a * b;
-}
-
-// returns a / b (plain float division, no check for b == 0)
-static inline float op_div(float a, float b) {
-    return a / b;
-}
-
-// Applies `op` element-wise over n contiguous elements: z[i] = op(x[i], y[i]).
-//
-// Each input element is converted to f32 through type_conversion_table, the operation is evaluated
-// in f32, and the result is converted to dst_t before being stored.
-//
-//   n - number of elements to process
-//   z - destination, n elements of dst_t
-//   x - first operand, n elements of src0_t
-//   y - second operand, n elements of src1_t
-template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
-static inline void vec_binary_op_contiguous(const int64_t n, dst_t * z, const src0_t * x, const src1_t * y) {
-    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
-    constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
-    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
-
-    // the counter has the same width as n: an int counter would overflow for n > INT_MAX
-    for (int64_t i = 0; i < n; i++) {
-        z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(y[i])));
-    }
-}
-
-// Applies `op` element-wise where the second operand is addressed through a byte stride:
-// z[i] = op(x[i], y element at index (i % ne10) with a stride of nb10 bytes).
-//
-// Each input element is converted to f32 through type_conversion_table, the operation is evaluated
-// in f32, and the result is converted to dst_t before being stored.
-//
-//   n    - number of elements to process
-//   ne10 - number of elements of y along its first dimension (y indices wrap around at ne10)
-//   nb10 - stride in bytes between consecutive elements of y
-//   z    - destination, n contiguous elements of dst_t
-//   x    - first operand, n contiguous elements of src0_t
-//   y    - base pointer of the strided second operand
-template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
-static inline void vec_binary_op_non_contiguous(const int64_t n, const int64_t ne10, const int64_t nb10, dst_t * z, const src0_t * x, const src1_t * y) {
-    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
-    constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
-    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
-
-    // 64-bit counter and index: n and ne10 are int64_t, int would overflow/narrow for large tensors
-    for (int64_t i = 0; i < n; i++) {
-        const int64_t i10 = i % ne10;
-        const src1_t * y_ptr = (const src1_t *)((const char *)y + i10*nb10);
-        z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(*y_ptr)));
-    }
-}
-
-// Computes dst = op(src0, src1) row by row for one concrete combination of element types.
-//
-// Requirements enforced by assertions: src1 must be repeatable to the shape of src0, dst must have
-// the shape of src0, and both dst and src0 must be contiguous in their first dimension. When src1
-// is not contiguous in its first dimension, broadcasting is not supported and src1 must have the
-// same shape as src0.
-//
-// The rows of src0 are split between the workers by get_thread_range(); this call handles only the
-// rows [ir0, ir1) assigned to params->ith.
-template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
-static void apply_binary_op(const ggml_compute_params * params, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT( nb0 == sizeof(dst_t));
-    GGML_ASSERT(nb00 == sizeof(src0_t));
-
-    const auto [ir0, ir1] = get_thread_range(params, src0);
-    const bool is_src1_contiguous = (nb10 == sizeof(src1_t));
-
-    if (!is_src1_contiguous) { // broadcast not implemented yet for non-contiguous
-        GGML_ASSERT(ggml_are_same_shape(src0, src1));
-    }
-
-#ifdef GGML_USE_ACCELERATE
-    // pick the vDSP routine that matches `op`; stays null for non-f32 tensors
-    vDSP_fn_t vDSP_op = nullptr;
-    // TODO - avoid the f32-only check using type 'trait' lookup tables and row-based src-to-float conversion functions
-    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        if (op == op_add) {
-            vDSP_op = vDSP_vadd;
-        } else if (op == op_sub) {
-            vDSP_op = vDSP_vsub;
-        } else if (op == op_mul) {
-            vDSP_op = vDSP_vmul;
-        } else if (op == op_div) {
-            vDSP_op = vDSP_vdiv;
-        }
-    }
-#endif
-
-    for (int64_t ir = ir0; ir < ir1; ++ir) {
-        // decompose the flat row index into (i03, i02, i01) coordinates of src0
-        const int64_t i03 = ir/(ne02*ne01);
-        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-        const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-        // matching src1 coordinates: wrap around so that src1 is repeated along dims 1..3
-        const int64_t i13 = i03 % ne13;
-        const int64_t i12 = i02 % ne12;
-        const int64_t i11 = i01 % ne11;
-
-        dst_t        * dst_ptr  = (dst_t  *)       ((char *)       dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
-        const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-        const src1_t * src1_ptr = (const src1_t *) ((const char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
-
-        if (is_src1_contiguous) {
-            // src1 is broadcastable across src0 and dst in i1, i2, i3
-            // number of times a src1 row (ne10 elements) is repeated along a src0 row (ne00 elements)
-            const int64_t nr0 = ne00 / ne10;
-
-            for (int64_t r = 0; r < nr0; ++r) {
-#ifdef GGML_USE_ACCELERATE
-                if constexpr (std::is_same_v<src0_t, float> && std::is_same_v<src1_t, float> && std::is_same_v<dst_t, float>) {
-                    if (vDSP_op != nullptr) {
-                        // src1 is passed first and src0 second
-                        vDSP_op(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
-                        continue;
-                    }
-                }
-#endif
-                vec_binary_op_contiguous<op>(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
-            }
-        } else {
-            vec_binary_op_non_contiguous<op>(ne0, ne10, nb10, dst_ptr, src0_ptr, src1_ptr);
-        }
-    }
-}
-
-// Dispatches `op` to the apply_binary_op instantiation that matches the element types of
-// (src0, src1, dst). Any combination that is not listed below aborts.
-// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
-template <float (*op)(float, float)>
-static void binary_op(const ggml_compute_params * params, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-
-    // true when the three tensors have exactly the given element types
-    const auto types_are = [&](ggml_type t_src0, ggml_type t_src1, ggml_type t_dst) {
-        return src0->type == t_src0 && src1->type == t_src1 && dst->type == t_dst;
-    };
-
-    // homogeneous combinations
-    if (types_are(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32)) {
-        apply_binary_op<op, float, float, float>(params, dst);
-        return;
-    }
-    if (types_are(GGML_TYPE_F16, GGML_TYPE_F16, GGML_TYPE_F16)) {
-        apply_binary_op<op, ggml_fp16_t, ggml_fp16_t, ggml_fp16_t>(params, dst);
-        return;
-    }
-    if (types_are(GGML_TYPE_BF16, GGML_TYPE_BF16, GGML_TYPE_BF16)) {
-        apply_binary_op<op, ggml_bf16_t, ggml_bf16_t, ggml_bf16_t>(params, dst);
-        return;
-    }
-
-    // half-precision src0 combined with an f32 src1
-    if (types_are(GGML_TYPE_BF16, GGML_TYPE_F32, GGML_TYPE_BF16)) {
-        apply_binary_op<op, ggml_bf16_t, float, ggml_bf16_t>(params, dst);
-        return;
-    }
-    if (types_are(GGML_TYPE_BF16, GGML_TYPE_F32, GGML_TYPE_F32)) {
-        apply_binary_op<op, ggml_bf16_t, float, float>(params, dst);
-        return;
-    }
-    if (types_are(GGML_TYPE_F16, GGML_TYPE_F32, GGML_TYPE_F16)) {
-        apply_binary_op<op, ggml_fp16_t, float, ggml_fp16_t>(params, dst);
-        return;
-    }
-    if (types_are(GGML_TYPE_F16, GGML_TYPE_F32, GGML_TYPE_F32)) {
-        apply_binary_op<op, ggml_fp16_t, float, float>(params, dst);
-        return;
-    }
-
-    GGML_ABORT("%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
-        ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
-}
-
-// Public entry points: each one forwards to binary_op<> with the matching scalar kernel.
-// The operands are taken from dst->src[0] and dst->src[1] and the result is written to dst.
-
-// dst = src0 + src1
-void ggml_compute_forward_add_non_quantized(const ggml_compute_params * params, ggml_tensor * dst) {
-    binary_op<op_add>(params, dst);
-}
-
-// dst = src0 - src1
-void ggml_compute_forward_sub(const ggml_compute_params * params, ggml_tensor * dst) {
-    binary_op<op_sub>(params, dst);
-}
-
-// dst = src0 * src1
-void ggml_compute_forward_mul(const ggml_compute_params * params, ggml_tensor * dst) {
-    binary_op<op_mul>(params, dst);
-}
-
-// dst = src0 / src1
-void ggml_compute_forward_div(const ggml_compute_params * params, ggml_tensor * dst) {
-    binary_op<op_div>(params, dst);
-}
--- a/ggml/src/ggml-cpu/binary-ops.h
+++ b/ggml/src/ggml-cpu/binary-ops.h
@ -1,16 +0,0 @@
-#pragma once
-
-#include "common.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void ggml_compute_forward_add_non_quantized(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_sub(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_mul(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_div(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-
-#ifdef __cplusplus
-}
-#endif
--- a/ggml/src/ggml-cpu/cmake/FindSIMD.cmake
+++ b/ggml/src/ggml-cpu/cmake/FindSIMD.cmake
@ -1,100 +0,0 @@
-include(CheckCSourceRuns)
-
-# Probe programs used by check_sse() below. Each <TYPE>_CODE variable holds a small C program that
-# is handed to check_c_source_runs(); the macro looks the variable up by name via ${type}_CODE.
-
-# AVX probe: uses a 256-bit float vector and _mm256_set1_ps.
-set(AVX_CODE "
-    #include <immintrin.h>
-    int main()
-    {
-        __m256 a;
-        a = _mm256_set1_ps(0);
-        return 0;
-    }
-")
-
-# AVX512 probe: uses a 512-bit integer vector and a byte-wise mask compare.
-set(AVX512_CODE "
-    #include <immintrin.h>
-    int main()
-    {
-        __m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
-                                    0, 0, 0, 0, 0, 0, 0, 0,
-                                    0, 0, 0, 0, 0, 0, 0, 0,
-                                    0, 0, 0, 0, 0, 0, 0, 0,
-                                    0, 0, 0, 0, 0, 0, 0, 0,
-                                    0, 0, 0, 0, 0, 0, 0, 0,
-                                    0, 0, 0, 0, 0, 0, 0, 0,
-                                    0, 0, 0, 0, 0, 0, 0, 0);
-        __m512i b = a;
-        __mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
-        return 0;
-    }
-")
-
-# AVX2 probe: uses _mm256_abs_epi16 and _mm256_extract_epi64.
-set(AVX2_CODE "
-    #include <immintrin.h>
-    int main()
-    {
-        __m256i a = {0};
-        a = _mm256_abs_epi16(a);
-        __m256i x;
-        _mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code
-        return 0;
-    }
-")
-
-# FMA probe: uses the fused multiply-add intrinsic _mm256_fmadd_ps.
-set(FMA_CODE "
-    #include <immintrin.h>
-    int main()
-    {
-        __m256 acc = _mm256_setzero_ps();
-        const __m256 d = _mm256_setzero_ps();
-        const __m256 p = _mm256_setzero_ps();
-        acc = _mm256_fmadd_ps( d, p, acc );
-        return 0;
-    }
-")
-
-# check_sse(<type> <flags>)
-#
-# Detects whether the instruction set <type> can be compiled and run on the build machine.
-#
-#   type  - name of the instruction set; the probe program is read from the variable <type>_CODE
-#   flags - ;-separated list of candidate compiler flags, tried in order
-#
-# Each candidate flag is placed in CMAKE_REQUIRED_FLAGS and the probe is run with
-# check_c_source_runs(); the first flag that succeeds wins and no further flags are tried.
-#
-# Results (cache variables, marked as advanced):
-#   <type>_FOUND - TRUE if some flag worked, FALSE otherwise
-#   <type>_FLAGS - the flag that worked, or an empty string
-#
-# Example: check_sse("AVX" " ;/arch:AVX")
-#
-# This is a macro, so the helper variables __FLAG_I, __FLAG and CMAKE_REQUIRED_FLAGS_SAVE are
-# set in the caller's scope.
-macro(check_sse type flags)
-    set(__FLAG_I 1)
-    # remember the caller's CMAKE_REQUIRED_FLAGS so it can be restored after probing
-    set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
-    foreach (__FLAG ${flags})
-        if (NOT ${type}_FOUND)
-            set(CMAKE_REQUIRED_FLAGS ${__FLAG})
-            # every attempt gets its own result variable: HAS_<type>_1, HAS_<type>_2, ...
-            check_c_source_runs("${${type}_CODE}" HAS_${type}_${__FLAG_I})
-            if (HAS_${type}_${__FLAG_I})
-                set(${type}_FOUND TRUE CACHE BOOL "${type} support")
-                set(${type}_FLAGS "${__FLAG}" CACHE STRING "${type} flags")
-            endif()
-            math(EXPR __FLAG_I "${__FLAG_I}+1")
-        endif()
-    endforeach()
-    set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
-
-    # no candidate flag worked: record the negative result
-    if (NOT ${type}_FOUND)
-        set(${type}_FOUND FALSE CACHE BOOL "${type} support")
-        set(${type}_FLAGS "" CACHE STRING "${type} flags")
-    endif()
-
-    mark_as_advanced(${type}_FOUND ${type}_FLAGS)
-endmacro()
-
-# Run the probes and derive the GGML_AVX / GGML_AVX2 / GGML_AVX512 switches from the results.
-#
-# The conditions test the result variables by name (if(<variable>)) instead of expanding them with
-# ${...}: a textual expansion turns into a syntax error when the value is empty.
-
-# flags are for MSVC only!
-check_sse("AVX" " ;/arch:AVX")
-if (AVX_FOUND)
-    set(GGML_AVX ON)
-else()
-    set(GGML_AVX OFF)
-endif()
-
-# GGML_AVX2 requires both the AVX2 and the FMA probe to succeed
-check_sse("AVX2" " ;/arch:AVX2")
-check_sse("FMA" " ;/arch:AVX2")
-if (AVX2_FOUND AND FMA_FOUND)
-    set(GGML_AVX2 ON)
-else()
-    set(GGML_AVX2 OFF)
-endif()
-
-check_sse("AVX512" " ;/arch:AVX512")
-if (AVX512_FOUND)
-    set(GGML_AVX512 ON)
-else()
-    set(GGML_AVX512 OFF)
-endif()
--- a/ggml/src/ggml-cpu/common.h
+++ b/ggml/src/ggml-cpu/common.h
@ -1,72 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-cpu-traits.h"
-#include "ggml-cpu-impl.h"
-#include "ggml-impl.h"
-
-#ifdef __cplusplus
-
-#include <utility>
-
-// convenience functions/macros for use in template calls
-// note: these won't be required after the 'traits' lookup table is used.
-// Each helper wraps one GGML conversion macro in a real function so that its address can be
-// stored in type_conversion_table below.
-
-// f32 -> f16
-static inline ggml_fp16_t f32_to_f16(float x) {
-    return GGML_FP32_TO_FP16(x);
-}
-
-// f16 -> f32
-static inline float f16_to_f32(ggml_fp16_t x) {
-    return GGML_FP16_TO_FP32(x);
-}
-
-// f32 -> bf16
-static inline ggml_bf16_t f32_to_bf16(float x) {
-    return GGML_FP32_TO_BF16(x);
-}
-
-// bf16 -> f32
-static inline float bf16_to_f32(ggml_bf16_t x) {
-    return GGML_BF16_TO_FP32(x);
-}
-
-// identity, so that f32 can be handled through the same table as the other types
-static inline float f32_to_f32(float x) {
-    return x;
-}
-
-// TODO - merge this into the traits table, after using row-based conversions
-// Compile-time lookup of the scalar conversion functions for an element type T:
-//   to_f32   - converts a T value to float
-//   from_f32 - converts a float to T
-// The primary template is only declared; using an element type without a specialization below
-// fails to compile.
-template <class T>
-struct type_conversion_table;
-
-// f16 elements
-template <>
-struct type_conversion_table<ggml_fp16_t> {
-    static constexpr float (*to_f32)(ggml_fp16_t) = f16_to_f32;
-    static constexpr ggml_fp16_t (*from_f32)(float) = f32_to_f16;
-};
-
-// f32 elements: both directions are the identity
-template <>
-struct type_conversion_table<float> {
-    static constexpr float (*to_f32)(float) = f32_to_f32;
-    static constexpr float (*from_f32)(float) = f32_to_f32;
-};
-
-// bf16 elements
-template <>
-struct type_conversion_table<ggml_bf16_t> {
-    static constexpr float (*to_f32)(ggml_bf16_t) = bf16_to_f32;
-    static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
-};
-
-// Returns the half-open row range [first, last) of src0 that worker params->ith (out of
-// params->nth workers) has to process.
-//
-// The rows (ggml_nrows(src0)) are split into chunks of ceil(nr / nth) rows. The end of the range is
-// clamped to nr while the start is not, so a worker whose start lies at or beyond nr gets a range
-// with first >= last, i.e. nothing to do for a `for (ir = first; ir < last; ++ir)` loop.
-//
-// Declared `static inline` like the other helpers of this header, so that translation units that
-// include the header without calling the function do not get an unused-function warning.
-static inline std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) {
-    const int64_t ith = params->ith;
-    const int64_t nth = params->nth;
-
-    const int64_t nr  = ggml_nrows(src0);
-
-    // rows per thread
-    const int64_t dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int64_t ir0 = dr*ith;
-    const int64_t ir1 = MIN(ir0 + dr, nr);
-
-    return {ir0, ir1};
-}
-
-#endif
--- a/ggml/src/ggml-cpu/cpu-feats-x86.cpp
+++ b/ggml/src/ggml-cpu/cpu-feats-x86.cpp
@ -1,327 +0,0 @@
-#include "ggml-backend-impl.h"
-
-#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
-
-#ifdef _MSC_VER
-#include <intrin.h>
-#endif
-
-#include <cstring>
-#include <vector>
-#include <bitset>
-#include <array>
-#include <string>
-
-// ref: https://cdrdv2-public.intel.com/782156/325383-sdm-vol-2abcd.pdf
-//
-// Snapshot of the x86 CPUID feature information of the executing CPU.
-//
-// The constructor queries CPUID once and stores the raw feature words in bitsets; every accessor
-// simply tests one bit (some are additionally gated on the vendor). The member names encode the
-// origin of the word: f_<leaf>_<register> for the basic leaves, f_7_1_eax for leaf 7 / sub-leaf 1,
-// and f_81_<register> for the extended leaf 0x80000001.
-//
-// Only the CPU's own report is evaluated; whether the operating system enabled the corresponding
-// register state is not checked here.
-struct cpuid_x86 {
-    // leaf 1, ECX
-    bool SSE3(void) { return f_1_ecx[0]; }
-    bool PCLMULQDQ(void) { return f_1_ecx[1]; }
-    bool MONITOR(void) { return f_1_ecx[3]; }
-    bool SSSE3(void) { return f_1_ecx[9]; }
-    bool FMA(void) { return f_1_ecx[12]; }
-    bool CMPXCHG16B(void) { return f_1_ecx[13]; }
-    bool SSE41(void) { return f_1_ecx[19]; }
-    bool SSE42(void) { return f_1_ecx[20]; }
-    bool MOVBE(void) { return f_1_ecx[22]; }
-    bool POPCNT(void) { return f_1_ecx[23]; }
-    bool AES(void) { return f_1_ecx[25]; }
-    bool XSAVE(void) { return f_1_ecx[26]; }
-    bool OSXSAVE(void) { return f_1_ecx[27]; }
-    bool AVX(void) { return f_1_ecx[28]; }
-    bool F16C(void) { return f_1_ecx[29]; }
-    bool RDRAND(void) { return f_1_ecx[30]; }
-
-    // leaf 1, EDX
-    bool MSR(void) { return f_1_edx[5]; }
-    bool CX8(void) { return f_1_edx[8]; }
-    bool SEP(void) { return f_1_edx[11]; }
-    bool CMOV(void) { return f_1_edx[15]; }
-    bool CLFSH(void) { return f_1_edx[19]; }
-    bool MMX(void) { return f_1_edx[23]; }
-    bool FXSR(void) { return f_1_edx[24]; }
-    bool SSE(void) { return f_1_edx[25]; }
-    bool SSE2(void) { return f_1_edx[26]; }
-
-    // leaf 7 (sub-leaf 0), EBX
-    bool FSGSBASE(void) { return f_7_ebx[0]; }
-    bool BMI1(void) { return f_7_ebx[3]; }
-    bool HLE(void) { return is_intel && f_7_ebx[4]; }
-    bool AVX2(void) { return f_7_ebx[5]; }
-    bool BMI2(void) { return f_7_ebx[8]; }
-    bool ERMS(void) { return f_7_ebx[9]; }
-    bool INVPCID(void) { return f_7_ebx[10]; }
-    bool RTM(void) { return is_intel && f_7_ebx[11]; }
-    bool AVX512F(void) { return f_7_ebx[16]; }
-    bool AVX512DQ(void) { return f_7_ebx[17]; }
-    bool RDSEED(void) { return f_7_ebx[18]; }
-    bool ADX(void) { return f_7_ebx[19]; }
-    bool AVX512PF(void) { return f_7_ebx[26]; }
-    bool AVX512ER(void) { return f_7_ebx[27]; }
-    bool AVX512CD(void) { return f_7_ebx[28]; }
-    bool AVX512BW(void) { return f_7_ebx[30]; }
-    bool AVX512VL(void) { return f_7_ebx[31]; }
-
-    bool SHA(void) { return f_7_ebx[29]; }
-
-    // leaf 7 (sub-leaf 0), ECX
-    bool PREFETCHWT1(void) { return f_7_ecx[0]; }
-
-    // extended leaf 0x80000001, ECX
-    bool LAHF(void) { return f_81_ecx[0]; }
-    bool LZCNT(void) { return is_intel && f_81_ecx[5]; }
-    bool ABM(void) { return is_amd && f_81_ecx[5]; }
-    bool SSE4a(void) { return is_amd && f_81_ecx[6]; }
-    bool XOP(void) { return is_amd && f_81_ecx[11]; }
-    bool TBM(void) { return is_amd && f_81_ecx[21]; }
-
-    // extended leaf 0x80000001, EDX
-    bool SYSCALL(void) { return is_intel && f_81_edx[11]; }
-    bool MMXEXT(void) { return is_amd && f_81_edx[22]; }
-    bool RDTSCP(void) { return is_intel && f_81_edx[27]; }
-    bool _3DNOWEXT(void) { return is_amd && f_81_edx[30]; }
-    bool _3DNOW(void) { return is_amd && f_81_edx[31]; }
-
-    // AVX-512 extensions and AVX-VNNI (leaf 7, sub-leaves 0 and 1)
-    bool AVX512_VBMI(void) { return f_7_ecx[1]; }
-    bool AVX512_VNNI(void) { return f_7_ecx[11]; }
-    bool AVX512_FP16(void) { return f_7_edx[23]; }
-    bool AVX512_BF16(void) { return f_7_1_eax[5]; }
-    bool AVX_VNNI(void) { return f_7_1_eax[4]; }
-
-    // AMX (leaf 7, sub-leaves 0 and 1)
-    bool AMX_TILE(void) { return f_7_edx[24]; }
-    bool AMX_INT8(void) { return f_7_edx[25]; }
-    bool AMX_FP16(void) { return f_7_1_eax[21]; }
-    bool AMX_BF16(void) { return f_7_edx[22]; }
-
-    // Raw CPUID access. cpu_info receives EAX, EBX, ECX, EDX (in this order).
-    // cpuid() queries sub-leaf 0 of leaf `eax`, cpuidex() queries sub-leaf `ecx`.
-#ifdef _MSC_VER
-    static void cpuid(int cpu_info[4], int eax) {
-        __cpuid(cpu_info, eax);
-    }
-    static void cpuidex(int cpu_info[4], int eax, int ecx) {
-        __cpuidex(cpu_info, eax, ecx);
-    }
-#else
-    static void cpuid(int cpu_info[4], int eax) {
-        __asm__ __volatile__(
-            "cpuid"
-            : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
-            : "a"(eax), "c"(0));
-    }
-    static void cpuidex(int cpu_info[4], int eax, int ecx) {
-        __asm__ __volatile__(
-            "cpuid"
-            : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
-            : "a"(eax), "c"(ecx));
-    }
-#endif
-
-    // Queries all basic and extended CPUID leaves and fills the members below.
-    cpuid_x86() {
-        std::array<int, 4> cpui;
-        std::vector<std::array<int, 4>> data;
-
-        // calling __cpuid with 0x0 as the function_id argument
-        // gets the number of the highest valid function ID.
-        cpuid(cpui.data(), 0);
-        int n_ids = cpui[0];
-
-        for (int i = 0; i <= n_ids; ++i) {
-            cpuidex(cpui.data(), i, 0);
-            data.push_back(cpui);
-        }
-
-        // capture vendor string
-        // the 12 characters are spread over EBX, EDX, ECX of leaf 0 (in this order); they are
-        // copied with memcpy because writing through an int* into a char array violates the
-        // aliasing rules and assumes an alignment the array does not guarantee
-        char vendor[0x20] = {};
-        std::memcpy(vendor,     &data[0][1], sizeof(int));
-        std::memcpy(vendor + 4, &data[0][3], sizeof(int));
-        std::memcpy(vendor + 8, &data[0][2], sizeof(int));
-        this->vendor = vendor;
-        if (this->vendor == "GenuineIntel") {
-            is_intel = true;
-        } else if (this->vendor == "AuthenticAMD") {
-            is_amd = true;
-        }
-
-        // load bitset with flags for function 0x00000001
-        if (n_ids >= 1) {
-            f_1_ecx = data[1][2];
-            f_1_edx = data[1][3];
-        }
-
-        // load bitset with flags for function 0x00000007
-        if (n_ids >= 7) {
-            f_7_ebx = data[7][1];
-            f_7_ecx = data[7][2];
-            f_7_edx = data[7][3];
-            // sub-leaf 1 of leaf 7 is not part of `data` (which holds sub-leaf 0 only)
-            cpuidex(cpui.data(), 7, 1);
-            f_7_1_eax = cpui[0];
-        }
-
-        // calling __cpuid with 0x80000000 as the function_id argument
-        // gets the number of the highest valid extended ID.
-        cpuid(cpui.data(), 0x80000000);
-        unsigned int n_ex_ids = cpui[0];
-
-        // ext_data[k] holds extended leaf 0x80000000 + k
-        std::vector<std::array<int, 4>> ext_data;
-        for (unsigned int i = 0x80000000; i <= n_ex_ids; ++i) {
-            cpuidex(cpui.data(), i, 0);
-            ext_data.push_back(cpui);
-        }
-
-        // load bitset with flags for function 0x80000001
-        if (n_ex_ids >= 0x80000001) {
-            f_81_ecx = ext_data[1][2];
-            f_81_edx = ext_data[1][3];
-        }
-
-        // interpret CPU brand string if reported
-        // (three leaves, 0x80000002..0x80000004, of 16 bytes each)
-        char brand[0x40] = {};
-        if (n_ex_ids >= 0x80000004) {
-            std::memcpy(brand, ext_data[2].data(), sizeof(cpui));
-            std::memcpy(brand + 16, ext_data[3].data(), sizeof(cpui));
-            std::memcpy(brand + 32, ext_data[4].data(), sizeof(cpui));
-            this->brand = brand;
-        }
-    }
-
-    bool is_intel = false;       // vendor string is "GenuineIntel"
-    bool is_amd = false;         // vendor string is "AuthenticAMD"
-    std::string vendor;          // vendor string from leaf 0
-    std::string brand;           // brand string, empty if the CPU does not report one
-    std::bitset<32> f_1_ecx;     // leaf 1, ECX
-    std::bitset<32> f_1_edx;     // leaf 1, EDX
-    std::bitset<32> f_7_ebx;     // leaf 7 sub-leaf 0, EBX
-    std::bitset<32> f_7_ecx;     // leaf 7 sub-leaf 0, ECX
-    std::bitset<32> f_7_edx;     // leaf 7 sub-leaf 0, EDX
-    std::bitset<32> f_7_1_eax;   // leaf 7 sub-leaf 1, EAX
-    std::bitset<32> f_81_ecx;    // leaf 0x80000001, ECX
-    std::bitset<32> f_81_edx;    // leaf 0x80000001, EDX
-};
-
-#if 0
-void test_x86_is() {
-    cpuid_x86 is;
-    printf("CPU Vendor: %s\n", is.vendor.c_str());
-    printf("Brand: %s\n", is.brand.c_str());
-    printf("is_intel: %d\n", is.is_intel);
-    printf("is_amd: %d\n", is.is_amd);
-    printf("sse3: %d\n", is.SSE3());
-    printf("pclmulqdq: %d\n", is.PCLMULQDQ());
-    printf("ssse3: %d\n", is.SSSE3());
-    printf("fma: %d\n", is.FMA());
-    printf("cmpxchg16b: %d\n", is.CMPXCHG16B());
-    printf("sse41: %d\n", is.SSE41());
-    printf("sse42: %d\n", is.SSE42());
-    printf("movbe: %d\n", is.MOVBE());
-    printf("popcnt: %d\n", is.POPCNT());
-    printf("aes: %d\n", is.AES());
-    printf("xsave: %d\n", is.XSAVE());
-    printf("osxsave: %d\n", is.OSXSAVE());
-    printf("avx: %d\n", is.AVX());
-    printf("f16c: %d\n", is.F16C());
-    printf("rdrand: %d\n", is.RDRAND());
-    printf("msr: %d\n", is.MSR());
-    printf("cx8: %d\n", is.CX8());
-    printf("sep: %d\n", is.SEP());
-    printf("cmov: %d\n", is.CMOV());
-    printf("clflush: %d\n", is.CLFSH());
-    printf("mmx: %d\n", is.MMX());
-    printf("fxsr: %d\n", is.FXSR());
-    printf("sse: %d\n", is.SSE());
-    printf("sse2: %d\n", is.SSE2());
-    printf("fsgsbase: %d\n", is.FSGSBASE());
-    printf("bmi1: %d\n", is.BMI1());
-    printf("hle: %d\n", is.HLE());
-    printf("avx2: %d\n", is.AVX2());
-    printf("bmi2: %d\n", is.BMI2());
-    printf("erms: %d\n", is.ERMS());
-    printf("invpcid: %d\n", is.INVPCID());
-    printf("rtm: %d\n", is.RTM());
-    printf("avx512f: %d\n", is.AVX512F());
-    printf("rdseed: %d\n", is.RDSEED());
-    printf("adx: %d\n", is.ADX());
-    printf("avx512pf: %d\n", is.AVX512PF());
-    printf("avx512er: %d\n", is.AVX512ER());
-    printf("avx512cd: %d\n", is.AVX512CD());
-    printf("sha: %d\n", is.SHA());
-    printf("prefetchwt1: %d\n", is.PREFETCHWT1());
-    printf("lahf: %d\n", is.LAHF());
-    printf("lzcnt: %d\n", is.LZCNT());
-    printf("abm: %d\n", is.ABM());
-    printf("sse4a: %d\n", is.SSE4a());
-    printf("xop: %d\n", is.XOP());
-    printf("tbm: %d\n", is.TBM());
-    printf("syscall: %d\n", is.SYSCALL());
-    printf("mmxext: %d\n", is.MMXEXT());
-    printf("rdtscp: %d\n", is.RDTSCP());
-    printf("3dnowext: %d\n", is._3DNOWEXT());
-    printf("3dnow: %d\n", is._3DNOW());
-    printf("avx512_vbmi: %d\n", is.AVX512_VBMI());
-    printf("avx512_vnni: %d\n", is.AVX512_VNNI());
-    printf("avx512_fp16: %d\n", is.AVX512_FP16());
-    printf("avx512_bf16: %d\n", is.AVX512_BF16());
-    printf("amx_tile: %d\n", is.AMX_TILE());
-    printf("amx_int8: %d\n", is.AMX_INT8());
-    printf("amx_fp16: %d\n", is.AMX_FP16());
-    printf("amx_bf16: %d\n", is.AMX_BF16());
-}
-#endif
-
-// Scores this build of the CPU backend against the CPU it is running on.
-//
-// For every instruction-set macro (GGML_FMA, GGML_AVX2, ...) defined at compile time, the matching
-// CPUID feature must be reported by the CPU:
-//   - if any such feature is missing, the function returns 0;
-//   - otherwise each compiled-in feature contributes its own bit (1<<0 for FMA up to 1<<11 for
-//     AMX_INT8), so the bits of later entries in the list below weigh more.
-// A build with none of the macros defined therefore also scores 0.
-static int ggml_backend_cpu_x86_score() {
-    // FIXME: this does not check for OS support
-
-    int score = 0;
-    cpuid_x86 is;
-
-#ifdef GGML_FMA
-    if (!is.FMA()) { return 0; }
-    score += 1;
-#endif
-#ifdef GGML_F16C
-    if (!is.F16C()) { return 0; }
-    score += 1<<1;
-#endif
-#ifdef GGML_SSE42
-    if (!is.SSE42()) { return 0; }
-    score += 1<<2;
-#endif
-#ifdef GGML_BMI2
-    if (!is.BMI2()) { return 0; }
-    score += 1<<3;
-#endif
-#ifdef GGML_AVX
-    if (!is.AVX()) { return 0; }
-    score += 1<<4;
-#endif
-#ifdef GGML_AVX2
-    if (!is.AVX2()) { return 0; }
-    score += 1<<5;
-#endif
-#ifdef GGML_AVX_VNNI
-    if (!is.AVX_VNNI()) { return 0; }
-    score += 1<<6;
-#endif
-#ifdef GGML_AVX512
-    // a GGML_AVX512 build requires the F, CD, VL, DQ and BW subsets together
-    if (!is.AVX512F()) { return 0; }
-    if (!is.AVX512CD()) { return 0; }
-    if (!is.AVX512VL()) { return 0; }
-    if (!is.AVX512DQ()) { return 0; }
-    if (!is.AVX512BW()) { return 0; }
-    score += 1<<7;
-#endif
-#ifdef GGML_AVX512_VBMI
-    if (!is.AVX512_VBMI()) { return 0; }
-    score += 1<<8;
-#endif
-#ifdef GGML_AVX512_BF16
-    if (!is.AVX512_BF16()) { return 0; }
-    score += 1<<9;
-#endif
-#ifdef GGML_AVX512_VNNI
-    if (!is.AVX512_VNNI()) { return 0; }
-    score += 1<<10;
-#endif
-#ifdef GGML_AMX_INT8
-    if (!is.AMX_INT8()) { return 0; }
-    score += 1<<11;
-#endif
-
-    return score;
-}
-
-GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_x86_score)
-
-#endif // defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h
@ -1,8 +0,0 @@
-#pragma once
-
-#include "ggml-cpu-traits.h"
-#include "ggml.h"
-
-// GGML internal header
-
-ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
--- a/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp
@ -1,55 +0,0 @@
-#ifdef GGML_USE_CPU_HBM
-
-#include "ggml-backend.h"
-#include "ggml-backend-impl.h"
-#include "ggml-cpu.h"
-#include "ggml-impl.h"
-
-#include "ggml-cpu-hbm.h"
-
-// buffer type HBM
-
-#include <hbwmalloc.h>
-
-// Name callback of the HBM buffer type: always reports "CPU_HBM", regardless of the handle.
-static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    GGML_UNUSED(buft);
-
-    return "CPU_HBM";
-}
-
-// free_buffer callback installed by the HBM allocator below: releases the buffer's context
-// pointer with hbw_free().
-// NOTE(review): presumably buffer->context is the pointer handed to
-// ggml_backend_cpu_buffer_from_ptr() - confirm against that function.
-static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    hbw_free(buffer->context);
-}
-
-// alloc_buffer callback of the HBM buffer type.
-//
-// Allocates `size` bytes with hbw_posix_memalign(), aligned as reported by the CPU buffer type,
-// and wraps the memory in a CPU buffer whose buffer type and free callback are replaced by the
-// HBM ones. Returns NULL (after logging an error) when the allocation fails.
-static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
-                                                                           size_t                     size) {
-    void * hbm_mem;
-
-    if (hbw_posix_memalign(&hbm_mem, ggml_backend_cpu_buffer_type_get_alignment(buft), size) != 0) {
-        GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
-        return NULL;
-    }
-
-    // start from a regular CPU buffer over the HBM memory, then redirect ownership to HBM
-    ggml_backend_buffer_t hbm_buffer = ggml_backend_cpu_buffer_from_ptr(hbm_mem, size);
-    hbm_buffer->buft              = buft;
-    hbm_buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
-
-    return hbm_buffer;
-}
-
-// Returns the HBM buffer type (a function-local static descriptor).
-//
-// Allocation goes through the HBM allocator of this file; alignment and the is_host query are
-// taken from the regular CPU buffer type.
-ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
-    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
-        /* .iface    = */ {
-                           /* .get_name         = */ ggml_backend_cpu_hbm_buffer_type_get_name,
-                           /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
-                           /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
-                           /* .get_max_size     = */ nullptr,  // defaults to SIZE_MAX
-                           /* .get_alloc_size   = */ nullptr,  // defaults to ggml_nbytes
-                           /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
-                           },
-        // both trailing members are spelled out: the initializer is positional, so a single
-        // nullptr labelled ".context" would actually initialize .device
-        /* .device   = */ nullptr,
-        /* .context  = */ nullptr,
-    };
-
-    return &ggml_backend_cpu_buffer_type_hbm;
-}
-#endif
--- a/ggml/src/ggml-cpu/ggml-cpu-hbm.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-hbm.h
@ -1,8 +0,0 @@
-#pragma once
-
-#include "ggml-backend.h"
-#include "ggml.h"
-
-// GGML CPU internal header
-
-ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
--- a/ggml/src/ggml-cpu/ggml-cpu-impl.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h
@ -1,531 +0,0 @@
-#pragma once
-
-// GGML CPU internal header
-
-#include "ggml.h"
-#include "ggml-impl.h"
-#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
-//#include <stddef.h>
-#include <stdbool.h>
-#include <string.h> // memcpy
-#include <math.h>   // fabsf
-
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Parameters describing one thread's share of a CPU computation.
-struct ggml_compute_params {
-    // ith = thread index, nth = number of threads
-    int ith, nth;
-
-    // work buffer for all threads
-    size_t wsize; // size of wdata (presumably in bytes - TODO confirm)
-    void * wdata;
-
-    // threadpool handle, e.g. the argument expected by ggml_barrier()
-    struct ggml_threadpool * threadpool;
-};
-
-
-#if defined(_MSC_VER)
-
-#define m512bh(p) p
-#define m512i(p) p
-
-#else
-
-#define m512bh(p) (__m512bh)(p)
-#define m512i(p) (__m512i)(p)
-
-#endif
-
-// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
-#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
-#ifndef __FMA__
-#define __FMA__
-#endif
-#ifndef __F16C__
-#define __F16C__
-#endif
-#endif
-
-// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
-#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
-#ifndef __SSE3__
-#define __SSE3__
-#endif
-#ifndef __SSSE3__
-#define __SSSE3__
-#endif
-#endif
-
-#if defined(__s390x__) && defined(__VEC__)
-#ifndef __VXE__
-#define __VXE__
-#endif
-#ifndef __VXE2__
-#define __VXE2__
-#endif
-#endif
-
-#if defined(__ARM_FEATURE_SVE)
-#include <arm_sve.h>
-#include <sys/prctl.h>
-#endif
-
-// 16-bit float
-// on Arm, we use __fp16
-// on x86, we use uint16_t
-#if defined(__ARM_NEON)
-
-// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
-//
-//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
-//
-#include <arm_neon.h>
-
-#ifdef _MSC_VER
-
-typedef uint16_t ggml_fp16_internal_t;
-
-#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
-
-#else
-
-typedef __fp16 ggml_fp16_internal_t;
-
-#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
-
-#endif // _MSC_VER
-
-#if !defined(__aarch64__)
-
-// 32-bit ARM compatibility
-
-// vaddlvq_s16
-// vpaddq_s16
-// vpaddq_s32
-// vaddvq_s32
-// vaddvq_f32
-// vmaxvq_f32
-// vcvtnq_s32_f32
-// vzip1_u8
-// vzip2_u8
-
-// 32-bit ARM fallback: sums all eight int16 lanes of v into a single int32
-inline static int32_t vaddlvq_s16(int16x8_t v) {
-    const int32x4_t sums32 = vpaddlq_s16(v);      // 4 pairwise sums, widened to 32 bits
-    const int64x2_t sums64 = vpaddlq_s32(sums32); // 2 pairwise sums, widened to 64 bits
-    const int32x4_t words  = vreinterpretq_s32_s64(sums64);
-    // words 0 and 2 are read as the 32-bit parts of the two 64-bit sums that hold the value
-    return vgetq_lane_s32(words, 0) + vgetq_lane_s32(words, 2);
-}
-
-// 32-bit ARM fallback: pairwise add of adjacent int16 lanes; sums of a fill the low half, sums of b the high half
-inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
-    const int16x4_t sums_a = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
-    const int16x4_t sums_b = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
-
-    return vcombine_s16(sums_a, sums_b);
-}
-
-// 32-bit ARM fallback: pairwise add of adjacent int32 lanes; sums of a fill the low half, sums of b the high half
-inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
-    const int32x2_t sums_a = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
-    const int32x2_t sums_b = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
-
-    return vcombine_s32(sums_a, sums_b);
-}
-
-// 32-bit ARM fallback: horizontal sum of the four int32 lanes of v
-inline static int32_t vaddvq_s32(int32x4_t v) {
-    const int32_t sum01 = vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1);
-    return sum01 + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
-}
-
-// 32-bit ARM fallback: horizontal sum of the four float lanes of v, accumulated in lane order 0, 1, 2, 3
-inline static float vaddvq_f32(float32x4_t v) {
-    const float sum01 = vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1);
-    return sum01 + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
-}
-
-// 32-bit ARM fallback: maximum over the four float lanes of v (lanes 0/1 and 2/3 are compared first)
-inline static float vmaxvq_f32(float32x4_t v) {
-    return MAX(
-        MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
-        MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))
-    );
-}
-
-// 32-bit ARM fallback: converts each float lane to int32 after rounding with roundf.
-// NOTE(review): roundf rounds halfway cases away from zero; the AArch64 instruction behind
-// vcvtnq_s32_f32 is documented as round-to-nearest with ties to even - confirm that callers
-// do not depend on identical results for exact .5 inputs.
-inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
-    const float f0 = vgetq_lane_f32(v, 0);
-    const float f1 = vgetq_lane_f32(v, 1);
-    const float f2 = vgetq_lane_f32(v, 2);
-    const float f3 = vgetq_lane_f32(v, 3);
-
-    int32x4_t rounded;
-
-    rounded[0] = roundf(f0);
-    rounded[1] = roundf(f1);
-    rounded[2] = roundf(f2);
-    rounded[3] = roundf(f3);
-
-    return rounded;
-}
-
-// 32-bit ARM fallback: interleaves the low four bytes of a and b -> a0 b0 a1 b1 a2 b2 a3 b3
-inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
-    uint8x8_t zipped;
-
-    for (int i = 0; i < 4; ++i) {
-        zipped[2*i + 0] = a[i];
-        zipped[2*i + 1] = b[i];
-    }
-
-    return zipped;
-}
-
-// 32-bit ARM fallback: interleaves the high four bytes of a and b -> a4 b4 a5 b5 a6 b6 a7 b7
-inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
-    uint8x8_t zipped;
-
-    for (int i = 0; i < 4; ++i) {
-        zipped[2*i + 0] = a[4 + i];
-        zipped[2*i + 1] = b[4 + i];
-    }
-
-    return zipped;
-}
-
-// vld1q_s16_x2
-// vld1q_u8_x2
-// vld1q_u8_x4
-// vld1q_s8_x2
-// vld1q_s8_x4
-// TODO: double-check these work correctly
-
-// pair of int16x8_t vectors (32-bit ARM stand-in for int16x8x2_t)
-typedef struct ggml_int16x8x2_t {
-    int16x8_t val[2];
-} ggml_int16x8x2_t;
-
-// loads 16 consecutive int16 values starting at ptr into two vectors
-inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
-    ggml_int16x8x2_t loaded = { { vld1q_s16(ptr + 0), vld1q_s16(ptr + 8) } };
-
-    return loaded;
-}
-
-// pair of uint8x16_t vectors (32-bit ARM stand-in for uint8x16x2_t)
-typedef struct ggml_uint8x16x2_t {
-    uint8x16_t val[2];
-} ggml_uint8x16x2_t;
-
-// loads 32 consecutive bytes starting at ptr into two vectors
-inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
-    ggml_uint8x16x2_t loaded = { { vld1q_u8(ptr + 0), vld1q_u8(ptr + 16) } };
-
-    return loaded;
-}
-
-// four uint8x16_t vectors (32-bit ARM stand-in for uint8x16x4_t)
-typedef struct ggml_uint8x16x4_t {
-    uint8x16_t val[4];
-} ggml_uint8x16x4_t;
-
-// loads 64 consecutive bytes starting at ptr into four vectors
-inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
-    ggml_uint8x16x4_t loaded;
-
-    for (int i = 0; i < 4; ++i) {
-        loaded.val[i] = vld1q_u8(ptr + 16*i);
-    }
-
-    return loaded;
-}
-
-// pair of int8x16_t vectors (32-bit ARM stand-in for int8x16x2_t)
-typedef struct ggml_int8x16x2_t {
-    int8x16_t val[2];
-} ggml_int8x16x2_t;
-
-// loads 32 consecutive signed bytes starting at ptr into two vectors
-inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
-    ggml_int8x16x2_t loaded = { { vld1q_s8(ptr + 0), vld1q_s8(ptr + 16) } };
-
-    return loaded;
-}
-
-// four int8x16_t vectors (32-bit ARM stand-in for int8x16x4_t)
-typedef struct ggml_int8x16x4_t {
-    int8x16_t val[4];
-} ggml_int8x16x4_t;
-
-// loads 64 consecutive signed bytes starting at ptr into four vectors
-inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
-    ggml_int8x16x4_t loaded;
-
-    for (int i = 0; i < 4; ++i) {
-        loaded.val[i] = vld1q_s8(ptr + 16*i);
-    }
-
-    return loaded;
-}
-
-// NOTE: not tested
-// 32-bit ARM fallback for a byte table lookup: lane i of the result is a[b[i]].
-// Indices in b are used as-is, without range checking.
-inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
-    int8x16_t picked;
-
-    for (int i = 0; i < 16; ++i) {
-        picked[i] = a[b[i]];
-    }
-
-    return picked;
-}
-
-// NOTE: not tested
-// 32-bit ARM fallback for a byte table lookup: lane i of the result is a[b[i]].
-// Indices in b are used as-is, without range checking.
-inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
-    uint8x16_t picked;
-
-    for (int i = 0; i < 16; ++i) {
-        picked[i] = a[b[i]];
-    }
-
-    return picked;
-}
-
-#else
-
-#define ggml_int16x8x2_t  int16x8x2_t
-#define ggml_uint8x16x2_t uint8x16x2_t
-#define ggml_uint8x16x4_t uint8x16x4_t
-#define ggml_int8x16x2_t  int8x16x2_t
-#define ggml_int8x16x4_t  int8x16x4_t
-
-#define ggml_vld1q_s16_x2 vld1q_s16_x2
-#define ggml_vld1q_u8_x2  vld1q_u8_x2
-#define ggml_vld1q_u8_x4  vld1q_u8_x4
-#define ggml_vld1q_s8_x2  vld1q_s8_x2
-#define ggml_vld1q_s8_x4  vld1q_s8_x4
-#define ggml_vqtbl1q_s8   vqtbl1q_s8
-#define ggml_vqtbl1q_u8   vqtbl1q_u8
-
-#endif // !defined(__aarch64__)
-
-#if !defined(__ARM_FEATURE_DOTPROD)
-
-// Fallback used when __ARM_FEATURE_DOTPROD is not defined:
-// widening int8 multiplies followed by pairwise widening adds, accumulated onto acc.
-inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
-    const int16x8_t prod_lo = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
-    const int16x8_t prod_hi = vmull_s8(vget_high_s8(a), vget_high_s8(b));
-
-    const int32x4_t sums_lo = vpaddlq_s16(prod_lo);
-    const int32x4_t sums_hi = vpaddlq_s16(prod_hi);
-
-    return vaddq_s32(acc, vaddq_s32(sums_lo, sums_hi));
-}
-
-#else
-
-#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
-
-#endif // !defined(__ARM_FEATURE_DOTPROD)
-
-#endif // defined(__ARM_NEON)
-
-#ifdef __wasm_simd128__
-#include <wasm_simd128.h>
-#else
-#ifdef __POWER9_VECTOR__
-#include <altivec.h>
-#undef bool
-#define bool _Bool
-#else
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <intrin.h>
-#else
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
-#if !defined(__riscv)
-#include <immintrin.h>
-#endif
-#endif
-#endif
-#endif
-#endif
-
-#ifdef __riscv_v_intrinsic
-#include <riscv_vector.h>
-#endif
-
-#if defined(__loongarch64)
-#if defined(__loongarch_asx)
-#include <lasxintrin.h>
-#endif
-#if defined(__loongarch_sx)
-#include <lsxintrin.h>
-#endif
-#endif
-
-#if defined(__VXE__) || defined(__VXE2__)
-#include <vecintrin.h>
-
-#define vec_neg(a)    (-(a))                // Vector Negate
-#define vec_add(a, b) ((a) + (b))           // Vector Add
-#define vec_sub(a, b) ((a) - (b))           // Vector Subtract
-#define vec_mul(a, b) ((a) * (b))           // Vector Multiply
-#define vec_div(a, b) ((a) / (b))           // Vector Divide
-#define vec_sl(a, b)  ((a) << (b))          // Vector Shift Left
-#define vec_sra(a, b) ((a) >> (b))          // Vector Shift Right Algebraic
-#define vec_sr(a, b)  ((a) >> (b))          // Vector Shift Right
-#define vec_slo(a, b) vec_slb(a, (b) << 64) // Vector Shift Left by Octet
-#define vec_sro(a, b) vec_srb(a, (b) << 64) // Vector Shift Right by Octet
-
-#ifndef vec_and
-#define vec_and(a, b) ((a) & (b)) // Vector AND
-#endif
-
-#ifndef vec_or
-#define vec_or(a, b)  ((a) | (b)) // Vector OR
-#endif
-
-#ifndef vec_xor
-#define vec_xor(a, b) ((a) ^ (b)) // Vector XOR
-#endif
-
-// 16-byte vector types built on the GNU vector_size extension.
-// The attribute is spelled `__attribute__` throughout, matching the rest of the file.
-typedef signed char char8x16_t __attribute__((vector_size(16)));
-typedef unsigned char uchar8x16_t __attribute__((vector_size(16)));
-
-typedef int8_t  int8x16_t __attribute__((vector_size(16)));
-typedef int16_t int16x8_t __attribute__((vector_size(16)));
-typedef int32_t int32x4_t __attribute__((vector_size(16)));
-
-typedef uint8_t  uint8x16_t __attribute__((vector_size(16)));
-typedef uint16_t uint16x8_t __attribute__((vector_size(16)));
-typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
-
-typedef float float32x4_t __attribute__((vector_size(16)));
-typedef double double64x2_t __attribute__((vector_size(16)));
-
-typedef signed long long long64x2_t __attribute__((vector_size(16)));
-typedef unsigned long long ulong64x2_t __attribute__((vector_size(16)));
-
-// pair of uint8x16_t vectors
-typedef struct ggml_uint8x16x2_t {
-    uint8x16_t val[2];
-} ggml_uint8x16x2_t;
-
-// loads two vectors with vec_xl, at offsets 0 and 16 from ptr
-inline static ggml_uint8x16x2_t ggml_vec_xl_u8x2(const uint8_t * ptr) {
-    ggml_uint8x16x2_t loaded;
-
-    for (int i = 0; i < 2; ++i) {
-        loaded.val[i] = vec_xl(16*i, ptr);
-    }
-
-    return loaded;
-}
-
-// four uint8x16_t vectors
-typedef struct ggml_uint8x16x4_t {
-    uint8x16_t val[4];
-} ggml_uint8x16x4_t;
-
-// loads four vectors with vec_xl, at offsets 0, 16, 32 and 48 from ptr
-inline static ggml_uint8x16x4_t ggml_vec_xl_u8x4(const uint8_t * ptr) {
-    ggml_uint8x16x4_t loaded;
-
-    for (int i = 0; i < 4; ++i) {
-        loaded.val[i] = vec_xl(16*i, ptr);
-    }
-
-    return loaded;
-}
-
-// four int8x16_t vectors
-typedef struct ggml_int8x16x4_t {
-    int8x16_t val[4];
-} ggml_int8x16x4_t;
-
-// loads four vectors with vec_xl, at offsets 0, 16, 32 and 48 from ptr
-inline static ggml_int8x16x4_t ggml_vec_xl_s8x4(const int8_t * ptr) {
-    ggml_int8x16x4_t loaded;
-
-    for (int i = 0; i < 4; ++i) {
-        loaded.val[i] = vec_xl(16*i, ptr);
-    }
-
-    return loaded;
-}
-
-// pair of int16x8_t vectors
-typedef struct ggml_int16x8x2_t {
-    int16x8_t val[2];
-} ggml_int16x8x2_t;
-
-// loads two vectors with vec_xl, at offsets 0 and 16 from ptr
-inline static ggml_int16x8x2_t ggml_vec_xl_s16x2(const int16_t * ptr) {
-    ggml_int16x8x2_t loaded;
-
-    for (int i = 0; i < 2; ++i) {
-        loaded.val[i] = vec_xl(16*i, ptr);
-    }
-
-    return loaded;
-}
-
-/*
-    ! WARNING: Very slow. Use vec_perm if possible. Refer to iq4_xs
-    !          or iq4_nl for example implementation.
-
-    Scalar byte table lookup: lane i of the result is a[b[i]].
-*/
-inline static int8x16_t ggml_vec_tbl(int8x16_t a, uint8x16_t b) {
-    int8x16_t picked;
-
-    for (int i = 0; i < 16; ++i) {
-        picked[i] = a[b[i]];
-    }
-
-    return picked;
-}
-
-// Adds two halfword selections of (a, b): one gathered by vec_perm with a mask that picks
-// bytes {0,1}, {4,5}, ... of the a|b concatenation, the other produced by vec_pack on the
-// inputs reinterpreted as 32-bit lanes.
-inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
-    const uchar8x16_t sel_bytes = {  0,  1,  4,  5,  8,  9, 12, 13,
-                                    16, 17, 20, 21, 24, 25, 28, 29 };
-
-    const int16x8_t packed   = vec_pack((int32x4_t)a, (int32x4_t)b);
-    const int16x8_t permuted = vec_perm(a, b, sel_bytes);
-
-    return packed + permuted;
-}
-
-// int8 dot-product step: sums the vec_mule/vec_mulo products of a and b, widens the
-// 16-bit sums with vec_unpackh/vec_unpackl and accumulates them onto acc.
-inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
-    const int16x8_t pair_sums = vec_mule(a, b) + vec_mulo(a, b);
-
-    const int32x4_t widened = vec_unpackh(pair_sums) + vec_unpackl(pair_sums);
-
-    return acc + widened;
-}
-
-#endif
-
-#if defined(__loongarch_asx)
-/* float type data load instructions */
-// broadcasts val to all four float lanes of a 128-bit vector
-static __m128 __lsx_vreplfr2vr_s(const float val) {
-    const v4f32 splat = { val, val, val, val };
-
-    return (__m128) splat;
-}
-
-// broadcasts val to all eight float lanes of a 256-bit vector
-static __m256 __lasx_xvreplfr2vr_s(const float val) {
-    const v8f32 splat = { val, val, val, val, val, val, val, val };
-
-    return (__m256) splat;
-}
-#endif
-
-// TODO: move to ggml-threading
-void ggml_barrier(struct ggml_threadpool * tp);
-
-#ifdef __cplusplus
-}
-#endif
--- a/ggml/src/ggml-cpu/ggml-cpu-quants.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c
--- a/ggml/src/ggml-cpu/ggml-cpu-quants.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-quants.h
@ -1,63 +0,0 @@
-#pragma once
-
-#define GGML_COMMON_DECL_C
-#include "ggml-common.h"
-
-#include "ggml.h"
-
-// GGML CPU internal header
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Quantization
-void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-
-void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-
-void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-
-void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-
-// Dot product
-void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-
-void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-
-void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-
-void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq2_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq1_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq1_m_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq3_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-
-#ifdef __cplusplus
-}
-#endif
--- a/ggml/src/ggml-cpu/ggml-cpu-traits.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu-traits.cpp
@ -1,36 +0,0 @@
-#include "ggml-cpu-traits.h"
-
-#include "ggml-backend-impl.h"
-#include "ggml-backend.h"
-
-namespace ggml::cpu {
-// out-of-line definitions of the virtual destructors declared in ggml-cpu-traits.h
-tensor_traits::~tensor_traits() = default;
-
-extra_buffer_type::~extra_buffer_type() = default;
-}  // namespace ggml::cpu
-
-// Offers `op` to every extra buffer type in list order.
-// Returns true as soon as one of them provides tensor traits whose compute_forward
-// reports that it handled the op; returns false when none does.
-bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
-    for (ggml_backend_buffer_type_t buft : ggml_backend_cpu_get_extra_buffers_type()) {
-        // skip NULL entries and buffer types without a context
-        if (!buft || !buft->context) {
-            continue;
-        }
-
-        auto * extra_buft = (ggml::cpu::extra_buffer_type *) buft->context;
-        auto * traits     = extra_buft->get_tensor_traits(op);
-        if (traits && traits->compute_forward(params, op)) {
-            return true;
-        }
-    }
-    return false;
-}
-
-// Asks every extra buffer type in list order for the work size of `op`.
-// Returns true as soon as one of them provides tensor traits whose work_size returns true
-// (that call receives *size by reference); returns false when none does.
-bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
-    for (ggml_backend_buffer_type_t buft : ggml_backend_cpu_get_extra_buffers_type()) {
-        // skip NULL entries and buffer types without a context
-        if (!buft || !buft->context) {
-            continue;
-        }
-
-        auto * extra_buft = (ggml::cpu::extra_buffer_type *) buft->context;
-        auto * traits     = extra_buft->get_tensor_traits(op);
-        if (traits && traits->work_size(n_threads, op, *size)) {
-            return true;
-        }
-    }
-    return false;
-}
--- a/ggml/src/ggml-cpu/ggml-cpu-traits.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-traits.h
@ -1,38 +0,0 @@
-#pragma once
-#include "ggml-backend-impl.h"
-#include "ggml-cpu-impl.h"
-#include "ggml.h"
-
-#ifdef __cplusplus
-#    include <vector>
-extern "C" {
-#endif
-
-// return true if op part of extra "accelerator"
-bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op);
-bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size);
-
-#ifdef __cplusplus
-}
-
-namespace ggml::cpu {
-// register in tensor->extra
-// Interface for op-specific handling of a tensor.
-class tensor_traits {
-  public:
-    virtual ~tensor_traits();
-    // receives `size` by reference; ggml_cpu_extra_work_size() stops at the first traits
-    // object for which this returns true (presumably `size` then holds the required
-    // work buffer size - TODO confirm against implementations)
-    virtual bool work_size(int n_threads, const struct ggml_tensor * op, size_t & size)        = 0;
-    // ggml_cpu_extra_compute_forward() stops at the first traits object for which this returns true
-    virtual bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) = 0;
-};
-
-// Interface stored in the `context` of an extra CPU buffer type.
-class extra_buffer_type {
-  public:
-    virtual ~extra_buffer_type();
-    virtual bool            supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) = 0;
-    // callers check the result for NULL before using it
-    virtual tensor_traits * get_tensor_traits(const struct ggml_tensor * op)                   = 0;
-};
-}  // namespace ggml::cpu
-
-// implemented in ggml-cpu.cpp.
-std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();
-
-#endif
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@ -1,656 +0,0 @@
-#include "ggml-backend.h"
-#include "ggml-backend-impl.h"
-#include "ggml-cpu.h"
-#include "ggml-cpu-aarch64.h"
-#include "ggml-cpu-traits.h"
-#include "ggml-impl.h"
-#include "amx/amx.h"
-
-#include <cctype>
-#include <cstdio>
-#include <cstring>
-#include <new>
-#include <string>
-#include <vector>
-
-#ifdef GGML_USE_CPU_HBM
-#include "ggml-cpu-hbm.h"
-#endif
-
-#ifdef GGML_USE_CPU_KLEIDIAI
-#include "kleidiai/kleidiai.h"
-#endif
-
-#if defined(__APPLE__)
-#include <sys/types.h>
-#include <sys/sysctl.h>
-#endif
-
-#if defined(_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-    #define NOMINMAX
-#endif
-#include <windows.h>
-#endif
-
-// ggml-backend interface
-
-// Returns the list of extra CPU buffer types enabled at compile time (AMX, KleidiAI, AArch64),
-// skipping any whose getter returns NULL. The list is built once on first use
-// (function-local static) and always ends with a NULL entry.
-std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type() {
-    static std::vector<ggml_backend_buffer_type_t> bufts = []() {
-        std::vector<ggml_backend_buffer_type_t> available;
-
-#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
-        if (ggml_backend_amx_buffer_type()) {
-            available.push_back(ggml_backend_amx_buffer_type());
-        }
-#endif
-
-#ifdef GGML_USE_CPU_KLEIDIAI
-        if (ggml_backend_cpu_kleidiai_buffer_type()) {
-            available.push_back(ggml_backend_cpu_kleidiai_buffer_type());
-        }
-#endif
-
-#ifdef GGML_USE_CPU_AARCH64
-        if (ggml_backend_cpu_aarch64_buffer_type()) {
-            available.push_back(ggml_backend_cpu_aarch64_buffer_type());
-        }
-#endif
-
-        // NULL entry marking the end of the list when it is handed out as a raw array
-        available.push_back(NULL);
-
-        return available;
-    }();
-
-    return bufts;
-}
-
-// Exposes the extra buffer type list as a raw array; the list's last entry is NULL.
-static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) {
-    GGML_UNUSED(device);
-
-    std::vector<ggml_backend_buffer_type_t> & extra_bufts = ggml_backend_cpu_get_extra_buffers_type();
-    return extra_bufts.data();
-}
-
-// Returns true when buft is one of the non-NULL entries of the extra buffer type list.
-static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
-    for (ggml_backend_buffer_type_t candidate : ggml_backend_cpu_get_extra_buffers_type()) {
-        if (!candidate) {
-            continue; // a NULL entry never matches, even when buft is NULL
-        }
-        if (candidate == buft) {
-            return true;
-        }
-    }
-    return false;
-}
-
-// CPU backend - backend (stream)
-
-// Per-backend state of a CPU backend instance.
-struct ggml_backend_cpu_context {
-    int                 n_threads;           // passed to ggml_graph_plan
-    ggml_threadpool_t   threadpool;          // passed to ggml_graph_plan, may be NULL
-
-    uint8_t *           work_data;           // work buffer reused across graph_compute calls, released with delete[]
-    size_t              work_size;           // current size of work_data
-
-    ggml_abort_callback abort_callback;      // copied into every cplan
-    void *              abort_callback_data; // copied into every cplan
-};
-
-// The backend name is the constant "CPU".
-static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
-    GGML_UNUSED(backend);
-
-    return "CPU";
-}
-
-// Destroys a CPU backend: releases the work buffer, then the context, then the backend object.
-static void ggml_backend_cpu_free(ggml_backend_t backend) {
-    auto * ctx = static_cast<ggml_backend_cpu_context *>(backend->context);
-
-    delete[] ctx->work_data;
-    delete ctx;
-    delete backend;
-}
-
-// A graph plan of the CPU backend: the compute plan plus a copy of the graph struct it was created for.
-struct ggml_backend_plan_cpu {
-    struct ggml_cplan cplan;
-    struct ggml_cgraph cgraph; // copied by assignment from the caller's graph
-};
-
-// Creates a reusable plan for `cgraph` from the backend's thread count and threadpool.
-// The plan owns its work buffer (released in ggml_backend_cpu_graph_plan_free) and inherits
-// the abort callback currently set on the backend.
-// Returns NULL when an allocation fails.
-static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
-    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
-
-    // nothrow: a plain new-expression throws std::bad_alloc instead of returning NULL,
-    // which would turn the NULL checks below into dead code
-    struct ggml_backend_plan_cpu * cpu_plan = new (std::nothrow) ggml_backend_plan_cpu;
-    if (cpu_plan == NULL) {
-        return NULL;
-    }
-
-    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
-    cpu_plan->cgraph = *cgraph; // FIXME: deep copy
-
-    if (cpu_plan->cplan.work_size > 0) {
-        cpu_plan->cplan.work_data = new (std::nothrow) uint8_t[cpu_plan->cplan.work_size];
-        if (cpu_plan->cplan.work_data == NULL) {
-            delete cpu_plan;
-            return NULL;
-        }
-    }
-
-    cpu_plan->cplan.abort_callback      = cpu_ctx->abort_callback;
-    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
-
-    return cpu_plan;
-}
-
-// Releases a plan created by ggml_backend_cpu_graph_plan_create together with its work buffer.
-static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    GGML_UNUSED(backend);
-
-    auto * cpu_plan = static_cast<ggml_backend_plan_cpu *>(plan);
-
-    delete[] cpu_plan->cplan.work_data;
-    delete cpu_plan;
-}
-
-// Runs the graph stored in the plan with the plan's compute settings.
-static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    GGML_UNUSED(backend);
-
-    auto * cpu_plan = static_cast<ggml_backend_plan_cpu *>(plan);
-
-    return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
-}
-
-// Plans and computes `cgraph` in one step, reusing the work buffer cached in the backend context.
-// The cached buffer only grows: it is reallocated when the new plan needs more than work_size.
-// Returns GGML_STATUS_ALLOC_FAILED when the work buffer cannot be allocated; otherwise the
-// status reported by ggml_graph_compute.
-static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
-
-    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
-
-    if (cpu_ctx->work_size < cplan.work_size) {
-        delete[] cpu_ctx->work_data;
-        // nothrow: a plain new-expression throws std::bad_alloc instead of returning NULL, which
-        // would leave work_data dangling (already deleted) with a stale work_size and make the
-        // failure branch below unreachable
-        cpu_ctx->work_data = new (std::nothrow) uint8_t[cplan.work_size];
-        if (cpu_ctx->work_data == NULL) {
-            cpu_ctx->work_size = 0;
-            return GGML_STATUS_ALLOC_FAILED;
-        }
-        cpu_ctx->work_size = cplan.work_size;
-    }
-    cplan.work_data = (uint8_t *)cpu_ctx->work_data;
-
-    cplan.abort_callback      = cpu_ctx->abort_callback;
-    cplan.abort_callback_data = cpu_ctx->abort_callback_data;
-
-    return ggml_graph_compute(cgraph, &cplan);
-}
-
-// Backend interface table of the CPU backend.
-// Only naming, destruction and graph planning/computation are provided; the async tensor
-// transfer, synchronize, plan update and event entries are left NULL.
-static const struct ggml_backend_i ggml_backend_cpu_i = {
-    /* .get_name                = */ ggml_backend_cpu_get_name,
-    /* .free                    = */ ggml_backend_cpu_free,
-    /* .set_tensor_async        = */ NULL,
-    /* .get_tensor_async        = */ NULL,
-    /* .cpy_tensor_async        = */ NULL,
-    /* .synchronize             = */ NULL,
-    /* .graph_plan_create       = */ ggml_backend_cpu_graph_plan_create,
-    /* .graph_plan_free         = */ ggml_backend_cpu_graph_plan_free,
-    /* .graph_plan_update       = */ NULL,
-    /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
-    /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
-    /* .event_record            = */ NULL,
-    /* .event_wait              = */ NULL,
-};
-
-// Returns the GUID identifying CPU backends; ggml_backend_is_cpu() compares a backend's guid against it.
-static ggml_guid_t ggml_backend_cpu_guid(void) {
-    static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
-    return &guid;
-}
-
-// Creates a CPU backend with default settings: GGML_DEFAULT_N_THREADS threads, no threadpool,
-// no cached work buffer and no abort callback.
-// Returns NULL when an allocation fails. The backend is destroyed through its `free` interface
-// entry (ggml_backend_cpu_free).
-ggml_backend_t ggml_backend_cpu_init(void) {
-    // initialize CPU backend now to avoid slowing the first graph computation
-    ggml_cpu_init();
-
-    // nothrow: a plain new-expression throws std::bad_alloc instead of returning NULL,
-    // which would turn the NULL checks below into dead code
-    struct ggml_backend_cpu_context * ctx = new (std::nothrow) ggml_backend_cpu_context;
-    if (ctx == NULL) {
-        return NULL;
-    }
-
-    ctx->n_threads           = GGML_DEFAULT_N_THREADS;
-    ctx->threadpool          = NULL;
-    ctx->work_data           = NULL;
-    ctx->work_size           = 0;
-    ctx->abort_callback      = NULL;
-    ctx->abort_callback_data = NULL;
-
-    ggml_backend_t cpu_backend = new (std::nothrow) ggml_backend {
-        /* .guid      = */ ggml_backend_cpu_guid(),
-        /* .interface = */ ggml_backend_cpu_i,
-        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
-        /* .context   = */ ctx,
-    };
-
-    if (cpu_backend == NULL) {
-        delete ctx;
-        return NULL;
-    }
-
-    return cpu_backend;
-}
-
-// Returns true when backend is non-NULL and its guid matches the CPU backend guid.
-bool ggml_backend_is_cpu(ggml_backend_t backend) {
-    if (backend == NULL) {
-        return false;
-    }
-    return ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
-}
-
-// Sets the thread count used for subsequent graph plans. Asserts that backend_cpu is a CPU backend.
-void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
-    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
-
-    auto * cpu_ctx = static_cast<ggml_backend_cpu_context *>(backend_cpu->context);
-    cpu_ctx->n_threads = n_threads;
-}
-
-// Sets the threadpool used for subsequent graph plans. Asserts that backend_cpu is a CPU backend.
-// A previously set threadpool that differs from the new one is paused before being replaced.
-void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
-    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
-
-    auto * cpu_ctx = static_cast<ggml_backend_cpu_context *>(backend_cpu->context);
-
-    ggml_threadpool_t previous = cpu_ctx->threadpool;
-    if (previous && previous != threadpool) {
-        // already had a different threadpool, pause/suspend it before switching
-        ggml_threadpool_pause(previous);
-    }
-
-    cpu_ctx->threadpool = threadpool;
-}
-
-// Stores the abort callback and its user data; both are copied into every plan created afterwards.
-// Asserts that backend_cpu is a CPU backend.
-void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
-    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
-
-    auto * cpu_ctx = static_cast<ggml_backend_cpu_context *>(backend_cpu->context);
-
-    cpu_ctx->abort_callback_data = abort_callback_data;
-    cpu_ctx->abort_callback      = abort_callback;
-}
-
-// CPU backend - device
-
-// Context of the CPU device: holds a human-readable CPU description.
-// The constructor queries the OS once (sysctl on Apple, /proc/cpuinfo on Linux, the registry
-// on Windows); the description stays "CPU" when the query fails or yields an empty string.
-struct ggml_backend_cpu_device_context {
-    std::string description = "CPU";
-
-    ggml_backend_cpu_device_context() {
-#ifdef __APPLE__
-        size_t len = 0;
-        if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0) && len > 0) {
-            // query into a temporary so a failed second call cannot clobber the default
-            std::string brand(len, '\0');
-            if (!sysctlbyname("machdep.cpu.brand_string", &brand[0], &len, NULL, 0)) { // NOLINT
-                // cut at the first NUL so no terminator ends up inside the std::string
-                const size_t nul = brand.find('\0');
-                if (nul != std::string::npos) {
-                    brand.resize(nul);
-                }
-                if (!brand.empty()) {
-                    description = brand;
-                }
-            }
-        }
-#elif defined(__linux__)
-        FILE * f = fopen("/proc/cpuinfo", "r");
-        if (f) {
-            char buf[1024];
-            while (fgets(buf, sizeof(buf), f)) {
-                if (strncmp(buf, "model name", 10) == 0) {
-                    char * p = strchr(buf, ':');
-                    if (p) {
-                        p++;
-                        // cast to unsigned char: std::isspace is undefined for negative char values
-                        while (std::isspace((unsigned char) *p)) {
-                            p++;
-                        }
-                        // trim trailing whitespace by length; never index p[strlen(p) - 1] on an empty string
-                        size_t n = strlen(p);
-                        while (n > 0 && std::isspace((unsigned char) p[n - 1])) {
-                            n--;
-                        }
-                        if (n > 0) {
-                            description.assign(p, n);
-                        }
-                        break;
-                    }
-                }
-            }
-            fclose(f);
-        }
-#elif defined(_WIN32)
-        HKEY hKey;
-        if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
-                        TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
-                        0,
-                        KEY_READ,
-                        &hKey) == ERROR_SUCCESS) {
-            DWORD cpu_brand_size = 0;
-            if (RegQueryValueExA(hKey,
-                                "ProcessorNameString",
-                                NULL,
-                                NULL,
-                                NULL,
-                                &cpu_brand_size) == ERROR_SUCCESS && cpu_brand_size > 0) {
-                // query into a temporary so a failed second call cannot clobber the default
-                std::string brand(cpu_brand_size, '\0');
-                if (RegQueryValueExA(hKey,
-                                    "ProcessorNameString",
-                                    NULL,
-                                    NULL,
-                                    (LPBYTE)&brand[0], // NOLINT
-                                    &cpu_brand_size) == ERROR_SUCCESS) {
-                    const size_t nul = brand.find('\0');
-                    if (nul != std::string::npos) {
-                        brand.resize(nul);
-                    }
-                    if (!brand.empty()) {
-                        description = brand;
-                    }
-                }
-            }
-            RegCloseKey(hKey);
-        }
-#endif
-    }
-};
-
-// Name of the CPU device: always the literal "CPU"; `dev` is not consulted.
-static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
-    GGML_UNUSED(dev);
-
-    return "CPU";
-}
-
-// Returns the description string stored in the device's context.
-// The pointer refers to storage owned by that context object.
-static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
-    const auto * device_ctx = (const struct ggml_backend_cpu_device_context *) dev->context;
-    return device_ctx->description.c_str();
-}
-
-// Memory query for the CPU device. Not implemented yet: both outputs are set to 0.
-static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    GGML_UNUSED(dev);
-
-    // TODO
-    *total = 0;
-    *free  = 0;
-}
-
-// Device type of the CPU device: always GGML_BACKEND_DEVICE_TYPE_CPU; `dev` is not consulted.
-static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
-    GGML_UNUSED(dev);
-
-    return GGML_BACKEND_DEVICE_TYPE_CPU;
-}
-
-// Fills `props` for the CPU device: fixed capability flags plus the values
-// returned by the individual name/description/type/memory getters.
-static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
-    props->caps = {
-        /* .async                 = */ false,
-        /* .host_buffer           = */ false,
-        /* .buffer_from_host_ptr  = */ true,
-        /* .events                = */ false,
-    };
-
-    ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
-
-    props->type        = ggml_backend_cpu_device_get_type(dev);
-    props->description = ggml_backend_cpu_device_get_description(dev);
-    props->name        = ggml_backend_cpu_device_get_name(dev);
-}
-
-// Creates a CPU backend via ggml_backend_cpu_init(); `dev` and `params` are ignored.
-static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) {
-    GGML_UNUSED(params);
-    GGML_UNUSED(dev);
-
-    return ggml_backend_cpu_init();
-}
-
-// Buffer type of the CPU device: whatever ggml_backend_cpu_buffer_type() returns; `dev` is ignored.
-static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) {
-    GGML_UNUSED(dev);
-
-    return ggml_backend_cpu_buffer_type();
-}
-
-// Builds a CPU buffer over `ptr`/`size` through ggml_backend_cpu_buffer_from_ptr();
-// `dev` and `max_tensor_size` are ignored.
-static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
-    GGML_UNUSED(max_tensor_size);
-    GGML_UNUSED(dev);
-
-    return ggml_backend_cpu_buffer_from_ptr(ptr, size);
-}
-
-// Reports whether the CPU device can execute `op`.
-// Checks run in this order: layout-only ops, extra buffer types, host-buffer requirement,
-// then per-operation type restrictions. Ops without a dedicated case are reported as supported.
-static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
-    const struct ggml_tensor * src0 = op->src[0];
-    const struct ggml_tensor * src1 = op->src[1];
-
-    // these ops are accepted unconditionally, before any buffer or type check
-    if (op->op == GGML_OP_NONE || op->op == GGML_OP_RESHAPE || op->op == GGML_OP_VIEW || op->op == GGML_OP_PERMUTE || op->op == GGML_OP_TRANSPOSE) {
-        return true;
-    }
-
-    // extra_buffer_op? the first extra buffer type that claims the op makes it supported
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
-        if (extra) {
-            auto buf_extra = (ggml::cpu::extra_buffer_type*) extra->context;
-            if (buf_extra && buf_extra->supports_op(dev, op)) {
-                return true;
-            }
-        }
-    }
-
-    // all remaining cases require every source that already has a buffer to be in a host buffer
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        if (op->src[i] && op->src[i]->buffer && !ggml_backend_buft_is_host(op->src[i]->buffer->buft)) {
-            return false;
-        }
-    }
-
-    switch (op->op) {
-        case GGML_OP_CPY:
-            // rejected when the result type is one of the listed IQ types
-            return
-                op->type != GGML_TYPE_IQ3_XXS &&
-                op->type != GGML_TYPE_IQ3_S   &&
-                op->type != GGML_TYPE_IQ2_XXS &&
-                op->type != GGML_TYPE_IQ2_XS  &&
-                op->type != GGML_TYPE_IQ2_S   &&
-                op->type != GGML_TYPE_IQ1_S   &&
-                op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
-        case GGML_OP_MUL_MAT:
-            // src1 must be F32 or already in the vec_dot_type listed in src0's CPU type traits
-            return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
-        case GGML_OP_SOFT_MAX_BACK: {
-            if (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type != GGML_TYPE_F32) {
-                return false;
-            }
-            float max_bias = 0.0f;
-
-            // max_bias is read from the second float-sized slot of op_params
-            memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float));
-
-            // only the max_bias == 0 case is accepted
-            return max_bias == 0.0f;
-        }
-        case GGML_OP_IM2COL_BACK:
-            return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
-        case GGML_OP_OUT_PROD:
-            // src0 is F32, or quantized with ne[2]/ne[3] equal to src1's; src1 and the result are F32
-            return (src0->type == GGML_TYPE_F32 || (ggml_is_quantized(src0->type) && src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3])) &&
-                src1->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
-        default:
-            return true;
-    }
-}
-
-// A buffer type is usable by the CPU device when it is a host buffer type
-// or one of the CPU extra buffer types; `dev` is not consulted.
-static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    GGML_UNUSED(dev);
-
-    if (ggml_backend_buft_is_host(buft)) {
-        return true;
-    }
-    return ggml_backend_cpu_is_extra_buffer_type(buft);
-}
-
-// Device interface table for the CPU device. Entries are positional, so their order must
-// follow the member order of struct ggml_backend_device_i (the comments name each slot).
-// Slots set to NULL have no CPU implementation in this table.
-static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
-    /* .get_name             = */ ggml_backend_cpu_device_get_name,
-    /* .get_description      = */ ggml_backend_cpu_device_get_description,
-    /* .get_memory           = */ ggml_backend_cpu_device_get_memory,
-    /* .get_type             = */ ggml_backend_cpu_device_get_type,
-    /* .get_props            = */ ggml_backend_cpu_device_get_props,
-    /* .init_backend         = */ ggml_backend_cpu_device_init_backend,
-    /* .get_buffer_type      = */ ggml_backend_cpu_device_get_buffer_type,
-    /* .get_host_buffer_type = */ NULL,
-    /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
-    /* .supports_op          = */ ggml_backend_cpu_device_supports_op,
-    /* .supports_buft        = */ ggml_backend_cpu_device_supports_buft,
-    /* .offload_op           = */ NULL,
-    /* .event_new            = */ NULL,
-    /* .event_free           = */ NULL,
-    /* .event_synchronize    = */ NULL,
-};
-
-// CPU backend - backend (reg)
-
-// Name of the CPU backend registry: always the literal "CPU"; `reg` is not consulted.
-static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
-    GGML_UNUSED(reg);
-
-    return "CPU";
-}
-
-// The CPU backend registry exposes exactly one device; `reg` is not consulted.
-static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
-    GGML_UNUSED(reg);
-
-    return 1;
-}
-
-// Returns the single CPU device of the registry; `index` must be 0 (asserted).
-static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
-    GGML_ASSERT(index == 0);
-
-    // function-local statics: one context and one device object are shared by all calls.
-    // They are initialized on the first call, so the device's `reg` field holds the `reg`
-    // argument of that first call.
-    static ggml_backend_cpu_device_context ctx;
-    static ggml_backend_device ggml_backend_cpu_device = {
-        /* .iface   = */ ggml_backend_cpu_device_i,
-        /* .reg     = */ reg,
-        /* .context = */ &ctx,
-    };
-
-    return &ggml_backend_cpu_device;
-}
-
-// Returns the CPU feature list as an array terminated by a { nullptr, nullptr } entry.
-// The list is built once, on the first call, and reused afterwards; `reg` is not consulted.
-// This is intended to replace the ggml_cpu_has_* functions when loading the CPU backend dynamically,
-// and additionally to allow other backends to expose their own list of features that applications can query using the same API
-static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) {
-    GGML_UNUSED(reg);
-
-    static std::vector<ggml_backend_feature> features = []() {
-        ggml_cpu_init();
-
-        std::vector<ggml_backend_feature> list;
-
-        // appends { name, "1" } when the corresponding probe reported the feature
-        auto add_if = [&list](bool present, const char * name) {
-            if (present) {
-                list.push_back({ name, "1" });
-            }
-        };
-
-        add_if(ggml_cpu_has_sse3(),        "SSE3");
-        add_if(ggml_cpu_has_ssse3(),       "SSSE3");
-        add_if(ggml_cpu_has_avx(),         "AVX");
-        add_if(ggml_cpu_has_avx_vnni(),    "AVX_VNNI");
-        add_if(ggml_cpu_has_avx2(),        "AVX2");
-        add_if(ggml_cpu_has_f16c(),        "F16C");
-        add_if(ggml_cpu_has_fma(),         "FMA");
-        add_if(ggml_cpu_has_bmi2(),        "BMI2");
-        add_if(ggml_cpu_has_avx512(),      "AVX512");
-        add_if(ggml_cpu_has_avx512_vbmi(), "AVX512_VBMI");
-        add_if(ggml_cpu_has_avx512_vnni(), "AVX512_VNNI");
-        add_if(ggml_cpu_has_avx512_bf16(), "AVX512_BF16");
-        add_if(ggml_cpu_has_amx_int8(),    "AMX_INT8");
-        add_if(ggml_cpu_has_neon(),        "NEON");
-        add_if(ggml_cpu_has_arm_fma(),     "ARM_FMA");
-        add_if(ggml_cpu_has_fp16_va(),     "FP16_VA");
-        add_if(ggml_cpu_has_matmul_int8(), "MATMUL_INT8");
-        add_if(ggml_cpu_has_sve(),         "SVE");
-        add_if(ggml_cpu_has_dotprod(),     "DOTPROD");
-
-        // SVE_CNT carries a numeric value; the string is static so the stored c_str() stays valid
-        if (ggml_cpu_get_sve_cnt() > 0) {
-            static std::string sve_cnt = std::to_string(ggml_cpu_get_sve_cnt());
-            list.push_back({ "SVE_CNT", sve_cnt.c_str() });
-        }
-
-        add_if(ggml_cpu_has_sme(),         "SME");
-        add_if(ggml_cpu_has_riscv_v(),     "RISCV_V");
-        add_if(ggml_cpu_has_vsx(),         "VSX");
-        add_if(ggml_cpu_has_vxe(),         "VXE");
-        add_if(ggml_cpu_has_wasm_simd(),   "WASM_SIMD");
-        add_if(ggml_cpu_has_llamafile(),   "LLAMAFILE");
-
-        // compile-time options are reported unconditionally when enabled
-#ifdef GGML_USE_ACCELERATE
-        add_if(true, "ACCELERATE");
-#endif
-#ifdef GGML_USE_CPU_HBM
-        add_if(true, "CPU_HBM");
-#endif
-#ifdef GGML_USE_OPENMP
-        add_if(true, "OPENMP");
-#endif
-#ifdef GGML_USE_CPU_KLEIDIAI
-        add_if(true, "KLEIDIAI");
-#endif
-#ifdef GGML_USE_CPU_AARCH64
-        add_if(true, "AARCH64_REPACK");
-#endif
-
-        // end-of-list marker
-        list.push_back({ nullptr, nullptr });
-
-        return list;
-    }();
-
-    return features.data();
-}
-
-// Looks up an optional CPU backend entry point by its exported name.
-// Returns the function address as void *, or NULL when `name` is not known; `reg` is not consulted.
-static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
-    GGML_UNUSED(reg);
-
-    struct proc_entry {
-        const char * name;
-        void *       addr;
-    };
-
-    // the static_cast on the first two entries keeps the compile-time check that the
-    // function matches the typedef callers will cast the address back to
-    static const proc_entry procs[] = {
-        { "ggml_backend_set_n_threads",       (void *) static_cast<ggml_backend_set_n_threads_t>(ggml_backend_cpu_set_n_threads) },
-        { "ggml_backend_dev_get_extra_bufts", (void *) static_cast<ggml_backend_dev_get_extra_bufts_t>(ggml_backend_cpu_device_get_extra_buffers_type) },
-        { "ggml_backend_get_features",        (void *) ggml_backend_cpu_get_features },
-        { "ggml_backend_set_abort_callback",  (void *) ggml_backend_cpu_set_abort_callback },
-        { "ggml_backend_cpu_numa_init",       (void *) ggml_numa_init },
-        { "ggml_backend_cpu_is_numa",         (void *) ggml_is_numa },
-
-        // threadpool - TODO:  move to ggml-base
-        { "ggml_threadpool_new",              (void *) ggml_threadpool_new },
-        { "ggml_threadpool_free",             (void *) ggml_threadpool_free },
-        { "ggml_backend_cpu_set_threadpool",  (void *) ggml_backend_cpu_set_threadpool },
-    };
-
-    for (const proc_entry & entry : procs) {
-        if (strcmp(name, entry.name) == 0) {
-            return entry.addr;
-        }
-    }
-
-    return NULL;
-}
-
-// Registry interface table for the CPU backend. Entries are positional, so their order
-// must follow the member order of struct ggml_backend_reg_i (the comments name each slot).
-static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
-    /* .get_name         = */ ggml_backend_cpu_reg_get_name,
-    /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
-    /* .get_device       = */ ggml_backend_cpu_reg_get_device,
-    /* .get_proc_address = */ ggml_backend_cpu_get_proc_address,
-};
-
-// Returns the CPU backend registry object: a single function-local static
-// shared by every call. ggml_cpu_init() is invoked on each call before it is returned.
-ggml_backend_reg_t ggml_backend_cpu_reg(void) {
-    // init CPU feature detection
-    ggml_cpu_init();
-
-    static struct ggml_backend_reg cpu_reg = {
-        /* .api_version = */ GGML_BACKEND_API_VERSION,
-        /* .iface       = */ ggml_backend_cpu_reg_i,
-        /* .context     = */ NULL,
-    };
-
-    return &cpu_reg;
-}
-
-// NOTE(review): presumably emits the entry points used when this backend is built as a
-// dynamically loaded module, wired to ggml_backend_cpu_reg - confirm against the macro definition.
-GGML_BACKEND_DL_IMPL(ggml_backend_cpu_reg)
--- a/ggml/src/ggml-cpu/kleidiai/kernels.cpp
+++ b/ggml/src/ggml-cpu/kleidiai/kernels.cpp
@ -1,254 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
-// SPDX-License-Identifier: MIT
-//
-
-// KleidiAI micro-kernels
-#include "kai_matmul_clamp_f32_qsi8d32p_qsi4c32p_interface.h"
-#include "kai_lhs_quant_pack_qsi8d32p_f32.h"
-#include "kai_lhs_quant_pack_qsi8d32p_f32_neon.h"
-#include "kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h"
-#include "kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.h"
-#include "kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.h"
-#include "kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.h"
-#include "kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.h"
-#include "kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.h"
-#include "kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.h"
-#include "kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.h"
-#include "kai_common.h"
-
-#include "kernels.h"
-
-// Number of elements of a true array (not a pointer).
-// The expansion and the argument are parenthesized so the macro behaves as a single operand:
-// without the outer parentheses `n / NELEMS(a)` would expand to `n / sizeof(a) / sizeof(*a)`.
-#define NELEMS(x) (sizeof(x) / sizeof(*(x)))
-// Candidate kernel sets, in order of preference: ggml_kleidiai_select_kernels() returns the
-// first entry whose required_cpu bits are all present in the detected features, so entries
-// listed earlier win. Each entry is only compiled in when the matching __ARM_FEATURE_* macro
-// is defined. Every entry holds two kernel_info tables (one filled with the GEMM kernel, one
-// with the GEMV kernel), the LHS/RHS packing tables and the required CPU feature bits.
-static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
-#if defined(__ARM_FEATURE_SME)
-    {
-        /* SME GEMM */
-        /* .kern_info = */ {
-            /* .get_m_step            = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
-            /* .get_n_step            = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
-            /* .get_mr                = */ kai_get_mr_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
-            /* .get_nr                = */ kai_get_nr_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
-            /* .get_kr                = */ kai_get_kr_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
-            /* .get_sr                = */ kai_get_sr_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
-            /* .get_lhs_offset        = */ kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
-            /* .get_rhs_packed_offset = */ kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
-            /* .get_dst_offset        = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
-            /* .get_dst_size          = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
-            /* .run_kernel            = */ kai_run_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
-        },
-        /* SME GEMV */
-        /* .kern_info = */ {
-            /* .get_m_step            = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
-            /* .get_n_step            = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
-            /* .get_mr                = */ kai_get_mr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
-            /* .get_nr                = */ kai_get_nr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
-            /* .get_kr                = */ kai_get_kr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
-            /* .get_sr                = */ kai_get_sr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
-            /* .get_lhs_offset        = */ kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
-            /* .get_rhs_packed_offset = */ kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
-            /* .get_dst_offset        = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
-            /* .get_dst_size          = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
-            /* .run_kernel            = */ kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
-        },
-        /* .lhs_info = */ {
-            /* .get_offset            = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon,
-            /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32_neon,
-            /* .packed_size           = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon,
-            /* .pack_func             = */ kai_run_lhs_quant_pack_qsi8d32p_f32_neon,
-        },
-        /* .rhs_info = */ {
-            /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
-            /* .pack_func   = */ kai_run_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
-        },
-        /* .required_cpu       = */ CPU_FEATURE_SME,
-    },
-#endif
-// On Apple builds the DOTPROD entry is listed before the i8mm entry, so it is selected
-// whenever DOTPROD is available; on other platforms the i8mm entry comes first.
-#if defined(__APPLE__)
-#if defined(__ARM_FEATURE_DOTPROD)
-    {
-        /* DOTPROD GEMM */
-        /* .kern_info = */ {
-            /* .get_m_step            = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
-            /* .get_n_step            = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
-            /* .get_mr                = */ kai_get_mr_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
-            /* .get_nr                = */ kai_get_nr_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
-            /* .get_kr                = */ kai_get_kr_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
-            /* .get_sr                = */ kai_get_sr_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
-            /* .get_lhs_offset        = */ kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
-            /* .get_rhs_packed_offset = */ kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
-            /* .get_dst_offset        = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
-            /* .get_dst_size          = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
-            /* .run_kernel            = */ kai_run_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
-        },
-        /* DOTPROD GEMV */
-        /* .kern_info = */ {
-            /* .get_m_step            = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
-            /* .get_n_step            = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
-            /* .get_mr                = */ kai_get_mr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
-            /* .get_nr                = */ kai_get_nr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
-            /* .get_kr                = */ kai_get_kr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
-            /* .get_sr                = */ kai_get_sr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
-            /* .get_lhs_offset        = */ kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
-            /* .get_rhs_packed_offset = */ kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
-            /* .get_dst_offset        = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
-            /* .get_dst_size          = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
-            /* .run_kernel            = */ kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
-        },
-        /* .lhs_info = */ {
-            /* .get_offset            = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
-            /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
-            /* .packed_size           = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
-            /* .pack_func             = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
-        },
-        /* .rhs_info = */ {
-            /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
-            /* .pack_func   = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
-        },
-        /* .required_cpu       = */ CPU_FEATURE_DOTPROD,
-    },
-#endif
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    {
-        /* i8mm GEMM */
-        /* .kern_info = */ {
-            /* .get_m_step            = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
-            /* .get_n_step            = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
-            /* .get_mr                = */ kai_get_mr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
-            /* .get_nr                = */ kai_get_nr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
-            /* .get_kr                = */ kai_get_kr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
-            /* .get_sr                = */ kai_get_sr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
-            /* .get_lhs_offset        = */ kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
-            /* .get_rhs_packed_offset = */ kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
-            /* .get_dst_offset        = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
-            /* .get_dst_size          = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
-            /* .run_kernel            = */ kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
-        },
-        /* i8mm GEMV */
-        /* .kern_info = */ {
-            /* .get_m_step            = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
-            /* .get_n_step            = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
-            /* .get_mr                = */ kai_get_mr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
-            /* .get_nr                = */ kai_get_nr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
-            /* .get_kr                = */ kai_get_kr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
-            /* .get_sr                = */ kai_get_sr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
-            /* .get_lhs_offset        = */ kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
-            /* .get_rhs_packed_offset = */ kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
-            /* .get_dst_offset        = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
-            /* .get_dst_size          = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
-            /* .run_kernel            = */ kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
-        },
-        /* .lhs_info = */ {
-            /* .get_offset            = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
-            /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
-            /* .packed_size           = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
-            /* .pack_func             = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
-        },
-        /* .rhs_info = */ {
-            /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
-            /* .pack_func   = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
-        },
-        /* .required_cpu       = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
-    },
-#endif
-#else
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    {
-        /* i8mm GEMM */
-        /* .kern_info = */ {
-            /* .get_m_step            = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
-            /* .get_n_step            = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
-            /* .get_mr                = */ kai_get_mr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
-            /* .get_nr                = */ kai_get_nr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
-            /* .get_kr                = */ kai_get_kr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
-            /* .get_sr                = */ kai_get_sr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
-            /* .get_lhs_offset        = */ kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
-            /* .get_rhs_packed_offset = */ kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
-            /* .get_dst_offset        = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
-            /* .get_dst_size          = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
-            /* .run_kernel            = */ kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
-        },
-        /* i8mm GEMV */
-        /* .kern_info = */ {
-            /* .get_m_step            = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
-            /* .get_n_step            = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
-            /* .get_mr                = */ kai_get_mr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
-            /* .get_nr                = */ kai_get_nr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
-            /* .get_kr                = */ kai_get_kr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
-            /* .get_sr                = */ kai_get_sr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
-            /* .get_lhs_offset        = */ kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
-            /* .get_rhs_packed_offset = */ kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
-            /* .get_dst_offset        = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
-            /* .get_dst_size          = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
-            /* .run_kernel            = */ kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
-        },
-        /* .lhs_info = */ {
-            /* .get_offset            = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
-            /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
-            /* .packed_size           = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
-            /* .pack_func             = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
-        },
-        /* .rhs_info = */ {
-            /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
-            /* .pack_func   = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
-        },
-        /* .required_cpu       = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
-    },
-#endif
-#if defined(__ARM_FEATURE_DOTPROD)
-    {
-        /* DOTPROD GEMM */
-        /* .kern_info = */ {
-            /* .get_m_step            = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
-            /* .get_n_step            = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
-            /* .get_mr                = */ kai_get_mr_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
-            /* .get_nr                = */ kai_get_nr_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
-            /* .get_kr                = */ kai_get_kr_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
-            /* .get_sr                = */ kai_get_sr_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
-            /* .get_lhs_offset        = */ kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
-            /* .get_rhs_packed_offset = */ kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
-            /* .get_dst_offset        = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
-            /* .get_dst_size          = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
-            /* .run_kernel            = */ kai_run_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
-        },
-        /* DOTPROD GEMV */
-        /* .kern_info = */ {
-            /* .get_m_step            = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
-            /* .get_n_step            = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
-            /* .get_mr                = */ kai_get_mr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
-            /* .get_nr                = */ kai_get_nr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
-            /* .get_kr                = */ kai_get_kr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
-            /* .get_sr                = */ kai_get_sr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
-            /* .get_lhs_offset        = */ kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
-            /* .get_rhs_packed_offset = */ kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
-            /* .get_dst_offset        = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
-            /* .get_dst_size          = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
-            /* .run_kernel            = */ kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
-        },
-        /* .lhs_info = */ {
-            /* .get_offset            = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
-            /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
-            /* .packed_size           = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
-            /* .pack_func             = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
-        },
-        /* .rhs_info = */ {
-            /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
-            /* .pack_func   = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
-        },
-        /* .required_cpu       = */ CPU_FEATURE_DOTPROD,
-    },
-#endif
-#endif
-};
-
-// Returns the first entry of gemm_gemv_kernels whose required_cpu bits are all set in
-// `features`, or nullptr when no entry qualifies.
-ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature features) {
-    const size_t n_candidates = NELEMS(gemm_gemv_kernels);
-
-    for (size_t idx = 0; idx < n_candidates; ++idx) {
-        ggml_kleidiai_kernels & candidate = gemm_gemv_kernels[idx];
-
-        // skip candidates that need a feature bit the CPU does not provide
-        if ((features & candidate.required_cpu) != candidate.required_cpu) {
-            continue;
-        }
-        return &candidate;
-    }
-
-    return nullptr;
-}
--- a/ggml/src/ggml-cpu/kleidiai/kernels.h
+++ b/ggml/src/ggml-cpu/kleidiai/kernels.h
@ -1,60 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
-// SPDX-License-Identifier: MIT
-//
-
-#pragma once
-
-// Bit flags describing CPU capabilities; values can be combined with | and |=.
-enum cpu_feature {
-    CPU_FEATURE_NONE    = 0,
-    CPU_FEATURE_DOTPROD = 1 << 0,
-    CPU_FEATURE_I8MM    = 1 << 1,
-    CPU_FEATURE_SVE     = 1 << 2,
-    CPU_FEATURE_SME     = 1 << 3
-};
-
-// Union of two feature sets.
-inline cpu_feature operator|(cpu_feature lhs, cpu_feature rhs) {
-    const int merged = static_cast<int>(lhs) | static_cast<int>(rhs);
-    return static_cast<cpu_feature>(merged);
-}
-
-// In-place union: adds the bits of rhs to lhs and returns lhs.
-inline cpu_feature& operator|=(cpu_feature& lhs, cpu_feature rhs) {
-    lhs = lhs | rhs;
-    return lhs;
-}
-
-
-// Function-pointer table for one matmul micro-kernel variant.
-// The kernel table in kernels.cpp fills these members positionally with kai_* functions,
-// so member order and signatures must stay in sync with that table.
-struct kernel_info {
-    // Step/blocking getters. kleidiai.cpp rounds each thread's share of n up to get_n_step(),
-    // and forwards get_mr()/get_nr()/get_kr()/get_sr() to the LHS/RHS packing functions.
-    size_t (*get_m_step)(void);
-    size_t (*get_n_step)(void);
-    size_t (*get_mr)(void);
-    size_t (*get_nr)(void);
-    size_t (*get_kr)(void);
-    size_t (*get_sr)(void);
-    // Offset getters. Callers in kleidiai.cpp pass QK4_0 as `bl` and add the rhs/dst results
-    // to byte pointers.
-    size_t (*get_lhs_offset)(size_t m_idx, size_t k, size_t bl);
-    size_t (*get_rhs_packed_offset)(size_t n_idx, size_t k, size_t bl);
-    size_t (*get_dst_offset)(size_t m_idx, size_t n_idx, size_t stride);
-    size_t (*get_dst_size)(size_t m, size_t n);
-    // Runs the matmul on packed operands. kleidiai.cpp passes dst->nb[1] and sizeof(float) as the
-    // destination strides and -FLT_MAX / FLT_MAX as scalar_min / scalar_max.
-    void (*run_kernel)(size_t m, size_t n, size_t k, size_t bl, const void* lhs_packed, const void* rhs_packed,
-                         float* dst, size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max);
-};
-
-// Function-pointer table for packing the LHS operand (src1 in kleidiai.cpp).
-// Filled positionally by the kernel table in kernels.cpp.
-struct lhs_packing_info {
-    // Offset into the unpacked LHS for row m_idx; the caller adds it to a byte pointer.
-    size_t (*get_offset)(size_t m_idx, size_t lhs_stride);
-    // Offset into the packed LHS buffer for row m_idx; the caller adds it to a byte pointer.
-    size_t (*get_packed_offset)(size_t m_idx, size_t k, size_t bl, size_t mr, size_t kr, size_t sr);
-    // Size of the packed LHS; kleidiai.cpp reports it as the op's work-buffer size.
-    size_t (*packed_size)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr);
-    // Packs m rows of float data from lhs into lhs_packed.
-    void (*pack_func)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs,
-                      size_t lhs_stride, void* lhs_packed);
-};
-
-// Function-pointer table for packing the RHS operand (the weights handed to repack() in kleidiai.cpp).
-// Filled positionally by the kernel table in kernels.cpp.
-struct rhs_packing_info {
-    // Size of the packed RHS; repack() checks it against the incoming data size in debug builds.
-    size_t (*packed_size)(size_t n, size_t k, size_t nr, size_t kr, size_t bl);
-    // Packs rhs into rhs_packed. repack() calls it with num_groups = 1, bias = NULL and extra_bytes = 0.
-    void (*pack_func)(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t bl, const uint8_t* rhs,
-                      const float* bias, void* rhs_packed, size_t extra_bytes, const struct kai_rhs_pack_qs4cxs1s0_param* params);
-};
-
-// One selectable kernel set: matmul kernels, operand packers, and the CPU features they need.
-struct ggml_kleidiai_kernels {
-    // kleidiai.cpp uses gemv when src1->ne[1] == 1 and gemm otherwise.
-    kernel_info gemm;
-    kernel_info gemv;
-    lhs_packing_info lhs_info;
-    rhs_packing_info rhs_info;
-
-    // Feature bits that must all be present for ggml_kleidiai_select_kernels() to pick this entry.
-    cpu_feature required_cpu;
-};
-
-ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features);
--- a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp
+++ b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp
@ -1,287 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
-// SPDX-License-Identifier: MIT
-//
-#include <arm_neon.h>
-#include <assert.h>
-#include <cfloat>
-#include <stdint.h>
-#include <string.h>
-#if defined(__linux__)
-#include <asm/hwcap.h>
-#include <sys/auxv.h>
-#elif defined(__APPLE__)
-#include <string_view>
-#include <sys/sysctl.h>
-#include <sys/types.h>
-#elif defined(_WIN32)
-#include <windows.h>
-#include <excpt.h>
-#endif
-
-#include "kleidiai.h"
-
-#include "ggml-cpu.h"
-#include "ggml-impl.h"
-#include "ggml-backend-impl.h"
-#include "ggml-threading.h"
-#include "ggml-cpu-traits.h"
-
-#include "kernels.h"
-
-#include "kai_common.h"
-
-#define GGML_COMMON_DECL_CPP
-#include "ggml-common.h"
-
-// File-local KleidiAI state: the kernel set chosen by init_kleidiai_context(),
-// NULL until a selection has been made (or when no kernel set matches the CPU).
-struct ggml_kleidiai_context {
-    ggml_kleidiai_kernels * kernels;
-};
-
-static ggml_kleidiai_context ctx = { NULL };
-
-// One-time selection of the KleidiAI kernel set, performed inside the ggml critical section.
-// DOTPROD, I8MM and SVE are taken from the ggml_cpu_has_* queries; SME is only considered when
-// the GGML_KLEIDIAI_SME environment variable is set and atoi() of it is non-zero.
-static void init_kleidiai_context(void) {
-    ggml_critical_section_start();
-
-    static bool initialized = false;
-    if (initialized) {
-        // Selection already happened on an earlier call.
-        ggml_critical_section_end();
-        return;
-    }
-    initialized = true;
-
-    cpu_feature detected = CPU_FEATURE_NONE;
-    if (ggml_cpu_has_dotprod()) {
-        detected |= CPU_FEATURE_DOTPROD;
-    }
-    if (ggml_cpu_has_matmul_int8()) {
-        detected |= CPU_FEATURE_I8MM;
-    }
-    if (ggml_cpu_has_sve()) {
-        detected |= CPU_FEATURE_SVE;
-    }
-
-    const char * sme_env       = getenv("GGML_KLEIDIAI_SME");
-    const bool   sme_requested = (sme_env != nullptr) && (atoi(sme_env) != 0);
-    if (sme_requested && ggml_cpu_has_sme()) {
-        detected |= CPU_FEATURE_SME;
-    }
-
-    ctx.kernels = ggml_kleidiai_select_kernels(detected);
-
-    ggml_critical_section_end();
-}
-
-// Bounds-checked read of tensor->ne[dim]; aborts via GGML_ASSERT when dim is out of range.
-static inline int64_t ggml_ne(const ggml_tensor * tensor, int dim) {
-    GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
-
-    const int64_t extent = tensor->ne[dim];
-    return extent;
-}
-
-namespace ggml::cpu::kleidiai {
-class tensor_traits : public ggml::cpu::tensor_traits {
-    // Reports, through `size`, the packed-LHS size for `op` as computed by lhs_info.packed_size.
-    // Requires a kernel set to have been selected; always returns true.
-    bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
-        GGML_ASSERT(ctx.kernels);
-
-        const size_t m = op->src[1]->ne[1];
-        const size_t k = op->src[0]->ne[0];
-
-        // src[1] with ne[1] == 1 selects the GEMV kernel, anything else the GEMM kernel.
-        kernel_info & chosen = (op->src[1]->ne[1] == 1) ? ctx.kernels->gemv : ctx.kernels->gemm;
-
-        const size_t mr = chosen.get_mr();
-        const size_t kr = chosen.get_kr();
-        const size_t sr = chosen.get_sr();
-
-        size = ctx.kernels->lhs_info.packed_size(m, k, QK4_0, mr, kr, sr);
-        return true;
-    }
-
-    // Executes GGML_OP_MUL_MAT with the selected KleidiAI kernels.
-    //
-    // Phase 1: each thread packs its slice of src1 rows into the shared work buffer (params->wdata)
-    //          through lhs_info->pack_func.
-    // Phase 2: after a barrier, each thread runs the kernel over all m rows of the packed LHS and
-    //          its own slice of the n dimension of src0.
-    //
-    // Returns true when the op was handled, false for any op other than GGML_OP_MUL_MAT.
-    bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * dst) override {
-        if (dst->op == GGML_OP_MUL_MAT) {
-            const ggml_tensor * src0 = dst->src[0];
-            const ggml_tensor * src1 = dst->src[1];
-
-            // Provides the ne00 / ne01 / ne11 locals used below.
-            GGML_TENSOR_BINARY_OP_LOCALS
-
-            GGML_ASSERT(ctx.kernels);
-            kernel_info * kernel = src1->ne[1] == 1 ? &ctx.kernels->gemv : &ctx.kernels->gemm;
-            lhs_packing_info * lhs_info = &ctx.kernels->lhs_info;
-
-            GGML_ASSERT(kernel);
-
-            const int ith = params->ith;
-            const int nth = params->nth;
-
-            const size_t k = ne00;
-            const size_t m = ne11;
-            const size_t n = ne01;
-
-            // Split the n dimension across threads in multiples of the kernel's n step.
-            const size_t n_step = kernel->get_n_step();
-            const size_t num_n_per_thread = kai_roundup(kai_roundup(n, nth) / nth, n_step);
-            const size_t n_start = ith * num_n_per_thread;
-
-            // Rounding the per-thread share up can push n_start to or past n for trailing threads.
-            // Those threads get no work; computing `n - n_start` for them would wrap around (size_t)
-            // and hand the kernel an enormous n.
-            size_t n_to_process = 0;
-            if (n_start < n) {
-                n_to_process = num_n_per_thread;
-                if ((n_start + n_to_process) > n) {
-                    n_to_process = n - n_start;
-                }
-            }
-
-            const uint8_t * lhs        = static_cast<const uint8_t *>(src1->data);
-            uint8_t * lhs_packed       = (uint8_t*)params->wdata;
-            const uint8_t * rhs_packed = static_cast<const uint8_t *>(src0->data);
-
-            size_t mr = kernel->get_mr();
-            size_t kr = kernel->get_kr();
-            size_t sr = kernel->get_sr();
-
-            // Calculate number of LHS rows to be packed per thread (a multiple of mr)
-            const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth;
-            const size_t m_start = ith * num_m_per_thread;
-
-            if (m_start < m) {
-                // Clamp the last slice; only computed when m_start < m so the subtraction cannot wrap.
-                size_t m_to_process = num_m_per_thread;
-                if ((m_start + m_to_process) > m) {
-                    m_to_process = m - m_start;
-                }
-
-                // Transform LHS
-                const size_t src_stride        = src1->nb[1];
-                const float * src_ptr          = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, src_stride));
-                const size_t lhs_packed_offset = lhs_info->get_packed_offset(m_start, k, QK4_0, mr, kr, sr);
-                void * lhs_packed_ptr          = static_cast<void *>(lhs_packed + lhs_packed_offset);
-
-                lhs_info->pack_func(m_to_process, k, QK4_0, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr);
-            }
-
-            // Every thread must reach the barrier, including those without packing or kernel work.
-            ggml_barrier(params->threadpool);
-
-            if (n_to_process > 0) {
-                // Perform the operation
-                const size_t dst_stride        = dst->nb[1];
-                const size_t lhs_packed_offset = lhs_info->get_packed_offset(0, k, QK4_0, mr, kr, sr);
-                const size_t rhs_packed_offset = kernel->get_rhs_packed_offset(n_start, k, QK4_0);
-                const size_t dst_offset        = kernel->get_dst_offset(0, n_start, dst_stride);
-                const void * rhs_ptr           = static_cast<const void *>(rhs_packed + rhs_packed_offset);
-                const void* lhs_ptr            = (const void*)((const char *)lhs_packed + lhs_packed_offset);
-                float *dst_ptr                 = reinterpret_cast<float *>(static_cast<uint8_t *>(dst->data) + dst_offset);
-
-                // -FLT_MAX / FLT_MAX are passed as the clamp bounds (scalar_min / scalar_max).
-                kernel->run_kernel(m, n_to_process, k, QK4_0, lhs_ptr, rhs_ptr, dst_ptr,
-                                   dst_stride, sizeof(float), -FLT_MAX, FLT_MAX);
-            }
-            return true;
-        }
-        return false;
-    }
-
-public:
-    // Packs the weights in `data` into tensor->data through rhs_info.pack_func, using the GEMM
-    // kernel's nr/kr/sr values. `data_size` is only checked in debug builds. Always returns 0.
-    int repack(struct ggml_tensor * tensor, const void * data, size_t data_size) {
-        GGML_UNUSED(data_size);
-        GGML_ASSERT(ctx.kernels);
-
-        kernel_info &      gemm_kernel = ctx.kernels->gemm;
-        rhs_packing_info & rhs_packer  = ctx.kernels->rhs_info;
-
-        const size_t n  = tensor->ne[1];
-        const size_t k  = tensor->ne[0];
-        const size_t nr = gemm_kernel.get_nr();
-        const size_t kr = gemm_kernel.get_kr();
-        const size_t sr = gemm_kernel.get_sr();
-
-#ifndef NDEBUG
-        const size_t repacked_size = rhs_packer.packed_size(n, k, nr, kr, QK4_0);
-        GGML_ASSERT(repacked_size <= data_size && "repacked size larger than the packed size!");
-#endif
-
-        struct kai_rhs_pack_qs4cxs1s0_param pack_params;
-        pack_params.lhs_zero_point = 1;
-        pack_params.rhs_zero_point = 8;
-
-        const uint8_t * raw_weights = (const uint8_t *)data;
-        rhs_packer.pack_func(1, n, k, nr, kr, sr, QK4_0, raw_weights, NULL, tensor->data, 0, &pack_params);
-
-        return 0;
-    }
-};
-
-// Returns the single function-local static traits object; both arguments are ignored.
-static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struct ggml_tensor *) {
-    static tensor_traits shared_traits;
-
-    ggml::cpu::tensor_traits * as_base = &shared_traits;
-    return as_base;
-}
-}  // namespace ggml::cpu::kleidiai
-
-// Buffer init_tensor hook: stores the KleidiAI tensor traits pointer in tensor->extra.
-GGML_API enum ggml_status ggml_backend_cpu_kleidiai_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
-    GGML_UNUSED(buffer);
-
-    ggml::cpu::tensor_traits * traits = ggml::cpu::kleidiai::get_tensor_traits(buffer, tensor);
-    tensor->extra = (void *) traits;
-
-    return GGML_STATUS_SUCCESS;
-}
-
-// Buffer set_tensor hook: only whole-tensor writes are accepted (offset 0, size == ggml_nbytes);
-// the incoming data is repacked into the tensor through the traits stored in tensor->extra.
-static void ggml_backend_cpu_kleidiai_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
-                                                       const void * data, size_t offset, size_t size) {
-    GGML_UNUSED(buffer);
-
-    GGML_ASSERT(offset == 0);
-    GGML_ASSERT(size == ggml_nbytes(tensor));
-
-    auto * traits = (ggml::cpu::kleidiai::tensor_traits *) tensor->extra;
-    const int OK  = traits->repack(tensor, data, size);
-
-    GGML_ASSERT(OK == 0);
-}
-
-// Buffer-type name hook; the argument is ignored.
-static const char * ggml_backend_cpu_kleidiai_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    GGML_UNUSED(buft);
-
-    return "CPU_KLEIDIAI";
-}
-
-// Allocates a buffer through the plain CPU buffer type, then retargets it to the KleidiAI
-// buffer type: init_tensor/set_tensor are replaced and get_tensor/cpy_tensor are cleared.
-// Returns nullptr when the underlying allocation fails.
-static ggml_backend_buffer_t ggml_backend_cpu_kleidiai_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    ggml_backend_buffer_t allocated = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
-    if (!allocated) {
-        return nullptr;
-    }
-
-    allocated->buft = buft;
-
-    auto & hooks      = allocated->iface;
-    hooks.init_tensor = ggml_backend_cpu_kleidiai_buffer_init_tensor;
-    hooks.set_tensor  = ggml_backend_cpu_kleidiai_buffer_set_tensor;
-    hooks.get_tensor  = nullptr;
-    hooks.cpy_tensor  = nullptr;
-
-    return allocated;
-}
-
-// Buffer-type alignment hook: always TENSOR_ALIGNMENT; the argument is ignored.
-static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    GGML_UNUSED(buft);
-
-    return TENSOR_ALIGNMENT;
-}
-
-namespace ggml::cpu::kleidiai {
-class extra_buffer_type : ggml::cpu::extra_buffer_type {
-    // Accepts only GGML_OP_MUL_MAT where src[0] is a 2-D Q4_0 tensor living in a KleidiAI buffer,
-    // a kernel set is available, and src[1] is F32 with ne[2] == ne[3] == 1 and, if it has a
-    // buffer, that buffer is a host buffer.
-    bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
-        if (op->op != GGML_OP_MUL_MAT) {
-            return false;
-        }
-
-        const struct ggml_tensor * weights     = op->src[0];
-        const struct ggml_tensor * activations = op->src[1];
-
-        if (weights->type != GGML_TYPE_Q4_0 || !weights->buffer || ggml_n_dims(weights) != 2) {
-            return false;
-        }
-        // The buffer-type getter runs init_kleidiai_context(), so it is called before ctx.kernels is read.
-        if (weights->buffer->buft != ggml_backend_cpu_kleidiai_buffer_type()) {
-            return false;
-        }
-        if (!ctx.kernels) {
-            return false;
-        }
-
-        if (activations->buffer && !ggml_backend_buft_is_host(activations->buffer->buft)) {
-            return false;
-        }
-
-        return activations->type == GGML_TYPE_F32 &&
-               ggml_ne(activations, 2) == 1 && ggml_ne(activations, 3) == 1;
-    }
-
-    // For GGML_OP_MUL_MAT whose src[0] lives in a KleidiAI buffer, returns the traits stored in
-    // src[0]->extra; returns nullptr for every other op.
-    ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
-        if (op->op != GGML_OP_MUL_MAT) {
-            return nullptr;
-        }
-
-        const struct ggml_tensor * weights = op->src[0];
-        const bool in_kleidiai_buffer =
-            weights->buffer && weights->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type();
-
-        return in_kleidiai_buffer ? (ggml::cpu::tensor_traits *) weights->extra : nullptr;
-    }
-};
-}  // namespace ggml::cpu::kleidiai
-
-// Returns the process-wide KleidiAI CPU buffer type. Each call also runs
-// init_kleidiai_context(), which selects the kernel set on its first invocation.
-ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(void) {
-    // Named differently from the file-level `ctx` so the two are not confused.
-    static ggml::cpu::kleidiai::extra_buffer_type extra_buft_ctx;
-
-    static struct ggml_backend_buffer_type kleidiai_buft = {
-        /* .iface    = */ {
-            /* .get_name         = */ ggml_backend_cpu_kleidiai_buffer_type_get_name,
-            /* .alloc_buffer     = */ ggml_backend_cpu_kleidiai_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_cpu_kleidiai_buffer_type_get_alignment,
-            /* .get_max_size     = */ nullptr,  // defaults to SIZE_MAX
-            /* .get_alloc_size   = */ nullptr,  // defaults to ggml_nbytes
-            /* .is_host          = */ nullptr,
-        },
-        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
-        /* .context = */ &extra_buft_ctx,
-    };
-
-    init_kleidiai_context();
-
-    return &kleidiai_buft;
-}
--- a/ggml/src/ggml-cpu/kleidiai/kleidiai.h
+++ b/ggml/src/ggml-cpu/kleidiai/kleidiai.h
@ -1,17 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
-// SPDX-License-Identifier: MIT
-//
-
-#pragma once
-
-#include "ggml-alloc.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(void);
-
-#ifdef  __cplusplus
-}
-#endif
--- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
--- a/ggml/src/ggml-cpu/llamafile/sgemm.h
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.h
@ -1,14 +0,0 @@
-#pragma once
-#include <stdint.h>
-#include <stdbool.h>
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t,
-                     const void *, int64_t, const void *, int64_t, void *, int64_t,
-                     int, int, int);
-
-#ifdef __cplusplus
-}
-#endif
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@ -1,128 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-
-//
-// cache line
-//
-
-// CACHE_LINE_SIZE: taken from the C++ library when it advertises
-// hardware_destructive_interference_size, otherwise a per-target constant
-// (128 with __POWER9_VECTOR__, 256 with __VXE__/__VXE2__, 64 for everything else).
-#if defined(__cpp_lib_hardware_interference_size)
-#  define CACHE_LINE_SIZE std::hardware_destructive_interference_size
-#elif defined(__POWER9_VECTOR__)
-#  define CACHE_LINE_SIZE 128
-#elif defined(__VXE__) || defined(__VXE2__)
-#  define CACHE_LINE_SIZE 256
-#else
-#  define CACHE_LINE_SIZE 64
-#endif
-
-// Number of floats that fit in one cache line.
-static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void ggml_compute_forward_dup(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_add(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_add1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_acc(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_sum(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_sum_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_mean(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_argmax(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_count_equal(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_repeat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_repeat_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_concat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_silu_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_rms_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_rms_norm_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_group_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_l2_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_out_prod(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_scale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_set(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_cpy(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_cont(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_reshape(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_view(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_permute(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_transpose(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_get_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_get_rows_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_diag(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_diag_mask_inf(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_diag_mask_zero(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_soft_max(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_soft_max_ext_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_rope(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_rope_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_leaky_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_flash_attn_ext(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * q,
-    const struct ggml_tensor * k,
-    const struct ggml_tensor * v,
-    const struct ggml_tensor * mask,
-    struct ggml_tensor * dst);
-void ggml_compute_forward_flash_attn_back(
-        const struct ggml_compute_params * params,
-        const bool masked,
-        struct ggml_tensor * dst);
-void ggml_compute_forward_ssm_conv(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_ssm_scan(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_win_part(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_win_unpart(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_unary(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_get_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_add_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_rwkv_wkv6(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_map_unary(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst,
-    const ggml_unary_op_f32_t fun);
-void ggml_compute_forward_map_binary(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst,
-    const ggml_binary_op_f32_t fun);
-void ggml_compute_forward_map_custom1_f32(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst,
-    const ggml_custom1_op_f32_t fun);
-void ggml_compute_forward_map_custom2_f32(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst,
-    const ggml_custom2_op_f32_t fun);
-void ggml_compute_forward_map_custom3_f32(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst,
-    const ggml_custom3_op_f32_t fun);
-void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_map_custom3(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-
-#ifdef __cplusplus
-}
-#endif
--- a/ggml/src/ggml-cpu/simd-mappings.h
+++ b/ggml/src/ggml-cpu/simd-mappings.h
@ -1,884 +0,0 @@
-#pragma once
-
-#include "ggml-cpu-impl.h"
-
-//
-// simd mappings
-//
-
-// we define a common set of C macros which map to specific intrinsics based on the current architecture
-// we then implement the fundamental computation operations below using only these macros
-// adding support for a new architecture requires defining the corresponding SIMD macros
-//
-// GGML_F32_STEP / GGML_F16_STEP
-//   number of elements to process in a single step
-//
-// GGML_F32_EPR / GGML_F16_EPR
-//   number of elements to fit in a single register
-//
-
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
-
-#define GGML_SIMD
-
-// F32 NEON
-
-#define GGML_F32_STEP 16
-#define GGML_F32_EPR  4
-
-#define GGML_F32x4              float32x4_t
-#define GGML_F32x4_ZERO         vdupq_n_f32(0.0f)
-#define GGML_F32x4_SET1(x)      vdupq_n_f32(x)
-#define GGML_F32x4_LOAD         vld1q_f32
-#define GGML_F32x4_STORE        vst1q_f32
-#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
-#define GGML_F32x4_ADD          vaddq_f32
-#define GGML_F32x4_MUL          vmulq_f32
-#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
-// Sums the GGML_F32_ARR float32x4_t accumulators in x[] pairwise, in place, then stores the
-// horizontal sum of x[0] in res. Three halving rounds are unrolled, so the reduction is
-// complete for up to 8 accumulators.
-// Wrapped in do { } while (0) so the macro expands to exactly one statement and stays safe
-// inside unbraced if/else, matching the other *_REDUCE macros in this file.
-#define GGML_F32x4_REDUCE(res, x)                       \
-do {                                                    \
-    int offset = GGML_F32_ARR >> 1;                     \
-    for (int i = 0; i < offset; ++i) {                  \
-        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);      \
-    }                                                   \
-    offset >>= 1;                                       \
-    for (int i = 0; i < offset; ++i) {                  \
-        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);      \
-    }                                                   \
-    offset >>= 1;                                       \
-    for (int i = 0; i < offset; ++i) {                  \
-        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);      \
-    }                                                   \
-    (res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]); \
-} while (0)
-
-#define GGML_F32_VEC        GGML_F32x4
-#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
-
-// F16 NEON
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-    #define GGML_F16_STEP 32
-    #define GGML_F16_EPR  8
-
-    #define GGML_F16x8              float16x8_t
-    #define GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
-    #define GGML_F16x8_SET1(x)      vdupq_n_f16(x)
-    #define GGML_F16x8_LOAD(x)      vld1q_f16((const ggml_fp16_internal_t *)(x))
-    #define GGML_F16x8_STORE        vst1q_f16
-    #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
-    #define GGML_F16x8_ADD          vaddq_f16
-    #define GGML_F16x8_MUL          vmulq_f16
-    // Sums the GGML_F16_ARR float16x8_t accumulators in x[] pairwise, in place (three unrolled
-    // halving rounds), then widens the low and high halves of x[0] to f32, adds them and stores
-    // the horizontal sum in res.
-    #define GGML_F16x8_REDUCE(res, x)                               \
-    do {                                                            \
-        int offset = GGML_F16_ARR >> 1;                             \
-        for (int i = 0; i < offset; ++i) {                          \
-            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
-        }                                                           \
-        offset >>= 1;                                               \
-        for (int i = 0; i < offset; ++i) {                          \
-            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
-        }                                                           \
-        offset >>= 1;                                               \
-        for (int i = 0; i < offset; ++i) {                          \
-            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
-        }                                                           \
-        const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
-        const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
-        (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));         \
-    } while (0)
-
-    #define GGML_F16_VEC                GGML_F16x8
-    #define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO
-    #define GGML_F16_VEC_SET1           GGML_F16x8_SET1
-    #define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p)
-    #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i])
-    #define GGML_F16_VEC_FMA            GGML_F16x8_FMA
-    #define GGML_F16_VEC_ADD            GGML_F16x8_ADD
-    #define GGML_F16_VEC_MUL            GGML_F16x8_MUL
-    #define GGML_F16_VEC_REDUCE         GGML_F16x8_REDUCE
-#else
-    // if FP16 vector arithmetic is not supported, we use FP32 instead
-    // and take advantage of the vcvt_ functions to convert to/from FP16
-
-    #define GGML_F16_STEP 16
-    #define GGML_F16_EPR  4
-
-    #define GGML_F32Cx4              float32x4_t
-    #define GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
-    #define GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
-    #define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x)))
-    #define GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
-    #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
-    #define GGML_F32Cx4_ADD          vaddq_f32
-    #define GGML_F32Cx4_MUL          vmulq_f32
-    #define GGML_F32Cx4_REDUCE       GGML_F32x4_REDUCE
-
-    #define GGML_F16_VEC                GGML_F32Cx4
-    #define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
-    #define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
-    #define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
-    // Stores element i of the register array r to p as FP16; r is parenthesized so that an
-    // expression argument indexes correctly, as in the FP16-arithmetic variant of this macro.
-    #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((ggml_fp16_internal_t *)(p), (r)[i])
-    #define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
-    #define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
-    #define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
-    #define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
-#endif
-
-#elif defined(__AVX512F__)
-
-#define GGML_SIMD
-
-// F32 AVX512
-
-#define GGML_F32_STEP 64
-#define GGML_F32_EPR  16
-
-#define GGML_F32x16         __m512
-#define GGML_F32x16_ZERO    _mm512_setzero_ps()
-#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
-#define GGML_F32x16_LOAD    _mm512_loadu_ps
-#define GGML_F32x16_STORE   _mm512_storeu_ps
-// _mm512_fmadd_ps is defined in AVX512F so no guard is required
-#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
-#define GGML_F32x16_ADD     _mm512_add_ps
-#define GGML_F32x16_MUL     _mm512_mul_ps
-// Sums the GGML_F32_ARR __m512 accumulators in x[] pairwise, in place (three unrolled halving
-// rounds), then stores the horizontal sum of x[0] in res.
-// The macro arguments are parenthesized so that expression arguments expand correctly.
-#define GGML_F32x16_REDUCE(res, x)                                    \
-do {                                                                  \
-    int offset = GGML_F32_ARR >> 1;                                   \
-    for (int i = 0; i < offset; ++i) {                                \
-        (x)[i] = _mm512_add_ps((x)[i], (x)[offset+i]);                \
-    }                                                                 \
-    offset >>= 1;                                                     \
-    for (int i = 0; i < offset; ++i) {                                \
-        (x)[i] = _mm512_add_ps((x)[i], (x)[offset+i]);                \
-    }                                                                 \
-    offset >>= 1;                                                     \
-    for (int i = 0; i < offset; ++i) {                                \
-        (x)[i] = _mm512_add_ps((x)[i], (x)[offset+i]);                \
-    }                                                                 \
-    (res) = (ggml_float) _mm512_reduce_add_ps((x)[0]);                \
-} while (0)
-
-// TODO: is this optimal ?
-
-#define GGML_F32_VEC        GGML_F32x16
-#define GGML_F32_VEC_ZERO   GGML_F32x16_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x16_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x16_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x16_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x16_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x16_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x16_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
-
-// F16 AVX512
-
-#define GGML_F16_STEP 64
-#define GGML_F16_EPR  16
-
-// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
-
-#define GGML_F32Cx16             __m512
-#define GGML_F32Cx16_ZERO        _mm512_setzero_ps()
-#define GGML_F32Cx16_SET1(x)     _mm512_set1_ps(x)
-
-// unlike  _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
-// so F16C guard isn't required
-#define GGML_F32Cx16_LOAD(x)     _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
-#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
-
-#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
-#define GGML_F32Cx16_ADD         _mm512_add_ps
-#define GGML_F32Cx16_MUL         _mm512_mul_ps
-#define GGML_F32Cx16_REDUCE(res, x)                               \
-do {                                                              \
-    int offset = GGML_F32_ARR >> 1;                               \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
-    }                                                             \
-    res = (ggml_float) _mm512_reduce_add_ps(x[0]);                \
-} while (0)
-
-#define GGML_F16_VEC                GGML_F32Cx16
-#define GGML_F16_VEC_ZERO           GGML_F32Cx16_ZERO
-#define GGML_F16_VEC_SET1           GGML_F32Cx16_SET1
-#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx16_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
-#define GGML_F16_VEC_FMA            GGML_F32Cx16_FMA
-#define GGML_F16_VEC_ADD            GGML_F32Cx16_ADD
-#define GGML_F16_VEC_MUL            GGML_F32Cx16_MUL
-
-#define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE
-#elif defined(__AVX__)
-
-#define GGML_SIMD
-
-// F32 AVX
-
-#define GGML_F32_STEP 32
-#define GGML_F32_EPR  8
-
-#define GGML_F32x8         __m256
-#define GGML_F32x8_ZERO    _mm256_setzero_ps()
-#define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
-#define GGML_F32x8_LOAD    _mm256_loadu_ps
-#define GGML_F32x8_STORE   _mm256_storeu_ps
-#if defined(__FMA__)
-    #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
-#else
-    #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
-#endif
-#define GGML_F32x8_ADD     _mm256_add_ps
-#define GGML_F32x8_MUL     _mm256_mul_ps
-#define GGML_F32x8_REDUCE(res, x)                                 \
-do {                                                              \
-    int offset = GGML_F32_ARR >> 1;                               \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
-    }                                                             \
-    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]),    \
-                                 _mm256_extractf128_ps(x[0], 1)); \
-    const __m128 t1 = _mm_hadd_ps(t0, t0);                        \
-    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1));        \
-} while (0)
-// TODO: is this optimal ?
-
-#define GGML_F32_VEC        GGML_F32x8
-#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
-
-// F16 AVX
-
-#define GGML_F16_STEP 32
-#define GGML_F16_EPR  8
-
-// F16 arithmetic is not supported by AVX, so we use F32 instead
-
-#define GGML_F32Cx8             __m256
-#define GGML_F32Cx8_ZERO        _mm256_setzero_ps()
-#define GGML_F32Cx8_SET1(x)     _mm256_set1_ps(x)
-
-#if defined(__F16C__)
-// the  _mm256_cvt intrinsics require F16C
-#define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
-#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
-#else
-// Widen eight consecutive fp16 values starting at x to fp32 with the scalar
-// converter and return them as one 256-bit vector.
-static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
-    float widened[8];
-    int   lane = 0;
-
-    while (lane < 8) {
-        widened[lane] = GGML_FP16_TO_FP32(x[lane]);
-        ++lane;
-    }
-
-    return _mm256_loadu_ps(widened);
-}
-// Spill the eight fp32 lanes of y into a scratch buffer, then narrow each one
-// to fp16 with the scalar converter and write it to x[0..7].
-static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
-    float lanes[8];
-    _mm256_storeu_ps(lanes, y);
-
-    for (int k = 0; k != 8; ++k) {
-        x[k] = GGML_FP32_TO_FP16(lanes[k]);
-    }
-}
-#define GGML_F32Cx8_LOAD(x)     __avx_f32cx8_load(x)
-#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
-#endif
-
-#define GGML_F32Cx8_FMA         GGML_F32x8_FMA
-#define GGML_F32Cx8_ADD         _mm256_add_ps
-#define GGML_F32Cx8_MUL         _mm256_mul_ps
-#define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE
-
-#define GGML_F16_VEC                GGML_F32Cx8
-#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
-#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
-#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
-#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
-#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
-#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
-#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE
-
-#elif defined(__POWER9_VECTOR__)
-
-#define GGML_SIMD
-
-// F32 POWER9
-
-#define GGML_F32_STEP 32
-#define GGML_F32_EPR  4
-
-#define GGML_F32x4              vector float
-#define GGML_F32x4_ZERO         0.0f
-#define GGML_F32x4_SET1         vec_splats
-#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
-#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
-#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
-#define GGML_F32x4_ADD          vec_add
-#define GGML_F32x4_MUL          vec_mul
-#define GGML_F32x4_REDUCE(res, x)              \
-{                                              \
-    int offset = GGML_F32_ARR >> 1;            \
-    for (int i = 0; i < offset; ++i) {         \
-        x[i] = vec_add(x[i], x[offset+i]);     \
-    }                                          \
-    offset >>= 1;                              \
-    for (int i = 0; i < offset; ++i) {         \
-        x[i] = vec_add(x[i], x[offset+i]);     \
-    }                                          \
-    offset >>= 1;                              \
-    for (int i = 0; i < offset; ++i) {         \
-        x[i] = vec_add(x[i], x[offset+i]);     \
-    }                                          \
-    res = vec_extract(x[0], 0) +               \
-          vec_extract(x[0], 1) +               \
-          vec_extract(x[0], 2) +               \
-          vec_extract(x[0], 3);                \
-}
-
-#define GGML_F32_VEC        GGML_F32x4
-#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
-
-// F16 POWER9
-#define GGML_F16_STEP       GGML_F32_STEP
-#define GGML_F16_EPR        GGML_F32_EPR
-#define GGML_F16_VEC        GGML_F32x4
-#define GGML_F16_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F16_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F16_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F16_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F16_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
-// Use vec_xl, not vec_ld, in case the load address is not aligned.
-#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ?                   \
-  vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
-  vec_extract_fp32_from_shortl(vec_xl(0, p))
-#define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i]
-#define GGML_F16_VEC_STORE(p, r, i)                             \
-  if (i & 0x1)                                                  \
-    vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)],  \
-                                   r[i - GGML_ENDIAN_BYTE(0)]), \
-            0, p - GGML_F16_EPR)
-
-#elif defined(__wasm_simd128__)
-
-#define GGML_SIMD
-
-// F32 WASM
-
-#define GGML_F32_STEP 16
-#define GGML_F32_EPR  4
-
-#define GGML_F32x4              v128_t
-#define GGML_F32x4_ZERO         wasm_f32x4_splat(0.0f)
-#define GGML_F32x4_SET1(x)      wasm_f32x4_splat(x)
-#define GGML_F32x4_LOAD         wasm_v128_load
-#define GGML_F32x4_STORE        wasm_v128_store
-#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
-#define GGML_F32x4_ADD          wasm_f32x4_add
-#define GGML_F32x4_MUL          wasm_f32x4_mul
-#define GGML_F32x4_REDUCE(res, x)                  \
-{                                                  \
-    int offset = GGML_F32_ARR >> 1;                \
-    for (int i = 0; i < offset; ++i) {             \
-        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
-    }                                              \
-    offset >>= 1;                                  \
-    for (int i = 0; i < offset; ++i) {             \
-        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
-    }                                              \
-    offset >>= 1;                                  \
-    for (int i = 0; i < offset; ++i) {             \
-        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
-    }                                              \
-    res = wasm_f32x4_extract_lane(x[0], 0) +       \
-          wasm_f32x4_extract_lane(x[0], 1) +       \
-          wasm_f32x4_extract_lane(x[0], 2) +       \
-          wasm_f32x4_extract_lane(x[0], 3);        \
-}
-
-#define GGML_F32_VEC        GGML_F32x4
-#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
-
-// F16 WASM
-
-#define GGML_F16_STEP 16
-#define GGML_F16_EPR  4
-
-// Convert four consecutive fp16 values starting at p to fp32, one at a time,
-// and load the results as a single 128-bit WASM vector.
-inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
-    float f32[4];
-
-    for (int lane = 0; lane < 4; ++lane) {
-        f32[lane] = GGML_FP16_TO_FP32(p[lane]);
-    }
-
-    return wasm_v128_load(f32);
-}
-
-// Store the four fp32 lanes of x to a scratch buffer and write each one to
-// p[0..3] after converting it to fp16.
-inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
-    float lanes[4];
-    wasm_v128_store(lanes, x);
-
-    for (int k = 0; k < 4; ++k) {
-        p[k] = GGML_FP32_TO_FP16(lanes[k]);
-    }
-}
-
-#define GGML_F16x4             v128_t
-#define GGML_F16x4_ZERO        wasm_f32x4_splat(0.0f)
-#define GGML_F16x4_SET1(x)     wasm_f32x4_splat(x)
-#define GGML_F16x4_LOAD(x)     __wasm_f16x4_load(x)
-#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
-#define GGML_F16x4_FMA         GGML_F32x4_FMA
-#define GGML_F16x4_ADD         wasm_f32x4_add
-#define GGML_F16x4_MUL         wasm_f32x4_mul
-#define GGML_F16x4_REDUCE(res, x)                           \
-{                                                           \
-    int offset = GGML_F16_ARR >> 1;                         \
-    for (int i = 0; i < offset; ++i) {                      \
-        x[i] = wasm_f32x4_add(x[i], x[offset+i]);           \
-    }                                                       \
-    offset >>= 1;                                           \
-    for (int i = 0; i < offset; ++i) {                      \
-        x[i] = wasm_f32x4_add(x[i], x[offset+i]);           \
-    }                                                       \
-    offset >>= 1;                                           \
-    for (int i = 0; i < offset; ++i) {                      \
-        x[i] = wasm_f32x4_add(x[i], x[offset+i]);           \
-    }                                                       \
-    res = (ggml_float) (wasm_f32x4_extract_lane(x[0], 0) +  \
-          wasm_f32x4_extract_lane(x[0], 1) +                \
-          wasm_f32x4_extract_lane(x[0], 2) +                \
-          wasm_f32x4_extract_lane(x[0], 3));                \
-}
-
-#define GGML_F16_VEC                GGML_F16x4
-#define GGML_F16_VEC_ZERO           GGML_F16x4_ZERO
-#define GGML_F16_VEC_SET1           GGML_F16x4_SET1
-#define GGML_F16_VEC_LOAD(p, i)     GGML_F16x4_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
-#define GGML_F16_VEC_FMA            GGML_F16x4_FMA
-#define GGML_F16_VEC_ADD            GGML_F16x4_ADD
-#define GGML_F16_VEC_MUL            GGML_F16x4_MUL
-#define GGML_F16_VEC_REDUCE         GGML_F16x4_REDUCE
-
-#elif defined(__SSE3__)
-
-#define GGML_SIMD
-
-// F32 SSE
-
-#define GGML_F32_STEP 32
-#define GGML_F32_EPR  4
-
-#define GGML_F32x4         __m128
-#define GGML_F32x4_ZERO    _mm_setzero_ps()
-#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
-#define GGML_F32x4_LOAD    _mm_loadu_ps
-#define GGML_F32x4_STORE   _mm_storeu_ps
-#if defined(__FMA__)
-    // TODO: Does this work?
-    #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
-#else
-    #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
-#endif
-#define GGML_F32x4_ADD     _mm_add_ps
-#define GGML_F32x4_MUL     _mm_mul_ps
-#define GGML_F32x4_REDUCE(res, x)                                 \
-{                                                                 \
-    int offset = GGML_F32_ARR >> 1;                               \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
-    }                                                             \
-    const __m128 t0 = _mm_hadd_ps(x[0], x[0]);                    \
-    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0));        \
-}
-// TODO: is this optimal ?
-
-#define GGML_F32_VEC        GGML_F32x4
-#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
-
-// F16 SSE
-
-#define GGML_F16_STEP 32
-#define GGML_F16_EPR  4
-
-// Scalar-convert four fp16 values starting at x to fp32 and load them into an
-// SSE register.
-static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
-    float f32[4];
-
-    for (int lane = 0; lane < 4; ++lane) {
-        f32[lane] = GGML_FP16_TO_FP32(x[lane]);
-    }
-
-    return _mm_loadu_ps(f32);
-}
-
-// Dump the four fp32 lanes of y to a scratch buffer and write them to x[0..3]
-// as fp16 using the scalar converter.
-static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
-    float lanes[4];
-    _mm_storeu_ps(lanes, y);
-
-    for (int k = 0; k < 4; ++k) {
-        x[k] = GGML_FP32_TO_FP16(lanes[k]);
-    }
-}
-
-#define GGML_F32Cx4             __m128
-#define GGML_F32Cx4_ZERO        _mm_setzero_ps()
-#define GGML_F32Cx4_SET1(x)     _mm_set1_ps(x)
-#define GGML_F32Cx4_LOAD(x)     __sse_f16x4_load(x)
-#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
-#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
-#define GGML_F32Cx4_ADD         _mm_add_ps
-#define GGML_F32Cx4_MUL         _mm_mul_ps
-#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE
-
-#define GGML_F16_VEC                 GGML_F32Cx4
-#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
-#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
-#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
-#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
-#define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
-#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
-#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE
-
-#elif defined(__loongarch_asx)
-
-#define GGML_SIMD
-
-// F32 LASX
-#define GGML_F32_STEP 32
-#define GGML_F32_EPR  8
-
-#define GGML_F32x8         __m256
-#define GGML_F32x8_ZERO    (__m256)__lasx_xvldi(0)
-#define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
-#define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0)
-#define GGML_F32x8_STORE(x,y)   __lasx_xvst((y), (x), 0)
-#define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
-#define GGML_F32x8_ADD     __lasx_xvfadd_s
-#define GGML_F32x8_MUL     __lasx_xvfmul_s
-#define GGML_F32x8_REDUCE(res, x)                                 \
-do {                                                              \
-    int offset = GGML_F32_ARR >> 1;                               \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                  \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                  \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                  \
-    }                                                             \
-    float *tmp_p = (float *)&x[0]; \
-    res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7];  \
-} while (0)
-// TODO: is this optimal ?
-
-#define GGML_F32_VEC        GGML_F32x8
-#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
-
-// F16 LASX
-
-#define GGML_F16_STEP 32
-#define GGML_F16_EPR  8
-
-// F16 arithmetic is not supported by LASX, so we use F32 instead
-
-// F16 data is held in fp32 registers, so these mirror the F32x8 primitives.
-#define GGML_F32Cx8          __m256
-#define GGML_F32Cx8_ZERO    (__m256)__lasx_xvldi(0)
-// Broadcast a float with the same helper GGML_F32x8_SET1 uses. The integer
-// replicate (__lasx_xvreplgr2vr_w) would first convert the float argument to
-// an int instead of preserving its value.
-#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
-
-// Load eight fp16 values from x and return them converted through
-// __lasx_xvfcvtl_s_h as a 256-bit float vector.
-static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
-    __m256i a;
-    // copies sizeof(ggml_fp16_t) * 8 bytes; any remaining bytes of `a` are left unset
-    memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
-    // NOTE(review): presumably rearranges the 64-bit elements so that each 128-bit half
-    // starts with four of the loaded values for the convert below - confirm against the
-    // LASX xvpermi.d documentation
-    a = __lasx_xvpermi_d(a, 0 | (1 << 4));
-    return __lasx_xvfcvtl_s_h(a);
-}
-
-// Convert y through __lasx_xvfcvt_h_s and write eight fp16 values to x.
-static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
-    __m256i a = __lasx_xvfcvt_h_s(y, y);
-    // NOTE(review): presumably gathers the converted values into the leading bytes of `a`
-    // ahead of the copy - confirm against the LASX xvpermi.d documentation
-    a = __lasx_xvpermi_d(a, 0 | (2 << 2));
-    // copies sizeof(ggml_fp16_t) * 8 bytes from the start of `a`
-    memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
-}
-#define GGML_F32Cx8_LOAD(x)     __lasx_f32cx8_load(x)
-#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
-
-#define GGML_F32Cx8_FMA         GGML_F32x8_FMA
-#define GGML_F32Cx8_ADD         __lasx_xvfadd_s
-#define GGML_F32Cx8_MUL         __lasx_xvfmul_s
-#define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE
-
-#define GGML_F16_VEC                GGML_F32Cx8
-#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
-#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
-#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
-#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
-#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
-#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
-#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE
-
-#elif defined(__loongarch_sx)
-
-#define GGML_SIMD
-
-// F32 LSX
-
-#define GGML_F32_STEP 32
-#define GGML_F32_EPR  4
-
-// LSX primitives behind the generic GGML_F32_VEC_* names.
-#define GGML_F32x4         __m128
-#define GGML_F32x4_ZERO    __lsx_vldi(0)
-// NOTE(review): this looks like it inserts x into element 0 of a zero vector
-// rather than broadcasting it to all four lanes as the other backends' SET1
-// macros do, and the intrinsic appears to take an integer - confirm.
-#define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
-#define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
-// Macro parameters must be plain identifiers; the parentheses belong in the
-// replacement list only.
-#define GGML_F32x4_STORE(x, y)   __lsx_vst((y), (x), 0)
-// a + b*c
-#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
-#define GGML_F32x4_ADD     __lsx_vfadd_s
-#define GGML_F32x4_MUL     __lsx_vfmul_s
-#define GGML_F32x4_REDUCE(res, x)                                                     \
-{                                                                                     \
-    int offset = GGML_F32_ARR >> 1;                                                   \
-    for (int i = 0; i < offset; ++i) {                                                \
-        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                                    \
-    }                                                                                 \
-    offset >>= 1;                                                                     \
-    for (int i = 0; i < offset; ++i) {                                                \
-        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                                    \
-    }                                                                                 \
-    offset >>= 1;                                                                     \
-    for (int i = 0; i < offset; ++i) {                                                \
-        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                                    \
-    }                                                                                 \
-    __m128i tmp     = __lsx_vsrli_d((__m128i) x[0], 32);                              \
-    tmp             = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]);                    \
-    tmp             = __lsx_vpickev_w(__lsx_vldi(0), tmp);                            \
-    const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88);                                     \
-    tmp             = __lsx_vsrli_d((__m128i) t0, 32);                                \
-    tmp             = (__m128i) __lsx_vfadd_s((__m128) tmp, t0);                      \
-    tmp             = __lsx_vpickev_w(__lsx_vldi(0), tmp);                            \
-    res             = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
-}
-
-#define GGML_F32_VEC        GGML_F32x4
-#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
-
-// F16 LSX
-
-#define GGML_F16_STEP 32
-#define GGML_F16_EPR  4
-
-// Scalar-convert four fp16 values starting at x to fp32 and load them as one
-// LSX vector.
-static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
-    float f32[4];
-
-    for (int lane = 0; lane < 4; ++lane) {
-        f32[lane] = GGML_FP16_TO_FP32(x[lane]);
-    }
-
-    return __lsx_vld(f32, 0);
-}
-
-// Store the lanes of y to a scratch float buffer and write them to x[0..3] as
-// fp16 using the scalar converter.
-static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
-    float lanes[4];
-    __lsx_vst(y, lanes, 0);
-
-    for (int k = 0; k < 4; ++k) {
-        x[k] = GGML_FP32_TO_FP16(lanes[k]);
-    }
-}
-
-#define GGML_F32Cx4             __m128
-#define GGML_F32Cx4_ZERO        __lsx_vldi(0)
-#define GGML_F32Cx4_SET1(x)     __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
-#define GGML_F32Cx4_LOAD(x)     __lsx_f16x4_load(x)
-#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
-#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
-#define GGML_F32Cx4_ADD         __lsx_vfadd_s
-#define GGML_F32Cx4_MUL         __lsx_vfmul_s
-#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE
-
-#define GGML_F16_VEC                 GGML_F32Cx4
-#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
-#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
-#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
-#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
-#define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
-#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
-#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE
-
-#elif defined(__VXE__) || defined(__VXE2__)
-
-#define GGML_SIMD
-
-// F32 s390x
-
-#define GGML_F32_STEP 32
-#define GGML_F32_EPR  4
-
-#define GGML_F32x4              __vector float
-#define GGML_F32x4_ZERO         vec_splats(0.0f)
-#define GGML_F32x4_SET1         vec_splats
-#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
-#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
-#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
-#define GGML_F32x4_ADD          vec_add
-#define GGML_F32x4_MUL          vec_mul
-#define GGML_F32x4_REDUCE(res, x)                   \
-{                                                   \
-    int offset = GGML_F32_ARR >> 1;                 \
-    for (int i = 0; i < offset; ++i) {              \
-        x[i] = vec_add(x[i], x[offset + i]);        \
-    }                                               \
-    offset >>= 1;                                   \
-    for (int i = 0; i < offset; ++i) {              \
-        x[i] = vec_add(x[i], x[offset + i]);        \
-    }                                               \
-    offset >>= 1;                                   \
-    for (int i = 0; i < offset; ++i) {              \
-        x[i] = vec_add(x[i], x[offset + i]);        \
-    }                                               \
-    res = vec_extract(x[0], 0) +                    \
-          vec_extract(x[0], 1) +                    \
-          vec_extract(x[0], 2) +                    \
-          vec_extract(x[0], 3);                     \
-}
-
-#define GGML_F32_VEC        GGML_F32x4
-#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
-
-// F16 s390x
-#define GGML_F16_STEP GGML_F32_STEP
-#define GGML_F16_EPR  GGML_F32_EPR
-
-// Convert four fp16 values starting at x to fp32 with the scalar converter
-// and load them as a single vector.
-static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) {
-    float f32[4];
-    int   lane = 0;
-
-    while (lane < 4) {
-        f32[lane] = GGML_FP16_TO_FP32(x[lane]);
-        ++lane;
-    }
-
-    return vec_xl(0, f32);
-}
-
-// Write the four fp32 lanes of y to a scratch buffer, then store each one to
-// x[0..3] converted to fp16.
-static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) {
-    float lanes[4];
-    vec_xst(y, 0, lanes);
-
-    for (int k = 0; k != 4; ++k) {
-        x[k] = GGML_FP32_TO_FP16(lanes[k]);
-    }
-}
-
-#define GGML_F16_VEC                GGML_F32x4
-#define GGML_F16_VEC_ZERO           GGML_F32x4_ZERO
-#define GGML_F16_VEC_SET1           GGML_F32x4_SET1
-#define GGML_F16_VEC_LOAD(p, i)     __lzs_f16cx4_load(p)
-#define GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i])
-#define GGML_F16_VEC_FMA            GGML_F32x4_FMA
-#define GGML_F16_VEC_ADD            GGML_F32x4_ADD
-#define GGML_F16_VEC_MUL            GGML_F32x4_MUL
-#define GGML_F16_VEC_REDUCE         GGML_F32x4_REDUCE
-
-#endif
-
-// GGML_F32_ARR / GGML_F16_ARR
-//   number of registers to use per step
-#ifdef GGML_SIMD
-#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
-#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
-#endif
--- a/ggml/src/ggml-cpu/unary-ops.cpp
+++ b/ggml/src/ggml-cpu/unary-ops.cpp
@ -1,186 +0,0 @@
-#include "unary-ops.h"
-
-// Scalar kernels for the element-wise unary operators. Each one maps a single
-// float to a single float and is used as a template argument further down.
-
-// Absolute value.
-static inline float op_abs(float x) {
-    return fabsf(x);
-}
-
-// Sign: 1 for x > 0, -1 for x < 0, 0 for anything else.
-static inline float op_sgn(float x) {
-    if (x > 0.f) {
-        return 1.f;
-    }
-    if (x < 0.f) {
-        return -1.f;
-    }
-    return 0.f;
-}
-
-// Negation.
-static inline float op_neg(float x) {
-    return -x;
-}
-
-// Step: 1 for x > 0, otherwise 0.
-static inline float op_step(float x) {
-    if (x > 0.f) {
-        return 1.f;
-    }
-    return 0.f;
-}
-
-// Hyperbolic tangent.
-static inline float op_tanh(float x) {
-    return tanhf(x);
-}
-
-// ELU: x for x > 0, otherwise exp(x) - 1.
-static inline float op_elu(float x) {
-    if (x > 0.f) {
-        return x;
-    }
-    return expm1f(x);
-}
-
-// ReLU: x for x > 0, otherwise 0.
-static inline float op_relu(float x) {
-    if (x > 0.f) {
-        return x;
-    }
-    return 0.f;
-}
-
-// Logistic sigmoid: 1 / (1 + exp(-x)).
-static inline float op_sigmoid(float x) {
-    const float e = expf(-x);
-    return 1.f / (1.f + e);
-}
-
-// Hard sigmoid: (x + 3) / 6 clamped to [0, 1].
-static inline float op_hardsigmoid(float x) {
-    const float scaled = (x + 3.0f) / 6.0f;
-    return fminf(1.0f, fmaxf(0.0f, scaled));
-}
-
-// Exponential.
-static inline float op_exp(float x) {
-    return expf(x);
-}
-
-// Hard swish: x times (x + 3) / 6 clamped to [0, 1].
-static inline float op_hardswish(float x) {
-    const float gate = fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
-    return x * gate;
-}
-
-// Square.
-static inline float op_sqr(float x) {
-    return x * x;
-}
-
-// Square root.
-static inline float op_sqrt(float x) {
-    return sqrtf(x);
-}
-
-// Sine.
-static inline float op_sin(float x) {
-    return sinf(x);
-}
-
-// Cosine.
-static inline float op_cos(float x) {
-    return cosf(x);
-}
-
-// Natural logarithm.
-static inline float op_log(float x) {
-    return logf(x);
-}
-
-// Apply `op` to n elements of x and write the results to y.
-// Each input is converted to float with type_conversion_table<src0_t>::to_f32,
-// passed through op, and converted to the destination type with
-// type_conversion_table<dst_t>::from_f32.
-template <float (*op)(float), typename src0_t, typename dst_t>
-static inline void vec_unary_op(int64_t n, dst_t * y, const src0_t * x) {
-    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
-    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
-
-    // the index has the same width as n so it cannot overflow before reaching n
-    for (int64_t i = 0; i < n; i++) {
-        y[i] = f32_to_dst(op(src0_to_f32(x[i])));
-    }
-}
-
-// Run `op` over the rows of dst->src[0] assigned to this thread and write the
-// results to dst. Both tensors must pass ggml_is_contiguous_1, have the same
-// shape, and have element strides equal to sizeof(src0_t) / sizeof(dst_t).
-template <float (*op)(float), typename src0_t, typename dst_t>
-static void apply_unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_is_contiguous_1(src0) && ggml_is_contiguous_1(dst) && ggml_are_same_shape(src0, dst));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    GGML_ASSERT( nb0 == sizeof(dst_t));
-    GGML_ASSERT(nb00 == sizeof(src0_t));
-
-    // half-open range of flattened row indices handled by this thread
-    const auto [row_begin, row_end] = get_thread_range(params, src0);
-    const int64_t rows_per_i03 = ne02*ne01;
-
-    for (int64_t row = row_begin; row < row_end; ++row) {
-        // split the flattened row index into its (i03, i02, i01) coordinates
-        const int64_t i03 = row/rows_per_i03;
-        const int64_t rem = row - i03*rows_per_i03;
-        const int64_t i02 = rem/ne01;
-        const int64_t i01 = rem - i02*ne01;
-
-        // byte addresses of the selected row in the source and destination
-        const char * src_row = (const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01;
-        char       * dst_row = (char *)       dst->data  + i03*nb3  + i02*nb2  + i01*nb1;
-
-        vec_unary_op<op>(ne0, (dst_t *) dst_row, (const src0_t *) src_row);
-    }
-}
-
-// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
-//
-// Select the apply_unary_op instantiation that matches the element types of
-// dst->src[0] and dst. Supported pairs (src0 -> dst): f32 -> f32, f16 -> f16,
-// bf16 -> bf16, bf16 -> f32 and f16 -> f32. Any other pair is reported on
-// stderr and aborts.
-template <float (*op)(float)>
-static void unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-
-    const auto src_type = src0->type;
-    const auto dst_type = dst->type;
-
-    if (dst_type == GGML_TYPE_F32) {
-        // an f32 destination accepts f32, f16 or bf16 input
-        if (src_type == GGML_TYPE_F32) {
-            apply_unary_op<op, float, float>(params, dst);
-            return;
-        }
-        if (src_type == GGML_TYPE_F16) {
-            apply_unary_op<op, ggml_fp16_t, float>(params, dst);
-            return;
-        }
-        if (src_type == GGML_TYPE_BF16) {
-            apply_unary_op<op, ggml_bf16_t, float>(params, dst);
-            return;
-        }
-    } else if (dst_type == GGML_TYPE_F16 && src_type == GGML_TYPE_F16) {
-        apply_unary_op<op, ggml_fp16_t, ggml_fp16_t>(params, dst);
-        return;
-    } else if (dst_type == GGML_TYPE_BF16 && src_type == GGML_TYPE_BF16) {
-        apply_unary_op<op, ggml_bf16_t, ggml_bf16_t>(params, dst);
-        return;
-    }
-
-    // no supported combination matched
-    fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
-        ggml_type_name(dst->type), ggml_type_name(src0->type));
-    GGML_ABORT("fatal error");
-}
-
-// Entry points for the unary operators. Each one forwards params and dst to
-// unary_op<> instantiated with the scalar kernel of the same name.
-void ggml_compute_forward_abs(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_abs>(params, dst);
-}
-
-void ggml_compute_forward_sgn(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_sgn>(params, dst);
-}
-
-void ggml_compute_forward_neg(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_neg>(params, dst);
-}
-
-void ggml_compute_forward_step(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_step>(params, dst);
-}
-
-void ggml_compute_forward_tanh(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_tanh>(params, dst);
-}
-
-void ggml_compute_forward_elu(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_elu>(params, dst);
-}
-
-void ggml_compute_forward_relu(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_relu>(params, dst);
-}
-
-void ggml_compute_forward_sigmoid(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_sigmoid>(params, dst);
-}
-
-void ggml_compute_forward_hardsigmoid(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_hardsigmoid>(params, dst);
-}
-
-void ggml_compute_forward_exp(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_exp>(params, dst);
-}
-
-void ggml_compute_forward_hardswish(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_hardswish>(params, dst);
-}
-
-void ggml_compute_forward_sqr(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_sqr>(params, dst);
-}
-
-void ggml_compute_forward_sqrt(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_sqrt>(params, dst);
-}
-
-void ggml_compute_forward_sin(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_sin>(params, dst);
-}
-
-void ggml_compute_forward_cos(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_cos>(params, dst);
-}
-
-void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_log>(params, dst);
-}
--- a/ggml/src/ggml-cpu/unary-ops.h
+++ b/ggml/src/ggml-cpu/unary-ops.h
@ -1,28 +0,0 @@
-#pragma once
-
-#include "common.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Unary-op entry points, one per operation. They are declared inside the extern "C" guard of
-// this header, so they have C linkage when the header is included from C++.
-// Every function takes the compute parameters and the destination tensor and returns nothing.
-// NOTE(review): presumably the input tensor is reached through dst -- confirm in unary-ops.cpp.
-void ggml_compute_forward_abs(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_sgn(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_neg(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_step(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_tanh(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_elu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_sigmoid(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_hardsigmoid(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_exp(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_hardswish(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_sqr(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_sin(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_cos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_log(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-
-#ifdef __cplusplus
-}
-#endif
--- a/ggml/src/ggml-cpu/vec.cpp
+++ b/ggml/src/ggml-cpu/vec.cpp
@ -1,258 +0,0 @@
-#include "vec.h"
-
-#include <cassert>
-
-#if defined(_MSC_VER)
-// disable "possible loss of data" to avoid hundreds of casts
-// we should just be careful :)
-#pragma warning(disable: 4244 4267)
-#endif
-
-// precomputed gelu table for f16 (128 KB)
-ggml_fp16_t ggml_table_gelu_f16[1 << 16];
-
-// precomputed quick gelu table for f16 (128 KB)
-ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
-
-// Dot product of two f32 vectors: stores sum(x[i]*y[i]) for i in [0, n) into *s.
-//   n          - number of elements read from x and from y
-//   s          - output location for the single result
-//   bs, bx, by - accepted but not used by this implementation
-//   nrc        - must be 1 (asserted)
-void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
-   assert(nrc == 1);
-   GGML_UNUSED(nrc);
-   GGML_UNUSED(bx);
-   GGML_UNUSED(by);
-   GGML_UNUSED(bs);
-
-#if defined(GGML_SIMD)
-    // SIMD path accumulates in single precision
-    float sumf = 0.0f;
-    // number of leading elements handled by the vector loop
-    // (the mask assumes GGML_F32_STEP is a power of two -- TODO confirm in simd-mappings.h)
-    const int np = (n & ~(GGML_F32_STEP - 1));
-
-    // GGML_F32_ARR independent partial accumulators
-    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
-
-    GGML_F32_VEC ax[GGML_F32_ARR];
-    GGML_F32_VEC ay[GGML_F32_ARR];
-
-    // each outer iteration consumes GGML_F32_STEP elements, GGML_F32_EPR per accumulator
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-
-            sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
-        }
-    }
-
-    // reduce the partial accumulators in sum[] into the scalar sumf
-    GGML_F32_VEC_REDUCE(sumf, sum);
-
-    // leftovers: the tail elements that did not fill a whole step
-    for (int i = np; i < n; ++i) {
-        sumf += x[i]*y[i];
-    }
-#else
-    // scalar: accumulate in ggml_float; each product is formed in float first
-    ggml_float sumf = 0.0;
-    for (int i = 0; i < n; ++i) {
-        sumf += (ggml_float)(x[i]*y[i]);
-    }
-#endif
-
-    *s = sumf;
-}
-
-// Dot product of two bf16 vectors: stores sum(x[i]*y[i]) for i in [0, n) into *s.
-//   n          - number of elements read from x and from y
-//   s          - output location for the single result
-//   bs, bx, by - accepted but not used by this implementation
-//   nrc        - must be 1 (asserted)
-// One of three x86 SIMD paths is compiled in when available; the scalar loop at the end
-// finishes whatever the SIMD loop left over (or everything when no SIMD path is compiled).
-void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc) {
-    assert(nrc == 1);
-    GGML_UNUSED(nrc);
-    GGML_UNUSED(bx);
-    GGML_UNUSED(by);
-    GGML_UNUSED(bs);
-    // i is shared between the SIMD loop and the scalar tail
-    int i = 0;
-    ggml_float sumf = 0;
-
-#if defined(__AVX512BF16__)
-    // 64 elements per iteration: two accumulators, each fed 32 bf16 values via _mm512_dpbf16_ps
-    __m512 c1 = _mm512_setzero_ps();
-    __m512 c2 = _mm512_setzero_ps();
-    for (; i + 64 <= n; i += 64) {
-        c1 = _mm512_dpbf16_ps(c1, m512bh(_mm512_loadu_si512((x + i))),
-                             m512bh(_mm512_loadu_si512((y + i))));
-        c2 = _mm512_dpbf16_ps(c2, m512bh(_mm512_loadu_si512((x + i + 32))),
-                             m512bh(_mm512_loadu_si512((y + i + 32))));
-    }
-    sumf += (ggml_float)_mm512_reduce_add_ps(c1);
-    sumf += (ggml_float)_mm512_reduce_add_ps(c2);
-
-#elif defined(__AVX512F__)
-// LOAD: read 16 16-bit values, zero-extend each to 32 bits, shift left by 16 and reinterpret as f32
-#define LOAD(p) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(p))), 16))
-    // 32 elements per iteration: two accumulators of 16 lanes each
-    __m512 c1 = _mm512_setzero_ps();
-    __m512 c2 = _mm512_setzero_ps();
-    for (; i + 32 <= n; i += 32) {
-        c1 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
-        c2 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c2);
-    }
-    sumf += (ggml_float)_mm512_reduce_add_ps(c1);
-    sumf += (ggml_float)_mm512_reduce_add_ps(c2);
-
-#undef LOAD
-#elif defined(__AVX2__) || defined(__AVX__)
-// LOAD: same widening as above for 8 values; the AVX-only variant builds the 256-bit
-// vector from two 128-bit halves because it only uses 128-bit integer operations
-#if defined(__AVX2__)
-#define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16))
-#else
-#define LOAD(p) _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)), (_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_bsrli_si128(_mm_loadu_si128((const __m128i *)(p)), 8)), 16)), 1))
-#endif
-    // 32 elements per iteration: four accumulators of 8 lanes each
-    __m256 c1 = _mm256_setzero_ps();
-    __m256 c2 = _mm256_setzero_ps();
-    __m256 c3 = _mm256_setzero_ps();
-    __m256 c4 = _mm256_setzero_ps();
-    for (; i + 32 <= n; i += 32) {
-        c1 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
-        c2 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 8), LOAD(y + i + 8)), c2);
-        c3 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c3);
-        c4 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 24), LOAD(y + i + 24)), c4);
-    }
-    // horizontal sum: 4 accumulators -> 1, 256 bits -> 128 bits, then 4 lanes -> 1 lane
-    __m128 g;
-    c1 = _mm256_add_ps(_mm256_add_ps(c1, c3),
-                       _mm256_add_ps(c2, c4));
-    g = _mm_add_ps(_mm256_extractf128_ps(c1, 1),
-                   _mm256_castps256_ps128(c1));
-    g = _mm_add_ps(g, _mm_movehl_ps(g, g));
-    g = _mm_add_ss(g, _mm_movehdup_ps(g));
-    sumf += (ggml_float)_mm_cvtss_f32(g);
-
-#undef LOAD
-#endif
-
-    // scalar tail, starting at wherever the SIMD loop stopped
-    for (; i < n; ++i) {
-        sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) *
-                             GGML_BF16_TO_FP32(y[i]));
-    }
-    *s = sumf;
-}
-
-// Dot product of two fp16 vectors: stores sum(x[i]*y[i]) for i in [0, n) into *s.
-//   n          - number of elements read from x and from y
-//   s          - output location for the single result
-//   bs, bx, by - accepted but not used by this implementation
-//   nrc        - must be 1 (asserted)
-void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc) {
-    assert(nrc == 1);
-    GGML_UNUSED(nrc);
-    GGML_UNUSED(bx);
-    GGML_UNUSED(by);
-    GGML_UNUSED(bs);
-
-    ggml_float sumf = 0.0;
-
-#if defined(GGML_SIMD)
-    // number of leading elements handled by the vector loop
-    // (the mask assumes GGML_F16_STEP is a power of two -- TODO confirm in simd-mappings.h)
-    const int np = (n & ~(GGML_F16_STEP - 1));
-
-    // GGML_F16_ARR independent partial accumulators
-    GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
-
-    GGML_F16_VEC ax[GGML_F16_ARR];
-    GGML_F16_VEC ay[GGML_F16_ARR];
-
-    // each outer iteration consumes GGML_F16_STEP elements, GGML_F16_EPR per accumulator
-    for (int i = 0; i < np; i += GGML_F16_STEP) {
-        for (int j = 0; j < GGML_F16_ARR; j++) {
-            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-
-            sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
-        }
-    }
-
-    // reduce the partial accumulators in sum[] into the scalar sumf
-    GGML_F16_VEC_REDUCE(sumf, sum);
-
-    // leftovers: tail elements are converted to f32 and multiplied one by one
-    for (int i = np; i < n; ++i) {
-        sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
-    }
-#else
-    // scalar fallback: convert every element to f32, multiply, accumulate in ggml_float
-    for (int i = 0; i < n; ++i) {
-        sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
-    }
-#endif
-
-    *s = sumf;
-}
-
-// Applies SiLU element-wise: y[i] = silu(x[i]) for i in [0, n).
-// At most one SIMD loop is compiled in (16, 8 or 4 floats per iteration depending on the
-// target); it uses the vector ggml_v_silu. The final scalar loop handles the remaining
-// elements with ggml_silu_f32, or all of them when no SIMD path is compiled.
-void ggml_vec_silu_f32(const int n, float * y, const float * x) {
-    // i is shared between the SIMD loop and the scalar tail
-    int i = 0;
-#if defined(__AVX512F__) && defined(__AVX512DQ__)
-    for (; i + 15 < n; i += 16) {
-        _mm512_storeu_ps(y + i, ggml_v_silu(_mm512_loadu_ps(x + i)));
-    }
-#elif defined(__AVX2__) && defined(__FMA__)
-    for (; i + 7 < n; i += 8) {
-        _mm256_storeu_ps(y + i, ggml_v_silu(_mm256_loadu_ps(x + i)));
-    }
-#elif defined(__SSE2__)
-    for (; i + 3 < n; i += 4) {
-        _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
-    }
-#elif defined(__ARM_NEON) && defined(__aarch64__)
-    for (; i + 3 < n; i += 4) {
-        vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
-    }
-#endif
-    // scalar tail
-    for (; i < n; ++i) {
-        y[i] = ggml_silu_f32(x[i]);
-    }
-}
-
-// Writes y[i] = exp(x[i] - max) for i in [0, n) and returns the sum of the written values.
-// The result is not normalized here; the returned sum is what a caller would divide by.
-//   n   - number of elements
-//   y   - output buffer
-//   x   - input values
-//   max - value subtracted from every input before exponentiation
-// SIMD paths use the vector ggml_v_expf; the scalar tail uses expf.
-ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
-    // i is shared between the SIMD loop and the scalar tail
-    int i = 0;
-    ggml_float sum = 0;
-#if defined(__AVX512F__) && defined(__AVX512DQ__)
-    for (; i + 15 < n; i += 16) {
-        __m512 val = ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i),
-                                               _mm512_set1_ps(max)));
-        _mm512_storeu_ps(y + i, val);
-        sum += (ggml_float)_mm512_reduce_add_ps(val);
-    }
-#elif defined(__AVX2__) && defined(__FMA__)
-    for (; i + 7 < n; i += 8) {
-        __m256 val = ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i),
-                                               _mm256_set1_ps(max)));
-        _mm256_storeu_ps(y + i, val);
-        // horizontal sum of the 8 lanes: 256 bits -> 128 bits -> 1 lane
-        __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
-                                 _mm256_castps256_ps128(val));
-        val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
-        val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
-        sum += (ggml_float)_mm_cvtss_f32(val2);
-    }
-#elif defined(__SSE2__)
-    for (; i + 3 < n; i += 4) {
-        __m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i),
-                                            _mm_set1_ps(max)));
-        _mm_storeu_ps(y + i, val);
-        // horizontal sum of the 4 lanes; _mm_movehdup_ps is only used when an AVX-level
-        // target is enabled, otherwise a shuffle-based sequence is used instead
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
-        val = _mm_add_ps(val, _mm_movehl_ps(val, val));
-        val = _mm_add_ss(val, _mm_movehdup_ps(val));
-#else
-        __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
-        val = _mm_add_ps(val, tmp);
-        tmp = _mm_movehl_ps(tmp, val);
-        val = _mm_add_ss(val, tmp);
-#endif
-        sum += (ggml_float)_mm_cvtss_f32(val);
-    }
-#elif defined(__ARM_NEON) && defined(__aarch64__)
-    for (; i + 3 < n; i += 4) {
-        float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
-                                                vdupq_n_f32(max)));
-        vst1q_f32(y + i, val);
-        sum += (ggml_float)vaddvq_f32(val);
-    }
-#endif
-    // scalar tail (or the whole vector when no SIMD path is compiled)
-    for (; i < n; ++i) {
-        float val = expf(x[i] - max);
-        sum += (ggml_float)val;
-        y[i] = val;
-    }
-    return sum;
-}
-
-// Writes the shifted inputs y[i] = x[i] - max for i in [0, n) and returns
-// log(sum_j exp(x[j] - max)), i.e. the log of the softmax normalizer.
-//
-// Identity this relies on:
-//   log(soft_max_i) = log(exp(x_i - max) / sum_j exp(x_j - max))
-//                   = (x_i - max) - log(sum_j exp(x_j - max))
-// so subtracting the returned value from each y[i] yields the log-softmax.
-//   n   - number of elements
-//   y   - output buffer, receives x[i] - max
-//   x   - input values
-//   max - value subtracted from every input
-ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max) {
-    ggml_float sum = 0;
-    for (int i = 0; i < n; ++i) {
-        const float val = x[i] - max;
-        y[i] = val;
-        sum += (ggml_float)expf(val);
-    }
-    // logf operates in single precision, so the accumulator is narrowed to float first
-    return (ggml_float)logf(sum);
-}
--- a/ggml/src/ggml-cpu/vec.h
+++ b/ggml/src/ggml-cpu/vec.h
@ -1,802 +0,0 @@
-// Vectorized functions for fundamental operations
-
-#pragma once
-
-#include "ggml-impl.h"
-#include "simd-mappings.h"
-#include "ggml.h"
-
-#if defined(GGML_USE_ACCELERATE)
-#include <Accelerate/Accelerate.h>
-#endif
-
-// floating point type used to accumulate sums
-typedef double ggml_float;
-
-#define GGML_GELU_FP16
-#define GGML_GELU_QUICK_FP16
-
-#define GGML_SOFT_MAX_UNROLL 4
-#define GGML_VEC_DOT_UNROLL  2
-#define GGML_VEC_MAD_UNROLL  32
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-//
-// global data
-//
-
-// precomputed gelu table for f16 (128 KB)
-extern ggml_fp16_t ggml_table_gelu_f16[1 << 16];
-
-// precomputed quick gelu table for f16 (128 KB)
-extern ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
-
-//
-// fundamental operations
-//
-
-void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
-void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
-void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
-
-void ggml_vec_silu_f32(const int n, float * y, const float * x);
-ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
-ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);
-
-// fill x[0..n) with the 8-bit value v
-inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) {
-    for (int k = 0; k < n; k++) {
-        x[k] = v;
-    }
-}
-// fill x[0..n) with the 16-bit value v
-inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) {
-    for (int k = 0; k < n; k++) {
-        x[k] = v;
-    }
-}
-
-// fill x[0..n) with the 32-bit value v
-inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) {
-    for (int k = 0; k < n; k++) {
-        x[k] = v;
-    }
-}
-// copy x[0..n) into y[0..n), front to back
-inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) {
-    for (int k = 0; k < n; k++) {
-        y[k] = x[k];
-    }
-}
-
-// fill x[0..n) with the fp16 value v
-inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) {
-    for (int k = 0; k < n; k++) {
-        x[k] = v;
-    }
-}
-// fill x[0..n) with the bf16 value v
-inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) {
-    for (int k = 0; k < n; k++) {
-        x[k] = v;
-    }
-}
-// z[k] = x[k] + y[k]
-inline static void ggml_vec_add_f32(const int n, float * z, const float * x, const float * y) {
-    for (int k = 0; k < n; k++) {
-        z[k] = x[k] + y[k];
-    }
-}
-// z[k] = x[k] + y[k], with the addition carried out in f32
-inline static void ggml_vec_add_f16(const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
-    for (int k = 0; k < n; k++) {
-        const float lhs = GGML_FP16_TO_FP32(x[k]);
-        const float rhs = GGML_FP16_TO_FP32(y[k]);
-        z[k] = GGML_FP32_TO_FP16(lhs + rhs);
-    }
-}
-// z[k] = x[k] + v
-inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) {
-    for (int k = 0; k < n; k++) {
-        z[k] = x[k] + v;
-    }
-}
-// y[k] += x[k]
-inline static void ggml_vec_acc_f32(const int n, float * y, const float * x) {
-    for (int k = 0; k < n; k++) {
-        y[k] += x[k];
-    }
-}
-// y[k] += v
-inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) {
-    for (int k = 0; k < n; k++) {
-        y[k] += v;
-    }
-}
-// z[k] = x[k] - y[k]
-inline static void ggml_vec_sub_f32(const int n, float * z, const float * x, const float * y) {
-    for (int k = 0; k < n; k++) {
-        z[k] = x[k] - y[k];
-    }
-}
-// z[k] = x[k] - y[k], with the subtraction carried out in f32
-inline static void ggml_vec_sub_f16(const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
-    for (int k = 0; k < n; k++) {
-        const float lhs = GGML_FP16_TO_FP32(x[k]);
-        const float rhs = GGML_FP16_TO_FP32(y[k]);
-        z[k] = GGML_FP32_TO_FP16(lhs - rhs);
-    }
-}
-// fill x[0..n) with v
-inline static void ggml_vec_set_f32(const int n, float * x, const float v) {
-    for (int k = 0; k < n; k++) {
-        x[k] = v;
-    }
-}
-// copy x[0..n) into y[0..n), front to back
-inline static void ggml_vec_cpy_f32(const int n, float * y, const float * x) {
-    for (int k = 0; k < n; k++) {
-        y[k] = x[k];
-    }
-}
-// y[k] = -x[k]
-inline static void ggml_vec_neg_f32(const int n, float * y, const float * x) {
-    for (int k = 0; k < n; k++) {
-        y[k] = -x[k];
-    }
-}
-// y[k] = -x[k], negated in f32
-inline static void ggml_vec_neg_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    for (int k = 0; k < n; k++) {
-        const float negated = -GGML_FP16_TO_FP32(x[k]);
-        y[k] = GGML_FP32_TO_FP16(negated);
-    }
-}
-
-// z[k] = x[k] * y[k]
-inline static void ggml_vec_mul_f32(const int n, float * z, const float * x, const float * y) {
-    for (int k = 0; k < n; k++) {
-        z[k] = x[k]*y[k];
-    }
-}
-// z[k] = x[k] * y[k], multiplied in f32
-inline static void ggml_vec_mul_f16(const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
-    for (int k = 0; k < n; k++) {
-        const float lhs = GGML_FP16_TO_FP32(x[k]);
-        const float rhs = GGML_FP16_TO_FP32(y[k]);
-        z[k] = GGML_FP32_TO_FP16(lhs * rhs);
-    }
-}
-// z[k] = x[k] / y[k]
-inline static void ggml_vec_div_f32(const int n, float * z, const float * x, const float * y) {
-    for (int k = 0; k < n; k++) {
-        z[k] = x[k]/y[k];
-    }
-}
-// z[k] = x[k] / y[k], divided in f32
-inline static void ggml_vec_div_f16(const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
-    for (int k = 0; k < n; k++) {
-        const float lhs = GGML_FP16_TO_FP32(x[k]);
-        const float rhs = GGML_FP16_TO_FP32(y[k]);
-        z[k] = GGML_FP32_TO_FP16(lhs / rhs);
-    }
-}
-
-// compute GGML_VEC_DOT_UNROLL dot products at once
-//   n  - number of elements per row
-//   xs - x row stride in bytes
-//   s  - output; s[k] receives the dot product of row k of xv with y
-//   xv - base address of the fp16 rows; row k starts at (char *) xv + k*xs
-//   y  - fp16 vector shared by all GGML_VEC_DOT_UNROLL dot products
-inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) {
-    ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
-
-    ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL];
-
-    // resolve the start of each row from the byte stride
-    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
-        x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
-    }
-
-#if defined(GGML_SIMD)
-    // number of leading elements handled by the vector loop
-    // (the mask assumes GGML_F16_STEP is a power of two -- TODO confirm in simd-mappings.h)
-    const int np = (n & ~(GGML_F16_STEP - 1));
-
-    // one set of GGML_F16_ARR partial accumulators per row
-    GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
-
-    GGML_F16_VEC ax[GGML_F16_ARR];
-    GGML_F16_VEC ay[GGML_F16_ARR];
-
-    for (int i = 0; i < np; i += GGML_F16_STEP) {
-        for (int j = 0; j < GGML_F16_ARR; j++) {
-            // y is loaded once and reused for every row
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-
-            for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
-                ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
-
-                sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
-            }
-        }
-    }
-
-    // reduce each row's partial accumulators into its scalar sumf[k]
-    for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
-        GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
-    }
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
-        }
-    }
-#else
-    // scalar fallback
-    for (int i = 0; i < n; ++i) {
-        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
-        }
-    }
-#endif
-
-    // narrow the accumulated sums to float on output
-    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
-        s[i] = (float)sumf[i];
-    }
-}
-
-// Multiply-add in place: y[i] += x[i]*v for i in [0, n).
-//   n - number of elements
-//   y - vector updated in place
-//   x - vector that is scaled by v and added to y
-//   v - scalar multiplier
-inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
-#if defined(GGML_SIMD)
-    // number of leading elements handled by the vector loop
-    // (the mask assumes GGML_F32_STEP is a power of two -- TODO confirm in simd-mappings.h)
-    const int np = (n & ~(GGML_F32_STEP - 1));
-
-    // v replicated into every lane
-    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
-
-    GGML_F32_VEC ax[GGML_F32_ARR];
-    GGML_F32_VEC ay[GGML_F32_ARR];
-
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-            // presumably ay + ax*vx, matching the scalar loops below -- macro defined elsewhere
-            ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
-
-            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
-        }
-    }
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        y[i] += x[i]*v;
-    }
-#else
-    // scalar
-    for (int i = 0; i < n; ++i) {
-        y[i] += x[i]*v;
-    }
-#endif
-}
-
-// Multiply-add in place on fp16 data: y[i] = fp16(f32(y[i]) + f32(x[i])*v) for i in [0, n).
-//   n - number of elements
-//   y - fp16 vector updated in place
-//   x - fp16 vector that is scaled by v and added to y
-//   v - f32 scalar multiplier
-inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
-#if defined(GGML_SIMD)
-    // number of leading elements handled by the vector loop
-    // (the mask assumes GGML_F16_STEP is a power of two -- TODO confirm in simd-mappings.h)
-    const int np = (n & ~(GGML_F16_STEP - 1));
-
-    // v replicated into every lane
-    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
-
-    GGML_F16_VEC ax[GGML_F16_ARR];
-    GGML_F16_VEC ay[GGML_F16_ARR];
-
-    for (int i = 0; i < np; i += GGML_F16_STEP) {
-        for (int j = 0; j < GGML_F16_ARR; j++) {
-            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-            // presumably ay + ax*vx, matching the scalar loops below -- macro defined elsewhere
-            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
-
-            // note: the store macro receives the whole ay array plus the index j
-            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
-        }
-    }
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
-    }
-#else
-    // scalar
-    for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
-    }
-#endif
-}
-
-// Accumulates GGML_VEC_MAD_UNROLL scaled rows into y:
-//   y[i] += x[k][i] * v[k][0]   for every k in [0, GGML_VEC_MAD_UNROLL) and i in [0, n)
-// xs and vs are byte strides of x and v
-//   n  - number of elements per row
-//   y  - vector updated in place
-//   xv - base address of the x rows; row k starts at (const char *) xv + k*xs
-//   vv - base address of the v rows; row k starts at (const char *) vv + k*vs,
-//        and only its first element is read
-inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {
-
-    const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
-    const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];
-
-    // resolve the start of each row from the byte strides
-    for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
-        x[i] = (const float *) ((const char *) xv + i*xs);
-        v[i] = (const float *) ((const char *) vv + i*vs);
-    }
-
-#if defined(GGML_SIMD)
-    // number of leading elements handled by the vector loop
-    // (the mask assumes GGML_F32_STEP is a power of two -- TODO confirm in simd-mappings.h)
-    const int np = (n & ~(GGML_F32_STEP - 1));
-
-    GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
-
-    // each row's scalar multiplier replicated into every lane
-    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
-        vx[k] = GGML_F32_VEC_SET1(v[k][0]);
-    }
-
-    GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
-    GGML_F32_VEC ay[GGML_F32_ARR];
-
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            // y is loaded once, updated by every row, then stored once
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-
-            for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
-                ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
-                ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
-            }
-
-            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
-        }
-    }
-
-    // leftovers
-    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
-        for (int i = np; i < n; ++i) {
-            y[i] += x[k][i]*v[k][0];
-        }
-    }
-#else
-    // scalar
-    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
-        for (int i = 0; i < n; ++i) {
-            y[i] += x[k][i]*v[k][0];
-        }
-    }
-#endif
-}
-
-// Scales a vector in place: y[i] *= v for i in [0, n).
-// Three implementations are selected at compile time: Accelerate's vDSP_vsmul when
-// GGML_USE_ACCELERATE is defined, otherwise the generic SIMD macros, otherwise a scalar loop.
-inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) {
-#if defined(GGML_USE_ACCELERATE)
-    // y is both the input and the output, with unit stride
-    vDSP_vsmul(y, 1, &v, y, 1, n);
-#elif defined(GGML_SIMD)
-    // number of leading elements handled by the vector loop
-    // (the mask assumes GGML_F32_STEP is a power of two -- TODO confirm in simd-mappings.h)
-    const int np = (n & ~(GGML_F32_STEP - 1));
-
-    // v replicated into every lane
-    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
-
-    GGML_F32_VEC ay[GGML_F32_ARR];
-
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
-
-            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
-        }
-    }
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        y[i] *= v;
-    }
-#else
-    // scalar
-    for (int i = 0; i < n; ++i) {
-        y[i] *= v;
-    }
-#endif
-}
-
-// Scales an fp16 vector in place: y[i] = fp16(f32(y[i]) * v) for i in [0, n).
-//   n - number of elements
-//   y - fp16 vector updated in place
-//   v - f32 scale factor
-inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
-#if defined(GGML_SIMD)
-    // number of leading elements handled by the vector loop
-    // (the mask assumes GGML_F16_STEP is a power of two -- TODO confirm in simd-mappings.h)
-    const int np = (n & ~(GGML_F16_STEP - 1));
-
-    // v replicated into every lane
-    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
-
-    GGML_F16_VEC ay[GGML_F16_ARR];
-
-    for (int i = 0; i < np; i += GGML_F16_STEP) {
-        for (int j = 0; j < GGML_F16_ARR; j++) {
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-            ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
-
-            // note: the store macro receives the whole ay array plus the index j
-            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
-        }
-    }
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
-    }
-#else
-    // scalar
-    for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
-    }
-#endif
-}
-
-// *s = sqrt(dot(x, x)): the dot product is written to *s first, then replaced by its root
-inline static void ggml_vec_norm_f32(const int n, float * s, const float * x) {
-    ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1);
-    *s = sqrtf(*s);
-}
-// y[k] = x[k]^2
-inline static void ggml_vec_sqr_f32(const int n, float * y, const float * x) {
-    for (int k = 0; k < n; k++) {
-        y[k] = x[k]*x[k];
-    }
-}
-// y[k] = x[k]^2, squared in f32
-inline static void ggml_vec_sqr_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    for (int k = 0; k < n; k++) {
-        const float val = GGML_FP16_TO_FP32(x[k]);
-        y[k] = GGML_FP32_TO_FP16(val*val);
-    }
-}
-// y[k] = sqrtf(x[k])
-inline static void ggml_vec_sqrt_f32(const int n, float * y, const float * x) {
-    for (int k = 0; k < n; k++) {
-        y[k] = sqrtf(x[k]);
-    }
-}
-// y[k] = sqrtf(x[k]), evaluated in f32
-inline static void ggml_vec_sqrt_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    for (int k = 0; k < n; k++) {
-        const float val = GGML_FP16_TO_FP32(x[k]);
-        y[k] = GGML_FP32_TO_FP16(sqrtf(val));
-    }
-}
-// y[k] = logf(x[k])
-inline static void ggml_vec_log_f32(const int n, float * y, const float * x) {
-    for (int k = 0; k < n; k++) {
-        y[k] = logf(x[k]);
-    }
-}
-// y[k] = logf(x[k]), evaluated in f32
-inline static void ggml_vec_log_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    for (int k = 0; k < n; k++) {
-        const float val = GGML_FP16_TO_FP32(x[k]);
-        y[k] = GGML_FP32_TO_FP16(logf(val));
-    }
-}
-// y[k] = sinf(x[k])
-inline static void ggml_vec_sin_f32(const int n, float * y, const float * x) {
-    for (int k = 0; k < n; k++) {
-        y[k] = sinf(x[k]);
-    }
-}
-// y[k] = sinf(x[k]), evaluated in f32
-inline static void ggml_vec_sin_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    for (int k = 0; k < n; k++) {
-        const float val = GGML_FP16_TO_FP32(x[k]);
-        y[k] = GGML_FP32_TO_FP16(sinf(val));
-    }
-}
-// y[k] = cosf(x[k])
-inline static void ggml_vec_cos_f32(const int n, float * y, const float * x) {
-    for (int k = 0; k < n; k++) {
-        y[k] = cosf(x[k]);
-    }
-}
-// y[k] = cosf(x[k]), evaluated in f32
-inline static void ggml_vec_cos_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    for (int k = 0; k < n; k++) {
-        const float val = GGML_FP16_TO_FP32(x[k]);
-        y[k] = GGML_FP32_TO_FP16(cosf(val));
-    }
-}
-// y[k] = |x[k]|
-inline static void ggml_vec_abs_f32(const int n, float * y, const float * x) {
-    for (int k = 0; k < n; k++) {
-        y[k] = fabsf(x[k]);
-    }
-}
-// y[k] = |x[k]|, evaluated in f32
-inline static void ggml_vec_abs_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    for (int k = 0; k < n; k++) {
-        const float val = GGML_FP16_TO_FP32(x[k]);
-        y[k] = GGML_FP32_TO_FP16(fabsf(val));
-    }
-}
-// y[k] = 1 for positive input, -1 for negative input, 0 otherwise
-inline static void ggml_vec_sgn_f32(const int n, float * y, const float * x) {
-    for (int k = 0; k < n; k++) {
-        if (x[k] > 0.f) {
-            y[k] = 1.f;
-        } else if (x[k] < 0.f) {
-            y[k] = -1.f;
-        } else {
-            y[k] = 0.f;
-        }
-    }
-}
-// fp16 variant of the sign function; the comparisons are done in f32
-inline static void ggml_vec_sgn_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    for (int k = 0; k < n; k++) {
-        const float val = GGML_FP16_TO_FP32(x[k]);
-        if (val > 0.f) {
-            y[k] = GGML_FP32_TO_FP16(1.f);
-        } else if (val < 0.f) {
-            y[k] = GGML_FP32_TO_FP16(-1.f);
-        } else {
-            y[k] = GGML_FP32_TO_FP16(0.f);
-        }
-    }
-}
-// y[k] = 1 for positive input, 0 otherwise
-inline static void ggml_vec_step_f32(const int n, float * y, const float * x) {
-    for (int k = 0; k < n; k++) {
-        if (x[k] > 0.f) {
-            y[k] = 1.f;
-        } else {
-            y[k] = 0.f;
-        }
-    }
-}
-// fp16 variant of the step function; the comparison is done in f32
-inline static void ggml_vec_step_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    for (int k = 0; k < n; k++) {
-        const float val = GGML_FP16_TO_FP32(x[k]);
-        if (val > 0.f) {
-            y[k] = GGML_FP32_TO_FP16(1.f);
-        } else {
-            y[k] = GGML_FP32_TO_FP16(0.f);
-        }
-    }
-}
-// y[k] = tanhf(x[k])
-inline static void ggml_vec_tanh_f32(const int n, float * y, const float * x) {
-    for (int k = 0; k < n; k++) {
-        y[k] = tanhf(x[k]);
-    }
-}
-// y[k] = tanhf(x[k]), evaluated in f32
-inline static void ggml_vec_tanh_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    for (int k = 0; k < n; k++) {
-        const float val = GGML_FP16_TO_FP32(x[k]);
-        y[k] = GGML_FP32_TO_FP16(tanhf(val));
-    }
-}
-// ELU: positive inputs pass through, everything else maps to expm1f(x)
-inline static void ggml_vec_elu_f32(const int n, float * y, const float * x) {
-    for (int k = 0; k < n; k++) {
-        if (x[k] > 0.f) {
-            y[k] = x[k];
-        } else {
-            y[k] = expm1f(x[k]);
-        }
-    }
-}
-// ELU on fp16 data, evaluated in f32: y[i] = x[i] when x[i] > 0, otherwise expm1f(x[i]).
-// Positive inputs must pass through unchanged, exactly as ggml_vec_elu_f32 does; applying
-// expm1f unconditionally would return e^x - 1 for them instead of x.
-inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    for (int i = 0; i < n; ++i) {
-        const float v = GGML_FP16_TO_FP32(x[i]);
-        y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v : expm1f(v));
-    }
-}
-// ReLU: positive inputs pass through, everything else becomes 0
-inline static void ggml_vec_relu_f32(const int n, float * y, const float * x) {
-    for (int k = 0; k < n; k++) {
-        if (x[k] > 0.f) {
-            y[k] = x[k];
-        } else {
-            y[k] = 0.f;
-        }
-    }
-}
-// fp16 ReLU; the comparison is done in f32
-inline static void ggml_vec_relu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    for (int k = 0; k < n; k++) {
-        const float val = GGML_FP16_TO_FP32(x[k]);
-        const float out = (val > 0.f) ? val : 0.f;
-        y[k] = GGML_FP32_TO_FP16(out);
-    }
-}
-// leaky ReLU: positive part plus ns times the negative part
-inline static void ggml_vec_leaky_relu_f32(const int n, float * y, const float * x, const float ns) {
-    for (int k = 0; k < n; k++) {
-        const float pos = (x[k] > 0.f) ? x[k] : 0.f;
-        const float neg = (x[k] < 0.0f) ? x[k] : 0.f;
-        y[k] = pos + ns * neg;
-    }
-}
-// fp16 leaky ReLU, evaluated in f32
-inline static void ggml_vec_leaky_relu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
-    for (int k = 0; k < n; k++) {
-        const float val = GGML_FP16_TO_FP32(x[k]);
-        const float pos = (val > 0.f) ? val : 0.f;
-        const float neg = (val < 0.0f) ? val : 0.f;
-        y[k] = GGML_FP32_TO_FP16(pos + ns * neg);
-    }
-}
-// logistic sigmoid: y[k] = 1 / (1 + exp(-x[k]))
-inline static void ggml_vec_sigmoid_f32(const int n, float * y, const float * x) {
-    for (int k = 0; k < n; k++) {
-        const float e = expf(-x[k]);
-        y[k] = 1.f / (1.f + e);
-    }
-}
-// fp16 logistic sigmoid, evaluated in f32
-inline static void ggml_vec_sigmoid_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    for (int k = 0; k < n; k++) {
-        const float e = expf(-GGML_FP16_TO_FP32(x[k]));
-        y[k] = GGML_FP32_TO_FP16(1.f / (1.f + e));
-    }
-}
-// TODO: optimize performance
-// hardswish: x scaled by (x + 3) / 6 clamped to [0, 1]
-inline static void ggml_vec_hardswish_f32(const int n, float * y, const float * x) {
-    for (int k = 0; k < n; k++) {
-        const float ramp = (x[k] + 3.0f) / 6.0f;
-        const float gate = fminf(1.0f, fmaxf(0.0f, ramp));
-        y[k] = x[k] * gate;
-    }
-}
-// fp16 hardswish, evaluated in f32
-inline static void ggml_vec_hardswish_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    for (int k = 0; k < n; k++) {
-        const float val  = GGML_FP16_TO_FP32(x[k]);
-        const float ramp = (val + 3.0f) / 6.0f;
-        const float gate = fminf(1.0f, fmaxf(0.0f, ramp));
-        y[k] = GGML_FP32_TO_FP16(val * gate);
-    }
-}
-// hardsigmoid: (x + 3) / 6 clamped to [0, 1]
-inline static void ggml_vec_hardsigmoid_f32(const int n, float * y, const float * x) {
-    for (int k = 0; k < n; k++) {
-        const float ramp = (x[k] + 3.0f) / 6.0f;
-        y[k] = fminf(1.0f, fmaxf(0.0f, ramp));
-    }
-}
-// fp16 hardsigmoid, evaluated in f32
-inline static void ggml_vec_hardsigmoid_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    for (int k = 0; k < n; k++) {
-        const float ramp = (GGML_FP16_TO_FP32(x[k]) + 3.0f) / 6.0f;
-        y[k] = GGML_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, ramp)));
-    }
-}
-// y[k] = expf(x[k])
-inline static void ggml_vec_exp_f32(const int n, float * y, const float * x) {
-    for (int k = 0; k < n; k++) {
-        y[k] = expf(x[k]);
-    }
-}
-// y[k] = expf(x[k]), evaluated in f32
-inline static void ggml_vec_exp_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    for (int k = 0; k < n; k++) {
-        const float val = GGML_FP16_TO_FP32(x[k]);
-        y[k] = GGML_FP32_TO_FP16(expf(val));
-    }
-}
-
-// coefficients shared by the GELU helpers in this header
-static const float GELU_COEF_A     = 0.044715f;
-static const float GELU_QUICK_COEF = -1.702f;
-static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
-
-// tanh-based GELU of a single value: 0.5*x*(1 + tanh(sqrt(2/pi)*x*(1 + A*x^2)))
-inline static float ggml_gelu_f32(float x) {
-    const float inner = SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x);
-    return 0.5f*x*(1.0f + tanhf(inner));
-}
-
-// fp16 GELU by table lookup: the raw 16-bit pattern of each input indexes ggml_table_gelu_f16
-inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    const uint16_t * bits = (const uint16_t *) x;
-    for (int k = 0; k < n; k++) {
-        const uint16_t slot = bits[k];
-        y[k] = ggml_table_gelu_f16[slot];
-    }
-}
-
-#ifdef GGML_GELU_FP16
-// f32 GELU through the fp16 lookup table.
-// Inputs <= -10 produce 0 and inputs >= 10 are passed through; anything else is rounded to
-// fp16, its bit pattern is used as the table index, and the entry is widened back to f32.
-inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
-    for (int k = 0; k < n; k++) {
-        const float val = x[k];
-        if (val <= -10.0f) {
-            y[k] = 0.0f;
-            continue;
-        }
-        if (val >= 10.0f) {
-            y[k] = val;
-            continue;
-        }
-        uint16_t slot;
-        ggml_fp16_t half = GGML_FP32_TO_FP16(val);
-        memcpy(&slot, &half, sizeof(uint16_t));
-        y[k] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[slot]);
-    }
-}
-#else
-// f32 GELU computed directly, one element at a time
-inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
-    for (int k = 0; k < n; k++) {
-        y[k] = ggml_gelu_f32(x[k]);
-    }
-}
-#endif
-
-// quick GELU of a single value: x * 1/(1 + exp(GELU_QUICK_COEF*x))
-inline static float ggml_gelu_quick_f32(float x) {
-    const float denom = 1.0f+expf(GELU_QUICK_COEF*x);
-    return x*(1.0f/denom);
-}
-
-//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-//    const uint16_t * i16 = (const uint16_t *) x;
-//    for (int i = 0; i < n; ++i) {
-//        y[i] = ggml_table_gelu_quick_f16[i16[i]];
-//    }
-//}
-
-#ifdef GGML_GELU_QUICK_FP16
-// f32 quick GELU through the fp16 lookup table: every input is rounded to fp16, its bit
-// pattern is used as the table index, and the entry is widened back to f32
-inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
-    for (int k = 0; k < n; k++) {
-        uint16_t slot;
-        ggml_fp16_t half = GGML_FP32_TO_FP16(x[k]);
-        memcpy(&slot, &half, sizeof(uint16_t));
-        y[k] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[slot]);
-    }
-}
-#else
-// f32 quick GELU computed directly, one element at a time
-inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
-    for (int k = 0; k < n; k++) {
-        y[k] = ggml_gelu_quick_f32(x[k]);
-    }
-}
-#endif
-
-// fp16 quick GELU computed directly in f32 (no table lookup)
-inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    for (int k = 0; k < n; k++) {
-        const float val   = GGML_FP16_TO_FP32(x[k]);
-        const float denom = 1.0f+expf(GELU_QUICK_COEF*val);
-        y[k] = GGML_FP32_TO_FP16(val*(1.0f/denom));
-    }
-}
-
-// Sigmoid Linear Unit (SiLU) of a single value: x / (1 + exp(-x))
-inline static float ggml_silu_f32(float x) {
-    const float denom = 1.0f + expf(-x);
-    return x/denom;
-}
-// SiLU of a single fp16 value, evaluated in f32 and rounded back to fp16
-inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
-    const float val   = GGML_FP16_TO_FP32(x);
-    const float denom = 1.0f + expf(-val);
-    return GGML_FP32_TO_FP16(val/denom);
-}
-
-#if __FINITE_MATH_ONLY__
-#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
-#error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
-#endif
-
-#if defined(__ARM_NEON) && defined(__aarch64__)
-
-// adapted from arm limited optimized routine
-// the maximum error is 1.45358 plus 0.5 ulps
-// numbers above 88.38 will flush to infinity
-// numbers beneath -103.97 will flush to zero
-//
-// NEON variant: evaluates expf on four float lanes at once.
-// Fast path (no lane with |n| > 126): returns k + j*k.
-// Slow path: results are assembled per lane from the scale factors s1/s2,
-// and lanes with |n| > 192 are replaced by s1*s1.
-inline static float32x4_t ggml_v_expf(float32x4_t x) {
-    // r is added and then subtracted again so that n ends up as z - r
-    const float32x4_t r = vdupq_n_f32(0x1.8p23f);
-    // z = r + x * 0x1.715476p+0 (the constant is ~1.4427, i.e. log2(e))
-    const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
-    const float32x4_t n = vsubq_f32(z, r);
-    // b = x - n*0x1.62e4p-1 - n*0x1.7f7d1cp-20 (the two constants sum to ~ln 2)
-    const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
-                                    vdupq_n_f32(0x1.7f7d1cp-20f));
-    // e: bit pattern of z shifted into the float exponent position
-    const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
-    // k: e added to the bit pattern of 1.0f, reinterpreted as float (presumably 2^n)
-    const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
-    // c: lanes where |n| > 126
-    const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
-    const float32x4_t u = vmulq_f32(b, b);
-    // j: polynomial in b (with u = b*b) built from the fitted coefficients
-    const float32x4_t j = vfmaq_f32(
-        vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
-        vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
-                  vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
-    // fast path: no lane is flagged in c
-    if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
-        return vfmaq_f32(k, j, k);
-    // d: 0x82000000 in lanes where n <= 0, otherwise 0
-    const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
-    const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
-    const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
-    // select per lane: |n| > 192 -> s1*s1; flagged in c -> (s2 + s2*j)*s1; else k + k*j
-    return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
-                     vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
-}
-
-// SiLU on four float lanes (NEON): x / (1 + exp(-x)), with exp from ggml_v_expf
-inline static float32x4_t ggml_v_silu(float32x4_t x) {
-    const float32x4_t minus_x = vsubq_f32(vdupq_n_f32(0.0f), x);
-    const float32x4_t denom   = vaddq_f32(vdupq_n_f32(1.0f), ggml_v_expf(minus_x));
-    return vdivq_f32(x, denom);
-}
-
-#elif defined(__AVX512F__) && defined(__AVX512DQ__)
-
-// adapted from arm limited optimized routine
-// the maximum error is 1.45358 plus 0.5 ulps
-// numbers above 88.38 will flush to infinity
-// numbers beneath -103.97 will flush to zero
-//
-// AVX-512 variant: evaluates expf on sixteen float lanes at once.
-// The polynomial result j is scaled with _mm512_scalef_ps(j, n); lanes with
-// |n| > 192 are then overwritten with 0 (n <= 0) or INFINITY (n > 0).
-inline static __m512 ggml_v_expf(__m512 x) {
-  // r is added and then subtracted again so that n ends up as z - r
-  const __m512 r = _mm512_set1_ps(0x1.8p23f);
-  // z = x * 0x1.715476p+0 + r (the constant is ~1.4427, i.e. log2(e))
-  const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
-  const __m512 n = _mm512_sub_ps(z, r);
-  // b = x - n*0x1.62e4p-1 - n*0x1.7f7d1cp-20 (the two constants sum to ~ln 2)
-  const __m512 b =
-      _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
-                       _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
-  // d: mask of lanes where |n| > 192
-  const __mmask16 d =
-      _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
-  const __m512 u = _mm512_mul_ps(b, b);
-  // j: polynomial in b (with u = b*b); unlike the other variants it includes the constant term 1.0
-  const __m512 j = _mm512_fmadd_ps(
-      _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
-                                      _mm512_set1_ps(0x1.573e2ep-5f)),
-                      u,
-                      _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
-                                      _mm512_set1_ps(0x1.fffdb6p-2f))),
-      u,
-      _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
-  const __m512 res = _mm512_scalef_ps(j, n);
-  // fast path: no lane is flagged in d
-  if (_mm512_kortestz(d, d))
-    return res;
-  const __m512 zero = _mm512_setzero_ps();
-  // alt: zero where n <= 0, INFINITY otherwise
-  const __m512 alt = _mm512_mask_blend_ps(
-      _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
-  // take alt in the flagged lanes and res everywhere else
-  return _mm512_mask_blend_ps(d, res, alt);
-}
-
-// SiLU on sixteen float lanes (AVX-512): x / (1 + exp(-x)), with exp from ggml_v_expf
-inline static __m512 ggml_v_silu(__m512 x) {
-    const __m512 minus_x = _mm512_sub_ps(_mm512_setzero_ps(), x);
-    const __m512 denom   = _mm512_add_ps(_mm512_set1_ps(1), ggml_v_expf(minus_x));
-    return _mm512_div_ps(x, denom);
-}
-
-#elif defined(__AVX2__) && defined(__FMA__)
-
-// adapted from arm limited optimized routine
-// the maximum error is 1.45358 plus 0.5 ulps
-// numbers above 88.38 will flush to infinity
-// numbers beneath -103.97 will flush to zero
-//
-// AVX2+FMA variant: evaluates expf on eight float lanes at once.
-// Fast path (no lane with |n| > 126): returns j*k + k.
-// Slow path: results are blended per lane from the scale factors s1/s2,
-// and lanes with |n| > 192 are replaced by s1*s1.
-inline static __m256 ggml_v_expf(__m256 x) {
-  // r is added and then subtracted again so that n ends up as z - r
-  const __m256 r = _mm256_set1_ps(0x1.8p23f);
-  // z = x * 0x1.715476p+0 + r (the constant is ~1.4427, i.e. log2(e))
-  const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
-  const __m256 n = _mm256_sub_ps(z, r);
-  // b = x - n*0x1.62e4p-1 - n*0x1.7f7d1cp-20 (the two constants sum to ~ln 2)
-  const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
-                                    _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
-  // e: bit pattern of z shifted into the float exponent position
-  const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
-  // k: e added to the bit pattern of 1.0f, reinterpreted as float (presumably 2^n)
-  const __m256 k = _mm256_castsi256_ps(
-      _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
-  // c: lanes where |n| > 126 (andnot with -0.f clears the sign bit)
-  const __m256i c = _mm256_castps_si256(
-      _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
-                    _mm256_set1_ps(126), _CMP_GT_OQ));
-  const __m256 u = _mm256_mul_ps(b, b);
-  // j: polynomial in b (with u = b*b) built from the fitted coefficients
-  const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
-                                                                   _mm256_set1_ps(0x1.573e2ep-5f)), u,
-                                                   _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
-                                                                   _mm256_set1_ps(0x1.fffdb6p-2f))),
-                                   u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
-  // fast path: no lane is flagged in c
-  if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
-    return _mm256_fmadd_ps(j, k, k);
-  // g: 0x82000000 in lanes where n <= 0, otherwise 0
-  const __m256i g = _mm256_and_si256(
-      _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
-      _mm256_set1_epi32(0x82000000u));
-  const __m256 s1 =
-      _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
-  const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
-  // d: lanes where |n| > 192
-  const __m256i d = _mm256_castps_si256(
-      _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
-                    _mm256_set1_ps(192), _CMP_GT_OQ));
-  // select per lane: d -> s1*s1; c -> (s2*j + s2)*s1; else k*j + k
-  return _mm256_or_ps(
-      _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
-      _mm256_andnot_ps(
-          _mm256_castsi256_ps(d),
-          _mm256_or_ps(
-              _mm256_and_ps(_mm256_castsi256_ps(c),
-                            _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
-              _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
-}
-
-// SiLU on eight float lanes (AVX2): x / (1 + exp(-x)), with exp from ggml_v_expf
-inline static __m256 ggml_v_silu(__m256 x) {
-    const __m256 minus_x = _mm256_sub_ps(_mm256_setzero_ps(), x);
-    const __m256 denom   = _mm256_add_ps(_mm256_set1_ps(1), ggml_v_expf(minus_x));
-    return _mm256_div_ps(x, denom);
-}
-
-#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
-
-#if defined(__FMA__)
-#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
-#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
-#else
-#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
-#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
-#endif
-
-// adapted from arm limited optimized routine
-// the maximum error is 1.45358 plus 0.5 ulps
-// numbers above 88.38 will flush to infinity
-// numbers beneath -103.97 will flush to zero
-//
-// SSE2 variant: evaluates expf on four float lanes at once. MADD128/NMADD128
-// expand to fused instructions when __FMA__ is defined and to separate
-// multiply/add (or multiply/subtract) otherwise.
-// Fast path (no lane with |n| > 126): returns j*k + k.
-// Slow path: results are blended per lane from the scale factors s1/s2,
-// and lanes with |n| > 192 are replaced by s1*s1.
-inline static __m128 ggml_v_expf(__m128 x) {
-    // r is added and then subtracted again so that n ends up as z - r
-    const __m128 r = _mm_set1_ps(0x1.8p23f);
-    // z = x * 0x1.715476p+0 + r (the constant is ~1.4427, i.e. log2(e))
-    const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
-    const __m128 n = _mm_sub_ps(z, r);
-    // b = x - n*0x1.62e4p-1 - n*0x1.7f7d1cp-20 (the two constants sum to ~ln 2)
-    const __m128 b =
-        NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
-    // e: bit pattern of z shifted into the float exponent position
-    const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
-    // k: e added to the bit pattern of 1.0f, reinterpreted as float (presumably 2^n)
-    const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
-    // c: lanes where |n| > 126 (andnot with -0.f clears the sign bit)
-    const __m128i c =
-        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
-    const __m128 u = _mm_mul_ps(b, b);
-    // j: polynomial in b (with u = b*b) built from the fitted coefficients
-    const __m128 j =
-        MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
-                        MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
-                u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
-    // fast path: no lane is flagged in c
-    if (!_mm_movemask_epi8(c))
-        return MADD128(j, k, k);
-    // g: 0x82000000 in lanes where n <= 0, otherwise 0
-    const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
-                                    _mm_set1_epi32(0x82000000u));
-    const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
-    const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
-    // d: lanes where |n| > 192
-    const __m128i d =
-        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
-    // select per lane: d -> s1*s1; c -> (s2*j + s2)*s1; else k*j + k
-    return _mm_or_ps(
-        _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
-        _mm_andnot_ps(_mm_castsi128_ps(d),
-                      _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
-                                _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
-}
-
-// SiLU on four float lanes (SSE2): x / (1 + exp(-x)), with exp from ggml_v_expf
-inline static __m128 ggml_v_silu(__m128 x) {
-    const __m128 minus_x = _mm_sub_ps(_mm_setzero_ps(), x);
-    const __m128 denom   = _mm_add_ps(_mm_set1_ps(1), ggml_v_expf(minus_x));
-    return _mm_div_ps(x, denom);
-}
-
-#endif // __ARM_NEON / __AVX2__ / __SSE2__
-
-// Applies ggml_silu_f16 to each of the n fp16 elements of x, storing into y.
-inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    int k = 0;
-    while (k < n) {
-        y[k] = ggml_silu_f16(x[k]);
-        ++k;
-    }
-}
-
-// SiLU gradient: dy * s * (1 + x * (1 - s)), where s = 1 / (1 + exp(-x)).
-inline static float ggml_silu_backward_f32(float x, float dy) {
-    const float sig   = 1.0f/(1.0f + expf(-x));
-    const float scale = 1.0f + x*(1.0f - sig);
-    return dy*sig*scale;
-}
-
-// fp16 SiLU gradient: both inputs are widened to fp32, the product
-// dy * s * (1 + v * (1 - s)) is formed, and the result is narrowed to fp16.
-inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
-    const float v     = GGML_FP16_TO_FP32(x);
-    const float sig   = 1.0f/(1.0f + expf(-v));
-    const float scale = 1.0f + v*(1.0f - sig);
-    return GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(dy)*sig*scale);
-}
-
-// Element-wise SiLU gradient: dx[k] = ggml_silu_backward_f32(x[k], dy[k]).
-inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
-    int k = 0;
-    while (k < n) {
-        dx[k] = ggml_silu_backward_f32(x[k], dy[k]);
-        ++k;
-    }
-}
-
-// Element-wise fp16 SiLU gradient: dx[k] = ggml_silu_backward_f16(x[k], dy[k]).
-inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, const ggml_fp16_t * x, const ggml_fp16_t * dy) {
-    int k = 0;
-    while (k < n) {
-        dx[k] = ggml_silu_backward_f16(x[k], dy[k]);
-        ++k;
-    }
-}
-
-// Sums the n floats of x into *s. With GGML_USE_ACCELERATE the work is handed
-// to vDSP_sve; otherwise the sum is accumulated in ggml_float and narrowed
-// to float on store.
-inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
-#ifdef GGML_USE_ACCELERATE
-    vDSP_sve(x, 1, s, n);
-#else
-    ggml_float acc = 0.0;
-    for (int k = 0; k < n; ++k) {
-        acc += (ggml_float)x[k];
-    }
-    *s = (float)acc;
-#endif
-}
-
-// Sums the n floats of x, accumulating and returning the total as ggml_float.
-inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
-    ggml_float acc = 0.0;
-    int k = 0;
-    while (k < n) {
-        acc += (ggml_float)x[k];
-        ++k;
-    }
-    *s = acc;
-}
-
-// Sums n fp16 values, widening each to fp32 and accumulating in a float.
-inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
-    float acc = 0.0f;
-    int k = 0;
-    while (k < n) {
-        acc += GGML_FP16_TO_FP32(x[k]);
-        ++k;
-    }
-    *s = acc;
-}
-
-// Sums n bf16 values, widening each to fp32 and accumulating in a float.
-inline static void ggml_vec_sum_bf16_ggf(const int n, float * s, const ggml_bf16_t * x) {
-    float acc = 0.0f;
-    int k = 0;
-    while (k < n) {
-        acc += GGML_BF16_TO_FP32(x[k]);
-        ++k;
-    }
-    *s = acc;
-}
-
-// Stores the maximum of the n floats of x into *s. With GGML_USE_ACCELERATE
-// the work is handed to vDSP_maxv; otherwise a running MAX starting from
-// -INFINITY is used (so an empty input yields -INFINITY on that path).
-inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
-#ifdef GGML_USE_ACCELERATE
-    vDSP_maxv(x, 1, s, n);
-#else
-    float best = -INFINITY;
-    for (int k = 0; k < n; ++k) {
-        best = MAX(best, x[k]);
-    }
-    *s = best;
-#endif
-}
-
-// Writes the reciprocal of the value produced by ggml_vec_norm_f32 into *s.
-inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) {
-    ggml_vec_norm_f32(n, s, x);
-    const float norm = *s;
-    *s = 1.f/norm;
-}
-
-// Stores into *s the index of the largest of the n floats in x.
-// The index is updated whenever the running maximum compares equal to the
-// current element, so among equal maxima the last index is reported.
-// For n <= 0 the loop does not run and *s is set to 0.
-inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
-    float max = -INFINITY;
-    int idx = 0;
-    for (int i = 0; i < n; ++i) {
-        max = MAX(max, x[i]);
-        // true when x[i] is the new (or an equal) maximum
-        if (max == x[i]) { idx = i; }
-    }
-    *s = idx;
-}
-
-#ifdef __cplusplus
-}
-#endif
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@ -1,165 +0,0 @@
-# Build script for the ggml-cuda backend library.
-# Locates the CUDA Toolkit, picks default GPU architectures when the user has
-# not set CMAKE_CUDA_ARCHITECTURES, collects the CUDA sources, creates the
-# backend target and configures its definitions, link libraries and nvcc flags.
-# Configuration stops with a fatal error when the toolkit is not found.
-cmake_minimum_required(VERSION 3.18)  # for CMAKE_CUDA_ARCHITECTURES
-
-find_package(CUDAToolkit)
-
-if (CUDAToolkit_FOUND)
-    message(STATUS "CUDA Toolkit found")
-
-    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-        # native == GPUs available at build time
-        # 50     == Maxwell, lowest CUDA 12 standard
-        # 60     == P100, FP16 CUDA intrinsics
-        # 61     == Pascal, __dp4a instruction (per-byte integer dot product)
-        # 70     == V100, FP16 tensor cores
-        # 75     == Turing, int8 tensor cores
-        if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
-            set(CMAKE_CUDA_ARCHITECTURES "native")
-        elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
-            set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75;80")
-        else()
-            set(CMAKE_CUDA_ARCHITECTURES "50;61;70;75;80")
-        endif()
-    endif()
-    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
-
-    enable_language(CUDA)
-
-    file(GLOB   GGML_HEADERS_CUDA "*.cuh")
-    list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h")
-
-    # the mma flash-attention and mmq template instances are always built
-    file(GLOB   GGML_SOURCES_CUDA "*.cu")
-    file(GLOB   SRCS "template-instances/fattn-mma*.cu")
-    list(APPEND GGML_SOURCES_CUDA ${SRCS})
-    file(GLOB   SRCS "template-instances/mmq*.cu")
-    list(APPEND GGML_SOURCES_CUDA ${SRCS})
-
-    # the fattn-vec instances are built either for every type pair or only
-    # for the q4_0/q4_0, q8_0/q8_0 and f16/f16 pairs
-    if (GGML_CUDA_FA_ALL_QUANTS)
-        file(GLOB   SRCS "template-instances/fattn-vec*.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})
-        add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
-    else()
-        file(GLOB   SRCS "template-instances/fattn-vec*q4_0-q4_0.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})
-        file(GLOB   SRCS "template-instances/fattn-vec*q8_0-q8_0.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})
-        file(GLOB   SRCS "template-instances/fattn-vec*f16-f16.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})
-    endif()
-
-    ggml_add_backend_library(ggml-cuda
-                             ${GGML_HEADERS_CUDA}
-                             ${GGML_SOURCES_CUDA}
-                            )
-
-    add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
-
-    if (GGML_CUDA_GRAPHS)
-        add_compile_definitions(GGML_CUDA_USE_GRAPHS)
-    endif()
-
-    if (GGML_CUDA_FORCE_MMQ)
-        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
-    endif()
-
-    if (GGML_CUDA_FORCE_CUBLAS)
-        add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
-    endif()
-
-    if (GGML_CUDA_NO_VMM)
-        add_compile_definitions(GGML_CUDA_NO_VMM)
-    endif()
-
-    if (NOT GGML_CUDA_FA)
-        add_compile_definitions(GGML_CUDA_NO_FA)
-    endif()
-
-    if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
-        add_compile_definitions(GGML_CUDA_F16)
-    endif()
-
-    if (GGML_CUDA_NO_PEER_COPY)
-        add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
-    endif()
-
-    if (GGML_STATIC)
-        if (WIN32)
-            # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
-            target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
-        else ()
-            target_link_libraries(ggml-cuda PRIVATE  CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
-        endif()
-    else()
-        target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt)
-    endif()
-
-    # When no VMM is requested there is no need to link directly with the
-    # cuda driver lib (libcuda.so)
-    if (NOT GGML_CUDA_NO_VMM)
-        target_link_libraries(ggml-cuda PRIVATE CUDA::cuda_driver)
-    endif()
-
-    set(CUDA_CXX_FLAGS "")
-
-    set(CUDA_FLAGS -use_fast_math)
-
-    if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
-        # Options are:
-        # - none (not recommended)
-        # - speed (nvcc's default)
-        # - balance
-        # - size
-        list(APPEND CUDA_FLAGS -compress-mode=${GGML_CUDA_COMPRESSION_MODE})
-    endif()
-
-    if (GGML_FATAL_WARNINGS)
-        list(APPEND CUDA_FLAGS -Werror all-warnings)
-    endif()
-
-    if (GGML_ALL_WARNINGS AND NOT MSVC)
-        # Ask nvcc's host compiler for its identity and version so that the
-        # matching warning flags can be selected.
-        set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
-        if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
-            list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
-        endif()
-
-        execute_process(
-            COMMAND ${NVCC_CMD} -Xcompiler --version
-            OUTPUT_VARIABLE CUDA_CCFULLVER
-            ERROR_QUIET
-        )
-
-        if (NOT CUDA_CCFULLVER MATCHES clang)
-            set(CUDA_CCID "GNU")
-            # strip the trailing newline so the version is a clean token
-            execute_process(
-                COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
-                OUTPUT_VARIABLE CUDA_CCVER
-                OUTPUT_STRIP_TRAILING_WHITESPACE
-                ERROR_QUIET
-            )
-        else()
-            if (CUDA_CCFULLVER MATCHES Apple)
-                set(CUDA_CCID "AppleClang")
-            else()
-                set(CUDA_CCID "Clang")
-            endif()
-            # quoted so the compiler banner is passed as a single input string
-            string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER "${CUDA_CCFULLVER}")
-        endif()
-
-        message(STATUS "CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
-
-        ggml_get_flags(${CUDA_CCID} ${CUDA_CCVER})
-        list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS})  # This is passed to -Xcompiler later
-    endif()
-
-    if (NOT MSVC)
-        list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
-    endif()
-
-    list(JOIN   CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED)  # pass host compiler flags as a single argument
-
-    if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
-        list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
-    endif()
-
-    # apply the collected flags to CUDA translation units only
-    target_compile_options(ggml-cuda PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
-else()
-    message(FATAL_ERROR "CUDA Toolkit not found")
-endif()
--- a/ggml/src/ggml-cuda/acc.cu
+++ b/ggml/src/ggml-cuda/acc.cu
@ -1,47 +0,0 @@
-#include "acc.cuh"
-
-// Kernel: dst[i] = x[i] + y[...] where element i falls inside the region
-// covered by y, and dst[i] = x[i] elsewhere. One thread handles one of the
-// ne destination elements.
-//   ne10, ne11, ne12 : extents of y used for the bounds check and y indexing
-//   nb1, nb2         : divisors used to split the shifted index into (ox, oy, oz);
-//                      the caller passes them in float elements
-//   offset           : subtracted from i before the split
-static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
-    const int ne10, const int ne11, const int ne12,
-    const int nb1, const int nb2, int offset) {
-    const int i = blockDim.x * blockIdx.x + threadIdx.x;
-    if (i >= ne) {
-        return;
-    }
-    // position of this element relative to the start of the accumulated region
-    int src1_idx = i - offset;
-    int oz = src1_idx / nb2;
-    int oy = (src1_idx - (oz * nb2)) / nb1;
-    int ox = src1_idx % nb1;
-    // a negative src1_idx means the element lies before the region
-    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
-        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
-    } else {
-        dst[i] = x[i];
-    }
-}
-
-// Launches acc_f32 on `stream` with one thread per destination element; the
-// grid size is n_elements rounded up to whole blocks of CUDA_ACC_BLOCK_SIZE.
-static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
-    const int ne10, const int ne11, const int ne12,
-    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
-    const int grid_size = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
-    acc_f32<<<grid_size, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(
-        x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
-}
-
-// Host entry point for the acc operator: reads the two F32 sources and the
-// F32 destination from dst, converts the values stored in op_params
-// to float-element units and launches the kernel on the context's stream.
-// Only tensors with dst->ne[3] == 1 are accepted.
-void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-    const float * src0_d = (const float *)src0->data;
-    const float * src1_d = (const float *)src1->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
-
-    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
-    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
-    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
-    int offset = dst->op_params[3] / 4; // op_params[3] divided by 4 (bytes per float32), i.e. offset in float elements
-
-    acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
-}
--- a/ggml/src/ggml-cuda/acc.cuh
+++ b/ggml/src/ggml-cuda/acc.cuh
@ -1,5 +0,0 @@
-#include "common.cuh"
-
-#define CUDA_ACC_BLOCK_SIZE 256
-
-void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-cuda/arange.cu
+++ b/ggml/src/ggml-cuda/arange.cu
@ -1,34 +0,0 @@
-#include "arange.cuh"
-
-// Kernel: dst[idx] = start + step * idx for every idx in [0, ne0).
-static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
-    // global thread index along x; threads past the end have nothing to do
-    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (idx < ne0) {
-        dst[idx] = start + step * idx;
-    }
-}
-
-// Launches arange_f32 on `stream`, rounding ne0 up to whole blocks of
-// CUDA_ARANGE_BLOCK_SIZE threads.
-static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
-    const int grid_size = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
-    arange_f32<<<grid_size, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
-}
-
-// Host entry point for the arange operator: reads start, stop and step from
-// the first three op_params slots, checks that the element count of dst
-// matches ceil((stop - start) / step) and launches the fill kernel.
-void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    // op_params holds the three floats back to back: start, stop, step
-    float range[3];
-    memcpy(range, dst->op_params, sizeof(range));
-    const float start = range[0];
-    const float stop  = range[1];
-    const float step  = range[2];
-
-    const int64_t steps = (int64_t)ceil((stop - start) / step);
-    GGML_ASSERT(ggml_nelements(dst) == steps);
-
-    arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
-}
--- a/ggml/src/ggml-cuda/arange.cuh
+++ b/ggml/src/ggml-cuda/arange.cuh
@ -1,5 +0,0 @@
-#include "common.cuh"
-
-#define CUDA_ARANGE_BLOCK_SIZE 256
-
-void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-cuda/argmax.cu
+++ b/ggml/src/ggml-cuda/argmax.cu
@ -1,91 +0,0 @@
-#include <algorithm>
-#include <cstdint>
-
-#include "argmax.cuh"
-#include "common.cuh"
-#include "sum.cuh"
-
-// Kernel: writes to dst[row] the column index of the largest value in each
-// row of x. Each block handles the row given by blockIdx.x; its threads first
-// scan the row in steps of blockDim.x, then the per-thread results are
-// combined within each warp and, when the block has more than one warp,
-// across warps through shared memory.
-// Because candidates must compare strictly greater than -FLT_MAX, a row with
-// no such value leaves argmax at its initial -1.
-static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __restrict__ dst, const int64_t ncols) {
-    const int64_t row = blockIdx.x;
-
-    float maxval = -FLT_MAX;
-    int   argmax = -1;
-    const float * rowx = x + row * ncols;
-
-    // per-thread scan over this thread's share of the columns
-    for (int32_t col = threadIdx.x; col < ncols; col += blockDim.x) {
-        const float val = rowx[col];
-        if (val > maxval) {
-            maxval = val;
-            argmax = col;
-        }
-    }
-
-    // combine the per-thread results within the warp via xor shuffles
-#pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE);
-        const int   col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE);
-        if (val > maxval) {
-            maxval = val;
-            argmax = col;
-        }
-    }
-
-    const int n_warps = blockDim.x / WARP_SIZE;
-    const int lane_id = threadIdx.x % WARP_SIZE;
-    const int warp_id = threadIdx.x / WARP_SIZE;
-    if (n_warps > 1) {
-        // sized for the largest block the host launcher uses (1024 threads)
-        constexpr int    max_warps = 1024 / WARP_SIZE;
-        __shared__ float shared_maxval[max_warps];
-        __shared__ int   shared_argmax[max_warps];
-        // lane 0 of every warp publishes that warp's result
-        if (lane_id == 0) {
-            shared_maxval[warp_id] = maxval;
-            shared_argmax[warp_id] = argmax;
-        }
-
-        __syncthreads();
-
-        // warp 0 loads the per-warp results and combines them the same way
-        if (warp_id == 0) {
-            if (lane_id < n_warps) {
-                maxval = shared_maxval[lane_id];
-                argmax = shared_argmax[lane_id];
-            }
-#pragma unroll
-            for (int offset = 16; offset > 0; offset >>= 1) {
-                const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE);
-                const int   col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE);
-                if (val > maxval) {
-                    maxval = val;
-                    argmax = col;
-                }
-            }
-        }
-    }
-
-    // a single thread per block stores the row's result
-    if (warp_id == 0 && lane_id == 0) {
-        dst[row] = argmax;
-    }
-}
-
-// Host entry point for the argmax operator: requires a contiguous F32 source
-// and an I32 destination, then launches argmax_f32 with one block per row.
-void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_I32);
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    const int64_t n_cols = src0->ne[0];
-    const int64_t n_rows = ggml_nrows(src0);
-
-    const float * in_d  = (const float *) src0->data;
-    int32_t     * out_d = (int32_t     *) dst->data;
-
-    cudaStream_t stream = ctx.stream();
-
-    // round the column count up to whole warps, capped at 1024 threads per block
-    const int64_t padded_cols = (n_cols + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE;
-    const int64_t n_threads   = std::min<int64_t>(1024, padded_cols);
-
-    const dim3 block_dims(n_threads, 1, 1);
-    const dim3 grid_dims(n_rows, 1, 1);
-
-    argmax_f32<<<grid_dims, block_dims, 0, stream>>>(in_d, out_d, n_cols);
-}
--- a/ggml/src/ggml-cuda/argmax.cuh
+++ b/ggml/src/ggml-cuda/argmax.cuh
@ -1,3 +0,0 @@
-#include "common.cuh"
-
-void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-cuda/argsort.cu
+++ b/ggml/src/ggml-cuda/argsort.cu
@ -1,104 +0,0 @@
-#include "argsort.cuh"
-
-// Exchanges the values of a and b on the device.
-template<typename T>
-static inline __device__ void ggml_cuda_swap(T & a, T & b) {
-    const T saved = a;
-    a = b;
-    b = saved;
-}
-
-// Kernel: sorts the column indices of each row of x by value using a bitonic
-// sorting network and writes the sorted indices to dst.
-// One block handles the row given by blockIdx.y, with one thread per padded
-// column; ncols_pad is the power-of-two padded width supplied by the host.
-// Indices >= ncols are padding and are ordered after all real columns.
-// The index array lives in dynamically sized shared memory (ncols_pad ints).
-template<ggml_sort_order order>
-static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
-    // bitonic sort
-    int col = threadIdx.x;
-    int row = blockIdx.y;
-
-    if (col >= ncols_pad) {
-        return;
-    }
-
-    const float * x_row = x + row * ncols;
-    extern __shared__ int dst_row[];
-
-    // initialize indices
-    dst_row[col] = col;
-
-    __syncthreads();
-
-    // k: size of the sequences being merged; j: compare distance within a merge step
-    for (int k = 2; k <= ncols_pad; k *= 2) {
-        for (int j = k / 2; j > 0; j /= 2) {
-            int ixj = col ^ j;
-            // only the lower-indexed thread of each pair performs the compare/swap
-            if (ixj > col) {
-                if ((col & k) == 0) {
-                    // swap when col holds padding, or when the pair is out of order for `order`
-                    if (dst_row[col] >= ncols ||
-                        (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
-                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
-                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
-                    ) {
-                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
-                    }
-                } else {
-                    // opposite direction: swap when ixj holds padding, or with the comparison reversed
-                    if (dst_row[ixj] >= ncols ||
-                        (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
-                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
-                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
-                    ) {
-                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
-                    }
-                }
-            }
-            // every thread must reach this barrier before the next step reads dst_row
-            __syncthreads();
-        }
-    }
-
-    // copy the result to dst without the padding
-    if (col < ncols) {
-        dst[row * ncols + col] = dst_row[col];
-    }
-}
-
-// Returns the smallest power of two that is >= x (1 for any x <= 1).
-static int next_power_of_2(int x) {
-    int pow2 = 1;
-    for (; pow2 < x; pow2 *= 2) {
-        // keep doubling until x is covered
-    }
-    return pow2;
-}
-
-// Launches the bitonic argsort kernel with one block per row and one thread
-// per padded column. The padded width is the next power of two >= ncols, and
-// the kernel needs ncols_pad ints of dynamic shared memory, which must fit in
-// the current device's smpb limit.
-static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
-    // bitonic sort requires ncols to be power of 2
-    const int ncols_pad = next_power_of_2(ncols);
-
-    const size_t shared_mem = ncols_pad * sizeof(int);
-
-    // FIXME: this limit could be raised by ~2-4x on Ampere or newer
-    GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
-
-    const dim3 block_dims(ncols_pad, 1, 1);
-    const dim3 block_nums(1, nrows, 1);
-
-    switch (order) {
-        case GGML_SORT_ORDER_ASC:
-            k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
-            break;
-        case GGML_SORT_ORDER_DESC:
-            k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
-            break;
-        default:
-            GGML_ABORT("fatal error");
-    }
-}
-
-// Host entry point for the argsort operator: requires a contiguous F32 source
-// and an I32 destination, reads the sort order from op_params[0] and sorts
-// every row of the source independently.
-void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    // dst holds I32 indices (asserted below), so type the pointer accordingly
-    // instead of declaring it float * and re-casting at the call site
-    int * dst_d = (int *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    const int64_t ncols = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
-
-    argsort_f32_i32_cuda(src0_d, dst_d, ncols, nrows, order, stream);
-}
--- a/ggml/src/ggml-cuda/argsort.cuh
+++ b/ggml/src/ggml-cuda/argsort.cuh
@ -1,3 +0,0 @@
-#include "common.cuh"
-
-void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-cuda/binbcast.cu
+++ b/ggml/src/ggml-cuda/binbcast.cu
@ -1,363 +0,0 @@
-#include "binbcast.cuh"
-#include <cstdint>
-
-static __device__ __forceinline__ float op_repeat(const float a, const float b) {
-    return b;
-    GGML_UNUSED(a);
-}
-
-static __device__ __forceinline__ float op_add(const float a, const float b) {
-    return a + b;
-}
-
-static __device__ __forceinline__ float op_sub(const float a, const float b) {
-    return a - b;
-}
-
-static __device__ __forceinline__ float op_mul(const float a, const float b) {
-    return a * b;
-}
-
-static __device__ __forceinline__ float op_div(const float a, const float b) {
-    return a / b;
-}
-
-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
-        int ne0, int ne1, int ne2, int ne3,
-        int ne10, int ne11, int ne12, int ne13,
-        /*int s0, */ int s1,  int s2,  int s3,
-        /*int s00,*/ int s01, int s02, int s03,
-        /*int s10,*/ int s11, int s12, int s13) {
-    const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
-    const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
-    const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
-    const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
-
-    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
-        const int i10 = i0 % ne10;
-        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-    }
-}
-
-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
-        int ne0, int ne1, int ne2, int ne3,
-        int ne10, int ne11, int ne12, int ne13,
-        /*int s0, */ int s1,  int s2,  int s3,
-        /*int s00,*/ int s01, int s02, int s03,
-        /*int s10,*/ int s11, int s12, int s13) {
-
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    const int i3 = i/(ne2*ne1*ne0);
-    const int i2 = (i/(ne1*ne0)) % ne2;
-    const int i1 = (i/ne0) % ne1;
-    const int i0 = i % ne0;
-
-    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    const int i10 = i0 % ne10;
-    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-}
-
-template <typename T>
-static __global__ void k_repeat_back(
-    const T * __restrict__ src, T * __restrict__ dst, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
-    const size_t s00, const size_t s01, const size_t s02, const size_t s03,
-    const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3) {
-
-    const int64_t tid0  = int64_t(blockIdx.x)*blockDim.x + threadIdx.x;
-    const int64_t tid1  = int64_t(blockIdx.y)*blockDim.y + threadIdx.y;
-    const int64_t tid23 = int64_t(blockIdx.z)*blockDim.z + threadIdx.z;
-    const int64_t tid2  = tid23 % ne2;
-    const int64_t tid3  = tid23 / ne2;
-
-    if (tid0 >= ne0) {
-        return;
-    }
-
-    T sum = 0;
-    for (int64_t i3 = tid3; i3 < ne03; i3 += ne3) {
-        for (int64_t i2 = tid2; i2 < ne02; i2 += ne2) {
-            for (int64_t i1 = tid1; i1 < ne01; i1 += ne1) {
-                for (int64_t i0 = tid0; i0 < ne00; i0 += ne0) {
-                    sum += src[i3*s03 + i2*s02 + i1*s01 + i0*s00];
-                }
-            }
-        }
-    }
-    dst[tid3*ne2*ne1*ne0 + tid2*ne1*ne0 + tid1*ne0 + tid0] = sum;
-}
-
-template<float (*bin_op)(const float, const float)>
-struct bin_bcast_cuda {
-    template<typename src0_t, typename src1_t, typename dst_t>
-    void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
-            const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
-            cudaStream_t stream) {
-
-        GGML_TENSOR_BINARY_OP_LOCALS
-
-        int nr0 = ne10/ne0;
-        int nr1 = ne11/ne1;
-        int nr2 = ne12/ne2;
-        int nr3 = ne13/ne3;
-
-        int nr[4] = { nr0, nr1, nr2, nr3 };
-
-        // collapse dimensions until first broadcast dimension
-        int64_t cne[] = {ne0, ne1, ne2, ne3};
-        int64_t cne0[] = {ne00, ne01, ne02, ne03};
-        int64_t cne1[] = {ne10, ne11, ne12, ne13};
-
-        size_t cnb[] = {nb0, nb1, nb2, nb3};
-        size_t cnb0[] = {nb00, nb01, nb02, nb03};
-        size_t cnb1[] = {nb10, nb11, nb12, nb13};
-
-        auto collapse = [](int64_t cne[]) {
-            cne[0] *= cne[1];
-            cne[1] = cne[2];
-            cne[2] = cne[3];
-            cne[3] = 1;
-        };
-
-        auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
-            cnb[1] *= cne[1];
-            cnb[2] *= cne[2];
-            cnb[3] *= cne[3];
-        };
-
-        if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
-            for (int i = 0; i < 4; i++) {
-                if (nr[i] != 1) {
-                    break;
-                }
-                if (i > 0) {
-                    collapse_nb(cnb, cne);
-                    collapse_nb(cnb0, cne0);
-                    collapse_nb(cnb1, cne1);
-                    collapse(cne);
-                    collapse(cne0);
-                    collapse(cne1);
-                }
-            }
-        }
-
-        {
-            int64_t ne0 = cne[0];
-            int64_t ne1 = cne[1];
-            int64_t ne2 = cne[2];
-            int64_t ne3 = cne[3];
-
-            //int64_t ne00 = cne0[0]; GGML_UNUSED(ne00);
-            //int64_t ne01 = cne0[1]; GGML_UNUSED(ne01);
-            //int64_t ne02 = cne0[2]; GGML_UNUSED(ne02);
-            //int64_t ne03 = cne0[3]; GGML_UNUSED(ne03);
-
-            int64_t ne10 = cne1[0];
-            int64_t ne11 = cne1[1];
-            int64_t ne12 = cne1[2];
-            int64_t ne13 = cne1[3];
-
-            size_t nb0 = cnb[0];
-            size_t nb1 = cnb[1];
-            size_t nb2 = cnb[2];
-            size_t nb3 = cnb[3];
-
-            size_t nb00 = cnb0[0];
-            size_t nb01 = cnb0[1];
-            size_t nb02 = cnb0[2];
-            size_t nb03 = cnb0[3];
-
-            size_t nb10 = cnb1[0];
-            size_t nb11 = cnb1[1];
-            size_t nb12 = cnb1[2];
-            size_t nb13 = cnb1[3];
-
-            size_t s0 = nb0 / sizeof(dst_t);
-            size_t s1 = nb1 / sizeof(dst_t);
-            size_t s2 = nb2 / sizeof(dst_t);
-            size_t s3 = nb3 / sizeof(dst_t);
-
-            size_t s10 = nb10 / sizeof(src1_t);
-            size_t s11 = nb11 / sizeof(src1_t);
-            size_t s12 = nb12 / sizeof(src1_t);
-            size_t s13 = nb13 / sizeof(src1_t);
-
-            size_t s00 = nb00 / sizeof(src0_t);
-            size_t s01 = nb01 / sizeof(src0_t);
-            size_t s02 = nb02 / sizeof(src0_t);
-            size_t s03 = nb03 / sizeof(src0_t);
-
-            GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
-            GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
-            GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
-            GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
-
-            GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
-            GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
-            GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
-            GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
-
-            GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
-            GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
-            GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
-            GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
-
-            GGML_ASSERT(s0 == 1);
-            GGML_ASSERT(s00 == 1);
-            GGML_ASSERT(s10 == 1);
-
-            const int block_size = 128;
-
-            int64_t hne0 = std::max(ne0/2LL, 1LL);
-
-            dim3 block_dims;
-            block_dims.x = std::min<unsigned int>(hne0, block_size);
-            block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
-            block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
-
-            dim3 block_nums(
-                (hne0 + block_dims.x - 1) / block_dims.x,
-                (ne1 + block_dims.y - 1) / block_dims.y,
-                (ne2*ne3 + block_dims.z - 1) / block_dims.z
-            );
-
-            if (block_nums.z > 65535) {
-                // this is the maximum number of blocks in z dimension, fallback to 1D grid kernel
-                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
-                k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
-                    src0_dd, src1_dd, dst_dd,
-                    ne0, ne1, ne2, ne3,
-                    ne10, ne11, ne12, ne13,
-                    /* s0, */ s1, s2, s3,
-                    /* s00, */ s01, s02, s03,
-                    /* s10, */ s11, s12, s13);
-            } else {
-                k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
-                    src0_dd, src1_dd, dst_dd,
-                    ne0, ne1, ne2, ne3,
-                    ne10, ne11, ne12, ne13,
-                    /* s0, */ s1, s2, s3,
-                    /* s00, */ s01, s02, s03,
-                    /* s10, */ s11, s12, s13);
-            }
-        }
-    }
-};
-
-template <typename T>
-static void repeat_back_cuda(
-    const T * src, T * dst, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
-    const size_t s00, const size_t s01, const size_t s02, const size_t s03,
-    const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
-
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    const dim3 block_nums((ne0 + WARP_SIZE - 1) / WARP_SIZE, ne1, ne2*ne3);
-    k_repeat_back<T><<<block_nums, block_dims, 0, stream>>>
-        (src, dst, ne00, ne01, ne02, ne03, s00, s01, s02, s03, ne0, ne1, ne2, ne3);
-}
-
-template<class op>
-static void ggml_cuda_op_bin_bcast(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const void * src0_dd, const void * src1_dd, void * dst_dd, cudaStream_t stream) {
-
-    GGML_ASSERT(src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);
-
-    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
-    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-        op()(src0, src1, dst, (const half *) src0_dd, (const half *)src1_dd, (half *) dst_dd, stream);
-    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
-        op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (half *) dst_dd, stream);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
-        op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
-    } else {
-        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
-            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ABORT("fatal error");
-    }
-}
-
-void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, dst->src[0], dst, nullptr, dst->src[0]->data, dst->data, ctx.stream());
-}
-
-void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
-}
-
-void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_sub>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
-}
-
-void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
-}
-
-void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
-}
-
-void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(src0->type == dst->type);
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_can_repeat(dst, src0));
-
-    cudaStream_t stream = ctx.stream();
-
-    GGML_TENSOR_UNARY_OP_LOCALS;
-
-    GGML_ASSERT(ne2*ne3 <= (1 << 15));
-
-    const size_t ts = ggml_type_size(src0->type);
-    const size_t s00 = nb00 / ts;
-    const size_t s01 = nb01 / ts;
-    const size_t s02 = nb02 / ts;
-    const size_t s03 = nb03 / ts;
-
-    switch (dst->type) {
-        case GGML_TYPE_F32: {
-            const float * src0_d = (const float *) src0->data;
-            float       * dst_d  = (float       *) dst->data;
-            repeat_back_cuda(src0_d, dst_d, ne00, ne01, ne02, ne03, s00, s01, s02, s03, ne0, ne1, ne2, ne3, stream);
-        } break;
-        default: {
-            GGML_ASSERT(false);
-        } break;
-    }
-}
--- a/ggml/src/ggml-cuda/binbcast.cuh
+++ b/ggml/src/ggml-cuda/binbcast.cuh
@ -1,9 +0,0 @@
-#include "common.cuh"
-
-void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-
-void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-cuda/clamp.cu
+++ b/ggml/src/ggml-cuda/clamp.cu
@ -1,45 +0,0 @@
-#include "clamp.cuh"
-
-static __device__ __forceinline__ float op_clamp(float x, float min, float max) {
-    return fminf(fmaxf(x, min), max);
-}
-
-template <class T>
-static __global__ void op_clamp_kernel(const T * x, T * dst, const T min, const T max, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    dst[i] = (T)op_clamp((float)x[i], (float)min, (float)max);
-}
-
-template <class T>
-static void clamp_cuda(const T * x, T * dst, const T min, const T max, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
-    op_clamp_kernel<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
-}
-
-
-void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const void * src0_d = src0->data;
-    void * dst_d = dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    float min;
-    float max;
-    memcpy(&min, dst->op_params, sizeof(float));
-    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
-
-    if (src0->type == GGML_TYPE_F16) {
-        clamp_cuda((const half *)src0_d, (half *)dst_d, (half)min, (half)max, ggml_nelements(src0), stream);
-    } else {
-        clamp_cuda((const float *)src0_d, (float *)dst_d, (float)min, (float)max, ggml_nelements(src0), stream);
-    }
-}
--- a/Show More
+++ b/Show More