ggml : move AMX to the CPU backend (llama/10570)

ggml : automatic selection of best CPU backend (llama/10606)
2025-06-15 05:18:07 +00:00 · 2024-12-03 20:22:12 +02:00
parent 4d73962da4
commit 3daeacad24
22 changed files with 3546 additions and 231 deletions
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@ -96,6 +96,7 @@ option(GGML_CPU_HBM     "ggml: use memkind for CPU HBM" OFF)
 option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)

 option(GGML_AVX         "ggml: enable AVX"              ${INS_ENB})
+option(GGML_AVX_VNNI    "ggml: enable AVX-VNNI"         OFF)
 option(GGML_AVX2        "ggml: enable AVX2"             ${INS_ENB})
 option(GGML_AVX512      "ggml: enable AVX512"           OFF)
 option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI"      OFF)
@ -161,7 +162,6 @@ set   (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set   (GGML_METAL_STD "" CACHE STRING       "ggml: metal standard version (-std flag)")
 option(GGML_OPENMP                          "ggml: use OpenMP"                                ON)
 option(GGML_RPC                             "ggml: use RPC"                                   OFF)
-option(GGML_AMX                             "ggml: use AMX"                                   OFF)
 option(GGML_SYCL                            "ggml: use SYCL"                                  OFF)
 option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
 set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@ -261,21 +261,15 @@ function(ggml_add_backend backend)
    if (${backend_id})
        string(TOLOWER "ggml-${backend}" backend_target)
        add_subdirectory(${backend_target})
-        # check again in case the backend disabled itself
-        # note that this should NOT be the normal behavior, in case of errors the backend should fail the build
-        # however, currently it is necessary for AMX, since it is enabled by default on llama.cpp
-        if (${backend_id})
-            message(STATUS "Including ${backend} backend")
-            if (NOT GGML_BACKEND_DL)
-                string(TOUPPER "GGML_USE_${backend}" backend_use)
-                target_compile_definitions(ggml PUBLIC ${backend_use})
-            endif()
+        message(STATUS "Including ${backend} backend")
+        if (NOT GGML_BACKEND_DL)
+            string(TOUPPER "GGML_USE_${backend}" backend_use)
+            target_compile_definitions(ggml PUBLIC ${backend_use})
        endif()
    endif()
 endfunction()

 ggml_add_backend(CPU)
-ggml_add_backend(AMX)
 ggml_add_backend(BLAS)
 ggml_add_backend(CANN)
 ggml_add_backend(CUDA)
@ -289,7 +283,7 @@ ggml_add_backend(Vulkan)

 foreach (target ggml-base ggml)
    target_include_directories(${target} PUBLIC    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
-    target_compile_features   (${target} PRIVATE c_std_11) # don't bump
+    target_compile_features   (${target} PRIVATE c_std_11 cxx_std_17) # don't bump
 endforeach()

 target_link_libraries(ggml-base PRIVATE Threads::Threads)
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@ -211,27 +211,45 @@ extern "C" {
    GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);

    // Add backend dynamic loading support to the backend
-    typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);

-    #ifdef GGML_BACKEND_DL
-        #ifdef __cplusplus
-        #    define GGML_BACKEND_DL_IMPL(reg_fn)                                 \
-                extern "C" {                                                     \
-                    GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
-                }                                                                \
-                ggml_backend_reg_t ggml_backend_init(void) {                     \
-                    return reg_fn();                                             \
-                }
-        #else
-        #    define GGML_BACKEND_DL_IMPL(reg_fn)                             \
-                GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
-                ggml_backend_reg_t ggml_backend_init(void) {                 \
-                    return reg_fn();                                         \
-                }
-        #endif
-    #else
-    #    define GGML_BACKEND_DL_IMPL(reg_fn)
-    #endif
+    // Initialize the backend
+    typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
+    // Optional: obtain a score for the backend based on the system configuration
+    // Higher scores are preferred, 0 means the backend is not supported in the current system
+    typedef int                (*ggml_backend_score_t)(void);
+
+// Entry points exported by a dynamically loadable backend (GGML_BACKEND_DL builds):
+//   GGML_BACKEND_DL_IMPL(reg_fn)         defines ggml_backend_init(),  which returns reg_fn()
+//   GGML_BACKEND_DL_SCORE_IMPL(score_fn) defines ggml_backend_score(), which returns score_fn()
+// Each macro emits a GGML_BACKEND_API declaration followed by the definition.
+#ifdef GGML_BACKEND_DL
+#    ifdef __cplusplus
+#        define GGML_BACKEND_DL_IMPL(reg_fn)                             \
+            extern "C" {                                                 \
+            GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
+            }                                                            \
+            ggml_backend_reg_t ggml_backend_init(void) {                 \
+                return reg_fn();                                         \
+            }
+#        define GGML_BACKEND_DL_SCORE_IMPL(score_fn)       \
+            extern "C" {                                   \
+            GGML_BACKEND_API int ggml_backend_score(void); \
+            }                                              \
+            int ggml_backend_score(void) {                 \
+                return score_fn();                         \
+            }
+// C translation units: same definitions without the extern "C" wrapper
+#    else
+#        define GGML_BACKEND_DL_IMPL(reg_fn)                              \
+            GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void);  \
+            ggml_backend_reg_t                  ggml_backend_init(void) { \
+                return reg_fn();                                          \
+            }
+#        define GGML_BACKEND_DL_SCORE_IMPL(score_fn)        \
+            GGML_BACKEND_API int ggml_backend_score(void);  \
+            int                  ggml_backend_score(void) { \
+                return score_fn();                          \
+            }
+#    endif
+// GGML_BACKEND_DL not defined: both macros expand to nothing
+#else
+#    define GGML_BACKEND_DL_IMPL(reg_fn)
+#    define GGML_BACKEND_DL_SCORE_IMPL(score_fn)
+#endif

 #ifdef  __cplusplus
 }
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@ -2,8 +2,13 @@
 #include "ggml-backend.h"
 #include "ggml-impl.h"
 #include <algorithm>
+#include <codecvt>
 #include <cstring>
+#include <filesystem>
+#include <locale>
+#include <memory>
 #include <string>
+#include <type_traits>
 #include <vector>

 #ifdef _WIN32
@ -49,10 +54,6 @@
 #include "ggml-rpc.h"
 #endif

-#ifdef GGML_USE_AMX
-#  include "ggml-amx.h"
-#endif
-
 #ifdef GGML_USE_CANN
 #include "ggml-cann.h"
 #endif
@ -61,9 +62,71 @@
 #include "ggml-kompute.h"
 #endif

+#ifdef _WIN32
+
+using dl_handle = std::remove_pointer_t<HMODULE>;
+
+// Deleter for dl_handle_ptr on Windows: unloads the module with FreeLibrary.
+struct dl_handle_deleter {
+    void operator()(HMODULE handle) {
+        FreeLibrary(handle);
+    }
+};
+
+// Loads the DLL at the given wide-character path with LoadLibraryW.
+// Returns the module handle, or NULL if the library could not be loaded.
+static dl_handle * dl_load_library(const std::wstring & path) {
+    // suppress error dialogs for missing DLLs
+    DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+    SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+
+    HMODULE handle = LoadLibraryW(path.c_str());
+
+    // restore the error mode that was active on entry
+    SetErrorMode(old_mode);
+
+    return handle;
+}
+
+// Narrow-string overload: converts `path` from UTF-8 to UTF-16 and forwards to the wide-character overload.
+// NOTE(review): std::wstring_convert and <codecvt> are deprecated since C++17, and from_bytes() throws
+// std::range_error on input it cannot convert - confirm both are acceptable here.
+static dl_handle * dl_load_library(const std::string & path) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+    return dl_load_library(converter.from_bytes(path));
+}
+
+// Looks up the exported symbol `name` in a loaded module with GetProcAddress.
+// Returns the symbol address cast to void *, or NULL if the lookup fails.
+static void * dl_get_sym(dl_handle * handle, const char * name) {
+    // same error-mode handling as dl_load_library: add SEM_FAILCRITICALERRORS for the duration of the call
+    DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+    SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+
+    void * p = (void *) GetProcAddress(handle, name);
+
+    // restore the error mode that was active on entry
+    SetErrorMode(old_mode);
+
+    return p;
+}
+
+#else
+
+using dl_handle = void;
+
+// Deleter for dl_handle_ptr on non-Windows platforms: closes the library with dlclose.
+struct dl_handle_deleter {
+    void operator()(void * handle) {
+        dlclose(handle);
+    }
+};
+
+// Opens the shared library at `path` with dlopen (RTLD_NOW | RTLD_LOCAL).
+// Returns the dlopen handle, or NULL if the library could not be opened.
+static void * dl_load_library(const std::string & path) {
+    return dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
+}
+
+// Looks up the symbol `name` in a library opened with dl_load_library; thin wrapper around dlsym.
+static void * dl_get_sym(dl_handle * handle, const char * name) {
+    return dlsym(handle, name);
+}
+
+#endif
+
+using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
+
 struct ggml_backend_reg_entry {
    ggml_backend_reg_t reg;
-    void * handle;
+    // owning handle of the dynamic library the backend came from;
+    // null when the backend was registered without a library handle
+    dl_handle_ptr handle;
 };

 struct ggml_backend_registry {
@ -92,9 +155,6 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_RPC
        register_backend(ggml_backend_rpc_reg());
 #endif
-#ifdef GGML_USE_AMX
-        register_backend(ggml_backend_amx_reg());
-#endif
 #ifdef GGML_USE_KOMPUTE
        register_backend(ggml_backend_kompute_reg());
 #endif
@ -104,13 +164,16 @@ struct ggml_backend_registry {
    }

    ~ggml_backend_registry() {
-        while (!backends.empty()) {
-            // use silent since the log system may have been destroyed at this point
-            unload_backend(backends.back().reg, true);
+        // FIXME: backends cannot be safely unloaded without a function to destroy all the backend resources,
+        // since backend threads may still be running and accessing resources from the dynamic library
+        // release() gives up ownership without invoking the deleter, so the libraries are deliberately left loaded
+        for (auto & entry : backends) {
+            if (entry.handle) {
+                entry.handle.release(); // NOLINT
+            }
        }
    }

-    void register_backend(ggml_backend_reg_t reg, void * handle = nullptr) {
+    void register_backend(ggml_backend_reg_t reg, dl_handle_ptr handle = nullptr) {
        if (!reg) {
            return;
        }
@ -119,7 +182,7 @@ struct ggml_backend_registry {
        GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
            __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
 #endif
-        backends.push_back({ reg, handle });
+        backends.push_back({ reg, std::move(handle) });
        for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
            register_device(ggml_backend_reg_dev_get(reg, i));
        }
@ -133,79 +196,53 @@ struct ggml_backend_registry {
    }

    ggml_backend_reg_t load_backend(const char * path, bool silent) {
-#ifdef _WIN32
-        // suppress error dialogs for missing DLLs
-        DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
-        SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
-
-        HMODULE handle = LoadLibraryA(path);
-
+        // Loads the library at `path`, validates it and registers the backend it provides.
+        // Returns the backend registration, or nullptr on failure; failures are logged unless `silent` is set.
+        // `handle` owns the library: every early return below lets it go out of scope, which closes the library.
+        dl_handle_ptr handle { dl_load_library(path) };
        if (!handle) {
            if (!silent) {
-                GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError());
+                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
            }
-            SetErrorMode(old_mode);
            return nullptr;
        }

-        ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init");
-
-        SetErrorMode(old_mode);
-
-        if (!backend_init) {
+        // ggml_backend_score is optional; when present, a score of 0 rejects the backend before it is initialized
+        auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
+        if (score_fn && score_fn() == 0) {
            if (!silent) {
-                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError());
+                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
            }
-            FreeLibrary(handle);
            return nullptr;
        }
-#else
-        void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);

-        if (!handle) {
+        // ggml_backend_init is mandatory
+        auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
+        if (!backend_init_fn) {
            if (!silent) {
-                GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror());
+                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path);
            }
            return nullptr;
        }

-        auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init");
-
-        if (!backend_init) {
-            if (!silent) {
-                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror());
-            }
-            dlclose(handle);
-            return nullptr;
-        }
-#endif
-        ggml_backend_reg_t reg = backend_init();
-
+        // reject a backend that returned no registration or reports a different backend API version
+        ggml_backend_reg_t reg = backend_init_fn();
        if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
            if (!silent) {
                if (!reg) {
                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
                } else {
                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
-                                   __func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
+                        __func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
                }
            }
-#ifdef _WIN32
-            FreeLibrary(handle);
-#else
-            dlclose(handle);
-#endif
            return nullptr;
        }

        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
-        register_backend(reg, handle);
+
+        // ownership of the library handle moves into the registry entry
+        register_backend(reg, std::move(handle));
+
        return reg;
    }

    void unload_backend(ggml_backend_reg_t reg, bool silent) {
        auto it = std::find_if(backends.begin(), backends.end(),
-                                [reg](ggml_backend_reg_entry entry) { return entry.reg == reg; });
+                               [reg](const ggml_backend_reg_entry & entry) { return entry.reg == reg; });

        if (it == backends.end()) {
            if (!silent) {
@ -224,15 +261,6 @@ struct ggml_backend_registry {
                            [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
            devices.end());

-        // unload library
-        if (it->handle) {
-#ifdef _WIN32
-            FreeLibrary((HMODULE) it->handle);
-#else
-            dlclose(it->handle);
-#endif
-        }
-
        // remove backend
        backends.erase(it);
    }
@ -348,12 +376,7 @@ void ggml_backend_unload(ggml_backend_reg_t reg) {
    get_reg().unload_backend(reg, true);
 }

-void ggml_backend_load_all() {
-    std::vector<std::string> search_prefix;
-
-    // add the executable directory to the search path
-    // FIXME: this is convenient for development, but it should probably be disabled in production
-
+static std::string get_executable_path() {
 #if defined(__APPLE__)
    // get executable path
    std::vector<char> path;
@ -371,7 +394,7 @@ void ggml_backend_load_all() {
    if (last_slash != std::string::npos) {
        base_path = base_path.substr(0, last_slash);
    }
-    search_prefix.push_back(base_path + "/");
+    return base_path + "/";
 #elif defined(__linux__)
    std::string base_path = ".";
    std::vector<char> path(1024);
@ -393,38 +416,104 @@ void ggml_backend_load_all() {
        path.resize(path.size() * 2);
    }

-    search_prefix.push_back(base_path + "/");
+    return base_path + "/";
+#elif defined(_WIN32)
+    // ask the OS for the full path of the running executable
+    std::vector<char> path(MAX_PATH);
+    DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
+    if (len == 0) {
+        return "";
+    }
+    std::string base_path(path.data(), len);
+    // remove executable name
+    auto last_slash = base_path.find_last_of('\\');
+    if (last_slash != std::string::npos) {
+        base_path = base_path.substr(0, last_slash);
+    }
+    return base_path + "\\";
+#else
+    // no implementation for this platform: return an empty path instead of
+    // falling off the end of a non-void function (undefined behavior)
+    return "";
 #endif
+}

-    auto & reg = get_reg();
-
-    auto try_load = [&](const std::string & name) {
-        std::string os_name;
+// File name prefix of backend libraries: "ggml-" on Windows, "libggml-" on every other platform.
+static std::string backend_filename_prefix() {
 #ifdef _WIN32
-        os_name = "ggml-" + name + ".dll";
+    return "ggml-";
 #else
-        os_name = "libggml-" + name + ".so";
+    return "libggml-";
 #endif
-        if (reg.load_backend(os_name.c_str(), true)) {
-            return;
+}
+
+// File name extension of backend libraries: ".dll" on Windows, ".so" on every other platform.
+static std::string backend_filename_suffix() {
+#ifdef _WIN32
+    return ".dll";
+#else
+    return ".so";
+#endif
+}
+
+// Finds the best available variant of the backend `name` and loads it.
+// Candidates are files named [lib]ggml-<name>-*.[so|dll] in the search paths. Each one is loaded
+// temporarily to query its optional ggml_backend_score() export, and the highest score wins.
+// If no candidate reports a score above 0, the base library [lib]ggml-<name>.[so|dll] is tried instead.
+// Returns the registered backend, or nullptr if nothing could be loaded.
+static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) {
+    // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
+    // TODO: search system paths
+    std::vector<std::string> search_paths = { "./", get_executable_path() };
+    std::string file_prefix = backend_filename_prefix() + name + "-";
+    // loop-invariant: computed once instead of once per directory entry
+    const std::string file_suffix = backend_filename_suffix();
+
+    int best_score = 0;
+    std::string best_path;
+
+    namespace fs = std::filesystem;
+    for (const auto & search_path : search_paths) {
+        if (!fs::exists(search_path)) {
+            continue;
        }
-        for (const auto & prefix : search_prefix) {
-            if (reg.load_backend((prefix + os_name).c_str(), true)) {
-                return;
+        // skip directories that cannot be read instead of letting the iterator throw out of the loader
+        for (const auto & entry : fs::directory_iterator(search_path, fs::directory_options::skip_permission_denied)) {
+            if (entry.is_regular_file()) {
+                std::string filename = entry.path().filename().string();
+                std::string ext = entry.path().extension().string();
+                // rfind(prefix, 0) tests only the start of the name rather than searching the whole string
+                if (filename.rfind(file_prefix, 0) == 0 && ext == file_suffix) {
+                    // the candidate is closed again when `handle` goes out of scope;
+                    // the winner is loaded for real by load_backend() below
+                    dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
+                    if (!handle && !silent) {
+                        GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
+                    }
+                    if (handle) {
+                        auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
+                        if (score_fn) {
+                            int s = score_fn();
+#ifndef NDEBUG
+                            GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
+#endif
+                            if (s > best_score) {
+                                best_score = s;
+                                best_path = entry.path().string();
+                            }
+                        }
+                    }
+                }
            }
        }
-    };
+    }

-    try_load("amx");
-    try_load("blas");
-    try_load("cann");
-    try_load("cuda");
-    try_load("hip");
-    try_load("kompute");
-    try_load("metal");
-    try_load("rpc");
-    try_load("sycl");
-    try_load("vulkan");
-    try_load("musa");
-    try_load("cpu");
+    if (best_score == 0) {
+        // try to load the base backend
+        for (const auto & search_path : search_paths) {
+            std::string path = search_path + backend_filename_prefix() + name + file_suffix;
+            if (fs::exists(path)) {
+                return get_reg().load_backend(path.c_str(), silent);
+            }
+        }
+        return nullptr;
+    }
+
+    return get_reg().load_backend(best_path.c_str(), silent);
+}
+
+void ggml_backend_load_all() {
+    // backends are probed in this fixed order; silent = true suppresses load-failure logging
+    static const char * const backend_names[] = {
+        "blas", "cann", "cuda", "hip", "kompute", "metal", "rpc", "sycl", "vulkan", "musa", "cpu",
+    };
+    for (const char * backend_name : backend_names) {
+        ggml_backend_load_best(backend_name, true);
+    }
 }
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@ -742,7 +742,8 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st

    if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
        // since the tensor is pre-allocated, it cannot be moved to another backend
-        GGML_ABORT("pre-allocated tensor (%s) in a backend that cannot run the operation", tensor->name);
+        ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+        GGML_ABORT("pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)", tensor->name, ggml_backend_buffer_name(buffer), ggml_op_name(tensor->op));
    }

    // graph input
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@ -1,12 +1,20 @@
-ggml_add_backend_library(ggml-cpu
-                         ggml-cpu.c
-                         ggml-cpu.cpp
-                         ggml-cpu-aarch64.c
-                         ggml-cpu-aarch64.h
-                         ggml-cpu-quants.c
-                         ggml-cpu-quants.h
-                        )
+ggml_add_backend_library(ggml-cpu)

+list (APPEND GGML_CPU_SOURCES
+    ggml-cpu.c
+    ggml-cpu.cpp
+    ggml-cpu-aarch64.c
+    ggml-cpu-aarch64.h
+    ggml-cpu-quants.c
+    ggml-cpu-quants.h
+    amx/amx.cpp
+    amx/amx.h
+    amx/mmq.cpp
+    amx/mmq.h
+    ggml-cpu-impl.h
+    )
+
+target_compile_features(ggml-cpu PRIVATE c_std_11 cxx_std_17)
 target_include_directories(ggml-cpu PRIVATE .)

 if (APPLE AND GGML_ACCELERATE)
@ -14,9 +22,9 @@ if (APPLE AND GGML_ACCELERATE)
    if (ACCELERATE_FRAMEWORK)
        message(STATUS "Accelerate framework found")

-        add_compile_definitions(GGML_USE_ACCELERATE)
-        add_compile_definitions(ACCELERATE_NEW_LAPACK)
-        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
+        target_compile_definitions(ggml-cpu PRIVATE GGML_USE_ACCELERATE)
+        target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_NEW_LAPACK)
+        target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_LAPACK_ILP64)

        target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK})
    else()
@ -29,15 +37,9 @@ if (GGML_OPENMP)
    if (OpenMP_FOUND)
        message(STATUS "OpenMP found")

-        add_compile_definitions(GGML_USE_OPENMP)
+        target_compile_definitions(ggml-cpu PRIVATE GGML_USE_OPENMP)

        target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
-
-        # FIXME: should be replaced with a compiler id check
-        #if (GGML_MUSA)
-        #    list(APPEND GGML_CPU_EXTRA_INCLUDES     "/usr/lib/llvm-14/lib/clang/14.0.0/include")
-        #    list(APPEND GGML_CPU_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so")
-        #endif()
    else()
        message(WARNING "OpenMP not found")
    endif()
@ -46,11 +48,11 @@ endif()
 if (GGML_LLAMAFILE)
    message(STATUS "Using llamafile")

-    add_compile_definitions(GGML_USE_LLAMAFILE)
+    target_compile_definitions(ggml-cpu PRIVATE GGML_USE_LLAMAFILE)

-    target_sources(ggml-cpu PRIVATE
-                    llamafile/sgemm.cpp
-                    llamafile/sgemm.h)
+    list(APPEND GGML_CPU_SOURCES
+                llamafile/sgemm.cpp
+                llamafile/sgemm.h)
 endif()

 if (GGML_CPU_HBM)
@ -58,7 +60,7 @@ if (GGML_CPU_HBM)

    message(STATUS "Using memkind for CPU HBM")

-    add_compile_definitions(GGML_USE_CPU_HBM)
+    target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_HBM)

    target_link_libraries(ggml-cpu PUBLIC memkind)
 endif()
@ -72,16 +74,16 @@ if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
    message(STATUS "ARM detected")

    if (MSVC)
-        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
-        add_compile_definitions(__ARM_NEON)
-        add_compile_definitions(__ARM_FEATURE_FMA)
+        list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead
+        list(APPEND ARCH_DEFINITIONS __ARM_NEON)
+        list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA)

        set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
        string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")

        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
        if (GGML_COMPILER_SUPPORT_DOTPROD)
-            add_compile_definitions(__ARM_FEATURE_DOTPROD)
+            list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)

            message(STATUS "ARM feature DOTPROD enabled")
        endif ()
@ -89,14 +91,14 @@ if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)

        if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
-            add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+            list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)

            message(STATUS "ARM feature MATMUL_INT8 enabled")
        endif ()

        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
        if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
-            add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+            list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)

            message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled")
        endif ()
@ -118,7 +120,7 @@ if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
                check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
                if (GGML_COMPILER_SUPPORT_DOTPROD)
                    set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
-                    add_compile_definitions(__ARM_FEATURE_DOTPROD)
+                    list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)

                    message(STATUS "ARM feature DOTPROD enabled")
                endif ()
@ -131,7 +133,7 @@ if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
                check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
                if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
                    set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
-                    add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+                    list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)

                    message(STATUS "ARM feature MATMUL_INT8 enabled")
                endif ()
@ -175,7 +177,6 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
    if (MSVC)
        # instruction set detection for MSVC only
        if (GGML_NATIVE)
-            # TODO: improve, should not reference files from the parent folder
            include(cmake/FindSIMD.cmake)
        endif ()
        if (GGML_AVX512)
@ -185,43 +186,43 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
            # macros corresponding to the extensions.
            # Do it manually.
            if (GGML_AVX512_VBMI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
+                list(APPEND ARCH_DEFINITIONS __AVX512VBMI__)
                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
                    list(APPEND ARCH_FLAGS -mavx512vbmi)
                endif()
            endif()
            if (GGML_AVX512_VNNI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+                list(APPEND ARCH_DEFINITIONS __AVX512VNNI__)
                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
                    list(APPEND ARCH_FLAGS -mavx512vnni)
                endif()
            endif()
            if (GGML_AVX512_BF16)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
+                list(APPEND ARCH_DEFINITIONS __AVX512BF16__)
                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
                    list(APPEND ARCH_FLAGS -mavx512bf16)
                endif()
            endif()
            if (GGML_AMX_TILE)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
+                list(APPEND ARCH_DEFINITIONS __AMX_TILE__)
            endif()
            if (GGML_AMX_INT8)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
+                list(APPEND ARCH_DEFINITIONS __AMX_INT8__)
            endif()
            if (GGML_AMX_BF16)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
+                list(APPEND ARCH_DEFINITIONS __AMX_BF16__)
            endif()
        elseif (GGML_AVX2)
            list(APPEND ARCH_FLAGS /arch:AVX2)
        elseif (GGML_AVX)
            list(APPEND ARCH_FLAGS /arch:AVX)
        endif()
+        if (GGML_AVX_VNNI)
+            list(APPEND ARCH_DEFINITIONS __AVXVNNI__)
+            if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                list(APPEND ARCH_FLAGS -mavxvnni)
+            endif()
+        endif()
    else()
        if (GGML_NATIVE)
            list(APPEND ARCH_FLAGS -march=native)
@ -238,6 +239,9 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
        if (GGML_AVX2)
            list(APPEND ARCH_FLAGS -mavx2)
        endif()
+        if (GGML_AVX_VNNI)
+            list(APPEND ARCH_FLAGS -mavxvnni)
+        endif()
        if (GGML_AVX512)
            list(APPEND ARCH_FLAGS -mavx512f)
            list(APPEND ARCH_FLAGS -mavx512dq)
@ -276,7 +280,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
       list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
    else()
        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
-        #TODO: Add  targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+        # TODO: Add  targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
    endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
    message(STATUS "loongarch64 detected")
@ -299,11 +303,16 @@ endif()

 if (GGML_CPU_AARCH64)
    message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels")
-    add_compile_definitions(GGML_USE_CPU_AARCH64)
+    target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_AARCH64)
 endif()

-target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
-target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
+# architecture flags and definitions are attached to the listed sources, not to the whole target
+target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES})
+set_source_files_properties(${GGML_CPU_SOURCES}
+    PROPERTIES
+        COMPILE_OPTIONS     "${ARCH_FLAGS}"
+        COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}")
+
+# the feature detection code must be compiled without any architecture flags
+target_sources(ggml-cpu PRIVATE cpu-feats-x86.cpp)
+# target_sources(ggml-cpu PRIVATE cpu-feats-arm.cpp) # TODO: ARM feature detection

 if (EMSCRIPTEN)
    set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")
--- a/ggml/src/ggml-cpu/amx/amx.cpp
+++ b/ggml/src/ggml-cpu/amx/amx.cpp
@ -0,0 +1,196 @@
+#include "amx.h"
+#include "common.h"
+#include "mmq.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+#include "ggml-cpu.h"
+
+#if defined(__gnu_linux__)
+#include <sys/syscall.h>
+#include <unistd.h>
+#endif
+
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+
+// AMX buffer interface
+// Releases the memory block that the buffer keeps in its context pointer.
+static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    void * base = buffer->context;
+    free(base);
+}
+
+// Returns the start address of the buffer memory, which is the context pointer itself.
+static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
+    void * base = (void *) buffer->context;
+    return base;
+}
+
+// Fills `size` bytes of the tensor data, starting `offset` bytes in, with `value`.
+static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+    GGML_UNUSED(buffer);
+
+    char * dst = (char *) tensor->data + offset;
+    memset(dst, value, size);
+}
+
+// Uploads `size` bytes from `data` into the tensor.
+// Types without AMX kernels are copied verbatim at `offset`; types with AMX kernels
+// are handed to ggml_backend_amx_convert_weight() instead of being copied.
+static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_UNUSED(buffer);
+
+    if (!qtype_has_amx_kernels(tensor->type)) {
+        char * dst = (char *) tensor->data + offset;
+        memcpy(dst, data, size);
+        return;
+    }
+
+    ggml_backend_amx_convert_weight(tensor, data, offset, size);
+}
+
+// Copies `size` bytes of tensor data, starting `offset` bytes in, out to `data`.
+// Reading back is refused (assertion) for types that have AMX kernels, because
+// set_tensor routes those through a weight conversion rather than a plain copy.
+static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_UNUSED(buffer);
+
+    GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
+
+    const char * src = (const char *) tensor->data + offset;
+    memcpy(data, src, size);
+}
+
+// Copies `src` into `dst` when `src` lives in a host buffer.
+// Returns true when the copy was performed and false when the source buffer is
+// not a host buffer (nothing is copied in that case).
+static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    GGML_UNUSED(buffer);
+
+    if (!ggml_backend_buffer_is_host(src->buffer)) {
+        return false;
+    }
+
+    if (qtype_has_amx_kernels(src->type)) {
+        // types with AMX kernels go through the weight conversion
+        ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_nbytes(dst));
+    } else {
+        // everything else is a plain byte copy
+        memcpy(dst->data, src->data, ggml_nbytes(src));
+    }
+
+    return true;
+}
+
+// Overwrites the whole buffer (buffer->size bytes) with `value`.
+static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    void * base = buffer->context;
+    memset(base, value, buffer->size);
+}
+
+// Function table for buffers allocated by the AMX buffer type.
+// The initializer is positional: each entry is labelled with the slot it fills.
+// Slots left as NULL (init_tensor, reset) have no AMX-specific implementation here.
+static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
+    /* .free_buffer     = */ ggml_backend_amx_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_amx_buffer_get_base,
+    /* .init_tensor     = */ NULL, // no initialization required
+    /* .memset_tensor   = */ ggml_backend_amx_buffer_memset_tensor,
+    /* .set_tensor      = */ ggml_backend_amx_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_amx_buffer_get_tensor,
+    /* .cpy_tensor      = */ ggml_backend_amx_buffer_cpy_tensor,
+    /* .clear           = */ ggml_backend_amx_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+// Reports the fixed display name of the AMX buffer type.
+static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(buft);
+
+    const char * name = "AMX";
+    return name;
+}
+
+// Allocates a TENSOR_ALIGNMENT-aligned memory block of at least `size` bytes and
+// wraps it in a backend buffer that uses the AMX buffer interface.
+//
+// aligned_alloc() is only specified for sizes that are an integral multiple of the
+// alignment (some C libraries return NULL otherwise), so the request is rounded up
+// before allocating. The buffer still reports the caller's original `size`.
+//
+// Returns NULL (after logging to stderr) when the allocation fails.
+static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    const size_t alignment  = TENSOR_ALIGNMENT;
+    const size_t alloc_size = (size + alignment - 1) / alignment * alignment;
+
+    void * data = aligned_alloc(alignment, alloc_size);
+    if (data == NULL) {
+        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
+        return NULL;
+    }
+
+    return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
+}
+
+// Reports the alignment of buffers of this type; always TENSOR_ALIGNMENT.
+static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(buft);
+
+    const size_t alignment = TENSOR_ALIGNMENT;
+    return alignment;
+}
+
+// Reports how many bytes `tensor` needs in an AMX buffer; the answer is delegated
+// to ggml_backend_amx_get_alloc_size().
+static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
+    GGML_UNUSED(buft);
+
+    const size_t n_bytes = ggml_backend_amx_get_alloc_size(tensor);
+    return n_bytes;
+}
+
+// AMX buffers are never reported as host buffers.
+// NOTE(review): presumably because set_tensor may store weights in a converted
+// layout rather than the plain ggml layout - confirm against mmq.cpp.
+static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(buft);
+
+    const bool is_host = false;
+    return is_host;
+}
+
+// Request codes and feature numbers passed to the arch_prctl syscall in ggml_amx_init().
+// Only ARCH_REQ_XCOMP_PERM and XFEATURE_XTILEDATA are used in this file.
+// NOTE(review): the values presumably mirror the Linux kernel's x86 prctl/xstate definitions - confirm.
+#define ARCH_GET_XCOMP_PERM     0x1022
+#define ARCH_REQ_XCOMP_PERM     0x1023
+#define XFEATURE_XTILECFG       17
+#define XFEATURE_XTILEDATA      18
+
+// Prepares the process for using AMX.
+// Returns true when AMX may be used and false otherwise.
+static bool ggml_amx_init() {
+#if defined(__gnu_linux__)
+    // on Linux the process has to request permission for the tile data state;
+    // a non-zero syscall result means the request was refused
+    if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
+        fprintf(stderr, "AMX is not ready to be used!\n");
+        return false;
+    }
+    return true;
+#elif defined(_WIN32)
+    // no explicit request is made on Windows
+    return true;
+#else
+    // unknown platform: report AMX as unavailable instead of falling off the end
+    // of a non-void function (undefined behavior)
+    return false;
+#endif
+}
+// Returns the singleton AMX buffer type, or NULL when ggml_amx_init() reports that
+// AMX cannot be used.
+//
+// The buffer type is attached to device 0 of the CPU backend registry.
+// The result of ggml_amx_init() is cached: the outcome of the permission request
+// does not need to be re-evaluated (or its failure message re-printed) every time
+// a caller queries the buffer type.
+ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
+        /* .iface = */ {
+            /* .get_name         = */ ggml_backend_amx_buffer_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_amx_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_amx_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ ggml_backend_amx_buffer_type_get_alloc_size,
+            /* .is_host          = */ ggml_backend_amx_buffer_type_is_host,
+        },
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context = */ NULL,
+    };
+
+    // function-local static: initialized exactly once, on first use
+    static const bool amx_ready = ggml_amx_init();
+    if (!amx_ready) {
+        return NULL;
+    }
+
+    return &ggml_backend_buffer_type_amx;
+}
+
+// Tells whether `buft` is the AMX buffer type by comparing its get_name callback
+// with the AMX implementation.
+bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft) {
+    const auto name_fn = buft->iface.get_name;
+    return name_fn == ggml_backend_amx_buffer_type_get_name;
+}
+
+// Decides whether the AMX code path can handle `op`.
+//
+// NONE / RESHAPE / VIEW / PERMUTE / TRANSPOSE are always accepted.
+// MUL_MAT is accepted only for the 2d gemm case described below.
+// Every other op is rejected.
+bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op) {
+    const auto kind = op->op;
+
+    if (kind == GGML_OP_NONE      ||
+        kind == GGML_OP_RESHAPE   ||
+        kind == GGML_OP_VIEW      ||
+        kind == GGML_OP_PERMUTE   ||
+        kind == GGML_OP_TRANSPOSE) {
+        return true;
+    }
+
+    if (kind != GGML_OP_MUL_MAT) {
+        return false;
+    }
+
+    // handle only 2d gemm for now: contiguous, with the two outer dimensions equal to 1
+    auto is_plain_matrix = [](const struct ggml_tensor * t) {
+        return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
+    };
+
+    const struct ggml_tensor * lhs = op->src[0];
+    const struct ggml_tensor * rhs = op->src[1];
+
+    // both operands must be contiguous 2d tensors
+    if (!is_plain_matrix(lhs) || !is_plain_matrix(rhs)) {
+        return false;
+    }
+
+    // src1 must be float32
+    if (rhs->type != GGML_TYPE_F32) {
+        return false;
+    }
+
+    // amx kernels are enabled for Q4_0, Q4_1, Q8_0, F16
+    // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
+    const bool has_kernel = lhs->type == GGML_TYPE_F16 || qtype_has_amx_kernels(lhs->type);
+    if (!has_kernel) {
+        return false;
+    }
+
+    // out_features must be a multiple of 32
+    return op->ne[0] % (TILE_N * 2) == 0;
+}
+
+#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)
--- a/ggml/src/ggml-cpu/amx/amx.d
+++ b/ggml/src/ggml-cpu/amx/amx.d
@ -0,0 +1,6 @@
+ggml/src/ggml-cpu/amx/amx.o: ggml/src/ggml-cpu/amx/amx.cpp \
+  ggml/src/ggml-cpu/amx/amx.h ggml/include/ggml-backend.h \
+  ggml/include/ggml.h ggml/include/ggml-alloc.h \
+  ggml/src/ggml-cpu/ggml-cpu-impl.h ggml/src/ggml-impl.h \
+  ggml/src/ggml-cpu/amx/common.h ggml/src/ggml-cpu/amx/mmq.h \
+  ggml/src/ggml-backend-impl.h ggml/include/ggml-cpu.h
--- a/ggml/src/ggml-cpu/amx/amx.h
+++ b/ggml/src/ggml-cpu/amx/amx.h
@ -0,0 +1,20 @@
+#pragma once
+
+#include "ggml-backend.h"
+#include "ggml-cpu-impl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// the AMX entry points only exist when the file is compiled with AMX-INT8 and AVX512-VNNI enabled
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+
+// returns the AMX buffer type, or NULL when AMX could not be initialized
+ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
+
+// returns true when buft is the AMX buffer type
+bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft);
+
+// returns true when op can be handled by the AMX code path
+bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op);
+
+// computes the matrix multiplication described by dst for the calling thread (params->ith of params->nth)
+void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+
+// returns the work buffer size requested for the matrix multiplication described by dst
+size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
--- a/ggml/src/ggml-cpu/amx/common.h
+++ b/ggml/src/ggml-cpu/amx/common.h
@ -0,0 +1,100 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-cpu-impl.h"
+
+#include <algorithm>
+#include <memory>
+#include <type_traits>
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+// blocking parameters shared by the AMX code
+// NOTE(review): presumably TILE_M x TILE_N is the output tile and TILE_K the depth
+// processed per step, with VNNI_BLK the number of values packed together - confirm against mmq.cpp
+#define TILE_M 16
+#define TILE_N 16
+#define TILE_K 32
+#define VNNI_BLK 4
+
+// NOTE(review): looks like the block size the AMX kernels operate on - confirm
+#define AMX_BLK_SIZE 32
+
+// symbolic names for the indices 0..7
+// NOTE(review): presumably the eight AMX tile registers - confirm
+#define TMM0 0
+#define TMM1 1
+#define TMM2 2
+#define TMM3 3
+#define TMM4 4
+#define TMM5 5
+#define TMM6 6
+#define TMM7 7
+
+// parallel routines
+// integer division rounded up: (x + y - 1) / y; only available for integral T
+template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
+inline T div_up(T x, T y) {
+    const auto biased = x + y - 1;
+    return biased / y;
+}
+
+// Splits the range [0, n) into `nth` consecutive chunks and returns chunk `ith`
+// as the half-open range [n_start, n_end).
+//
+//   n       - number of items to distribute
+//   nth     - number of chunks (must be > 0)
+//   ith     - index of the requested chunk, 0 <= ith < nth
+//   n_start - receives the first index of the chunk
+//   n_end   - receives one past the last index of the chunk
+//
+// Every chunk except possibly the trailing ones holds div_up(n, nth) items.
+// Chunks that start past the end of the range come back empty (n_start == n_end == n).
+template <typename T>
+inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
+#if 0
+    // onednn partition pattern
+    T& n_my = n_end;
+    if (nth <= 1 || n == 0) {
+        n_start = 0;
+        n_my = n;
+    } else {
+        T n1 = div_up(n, nth);
+        T n2 = n1 - 1;
+        T T1 = n - n2 * nth;
+        n_my = ith < T1 ? n1 : n2;
+        n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2;
+    }
+    n_end += n_start;
+#else
+    // pytorch aten partition pattern
+    T n_my = div_up(n, nth);
+    // clamp the start to n: without it a late chunk (e.g. n = 5, nth = 4, ith = 3)
+    // would be reported as the inverted range [6, 5) instead of an empty one
+    const T chunk_begin = ith * n_my;
+    n_start = std::min(chunk_begin, n);
+    n_end = std::min(n_start + n_my, n);
+#endif
+}
+
+// Runs f(begin, end) over the range [0, n), split across up to `nth` OpenMP threads.
+// Without OpenMP the whole range is processed by a single call f(0, n).
+//
+//   nth - number of threads requested for the parallel region
+//   n   - number of items
+//   f   - callable taking (int begin, int end); invoked once per thread
+template <typename func_t>
+inline void parallel_for(int nth, int n, const func_t& f) {
+#if defined(_OPENMP)
+#pragma omp parallel num_threads(nth)
+{
+    // partition by the size of the team that was actually created: num_threads()
+    // is only a request, and if fewer threads are granted, splitting by the
+    // requested count would leave the chunks of the missing threads unprocessed
+    const int n_team = omp_get_num_threads();
+    const int ith = omp_get_thread_num();
+    int tbegin, tend;
+    balance211(n, n_team, ith, tbegin, tend);
+    f(tbegin, tend);
+}
+#else
+    f(0, n);
+
+    GGML_UNUSED(nth);
+#endif
+}
+
+// Invokes f(begin, end) on the share of [0, n) that belongs to the calling thread,
+// as identified by params->ith out of params->nth.
+template <typename func_t>
+inline void parallel_for_ggml(const ggml_compute_params * params, int n, const func_t & f) {
+    const int n_workers = params->nth;
+    const int worker    = params->ith;
+
+    int first = 0;
+    int last  = 0;
+    balance211(n, n_workers, worker, first, last);
+
+    f(first, last);
+}
+
+// reports whether `type` is one of the quantized types that have AMX support
+inline bool qtype_has_amx_kernels(const enum ggml_type type) {
+    // TODO: fix padding for vnni format
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ4_XS:
+            return true;
+        default:
+            return false;
+    }
+}
+
+// ggml backend context: thread count plus an owned scratch buffer and its size
+struct ggml_backend_amx_context {
+    int n_threads{GGML_DEFAULT_N_THREADS};
+    std::unique_ptr<char[]> work_data{};
+    size_t work_size{0};
+};
--- a/ggml/src/ggml-cpu/amx/mmq.cpp
+++ b/ggml/src/ggml-cpu/amx/mmq.cpp
--- a/ggml/src/ggml-cpu/amx/mmq.d
+++ b/ggml/src/ggml-cpu/amx/mmq.d
@ -0,0 +1,7 @@
+ggml/src/ggml-cpu/amx/mmq.o: ggml/src/ggml-cpu/amx/mmq.cpp \
+  ggml/src/ggml-cpu/amx/amx.h ggml/include/ggml-backend.h \
+  ggml/include/ggml.h ggml/include/ggml-alloc.h \
+  ggml/src/ggml-cpu/ggml-cpu-impl.h ggml/src/ggml-impl.h \
+  ggml/src/ggml-cpu/amx/mmq.h ggml/src/ggml-cpu/amx/common.h \
+  ggml/src/ggml-cpu/ggml-cpu-quants.h ggml/src/ggml-common.h \
+  ggml/src/ggml-quants.h
--- a/ggml/src/ggml-cpu/amx/mmq.h
+++ b/ggml/src/ggml-cpu/amx/mmq.h
@ -0,0 +1,16 @@
+#pragma once
+#include "common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// returns the number of bytes that tensor occupies in an AMX buffer
+// (used as the get_alloc_size callback of the AMX buffer type)
+size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);
+
+// stores size bytes from data into tensor, converting them for the AMX kernels;
+// called by the AMX buffer for types reported by qtype_has_amx_kernels()
+void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+
+// computes the matrix multiplication described by dst for the calling thread (params->ith of params->nth)
+void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+
+#ifdef __cplusplus
+}
+#endif
--- a/ggml/src/ggml-cpu/cpu-feats-x86.cpp
+++ b/ggml/src/ggml-cpu/cpu-feats-x86.cpp
@ -0,0 +1,298 @@
+#include "ggml-cpu.h"
+#include "ggml-backend-impl.h"
+
+#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+#include <cstring>
+#include <vector>
+#include <bitset>
+#include <array>
+#include <string>
+
+// Snapshot of the x86 CPUID feature flags of the running processor.
+//
+// The constructor executes CPUID and stores the relevant registers in bitsets;
+// each accessor then tests a single bit. Bitset names encode where the value came
+// from: f_1_* is leaf 1, f_7_* is leaf 7 sub-leaf 0, f_7_1_eax is EAX of leaf 7
+// sub-leaf 1, and f_81_* is extended leaf 0x80000001. Some accessors are additionally
+// gated on the vendor (is_intel / is_amd).
+struct cpuid_x86 {
+    // leaf 1, ECX
+    bool SSE3(void) { return f_1_ecx[0]; }
+    bool PCLMULQDQ(void) { return f_1_ecx[1]; }
+    bool MONITOR(void) { return f_1_ecx[3]; }
+    bool SSSE3(void) { return f_1_ecx[9]; }
+    bool FMA(void) { return f_1_ecx[12]; }
+    bool CMPXCHG16B(void) { return f_1_ecx[13]; }
+    bool SSE41(void) { return f_1_ecx[19]; }
+    bool SSE42(void) { return f_1_ecx[20]; }
+    bool MOVBE(void) { return f_1_ecx[22]; }
+    bool POPCNT(void) { return f_1_ecx[23]; }
+    bool AES(void) { return f_1_ecx[25]; }
+    bool XSAVE(void) { return f_1_ecx[26]; }
+    bool OSXSAVE(void) { return f_1_ecx[27]; }
+    bool AVX(void) { return f_1_ecx[28]; }
+    bool F16C(void) { return f_1_ecx[29]; }
+    bool RDRAND(void) { return f_1_ecx[30]; }
+
+    // leaf 1, EDX
+    bool MSR(void) { return f_1_edx[5]; }
+    bool CX8(void) { return f_1_edx[8]; }
+    bool SEP(void) { return f_1_edx[11]; }
+    bool CMOV(void) { return f_1_edx[15]; }
+    bool CLFSH(void) { return f_1_edx[19]; }
+    bool MMX(void) { return f_1_edx[23]; }
+    bool FXSR(void) { return f_1_edx[24]; }
+    bool SSE(void) { return f_1_edx[25]; }
+    bool SSE2(void) { return f_1_edx[26]; }
+
+    // leaf 7 sub-leaf 0, EBX
+    bool FSGSBASE(void) { return f_7_ebx[0]; }
+    bool BMI1(void) { return f_7_ebx[3]; }
+    bool HLE(void) { return is_intel && f_7_ebx[4]; }
+    bool AVX2(void) { return f_7_ebx[5]; }
+    bool BMI2(void) { return f_7_ebx[8]; }
+    bool ERMS(void) { return f_7_ebx[9]; }
+    bool INVPCID(void) { return f_7_ebx[10]; }
+    bool RTM(void) { return is_intel && f_7_ebx[11]; }
+    bool AVX512F(void) { return f_7_ebx[16]; }
+    bool RDSEED(void) { return f_7_ebx[18]; }
+    bool ADX(void) { return f_7_ebx[19]; }
+    bool AVX512PF(void) { return f_7_ebx[26]; }
+    bool AVX512ER(void) { return f_7_ebx[27]; }
+    bool AVX512CD(void) { return f_7_ebx[28]; }
+    bool SHA(void) { return f_7_ebx[29]; }
+
+    // leaf 7 sub-leaf 0, ECX
+    bool PREFETCHWT1(void) { return f_7_ecx[0]; }
+
+    // extended leaf 0x80000001, ECX
+    bool LAHF(void) { return f_81_ecx[0]; }
+    bool LZCNT(void) { return is_intel && f_81_ecx[5]; }
+    bool ABM(void) { return is_amd && f_81_ecx[5]; }
+    bool SSE4a(void) { return is_amd && f_81_ecx[6]; }
+    bool XOP(void) { return is_amd && f_81_ecx[11]; }
+    bool TBM(void) { return is_amd && f_81_ecx[21]; }
+
+    // extended leaf 0x80000001, EDX
+    bool SYSCALL(void) { return is_intel && f_81_edx[11]; }
+    bool MMXEXT(void) { return is_amd && f_81_edx[22]; }
+    bool RDTSCP(void) { return is_intel && f_81_edx[27]; }
+    bool _3DNOWEXT(void) { return is_amd && f_81_edx[30]; }
+    bool _3DNOW(void) { return is_amd && f_81_edx[31]; }
+
+    // AVX-512 extensions and AVX-VNNI (leaf 7, sub-leaves 0 and 1)
+    bool AVX512_VBMI(void) { return f_7_ecx[1]; }
+    bool AVX512_VNNI(void) { return f_7_ecx[11]; }
+    bool AVX512_FP16(void) { return f_7_edx[23]; }
+    bool AVX512_BF16(void) { return f_7_1_eax[5]; }
+    bool AVX_VNNI(void) { return f_7_1_eax[4]; }
+
+    // AMX (leaf 7, sub-leaves 0 and 1)
+    bool AMX_TILE(void) { return f_7_edx[24]; }
+    bool AMX_INT8(void) { return f_7_edx[25]; }
+    bool AMX_FP16(void) { return f_7_1_eax[21]; }
+    bool AMX_BF16(void) { return f_7_edx[22]; }
+
+    // cpuid:   executes CPUID for leaf `eax` (sub-leaf 0 in the inline-asm variant)
+    // cpuidex: executes CPUID for leaf `eax`, sub-leaf `ecx`
+    // both store EAX, EBX, ECX, EDX into cpu_info[0..3]
+#ifdef _MSC_VER
+    static void cpuid(int cpu_info[4], int eax) {
+        __cpuid(cpu_info, eax);
+    }
+    static void cpuidex(int cpu_info[4], int eax, int ecx) {
+        __cpuidex(cpu_info, eax, ecx);
+    }
+#else
+    static void cpuid(int cpu_info[4], int eax) {
+        __asm__ __volatile__(
+            "cpuid"
+            : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+            : "a"(eax), "c"(0));
+    }
+    static void cpuidex(int cpu_info[4], int eax, int ecx) {
+        __asm__ __volatile__(
+            "cpuid"
+            : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+            : "a"(eax), "c"(ecx));
+    }
+#endif
+
+    // queries the processor and fills in vendor, brand and all feature bitsets
+    cpuid_x86() {
+        std::array<int, 4> cpui;
+        std::vector<std::array<int, 4>> data;
+
+        // calling __cpuid with 0x0 as the function_id argument
+        // gets the number of the highest valid function ID.
+        cpuid(cpui.data(), 0);
+        int n_ids = cpui[0];
+
+        for (int i = 0; i <= n_ids; ++i) {
+            cpuidex(cpui.data(), i, 0);
+            data.push_back(cpui);
+        }
+
+        // capture vendor string: the 12 characters are spread over EBX, EDX, ECX of leaf 0
+        // memcpy is used instead of storing through a reinterpret_cast<int *>, which would
+        // access the char buffer through an incompatible, possibly misaligned type
+        char vendor[0x20] = {};
+        std::memcpy(vendor,     &data[0][1], sizeof(int));
+        std::memcpy(vendor + 4, &data[0][3], sizeof(int));
+        std::memcpy(vendor + 8, &data[0][2], sizeof(int));
+        this->vendor = vendor;
+        if (this->vendor == "GenuineIntel") {
+            is_intel = true;
+        } else if (this->vendor == "AuthenticAMD") {
+            is_amd = true;
+        }
+
+        // load bitset with flags for function 0x00000001
+        if (n_ids >= 1) {
+            f_1_ecx = data[1][2];
+            f_1_edx = data[1][3];
+        }
+
+        // load bitset with flags for function 0x00000007
+        if (n_ids >= 7) {
+            f_7_ebx = data[7][1];
+            f_7_ecx = data[7][2];
+            f_7_edx = data[7][3];
+            // sub-leaf 1 carries the AVX-VNNI / AVX512-BF16 / AMX-FP16 bits
+            cpuidex(cpui.data(), 7, 1);
+            f_7_1_eax = cpui[0];
+        }
+
+        // calling __cpuid with 0x80000000 as the function_id argument
+        // gets the number of the highest valid extended ID.
+        cpuid(cpui.data(), 0x80000000);
+        unsigned int n_ex_ids = cpui[0];
+
+        std::vector<std::array<int, 4>> ext_data;
+        for (unsigned int i = 0x80000000; i <= n_ex_ids; ++i) {
+            cpuidex(cpui.data(), i, 0);
+            ext_data.push_back(cpui);
+        }
+
+        // load bitset with flags for function 0x80000001
+        if (n_ex_ids >= 0x80000001) {
+            f_81_ecx = ext_data[1][2];
+            f_81_edx = ext_data[1][3];
+        }
+
+        // interpret CPU brand string if reported (extended leaves 0x80000002..0x80000004, 16 bytes each)
+        char brand[0x40] = {};
+        if (n_ex_ids >= 0x80000004) {
+            std::memcpy(brand, ext_data[2].data(), sizeof(cpui));
+            std::memcpy(brand + 16, ext_data[3].data(), sizeof(cpui));
+            std::memcpy(brand + 32, ext_data[4].data(), sizeof(cpui));
+            this->brand = brand;
+        }
+    }
+
+    bool is_intel = false;      // vendor string is "GenuineIntel"
+    bool is_amd = false;        // vendor string is "AuthenticAMD"
+    std::string vendor;         // vendor string from leaf 0
+    std::string brand;          // brand string, empty when not reported
+    std::bitset<32> f_1_ecx;    // leaf 1, ECX
+    std::bitset<32> f_1_edx;    // leaf 1, EDX
+    std::bitset<32> f_7_ebx;    // leaf 7 sub-leaf 0, EBX
+    std::bitset<32> f_7_ecx;    // leaf 7 sub-leaf 0, ECX
+    std::bitset<32> f_7_edx;    // leaf 7 sub-leaf 0, EDX
+    std::bitset<32> f_7_1_eax;  // leaf 7 sub-leaf 1, EAX
+    std::bitset<32> f_81_ecx;   // leaf 0x80000001, ECX
+    std::bitset<32> f_81_edx;   // leaf 0x80000001, EDX
+};
+
+#if 0
+void test_x86_is() {
+    cpuid_x86 is;
+    printf("CPU Vendor: %s\n", is.vendor.c_str());
+    printf("Brand: %s\n", is.brand.c_str());
+    printf("is_intel: %d\n", is.is_intel);
+    printf("is_amd: %d\n", is.is_amd);
+    printf("sse3: %d\n", is.SSE3());
+    printf("pclmulqdq: %d\n", is.PCLMULQDQ());
+    printf("ssse3: %d\n", is.SSSE3());
+    printf("fma: %d\n", is.FMA());
+    printf("cmpxchg16b: %d\n", is.CMPXCHG16B());
+    printf("sse41: %d\n", is.SSE41());
+    printf("sse42: %d\n", is.SSE42());
+    printf("movbe: %d\n", is.MOVBE());
+    printf("popcnt: %d\n", is.POPCNT());
+    printf("aes: %d\n", is.AES());
+    printf("xsave: %d\n", is.XSAVE());
+    printf("osxsave: %d\n", is.OSXSAVE());
+    printf("avx: %d\n", is.AVX());
+    printf("f16c: %d\n", is.F16C());
+    printf("rdrand: %d\n", is.RDRAND());
+    printf("msr: %d\n", is.MSR());
+    printf("cx8: %d\n", is.CX8());
+    printf("sep: %d\n", is.SEP());
+    printf("cmov: %d\n", is.CMOV());
+    printf("clflush: %d\n", is.CLFSH());
+    printf("mmx: %d\n", is.MMX());
+    printf("fxsr: %d\n", is.FXSR());
+    printf("sse: %d\n", is.SSE());
+    printf("sse2: %d\n", is.SSE2());
+    printf("fsgsbase: %d\n", is.FSGSBASE());
+    printf("bmi1: %d\n", is.BMI1());
+    printf("hle: %d\n", is.HLE());
+    printf("avx2: %d\n", is.AVX2());
+    printf("bmi2: %d\n", is.BMI2());
+    printf("erms: %d\n", is.ERMS());
+    printf("invpcid: %d\n", is.INVPCID());
+    printf("rtm: %d\n", is.RTM());
+    printf("avx512f: %d\n", is.AVX512F());
+    printf("rdseed: %d\n", is.RDSEED());
+    printf("adx: %d\n", is.ADX());
+    printf("avx512pf: %d\n", is.AVX512PF());
+    printf("avx512er: %d\n", is.AVX512ER());
+    printf("avx512cd: %d\n", is.AVX512CD());
+    printf("sha: %d\n", is.SHA());
+    printf("prefetchwt1: %d\n", is.PREFETCHWT1());
+    printf("lahf: %d\n", is.LAHF());
+    printf("lzcnt: %d\n", is.LZCNT());
+    printf("abm: %d\n", is.ABM());
+    printf("sse4a: %d\n", is.SSE4a());
+    printf("xop: %d\n", is.XOP());
+    printf("tbm: %d\n", is.TBM());
+    printf("syscall: %d\n", is.SYSCALL());
+    printf("mmxext: %d\n", is.MMXEXT());
+    printf("rdtscp: %d\n", is.RDTSCP());
+    printf("3dnowext: %d\n", is._3DNOWEXT());
+    printf("3dnow: %d\n", is._3DNOW());
+    printf("avx512_vbmi: %d\n", is.AVX512_VBMI());
+    printf("avx512_vnni: %d\n", is.AVX512_VNNI());
+    printf("avx512_fp16: %d\n", is.AVX512_FP16());
+    printf("avx512_bf16: %d\n", is.AVX512_BF16());
+    printf("amx_tile: %d\n", is.AMX_TILE());
+    printf("amx_int8: %d\n", is.AMX_INT8());
+    printf("amx_fp16: %d\n", is.AMX_FP16());
+    printf("amx_bf16: %d\n", is.AMX_BF16());
+}
+#endif
+
+// Scores this build of the CPU backend against the processor it is running on.
+//
+// Returns 0 when the build uses an instruction set that the CPU does not report.
+// Otherwise returns the sum of the weights of the instruction sets the build uses,
+// where each set contributes one bit and more important sets use higher bits.
+static int ggml_backend_cpu_x86_score() {
+    // FIXME: this does not check for OS support
+
+    cpuid_x86 is;
+
+    // one row per instruction set:
+    //   built   - non-zero when this backend was compiled to use it
+    //   present - whether the running CPU reports it
+    //   bit     - score bit it contributes, or -1 when it is checked but not scored
+    struct feature {
+        int  built;
+        bool present;
+        int  bit;
+    };
+
+    const feature features[] = {
+        { ggml_cpu_has_fma(),         is.FMA(),          0 },
+        { ggml_cpu_has_f16c(),        is.F16C(),         1 },
+        { ggml_cpu_has_ssse3(),       is.SSSE3(),        2 },
+        { ggml_cpu_has_sse3(),        is.SSE3(),         3 },
+        { ggml_cpu_has_avx_vnni(),    is.AVX_VNNI(),     4 },
+        { ggml_cpu_has_avx(),         is.AVX(),          5 },
+        { ggml_cpu_has_avx2(),        is.AVX2(),         6 },
+        { ggml_cpu_has_avx512(),      is.AVX512F(),      7 },
+        { ggml_cpu_has_avx512_vbmi(), is.AVX512_VBMI(), -1 }, // not used
+        { ggml_cpu_has_avx512_bf16(), is.AVX512_BF16(),  9 },
+        { ggml_cpu_has_avx512_vnni(), is.AVX512_VNNI(), 10 },
+        { ggml_cpu_has_amx_int8(),    is.AMX_INT8(),    11 },
+    };
+
+    // if the CPU backend was built with any features not supported by the current CPU, it cannot be used
+    for (const feature & f : features) {
+        if (f.built && !f.present) {
+            return 0;
+        }
+    }
+
+    // calculate a backend score based on the supported features
+    // more important features have a higher weight
+    int score = 0;
+    for (const feature & f : features) {
+        if (f.bit >= 0) {
+            score += f.built << f.bit;
+        }
+    }
+
+    return score;
+}
+
+GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_x86_score)
+
+#endif // defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
@ -128,7 +128,7 @@ static inline __m512i sum_i16_pairs_int_32x16(const __m512i x) {
 }

 static inline __m512i mul_sum_us8_pairs_int32x16(const __m512i ax, const __m512i sy) {
-#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+#if defined(__AVX512VNNI__)
    const __m512i zero = _mm512_setzero_si512();
    return _mm512_dpbusd_epi32(zero, ax, sy);
 #else
--- a/ggml/src/ggml-cpu/ggml-cpu-impl.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h
@ -15,6 +15,18 @@
 extern "C" {
 #endif

+// parameters handed to a compute routine for one thread
+struct ggml_compute_params {
+    int ith; // index of the calling thread
+    int nth; // total number of threads
+
+    // work buffer shared by all threads: size and base pointer
+    size_t wsize;
+    void * wdata;
+
+    struct ggml_threadpool * threadpool;
+};
+
+
 #if defined(_MSC_VER)

 #define m512bh(p) p
@ -366,6 +378,9 @@ static __m256 __lasx_xvreplfr2vr_s(float val) {
 }
 #endif

+// TODO: move to ggml-threading
+void ggml_barrier(struct ggml_threadpool * tp);
+
 #ifdef __cplusplus
 }
 #endif
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@ -10,6 +10,7 @@
 #include "ggml-quants.h"
 #include "ggml-cpu-quants.h"
 #include "ggml-threading.h"
+#include "amx/amx.h"
 #include "ggml.h"

 #if defined(_MSC_VER) || defined(__MINGW32__)
@ -624,7 +625,7 @@ do {                                                                  \
    for (int i = 0; i < offset; ++i) {                                \
        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
    }                                                                 \
-    res = _mm512_reduce_add_ps(x[0]);                                 \
+    res = (ggml_float) _mm512_reduce_add_ps(x[0]);                    \
 } while (0)

 // TODO: is this optimal ?
@ -674,7 +675,7 @@ do {                                                              \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
    }                                                             \
-    res = _mm512_reduce_add_ps(x[0]);                             \
+    res = (ggml_float) _mm512_reduce_add_ps(x[0]);                \
 } while (0)

 #define GGML_F16_VEC                GGML_F32Cx16
@ -685,8 +686,8 @@ do {                                                              \
 #define GGML_F16_VEC_FMA            GGML_F32Cx16_FMA
 #define GGML_F16_VEC_ADD            GGML_F32Cx16_ADD
 #define GGML_F16_VEC_MUL            GGML_F32Cx16_MUL
-#define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE

+#define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE
 #elif defined(__AVX__)

 #define GGML_SIMD
@ -1178,28 +1179,28 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
 #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
 #define GGML_F32x4_ADD     __lsx_vfadd_s
 #define GGML_F32x4_MUL     __lsx_vfmul_s
-#define GGML_F32x4_REDUCE(res, x)                                 \
-{                                                                 \
-    int offset = GGML_F32_ARR >> 1;                               \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                     \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                     \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                     \
-    }                                                             \
-    __m128i tmp = __lsx_vsrli_d((__m128i)x[0], 32); \
-    tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, x[0]); \
-    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
-    const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
-    tmp = __lsx_vsrli_d((__m128i)t0, 32); \
-    tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, t0); \
-    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
-    res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0);        \
+#define GGML_F32x4_REDUCE(res, x)                                                     \
+{                                                                                     \
+    int offset = GGML_F32_ARR >> 1;                                                   \
+    for (int i = 0; i < offset; ++i) {                                                \
+        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                                    \
+    }                                                                                 \
+    offset >>= 1;                                                                     \
+    for (int i = 0; i < offset; ++i) {                                                \
+        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                                    \
+    }                                                                                 \
+    offset >>= 1;                                                                     \
+    for (int i = 0; i < offset; ++i) {                                                \
+        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                                    \
+    }                                                                                 \
+    __m128i tmp     = __lsx_vsrli_d((__m128i) x[0], 32);                              \
+    tmp             = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]);                    \
+    tmp             = __lsx_vpickev_w(__lsx_vldi(0), tmp);                            \
+    const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88);                                     \
+    tmp             = __lsx_vsrli_d((__m128i) t0, 32);                                \
+    tmp             = (__m128i) __lsx_vfadd_s((__m128) tmp, t0);                      \
+    tmp             = __lsx_vpickev_w(__lsx_vldi(0), tmp);                            \
+    res             = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
 }

 #define GGML_F32_VEC        GGML_F32x4
@ -1367,31 +1368,15 @@ struct ggml_compute_state {
    int ith;
 };

-struct ggml_compute_params {
-    // ith = thread index, nth = number of threads
-    int ith, nth;
-
-    // work buffer for all threads
-    size_t wsize;
-    void * wdata;
-
-    struct ggml_threadpool * threadpool;
-};
-
 //
 // fundamental operations
 //

 inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
 inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
 inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
 inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
 inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
 inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] + y[i]; }
 inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float   v) { for (int i = 0; i < n; ++i) z[i]  = x[i] + v;    }
 inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i] += x[i];        }
@ -2286,7 +2271,7 @@ struct ggml_state {

 static struct ggml_state g_state = {0};

-static void ggml_barrier(struct ggml_threadpool * tp) {
+void ggml_barrier(struct ggml_threadpool * tp) {
    int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
    if (n_threads == 1) {
        return;
@ -7455,6 +7440,13 @@ static void ggml_compute_forward_mul_mat(
        type = (enum ggml_type)(intptr_t)src0->extra;
    }

+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+    if (src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) {
+        ggml_backend_amx_mul_mat(params, dst);
+        return;
+    }
+#endif
+
    enum ggml_type           const vec_dot_type         = type_traits_cpu[type].vec_dot_type;
    ggml_from_float_t        const from_float           = type_traits_cpu[vec_dot_type].from_float;
    ggml_from_float_to_mat_t const from_float_to_mat    = type_traits_cpu[vec_dot_type].from_float_to_mat;
@ -13294,10 +13286,16 @@ struct ggml_cplan ggml_graph_plan(
                } break;
            case GGML_OP_MUL_MAT:
                {
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+                    if (node->src[0]->buffer && ggml_backend_amx_buft_is_amx(node->src[0]->buffer->buft)) {
+                        cur = ggml_backend_amx_desired_wsize(node);
+                    }
+#endif
                    const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type;

                    if (node->src[1]->type != vec_dot_type) {
-                        cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
+                        size_t cur2 = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
+                        cur = MAX(cur, cur2);
                    }
                } break;
            case GGML_OP_MUL_MAT_ID:
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@ -3,6 +3,7 @@
 #include "ggml-cpu.h"
 #include "ggml-cpu-aarch64.h"
 #include "ggml-impl.h"
+#include "amx/amx.h"
 #include <cctype>
 #include <string>
 #include <vector>
@ -134,12 +135,16 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backen
    static std::vector<ggml_backend_buffer_type_t> bufts = []() {
        std::vector<ggml_backend_buffer_type_t> bufts;

-#ifdef GGML_USE_CPU_HBM
-        bufts.push_back(ggml_backend_cpu_hbm_buffer_type());
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+        if (ggml_backend_amx_buffer_type()) {
+            bufts.push_back(ggml_backend_amx_buffer_type());
+        }
 #endif

 #ifdef GGML_USE_CPU_AARCH64
-        bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
+        if (ggml_backend_cpu_aarch64_buffer_type()) {
+            bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
+        }
 #endif

        bufts.push_back(NULL);
@ -456,12 +461,27 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
    const struct ggml_tensor * src0 = op->src[0];
    const struct ggml_tensor * src1 = op->src[1];

+    if (op->op == GGML_OP_NONE || op->op == GGML_OP_RESHAPE || op->op == GGML_OP_VIEW || op->op == GGML_OP_PERMUTE || op->op == GGML_OP_TRANSPOSE) {
+        return true;
+    }
+
    if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
        if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) {
            return false;
        }
    }

+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+    if (src0 && src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) {
+        return ggml_backend_amx_device_supports_op(op);
+    }
+    for (int i = 1; i < GGML_MAX_SRC; i++) {
+        if (op->src[i] && op->src[i]->buffer && ggml_backend_amx_buft_is_amx(op->src[i]->buffer->buft)) {
+            return false;
+        }
+    }
+#endif
+
    for (int i = 1; i < GGML_MAX_SRC; i++) {
        if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) {
            return false;
@ -491,7 +511,19 @ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
 }
 
 static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);
+    // host buffer types and the CPU aarch64 buffer type are accepted unconditionally
+    if (ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft)) {
+        return true;
+    }
+
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+    // the AMX buffer type is only checked when the build targets AMX-INT8 and AVX512-VNNI
+    if (ggml_backend_amx_buft_is_amx(buft)) {
+        return true;
+    }
+#endif
+
+    return false;
 
     GGML_UNUSED(dev);
 }
--- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
@ -50,8 +50,7 @@

 #include "sgemm.h"
 #include "ggml-impl.h"
-// hack until moved into the CPU backend
-#include "../ggml-cpu-impl.h"
+#include "ggml-cpu-impl.h"
 #include "ggml-quants.h"

 #ifdef _MSC_VER
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@ -30,11 +30,16 @
 extern "C" {
 #endif
 
-#undef MIN
-#undef MAX
+// keep a MIN that was already defined before this point; only supply the fallback when it is missing
+// note: the macro expands each argument more than once, so do not pass expressions with side effects
+#if !defined(MIN)
+#    define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
 
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
+// same policy for MAX: respect an existing definition, otherwise define the fallback
+#if !defined(MAX)
+#    define MAX(a, b) ((a) > (b) ? (a) : (b))
+#endif
 
 // required for mmap as gguf only guarantees 32-byte alignment
 #define TENSOR_ALIGNMENT 32
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@ -2911,7 +2911,6 @@ static void ggml_metal_encode_node(
            } break;
        case GGML_OP_GROUP_NORM:
            {
-                GGML_ASSERT(ne00 % 4 == 0);
                GGML_ASSERT(ggml_is_contiguous(src0));

                float eps;
--- a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
@ -3,5 +3,5 @@ find_package (Threads REQUIRED)
 set(TARGET vulkan-shaders-gen)
 add_executable(${TARGET} vulkan-shaders-gen.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
 target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)