ggml : move AMX to the CPU backend (llama/10570)

ggml : automatic selection of best CPU backend (llama/10606)
This commit is contained in:
Diego Devesa 2024-12-03 20:22:12 +02:00 committed by Georgi Gerganov
parent 4d73962da4
commit 3daeacad24
22 changed files with 3546 additions and 231 deletions

View File

@ -96,6 +96,7 @@ option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON) option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
option(GGML_AVX "ggml: enable AVX" ${INS_ENB}) option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB}) option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
option(GGML_AVX512 "ggml: enable AVX512" OFF) option(GGML_AVX512 "ggml: enable AVX512" OFF)
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF) option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
@ -161,7 +162,6 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)") set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
option(GGML_OPENMP "ggml: use OpenMP" ON) option(GGML_OPENMP "ggml: use OpenMP" ON)
option(GGML_RPC "ggml: use RPC" OFF) option(GGML_RPC "ggml: use RPC" OFF)
option(GGML_AMX "ggml: use AMX" OFF)
option(GGML_SYCL "ggml: use SYCL" OFF) option(GGML_SYCL "ggml: use SYCL" OFF)
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING set (GGML_SYCL_TARGET "INTEL" CACHE STRING

View File

@ -261,21 +261,15 @@ function(ggml_add_backend backend)
if (${backend_id}) if (${backend_id})
string(TOLOWER "ggml-${backend}" backend_target) string(TOLOWER "ggml-${backend}" backend_target)
add_subdirectory(${backend_target}) add_subdirectory(${backend_target})
# check again in case the backend disabled itself
# note that this should NOT be the normal behavior, in case of errors the backend should fail the build
# however, currently it is necessary for AMX, since it is enabled by default on llama.cpp
if (${backend_id})
message(STATUS "Including ${backend} backend") message(STATUS "Including ${backend} backend")
if (NOT GGML_BACKEND_DL) if (NOT GGML_BACKEND_DL)
string(TOUPPER "GGML_USE_${backend}" backend_use) string(TOUPPER "GGML_USE_${backend}" backend_use)
target_compile_definitions(ggml PUBLIC ${backend_use}) target_compile_definitions(ggml PUBLIC ${backend_use})
endif() endif()
endif() endif()
endif()
endfunction() endfunction()
ggml_add_backend(CPU) ggml_add_backend(CPU)
ggml_add_backend(AMX)
ggml_add_backend(BLAS) ggml_add_backend(BLAS)
ggml_add_backend(CANN) ggml_add_backend(CANN)
ggml_add_backend(CUDA) ggml_add_backend(CUDA)
@ -289,7 +283,7 @@ ggml_add_backend(Vulkan)
foreach (target ggml-base ggml) foreach (target ggml-base ggml)
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>) target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
target_compile_features (${target} PRIVATE c_std_11) # don't bump target_compile_features (${target} PRIVATE c_std_11 cxx_std_17) # don't bump
endforeach() endforeach()
target_link_libraries(ggml-base PRIVATE Threads::Threads) target_link_libraries(ggml-base PRIVATE Threads::Threads)

View File

@ -211,27 +211,45 @@ extern "C" {
GGML_API void ggml_backend_device_register(ggml_backend_dev_t device); GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
// Add backend dynamic loading support to the backend // Add backend dynamic loading support to the backend
typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
#ifdef GGML_BACKEND_DL // Initialize the backend
#ifdef __cplusplus typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
# define GGML_BACKEND_DL_IMPL(reg_fn) \ // Optional: obtain a score for the backend based on the system configuration
// Higher scores are preferred, 0 means the backend is not supported in the current system
typedef int (*ggml_backend_score_t)(void);
#ifdef GGML_BACKEND_DL
# ifdef __cplusplus
# define GGML_BACKEND_DL_IMPL(reg_fn) \
extern "C" { \ extern "C" { \
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
} \ } \
ggml_backend_reg_t ggml_backend_init(void) { \ ggml_backend_reg_t ggml_backend_init(void) { \
return reg_fn(); \ return reg_fn(); \
} }
#else # define GGML_BACKEND_DL_SCORE_IMPL(score_fn) \
# define GGML_BACKEND_DL_IMPL(reg_fn) \ extern "C" { \
GGML_BACKEND_API int ggml_backend_score(void); \
} \
int ggml_backend_score(void) { \
return score_fn(); \
}
# else
# define GGML_BACKEND_DL_IMPL(reg_fn) \
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
ggml_backend_reg_t ggml_backend_init(void) { \ ggml_backend_reg_t ggml_backend_init(void) { \
return reg_fn(); \ return reg_fn(); \
} }
#endif # define GGML_BACKEND_DL_SCORE_IMPL(score_fn) \
#else GGML_BACKEND_API int ggml_backend_score(void); \
# define GGML_BACKEND_DL_IMPL(reg_fn) int ggml_backend_score(void) { \
#endif return score_fn(); \
}
# endif
#else
# define GGML_BACKEND_DL_IMPL(reg_fn)
# define GGML_BACKEND_DL_SCORE_IMPL(score_fn)
#endif
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -2,8 +2,13 @@
#include "ggml-backend.h" #include "ggml-backend.h"
#include "ggml-impl.h" #include "ggml-impl.h"
#include <algorithm> #include <algorithm>
#include <codecvt>
#include <cstring> #include <cstring>
#include <filesystem>
#include <locale>
#include <memory>
#include <string> #include <string>
#include <type_traits>
#include <vector> #include <vector>
#ifdef _WIN32 #ifdef _WIN32
@ -49,10 +54,6 @@
#include "ggml-rpc.h" #include "ggml-rpc.h"
#endif #endif
#ifdef GGML_USE_AMX
# include "ggml-amx.h"
#endif
#ifdef GGML_USE_CANN #ifdef GGML_USE_CANN
#include "ggml-cann.h" #include "ggml-cann.h"
#endif #endif
@ -61,9 +62,71 @@
#include "ggml-kompute.h" #include "ggml-kompute.h"
#endif #endif
#ifdef _WIN32
using dl_handle = std::remove_pointer_t<HMODULE>;
struct dl_handle_deleter {
void operator()(HMODULE handle) {
FreeLibrary(handle);
}
};
static dl_handle * dl_load_library(const std::wstring & path) {
// suppress error dialogs for missing DLLs
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
HMODULE handle = LoadLibraryW(path.c_str());
SetErrorMode(old_mode);
return handle;
}
static dl_handle * dl_load_library(const std::string & path) {
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
return dl_load_library(converter.from_bytes(path));
}
static void * dl_get_sym(dl_handle * handle, const char * name) {
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
void * p = (void *) GetProcAddress(handle, name);
SetErrorMode(old_mode);
return p;
}
#else
using dl_handle = void;
struct dl_handle_deleter {
void operator()(void * handle) {
dlclose(handle);
}
};
static void * dl_load_library(const std::string & path) {
dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
return handle;
}
static void * dl_get_sym(dl_handle * handle, const char * name) {
return dlsym(handle, name);
}
#endif
using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
struct ggml_backend_reg_entry { struct ggml_backend_reg_entry {
ggml_backend_reg_t reg; ggml_backend_reg_t reg;
void * handle; dl_handle_ptr handle;
}; };
struct ggml_backend_registry { struct ggml_backend_registry {
@ -92,9 +155,6 @@ struct ggml_backend_registry {
#ifdef GGML_USE_RPC #ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg()); register_backend(ggml_backend_rpc_reg());
#endif #endif
#ifdef GGML_USE_AMX
register_backend(ggml_backend_amx_reg());
#endif
#ifdef GGML_USE_KOMPUTE #ifdef GGML_USE_KOMPUTE
register_backend(ggml_backend_kompute_reg()); register_backend(ggml_backend_kompute_reg());
#endif #endif
@ -104,13 +164,16 @@ struct ggml_backend_registry {
} }
~ggml_backend_registry() { ~ggml_backend_registry() {
while (!backends.empty()) { // FIXME: backends cannot be safely unloaded without a function to destroy all the backend resources,
// use silent since the log system may have been destroyed at this point // since backend threads may still be running and accessing resources from the dynamic library
unload_backend(backends.back().reg, true); for (auto & entry : backends) {
if (entry.handle) {
entry.handle.release(); // NOLINT
}
} }
} }
void register_backend(ggml_backend_reg_t reg, void * handle = nullptr) { void register_backend(ggml_backend_reg_t reg, dl_handle_ptr handle = nullptr) {
if (!reg) { if (!reg) {
return; return;
} }
@ -119,7 +182,7 @@ struct ggml_backend_registry {
GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
#endif #endif
backends.push_back({ reg, handle }); backends.push_back({ reg, std::move(handle) });
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) { for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
register_device(ggml_backend_reg_dev_get(reg, i)); register_device(ggml_backend_reg_dev_get(reg, i));
} }
@ -133,54 +196,31 @@ struct ggml_backend_registry {
} }
ggml_backend_reg_t load_backend(const char * path, bool silent) { ggml_backend_reg_t load_backend(const char * path, bool silent) {
#ifdef _WIN32 dl_handle_ptr handle { dl_load_library(path) };
// suppress error dialogs for missing DLLs
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
HMODULE handle = LoadLibraryA(path);
if (!handle) { if (!handle) {
if (!silent) { if (!silent) {
GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError()); GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
} }
SetErrorMode(old_mode);
return nullptr; return nullptr;
} }
ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init"); auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
if (score_fn && score_fn() == 0) {
SetErrorMode(old_mode);
if (!backend_init) {
if (!silent) { if (!silent) {
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError()); GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
} }
FreeLibrary(handle);
return nullptr; return nullptr;
} }
#else
void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
if (!handle) { auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
if (!backend_init_fn) {
if (!silent) { if (!silent) {
GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror()); GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path);
} }
return nullptr; return nullptr;
} }
auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init"); ggml_backend_reg_t reg = backend_init_fn();
if (!backend_init) {
if (!silent) {
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror());
}
dlclose(handle);
return nullptr;
}
#endif
ggml_backend_reg_t reg = backend_init();
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) { if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
if (!silent) { if (!silent) {
if (!reg) { if (!reg) {
@ -190,22 +230,19 @@ struct ggml_backend_registry {
__func__, path, reg->api_version, GGML_BACKEND_API_VERSION); __func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
} }
} }
#ifdef _WIN32
FreeLibrary(handle);
#else
dlclose(handle);
#endif
return nullptr; return nullptr;
} }
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path); GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
register_backend(reg, handle);
register_backend(reg, std::move(handle));
return reg; return reg;
} }
void unload_backend(ggml_backend_reg_t reg, bool silent) { void unload_backend(ggml_backend_reg_t reg, bool silent) {
auto it = std::find_if(backends.begin(), backends.end(), auto it = std::find_if(backends.begin(), backends.end(),
[reg](ggml_backend_reg_entry entry) { return entry.reg == reg; }); [reg](const ggml_backend_reg_entry & entry) { return entry.reg == reg; });
if (it == backends.end()) { if (it == backends.end()) {
if (!silent) { if (!silent) {
@ -224,15 +261,6 @@ struct ggml_backend_registry {
[reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }), [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
devices.end()); devices.end());
// unload library
if (it->handle) {
#ifdef _WIN32
FreeLibrary((HMODULE) it->handle);
#else
dlclose(it->handle);
#endif
}
// remove backend // remove backend
backends.erase(it); backends.erase(it);
} }
@ -348,12 +376,7 @@ void ggml_backend_unload(ggml_backend_reg_t reg) {
get_reg().unload_backend(reg, true); get_reg().unload_backend(reg, true);
} }
void ggml_backend_load_all() { static std::string get_executable_path() {
std::vector<std::string> search_prefix;
// add the executable directory to the search path
// FIXME: this is convenient for development, but it should probably be disabled in production
#if defined(__APPLE__) #if defined(__APPLE__)
// get executable path // get executable path
std::vector<char> path; std::vector<char> path;
@ -371,7 +394,7 @@ void ggml_backend_load_all() {
if (last_slash != std::string::npos) { if (last_slash != std::string::npos) {
base_path = base_path.substr(0, last_slash); base_path = base_path.substr(0, last_slash);
} }
search_prefix.push_back(base_path + "/"); return base_path + "/";
#elif defined(__linux__) #elif defined(__linux__)
std::string base_path = "."; std::string base_path = ".";
std::vector<char> path(1024); std::vector<char> path(1024);
@ -393,38 +416,104 @@ void ggml_backend_load_all() {
path.resize(path.size() * 2); path.resize(path.size() * 2);
} }
search_prefix.push_back(base_path + "/"); return base_path + "/";
#elif defined(_WIN32)
std::vector<char> path(MAX_PATH);
DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
if (len == 0) {
return "";
}
std::string base_path(path.data(), len);
// remove executable name
auto last_slash = base_path.find_last_of('\\');
if (last_slash != std::string::npos) {
base_path = base_path.substr(0, last_slash);
}
return base_path + "\\";
#endif #endif
}
auto & reg = get_reg();
static std::string backend_filename_prefix() {
auto try_load = [&](const std::string & name) { #ifdef _WIN32
std::string os_name; return "ggml-";
#ifdef _WIN32 #else
os_name = "ggml-" + name + ".dll"; return "libggml-";
#else #endif
os_name = "libggml-" + name + ".so"; }
#endif
if (reg.load_backend(os_name.c_str(), true)) { static std::string backend_filename_suffix() {
return; #ifdef _WIN32
} return ".dll";
for (const auto & prefix : search_prefix) { #else
if (reg.load_backend((prefix + os_name).c_str(), true)) { return ".so";
return; #endif
} }
}
}; static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) {
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
try_load("amx"); // TODO: search system paths
try_load("blas"); std::vector<std::string> search_paths = { "./", get_executable_path() };
try_load("cann"); std::string file_prefix = backend_filename_prefix() + name + "-";
try_load("cuda");
try_load("hip"); int best_score = 0;
try_load("kompute"); std::string best_path;
try_load("metal");
try_load("rpc"); namespace fs = std::filesystem;
try_load("sycl"); for (const auto & search_path : search_paths) {
try_load("vulkan"); if (!fs::exists(search_path)) {
try_load("musa"); continue;
try_load("cpu"); }
for (const auto & entry : fs::directory_iterator(search_path)) {
if (entry.is_regular_file()) {
std::string filename = entry.path().filename().string();
std::string ext = entry.path().extension().string();
if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
if (!handle && !silent) {
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
}
if (handle) {
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
if (score_fn) {
int s = score_fn();
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
#endif
if (s > best_score) {
best_score = s;
best_path = entry.path().string();
}
}
}
}
}
}
}
if (best_score == 0) {
// try to load the base backend
for (const auto & search_path : search_paths) {
std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix();
if (fs::exists(path)) {
return get_reg().load_backend(path.c_str(), silent);
}
}
return nullptr;
}
return get_reg().load_backend(best_path.c_str(), silent);
}
void ggml_backend_load_all() {
ggml_backend_load_best("blas", true);
ggml_backend_load_best("cann", true);
ggml_backend_load_best("cuda", true);
ggml_backend_load_best("hip", true);
ggml_backend_load_best("kompute", true);
ggml_backend_load_best("metal", true);
ggml_backend_load_best("rpc", true);
ggml_backend_load_best("sycl", true);
ggml_backend_load_best("vulkan", true);
ggml_backend_load_best("musa", true);
ggml_backend_load_best("cpu", true);
} }

View File

@ -742,7 +742,8 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) { if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
// since the tensor is pre-allocated, it cannot be moved to another backend // since the tensor is pre-allocated, it cannot be moved to another backend
GGML_ABORT("pre-allocated tensor (%s) in a backend that cannot run the operation", tensor->name); ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ABORT("pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)", tensor->name, ggml_backend_buffer_name(buffer), ggml_op_name(tensor->op));
} }
// graph input // graph input

View File

@ -1,12 +1,20 @@
ggml_add_backend_library(ggml-cpu ggml_add_backend_library(ggml-cpu)
list (APPEND GGML_CPU_SOURCES
ggml-cpu.c ggml-cpu.c
ggml-cpu.cpp ggml-cpu.cpp
ggml-cpu-aarch64.c ggml-cpu-aarch64.c
ggml-cpu-aarch64.h ggml-cpu-aarch64.h
ggml-cpu-quants.c ggml-cpu-quants.c
ggml-cpu-quants.h ggml-cpu-quants.h
amx/amx.cpp
amx/amx.h
amx/mmq.cpp
amx/mmq.h
ggml-cpu-impl.h
) )
target_compile_features(ggml-cpu PRIVATE c_std_11 cxx_std_17)
target_include_directories(ggml-cpu PRIVATE .) target_include_directories(ggml-cpu PRIVATE .)
if (APPLE AND GGML_ACCELERATE) if (APPLE AND GGML_ACCELERATE)
@ -14,9 +22,9 @@ if (APPLE AND GGML_ACCELERATE)
if (ACCELERATE_FRAMEWORK) if (ACCELERATE_FRAMEWORK)
message(STATUS "Accelerate framework found") message(STATUS "Accelerate framework found")
add_compile_definitions(GGML_USE_ACCELERATE) target_compile_definitions(ggml-cpu PRIVATE GGML_USE_ACCELERATE)
add_compile_definitions(ACCELERATE_NEW_LAPACK) target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_NEW_LAPACK)
add_compile_definitions(ACCELERATE_LAPACK_ILP64) target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_LAPACK_ILP64)
target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK}) target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK})
else() else()
@ -29,15 +37,9 @@ if (GGML_OPENMP)
if (OpenMP_FOUND) if (OpenMP_FOUND)
message(STATUS "OpenMP found") message(STATUS "OpenMP found")
add_compile_definitions(GGML_USE_OPENMP) target_compile_definitions(ggml-cpu PRIVATE GGML_USE_OPENMP)
target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
# FIXME: should be replaced with a compiler id check
#if (GGML_MUSA)
# list(APPEND GGML_CPU_EXTRA_INCLUDES "/usr/lib/llvm-14/lib/clang/14.0.0/include")
# list(APPEND GGML_CPU_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so")
#endif()
else() else()
message(WARNING "OpenMP not found") message(WARNING "OpenMP not found")
endif() endif()
@ -46,9 +48,9 @@ endif()
if (GGML_LLAMAFILE) if (GGML_LLAMAFILE)
message(STATUS "Using llamafile") message(STATUS "Using llamafile")
add_compile_definitions(GGML_USE_LLAMAFILE) target_compile_definitions(ggml-cpu PRIVATE GGML_USE_LLAMAFILE)
target_sources(ggml-cpu PRIVATE list(APPEND GGML_CPU_SOURCES
llamafile/sgemm.cpp llamafile/sgemm.cpp
llamafile/sgemm.h) llamafile/sgemm.h)
endif() endif()
@ -58,7 +60,7 @@ if (GGML_CPU_HBM)
message(STATUS "Using memkind for CPU HBM") message(STATUS "Using memkind for CPU HBM")
add_compile_definitions(GGML_USE_CPU_HBM) target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_HBM)
target_link_libraries(ggml-cpu PUBLIC memkind) target_link_libraries(ggml-cpu PUBLIC memkind)
endif() endif()
@ -72,16 +74,16 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
message(STATUS "ARM detected") message(STATUS "ARM detected")
if (MSVC) if (MSVC)
add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead
add_compile_definitions(__ARM_NEON) list(APPEND ARCH_DEFINITIONS __ARM_NEON)
add_compile_definitions(__ARM_FEATURE_FMA) list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA)
set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS}) set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2") string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
if (GGML_COMPILER_SUPPORT_DOTPROD) if (GGML_COMPILER_SUPPORT_DOTPROD)
add_compile_definitions(__ARM_FEATURE_DOTPROD) list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
message(STATUS "ARM feature DOTPROD enabled") message(STATUS "ARM feature DOTPROD enabled")
endif () endif ()
@ -89,14 +91,14 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
if (GGML_COMPILER_SUPPORT_MATMUL_INT8) if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
add_compile_definitions(__ARM_FEATURE_MATMUL_INT8) list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
message(STATUS "ARM feature MATMUL_INT8 enabled") message(STATUS "ARM feature MATMUL_INT8 enabled")
endif () endif ()
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled") message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled")
endif () endif ()
@ -118,7 +120,7 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
if (GGML_COMPILER_SUPPORT_DOTPROD) if (GGML_COMPILER_SUPPORT_DOTPROD)
set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod") set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
add_compile_definitions(__ARM_FEATURE_DOTPROD) list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
message(STATUS "ARM feature DOTPROD enabled") message(STATUS "ARM feature DOTPROD enabled")
endif () endif ()
@ -131,7 +133,7 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
if (GGML_COMPILER_SUPPORT_MATMUL_INT8) if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm") set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
add_compile_definitions(__ARM_FEATURE_MATMUL_INT8) list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
message(STATUS "ARM feature MATMUL_INT8 enabled") message(STATUS "ARM feature MATMUL_INT8 enabled")
endif () endif ()
@ -175,7 +177,6 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
if (MSVC) if (MSVC)
# instruction set detection for MSVC only # instruction set detection for MSVC only
if (GGML_NATIVE) if (GGML_NATIVE)
# TODO: improve, should not reference files from the parent folder
include(cmake/FindSIMD.cmake) include(cmake/FindSIMD.cmake)
endif () endif ()
if (GGML_AVX512) if (GGML_AVX512)
@ -185,43 +186,43 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
# macros corresponding to the extensions. # macros corresponding to the extensions.
# Do it manually. # Do it manually.
if (GGML_AVX512_VBMI) if (GGML_AVX512_VBMI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>) list(APPEND ARCH_DEFINITIONS __AVX512VBMI__)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang") if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
list(APPEND ARCH_FLAGS -mavx512vbmi) list(APPEND ARCH_FLAGS -mavx512vbmi)
endif() endif()
endif() endif()
if (GGML_AVX512_VNNI) if (GGML_AVX512_VNNI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>) list(APPEND ARCH_DEFINITIONS __AVX512VNNI__)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang") if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
list(APPEND ARCH_FLAGS -mavx512vnni) list(APPEND ARCH_FLAGS -mavx512vnni)
endif() endif()
endif() endif()
if (GGML_AVX512_BF16) if (GGML_AVX512_BF16)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>) list(APPEND ARCH_DEFINITIONS __AVX512BF16__)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang") if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
list(APPEND ARCH_FLAGS -mavx512bf16) list(APPEND ARCH_FLAGS -mavx512bf16)
endif() endif()
endif() endif()
if (GGML_AMX_TILE) if (GGML_AMX_TILE)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>) list(APPEND ARCH_DEFINITIONS __AMX_TILE__)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
endif() endif()
if (GGML_AMX_INT8) if (GGML_AMX_INT8)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>) list(APPEND ARCH_DEFINITIONS __AMX_INT8__)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
endif() endif()
if (GGML_AMX_BF16) if (GGML_AMX_BF16)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>) list(APPEND ARCH_DEFINITIONS __AMX_BF16__)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
endif() endif()
elseif (GGML_AVX2) elseif (GGML_AVX2)
list(APPEND ARCH_FLAGS /arch:AVX2) list(APPEND ARCH_FLAGS /arch:AVX2)
elseif (GGML_AVX) elseif (GGML_AVX)
list(APPEND ARCH_FLAGS /arch:AVX) list(APPEND ARCH_FLAGS /arch:AVX)
endif() endif()
if (GGML_AVX_VNNI)
list(APPEND ARCH_DEFINITIONS __AVXVNNI__)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
list(APPEND ARCH_FLAGS -mavxvnni)
endif()
endif()
else() else()
if (GGML_NATIVE) if (GGML_NATIVE)
list(APPEND ARCH_FLAGS -march=native) list(APPEND ARCH_FLAGS -march=native)
@ -238,6 +239,9 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
if (GGML_AVX2) if (GGML_AVX2)
list(APPEND ARCH_FLAGS -mavx2) list(APPEND ARCH_FLAGS -mavx2)
endif() endif()
if (GGML_AVX_VNNI)
list(APPEND ARCH_FLAGS -mavxvnni)
endif()
if (GGML_AVX512) if (GGML_AVX512)
list(APPEND ARCH_FLAGS -mavx512f) list(APPEND ARCH_FLAGS -mavx512f)
list(APPEND ARCH_FLAGS -mavx512dq) list(APPEND ARCH_FLAGS -mavx512dq)
@ -276,7 +280,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
list(APPEND ARCH_FLAGS -mcpu=powerpc64le) list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
else() else()
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) # TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
endif() endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
message(STATUS "loongarch64 detected") message(STATUS "loongarch64 detected")
@ -299,11 +303,16 @@ endif()
if (GGML_CPU_AARCH64) if (GGML_CPU_AARCH64)
message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels") message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels")
add_compile_definitions(GGML_USE_CPU_AARCH64) target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_AARCH64)
endif() endif()
target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>") target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES})
target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>") set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS "${ARCH_FLAGS}")
set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}")
# the feature detection code must be compiled without any architecture flags
target_sources(ggml-cpu PRIVATE cpu-feats-x86.cpp)
# target_sources(ggml-cpu PRIVATE cpu-feats-arm.cpp) # TODO: ARM feature detection
if (EMSCRIPTEN) if (EMSCRIPTEN)
set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128") set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")

View File

@ -0,0 +1,196 @@
#include "amx.h"
#include "common.h"
#include "mmq.h"
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
#if defined(__gnu_linux__)
#include <sys/syscall.h>
#include <unistd.h>
#endif
#include <cstdlib>
#include <cstring>
#include <memory>
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
// AMX buffer interface
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free(buffer->context);
}
static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
return (void *)(buffer->context);
}
static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
memset((char *)tensor->data + offset, value, size);
GGML_UNUSED(buffer);
}
static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
if (qtype_has_amx_kernels(tensor->type)) {
ggml_backend_amx_convert_weight(tensor, data, offset, size);
} else {
memcpy((char *)tensor->data + offset, data, size);
}
GGML_UNUSED(buffer);
}
static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
memcpy(data, (const char *)tensor->data + offset, size);
GGML_UNUSED(buffer);
}
static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
if (ggml_backend_buffer_is_host(src->buffer)) {
if (qtype_has_amx_kernels(src->type)) {
ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_nbytes(dst));
} else {
memcpy(dst->data, src->data, ggml_nbytes(src));
}
return true;
}
return false;
GGML_UNUSED(buffer);
}
static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
memset(buffer->context, value, buffer->size);
}
static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
/* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
/* .get_base = */ ggml_backend_amx_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_amx_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_amx_buffer_get_tensor,
/* .cpy_tensor = */ ggml_backend_amx_buffer_cpy_tensor,
/* .clear = */ ggml_backend_amx_buffer_clear,
/* .reset = */ NULL,
};
static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return "AMX";
GGML_UNUSED(buft);
}
static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
void * data = aligned_alloc(TENSOR_ALIGNMENT, size);
if (data == NULL) {
fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
return NULL;
}
return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
}
static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
return TENSOR_ALIGNMENT;
GGML_UNUSED(buft);
}
static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
return ggml_backend_amx_get_alloc_size(tensor);
GGML_UNUSED(buft);
}
static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return false;
GGML_UNUSED(buft);
}
#define ARCH_GET_XCOMP_PERM 0x1022
#define ARCH_REQ_XCOMP_PERM 0x1023
#define XFEATURE_XTILECFG 17
#define XFEATURE_XTILEDATA 18
static bool ggml_amx_init() {
#if defined(__gnu_linux__)
if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
fprintf(stderr, "AMX is not ready to be used!\n");
return false;
}
return true;
#elif defined(_WIN32)
return true;
#endif
}
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
/* .iface = */ {
/* .get_name = */ ggml_backend_amx_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
/* .is_host = */ ggml_backend_amx_buffer_type_is_host,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
/* .context = */ NULL,
};
if (!ggml_amx_init()) {
return NULL;
}
return &ggml_backend_buffer_type_amx;
}
bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name;
}
bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op) {
// handle only 2d gemm for now
auto is_contiguous_2d = [](const struct ggml_tensor * t) {
return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
};
switch (op->op) {
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
return true;
case GGML_OP_MUL_MAT: {
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * src1 = op->src[1];
const enum ggml_type type = src0->type;
const int64_t ne0 = op->ne[0];
// amx kernels enables for Q4_0, Q4_1, Q8_0, F16
// Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);
bool can_use_amx =
is_contiguous_2d(src0) && // src0 must be contiguous
is_contiguous_2d(src1) && // src1 must be contiguous
src1->type == GGML_TYPE_F32 && // src1 must be float32
has_amx_kernels && // with amx kernel impls
ne0 % (TILE_N * 2) == 0; // out_features is 32x
return can_use_amx;
}
default:
return false;
}
}
#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)

View File

@ -0,0 +1,6 @@
ggml/src/ggml-cpu/amx/amx.o: ggml/src/ggml-cpu/amx/amx.cpp \
ggml/src/ggml-cpu/amx/amx.h ggml/include/ggml-backend.h \
ggml/include/ggml.h ggml/include/ggml-alloc.h \
ggml/src/ggml-cpu/ggml-cpu-impl.h ggml/src/ggml-impl.h \
ggml/src/ggml-cpu/amx/common.h ggml/src/ggml-cpu/amx/mmq.h \
ggml/src/ggml-backend-impl.h ggml/include/ggml-cpu.h

View File

@ -0,0 +1,20 @@
#include "ggml-backend.h"
#include "ggml-cpu-impl.h"
#ifdef __cplusplus
extern "C" {
#endif
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft);
bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op);
void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);
#endif
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,100 @@
#pragma once
#include "ggml.h"
#include "ggml-cpu-impl.h"
#include <algorithm>
#include <memory>
#include <type_traits>
#if defined(_OPENMP)
#include <omp.h>
#endif
#define TILE_M 16
#define TILE_N 16
#define TILE_K 32
#define VNNI_BLK 4
#define AMX_BLK_SIZE 32
#define TMM0 0
#define TMM1 1
#define TMM2 2
#define TMM3 3
#define TMM4 4
#define TMM5 5
#define TMM6 6
#define TMM7 7
// parallel routines
template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
inline T div_up(T x, T y) { return (x + y - 1) / y; }
template <typename T>
inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
#if 0
// onednn partition pattern
T& n_my = n_end;
if (nth <= 1 || n == 0) {
n_start = 0;
n_my = n;
} else {
T n1 = div_up(n, nth);
T n2 = n1 - 1;
T T1 = n - n2 * nth;
n_my = ith < T1 ? n1 : n2;
n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2;
}
n_end += n_start;
#else
// pytorch aten partition pattern
T n_my = div_up(n, nth);
n_start = ith * n_my;
n_end = std::min(n_start + n_my, n);
#endif
}
template <typename func_t>
inline void parallel_for(int nth, int n, const func_t& f) {
#if defined(_OPENMP)
#pragma omp parallel num_threads(nth)
{
//int nth = omp_get_num_threads();
int ith = omp_get_thread_num();
int tbegin, tend;
balance211(n, nth, ith, tbegin, tend);
f(tbegin, tend);
}
#else
f(0, n);
GGML_UNUSED(nth);
#endif
}
template <typename func_t>
inline void parallel_for_ggml(const ggml_compute_params * params, int n, const func_t & f) {
int tbegin, tend;
balance211(n, params->nth, params->ith, tbegin, tend);
f(tbegin, tend);
}
// quantized types that have AMX support
inline bool qtype_has_amx_kernels(const enum ggml_type type) {
// TODO: fix padding for vnni format
return (type == GGML_TYPE_Q4_0) ||
(type == GGML_TYPE_Q4_1) ||
(type == GGML_TYPE_Q8_0) ||
(type == GGML_TYPE_Q4_K) ||
(type == GGML_TYPE_Q5_K) ||
(type == GGML_TYPE_Q6_K) ||
(type == GGML_TYPE_IQ4_XS);
}
// ggml backend context
struct ggml_backend_amx_context {
int n_threads = GGML_DEFAULT_N_THREADS;
std::unique_ptr<char[]> work_data;
size_t work_size = 0;
};

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,7 @@
ggml/src/ggml-cpu/amx/mmq.o: ggml/src/ggml-cpu/amx/mmq.cpp \
ggml/src/ggml-cpu/amx/amx.h ggml/include/ggml-backend.h \
ggml/include/ggml.h ggml/include/ggml-alloc.h \
ggml/src/ggml-cpu/ggml-cpu-impl.h ggml/src/ggml-impl.h \
ggml/src/ggml-cpu/amx/mmq.h ggml/src/ggml-cpu/amx/common.h \
ggml/src/ggml-cpu/ggml-cpu-quants.h ggml/src/ggml-common.h \
ggml/src/ggml-quants.h

View File

@ -0,0 +1,16 @@
#pragma once
#include "common.h"
#ifdef __cplusplus
extern "C" {
#endif
size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);
void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,298 @@
#include "ggml-cpu.h"
#include "ggml-backend-impl.h"
#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include <cstring>
#include <vector>
#include <bitset>
#include <array>
#include <string>
struct cpuid_x86 {
bool SSE3(void) { return f_1_ecx[0]; }
bool PCLMULQDQ(void) { return f_1_ecx[1]; }
bool MONITOR(void) { return f_1_ecx[3]; }
bool SSSE3(void) { return f_1_ecx[9]; }
bool FMA(void) { return f_1_ecx[12]; }
bool CMPXCHG16B(void) { return f_1_ecx[13]; }
bool SSE41(void) { return f_1_ecx[19]; }
bool SSE42(void) { return f_1_ecx[20]; }
bool MOVBE(void) { return f_1_ecx[22]; }
bool POPCNT(void) { return f_1_ecx[23]; }
bool AES(void) { return f_1_ecx[25]; }
bool XSAVE(void) { return f_1_ecx[26]; }
bool OSXSAVE(void) { return f_1_ecx[27]; }
bool AVX(void) { return f_1_ecx[28]; }
bool F16C(void) { return f_1_ecx[29]; }
bool RDRAND(void) { return f_1_ecx[30]; }
bool MSR(void) { return f_1_edx[5]; }
bool CX8(void) { return f_1_edx[8]; }
bool SEP(void) { return f_1_edx[11]; }
bool CMOV(void) { return f_1_edx[15]; }
bool CLFSH(void) { return f_1_edx[19]; }
bool MMX(void) { return f_1_edx[23]; }
bool FXSR(void) { return f_1_edx[24]; }
bool SSE(void) { return f_1_edx[25]; }
bool SSE2(void) { return f_1_edx[26]; }
bool FSGSBASE(void) { return f_7_ebx[0]; }
bool BMI1(void) { return f_7_ebx[3]; }
bool HLE(void) { return is_intel && f_7_ebx[4]; }
bool AVX2(void) { return f_7_ebx[5]; }
bool BMI2(void) { return f_7_ebx[8]; }
bool ERMS(void) { return f_7_ebx[9]; }
bool INVPCID(void) { return f_7_ebx[10]; }
bool RTM(void) { return is_intel && f_7_ebx[11]; }
bool AVX512F(void) { return f_7_ebx[16]; }
bool RDSEED(void) { return f_7_ebx[18]; }
bool ADX(void) { return f_7_ebx[19]; }
bool AVX512PF(void) { return f_7_ebx[26]; }
bool AVX512ER(void) { return f_7_ebx[27]; }
bool AVX512CD(void) { return f_7_ebx[28]; }
bool SHA(void) { return f_7_ebx[29]; }
bool PREFETCHWT1(void) { return f_7_ecx[0]; }
bool LAHF(void) { return f_81_ecx[0]; }
bool LZCNT(void) { return is_intel && f_81_ecx[5]; }
bool ABM(void) { return is_amd && f_81_ecx[5]; }
bool SSE4a(void) { return is_amd && f_81_ecx[6]; }
bool XOP(void) { return is_amd && f_81_ecx[11]; }
bool TBM(void) { return is_amd && f_81_ecx[21]; }
bool SYSCALL(void) { return is_intel && f_81_edx[11]; }
bool MMXEXT(void) { return is_amd && f_81_edx[22]; }
bool RDTSCP(void) { return is_intel && f_81_edx[27]; }
bool _3DNOWEXT(void) { return is_amd && f_81_edx[30]; }
bool _3DNOW(void) { return is_amd && f_81_edx[31]; }
bool AVX512_VBMI(void) { return f_7_ecx[1]; }
bool AVX512_VNNI(void) { return f_7_ecx[11]; }
bool AVX512_FP16(void) { return f_7_edx[23]; }
bool AVX512_BF16(void) { return f_7_1_eax[5]; }
bool AVX_VNNI(void) { return f_7_1_eax[4]; }
bool AMX_TILE(void) { return f_7_edx[24]; }
bool AMX_INT8(void) { return f_7_edx[25]; }
bool AMX_FP16(void) { return f_7_1_eax[21]; }
bool AMX_BF16(void) { return f_7_edx[22]; }
#ifdef _MSC_VER
static void cpuid(int cpu_info[4], int eax) {
__cpuid(cpu_info, eax);
}
static void cpuidex(int cpu_info[4], int eax, int ecx) {
__cpuidex(cpu_info, eax, ecx);
}
#else
static void cpuid(int cpu_info[4], int eax) {
__asm__ __volatile__(
"cpuid"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(eax), "c"(0));
}
static void cpuidex(int cpu_info[4], int eax, int ecx) {
__asm__ __volatile__(
"cpuid"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(eax), "c"(ecx));
}
#endif
cpuid_x86() {
std::array<int, 4> cpui;
std::vector<std::array<int, 4>> data;
// calling __cpuid with 0x0 as the function_id argument
// gets the number of the highest valid function ID.
cpuid(cpui.data(), 0);
int n_ids = cpui[0];
for (int i = 0; i <= n_ids; ++i) {
cpuidex(cpui.data(), i, 0);
data.push_back(cpui);
}
// capture vendor string
char vendor[0x20] = {};
*reinterpret_cast<int *>(vendor) = data[0][1];
*reinterpret_cast<int *>(vendor + 4) = data[0][3];
*reinterpret_cast<int *>(vendor + 8) = data[0][2];
this->vendor = vendor;
if (this->vendor == "GenuineIntel") {
is_intel = true;
} else if (this->vendor == "AuthenticAMD") {
is_amd = true;
}
// load bitset with flags for function 0x00000001
if (n_ids >= 1) {
f_1_ecx = data[1][2];
f_1_edx = data[1][3];
}
// load bitset with flags for function 0x00000007
if (n_ids >= 7) {
f_7_ebx = data[7][1];
f_7_ecx = data[7][2];
f_7_edx = data[7][3];
cpuidex(cpui.data(), 7, 1);
f_7_1_eax = cpui[0];
}
// calling __cpuid with 0x80000000 as the function_id argument
// gets the number of the highest valid extended ID.
cpuid(cpui.data(), 0x80000000);
unsigned int n_ex_ids = cpui[0];
std::vector<std::array<int, 4>> ext_data;
for (unsigned int i = 0x80000000; i <= n_ex_ids; ++i) {
cpuidex(cpui.data(), i, 0);
ext_data.push_back(cpui);
}
// load bitset with flags for function 0x80000001
if (n_ex_ids >= 0x80000001) {
f_81_ecx = ext_data[1][2];
f_81_edx = ext_data[1][3];
}
// interpret CPU brand string if reported
char brand[0x40] = {};
if (n_ex_ids >= 0x80000004) {
std::memcpy(brand, ext_data[2].data(), sizeof(cpui));
std::memcpy(brand + 16, ext_data[3].data(), sizeof(cpui));
std::memcpy(brand + 32, ext_data[4].data(), sizeof(cpui));
this->brand = brand;
}
}
bool is_intel = false;
bool is_amd = false;
std::string vendor;
std::string brand;
std::bitset<32> f_1_ecx;
std::bitset<32> f_1_edx;
std::bitset<32> f_7_ebx;
std::bitset<32> f_7_ecx;
std::bitset<32> f_7_edx;
std::bitset<32> f_7_1_eax;
std::bitset<32> f_81_ecx;
std::bitset<32> f_81_edx;
};
#if 0
void test_x86_is() {
cpuid_x86 is;
printf("CPU Vendor: %s\n", is.vendor.c_str());
printf("Brand: %s\n", is.brand.c_str());
printf("is_intel: %d\n", is.is_intel);
printf("is_amd: %d\n", is.is_amd);
printf("sse3: %d\n", is.SSE3());
printf("pclmulqdq: %d\n", is.PCLMULQDQ());
printf("ssse3: %d\n", is.SSSE3());
printf("fma: %d\n", is.FMA());
printf("cmpxchg16b: %d\n", is.CMPXCHG16B());
printf("sse41: %d\n", is.SSE41());
printf("sse42: %d\n", is.SSE42());
printf("movbe: %d\n", is.MOVBE());
printf("popcnt: %d\n", is.POPCNT());
printf("aes: %d\n", is.AES());
printf("xsave: %d\n", is.XSAVE());
printf("osxsave: %d\n", is.OSXSAVE());
printf("avx: %d\n", is.AVX());
printf("f16c: %d\n", is.F16C());
printf("rdrand: %d\n", is.RDRAND());
printf("msr: %d\n", is.MSR());
printf("cx8: %d\n", is.CX8());
printf("sep: %d\n", is.SEP());
printf("cmov: %d\n", is.CMOV());
printf("clflush: %d\n", is.CLFSH());
printf("mmx: %d\n", is.MMX());
printf("fxsr: %d\n", is.FXSR());
printf("sse: %d\n", is.SSE());
printf("sse2: %d\n", is.SSE2());
printf("fsgsbase: %d\n", is.FSGSBASE());
printf("bmi1: %d\n", is.BMI1());
printf("hle: %d\n", is.HLE());
printf("avx2: %d\n", is.AVX2());
printf("bmi2: %d\n", is.BMI2());
printf("erms: %d\n", is.ERMS());
printf("invpcid: %d\n", is.INVPCID());
printf("rtm: %d\n", is.RTM());
printf("avx512f: %d\n", is.AVX512F());
printf("rdseed: %d\n", is.RDSEED());
printf("adx: %d\n", is.ADX());
printf("avx512pf: %d\n", is.AVX512PF());
printf("avx512er: %d\n", is.AVX512ER());
printf("avx512cd: %d\n", is.AVX512CD());
printf("sha: %d\n", is.SHA());
printf("prefetchwt1: %d\n", is.PREFETCHWT1());
printf("lahf: %d\n", is.LAHF());
printf("lzcnt: %d\n", is.LZCNT());
printf("abm: %d\n", is.ABM());
printf("sse4a: %d\n", is.SSE4a());
printf("xop: %d\n", is.XOP());
printf("tbm: %d\n", is.TBM());
printf("syscall: %d\n", is.SYSCALL());
printf("mmxext: %d\n", is.MMXEXT());
printf("rdtscp: %d\n", is.RDTSCP());
printf("3dnowext: %d\n", is._3DNOWEXT());
printf("3dnow: %d\n", is._3DNOW());
printf("avx512_vbmi: %d\n", is.AVX512_VBMI());
printf("avx512_vnni: %d\n", is.AVX512_VNNI());
printf("avx512_fp16: %d\n", is.AVX512_FP16());
printf("avx512_bf16: %d\n", is.AVX512_BF16());
printf("amx_tile: %d\n", is.AMX_TILE());
printf("amx_int8: %d\n", is.AMX_INT8());
printf("amx_fp16: %d\n", is.AMX_FP16());
printf("amx_bf16: %d\n", is.AMX_BF16());
}
#endif
static int ggml_backend_cpu_x86_score() {
// FIXME: this does not check for OS support
cpuid_x86 is;
// if the CPU backend was built with any features not supported by the current CPU, it cannot be used
if (ggml_cpu_has_fma() && !is.FMA()) { return 0; }
if (ggml_cpu_has_f16c() && !is.F16C()) { return 0; }
if (ggml_cpu_has_ssse3() && !is.SSSE3()) { return 0; }
if (ggml_cpu_has_sse3() && !is.SSE3()) { return 0; }
if (ggml_cpu_has_avx() && !is.AVX()) { return 0; }
if (ggml_cpu_has_avx_vnni() && !is.AVX_VNNI()) { return 0; }
if (ggml_cpu_has_avx2() && !is.AVX2()) { return 0; }
if (ggml_cpu_has_avx512() && !is.AVX512F()) { return 0; }
if (ggml_cpu_has_avx512_vbmi() && !is.AVX512_VBMI()) { return 0; }
if (ggml_cpu_has_avx512_bf16() && !is.AVX512_BF16()) { return 0; }
if (ggml_cpu_has_avx512_vnni() && !is.AVX512_VNNI()) { return 0; }
if (ggml_cpu_has_amx_int8() && !is.AMX_INT8()) { return 0; }
// calculate a backend score based on the supported features
// more important features have a higher weight
int score = 0;
score += ggml_cpu_has_fma () * 1;
score += ggml_cpu_has_f16c () * 1<<1;
score += ggml_cpu_has_ssse3 () * 1<<2;
score += ggml_cpu_has_sse3 () * 1<<3;
score += ggml_cpu_has_avx_vnni () * 1<<4;
score += ggml_cpu_has_avx () * 1<<5;
score += ggml_cpu_has_avx2 () * 1<<6;
score += ggml_cpu_has_avx512 () * 1<<7;
// score += ggml_cpu_has_avx512_vbmi() * 1<<8; // not used
score += ggml_cpu_has_avx512_bf16() * 1<<9;
score += ggml_cpu_has_avx512_vnni() * 1<<10;
score += ggml_cpu_has_amx_int8 () * 1<<11;
return score;
}
GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_x86_score)
#endif // defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))

View File

@ -128,7 +128,7 @@ static inline __m512i sum_i16_pairs_int_32x16(const __m512i x) {
} }
static inline __m512i mul_sum_us8_pairs_int32x16(const __m512i ax, const __m512i sy) { static inline __m512i mul_sum_us8_pairs_int32x16(const __m512i ax, const __m512i sy) {
#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__)) #if defined(__AVX512VNNI__)
const __m512i zero = _mm512_setzero_si512(); const __m512i zero = _mm512_setzero_si512();
return _mm512_dpbusd_epi32(zero, ax, sy); return _mm512_dpbusd_epi32(zero, ax, sy);
#else #else

View File

@ -15,6 +15,18 @@
extern "C" { extern "C" {
#endif #endif
struct ggml_compute_params {
// ith = thread index, nth = number of threads
int ith, nth;
// work buffer for all threads
size_t wsize;
void * wdata;
struct ggml_threadpool * threadpool;
};
#if defined(_MSC_VER) #if defined(_MSC_VER)
#define m512bh(p) p #define m512bh(p) p
@ -366,6 +378,9 @@ static __m256 __lasx_xvreplfr2vr_s(float val) {
} }
#endif #endif
// TODO: move to ggml-threading
void ggml_barrier(struct ggml_threadpool * tp);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -10,6 +10,7 @@
#include "ggml-quants.h" #include "ggml-quants.h"
#include "ggml-cpu-quants.h" #include "ggml-cpu-quants.h"
#include "ggml-threading.h" #include "ggml-threading.h"
#include "amx/amx.h"
#include "ggml.h" #include "ggml.h"
#if defined(_MSC_VER) || defined(__MINGW32__) #if defined(_MSC_VER) || defined(__MINGW32__)
@ -624,7 +625,7 @@ do { \
for (int i = 0; i < offset; ++i) { \ for (int i = 0; i < offset; ++i) { \
x[i] = _mm512_add_ps(x[i], x[offset+i]); \ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
} \ } \
res = _mm512_reduce_add_ps(x[0]); \ res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
} while (0) } while (0)
// TODO: is this optimal ? // TODO: is this optimal ?
@ -674,7 +675,7 @@ do { \
for (int i = 0; i < offset; ++i) { \ for (int i = 0; i < offset; ++i) { \
x[i] = _mm512_add_ps(x[i], x[offset+i]); \ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
} \ } \
res = _mm512_reduce_add_ps(x[0]); \ res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
} while (0) } while (0)
#define GGML_F16_VEC GGML_F32Cx16 #define GGML_F16_VEC GGML_F32Cx16
@ -685,8 +686,8 @@ do { \
#define GGML_F16_VEC_FMA GGML_F32Cx16_FMA #define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx16_ADD #define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL #define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
#elif defined(__AVX__) #elif defined(__AVX__)
#define GGML_SIMD #define GGML_SIMD
@ -1182,22 +1183,22 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
{ \ { \
int offset = GGML_F32_ARR >> 1; \ int offset = GGML_F32_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \ for (int i = 0; i < offset; ++i) { \
x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
} \ } \
offset >>= 1; \ offset >>= 1; \
for (int i = 0; i < offset; ++i) { \ for (int i = 0; i < offset; ++i) { \
x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
} \ } \
offset >>= 1; \ offset >>= 1; \
for (int i = 0; i < offset; ++i) { \ for (int i = 0; i < offset; ++i) { \
x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
} \ } \
__m128i tmp = __lsx_vsrli_d((__m128i)x[0], 32); \ __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, x[0]); \ tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \ tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \ const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
tmp = __lsx_vsrli_d((__m128i)t0, 32); \ tmp = __lsx_vsrli_d((__m128i) t0, 32); \
tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, t0); \ tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \ tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \ res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
} }
@ -1367,31 +1368,15 @@ struct ggml_compute_state {
int ith; int ith;
}; };
struct ggml_compute_params {
// ith = thread index, nth = number of threads
int ith, nth;
// work buffer for all threads
size_t wsize;
void * wdata;
struct ggml_threadpool * threadpool;
};
// //
// fundamental operations // fundamental operations
// //
inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; } inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
@ -2286,7 +2271,7 @@ struct ggml_state {
static struct ggml_state g_state = {0}; static struct ggml_state g_state = {0};
static void ggml_barrier(struct ggml_threadpool * tp) { void ggml_barrier(struct ggml_threadpool * tp) {
int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed); int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
if (n_threads == 1) { if (n_threads == 1) {
return; return;
@ -7455,6 +7440,13 @@ static void ggml_compute_forward_mul_mat(
type = (enum ggml_type)(intptr_t)src0->extra; type = (enum ggml_type)(intptr_t)src0->extra;
} }
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
if (src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) {
ggml_backend_amx_mul_mat(params, dst);
return;
}
#endif
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
ggml_from_float_to_mat_t const from_float_to_mat = type_traits_cpu[vec_dot_type].from_float_to_mat; ggml_from_float_to_mat_t const from_float_to_mat = type_traits_cpu[vec_dot_type].from_float_to_mat;
@ -13294,10 +13286,16 @@ struct ggml_cplan ggml_graph_plan(
} break; } break;
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
{ {
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
if (node->src[0]->buffer && ggml_backend_amx_buft_is_amx(node->src[0]->buffer->buft)) {
cur = ggml_backend_amx_desired_wsize(node);
}
#endif
const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type; const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type;
if (node->src[1]->type != vec_dot_type) { if (node->src[1]->type != vec_dot_type) {
cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); size_t cur2 = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
cur = MAX(cur, cur2);
} }
} break; } break;
case GGML_OP_MUL_MAT_ID: case GGML_OP_MUL_MAT_ID:

View File

@ -3,6 +3,7 @@
#include "ggml-cpu.h" #include "ggml-cpu.h"
#include "ggml-cpu-aarch64.h" #include "ggml-cpu-aarch64.h"
#include "ggml-impl.h" #include "ggml-impl.h"
#include "amx/amx.h"
#include <cctype> #include <cctype>
#include <string> #include <string>
#include <vector> #include <vector>
@ -134,12 +135,16 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backen
static std::vector<ggml_backend_buffer_type_t> bufts = []() { static std::vector<ggml_backend_buffer_type_t> bufts = []() {
std::vector<ggml_backend_buffer_type_t> bufts; std::vector<ggml_backend_buffer_type_t> bufts;
#ifdef GGML_USE_CPU_HBM #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
bufts.push_back(ggml_backend_cpu_hbm_buffer_type()); if (ggml_backend_amx_buffer_type()) {
bufts.push_back(ggml_backend_amx_buffer_type());
}
#endif #endif
#ifdef GGML_USE_CPU_AARCH64 #ifdef GGML_USE_CPU_AARCH64
if (ggml_backend_cpu_aarch64_buffer_type()) {
bufts.push_back(ggml_backend_cpu_aarch64_buffer_type()); bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
}
#endif #endif
bufts.push_back(NULL); bufts.push_back(NULL);
@ -456,12 +461,27 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
const struct ggml_tensor * src0 = op->src[0]; const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * src1 = op->src[1]; const struct ggml_tensor * src1 = op->src[1];
if (op->op == GGML_OP_NONE || op->op == GGML_OP_RESHAPE || op->op == GGML_OP_VIEW || op->op == GGML_OP_PERMUTE || op->op == GGML_OP_TRANSPOSE) {
return true;
}
if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) { if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) { if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) {
return false; return false;
} }
} }
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
if (src0 && src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) {
return ggml_backend_amx_device_supports_op(op);
}
for (int i = 1; i < GGML_MAX_SRC; i++) {
if (op->src[i] && op->src[i]->buffer && ggml_backend_amx_buft_is_amx(op->src[i]->buffer->buft)) {
return false;
}
}
#endif
for (int i = 1; i < GGML_MAX_SRC; i++) { for (int i = 1; i < GGML_MAX_SRC; i++) {
if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) { if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) {
return false; return false;
@ -491,7 +511,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
} }
static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft); bool supported = ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
supported = supported || ggml_backend_amx_buft_is_amx(buft);
#endif
return supported;
GGML_UNUSED(dev); GGML_UNUSED(dev);
} }

View File

@ -50,8 +50,7 @@
#include "sgemm.h" #include "sgemm.h"
#include "ggml-impl.h" #include "ggml-impl.h"
// hack until moved into the CPU backend #include "ggml-cpu-impl.h"
#include "../ggml-cpu-impl.h"
#include "ggml-quants.h" #include "ggml-quants.h"
#ifdef _MSC_VER #ifdef _MSC_VER

View File

@ -30,11 +30,13 @@
extern "C" { extern "C" {
#endif #endif
#undef MIN #ifndef MIN
#undef MAX # define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
#define MIN(a, b) ((a) < (b) ? (a) : (b)) #ifndef MAX
#define MAX(a, b) ((a) > (b) ? (a) : (b)) # define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif
// required for mmap as gguf only guarantees 32-byte alignment // required for mmap as gguf only guarantees 32-byte alignment
#define TENSOR_ALIGNMENT 32 #define TENSOR_ALIGNMENT 32

View File

@ -2911,7 +2911,6 @@ static void ggml_metal_encode_node(
} break; } break;
case GGML_OP_GROUP_NORM: case GGML_OP_GROUP_NORM:
{ {
GGML_ASSERT(ne00 % 4 == 0);
GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src0));
float eps; float eps;

View File

@ -3,5 +3,5 @@ find_package (Threads REQUIRED)
set(TARGET vulkan-shaders-gen) set(TARGET vulkan-shaders-gen)
add_executable(${TARGET} vulkan-shaders-gen.cpp) add_executable(${TARGET} vulkan-shaders-gen.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)
target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads) target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)