diff --git a/ggml/.gitignore b/ggml/.gitignore deleted file mode 100644 index 9dd9ddea..00000000 --- a/ggml/.gitignore +++ /dev/null @@ -1 +0,0 @@ -src/ggml-metal-embed.metal diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt deleted file mode 100644 index d33f843b..00000000 --- a/ggml/CMakeLists.txt +++ /dev/null @@ -1,362 +0,0 @@ -cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories. -project("ggml" C CXX) -include(CheckIncludeFileCXX) - -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) - -if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") -endif() - -if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) - set(GGML_STANDALONE ON) - - set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) - - # configure project version - # TODO -else() - set(GGML_STANDALONE OFF) -endif() - -if (EMSCRIPTEN) - set(BUILD_SHARED_LIBS_DEFAULT OFF) - - option(GGML_WASM_SINGLE_FILE "ggml: embed WASM inside the generated ggml.js" ON) -else() - if (MINGW) - set(BUILD_SHARED_LIBS_DEFAULT OFF) - else() - set(BUILD_SHARED_LIBS_DEFAULT ON) - endif() -endif() - -# remove the lib prefix on win32 mingw -if (WIN32) - set(CMAKE_STATIC_LIBRARY_PREFIX "") - set(CMAKE_SHARED_LIBRARY_PREFIX "") - set(CMAKE_SHARED_MODULE_PREFIX "") -endif() - -option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT}) -option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF) - -# -# option list -# - -# TODO: mark all options as advanced when not GGML_STANDALONE - -if (APPLE) - set(GGML_METAL_DEFAULT ON) - set(GGML_BLAS_DEFAULT ON) - set(GGML_BLAS_VENDOR_DEFAULT "Apple") -else() - set(GGML_METAL_DEFAULT OFF) - set(GGML_BLAS_DEFAULT OFF) - set(GGML_BLAS_VENDOR_DEFAULT "Generic") -endif() - -if (CMAKE_CROSSCOMPILING OR DEFINED ENV{SOURCE_DATE_EPOCH}) - message(STATUS "Setting GGML_NATIVE_DEFAULT to OFF") - set(GGML_NATIVE_DEFAULT OFF) -else() - set(GGML_NATIVE_DEFAULT ON) -endif() - -# defaults -if (NOT GGML_LLAMAFILE_DEFAULT) - set(GGML_LLAMAFILE_DEFAULT OFF) -endif() - -if (NOT GGML_CUDA_GRAPHS_DEFAULT) - set(GGML_CUDA_GRAPHS_DEFAULT OFF) -endif() - -# general -option(GGML_STATIC "ggml: static link libraries" OFF) -option(GGML_NATIVE "ggml: optimize the build for the current system" ${GGML_NATIVE_DEFAULT}) -option(GGML_LTO "ggml: enable link time optimization" OFF) -option(GGML_CCACHE "ggml: use ccache if available" ON) - -# debug -option(GGML_ALL_WARNINGS "ggml: enable all compiler warnings" ON) -option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF) -option(GGML_GPROF "ggml: enable gprof" OFF) - -# build -option(GGML_FATAL_WARNINGS "ggml: enable -Werror flag" OFF) - -# sanitizers -option(GGML_SANITIZE_THREAD "ggml: enable thread sanitizer" OFF) -option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF) -option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF) - -# instruction set specific -if (GGML_NATIVE OR NOT GGML_NATIVE_DEFAULT) - set(INS_ENB OFF) -else() - set(INS_ENB ON) -endif() - -message(DEBUG "GGML_NATIVE : ${GGML_NATIVE}") -message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}") -message(DEBUG "INS_ENB : ${INS_ENB}") - -option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF) -option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON) -option(GGML_CPU_KLEIDIAI 
"ggml: use KleidiAI optimized kernels if applicable" OFF) -option(GGML_AVX "ggml: enable AVX" ${INS_ENB}) -option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF) -option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB}) -option(GGML_BMI2 "ggml: enable BMI2" ${INS_ENB}) -option(GGML_AVX512 "ggml: enable AVX512F" OFF) -option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF) -option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF) -option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF) -if (NOT MSVC) - # in MSVC F16C and FMA is implied with AVX2/AVX512 - option(GGML_FMA "ggml: enable FMA" ${INS_ENB}) - option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) - # MSVC does not seem to support AMX - option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF) - option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF) - option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF) -endif() -option(GGML_LASX "ggml: enable lasx" ON) -option(GGML_LSX "ggml: enable lsx" ON) -option(GGML_RVV "ggml: enable rvv" ON) -option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF) -option(GGML_VXE "ggml: enable vxe" ON) - -option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF) -set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM") -set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC") - - -if (WIN32) - set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version") -endif() - -# ggml core -set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism") -option(GGML_CPU "ggml: enable CPU backend" ON) - -# 3rd party libs / backends -option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON) -option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT}) -set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING - "ggml: BLAS library vendor") -option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT}) - -option(GGML_CUDA "ggml: use CUDA" OFF) -option(GGML_MUSA "ggml: use MUSA" OFF) -option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF) -option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF) -option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF) -set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING - "ggml: max. 
batch size for using peer access") -option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF) -option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF) -option(GGML_CUDA_FA "ggml: compile ggml FlashAttention CUDA kernels" ON) -option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF) -option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT}) -set (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING - "ggml: cuda link binary compression mode; requires cuda 12.8+") -set_property(CACHE GGML_CUDA_COMPRESSION_MODE PROPERTY STRINGS "none;speed;balance;size") - -option(GGML_HIP "ggml: use HIP" OFF) -option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF) -option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON) -option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF) -option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF) -option(GGML_VULKAN "ggml: use Vulkan" OFF) -option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF) -option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF) -option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF) -option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF) -option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" OFF) -option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF) -option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) -option(GGML_KOMPUTE "ggml: use Kompute" OFF) -option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT}) -option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF) -option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF) -option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF) -option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL}) -set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING - "ggml: metal minimum macOS version") -set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)") -option(GGML_OPENMP "ggml: use OpenMP" ON) -option(GGML_RPC "ggml: use RPC" OFF) -option(GGML_SYCL "ggml: use SYCL" OFF) -option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) -option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON) -set (GGML_SYCL_TARGET "INTEL" CACHE STRING - "ggml: sycl target device") -set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING - "ggml: sycl device architecture") - -option(GGML_OPENCL "ggml: use OpenCL" OFF) -option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF) -option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON) -option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON) -set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING - "gmml: OpenCL API version to target") - -# toolchain for vulkan-shaders-gen -set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen") - -# extra artifacts -option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE}) -option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE}) - -# -# dependencies -# - -set(CMAKE_C_STANDARD 11) -set(CMAKE_C_STANDARD_REQUIRED true) - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_STANDARD_REQUIRED true) - -set(THREADS_PREFER_PTHREAD_FLAG ON) - -find_package(Threads REQUIRED) - -include(GNUInstallDirs) - -# -# build the library -# - -add_subdirectory(src) - -# -# tests and examples -# - -if (GGML_BUILD_TESTS) - enable_testing() - add_subdirectory(tests) 
-endif () - -if (GGML_BUILD_EXAMPLES) - add_subdirectory(examples) -endif () - -# -# install -# - -include(CMakePackageConfigHelpers) - -# all public headers -set(GGML_PUBLIC_HEADERS - include/ggml.h - include/ggml-cpu.h - include/ggml-alloc.h - include/ggml-backend.h - include/ggml-blas.h - include/ggml-cann.h - include/ggml-cpp.h - include/ggml-cuda.h - include/ggml-kompute.h - include/ggml-opt.h - include/ggml-metal.h - include/ggml-rpc.h - include/ggml-sycl.h - include/ggml-vulkan.h - include/gguf.h) - -set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") -#if (GGML_METAL) -# set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal") -#endif() -install(TARGETS ggml LIBRARY PUBLIC_HEADER) -install(TARGETS ggml-base LIBRARY) - -if (GGML_STANDALONE) - configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in - ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc - @ONLY) - - install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc - DESTINATION share/pkgconfig) -endif() - -# -# Create CMake package -# - -# Generate version info based on git commit. - -if(NOT DEFINED GGML_BUILD_NUMBER) - find_program(GIT_EXE NAMES git git.exe REQUIRED NO_CMAKE_FIND_ROOT_PATH) - execute_process(COMMAND ${GIT_EXE} rev-list --count HEAD - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - OUTPUT_VARIABLE GGML_BUILD_NUMBER - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - - if(GGML_BUILD_NUMBER EQUAL 1) - message(WARNING "GGML build version fixed at 1 likely due to a shallow clone.") - endif() - - execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - OUTPUT_VARIABLE GGML_BUILD_COMMIT - OUTPUT_STRIP_TRAILING_WHITESPACE - ) -endif() - - -# Capture variables prefixed with GGML_. - -set(variable_set_statements -" -####### Expanded from @GGML_VARIABLES_EXPANED@ by configure_package_config_file() ####### -####### Any changes to this file will be overwritten by the next CMake run ####### - -") - -set(GGML_SHARED_LIB ${BUILD_SHARED_LIBS}) - -get_cmake_property(all_variables VARIABLES) -foreach(variable_name IN LISTS all_variables) - if(variable_name MATCHES "^GGML_") - string(REPLACE ";" "\\;" - variable_value "${${variable_name}}") - - set(variable_set_statements - "${variable_set_statements}set(${variable_name} \"${variable_value}\")\n") - endif() -endforeach() - -set(GGML_VARIABLES_EXPANDED ${variable_set_statements}) - -# Create the CMake package and set install location. 
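For reference, a downstream project would typically consume the CMake package generated by the block below roughly as follows. This is a minimal sketch: the `my_app` target and `main.c` source are illustrative placeholders, not part of this tree; the imported targets `ggml::ggml` and `ggml::all` come from the ggml-config.cmake.in shown further down in this diff.

    # hypothetical consumer CMakeLists.txt, assuming ggml has been installed
    cmake_minimum_required(VERSION 3.14)
    project(my_app C)

    find_package(ggml REQUIRED)   # locates the installed ggml-config.cmake / ggml-version.cmake

    add_executable(my_app main.c)
    # ggml::ggml links the core library; ggml::all additionally pulls in every
    # backend listed in GGML_AVAILABLE_BACKENDS by the generated config file
    target_link_libraries(my_app PRIVATE ggml::ggml)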
- -set(GGML_INSTALL_VERSION 0.0.${GGML_BUILD_NUMBER}) -set(GGML_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files") -set(GGML_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files") -set(GGML_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files") - -configure_package_config_file( - ${CMAKE_CURRENT_SOURCE_DIR}/cmake/ggml-config.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake - INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml - PATH_VARS GGML_INCLUDE_INSTALL_DIR - GGML_LIB_INSTALL_DIR - GGML_BIN_INSTALL_DIR) - -write_basic_package_version_file( - ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake - VERSION ${GGML_INSTALL_VERSION} - COMPATIBILITY SameMajorVersion) - -install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake - ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml) diff --git a/ggml/cmake/BuildTypes.cmake b/ggml/cmake/BuildTypes.cmake deleted file mode 100644 index a9c7b6c9..00000000 --- a/ggml/cmake/BuildTypes.cmake +++ /dev/null @@ -1,54 +0,0 @@ -# Add new build types - -# ReleaseGG - Release with enabled asserts - -SET(CMAKE_CXX_FLAGS_RELEASEGG - "-O3" - CACHE STRING "Flags used by the c++ compiler during release builds with enabled asserts." - FORCE ) -SET(CMAKE_C_FLAGS_RELEASEGG - "-O3" - CACHE STRING "Flags used by the compiler during release builds with enabled asserts." - FORCE ) -SET(CMAKE_EXE_LINKER_FLAGS_RELEASEGG - "" - CACHE STRING "Flags used for linking binaries during release builds with enabled asserts." - FORCE ) -SET(CMAKE_SHARED_LINKER_FLAGS_RELEASEGG - "" - CACHE STRING "Flags used by the shared libraries linker during release builds with enabled asserts." - FORCE ) -MARK_AS_ADVANCED( - CMAKE_CXX_FLAGS_RELEASEGG - CMAKE_C_FLAGS_RELEASEGG - CMAKE_EXE_LINKER_FLAGS_RELEASEGG - CMAKE_SHARED_LINKER_FLAGS_RELEASEGG ) - -# RelWithDebInfoGG - RelWithDebInfo with enabled asserts - -SET(CMAKE_CXX_FLAGS_RELWITHDEBINFOGG - "-O2 -g" - CACHE STRING "Flags used by the c++ compiler during release builds with debug symbols and enabled asserts." - FORCE ) -SET(CMAKE_C_FLAGS_RELWITHDEBINFOGG - "-O2 -g" - CACHE STRING "Flags used by the compiler during release builds with debug symbols and enabled asserts." - FORCE ) -SET(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFOGG - "" - CACHE STRING "Flags used for linking binaries during release builds with debug symbols and enabled asserts." - FORCE ) -SET(CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFOGG - "" - CACHE STRING "Flags used by the shared libraries linker during release builds with debug symbols and enabled asserts." 
- FORCE ) -MARK_AS_ADVANCED( - CMAKE_CXX_FLAGS_RELWITHDEBINFOGG - CMAKE_C_FLAGS_RELWITHDEBINFOGG - CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFOGG - CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFOGG ) - -if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo" "ReleaseGG" "RelWithDebInfoGG") -endif() diff --git a/ggml/cmake/GitVars.cmake b/ggml/cmake/GitVars.cmake deleted file mode 100644 index 1a4c24eb..00000000 --- a/ggml/cmake/GitVars.cmake +++ /dev/null @@ -1,22 +0,0 @@ -find_package(Git) - -# the commit's SHA1 -execute_process(COMMAND - "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8 - WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" - OUTPUT_VARIABLE GIT_SHA1 - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - -# the date of the commit -execute_process(COMMAND - "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local - WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" - OUTPUT_VARIABLE GIT_DATE - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - -# the subject of the commit -execute_process(COMMAND - "${GIT_EXECUTABLE}" log -1 --format=%s - WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" - OUTPUT_VARIABLE GIT_COMMIT_SUBJECT - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) diff --git a/ggml/cmake/common.cmake b/ggml/cmake/common.cmake deleted file mode 100644 index 1976d0ae..00000000 --- a/ggml/cmake/common.cmake +++ /dev/null @@ -1,26 +0,0 @@ -function(ggml_get_flags CCID CCVER) - set(C_FLAGS "") - set(CXX_FLAGS "") - - if (CCID MATCHES "Clang") - set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return) - set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi) - - if ( - (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR - (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0) - ) - list(APPEND C_FLAGS -Wdouble-promotion) - endif() - elseif (CCID STREQUAL "GNU") - set(C_FLAGS -Wdouble-promotion) - set(CXX_FLAGS -Wno-array-bounds) - - if (CCVER VERSION_GREATER_EQUAL 8.1.0) - list(APPEND CXX_FLAGS -Wextra-semi) - endif() - endif() - - set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE) - set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE) -endfunction() diff --git a/ggml/cmake/ggml-config.cmake.in b/ggml/cmake/ggml-config.cmake.in deleted file mode 100644 index 8c2dc31c..00000000 --- a/ggml/cmake/ggml-config.cmake.in +++ /dev/null @@ -1,152 +0,0 @@ - -@GGML_VARIABLES_EXPANDED@ - -@PACKAGE_INIT@ - -set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@") -set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@") -#set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@") - -find_package(Threads REQUIRED) - -find_library(GGML_LIBRARY ggml - REQUIRED - HINTS ${GGML_LIB_DIR} - NO_CMAKE_FIND_ROOT_PATH) - -add_library(ggml::ggml UNKNOWN IMPORTED) -set_target_properties(ggml::ggml - PROPERTIES - IMPORTED_LOCATION "${GGML_LIBRARY}") - -find_library(GGML_BASE_LIBRARY ggml-base - REQUIRED - HINTS ${GGML_LIB_DIR} - NO_CMAKE_FIND_ROOT_PATH) - -add_library(ggml::ggml-base UNKNOWN IMPORTED) -set_target_properties(ggml::ggml-base - PROPERTIES - IMPORTED_LOCATION "${GGML_BASE_LIBRARY}") - -if (NOT GGML_SHARED_LIB) - if (APPLE AND GGML_ACCELERATE) - find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED) - list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${ACCELERATE_FRAMEWORK}) - endif() - - if (GGML_OPENMP) - find_package(OpenMP REQUIRED) - list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES 
OpenMP::OpenMP_C OpenMP::OpenMP_CXX) - endif() - - if (GGML_CPU_HBM) - find_library(memkind memkind REQUIRED) - list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES memkind) - endif() - - if (GGML_BLAS) - find_package(BLAS REQUIRED) - list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES}) - list(APPEND GGML_CPU_INTERFACE_LINK_OPTIONS ${BLAS_LINKER_FLAGS}) - endif() - - if (GGML_CUDA) - find_package(CUDAToolkit REQUIRED) - endif() - - if (GGML_METAL) - find_library(FOUNDATION_LIBRARY Foundation REQUIRED) - find_library(METAL_FRAMEWORK Metal REQUIRED) - find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) - - list(APPEND GGML_METAL_INTERFACE_LINK_LIBRARIES - ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK}) - endif() - - if (GGML_VULKAN) - find_package(Vulkan REQUIRED) - list(APPEND GGML_VULKAN_INTERFACE_LINK_LIBRARIES Vulkan::Vulkan) - endif() - - if (GGML_HIP) - find_package(hip REQUIRED) - find_package(hipblas REQUIRED) - find_package(rocblas REQUIRED) - list(APPEND GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host roc::rocblas roc::hipblas) - endif() - - if (GGML_SYCL) - find_package(DNNL) - if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL") - list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES DNNL::dnnl) - endif() - if (WIN32) - find_package(IntelSYCL REQUIRED) - find_package(MKL REQUIRED) - list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) - endif() - endif() -endif() - -set(_ggml_all_targets "") -foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS}) - string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}") - string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx) - - find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend} - REQUIRED - HINTS ${GGML_LIB_DIR} - NO_CMAKE_FIND_ROOT_PATH) - - message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}") - - add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED) - set_target_properties(ggml::${_ggml_backend} - PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}" - IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" - IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}" - INTERFACE_COMPILE_FEATURES c_std_90 - POSITION_INDEPENDENT_CODE ON) - - string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}") - if(is_cpu_variant) - list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml-base") - set_target_properties(ggml::${_ggml_backend} - PROPERTIES - INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}") - - if(GGML_CPU_INTERFACE_LINK_OPTIONS) - set_target_properties(ggml::${_ggml_backend} - PROPERTIES - INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}") - endif() - - else() - list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml-base") - set_target_properties(ggml::${_ggml_backend} - PROPERTIES - INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}") - - if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS) - set_target_properties(ggml::${_ggml_backend} - PROPERTIES - INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}") - endif() - endif() - - list(APPEND _ggml_all_targets ggml::${_ggml_backend}) -endforeach() - -list(APPEND GGML_INTERFACE_LINK_LIBRARIES ggml::ggml-base "${_ggml_all_targets}") -set_target_properties(ggml::ggml - PROPERTIES - INTERFACE_LINK_LIBRARIES "${GGML_INTERFACE_LINK_LIBRARIES}") - -add_library(ggml::all INTERFACE IMPORTED) -set_target_properties(ggml::all - PROPERTIES - INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}") - -check_required_components(ggml) diff --git a/ggml/include/ggml-alloc.h 
b/ggml/include/ggml-alloc.h deleted file mode 100644 index 2cb150fd..00000000 --- a/ggml/include/ggml-alloc.h +++ /dev/null @@ -1,76 +0,0 @@ -#pragma once - -#include "ggml.h" - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; -typedef struct ggml_backend_buffer * ggml_backend_buffer_t; -typedef struct ggml_backend * ggml_backend_t; - -// Tensor allocator -struct ggml_tallocr { - ggml_backend_buffer_t buffer; - void * base; - size_t alignment; - size_t offset; -}; - -GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer); -GGML_API enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor); - -// Graph allocator -/* - Example usage: - ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); - - // optional: create a worst-case graph and reserve the buffers to avoid reallocations - ggml_gallocr_reserve(galloc, build_graph(max_batch)); - - // allocate the graph - struct ggml_cgraph * graph = build_graph(batch); - ggml_gallocr_alloc_graph(galloc, graph); - - printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0)); - - // evaluate the graph - ggml_backend_graph_compute(backend, graph); -*/ - -// special tensor flags for use with the graph allocator: -// ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses -// ggml_set_output(): output tensors are never freed and never overwritten - -typedef struct ggml_gallocr * ggml_gallocr_t; - -GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft); -GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs); -GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc); - -// pre-allocate buffers from a measure graph - does not allocate or modify the graph -// call with a worst-case graph to avoid buffer reallocations -// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed -// returns false if the buffer allocation failed -GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph); -GGML_API bool ggml_gallocr_reserve_n( - ggml_gallocr_t galloc, - struct ggml_cgraph * graph, - const int * node_buffer_ids, - const int * leaf_buffer_ids); - -// automatic reallocation if the topology changes when using a single buffer -// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers) -GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph); - -GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id); - -// Utils -// Create a buffer and allocate all the tensors in a ggml_context -GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); -GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend); - -#ifdef __cplusplus -} -#endif diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h deleted file mode 100644 index 64671495..00000000 --- a/ggml/include/ggml-backend.h +++ /dev/null @@ -1,354 +0,0 @@ -#pragma once - -#include "ggml.h" -#include "ggml-alloc.h" - -#ifdef GGML_BACKEND_SHARED -# if defined(_WIN32) && !defined(__MINGW32__) -# ifdef GGML_BACKEND_BUILD -# define GGML_BACKEND_API __declspec(dllexport) extern -# 
else -# define GGML_BACKEND_API __declspec(dllimport) extern -# endif -# else -# define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern -# endif -#else -# define GGML_BACKEND_API extern -#endif - -#ifdef __cplusplus -extern "C" { -#endif - - typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; - typedef struct ggml_backend_buffer * ggml_backend_buffer_t; - typedef struct ggml_backend_event * ggml_backend_event_t; - typedef struct ggml_backend * ggml_backend_t; - typedef void * ggml_backend_graph_plan_t; - typedef struct ggml_backend_reg * ggml_backend_reg_t; - typedef struct ggml_backend_device * ggml_backend_dev_t; - - - // - // Backend buffer type - // - - GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); - GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); - GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); - GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); - GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); - GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); - GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft); - - // - // Backend buffer - // - - enum ggml_backend_buffer_usage { - GGML_BACKEND_BUFFER_USAGE_ANY = 0, - GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1, - GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2, - }; - - GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer); - GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); - GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); - GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); - GGML_API enum ggml_status ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); - GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer); - GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); - GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); - GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); - GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer); - GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer); - GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer); - - // tensor copy between different backends - GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); - - // - // Backend (stream) - // - - GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend); - GGML_API const char * ggml_backend_name(ggml_backend_t backend); - GGML_API void ggml_backend_free(ggml_backend_t backend); - - GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend); - GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size); - GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend); - GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend); - - GGML_API void 
ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - - // "offset" refers to the offset in tensor->data for setting/getting data - GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size); - - GGML_API void ggml_backend_synchronize(ggml_backend_t backend); - - GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph); - GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); - - GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan); - GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); - GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); - - // NOTE: will be removed, use device version instead - GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); - GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft); - GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op); - - // asynchronous copy - // the copy is performed after all the currently queued operations in backend_src - // backend_dst will wait for the copy to complete before performing other operations - // automatic fallback to sync copy if async is not supported - GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst); - - GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend); - - // - // Events - // - - GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device); - GGML_API void ggml_backend_event_free(ggml_backend_event_t event); - GGML_API void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend); - GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event); - GGML_API void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event); - - // - // Backend device - // - - enum ggml_backend_dev_type { - // CPU device using system memory - GGML_BACKEND_DEVICE_TYPE_CPU, - // GPU device using dedicated memory - GGML_BACKEND_DEVICE_TYPE_GPU, - // accelerator devices intended to be used together with the CPU backend (e.g. 
BLAS or AMX) - GGML_BACKEND_DEVICE_TYPE_ACCEL - }; - - // functionality supported by the device - struct ggml_backend_dev_caps { - // asynchronous operations - bool async; - // pinned host buffer - bool host_buffer; - // creating buffers from host ptr - bool buffer_from_host_ptr; - // event synchronization - bool events; - }; - - // all the device properties - struct ggml_backend_dev_props { - const char * name; - const char * description; - size_t memory_free; - size_t memory_total; - enum ggml_backend_dev_type type; - struct ggml_backend_dev_caps caps; - }; - - GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device); - GGML_API const char * ggml_backend_dev_description(ggml_backend_dev_t device); - GGML_API void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total); - GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device); - GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props); - GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device); - GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params); - GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device); - GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device); - GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size); - - GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op); - GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft); - GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op); - - // - // Backend (reg) - // - - GGML_API const char * ggml_backend_reg_name(ggml_backend_reg_t reg); - GGML_API size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg); - GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index); - GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name); - - // Common functions that may be obtained using ggml_backend_reg_get_proc_address - - // Split buffer type for tensor parallelism - typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split); - // Set the number of threads for the backend - typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads); - // Get additional buffer types provided by the device (returns a NULL-terminated array) - typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device); - // Set the abort callback for the backend - typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data); - // Get a list of feature flags supported by the backend (returns a NULL-terminated array) - struct ggml_backend_feature { - const char * name; - const char * value; - }; - typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg); - - // - // Backend registry - // - - GGML_API void ggml_backend_device_register(ggml_backend_dev_t device); - - // Backend (reg) enumeration - GGML_API size_t ggml_backend_reg_count(void); - GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index); - GGML_API ggml_backend_reg_t 
ggml_backend_reg_by_name(const char * name); - - // Device enumeration - GGML_API size_t ggml_backend_dev_count(void); - GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index); - GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name); - GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type); - - // Direct backend (stream) initialization - // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params) - GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params); - // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params) - GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params); - // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL) - GGML_API ggml_backend_t ggml_backend_init_best(void); - - // Load a backend from a dynamic library and register it - GGML_API ggml_backend_reg_t ggml_backend_load(const char * path); - // Unload a backend if loaded dynamically and unregister it - GGML_API void ggml_backend_unload(ggml_backend_reg_t reg); - // Load all known backends from dynamic libraries - GGML_API void ggml_backend_load_all(void); - GGML_API void ggml_backend_load_all_from_path(const char * dir_path); - - // - // Backend scheduler - // - - // The backend scheduler allows for multiple backend devices to be used together - // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends - // The backends are selected based on: - // - the backend that supports the operation - // - the location of the pre-allocated tensors (e.g. the weights) - /* - Example usage: - - // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned - // preferrably to run on the same backend as the buffer - ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - - sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false); - - // initialize buffers from a max size graph (optional) - reserve_graph = build_graph(sched, max_batch_size); - - // manually assign nodes to a backend (optional, should not be needed in most cases) - struct ggml_tensor * node = ggml_mul_mat(ctx, ...); - ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu); - - ggml_backend_sched_reserve(sched, reserve_graph); - - // compute - graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation - for (int i = 0; i < 10; ++i) { - ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically - } - - // if there are graph inputs: - graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called) - ggml_backend_sched_reset(sched); // clear the allocation of the previous graph - ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it - ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors - ggml_backend_sched_graph_compute(sched, graph); // execute the graph - - // as an alternative to the above it is also possible to assign the inputs to a dedicated context and - // allocate them statically via ggml_backend_alloc_ctx_tensors - } - */ - - typedef struct ggml_backend_sched * ggml_backend_sched_t; - - // Evaluation callback 
for each node in the graph (set with ggml_backend_sched_set_eval_callback) - // when ask == true, the scheduler wants to know if the user wants to observe this node - // this allows the scheduler to batch nodes together in order to evaluate them in a single call - // - // when ask == false, the scheduler is passing the node tensor to the user for observation - // if the user returns false, the scheduler will cancel the graph compute - // - typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); - - // Initialize a backend scheduler, backends with low index are given priority over backends with high index - GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel); - GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); - - // Initialize backend buffers from a measure graph - GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success - - GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched); - GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i); - - // Get the number of splits of the last graph - GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched); - GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched); - - GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); - - GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); - GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); - - // Allocate and compute graph on the backend scheduler - GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success - GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); - GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph); - GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched); - - // Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph. - // This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers. - // The correct way to use this API is to discard the deallocated tensors and create new ones. 
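    // A typical re-allocation sequence, condensed from the scheduler example above
    // (names such as `sched`, `graph`, `build_graph` and `input_tensor` are placeholders):
    //
    //     ggml_backend_sched_reset(sched);                          // previous graph's tensors become invalid
    //     graph = build_graph(sched);                               // build a fresh graph with new tensors
    //     ggml_backend_sched_alloc_graph(sched, graph);             // allocate the new graph
    //     ggml_backend_tensor_set(input_tensor, data, 0, nbytes);   // upload inputs into the new tensors
    //     ggml_backend_sched_graph_compute(sched, graph);           // compute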
- GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched); - - // Set a callback to be called for each resulting node during graph compute - GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data); - - // - // Utils - // - - struct ggml_backend_graph_copy { - ggml_backend_buffer_t buffer; - struct ggml_context * ctx_allocated; - struct ggml_context * ctx_unallocated; - struct ggml_cgraph * graph; - }; - - // Copy a graph to a different backend - GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph); - GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy); - - typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); - - // Compare the output of two backends - GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data); - - // Tensor initialization - GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr); - GGML_API enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor); - - // CPU buffer types are always available - GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); - GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void); - -#ifdef __cplusplus -} -#endif diff --git a/ggml/include/ggml-blas.h b/ggml/include/ggml-blas.h deleted file mode 100644 index 87a81b36..00000000 --- a/ggml/include/ggml-blas.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include "ggml.h" -#include "ggml-backend.h" - - -#ifdef __cplusplus -extern "C" { -#endif - -// backend API -GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void); - -GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend); - -// number of threads used for conversion to float -// for openblas and blis, this will also set the number of threads used for blas operations -GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); - -GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void); - - -#ifdef __cplusplus -} -#endif diff --git a/ggml/include/ggml-cann.h b/ggml/include/ggml-cann.h deleted file mode 100644 index b469e228..00000000 --- a/ggml/include/ggml-cann.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#pragma once - -#include "ggml-backend.h" -#include "ggml.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/** - * @brief Maximum number of CANN devices supported. - */ -#define GGML_CANN_MAX_DEVICES 16 - -GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void); - -/** - * @brief Initializes the CANN backend for a specified device. - * - * This function initializes the CANN backend for the given device. - * It verifies the device index, allocates a context, and creates a backend - * instance. - * - * @param device The index of the device to initialize. - * @return A pointer to the initialized backend instance, or nullptr on failure. - */ -GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device); - -/** - * @brief Checks if a given backend is a CANN backend. - * - * This function verifies if the provided backend is a CANN backend by comparing - * its GUID with the CANN backend's GUID. - * - * @param backend The backend instance to check. - * @return True if the backend is a CANN backend, false otherwise. - */ -GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend); - -/** - * @brief Retrieves the CANN buffer type for a specified device. - * - * This function initializes and returns the buffer type interface associated - * with the given device. It ensures thread-safe access using a mutex. - * - * @param device The device index for which to retrieve the buffer type. - * @return A pointer to the buffer type interface for the specified device, or - * nullptr if the device index is out of range. - */ -GGML_BACKEND_API ggml_backend_buffer_type_t -ggml_backend_cann_buffer_type(int32_t device); - -/** - * @brief Retrieves the number of CANN devices available. - * - * This function returns the number of CANN devices available based on - * information obtained from `ggml_cann_info()`. - * - * @return The number of CANN devices available. - */ -GGML_BACKEND_API int32_t ggml_backend_cann_get_device_count(void); - -/** - * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU. - * - * @return A pointer to the host buffer type interface. - */ -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void); - -/** - * @brief Retrieves the description of a specific CANN device. - * - * This function sets the specified device, retrieves the SoC name, - * and writes it into the provided description buffer. - * - * @param device The device index to retrieve the description for. - * @param description Pointer to a buffer where the description will be written. - * @param description_size Size of the description buffer. - */ -GGML_BACKEND_API void ggml_backend_cann_get_device_description( - int32_t device, char* description, size_t description_size); - -/** - * @brief Retrieves the memory information of a specific CANN device. - * - * This function sets the specified device, retrieves the free and total - * memory information of the specified type (ACL_HBM_MEM), and stores them - * in the provided pointers. - * - * @param device The device index to retrieve memory information for. - * @param free Pointer to a variable where the free memory size will be stored. 
- * @param total Pointer to a variable where the total memory size will be
- *              stored.
- */
-GGML_BACKEND_API void ggml_backend_cann_get_device_memory(int32_t device,
-                                                          size_t* free,
-                                                          size_t* total);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/ggml/include/ggml-cpp.h b/ggml/include/ggml-cpp.h
deleted file mode 100644
index a12342c2..00000000
--- a/ggml/include/ggml-cpp.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#pragma once
-
-#ifndef __cplusplus
-#error "This header is for C++ only"
-#endif
-
-#include "ggml.h"
-#include "ggml-alloc.h"
-#include "ggml-backend.h"
-#include "gguf.h"
-#include <memory>
-
-// Smart pointers for ggml types
-
-// ggml
-
-struct ggml_context_deleter { void operator()(ggml_context * ctx) { ggml_free(ctx); } };
-struct gguf_context_deleter { void operator()(gguf_context * ctx) { gguf_free(ctx); } };
-
-typedef std::unique_ptr<ggml_context, ggml_context_deleter> ggml_context_ptr;
-typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
-
-// ggml-alloc
-
-struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
-
-typedef std::unique_ptr<ggml_gallocr, ggml_gallocr_deleter> ggml_gallocr_ptr;
-
-// ggml-backend
-
-struct ggml_backend_deleter        { void operator()(ggml_backend_t backend)       { ggml_backend_free(backend); } };
-struct ggml_backend_buffer_deleter { void operator()(ggml_backend_buffer_t buffer) { ggml_backend_buffer_free(buffer); } };
-struct ggml_backend_event_deleter  { void operator()(ggml_backend_event_t event)   { ggml_backend_event_free(event); } };
-struct ggml_backend_sched_deleter  { void operator()(ggml_backend_sched_t sched)   { ggml_backend_sched_free(sched); } };
-
-typedef std::unique_ptr<ggml_backend, ggml_backend_deleter> ggml_backend_ptr;
-typedef std::unique_ptr<ggml_backend_buffer, ggml_backend_buffer_deleter> ggml_backend_buffer_ptr;
-typedef std::unique_ptr<ggml_backend_event, ggml_backend_event_deleter> ggml_backend_event_ptr;
-typedef std::unique_ptr<ggml_backend_sched, ggml_backend_sched_deleter> ggml_backend_sched_ptr;
diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
deleted file mode 100644
index f5e11f1e..00000000
--- a/ggml/include/ggml-cpu.h
+++ /dev/null
@@ -1,138 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-    // the compute plan that needs to be prepared for ggml_graph_compute()
-    // since https://github.com/ggml-org/ggml/issues/287
-    struct ggml_cplan {
-        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
-        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
-
-        int n_threads;
-        struct ggml_threadpool * threadpool;
-
-        // abort ggml_graph_compute when true
-        ggml_abort_callback abort_callback;
-        void *              abort_callback_data;
-    };
-
-    // numa strategies
-    enum ggml_numa_strategy {
-        GGML_NUMA_STRATEGY_DISABLED   = 0,
-        GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
-        GGML_NUMA_STRATEGY_ISOLATE    = 2,
-        GGML_NUMA_STRATEGY_NUMACTL    = 3,
-        GGML_NUMA_STRATEGY_MIRROR     = 4,
-        GGML_NUMA_STRATEGY_COUNT
-    };
-
-    GGML_BACKEND_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
-    GGML_BACKEND_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
-
-    GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
-    GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
-
-    GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
-    GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
-
-    GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
-    GGML_BACKEND_API void    ggml_set_i32_1d(const
struct ggml_tensor * tensor, int i, int32_t value); - - GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); - GGML_BACKEND_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value); - - GGML_BACKEND_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); - GGML_BACKEND_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); - - GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); - GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value); - - GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params); - GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); - GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool); - GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); - GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); - - // ggml_graph_plan() has to be called before ggml_graph_compute() - // when plan.work_size > 0, caller must allocate memory for plan.work_data - GGML_BACKEND_API struct ggml_cplan ggml_graph_plan( - const struct ggml_cgraph * cgraph, - int n_threads, /* = GGML_DEFAULT_N_THREADS */ - struct ggml_threadpool * threadpool /* = NULL */ ); - GGML_BACKEND_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); - - // same as ggml_graph_compute() but the work data is allocated as a part of the context - // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data - GGML_BACKEND_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); - - // - // system info - // - - // x86 - GGML_BACKEND_API int ggml_cpu_has_sse3 (void); - GGML_BACKEND_API int ggml_cpu_has_ssse3 (void); - GGML_BACKEND_API int ggml_cpu_has_avx (void); - GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void); - GGML_BACKEND_API int ggml_cpu_has_avx2 (void); - GGML_BACKEND_API int ggml_cpu_has_bmi2 (void); - GGML_BACKEND_API int ggml_cpu_has_f16c (void); - GGML_BACKEND_API int ggml_cpu_has_fma (void); - GGML_BACKEND_API int ggml_cpu_has_avx512 (void); - GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void); - GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void); - GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void); - GGML_BACKEND_API int ggml_cpu_has_amx_int8 (void); - // ARM - GGML_BACKEND_API int ggml_cpu_has_neon (void); - GGML_BACKEND_API int ggml_cpu_has_arm_fma (void); - GGML_BACKEND_API int ggml_cpu_has_fp16_va (void); - GGML_BACKEND_API int ggml_cpu_has_dotprod (void); - GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void); - GGML_BACKEND_API int ggml_cpu_has_sve (void); - GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes - GGML_BACKEND_API int ggml_cpu_has_sme (void); - // other - GGML_BACKEND_API int ggml_cpu_has_riscv_v (void); - GGML_BACKEND_API int ggml_cpu_has_vsx (void); - GGML_BACKEND_API int ggml_cpu_has_vxe (void); - GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void); - GGML_BACKEND_API int ggml_cpu_has_llamafile (void); - - // Internal types and functions exposed for tests and benchmarks - - typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, 
size_t bx, - const void * GGML_RESTRICT y, size_t by, int nrc); - - struct ggml_type_traits_cpu { - ggml_from_float_t from_float; - ggml_vec_dot_t vec_dot; - enum ggml_type vec_dot_type; - int64_t nrows; // number of rows to process simultaneously - }; - - GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type); - - GGML_BACKEND_API void ggml_cpu_init(void); - - // - // CPU backend - // - - GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void); - - GGML_BACKEND_API bool ggml_backend_is_cpu (ggml_backend_t backend); - GGML_BACKEND_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads); - GGML_BACKEND_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool); - GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); - - GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void); - -#ifdef __cplusplus -} -#endif diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h deleted file mode 100644 index 22ad2c00..00000000 --- a/ggml/include/ggml-cuda.h +++ /dev/null @@ -1,47 +0,0 @@ -#pragma once - -#include "ggml.h" -#include "ggml-backend.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef GGML_USE_HIP -#define GGML_CUDA_NAME "ROCm" -#define GGML_CUBLAS_NAME "hipBLAS" -#elif defined(GGML_USE_MUSA) -#define GGML_CUDA_NAME "MUSA" -#define GGML_CUBLAS_NAME "muBLAS" -#else -#define GGML_CUDA_NAME "CUDA" -#define GGML_CUBLAS_NAME "cuBLAS" -#endif -#define GGML_CUDA_MAX_DEVICES 16 - -// backend API -GGML_BACKEND_API ggml_backend_t ggml_backend_cuda_init(int device); - -GGML_BACKEND_API bool ggml_backend_is_cuda(ggml_backend_t backend); - -// device buffer -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); - -// split tensor buffer that splits matrices by rows across multiple devices -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split); - -// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); - -GGML_BACKEND_API int ggml_backend_cuda_get_device_count(void); -GGML_BACKEND_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size); -GGML_BACKEND_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total); - -GGML_BACKEND_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size); -GGML_BACKEND_API void ggml_backend_cuda_unregister_host_buffer(void * buffer); - -GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cuda_reg(void); - -#ifdef __cplusplus -} -#endif diff --git a/ggml/include/ggml-kompute.h b/ggml/include/ggml-kompute.h deleted file mode 100644 index 154aa56a..00000000 --- a/ggml/include/ggml-kompute.h +++ /dev/null @@ -1,50 +0,0 @@ -#pragma once - -#include "ggml.h" -#include "ggml-backend.h" - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define GGML_KOMPUTE_MAX_DEVICES 16 - -struct ggml_vk_device { - int index; - int type; // same as VkPhysicalDeviceType - size_t heapSize; - const char * name; - const char * vendor; - int subgroupSize; - uint64_t bufferAlignment; - uint64_t maxAlloc; -}; - -struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count); -bool ggml_vk_get_device(struct 
ggml_vk_device * device, size_t memoryRequired, const char * name); -bool ggml_vk_has_vulkan(void); -bool ggml_vk_has_device(void); -struct ggml_vk_device ggml_vk_current_device(void); - -// -// backend API -// - -// forward declaration -typedef struct ggml_backend * ggml_backend_t; - -GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device); - -GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend); - -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device); - -GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void); - -#ifdef __cplusplus -} -#endif diff --git a/ggml/include/ggml-metal.h b/ggml/include/ggml-metal.h deleted file mode 100644 index a6106944..00000000 --- a/ggml/include/ggml-metal.h +++ /dev/null @@ -1,66 +0,0 @@ -// Note: this description is outdated -// -// An interface allowing to compute ggml_cgraph with Metal -// -// This is a fully functional interface that extends ggml with GPU support for Apple devices. -// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, etc.) -// -// How it works? -// -// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this -// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you -// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.) -// -// You only need to make sure that all memory buffers that you used during the graph creation -// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is -// used during the graph evaluation to determine the arguments of the compute kernels. -// -// Synchronization between device and host memory (for example for input and output tensors) -// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions. 
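For reference, a minimal sketch (not part of the deleted header) of how user code typically drives this backend through the generic ggml-backend API, using ggml_backend_metal_init() declared just below together with ggml_backend_graph_compute()/ggml_backend_free() from ggml-backend.h; it assumes the graph's tensors were already allocated in a Metal-compatible backend buffer (e.g. via ggml-alloc):

    // minimal sketch: run an existing cgraph on the Metal backend
    // prerequisite (assumed): tensor data lives in a backend buffer
    #include "ggml.h"
    #include "ggml-backend.h"
    #include "ggml-metal.h"

    static enum ggml_status run_on_metal(struct ggml_cgraph * gf) {
        ggml_backend_t backend = ggml_backend_metal_init(); // declared below
        if (backend == NULL) {
            return GGML_STATUS_FAILED; // no Metal device available
        }
        enum ggml_status status = ggml_backend_graph_compute(backend, gf);
        ggml_backend_free(backend);
        return status;
    }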
-// - -#pragma once - -#include "ggml.h" -#include "ggml-backend.h" - -#include -#include - -struct ggml_tensor; -struct ggml_cgraph; - -#ifdef __cplusplus -extern "C" { -#endif - -// -// backend API -// user-code should use only these functions -// - -GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void); - -GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend); - -GGML_DEPRECATED( - GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size), - "obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713"); - -GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data); - -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); - -// helper to check if the device supports a specific family -// ideally, the user code should be doing these checks -// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf -GGML_BACKEND_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family); - -// capture all command buffers committed the next time `ggml_backend_graph_compute` is called -GGML_BACKEND_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend); - -GGML_BACKEND_API ggml_backend_reg_t ggml_backend_metal_reg(void); - -#ifdef __cplusplus -} -#endif diff --git a/ggml/include/ggml-opencl.h b/ggml/include/ggml-opencl.h deleted file mode 100644 index 6b617713..00000000 --- a/ggml/include/ggml-opencl.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef GGML_OPENCL_H -#define GGML_OPENCL_H - -#include "ggml.h" -#include "ggml-backend.h" - -#ifdef __cplusplus -extern "C" { -#endif - -// -// backend API -// -GGML_BACKEND_API ggml_backend_t ggml_backend_opencl_init(void); -GGML_BACKEND_API bool ggml_backend_is_opencl(ggml_backend_t backend); - -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void); -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void); - -GGML_BACKEND_API ggml_backend_reg_t ggml_backend_opencl_reg(void); - -#ifdef __cplusplus -} -#endif - -#endif // GGML_OPENCL_H diff --git a/ggml/include/ggml-opt.h b/ggml/include/ggml-opt.h deleted file mode 100644 index eb5eab9d..00000000 --- a/ggml/include/ggml-opt.h +++ /dev/null @@ -1,216 +0,0 @@ -// This file contains functionality for training models using GGML. -// It is not strictly needed vs. just vanilla GGML but it provides a more high-level interface for common needs such as datasets. -// At the bottom of this file especially there are relatively high-level functions that are suitable use or adaptation in user code. -// -// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de) - -#pragma once - -#include "ggml.h" -#include "ggml-backend.h" - -#include - -#ifdef __cplusplus -extern "C" { -#endif - - struct ggml_opt_dataset; - struct ggml_opt_context; - struct ggml_opt_result; - - typedef struct ggml_opt_dataset * ggml_opt_dataset_t; - typedef struct ggml_opt_context * ggml_opt_context_t; - typedef struct ggml_opt_result * ggml_opt_result_t; - - // ====== Loss ====== - - // built-in loss types, i.e. 
the built-in quantities minimized by the optimizer - // custom loss types can be defined via mean or sum which simply reduce the outputs for all datapoints to a single value - enum ggml_opt_loss_type { - GGML_OPT_LOSS_TYPE_MEAN, - GGML_OPT_LOSS_TYPE_SUM, - GGML_OPT_LOSS_TYPE_CROSS_ENTROPY, - GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR, - }; - - // ====== Dataset ====== - - GGML_API ggml_opt_dataset_t ggml_opt_dataset_init( - int64_t ne_datapoint, // number of elements per datapoint - int64_t ne_label, // number of elements per label - int64_t ndata, // total number of datapoints/labels - int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied) - GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset); - - // get underlying tensors that store the data - GGML_API struct ggml_tensor * ggml_opt_dataset_data (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata] - GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [nd_label, ndata] - - // shuffle idata first datapoints from dataset with RNG from opt_ctx, shuffle all datapoints if idata is negative - GGML_API void ggml_opt_dataset_shuffle(ggml_opt_context_t opt_ctx, ggml_opt_dataset_t dataset, int64_t idata); - - // get batch at position ibatch from dataset and copy the data to data_batch and labels_batch - GGML_API void ggml_opt_dataset_get_batch( - ggml_opt_dataset_t dataset, - struct ggml_tensor * data_batch, // shape = [ne_datapoint, ndata_batch] - struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch] - int64_t ibatch); - - // ====== Model / Context ====== - - enum ggml_opt_build_type { - GGML_OPT_BUILD_TYPE_FORWARD, - GGML_OPT_BUILD_TYPE_GRAD, - GGML_OPT_BUILD_TYPE_OPT, - }; - - // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss - struct ggml_opt_optimizer_params { - // AdamW optimizer parameters - struct { - float alpha; // learning rate - float beta1; - float beta2; - float eps; // epsilon for numerical stability - float wd; // weight decay for AdamW, use 0.0f to disable - } adamw; - }; - - // callback to calculate optimizer parameters prior to a backward pass - // userdata can be used to pass arbitrary data - typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata); - - // returns the default optimizer params (constant) - // userdata is not used - GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata); - - // parameters for initializing a new optimization context - struct ggml_opt_params { - ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs - - struct ggml_context * ctx_compute; // created in user code, holds non-static tensors - - // the forward graph is defined by inputs and outputs - // those tensors and all tensors inbetween are not intended to be reusable between multiple optimization contexts - struct ggml_tensor * inputs; - struct ggml_tensor * outputs; - - enum ggml_opt_loss_type loss_type; - enum ggml_opt_build_type build_type; - - int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done - - ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters - void * get_opt_pars_ud; // userdata for calculating optimizer parameters - }; - - // get parameters for an optimization context with defaults set where possible - // parameters for which no sensible defaults 
exist are supplied as arguments to this function - GGML_API ggml_opt_params ggml_opt_default_params( - ggml_backend_sched_t backend_sched, - struct ggml_context * ctx_compute, - struct ggml_tensor * inputs, - struct ggml_tensor * outputs, - enum ggml_opt_loss_type loss_type); - - GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params); - GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx); - - // set gradients to zero, initilize loss, and optionally reset the optimizer - GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer); - - // get underlying tensors that store data - GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor - GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor - GGML_API struct ggml_tensor * ggml_opt_labels( ggml_opt_context_t opt_ctx); // labels to compare outputs against - GGML_API struct ggml_tensor * ggml_opt_loss( ggml_opt_context_t opt_ctx); // scalar tensor that contains the loss - GGML_API struct ggml_tensor * ggml_opt_pred( ggml_opt_context_t opt_ctx); // predictions made by outputs - GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels - - GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node); - - // ====== Optimization Result ====== - - GGML_API ggml_opt_result_t ggml_opt_result_init(); - GGML_API void ggml_opt_result_free(ggml_opt_result_t result); - GGML_API void ggml_opt_result_reset(ggml_opt_result_t result); - - // get data from result, uncertainties are optional and can be ignored by passing NULL - GGML_API void ggml_opt_result_ndata( ggml_opt_result_t result, int64_t * ndata); // writes 1 value, number of datapoints - GGML_API void ggml_opt_result_loss( ggml_opt_result_t result, double * loss, double * unc); // writes 1 value - GGML_API void ggml_opt_result_pred( ggml_opt_result_t result, int32_t * pred); // writes ndata values - GGML_API void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, double * unc); // writes 1 value - - // ====== Computation ====== - - // do forward pass, increment result if not NULL - GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result); - - // do forward pass, increment result if not NULL, do backward pass - GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result); - - // ############################################################################ - // ## The high-level functions start here. They do not depend on any private ## - // ## functions or structs and can be copied to and adapted for user code. ## - // ############################################################################ - - // ====== Intended Usage ====== - // - // 1. Select the appropriate loss for your problem. - // 2. Create a dataset and set the data for the "data" tensor. Also set the "labels" tensor if your loss needs them. - // Setting the shard size to 1 will be fine, it's the granularity with which data is shuffled/loaded (bigger values are faster). - // 3. Create a GGML graph for your model with no_alloc == true. Use two separate contexts for the tensors. - // The first context should contain the model parameters and inputs and be allocated statically in user code. - // The second context should contain all other tensors and will be (re)allocated automatically. 
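As a concrete illustration of the workflow in steps 1-4 above, a minimal sketch that assumes `backend_sched`, `ctx_compute`, `inputs` and `outputs` have been created as described, and uses only the dataset and ggml_opt_fit() declarations from this header (the 784/10/60000 sizes are placeholder values):

    // placeholder dataset sizes; fill data/labels before fitting
    ggml_opt_dataset_t dataset = ggml_opt_dataset_init(
        /*ne_datapoint =*/ 784, /*ne_label    =*/ 10,
        /*ndata        =*/ 60000, /*ndata_shard =*/ 1);

    float * data   = ggml_get_data_f32(ggml_opt_dataset_data  (dataset));
    float * labels = ggml_get_data_f32(ggml_opt_dataset_labels(dataset));
    // ... fill data[] and labels[] here ...

    ggml_opt_fit(backend_sched, ctx_compute, inputs, outputs, dataset,
                 GGML_OPT_LOSS_TYPE_CROSS_ENTROPY,
                 ggml_opt_get_default_optimizer_params,
                 /*nepoch         =*/ 1,
                 /*nbatch_logical =*/ 32,
                 /*val_split      =*/ 0.05f,
                 /*silent         =*/ false);

    ggml_opt_dataset_free(dataset);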
- // Due to this automated allocation the data of the second context is not defined when accessed in user code. - // Note that the second dimension of the inputs/outputs are interpreted as the number of datapoints in those tensors. - // 4. Call ggml_opt_fit. If you need more control you can use ggml_opt_epoch instead. - - // signature for a callback while evaluating opt_ctx on dataset, called after an evaluation - typedef void (*ggml_opt_epoch_callback)( - bool train, // true after training evaluation, false after validation evaluation - ggml_opt_context_t opt_ctx, - ggml_opt_dataset_t dataset, - ggml_opt_result_t result, // result associated with the dataset subsection - int64_t ibatch, // number of batches that have been evaluated so far - int64_t ibatch_max, // total number of batches in this dataset subsection - int64_t t_start_us); // time at which the evaluation on the dataset subsection was started - - // do training on front of dataset, do evaluation only on back of dataset - GGML_API void ggml_opt_epoch( - ggml_opt_context_t opt_ctx, - ggml_opt_dataset_t dataset, - ggml_opt_result_t result_train, // result to increment during training, ignored if NULL - ggml_opt_result_t result_eval, // result to increment during evaluation, ignored if NULL - int64_t idata_split, // data index at which to split training and evaluation - ggml_opt_epoch_callback callback_train, - ggml_opt_epoch_callback callback_eval); - - // callback that prints a progress bar on stderr - GGML_API void ggml_opt_epoch_callback_progress_bar( - bool train, - ggml_opt_context_t opt_ctx, - ggml_opt_dataset_t dataset, - ggml_opt_result_t result, - int64_t ibatch, - int64_t ibatch_max, - int64_t t_start_us); - - // fit model defined by inputs and outputs to dataset - GGML_API void ggml_opt_fit( - ggml_backend_sched_t backend_sched, // backend scheduler for constructing the compute graphs - ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs - ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch] - ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used - ggml_opt_dataset_t dataset, // dataset with data and optionally also labels - enum ggml_opt_loss_type loss_type, // loss to minimize - ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t) - int64_t nepoch, // how many times the dataset should be iterated over - int64_t nbatch_logical, // datapoints optimizer step, must be a multiple of ndata_batch in inputs/outputs - float val_split, // fraction of the dataset to use for validation, must be in [0.0f, 1.0f) - bool silent); // whether or not info prints to stderr should be suppressed - -#ifdef __cplusplus -} -#endif diff --git a/ggml/include/ggml-rpc.h b/ggml/include/ggml-rpc.h deleted file mode 100644 index 4e0d210f..00000000 --- a/ggml/include/ggml-rpc.h +++ /dev/null @@ -1,30 +0,0 @@ -#pragma once - -#include "ggml.h" -#include "ggml-backend.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#define GGML_RPC_MAX_SERVERS 16 - -// backend API -GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint); -GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend); - -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint); - -GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total); - -GGML_BACKEND_API void 
ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, - const char * cache_dir, - size_t free_mem, size_t total_mem); - -GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void); - -GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint); - -#ifdef __cplusplus -} -#endif diff --git a/ggml/include/ggml-sycl.h b/ggml/include/ggml-sycl.h deleted file mode 100644 index 5ce349a8..00000000 --- a/ggml/include/ggml-sycl.h +++ /dev/null @@ -1,49 +0,0 @@ -// -// MIT license -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: MIT -// - -#pragma once - -#include "ggml.h" -#include "ggml-backend.h" - -#define GGML_SYCL_NAME "SYCL" -#define GGML_SYCL_MAX_DEVICES 48 - -#ifdef __cplusplus -extern "C" { -#endif - -// backend API -GGML_BACKEND_API ggml_backend_t ggml_backend_sycl_init(int device); - -GGML_BACKEND_API bool ggml_backend_is_sycl(ggml_backend_t backend); - -// devide buffer -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device); - -// split tensor buffer that splits matrices by rows across multiple devices -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split); - -// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void); - -GGML_BACKEND_API void ggml_backend_sycl_print_sycl_devices(void); -GGML_BACKEND_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len); -GGML_BACKEND_API void ggml_backend_sycl_get_device_description(int device, - char *description, - size_t description_size); -GGML_BACKEND_API int ggml_backend_sycl_get_device_count(); -GGML_BACKEND_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total); - -// SYCL doesn't support registering host memory, keep here for reference -// GGML_BACKEND_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size); -// GGML_BACKEND_API void ggml_backend_sycl_unregister_host_buffer(void * buffer); - -GGML_BACKEND_API ggml_backend_reg_t ggml_backend_sycl_reg(void); - -#ifdef __cplusplus -} -#endif diff --git a/ggml/include/ggml-vulkan.h b/ggml/include/ggml-vulkan.h deleted file mode 100644 index ed5ea5f7..00000000 --- a/ggml/include/ggml-vulkan.h +++ /dev/null @@ -1,29 +0,0 @@ -#pragma once - -#include "ggml.h" -#include "ggml-backend.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#define GGML_VK_NAME "Vulkan" -#define GGML_VK_MAX_DEVICES 16 - -// backend API -GGML_BACKEND_API ggml_backend_t ggml_backend_vk_init(size_t dev_num); - -GGML_BACKEND_API bool ggml_backend_is_vk(ggml_backend_t backend); -GGML_BACKEND_API int ggml_backend_vk_get_device_count(void); -GGML_BACKEND_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size); -GGML_BACKEND_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total); - -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num); -// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void); - -GGML_BACKEND_API ggml_backend_reg_t ggml_backend_vk_reg(void); - -#ifdef __cplusplus -} -#endif diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h deleted file mode 100644 index 452c967b..00000000 --- a/ggml/include/ggml.h +++ /dev/null @@ -1,2221 +0,0 @@ -#pragma once - -// 
-// GGML Tensor Library -// -// This documentation is still a work in progress. -// If you wish some specific topics to be covered, feel free to drop a comment: -// -// https://github.com/ggerganov/whisper.cpp/issues/40 -// -// ## Overview -// -// This library implements: -// -// - a set of tensor operations -// - automatic differentiation -// - basic optimization algorithms -// -// The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes, -// but is not limited to, the following: -// -// - linear regression -// - support vector machines -// - neural networks -// -// The library allows the user to define a certain function using the available tensor operations. This function -// definition is represented internally via a computation graph. Each tensor operation in the function definition -// corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the -// function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized -// using one of the available optimization algorithms. -// -// For example, here we define the function: f(x) = a*x^2 + b -// -// { -// struct ggml_init_params params = { -// .mem_size = 16*1024*1024, -// .mem_buffer = NULL, -// }; -// -// // memory allocation happens here -// struct ggml_context * ctx = ggml_init(params); -// -// struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); -// -// ggml_set_param(ctx, x); // x is an input variable -// -// struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); -// struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); -// struct ggml_tensor * x2 = ggml_mul(ctx, x, x); -// struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b); -// -// ... -// } -// -// Notice that the function definition above does not involve any actual computation. The computation is performed only -// when the user explicitly requests it. For example, to compute the function's value at x = 2.0: -// -// { -// ... -// -// struct ggml_cgraph * gf = ggml_new_graph(ctx); -// ggml_build_forward_expand(gf, f); -// -// // set the input variable and parameter values -// ggml_set_f32(x, 2.0f); -// ggml_set_f32(a, 3.0f); -// ggml_set_f32(b, 4.0f); -// -// ggml_graph_compute_with_ctx(ctx, &gf, n_threads); -// -// printf("f = %f\n", ggml_get_f32_1d(f, 0)); -// -// ... -// } -// -// The actual computation is performed in the ggml_graph_compute() function. -// -// The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the -// ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know -// in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory -// and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was -// actually needed. -// -// The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic -// differentiation and optimization algorithms. -// -// The described approach allows to define the function graph once and then compute its forward or backward graphs -// multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way -// the user can avoid the memory allocation overhead at runtime. -// -// The library supports multi-dimensional tensors - up to 4 dimensions. 
The FP16 and FP32 data types are first class -// citizens, but in theory the library can be extended to support FP8 and integer data types. -// -// Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary -// and binary operations. Most of the available operations fall into one of these two categories. With time, it became -// clear that the library needs to support more complex operations. The way to support these operations is not clear -// yet, but a few examples are demonstrated in the following operations: -// -// - ggml_permute() -// - ggml_conv_1d_1s() -// - ggml_conv_1d_2s() -// -// For each tensor operator, the library implements a forward and backward computation function. The forward function -// computes the output tensor value given the input tensor values. The backward function computes the adjoint of the -// input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a -// calculus class, or watch the following video: -// -// What is Automatic Differentiation? -// https://www.youtube.com/watch?v=wG_nF1awSSY -// -// -// ## Tensor data (struct ggml_tensor) -// -// The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of -// the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains -// pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example: -// -// { -// struct ggml_tensor * c = ggml_add(ctx, a, b); -// -// assert(c->src[0] == a); -// assert(c->src[1] == b); -// } -// -// The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the -// number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows -// to store tensors that are not contiguous in memory, which is useful for operations such as transposition and -// permutation. All tensor operations have to take the stride into account and not assume that the tensor is -// contiguous in memory. -// -// The data of the tensor is accessed via the "data" pointer. For example: -// -// { -// const int nx = 2; -// const int ny = 3; -// -// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny); -// -// for (int y = 0; y < ny; y++) { -// for (int x = 0; x < nx; x++) { -// *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y; -// } -// } -// -// ... -// } -// -// Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used. 
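For comparison, the same 2-D fill written with the 1-d helpers (declared in ggml-cpu.h earlier in this patch); this assumes `ctx`, `nx` and `ny` as in the example above and a contiguous tensor, for which the flat index of element (x, y) is y*nx + x:

    // same fill as the stride-based loop above, but via the 1-d helpers
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny);

    for (int y = 0; y < ny; y++) {
        for (int x = 0; x < nx; x++) {
            ggml_set_f32_1d(a, y*nx + x, (float)(x + y)); // row-major flat index
        }
    }

    const float v = ggml_get_f32_1d(a, 1); // element (x = 1, y = 0)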
-// -// ## The matrix multiplication operator (ggml_mul_mat) -// -// TODO -// -// -// ## Multi-threading -// -// TODO -// -// -// ## Overview of ggml.c -// -// TODO -// -// -// ## SIMD optimizations -// -// TODO -// -// -// ## Debugging ggml -// -// TODO -// -// - -#ifdef GGML_SHARED -# if defined(_WIN32) && !defined(__MINGW32__) -# ifdef GGML_BUILD -# define GGML_API __declspec(dllexport) extern -# else -# define GGML_API __declspec(dllimport) extern -# endif -# else -# define GGML_API __attribute__ ((visibility ("default"))) extern -# endif -#else -# define GGML_API extern -#endif - -// TODO: support for clang -#ifdef __GNUC__ -# define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint))) -#elif defined(_MSC_VER) -# define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func -#else -# define GGML_DEPRECATED(func, hint) func -#endif - -#ifndef __GNUC__ -# define GGML_ATTRIBUTE_FORMAT(...) -#elif defined(__MINGW32__) && !defined(__clang__) -# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) -#else -# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) -#endif - -#include -#include -#include -#include - -#define GGML_FILE_MAGIC 0x67676d6c // "ggml" -#define GGML_FILE_VERSION 2 - -#define GGML_QNT_VERSION 2 // bump this on quantization format changes -#define GGML_QNT_VERSION_FACTOR 1000 // do not change this - -#define GGML_MAX_DIMS 4 -#define GGML_MAX_PARAMS 2048 -#define GGML_MAX_SRC 10 -#define GGML_MAX_N_THREADS 512 -#define GGML_MAX_OP_PARAMS 64 - -#ifndef GGML_MAX_NAME -# define GGML_MAX_NAME 64 -#endif - -#define GGML_DEFAULT_N_THREADS 4 -#define GGML_DEFAULT_GRAPH_SIZE 2048 - -#if UINTPTR_MAX == 0xFFFFFFFF - #define GGML_MEM_ALIGN 4 -#else - #define GGML_MEM_ALIGN 16 -#endif - -#define GGML_EXIT_SUCCESS 0 -#define GGML_EXIT_ABORTED 1 - -#define GGML_ROPE_TYPE_NEOX 2 -#define GGML_ROPE_TYPE_MROPE 8 -#define GGML_ROPE_TYPE_VISION 24 - -#define GGML_UNUSED(x) (void)(x) - -#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) - -#ifndef NDEBUG -# define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0) -#elif defined(__GNUC__) -# define GGML_UNREACHABLE() __builtin_unreachable() -#elif defined(_MSC_VER) -# define GGML_UNREACHABLE() __assume(0) -#else -# define GGML_UNREACHABLE() ((void) 0) -#endif - -#ifdef __cplusplus -# define GGML_NORETURN [[noreturn]] -#elif defined(_MSC_VER) -# define GGML_NORETURN __declspec(noreturn) -#else -# define GGML_NORETURN _Noreturn -#endif - -#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__) -#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x) - -// used to copy the number of elements and stride in bytes of tensors into local variables. -// main purpose is to reduce code duplication and improve readability. 
-// -// example: -// -// GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); -// GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); -// -#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \ - const type prefix##0 = (pointer)->array[0]; \ - GGML_UNUSED(prefix##0); -#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \ - GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \ - const type prefix##1 = (pointer)->array[1]; \ - GGML_UNUSED(prefix##1); -#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \ - GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \ - const type prefix##2 = (pointer)->array[2]; \ - GGML_UNUSED(prefix##2); -#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \ - GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \ - const type prefix##3 = (pointer)->array[3]; \ - GGML_UNUSED(prefix##3); - -#define GGML_TENSOR_UNARY_OP_LOCALS \ - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ - GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ - GGML_TENSOR_LOCALS(size_t, nb, dst, nb) - -#define GGML_TENSOR_BINARY_OP_LOCALS \ - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ - GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ - GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ - GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \ - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ - GGML_TENSOR_LOCALS(size_t, nb, dst, nb) - -#define GGML_TENSOR_BINARY_OP_LOCALS01 \ - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ - GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ - GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ - GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) - -#ifdef __cplusplus -extern "C" { -#endif - - GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4) - GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...); - - enum ggml_status { - GGML_STATUS_ALLOC_FAILED = -2, - GGML_STATUS_FAILED = -1, - GGML_STATUS_SUCCESS = 0, - GGML_STATUS_ABORTED = 1, - }; - - // get ggml_status name string - GGML_API const char * ggml_status_to_string(enum ggml_status status); - - // ieee 754-2008 half-precision float16 - // todo: make this not an integral type - typedef uint16_t ggml_fp16_t; - GGML_API float ggml_fp16_to_fp32(ggml_fp16_t); - GGML_API ggml_fp16_t ggml_fp32_to_fp16(float); - GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t); - GGML_API void ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t); - - // google brain half-precision bfloat16 - typedef struct { uint16_t bits; } ggml_bf16_t; - GGML_API ggml_bf16_t ggml_fp32_to_bf16(float); - GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16 - GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t); - GGML_API void ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t); - GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t); - - struct ggml_object; - struct ggml_context; - struct ggml_cgraph; - - // NOTE: always add types at the end of the enum to keep backward compatibility - enum ggml_type { - GGML_TYPE_F32 = 0, - GGML_TYPE_F16 = 1, - GGML_TYPE_Q4_0 = 2, - GGML_TYPE_Q4_1 = 3, - // GGML_TYPE_Q4_2 = 4, support has been removed - // GGML_TYPE_Q4_3 = 5, support has been removed - GGML_TYPE_Q5_0 = 6, - GGML_TYPE_Q5_1 = 7, - GGML_TYPE_Q8_0 = 8, - GGML_TYPE_Q8_1 = 9, - GGML_TYPE_Q2_K = 10, - GGML_TYPE_Q3_K = 11, - GGML_TYPE_Q4_K = 12, - GGML_TYPE_Q5_K = 13, - GGML_TYPE_Q6_K = 14, - GGML_TYPE_Q8_K = 15, - GGML_TYPE_IQ2_XXS = 16, - GGML_TYPE_IQ2_XS = 17, - GGML_TYPE_IQ3_XXS = 18, - GGML_TYPE_IQ1_S = 19, - GGML_TYPE_IQ4_NL = 20, - 
GGML_TYPE_IQ3_S = 21, - GGML_TYPE_IQ2_S = 22, - GGML_TYPE_IQ4_XS = 23, - GGML_TYPE_I8 = 24, - GGML_TYPE_I16 = 25, - GGML_TYPE_I32 = 26, - GGML_TYPE_I64 = 27, - GGML_TYPE_F64 = 28, - GGML_TYPE_IQ1_M = 29, - GGML_TYPE_BF16 = 30, - // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files - // GGML_TYPE_Q4_0_4_8 = 32, - // GGML_TYPE_Q4_0_8_8 = 33, - GGML_TYPE_TQ1_0 = 34, - GGML_TYPE_TQ2_0 = 35, - // GGML_TYPE_IQ4_NL_4_4 = 36, - // GGML_TYPE_IQ4_NL_4_8 = 37, - // GGML_TYPE_IQ4_NL_8_8 = 38, - GGML_TYPE_COUNT = 39, - }; - - // precision - enum ggml_prec { - GGML_PREC_DEFAULT, - GGML_PREC_F32, - }; - - // model file types - enum ggml_ftype { - GGML_FTYPE_UNKNOWN = -1, - GGML_FTYPE_ALL_F32 = 0, - GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 - GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors - GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors - GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors - GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors - GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors - GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors - GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors - GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors - GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors - GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors - GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors - GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors - GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors - GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors - GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors - GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors - GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors - }; - - // available tensor operations: - enum ggml_op { - GGML_OP_NONE = 0, - - GGML_OP_DUP, - GGML_OP_ADD, - GGML_OP_ADD1, - GGML_OP_ACC, - GGML_OP_SUB, - GGML_OP_MUL, - GGML_OP_DIV, - GGML_OP_SQR, - GGML_OP_SQRT, - GGML_OP_LOG, - GGML_OP_SIN, - GGML_OP_COS, - GGML_OP_SUM, - GGML_OP_SUM_ROWS, - GGML_OP_MEAN, - GGML_OP_ARGMAX, - GGML_OP_COUNT_EQUAL, - GGML_OP_REPEAT, - GGML_OP_REPEAT_BACK, - GGML_OP_CONCAT, - GGML_OP_SILU_BACK, - GGML_OP_NORM, // normalize - GGML_OP_RMS_NORM, - GGML_OP_RMS_NORM_BACK, - GGML_OP_GROUP_NORM, - GGML_OP_L2_NORM, - - GGML_OP_MUL_MAT, - GGML_OP_MUL_MAT_ID, - GGML_OP_OUT_PROD, - - GGML_OP_SCALE, - GGML_OP_SET, - GGML_OP_CPY, - GGML_OP_CONT, - GGML_OP_RESHAPE, - GGML_OP_VIEW, - GGML_OP_PERMUTE, - GGML_OP_TRANSPOSE, - GGML_OP_GET_ROWS, - GGML_OP_GET_ROWS_BACK, - GGML_OP_DIAG, - GGML_OP_DIAG_MASK_INF, - GGML_OP_DIAG_MASK_ZERO, - GGML_OP_SOFT_MAX, - GGML_OP_SOFT_MAX_BACK, - GGML_OP_ROPE, - GGML_OP_ROPE_BACK, - GGML_OP_CLAMP, - GGML_OP_CONV_TRANSPOSE_1D, - GGML_OP_IM2COL, - GGML_OP_IM2COL_BACK, - GGML_OP_CONV_TRANSPOSE_2D, - GGML_OP_POOL_1D, - GGML_OP_POOL_2D, - GGML_OP_POOL_2D_BACK, - GGML_OP_UPSCALE, // nearest interpolate - GGML_OP_PAD, - GGML_OP_PAD_REFLECT_1D, - GGML_OP_ARANGE, - GGML_OP_TIMESTEP_EMBEDDING, - GGML_OP_ARGSORT, - GGML_OP_LEAKY_RELU, - - GGML_OP_FLASH_ATTN_EXT, - GGML_OP_FLASH_ATTN_BACK, - GGML_OP_SSM_CONV, - GGML_OP_SSM_SCAN, - GGML_OP_WIN_PART, - GGML_OP_WIN_UNPART, - GGML_OP_GET_REL_POS, - GGML_OP_ADD_REL_POS, - GGML_OP_RWKV_WKV6, - GGML_OP_GATED_LINEAR_ATTN, - GGML_OP_RWKV_WKV7, - - GGML_OP_UNARY, - - GGML_OP_MAP_UNARY, - GGML_OP_MAP_BINARY, - - GGML_OP_MAP_CUSTOM1_F32, - 
GGML_OP_MAP_CUSTOM2_F32, - GGML_OP_MAP_CUSTOM3_F32, - - GGML_OP_MAP_CUSTOM1, - GGML_OP_MAP_CUSTOM2, - GGML_OP_MAP_CUSTOM3, - - GGML_OP_CROSS_ENTROPY_LOSS, - GGML_OP_CROSS_ENTROPY_LOSS_BACK, - GGML_OP_OPT_STEP_ADAMW, - - GGML_OP_COUNT, - }; - - enum ggml_unary_op { - GGML_UNARY_OP_ABS, - GGML_UNARY_OP_SGN, - GGML_UNARY_OP_NEG, - GGML_UNARY_OP_STEP, - GGML_UNARY_OP_TANH, - GGML_UNARY_OP_ELU, - GGML_UNARY_OP_RELU, - GGML_UNARY_OP_SIGMOID, - GGML_UNARY_OP_GELU, - GGML_UNARY_OP_GELU_QUICK, - GGML_UNARY_OP_SILU, - GGML_UNARY_OP_HARDSWISH, - GGML_UNARY_OP_HARDSIGMOID, - GGML_UNARY_OP_EXP, - - GGML_UNARY_OP_COUNT, - }; - - enum ggml_object_type { - GGML_OBJECT_TYPE_TENSOR, - GGML_OBJECT_TYPE_GRAPH, - GGML_OBJECT_TYPE_WORK_BUFFER - }; - - enum ggml_log_level { - GGML_LOG_LEVEL_NONE = 0, - GGML_LOG_LEVEL_DEBUG = 1, - GGML_LOG_LEVEL_INFO = 2, - GGML_LOG_LEVEL_WARN = 3, - GGML_LOG_LEVEL_ERROR = 4, - GGML_LOG_LEVEL_CONT = 5, // continue previous log - }; - - // this tensor... - enum ggml_tensor_flag { - GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph - GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph - GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters - GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up) - }; - - struct ggml_init_params { - // memory pool - size_t mem_size; // bytes - void * mem_buffer; // if NULL, memory will be allocated internally - bool no_alloc; // don't allocate memory for the tensor data - }; - - // n-dimensional tensor - struct ggml_tensor { - enum ggml_type type; - - struct ggml_backend_buffer * buffer; - - int64_t ne[GGML_MAX_DIMS]; // number of elements - size_t nb[GGML_MAX_DIMS]; // stride in bytes: - // nb[0] = ggml_type_size(type) - // nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding - // nb[i] = nb[i-1] * ne[i-1] - - // compute data - enum ggml_op op; - - // op params - allocated as int32_t for alignment - int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; - - int32_t flags; - - struct ggml_tensor * src[GGML_MAX_SRC]; - - // source tensor and offset for views - struct ggml_tensor * view_src; - size_t view_offs; - - void * data; - - char name[GGML_MAX_NAME]; - - void * extra; // extra things e.g. 
for ggml-cuda.cu - - char padding[8]; - }; - - static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); - - // Abort callback - // If not NULL, called before ggml computation - // If it returns true, the computation is aborted - typedef bool (*ggml_abort_callback)(void * data); - - - // - // GUID - // - - // GUID types - typedef uint8_t ggml_guid[16]; - typedef ggml_guid * ggml_guid_t; - - GGML_API bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b); - - // misc - - GGML_API void ggml_time_init(void); // call this once at the beginning of the program - GGML_API int64_t ggml_time_ms(void); - GGML_API int64_t ggml_time_us(void); - GGML_API int64_t ggml_cycles(void); - GGML_API int64_t ggml_cycles_per_ms(void); - - // accepts a UTF-8 path, even on Windows - GGML_API FILE * ggml_fopen(const char * fname, const char * mode); - - GGML_API void ggml_print_object (const struct ggml_object * obj); - GGML_API void ggml_print_objects(const struct ggml_context * ctx); - - GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor); - GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor); - GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor); - GGML_API size_t ggml_nbytes_pad(const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN - - GGML_API int64_t ggml_blck_size(enum ggml_type type); - GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block - GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row - - GGML_DEPRECATED( - GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float - "use ggml_row_size() instead"); - - GGML_API const char * ggml_type_name(enum ggml_type type); - GGML_API const char * ggml_op_name (enum ggml_op op); - GGML_API const char * ggml_op_symbol(enum ggml_op op); - - GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op); - GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name - - GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor); - - GGML_API bool ggml_is_quantized(enum ggml_type type); - - // TODO: temporary until model loading of ggml examples is refactored - GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype); - - GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor); - GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor); - GGML_API bool ggml_is_empty (const struct ggml_tensor * tensor); - GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor); - GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor); - GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor); - GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor); - GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars - - GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor); - GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous() - GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1 - GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2 - - GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1); - GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1); - - GGML_API bool ggml_can_repeat(const 
struct ggml_tensor * t0, const struct ggml_tensor * t1); - - // use this to compute the memory overhead of a tensor - GGML_API size_t ggml_tensor_overhead(void); - - GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes); - - // main - - GGML_API struct ggml_context * ggml_init (struct ggml_init_params params); - GGML_API void ggml_reset(struct ggml_context * ctx); - GGML_API void ggml_free (struct ggml_context * ctx); - - GGML_API size_t ggml_used_mem(const struct ggml_context * ctx); - - GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx); - GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc); - - GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx); - GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx); - GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx); - - GGML_API struct ggml_tensor * ggml_new_tensor( - struct ggml_context * ctx, - enum ggml_type type, - int n_dims, - const int64_t *ne); - - GGML_API struct ggml_tensor * ggml_new_tensor_1d( - struct ggml_context * ctx, - enum ggml_type type, - int64_t ne0); - - GGML_API struct ggml_tensor * ggml_new_tensor_2d( - struct ggml_context * ctx, - enum ggml_type type, - int64_t ne0, - int64_t ne1); - - GGML_API struct ggml_tensor * ggml_new_tensor_3d( - struct ggml_context * ctx, - enum ggml_type type, - int64_t ne0, - int64_t ne1, - int64_t ne2); - - GGML_API struct ggml_tensor * ggml_new_tensor_4d( - struct ggml_context * ctx, - enum ggml_type type, - int64_t ne0, - int64_t ne1, - int64_t ne2, - int64_t ne3); - - GGML_API void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes); - - GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); - GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src); - - // Context tensor enumeration and lookup - GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx); - GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor); - GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); - - // Converts a flat index into coordinates - GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3); - - GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor); - - GGML_API void * ggml_get_data (const struct ggml_tensor * tensor); - GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); - - GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor); - GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name); - GGML_ATTRIBUTE_FORMAT(2, 3) - GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...); - - // Tensor flags - GGML_API void ggml_set_input(struct ggml_tensor * tensor); - GGML_API void ggml_set_output(struct ggml_tensor * tensor); - GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor); - GGML_API void ggml_set_loss(struct ggml_tensor * tensor); - - // - // operations on tensors with backpropagation - // - - GGML_API struct ggml_tensor * ggml_dup( - struct ggml_context * ctx, - struct ggml_tensor * a); - - // in-place, returns view(a) - GGML_API struct ggml_tensor * ggml_dup_inplace( - struct ggml_context * ctx, - struct ggml_tensor * 
a); - - GGML_API struct ggml_tensor * ggml_add( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - - GGML_API struct ggml_tensor * ggml_add_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - - GGML_API struct ggml_tensor * ggml_add_cast( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - enum ggml_type type); - - GGML_API struct ggml_tensor * ggml_add1( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - - GGML_API struct ggml_tensor * ggml_add1_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - - // dst = a - // view(dst, nb1, nb2, nb3, offset) += b - // return dst - GGML_API struct ggml_tensor * ggml_acc( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - size_t nb1, - size_t nb2, - size_t nb3, - size_t offset); - - GGML_API struct ggml_tensor * ggml_acc_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - size_t nb1, - size_t nb2, - size_t nb3, - size_t offset); - - GGML_API struct ggml_tensor * ggml_sub( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - - GGML_API struct ggml_tensor * ggml_sub_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - - GGML_API struct ggml_tensor * ggml_mul( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - - GGML_API struct ggml_tensor * ggml_mul_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - - GGML_API struct ggml_tensor * ggml_div( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - - GGML_API struct ggml_tensor * ggml_div_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - - GGML_API struct ggml_tensor * ggml_sqr( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_sqr_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_sqrt( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_sqrt_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_log( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_log_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_sin( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_sin_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_cos( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_cos_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - - // return scalar - GGML_API struct ggml_tensor * ggml_sum( - struct ggml_context * ctx, - struct ggml_tensor * a); - - // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d] - GGML_API struct ggml_tensor * ggml_sum_rows( - struct ggml_context * ctx, - struct ggml_tensor * a); - - // mean along rows - GGML_API struct ggml_tensor * ggml_mean( - struct ggml_context * ctx, - struct ggml_tensor * a); - - // argmax along rows - GGML_API struct ggml_tensor * ggml_argmax( - struct ggml_context * ctx, - struct ggml_tensor * a); - - // count number of equal elements in a and b - 
GGML_API struct ggml_tensor * ggml_count_equal( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - - // if a is the same shape as b, and a is not parameter, return a - // otherwise, return a new tensor: repeat(a) to fit in b - GGML_API struct ggml_tensor * ggml_repeat( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - - // sums repetitions in a into shape of b - GGML_API struct ggml_tensor * ggml_repeat_back( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - - // concat a and b along dim - // used in stable-diffusion - GGML_API struct ggml_tensor * ggml_concat( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int dim); - - GGML_API struct ggml_tensor * ggml_abs( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_abs_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_sgn( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_sgn_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_neg( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_neg_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_step( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_step_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_tanh( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_tanh_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_elu( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_elu_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_relu( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_leaky_relu( - struct ggml_context * ctx, - struct ggml_tensor * a, float negative_slope, bool inplace); - - GGML_API struct ggml_tensor * ggml_relu_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_sigmoid( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_sigmoid_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_gelu( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_gelu_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_gelu_quick( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_gelu_quick_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_silu( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_silu_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - - // a - x - // b - dy - GGML_API struct ggml_tensor * ggml_silu_back( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - - // hardswish(x) = x * relu6(x + 3) / 6 - GGML_API struct ggml_tensor * ggml_hardswish( - struct ggml_context * ctx, - struct ggml_tensor * a); - - // hardsigmoid(x) = 
relu6(x + 3) / 6 - GGML_API struct ggml_tensor * ggml_hardsigmoid( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_exp( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_exp_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - - // normalize along rows - GGML_API struct ggml_tensor * ggml_norm( - struct ggml_context * ctx, - struct ggml_tensor * a, - float eps); - - GGML_API struct ggml_tensor * ggml_norm_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - float eps); - - GGML_API struct ggml_tensor * ggml_rms_norm( - struct ggml_context * ctx, - struct ggml_tensor * a, - float eps); - - GGML_API struct ggml_tensor * ggml_rms_norm_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - float eps); - - // group normalize along ne0*ne1*n_groups - // used in stable-diffusion - GGML_API struct ggml_tensor * ggml_group_norm( - struct ggml_context * ctx, - struct ggml_tensor * a, - int n_groups, - float eps); - - GGML_API struct ggml_tensor * ggml_group_norm_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - int n_groups, - float eps); - - // l2 normalize along rows - // used in rwkv v7 - GGML_API struct ggml_tensor * ggml_l2_norm( - struct ggml_context * ctx, - struct ggml_tensor * a, - float eps); - - GGML_API struct ggml_tensor * ggml_l2_norm_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - float eps); - - // a - x - // b - dy - GGML_API struct ggml_tensor * ggml_rms_norm_back( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - float eps); - - // A: k columns, n rows => [ne03, ne02, n, k] - // B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k] - // result is n columns, m rows => [ne03 * x, ne02 * y, m, n] - GGML_API struct ggml_tensor * ggml_mul_mat( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - - // change the precision of a matrix multiplication - // set to GGML_PREC_F32 for higher precision (useful for phi-2) - GGML_API void ggml_mul_mat_set_prec( - struct ggml_tensor * a, - enum ggml_prec prec); - - // indirect matrix multiplication - GGML_API struct ggml_tensor * ggml_mul_mat_id( - struct ggml_context * ctx, - struct ggml_tensor * as, - struct ggml_tensor * b, - struct ggml_tensor * ids); - - // A: m columns, n rows, - // B: p columns, n rows, - // result is m columns, p rows - GGML_API struct ggml_tensor * ggml_out_prod( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - - // - // operations on tensors without backpropagation - // - - GGML_API struct ggml_tensor * ggml_scale( - struct ggml_context * ctx, - struct ggml_tensor * a, - float s); - - // in-place, returns view(a) - GGML_API struct ggml_tensor * ggml_scale_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - float s); - - // b -> view(a,offset,nb1,nb2,3), return modified a - GGML_API struct ggml_tensor * ggml_set( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - size_t nb1, - size_t nb2, - size_t nb3, - size_t offset); // in bytes - - // b -> view(a,offset,nb1,nb2,3), return view(a) - GGML_API struct ggml_tensor * ggml_set_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - size_t nb1, - size_t nb2, - size_t nb3, - size_t offset); // in bytes - - GGML_API struct ggml_tensor * ggml_set_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, - 
struct ggml_tensor * b, - size_t offset); // in bytes - - GGML_API struct ggml_tensor * ggml_set_1d_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - size_t offset); // in bytes - - // b -> view(a,offset,nb1,nb2,3), return modified a - GGML_API struct ggml_tensor * ggml_set_2d( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - size_t nb1, - size_t offset); // in bytes - - // b -> view(a,offset,nb1,nb2,3), return view(a) - GGML_API struct ggml_tensor * ggml_set_2d_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - size_t nb1, - size_t offset); // in bytes - - // a -> b, return view(b) - GGML_API struct ggml_tensor * ggml_cpy( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - - GGML_API struct ggml_tensor * ggml_cast( - struct ggml_context * ctx, - struct ggml_tensor * a, - enum ggml_type type); - - // make contiguous - GGML_API struct ggml_tensor * ggml_cont( - struct ggml_context * ctx, - struct ggml_tensor * a); - - // make contiguous, with new shape - GGML_API struct ggml_tensor * ggml_cont_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0); - - GGML_API struct ggml_tensor * ggml_cont_2d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - int64_t ne1); - - GGML_API struct ggml_tensor * ggml_cont_3d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - int64_t ne1, - int64_t ne2); - - GGML_API struct ggml_tensor * ggml_cont_4d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - int64_t ne1, - int64_t ne2, - int64_t ne3); - - // return view(a), b specifies the new shape - // TODO: when we start computing gradient, make a copy instead of view - GGML_API struct ggml_tensor * ggml_reshape( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - - // return view(a) - // TODO: when we start computing gradient, make a copy instead of view - GGML_API struct ggml_tensor * ggml_reshape_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0); - - GGML_API struct ggml_tensor * ggml_reshape_2d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - int64_t ne1); - - // return view(a) - // TODO: when we start computing gradient, make a copy instead of view - GGML_API struct ggml_tensor * ggml_reshape_3d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - int64_t ne1, - int64_t ne2); - - GGML_API struct ggml_tensor * ggml_reshape_4d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - int64_t ne1, - int64_t ne2, - int64_t ne3); - - // offset in bytes - GGML_API struct ggml_tensor * ggml_view_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - size_t offset); - - GGML_API struct ggml_tensor * ggml_view_2d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - int64_t ne1, - size_t nb1, // row stride in bytes - size_t offset); - - GGML_API struct ggml_tensor * ggml_view_3d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - int64_t ne1, - int64_t ne2, - size_t nb1, // row stride in bytes - size_t nb2, // slice stride in bytes - size_t offset); - - GGML_API struct ggml_tensor * ggml_view_4d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - int64_t ne1, - int64_t ne2, - int64_t ne3, - size_t nb1, // row stride in bytes - size_t nb2, // slice stride in bytes - size_t nb3, - size_t offset); - 
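A short sketch of how the view and cont operators above are typically combined, assuming a context `ctx`, sizes `ne0`/`ne1` and a starting row `r0` defined elsewhere:

    // take a 2-row view of a contiguous [ne0, ne1] F32 matrix starting at
    // row r0, reusing the byte strides stored in the tensor itself
    struct ggml_tensor * m    = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne0, ne1);
    struct ggml_tensor * rows = ggml_view_2d(ctx, m,
            ne0, 2,        // new shape: ne0 columns, 2 rows
            m->nb[1],      // row stride in bytes (unchanged)
            r0*m->nb[1]);  // byte offset of the first viewed row

    // a transposed view is not contiguous; ggml_cont() materializes a copy
    struct ggml_tensor * mt = ggml_cont(ctx, ggml_transpose(ctx, m));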
- GGML_API struct ggml_tensor * ggml_permute( - struct ggml_context * ctx, - struct ggml_tensor * a, - int axis0, - int axis1, - int axis2, - int axis3); - - // alias for ggml_permute(ctx, a, 1, 0, 2, 3) - GGML_API struct ggml_tensor * ggml_transpose( - struct ggml_context * ctx, - struct ggml_tensor * a); - - // supports 3D: a->ne[2] == b->ne[1] - GGML_API struct ggml_tensor * ggml_get_rows( - struct ggml_context * ctx, - struct ggml_tensor * a, // data - struct ggml_tensor * b); // row indices - - GGML_API struct ggml_tensor * ggml_get_rows_back( - struct ggml_context * ctx, - struct ggml_tensor * a, // gradients of ggml_get_rows result - struct ggml_tensor * b, // row indices - struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape - - GGML_API struct ggml_tensor * ggml_diag( - struct ggml_context * ctx, - struct ggml_tensor * a); - - // set elements above the diagonal to -INF - GGML_API struct ggml_tensor * ggml_diag_mask_inf( - struct ggml_context * ctx, - struct ggml_tensor * a, - int n_past); - - // in-place, returns view(a) - GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - int n_past); - - // set elements above the diagonal to 0 - GGML_API struct ggml_tensor * ggml_diag_mask_zero( - struct ggml_context * ctx, - struct ggml_tensor * a, - int n_past); - - // in-place, returns view(a) - GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - int n_past); - - GGML_API struct ggml_tensor * ggml_soft_max( - struct ggml_context * ctx, - struct ggml_tensor * a); - - // in-place, returns view(a) - GGML_API struct ggml_tensor * ggml_soft_max_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - - // fused soft_max(a*scale + mask*(ALiBi slope)) - // mask is optional - // max_bias = 0.0f for no ALiBi - GGML_API struct ggml_tensor * ggml_soft_max_ext( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * mask, - float scale, - float max_bias); - - GGML_API struct ggml_tensor * ggml_soft_max_ext_back( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - float scale, - float max_bias); - - // in-place, returns view(a) - GGML_API struct ggml_tensor * ggml_soft_max_ext_back_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - float scale, - float max_bias); - - // rotary position embedding - // if (mode & 1) - skip n_past elements (NOT SUPPORTED) - // if (mode & GGML_ROPE_TYPE_NEOX) - GPT-NeoX style - // - // b is an int32 vector with size a->ne[2], it contains the positions - GGML_API struct ggml_tensor * ggml_rope( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int n_dims, - int mode); - - // in-place, returns view(a) - GGML_API struct ggml_tensor * ggml_rope_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int n_dims, - int mode); - - // custom RoPE - // c is freq factors (e.g. 
phi3-128k), (optional) - GGML_API struct ggml_tensor * ggml_rope_ext( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - struct ggml_tensor * c, - int n_dims, - int mode, - int n_ctx_orig, - float freq_base, - float freq_scale, - float ext_factor, - float attn_factor, - float beta_fast, - float beta_slow); - - GGML_API struct ggml_tensor * ggml_rope_multi( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - struct ggml_tensor * c, - int n_dims, - int sections[4], - int mode, - int n_ctx_orig, - float freq_base, - float freq_scale, - float ext_factor, - float attn_factor, - float beta_fast, - float beta_slow); - - // in-place, returns view(a) - GGML_API struct ggml_tensor * ggml_rope_ext_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - struct ggml_tensor * c, - int n_dims, - int mode, - int n_ctx_orig, - float freq_base, - float freq_scale, - float ext_factor, - float attn_factor, - float beta_fast, - float beta_slow); - - GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int n_dims, - int mode, - int n_ctx_orig, - float freq_base, - float freq_scale, - float ext_factor, - float attn_factor, - float beta_fast, - float beta_slow), - "use ggml_rope_ext instead"); - - GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int n_dims, - int mode, - int n_ctx_orig, - float freq_base, - float freq_scale, - float ext_factor, - float attn_factor, - float beta_fast, - float beta_slow), - "use ggml_rope_ext_inplace instead"); - - // compute correction dims for YaRN RoPE scaling - GGML_API void ggml_rope_yarn_corr_dims( - int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]); - - // rotary position embedding backward, i.e compute dx from dy - // a - dy - GGML_API struct ggml_tensor * ggml_rope_ext_back( - struct ggml_context * ctx, - struct ggml_tensor * a, // gradients of ggml_rope result - struct ggml_tensor * b, // positions - struct ggml_tensor * c, // freq factors - int n_dims, - int mode, - int n_ctx_orig, - float freq_base, - float freq_scale, - float ext_factor, - float attn_factor, - float beta_fast, - float beta_slow); - - GGML_API struct ggml_tensor * ggml_rope_multi_back( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - struct ggml_tensor * c, - int n_dims, - int sections[4], - int mode, - int n_ctx_orig, - float freq_base, - float freq_scale, - float ext_factor, - float attn_factor, - float beta_fast, - float beta_slow); - - - // clamp - // in-place, returns view(a) - GGML_API struct ggml_tensor * ggml_clamp( - struct ggml_context * ctx, - struct ggml_tensor * a, - float min, - float max); - - // im2col - // converts data into a format that effectively results in a convolution when combined with matrix multiplication - GGML_API struct ggml_tensor * ggml_im2col( - struct ggml_context * ctx, - struct ggml_tensor * a, // convolution kernel - struct ggml_tensor * b, // data - int s0, // stride dimension 0 - int s1, // stride dimension 1 - int p0, // padding dimension 0 - int p1, // padding dimension 1 - int d0, // dilation dimension 0 - int d1, // dilation dimension 1 - bool is_2D, - enum ggml_type dst_type); - - GGML_API struct ggml_tensor * ggml_im2col_back( - struct ggml_context * ctx, - struct ggml_tensor * a, // 
convolution kernel - struct ggml_tensor * b, // gradient of im2col output - int64_t * ne, // shape of im2col input - int s0, // stride dimension 0 - int s1, // stride dimension 1 - int p0, // padding dimension 0 - int p1, // padding dimension 1 - int d0, // dilation dimension 0 - int d1, // dilation dimension 1 - bool is_2D); - - GGML_API struct ggml_tensor * ggml_conv_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, // convolution kernel - struct ggml_tensor * b, // data - int s0, // stride - int p0, // padding - int d0); // dilation - - // conv_1d with padding = half - // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d) - GGML_API struct ggml_tensor* ggml_conv_1d_ph( - struct ggml_context * ctx, - struct ggml_tensor * a, // convolution kernel - struct ggml_tensor * b, // data - int s, // stride - int d); // dilation - - // depthwise - // TODO: this is very likely wrong for some cases! - needs more testing - GGML_API struct ggml_tensor * ggml_conv_1d_dw( - struct ggml_context * ctx, - struct ggml_tensor * a, // convolution kernel - struct ggml_tensor * b, // data - int s0, // stride - int p0, // padding - int d0); // dilation - - GGML_API struct ggml_tensor * ggml_conv_1d_dw_ph( - struct ggml_context * ctx, - struct ggml_tensor * a, // convolution kernel - struct ggml_tensor * b, // data - int s0, // stride - int d0); // dilation - - GGML_API struct ggml_tensor * ggml_conv_transpose_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, // convolution kernel - struct ggml_tensor * b, // data - int s0, // stride - int p0, // padding - int d0); // dilation - - GGML_API struct ggml_tensor * ggml_conv_2d( - struct ggml_context * ctx, - struct ggml_tensor * a, // convolution kernel - struct ggml_tensor * b, // data - int s0, // stride dimension 0 - int s1, // stride dimension 1 - int p0, // padding dimension 0 - int p1, // padding dimension 1 - int d0, // dilation dimension 0 - int d1); // dilation dimension 1 - - // kernel size is a->ne[0] x a->ne[1] - // stride is equal to kernel size - // padding is zero - // example: - // a: 16 16 3 768 - // b: 1024 1024 3 1 - // res: 64 64 768 1 - // used in sam - GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - - // kernel size is a->ne[0] x a->ne[1] - // stride is 1 - // padding is half - // example: - // a: 3 3 256 256 - // b: 64 64 256 1 - // res: 64 64 256 1 - // used in sam - GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - - // depthwise - GGML_API struct ggml_tensor * ggml_conv_2d_dw( - struct ggml_context * ctx, - struct ggml_tensor * a, // convolution kernel - struct ggml_tensor * b, // data - int s0, // stride dimension 0 - int s1, // stride dimension 1 - int p0, // padding dimension 0 - int p1, // padding dimension 1 - int d0, // dilation dimension 0 - int d1); // dilation dimension 1 - - GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int stride); - - enum ggml_op_pool { - GGML_OP_POOL_MAX, - GGML_OP_POOL_AVG, - GGML_OP_POOL_COUNT, - }; - - GGML_API struct ggml_tensor * ggml_pool_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, - enum ggml_op_pool op, - int k0, // kernel size - int s0, // stride - int p0); // padding - - // the result will have 2*p0 padding for the first dimension - // and 2*p1 padding for the second dimension - GGML_API struct ggml_tensor * 
ggml_pool_2d( - struct ggml_context * ctx, - struct ggml_tensor * a, - enum ggml_op_pool op, - int k0, - int k1, - int s0, - int s1, - float p0, - float p1); - - GGML_API struct ggml_tensor * ggml_pool_2d_back( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * af, // "a"/input used in forward pass - enum ggml_op_pool op, - int k0, - int k1, - int s0, - int s1, - float p0, - float p1); - - // nearest interpolate - // multiplies ne0 and ne1 by scale factor - // used in stable-diffusion - GGML_API struct ggml_tensor * ggml_upscale( - struct ggml_context * ctx, - struct ggml_tensor * a, - int scale_factor); - - // nearest interpolate - // nearest interpolate to specified dimensions - // used in tortoise.cpp - GGML_API struct ggml_tensor * ggml_upscale_ext( - struct ggml_context * ctx, - struct ggml_tensor * a, - int ne0, - int ne1, - int ne2, - int ne3); - - // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0] - GGML_API struct ggml_tensor * ggml_pad( - struct ggml_context * ctx, - struct ggml_tensor * a, - int p0, - int p1, - int p2, - int p3); - - // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c] - GGML_API struct ggml_tensor * ggml_pad_reflect_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int p0, - int p1); - - // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151 - // timesteps: [N,] - // return: [N, dim] - GGML_API struct ggml_tensor * ggml_timestep_embedding( - struct ggml_context * ctx, - struct ggml_tensor * timesteps, - int dim, - int max_period); - - // sort rows - enum ggml_sort_order { - GGML_SORT_ORDER_ASC, - GGML_SORT_ORDER_DESC, - }; - - GGML_API struct ggml_tensor * ggml_argsort( - struct ggml_context * ctx, - struct ggml_tensor * a, - enum ggml_sort_order order); - - GGML_API struct ggml_tensor * ggml_arange( - struct ggml_context * ctx, - float start, - float stop, - float step); - - // top k elements per row - GGML_API struct ggml_tensor * ggml_top_k( - struct ggml_context * ctx, - struct ggml_tensor * a, - int k); - -#define GGML_KQ_MASK_PAD 64 - - // q: [n_embd_k, n_batch, n_head, 1] - // k: [n_embd_k, n_kv, n_head_kv, 1] - // v: [n_embd_v, n_kv, n_head_kv, 1] !! not transposed !! - // mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !! - // res: [n_embd_v, n_head, n_batch, 1] !! permuted !! 
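    // purely illustrative (a sketch; ctx, q, k, v, mask and n_embd_k are assumed to be
    // set up with the shapes described above, and are not defined here):
    //
    //     struct ggml_tensor * kqv = ggml_flash_attn_ext(ctx, q, k, v, mask,
    //                                     1.0f/sqrtf((float) n_embd_k), 0.0f, 0.0f);
    //     ggml_flash_attn_ext_set_prec(kqv, GGML_PREC_F32); // optionally force F32 accumulation
    //
    // scale is typically 1/sqrt(head size); max_bias = 0.0f disables ALiBi and
    // logit_softcap = 0.0f disables logit soft-capping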
- GGML_API struct ggml_tensor * ggml_flash_attn_ext( - struct ggml_context * ctx, - struct ggml_tensor * q, - struct ggml_tensor * k, - struct ggml_tensor * v, - struct ggml_tensor * mask, - float scale, - float max_bias, - float logit_softcap); - - GGML_API void ggml_flash_attn_ext_set_prec( - struct ggml_tensor * a, - enum ggml_prec prec); - - GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec( - const struct ggml_tensor * a); - - // TODO: needs to be adapted to ggml_flash_attn_ext - GGML_API struct ggml_tensor * ggml_flash_attn_back( - struct ggml_context * ctx, - struct ggml_tensor * q, - struct ggml_tensor * k, - struct ggml_tensor * v, - struct ggml_tensor * d, - bool masked); - - GGML_API struct ggml_tensor * ggml_ssm_conv( - struct ggml_context * ctx, - struct ggml_tensor * sx, - struct ggml_tensor * c); - - GGML_API struct ggml_tensor * ggml_ssm_scan( - struct ggml_context * ctx, - struct ggml_tensor * s, - struct ggml_tensor * x, - struct ggml_tensor * dt, - struct ggml_tensor * A, - struct ggml_tensor * B, - struct ggml_tensor * C); - - // partition into non-overlapping windows with padding if needed - // example: - // a: 768 64 64 1 - // w: 14 - // res: 768 14 14 25 - // used in sam - GGML_API struct ggml_tensor * ggml_win_part( - struct ggml_context * ctx, - struct ggml_tensor * a, - int w); - - // reverse of ggml_win_part - // used in sam - GGML_API struct ggml_tensor * ggml_win_unpart( - struct ggml_context * ctx, - struct ggml_tensor * a, - int w0, - int h0, - int w); - - GGML_API struct ggml_tensor * ggml_unary( - struct ggml_context * ctx, - struct ggml_tensor * a, - enum ggml_unary_op op); - - GGML_API struct ggml_tensor * ggml_unary_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - enum ggml_unary_op op); - - // used in sam - GGML_API struct ggml_tensor * ggml_get_rel_pos( - struct ggml_context * ctx, - struct ggml_tensor * a, - int qh, - int kh); - - // used in sam - GGML_API struct ggml_tensor * ggml_add_rel_pos( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * pw, - struct ggml_tensor * ph); - - GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * pw, - struct ggml_tensor * ph); - - GGML_API struct ggml_tensor * ggml_rwkv_wkv6( - struct ggml_context * ctx, - struct ggml_tensor * k, - struct ggml_tensor * v, - struct ggml_tensor * r, - struct ggml_tensor * tf, - struct ggml_tensor * td, - struct ggml_tensor * state); - - GGML_API struct ggml_tensor * ggml_gated_linear_attn( - struct ggml_context * ctx, - struct ggml_tensor * k, - struct ggml_tensor * v, - struct ggml_tensor * q, - struct ggml_tensor * g, - struct ggml_tensor * state, - float scale); - - GGML_API struct ggml_tensor * ggml_rwkv_wkv7( - struct ggml_context * ctx, - struct ggml_tensor * r, - struct ggml_tensor * w, - struct ggml_tensor * k, - struct ggml_tensor * v, - struct ggml_tensor * a, - struct ggml_tensor * b, - struct ggml_tensor * state); - - // custom operators - - typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *); - typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *); - - typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *); - typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); - typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct 
ggml_tensor *); - - GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - ggml_unary_op_f32_t fun), - "use ggml_map_custom1 instead"); - - GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - ggml_unary_op_f32_t fun), - "use ggml_map_custom1_inplace instead"); - - GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - ggml_binary_op_f32_t fun), - "use ggml_map_custom2 instead"); - - GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - ggml_binary_op_f32_t fun), - "use ggml_map_custom2_inplace instead"); - - GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - ggml_custom1_op_f32_t fun), - "use ggml_map_custom1 instead"); - - GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - ggml_custom1_op_f32_t fun), - "use ggml_map_custom1_inplace instead"); - - GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - ggml_custom2_op_f32_t fun), - "use ggml_map_custom2 instead"); - - GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - ggml_custom2_op_f32_t fun), - "use ggml_map_custom2_inplace instead"); - - GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - struct ggml_tensor * c, - ggml_custom3_op_f32_t fun), - "use ggml_map_custom3 instead"); - - GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - struct ggml_tensor * c, - ggml_custom3_op_f32_t fun), - "use ggml_map_custom3_inplace instead"); - - // custom operators v2 - - typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata); - typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata); - typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata); - -#define GGML_N_TASKS_MAX (-1) - // n_tasks == GGML_N_TASKS_MAX means to use max number of tasks - - GGML_API struct ggml_tensor * ggml_map_custom1( - struct ggml_context * ctx, - struct ggml_tensor * a, - ggml_custom1_op_t fun, - int n_tasks, - void * userdata); - - GGML_API struct ggml_tensor * ggml_map_custom1_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - ggml_custom1_op_t fun, - int n_tasks, - void * userdata); - - GGML_API struct ggml_tensor * ggml_map_custom2( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - ggml_custom2_op_t fun, - int n_tasks, - void * userdata); - - GGML_API struct ggml_tensor * ggml_map_custom2_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - ggml_custom2_op_t fun, - int n_tasks, - void * userdata); - 
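// a minimal sketch (helper names are assumed, not part of the header) of how the v2 custom
// operators above are used: the callback runs during graph compute and is split across nth
// threads; this one assumes contiguous F32 tensors and applies a leaky-ReLU-like function
#include "ggml.h"

static void leaky_relu_cb(struct ggml_tensor * dst, const struct ggml_tensor * a,
                          int ith, int nth, void * userdata) {
    const float   slope = *(const float *) userdata;
    const float * src   = (const float *) a->data;
    float       * out   = (float       *) dst->data;

    // each thread handles a contiguous slice [i0, i1) of the flattened elements
    const int64_t n   = ggml_nelements(dst);
    const int64_t per = (n + nth - 1) / nth;
    const int64_t i0  = per * ith;
    const int64_t i1  = i0 + per < n ? i0 + per : n;

    for (int64_t i = i0; i < i1; i++) {
        out[i] = src[i] > 0.0f ? src[i] : slope * src[i];
    }
}

// usage (slope must stay valid until the graph is computed):
//   static float slope = 0.1f;
//   struct ggml_tensor * y = ggml_map_custom1(ctx, x, leaky_relu_cb, GGML_N_TASKS_MAX, &slope);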
- GGML_API struct ggml_tensor * ggml_map_custom3( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - struct ggml_tensor * c, - ggml_custom3_op_t fun, - int n_tasks, - void * userdata); - - GGML_API struct ggml_tensor * ggml_map_custom3_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - struct ggml_tensor * c, - ggml_custom3_op_t fun, - int n_tasks, - void * userdata); - - // loss function - - GGML_API struct ggml_tensor * ggml_cross_entropy_loss( - struct ggml_context * ctx, - struct ggml_tensor * a, // logits - struct ggml_tensor * b); // labels - - GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back( - struct ggml_context * ctx, - struct ggml_tensor * a, // logits - struct ggml_tensor * b, // labels - struct ggml_tensor * c); // gradients of cross_entropy_loss result - - // AdamW optimizer step - // Paper: https://arxiv.org/pdf/1711.05101v3.pdf - // PyTorch: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html - GGML_API struct ggml_tensor * ggml_opt_step_adamw( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * grad, - struct ggml_tensor * m, - struct ggml_tensor * v, - struct ggml_tensor * adamw_params); // parameters such a the learning rate - - // - // automatic differentiation - // - - GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); - GGML_API void ggml_build_backward_expand( - struct ggml_context * ctx_static, // context for static gradients (loss + gradient accumulation) - struct ggml_context * ctx_compute, // context for gradient computation - struct ggml_cgraph * cgraph, - bool accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static - - // graph allocation in a context - GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false - GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads); - GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph); - GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst); - GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1 - GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph); - - GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph); - GGML_API struct ggml_tensor * ggml_graph_node (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i] - GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph); - GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph); - - GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); - - GGML_API size_t ggml_graph_overhead(void); - GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads); - - GGML_API struct ggml_tensor * ggml_graph_get_tensor (const struct ggml_cgraph * cgraph, const char * name); - GGML_API struct ggml_tensor * ggml_graph_get_grad (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node); - GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node); - - GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); - GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** 
ctx_data, struct ggml_context ** ctx_eval); - - // print info and performance information for the graph - GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph); - - // dump the graph into a file using the dot format - GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); - - // TODO these functions were sandwiched in the old optimization interface, is there a better place for them? - typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data); - - // Set callback for all future logging events. - // If this is not called, or NULL is supplied, everything is output on stderr. - GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data); - - GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); - - // - // quantization - // - - // - ggml_quantize_init can be called multiple times with the same type - // it will only initialize the quantization tables for the first call or after ggml_quantize_free - // automatically called by ggml_quantize_chunk for convenience - // - // - ggml_quantize_free will free any memory allocated by ggml_quantize_init - // call this at the end of the program to avoid memory leaks - // - // note: these are thread-safe - // - GGML_API void ggml_quantize_init(enum ggml_type type); - GGML_API void ggml_quantize_free(void); - - // some quantization type cannot be used without an importance matrix - GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type); - - // calls ggml_quantize_init internally (i.e. can allocate memory) - GGML_API size_t ggml_quantize_chunk( - enum ggml_type type, - const float * src, - void * dst, - int64_t start, - int64_t nrows, - int64_t n_per_row, - const float * imatrix); - -#ifdef __cplusplus - // restrict not standard in C++ -# if defined(__GNUC__) -# define GGML_RESTRICT __restrict__ -# elif defined(__clang__) -# define GGML_RESTRICT __restrict -# elif defined(_MSC_VER) -# define GGML_RESTRICT __restrict -# else -# define GGML_RESTRICT -# endif -#else -# if defined (_MSC_VER) && (__STDC_VERSION__ < 201112L) -# define GGML_RESTRICT __restrict -# else -# define GGML_RESTRICT restrict -# endif -#endif - typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); - typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); - - struct ggml_type_traits { - const char * type_name; - int64_t blck_size; - int64_t blck_size_interleave; // interleave elements in blocks - size_t type_size; - bool is_quantized; - ggml_to_float_t to_float; - ggml_from_float_t from_float_ref; - }; - - GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type); - - // ggml threadpool - // TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend - // the goal should be to create an API that other backends can use move everything to the ggml base - - // scheduling priorities - enum ggml_sched_priority { - GGML_SCHED_PRIO_NORMAL, - GGML_SCHED_PRIO_MEDIUM, - GGML_SCHED_PRIO_HIGH, - GGML_SCHED_PRIO_REALTIME - }; - - // threadpool params - // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults - struct ggml_threadpool_params { - bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings) - int n_threads; // number of threads - enum ggml_sched_priority prio; // thread priority - uint32_t 
poll; // polling level (0 - no polling, 100 - aggressive polling) - bool strict_cpu; // strict cpu placement - bool paused; // start in paused state - }; - - struct ggml_threadpool; // forward declaration, see ggml.c - - typedef struct ggml_threadpool * ggml_threadpool_t; - - GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); - GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads); - GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1); - -#ifdef __cplusplus -} -#endif diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h deleted file mode 100644 index 79ee2020..00000000 --- a/ggml/include/gguf.h +++ /dev/null @@ -1,202 +0,0 @@ -// This file contains functionality related to "GGUF" files, the binary file format used by ggml. -// GGUF files have the following structure: -// -// 1. File magic "GGUF" (4 bytes). -// 2. File version (uint32_t). -// 3. Number of ggml tensors in file (int64_t). -// 4. Number of key-value-pairs in file (int64_t). -// 5. For each KV pair: -// 1. The key (string). -// 2. The value type (gguf_type). -// 3a. If the value type is GGUF_TYPE_ARRAY: -// 1. The type of the array (gguf_type). -// 2. The number of elements in the array (uint64_t). -// 3. The binary representation of each element in the array. -// 3b. Otherwise: -// 1. The binary representation of the value. -// 6. For each ggml tensor: -// 1. The tensor name (string). -// 2. The number of dimensions of the tensor (uint32_t). -// 3. For each dimension: -// 1. The size of the tensor in the dimension (int64_t). -// 4. The tensor data type (ggml_type). -// 5. The tensor data offset in the tensor data binary blob (uint64_t). -// 7. The tensor data binary blob (optional, aligned). -// -// Strings are serialized as the string length (uint64_t) followed by the C string without the null terminator. -// All enums are stored as int32_t. -// All bool values are stored as int8_t. -// If the special key "general.alignment" (uint32_t) is defined it is used for alignment, -// otherwise GGUF_DEFAULT_ALIGNMENT is used. 
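//
// For example, a minimal metadata dump using the API declared below might look like this
// (a sketch: the file name is illustrative and error handling is omitted):
//
//     struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
//     struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
//     for (int64_t i = 0; i < gguf_get_n_kv(ctx); i++) {
//         printf("kv %d: %s\n", (int) i, gguf_get_key(ctx, i));
//     }
//     for (int64_t i = 0; i < gguf_get_n_tensors(ctx); i++) {
//         printf("tensor %d: %s\n", (int) i, gguf_get_tensor_name(ctx, i));
//     }
//     gguf_free(ctx);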
-// -// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de) - -#pragma once - -#include "ggml.h" - -#include -#include - -#define GGUF_MAGIC "GGUF" -#define GGUF_VERSION 3 - -#define GGUF_KEY_GENERAL_ALIGNMENT "general.alignment" - -#define GGUF_DEFAULT_ALIGNMENT 32 - -#ifdef __cplusplus -extern "C" { -#endif - - // types that can be stored as GGUF KV data - enum gguf_type { - GGUF_TYPE_UINT8 = 0, - GGUF_TYPE_INT8 = 1, - GGUF_TYPE_UINT16 = 2, - GGUF_TYPE_INT16 = 3, - GGUF_TYPE_UINT32 = 4, - GGUF_TYPE_INT32 = 5, - GGUF_TYPE_FLOAT32 = 6, - GGUF_TYPE_BOOL = 7, - GGUF_TYPE_STRING = 8, - GGUF_TYPE_ARRAY = 9, - GGUF_TYPE_UINT64 = 10, - GGUF_TYPE_INT64 = 11, - GGUF_TYPE_FLOAT64 = 12, - GGUF_TYPE_COUNT, // marks the end of the enum - }; - - struct gguf_context; - - struct gguf_init_params { - bool no_alloc; - - // if not NULL, create a ggml_context and allocate the tensor data in it - struct ggml_context ** ctx; - }; - - GGML_API struct gguf_context * gguf_init_empty(void); - GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); - //GGML_API struct gguf_context * gguf_init_from_buffer(..); - - GGML_API void gguf_free(struct gguf_context * ctx); - - GGML_API const char * gguf_type_name(enum gguf_type type); - - GGML_API uint32_t gguf_get_version (const struct gguf_context * ctx); - GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx); - GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx); - - GGML_API int64_t gguf_get_n_kv(const struct gguf_context * ctx); - GGML_API int64_t gguf_find_key(const struct gguf_context * ctx, const char * key); // returns -1 if key is not found - GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int64_t key_id); - - GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int64_t key_id); - GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id); - - // will abort if the wrong type is used for the key - GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int64_t key_id); - GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int64_t key_id); - GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int64_t key_id); - GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int64_t key_id); - GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int64_t key_id); - GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int64_t key_id); - GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int64_t key_id); - GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int64_t key_id); - GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int64_t key_id); - GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int64_t key_id); - GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int64_t key_id); - GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int64_t key_id); - GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int64_t key_id); - GGML_API size_t gguf_get_arr_n (const struct gguf_context * ctx, int64_t key_id); - - // get raw pointer to the first element of the array with the given key_id - // for bool arrays, note that they are always stored as int8 on all platforms (usually this makes no difference) - GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int64_t key_id); - - // get ith C 
string from array with given key_id - GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i); - - GGML_API int64_t gguf_get_n_tensors (const struct gguf_context * ctx); - GGML_API int64_t gguf_find_tensor (const struct gguf_context * ctx, const char * name); // returns -1 if the tensor is not found - GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int64_t tensor_id); - GGML_API const char * gguf_get_tensor_name (const struct gguf_context * ctx, int64_t tensor_id); - GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int64_t tensor_id); - GGML_API size_t gguf_get_tensor_size (const struct gguf_context * ctx, int64_t tensor_id); - - // removes key if it exists, returns id that the key had prior to removal (-1 if it didn't exist) - GGML_API int64_t gguf_remove_key(struct gguf_context * ctx, const char * key); - - // overrides an existing KV pair or adds a new one, the new KV pair is always at the back - GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val); - GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val); - GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val); - GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val); - GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val); - GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val); - GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val); - GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val); - GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val); - GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val); - GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val); - GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val); - - // creates a new array with n elements of the given type and copies the corresponding number of bytes from data - GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, size_t n); - - // creates a new array with n strings and copies the corresponding strings from data - GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, size_t n); - - // set or add KV pairs from another context - GGML_API void gguf_set_kv(struct gguf_context * ctx, const struct gguf_context * src); - - // add tensor to GGUF context, tensor name must be unique - GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor); - - // after changing a tensor's type, the offsets of all tensors with higher indices are immediately recalculated - // in such a way that the tensor data remains as one contiguous block (except for padding) - GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type); - - // assumes that at least gguf_get_tensor_size bytes can be read from data - GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data); - - // writing gguf files can be done in 3 ways: - // - // - write the entire gguf_context to a binary file in a single pass: - // - // gguf_write_to_file(ctx, fname, /*only_meta =*/ false); - // - 
// - write only the meta data to a file, then re-open the file and append the tensor data: - // - // gguf_write_to_file(ctx, fname, /*only_meta =*/ true); - // FILE * f = fopen(fname, "ab"); - // fwrite(f, ...); // write tensor data - // fclose(f); - // - // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data: - // - // FILE * f = fopen(fname, "wb"); - // const size_t size_meta = gguf_get_meta_size(ctx); - // fseek(f, size_meta, SEEK_SET); - // fwrite(f, ...); // write tensor data - // void * data = malloc(size_meta); - // gguf_get_meta_data(ctx, data); - // rewind(f); - // fwrite(data, 1, data, f); - // free(data); - // fclose(f); - // - - // write the entire context to a binary file - GGML_API bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta); - - // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding - GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx); - - // writes the meta data to pointer "data" - GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data); - -#ifdef __cplusplus -} -#endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt deleted file mode 100644 index f00700da..00000000 --- a/ggml/src/CMakeLists.txt +++ /dev/null @@ -1,342 +0,0 @@ -include(CheckCXXCompilerFlag) -include("../cmake/common.cmake") - -add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES}) - -# enable libstdc++ assertions for debug builds -if (CMAKE_SYSTEM_NAME MATCHES "Linux") - add_compile_definitions($<$:_GLIBCXX_ASSERTIONS>) -endif() - -if (NOT MSVC) - if (GGML_SANITIZE_THREAD) - add_compile_options(-fsanitize=thread) - link_libraries (-fsanitize=thread) - endif() - - if (GGML_SANITIZE_ADDRESS) - add_compile_options(-fsanitize=address -fno-omit-frame-pointer) - link_libraries (-fsanitize=address) - endif() - - if (GGML_SANITIZE_UNDEFINED) - add_compile_options(-fsanitize=undefined) - link_libraries (-fsanitize=undefined) - endif() -endif() - -if (GGML_FATAL_WARNINGS) - if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") - list(APPEND C_FLAGS -Werror) - list(APPEND CXX_FLAGS -Werror) - elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") - add_compile_options(/WX) - endif() -endif() - -if (GGML_ALL_WARNINGS) - if (NOT MSVC) - list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) - list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes - -Werror=implicit-int -Werror=implicit-function-declaration) - list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) - - list(APPEND C_FLAGS ${WARNING_FLAGS}) - list(APPEND CXX_FLAGS ${WARNING_FLAGS}) - - ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}) - - add_compile_options("$<$:${C_FLAGS};${GF_C_FLAGS}>" - "$<$:${CXX_FLAGS};${GF_CXX_FLAGS}>") - else() - # todo : msvc - set(C_FLAGS "") - set(CXX_FLAGS "") - endif() -endif() - -if (GGML_LTO) - include(CheckIPOSupported) - check_ipo_supported(RESULT result OUTPUT output) - if (result) - set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) - else() - message(WARNING "IPO is not supported: ${output}") - endif() -endif() - -if (GGML_CCACHE AND NOT CMAKE_C_COMPILER_LAUNCHER AND NOT CMAKE_CXX_COMPILER_LAUNCHER) - find_program(GGML_CCACHE_FOUND ccache) - find_program(GGML_SCCACHE_FOUND sccache) - - if (GGML_CCACHE_FOUND OR GGML_SCCACHE_FOUND) - if(GGML_CCACHE_FOUND) - set(GGML_CCACHE_VARIANT ccache) - else() - 
set(GGML_CCACHE_VARIANT sccache) - endif() - # TODO: should not be set globally - if (GGML_SYCL AND GGML_CCACHE_FOUND AND WIN32) - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "ccache compiler_type=icl") - else () - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${GGML_CCACHE_VARIANT}") - endif () - set(ENV{CCACHE_SLOPPINESS} time_macros) - message(STATUS "${GGML_CCACHE_VARIANT} found, compilation results will be cached. Disable with GGML_CCACHE=OFF.") - else() - message(STATUS "Warning: ccache not found - consider installing it for faster compilation or disable this warning with GGML_CCACHE=OFF") - endif () -endif() - -# this version of Apple ld64 is buggy -execute_process( - COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v - ERROR_VARIABLE output - OUTPUT_QUIET -) - -if (output MATCHES "dyld-1015\.7") - add_compile_definitions(HAVE_BUGGY_APPLE_LINKER) -endif() - -# architecture specific -# TODO: probably these flags need to be tweaked on some architectures -# feel free to update the Makefile for your architecture and send a pull request or issue -message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") -if (MSVC) - string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR) - message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}") -else () - set(CMAKE_GENERATOR_PLATFORM_LWR "") -endif () - -if (NOT MSVC) - if (GGML_STATIC) - add_link_options(-static) - if (MINGW) - add_link_options(-static-libgcc -static-libstdc++) - endif() - endif() - if (GGML_GPROF) - add_compile_options(-pg) - endif() -endif() - -if (MINGW) - # Target Windows 8 for PrefetchVirtualMemory - add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER}) -endif() - -# -# POSIX conformance -# - -# clock_gettime came in POSIX.1b (1993) -# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional -# posix_memalign came in POSIX.1-2001 / SUSv3 -# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985) - -# Somehow in OpenBSD whenever POSIX conformance is specified -# some string functions rely on locale_t availability, -# which was introduced in POSIX.1-2008, forcing us to go higher -if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") - add_compile_definitions(_XOPEN_SOURCE=700) -else() - add_compile_definitions(_XOPEN_SOURCE=600) -endif() - -# Data types, macros and functions related to controlling CPU affinity and -# some memory allocation are available on Linux through GNU extensions in libc -if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android") - add_compile_definitions(_GNU_SOURCE) -endif() - -# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1, -# and on macOS its availability depends on enabling Darwin extensions -# similarly on DragonFly, enabling BSD extensions is necessary -if ( - CMAKE_SYSTEM_NAME MATCHES "Darwin" OR - CMAKE_SYSTEM_NAME MATCHES "iOS" OR - CMAKE_SYSTEM_NAME MATCHES "tvOS" OR - CMAKE_SYSTEM_NAME MATCHES "DragonFly" -) - add_compile_definitions(_DARWIN_C_SOURCE) -endif() - -# alloca is a non-standard interface that is not visible on BSDs when -# POSIX conformance is specified, but not all of them provide a clean way -# to enable it in such cases -if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD") - add_compile_definitions(__BSD_VISIBLE) -endif() -if (CMAKE_SYSTEM_NAME MATCHES "NetBSD") - add_compile_definitions(_NETBSD_SOURCE) -endif() -if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") - add_compile_definitions(_BSD_SOURCE) -endif() - -if (WIN32) - add_compile_definitions(_CRT_SECURE_NO_WARNINGS) -endif() - -# ggml - -if 
(GGML_BACKEND_DL AND NOT BUILD_SHARED_LIBS) - message(FATAL_ERROR "GGML_BACKEND_DL requires BUILD_SHARED_LIBS") -endif() - -add_library(ggml-base - ../include/ggml.h - ../include/ggml-alloc.h - ../include/ggml-backend.h - ../include/ggml-cpp.h - ../include/ggml-opt.h - ../include/gguf.h - ggml.c - ggml-alloc.c - ggml-backend.cpp - ggml-opt.cpp - ggml-threading.cpp - ggml-threading.h - ggml-quants.c - ggml-quants.h - gguf.cpp) - -target_include_directories(ggml-base PRIVATE .) -if (GGML_BACKEND_DL) - target_compile_definitions(ggml-base PUBLIC GGML_BACKEND_DL) -endif() - -add_library(ggml - ggml-backend-reg.cpp) - -target_link_libraries(ggml PUBLIC ggml-base) - -if (CMAKE_SYSTEM_NAME MATCHES "Linux") - target_link_libraries(ggml PRIVATE dl stdc++fs) -endif() - -function(ggml_add_backend_library backend) - if (GGML_BACKEND_DL) - add_library(${backend} MODULE ${ARGN}) - # write the shared library to the output directory - set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) - target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL) - add_dependencies(ggml ${backend}) - else() - add_library(${backend} ${ARGN}) - target_link_libraries(ggml PUBLIC ${backend}) - install(TARGETS ${backend} LIBRARY) - endif() - - target_link_libraries(${backend} PRIVATE ggml-base) - target_include_directories(${backend} PRIVATE ..) - - if (${BUILD_SHARED_LIBS}) - target_compile_definitions(${backend} PRIVATE GGML_BACKEND_BUILD) - target_compile_definitions(${backend} PUBLIC GGML_BACKEND_SHARED) - endif() - - if(NOT GGML_AVAILABLE_BACKENDS) - set(GGML_AVAILABLE_BACKENDS "${backend}" - CACHE INTERNAL "List of backends for cmake package") - else() - list(FIND GGML_AVAILABLE_BACKENDS "${backend}" has_backend) - if(has_backend EQUAL -1) - set(GGML_AVAILABLE_BACKENDS "${GGML_AVAILABLE_BACKENDS};${backend}" - CACHE INTERNAL "List of backends for cmake package") - endif() - endif() -endfunction() - -function(ggml_add_backend backend) - string(TOUPPER "GGML_${backend}" backend_id) - if (${backend_id}) - string(TOLOWER "ggml-${backend}" backend_target) - add_subdirectory(${backend_target}) - message(STATUS "Including ${backend} backend") - if (NOT GGML_BACKEND_DL) - string(TOUPPER "GGML_USE_${backend}" backend_use) - target_compile_definitions(ggml PUBLIC ${backend_use}) - endif() - endif() -endfunction() - -function(ggml_add_cpu_backend_variant tag_name) - set(GGML_CPU_TAG_NAME ${tag_name}) - # other: OPENMP LLAMAFILE CPU_HBM - foreach (feat NATIVE - AVX AVX2 BMI2 AVX_VNNI FMA F16C - AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 - AMX_TILE AMX_INT8 AMX_BF16) - set(GGML_${feat} OFF) - endforeach() - - foreach (feat ${ARGN}) - set(GGML_${feat} ON) - endforeach() - - ggml_add_cpu_backend_variant_impl(${tag_name}) -endfunction() - -ggml_add_backend(CPU) - -if (GGML_CPU_ALL_VARIANTS) - if (NOT GGML_BACKEND_DL) - message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL") - endif() - ggml_add_cpu_backend_variant(sandybridge AVX) - ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA) - ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512) - ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI) - ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI) - if (NOT MSVC) - # MSVC doesn't support AMX - ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) - endif() -elseif (GGML_CPU) - ggml_add_cpu_backend_variant_impl("") 
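# as a rough sketch (option values are illustrative): the multi-variant path above is
# typically selected from the command line with
#   cmake -B build -DBUILD_SHARED_LIBS=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON
# whereas the default single-variant CPU backend is tuned via GGML_NATIVE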
-endif() - -ggml_add_backend(BLAS) -ggml_add_backend(CANN) -ggml_add_backend(CUDA) -ggml_add_backend(HIP) -ggml_add_backend(Kompute) -ggml_add_backend(METAL) -ggml_add_backend(MUSA) -ggml_add_backend(RPC) -ggml_add_backend(SYCL) -ggml_add_backend(Vulkan) -ggml_add_backend(OpenCL) - -foreach (target ggml-base ggml) - target_include_directories(${target} PUBLIC $ $) - target_compile_features (${target} PRIVATE c_std_11 cxx_std_17) # don't bump -endforeach() - -target_link_libraries(ggml-base PRIVATE Threads::Threads) - -find_library(MATH_LIBRARY m) -if (MATH_LIBRARY) - if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT}) - target_link_libraries(ggml-base PRIVATE m) - endif() -endif() - -if (CMAKE_SYSTEM_NAME MATCHES "Android") - target_link_libraries(ggml-base PRIVATE dl) -endif() - -if(CMAKE_SYSTEM_NAME MATCHES "visionOS") - target_compile_definitions(ggml-base PUBLIC _DARWIN_C_SOURCE) -endif() - -if (BUILD_SHARED_LIBS) - foreach (target ggml-base ggml) - set_target_properties(${target} PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_compile_definitions(${target} PRIVATE GGML_BUILD) - target_compile_definitions(${target} PUBLIC GGML_SHARED) - endforeach() -endif() diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c deleted file mode 100644 index a3d3f690..00000000 --- a/ggml/src/ggml-alloc.c +++ /dev/null @@ -1,1039 +0,0 @@ -#include "ggml-alloc.h" -#include "ggml-backend-impl.h" -#include "ggml.h" -#include "ggml-impl.h" -#include -#include -#include -#include -#include -#include - -#define MAX(a, b) ((a) > (b) ? (a) : (b)) -#define MAX_FREE_BLOCKS 256 - -//#define GGML_ALLOCATOR_DEBUG - -//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__) -#define AT_PRINTF(...) - - -static bool ggml_is_view(const struct ggml_tensor * t) { - return t->view_src != NULL; -} - -static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { - if (a->type != b->type) { - return false; - } - for (int i = 0; i < GGML_MAX_DIMS; i++) { - if (a->ne[i] != b->ne[i]) { - return false; - } - if (a->nb[i] != b->nb[i]) { - return false; - } - } - return true; -} - -// ops that return true for this function must not use restrict pointers for their backend implementations -static bool ggml_op_can_inplace(enum ggml_op op) { - switch (op) { - case GGML_OP_SCALE: - case GGML_OP_DIAG_MASK_ZERO: - case GGML_OP_DIAG_MASK_INF: - case GGML_OP_ADD: - case GGML_OP_ADD1: - case GGML_OP_SUB: - case GGML_OP_MUL: - case GGML_OP_DIV: - case GGML_OP_SQR: - case GGML_OP_SQRT: - case GGML_OP_LOG: - case GGML_OP_UNARY: - case GGML_OP_ROPE: - case GGML_OP_ROPE_BACK: - case GGML_OP_SILU_BACK: - case GGML_OP_RMS_NORM: - case GGML_OP_RMS_NORM_BACK: - case GGML_OP_SOFT_MAX: - case GGML_OP_SOFT_MAX_BACK: - return true; - - default: - return false; - } -} - -static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) { - assert(alignment && !(alignment & (alignment - 1))); // power of 2 - size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment; - return offset + align; -} - -// tallocr - -struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) { - void * base = ggml_backend_buffer_get_base(buffer); - size_t align = ggml_backend_buffer_get_alignment(buffer); - - assert(align && !(align & (align - 1))); // power of 2 - - struct ggml_tallocr talloc = (struct ggml_tallocr) { - /*.buffer = */ buffer, - /*.base = */ base, - /*.alignment = */ align, - /*.offset = */ aligned_offset(base, 0, align), - }; - return talloc; -} - -enum ggml_status 
ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) { - size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor); - size = GGML_PAD(size, talloc->alignment); - - if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) { - GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n", - __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset); - GGML_ABORT("not enough space in the buffer"); - } - - void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset; - talloc->offset += size; - - assert(((uintptr_t)addr % talloc->alignment) == 0); - - return ggml_backend_tensor_alloc(talloc->buffer, tensor, addr); -} - -// dynamic tensor allocator - -struct free_block { - size_t offset; - size_t size; -}; - -struct ggml_dyn_tallocr { - size_t alignment; - int n_free_blocks; - struct free_block free_blocks[MAX_FREE_BLOCKS]; - size_t max_size; - -#ifdef GGML_ALLOCATOR_DEBUG - struct { - const struct ggml_tensor * tensor; - size_t offset; - } allocated_tensors[1024]; -#endif -}; - -#ifdef GGML_ALLOCATOR_DEBUG -static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) { - for (int i = 0; i < 1024; i++) { - if (alloc->allocated_tensors[i].tensor == NULL) { - alloc->allocated_tensors[i].tensor = tensor; - alloc->allocated_tensors[i].offset = offset; - return; - } - } - GGML_ABORT("out of allocated_tensors"); -} -static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) { - for (int i = 0; i < 1024; i++) { - if (alloc->allocated_tensors[i].offset == offset) { - alloc->allocated_tensors[i].tensor = NULL; - return; - } - } - GGML_ABORT("tried to free tensor %s not found\n", tensor->name); -} -#endif - -static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) { - size = aligned_offset(NULL, size, alloc->alignment); - - AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size); - - size_t max_avail = 0; - - // find the best fitting free block besides the last block - int best_fit_block = -1; - size_t best_fit_size = SIZE_MAX; - for (int i = 0; i < alloc->n_free_blocks - 1; i++) { - struct free_block * block = &alloc->free_blocks[i]; - max_avail = MAX(max_avail, block->size); - if (block->size >= size && block->size <= best_fit_size) { - best_fit_block = i; - best_fit_size = block->size; - } - } - - if (best_fit_block == -1) { - // the last block is our last resort - struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1]; - max_avail = MAX(max_avail, block->size); - if (block->size >= size) { - best_fit_block = alloc->n_free_blocks - 1; - } else { - // this should never happen - GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n", - __func__, size, max_avail); - GGML_ABORT("not enough space in the buffer"); - } - } - - struct free_block * block = &alloc->free_blocks[best_fit_block]; - size_t offset = block->offset; - block->offset = offset + size; - block->size -= size; - if (block->size == 0) { - // remove block if empty - alloc->n_free_blocks--; - for (int j = best_fit_block; j < alloc->n_free_blocks; j++) { - alloc->free_blocks[j] = alloc->free_blocks[j+1]; - } - } - - AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset); - -#ifdef GGML_ALLOCATOR_DEBUG - 
add_allocated_tensor(alloc, offset, tensor); - size_t cur_max = offset + size; - if (cur_max > alloc->max_size) { - // sort allocated_tensors by offset - for (int i = 0; i < 1024; i++) { - for (int j = i + 1; j < 1024; j++) { - if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) { - const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor; - size_t tmp_offset = alloc->allocated_tensors[i].offset; - alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor; - alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset; - alloc->allocated_tensors[j].tensor = tmp_tensor; - alloc->allocated_tensors[j].offset = tmp_offset; - } - } - } - GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0); - for (int i = 0; i < 1024; i++) { - if (alloc->allocated_tensors[i].tensor) { - GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name, - alloc->allocated_tensors[i].offset, - alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor), - ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0); - } - } - GGML_LOG_DEBUG("\n"); - } -#endif - - alloc->max_size = MAX(alloc->max_size, offset + size); - - return offset; - - GGML_UNUSED(tensor); -} - -// this is a very naive implementation, but for our case the number of free blocks should be very small -static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) { - size = aligned_offset(NULL, size, alloc->alignment); - - AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks); - -#ifdef GGML_ALLOCATOR_DEBUG - remove_allocated_tensor(alloc, offset, tensor); -#endif - - // see if we can merge with an existing block - for (int i = 0; i < alloc->n_free_blocks; i++) { - struct free_block * block = &alloc->free_blocks[i]; - // check if ptr is at the end of the block - if (block->offset + block->size == offset) { - block->size += size; - // check if we can merge with the next block - if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) { - block->size += alloc->free_blocks[i+1].size; - alloc->n_free_blocks--; - for (int j = i+1; j < alloc->n_free_blocks; j++) { - alloc->free_blocks[j] = alloc->free_blocks[j+1]; - } - } - return; - } - // check if ptr is at the beginning of the block - if (offset + size == block->offset) { - block->offset = offset; - block->size += size; - // check if we can merge with the previous block - if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) { - alloc->free_blocks[i-1].size += block->size; - alloc->n_free_blocks--; - for (int j = i; j < alloc->n_free_blocks; j++) { - alloc->free_blocks[j] = alloc->free_blocks[j+1]; - } - } - return; - } - } - // otherwise, add a new block - GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks"); - // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster) - int insert_pos = 0; - while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) { - insert_pos++; - } - // shift all blocks from insert_pos onward to make room for the new block - for (int i = alloc->n_free_blocks; i > insert_pos; i--) { - alloc->free_blocks[i] = alloc->free_blocks[i-1]; - } - // insert the new block - alloc->free_blocks[insert_pos].offset 
= offset; - alloc->free_blocks[insert_pos].size = size; - alloc->n_free_blocks++; - - GGML_UNUSED(tensor); -} - -static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) { - alloc->n_free_blocks = 1; - alloc->free_blocks[0].offset = 0; - alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows - alloc->max_size = 0; - -#ifdef GGML_ALLOCATOR_DEBUG - for (int i = 0; i < 1024; i++) { - alloc->allocated_tensors[i].tensor = NULL; - } -#endif -} - -static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) { - struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr)); - - *alloc = (struct ggml_dyn_tallocr) { - /*.alignment = */ alignment, - /*.n_free_blocks = */ 0, - /*.free_blocks = */ {{0}}, - /*.max_size = */ 0, -#ifdef GGML_ALLOCATOR_DEBUG - /*.allocated_tensors = */ {{0}}, -#endif - }; - - ggml_dyn_tallocr_reset(alloc); - - return alloc; -} - -static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) { - free(alloc); -} - -static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) { - return alloc->max_size; -} - - -///////////////////////////////////// - -// graph allocator - -struct hash_node { - int n_children; - int n_views; - int buffer_id; - size_t offset; // offset within the buffer - bool allocated; -}; - -struct tensor_alloc { - int buffer_id; - size_t offset; - size_t size_max; // 0 = pre-allocated, unused, or view -}; - -struct leaf_alloc { - struct tensor_alloc leaf; -}; - -struct node_alloc { - struct tensor_alloc dst; - struct tensor_alloc src[GGML_MAX_SRC]; -}; - -struct ggml_gallocr { - ggml_backend_buffer_type_t * bufts; // [n_buffers] - ggml_backend_buffer_t * buffers; // [n_buffers] - struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers] - int n_buffers; - - struct ggml_hash_set hash_set; - struct hash_node * hash_values; // [hash_set.size] - - struct node_alloc * node_allocs; // [n_nodes] - int n_nodes; - - struct leaf_alloc * leaf_allocs; // [n_leafs] - int n_leafs; -}; - -ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) { - ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr)); - GGML_ASSERT(galloc != NULL); - - galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t)); - GGML_ASSERT(galloc->bufts != NULL); - - galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t)); - GGML_ASSERT(galloc->buffers != NULL); - - galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *)); - GGML_ASSERT(galloc->buf_tallocs != NULL); - - for (int i = 0; i < n_bufs; i++) { - galloc->bufts[i] = bufts[i]; - galloc->buffers[i] = NULL; - - // check if the same buffer type is used multiple times and reuse the same allocator - for (int j = 0; j < i; j++) { - if (bufts[i] == bufts[j]) { - galloc->buf_tallocs[i] = galloc->buf_tallocs[j]; - break; - } - } - - if (galloc->buf_tallocs[i] == NULL) { - size_t alignment = ggml_backend_buft_get_alignment(bufts[i]); - galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment); - } - } - galloc->n_buffers = n_bufs; - - return galloc; -} - -ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) { - return ggml_gallocr_new_n(&buft, 1); -} - -void ggml_gallocr_free(ggml_gallocr_t galloc) { - if (galloc == NULL) { - return; - } - - for (int i = 0; i < galloc->n_buffers; i++) { - if (galloc->buffers != NULL) { - // skip if already freed - bool freed = false; - for (int j = 0; j < i; j++) { - if 
(galloc->buffers[j] == galloc->buffers[i]) { - freed = true; - break; - } - } - if (!freed) { - ggml_backend_buffer_free(galloc->buffers[i]); - } - } - if (galloc->buf_tallocs != NULL) { - // skip if already freed - bool freed = false; - for (int j = 0; j < i; j++) { - if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) { - freed = true; - break; - } - } - if (!freed) { - ggml_dyn_tallocr_free(galloc->buf_tallocs[i]); - } - } - } - - ggml_hash_set_free(&galloc->hash_set); - free(galloc->hash_values); - free(galloc->bufts); - free(galloc->buffers); - free(galloc->buf_tallocs); - free(galloc->node_allocs); - free(galloc->leaf_allocs); - free(galloc); -} - -typedef struct ggml_gallocr * ggml_gallocr_t; - -static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) { - size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t); - return &galloc->hash_values[i]; -} - -static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) { - return ggml_gallocr_hash_get(galloc, t)->allocated; -} - -static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) { - return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated; -} - -static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) { - GGML_ASSERT(buffer_id >= 0); - struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); - - if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) { - hn->allocated = true; - assert(hn->offset == 0); - - // try to reuse a parent's buffer (inplace) - if (ggml_op_can_inplace(node->op)) { - for (int i = 0; i < GGML_MAX_SRC; i++) { - struct ggml_tensor * parent = node->src[i]; - if (parent == NULL) { - continue; - } - - // if the node's data is external, then we cannot re-use it - if (!ggml_gallocr_is_own(galloc, parent)) { - AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data); - continue; - } - - // outputs cannot be reused - if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) { - AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name); - continue; - } - - if (!ggml_are_same_layout(node, parent)) { - AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name); - continue; - } - - struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent); - if (p_hn->n_children == 1 && p_hn->n_views == 0) { - if (ggml_is_view(parent)) { - struct ggml_tensor * view_src = parent->view_src; - struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src); - if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { - AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); - assert(view_src_hn->offset == p_hn->offset); - hn->buffer_id = p_hn->buffer_id; - hn->offset = p_hn->offset; - p_hn->allocated = false; // avoid freeing the parent - view_src_hn->allocated = false; - return; - } - } else { - AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); - hn->buffer_id = p_hn->buffer_id; - hn->offset = p_hn->offset; - p_hn->allocated = false; // avoid freeing the parent - return; - } - } - } - } - // allocate tensor from the buffer - struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; - ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; - size_t size = 
ggml_backend_buft_get_alloc_size(buft, node); - size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node); - hn->buffer_id = buffer_id; - hn->offset = offset; - } -} - -static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) { - // graph outputs are never freed - if (node->flags & GGML_TENSOR_FLAG_OUTPUT) { - AT_PRINTF("not freeing output %s\n", node->name); - return; - } - - struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); - size_t offset = hn->offset; - int buffer_id = hn->buffer_id; - struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; - ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; - size_t size = ggml_backend_buft_get_alloc_size(buft, node); - ggml_dyn_tallocr_free_tensor(alloc, offset, size, node); - hn->allocated = false; -} - -static int get_node_buffer_id(const int * node_buffer_ids, int i) { - return node_buffer_ids ? node_buffer_ids[i] : 0; -} - -static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) { - // clear hash tables - ggml_hash_set_reset(&galloc->hash_set); - memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size); - - // allocate leafs - // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes - for (int i = 0; i < graph->n_leafs; i++) { - struct ggml_tensor * leaf = graph->leafs[i]; - ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i)); - } - - // count number of children and views - // allocate other graph inputs and leafs first to avoid overwriting them - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - - // TODO: better way to add external dependencies - // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to - // control when some tensors are allocated and freed. 
in this case, the dependencies are in `src`, but the node - // itself is never used and should not be considered a dependency - if (ggml_is_view(node) && node->op != GGML_OP_NONE) { - struct ggml_tensor * view_src = node->view_src; - ggml_gallocr_hash_get(galloc, view_src)->n_views += 1; - } - - if (node->flags & GGML_TENSOR_FLAG_INPUT) { - ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i)); - } - - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { - continue; - } - - ggml_gallocr_hash_get(galloc, src)->n_children += 1; - - // allocate explicit inputs - if (src->flags & GGML_TENSOR_FLAG_INPUT) { - ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i)); - } - } - } - - // allocate tensors - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - int buffer_id = get_node_buffer_id(node_buffer_ids, i); - - // allocate parents (only leafs need to be allocated at this point) - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - continue; - } - ggml_gallocr_allocate_node(galloc, parent, buffer_id); - } - - // allocate node - ggml_gallocr_allocate_node(galloc, node, buffer_id); - - AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name); - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - continue; - } - AT_PRINTF("%s", parent->name); - if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { - AT_PRINTF(", "); - } - } - AT_PRINTF("\n"); - - // update parents - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - continue; - } - struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent); - p_hn->n_children -= 1; - - AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n", - parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated); - - if (p_hn->n_children == 0 && p_hn->n_views == 0) { - if (ggml_is_view(parent)) { - struct ggml_tensor * view_src = parent->view_src; - struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src); - view_src_hn->n_views -= 1; - AT_PRINTF("view_src %s: %d children, %d views\n", - view_src->name, view_src_hn->n_children, view_src_hn->n_views); - if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) { - ggml_gallocr_free_node(galloc, view_src); - } - } - else if (p_hn->allocated) { - ggml_gallocr_free_node(galloc, parent); - } - } - AT_PRINTF("\n"); - } - } -} - -bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) { - size_t min_hash_size = graph->n_nodes + graph->n_leafs; - // add 25% margin to avoid hash collisions - min_hash_size += min_hash_size / 4; - - // initialize hash table - if (galloc->hash_set.size < min_hash_size) { - ggml_hash_set_free(&galloc->hash_set); - galloc->hash_set = ggml_hash_set_new(min_hash_size); - GGML_ASSERT(galloc->hash_set.keys != NULL); - - free(galloc->hash_values); - galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size); - GGML_ASSERT(galloc->hash_values != NULL); - } - - // reset allocators - for (int i = 0; i < galloc->n_buffers; i++) { - ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]); - } - - // allocate in hash table - ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids); - - // set the 
node_allocs from the hash table - if (galloc->n_nodes < graph->n_nodes) { - free(galloc->node_allocs); - galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc)); - GGML_ASSERT(galloc->node_allocs != NULL); - } - galloc->n_nodes = graph->n_nodes; - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - struct node_alloc * node_alloc = &galloc->node_allocs[i]; - if (node->view_src || node->data) { - node_alloc->dst.buffer_id = -1; - node_alloc->dst.offset = SIZE_MAX; - node_alloc->dst.size_max = 0; - } else { - struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); - node_alloc->dst.buffer_id = hn->buffer_id; - node_alloc->dst.offset = hn->offset; - node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node); - } - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (!src || src->view_src || src->data) { - node_alloc->src[j].buffer_id = -1; - node_alloc->src[j].offset = SIZE_MAX; - node_alloc->src[j].size_max = 0; - } else { - struct hash_node * hn = ggml_gallocr_hash_get(galloc, src); - node_alloc->src[j].buffer_id = hn->buffer_id; - node_alloc->src[j].offset = hn->offset; - node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src); - } - } - } - if (galloc->n_leafs < graph->n_leafs) { - free(galloc->leaf_allocs); - galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0])); - GGML_ASSERT(galloc->leaf_allocs != NULL); - } - galloc->n_leafs = graph->n_leafs; - for (int i = 0; i < graph->n_leafs; i++) { - struct ggml_tensor * leaf = graph->leafs[i]; - struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf); - if (leaf->view_src || leaf->data) { - galloc->leaf_allocs[i].leaf.buffer_id = -1; - galloc->leaf_allocs[i].leaf.offset = SIZE_MAX; - galloc->leaf_allocs[i].leaf.size_max = 0; - } else { - galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id; - galloc->leaf_allocs[i].leaf.offset = hn->offset; - galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf); - } - } - - // reallocate buffers if needed - for (int i = 0; i < galloc->n_buffers; i++) { - // if the buffer type is used multiple times, we reuse the same buffer - for (int j = 0; j < i; j++) { - if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) { - galloc->buffers[i] = galloc->buffers[j]; - break; - } - } - - size_t cur_size = galloc->buffers[i] ? 
ggml_backend_buffer_get_size(galloc->buffers[i]) : 0; - size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]); - - // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views - if (new_size > cur_size || galloc->buffers[i] == NULL) { -#ifndef NDEBUG - GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); -#endif - - ggml_backend_buffer_free(galloc->buffers[i]); - galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size); - if (galloc->buffers[i] == NULL) { - GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); - return false; - } - ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE); - } - } - - return true; -} - -bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { - return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL); -} - -static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) { - int buffer_id = tensor_alloc->buffer_id; - assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max); - - if (tensor->view_src != NULL) { - if (tensor->buffer == NULL) { - assert(tensor_alloc->offset == SIZE_MAX); - if (tensor->view_src->buffer == NULL) { - // this tensor was allocated without ggml-backend - return; - } - ggml_backend_view_init(tensor); - } - } else { - if (tensor->data == NULL) { - assert(tensor_alloc->offset != SIZE_MAX); - assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max); - void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]); - void * addr = (char *)base + tensor_alloc->offset; - ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr); - } else { - if (tensor->buffer == NULL) { - // this tensor was allocated without ggml-backend - return; - } - } - } -} - -static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) { - size_t node_size = 0; - if (!node->data && !node->view_src) { - GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API - node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node); - } - return talloc->size_max >= node_size; -} - -static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) { - if (galloc->n_nodes != graph->n_nodes) { -#ifndef NDEBUG - GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__); -#endif - return true; - } - - if (galloc->n_leafs != graph->n_leafs) { -#ifndef NDEBUG - GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__); -#endif - return true; - } - - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - struct node_alloc * node_alloc = &galloc->node_allocs[i]; - - if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) { -#ifndef NDEBUG - GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name); -#endif - return true; - } - - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { - continue; - } - if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) { -#ifndef 
NDEBUG - GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name); -#endif - return true; - } - } - } - - return false; -} - -bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) { - if (ggml_gallocr_needs_realloc(galloc, graph)) { - if (galloc->n_buffers == 1) { -#ifndef NDEBUG - GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__); -#endif - if (!ggml_gallocr_reserve(galloc, graph)) { - return false; - } - } else { -#ifndef NDEBUG - GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__); -#endif - return false; - } - } - - // reset buffers - for (int i = 0; i < galloc->n_buffers; i++) { - if (galloc->buffers[i] != NULL) { - ggml_backend_buffer_reset(galloc->buffers[i]); - } - } - - // allocate the graph tensors from the previous assignments - // leafs - for (int i = 0; i < graph->n_leafs; i++) { - struct ggml_tensor * leaf = graph->leafs[i]; - struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i]; - ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf); - } - // nodes - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - struct node_alloc * node_alloc = &galloc->node_allocs[i]; - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { - continue; - } - ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]); - } - ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst); - } - - return true; -} - -size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) { - GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers); - - if (galloc->buffers[buffer_id] == NULL) { - return 0; - } - - for (int i = 0; i < buffer_id; i++) { - if (galloc->buffers[i] == galloc->buffers[buffer_id]) { - // this buffer is the same as a previous one due to the same buffer type being used multiple times - // only return the buffer size the first time it appears to avoid double counting - return 0; - } - } - - return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]); -} - -// utils - -static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) { - for (size_t i = 0; i < *n_buffers; i++) { - ggml_backend_buffer_free((*buffers)[i]); - } - free(*buffers); -} - -static bool alloc_tensor_range(struct ggml_context * ctx, - struct ggml_tensor * first, struct ggml_tensor * last, - ggml_backend_buffer_type_t buft, size_t size, - ggml_backend_buffer_t ** buffers, size_t * n_buffers) { - - ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size); - if (buffer == NULL) { - GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size); - free_buffers(buffers, n_buffers); - return false; - } - - *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1)); - (*buffers)[(*n_buffers)++] = buffer; - - struct ggml_tallocr tallocr = ggml_tallocr_new(buffer); - - for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) { - enum ggml_status status = GGML_STATUS_SUCCESS; - if (t->data == NULL) { - if (t->view_src == NULL) { - status = ggml_tallocr_alloc(&tallocr, t); - } else if (t->buffer == NULL) { - status = ggml_backend_view_init(t); - } - } else { - if (t->view_src != NULL && t->buffer == NULL) { - // view of a pre-allocated tensor - status = ggml_backend_view_init(t); - } - } - if (status != GGML_STATUS_SUCCESS) { - GGML_LOG_ERROR("%s: failed to 
initialize tensor %s\n", __func__, t->name); - free_buffers(buffers, n_buffers); - return false; - } - } - - return true; -} - -ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) { - GGML_ASSERT(ggml_get_no_alloc(ctx) == true); - - size_t alignment = ggml_backend_buft_get_alignment(buft); - size_t max_size = ggml_backend_buft_get_max_size(buft); - - ggml_backend_buffer_t * buffers = NULL; - size_t n_buffers = 0; - - size_t cur_buf_size = 0; - struct ggml_tensor * first = ggml_get_first_tensor(ctx); - for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) { - size_t this_size = 0; - if (t->data == NULL && t->view_src == NULL) { - this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment); - } - - if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) { - // allocate tensors in the current buffer - if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) { - return NULL; - } - first = t; - cur_buf_size = this_size; - } else { - cur_buf_size += this_size; - } - } - - // allocate remaining tensors - if (cur_buf_size > 0) { - if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) { - return NULL; - } - } - - if (n_buffers == 0) { -#ifndef NDEBUG - GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__); -#endif - return NULL; - } - - ggml_backend_buffer_t buffer; - if (n_buffers == 1) { - buffer = buffers[0]; - } else { - buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers); - } - free(buffers); - return buffer; -} - -ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) { - return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend)); -} diff --git a/ggml/src/ggml-amx/CMakeLists.txt b/ggml/src/ggml-amx/CMakeLists.txt deleted file mode 100644 index d6676f3f..00000000 --- a/ggml/src/ggml-amx/CMakeLists.txt +++ /dev/null @@ -1,107 +0,0 @@ -if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR - (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND - CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$") AND - CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0) - message(STATUS "Using AMX") - - file(GLOB GGML_HEADERS_AMX "*.h") - list(APPEND GGML_HEADERS_AMX "../../include/ggml-amx.h") - - file(GLOB GGML_SOURCES_AMX "*.cpp") - - add_library(ggml-amx - ${GGML_HEADERS_AMX} - ${GGML_SOURCES_AMX}) - - target_link_libraries(ggml-amx PRIVATE ggml-base) - target_include_directories(ggml-amx PRIVATE . ..) - - # this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags - # TODO: integrate AMX backend into the CPU backend - if (MSVC) - # instruction set detection for MSVC only - if (GGML_NATIVE) - # TODO: improve, should not reference files from the parent folder - include(../ggml-cpu/cmake/FindSIMD.cmake) - endif () - if (GGML_AVX512) - list(APPEND ARCH_FLAGS /arch:AVX512) - # MSVC has no compile-time flags enabling specific - # AVX512 extensions, neither it defines the - # macros corresponding to the extensions. - # Do it manually. 
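# The per-extension generator expressions that follow are scoped per compile language;
# a minimal sketch of the intended pattern (assuming the standard $<COMPILE_LANGUAGE:...>
# form used elsewhere in the ggml CMake scripts) looks like:
#   if (GGML_AVX512_VBMI)
#       add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
#       add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
#   endif()
# i.e. the feature macro is defined manually for the C and C++ translation units
# respectively, since MSVC does not define it for us.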
- if (GGML_AVX512_VBMI) - add_compile_definitions($<$:__AVX512VBMI__>) - add_compile_definitions($<$:__AVX512VBMI__>) - endif() - if (GGML_AVX512_VNNI) - add_compile_definitions($<$:__AVX512VNNI__>) - add_compile_definitions($<$:__AVX512VNNI__>) - endif() - if (GGML_AVX512_BF16) - add_compile_definitions($<$:__AVX512BF16__>) - add_compile_definitions($<$:__AVX512BF16__>) - endif() - if (GGML_AMX_TILE) - add_compile_definitions($<$:__AMX_TILE__>) - add_compile_definitions($<$:__AMX_TILE__>) - endif() - if (GGML_AMX_INT8) - add_compile_definitions($<$:__AMX_INT8__>) - add_compile_definitions($<$:__AMX_INT8__>) - endif() - if (GGML_AMX_BF16) - add_compile_definitions($<$:__AMX_BF16__>) - add_compile_definitions($<$:__AMX_BF16__>) - endif() - elseif (GGML_AVX2) - list(APPEND ARCH_FLAGS /arch:AVX2) - elseif (GGML_AVX) - list(APPEND ARCH_FLAGS /arch:AVX) - endif() - else() - if (GGML_NATIVE) - list(APPEND ARCH_FLAGS -march=native) - endif() - if (GGML_F16C) - list(APPEND ARCH_FLAGS -mf16c) - endif() - if (GGML_FMA) - list(APPEND ARCH_FLAGS -mfma) - endif() - if (GGML_AVX) - list(APPEND ARCH_FLAGS -mavx) - endif() - if (GGML_AVX2) - list(APPEND ARCH_FLAGS -mavx2) - endif() - if (GGML_AVX512) - list(APPEND ARCH_FLAGS -mavx512f) - list(APPEND ARCH_FLAGS -mavx512dq) - list(APPEND ARCH_FLAGS -mavx512bw) - endif() - if (GGML_AVX512_VBMI) - list(APPEND ARCH_FLAGS -mavx512vbmi) - endif() - if (GGML_AVX512_VNNI) - list(APPEND ARCH_FLAGS -mavx512vnni) - endif() - if (GGML_AVX512_BF16) - list(APPEND ARCH_FLAGS -mavx512bf16) - endif() - if (GGML_AMX_TILE) - list(APPEND ARCH_FLAGS -mamx-tile) - endif() - if (GGML_AMX_INT8) - list(APPEND ARCH_FLAGS -mamx-int8) - endif() - if (GGML_AMX_BF16) - list(APPEND ARCH_FLAGS -mamx-bf16) - endif() - endif() - - target_compile_options(ggml-amx PRIVATE ${ARCH_FLAGS}) -else() - set(GGML_AMX OFF PARENT_SCOPE) - message(WARNING "AMX requires x86 and gcc version > 11.0. Turning off GGML_AMX.") -endif() diff --git a/ggml/src/ggml-amx/common.h b/ggml/src/ggml-amx/common.h deleted file mode 100644 index 5db8ce30..00000000 --- a/ggml/src/ggml-amx/common.h +++ /dev/null @@ -1,94 +0,0 @@ -#pragma once - -#include "ggml.h" -// hack until AMX is moved into the CPU backend -#include "../ggml-cpu/ggml-cpu-impl.h" // - -#include -#include -#include - -#if defined(_OPENMP) -#include -#endif - -#define TILE_M 16 -#define TILE_N 16 -#define TILE_K 32 -#define VNNI_BLK 4 - -#define AMX_BLK_SIZE 32 - -#define TMM0 0 -#define TMM1 1 -#define TMM2 2 -#define TMM3 3 -#define TMM4 4 -#define TMM5 5 -#define TMM6 6 -#define TMM7 7 - -// parallel routines -template ::value, int>::type = 0> -inline T div_up(T x, T y) { return (x + y - 1) / y; } - -template -inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) { -#if 0 - // onednn partition pattern - T& n_my = n_end; - if (nth <= 1 || n == 0) { - n_start = 0; - n_my = n; - } else { - T n1 = div_up(n, nth); - T n2 = n1 - 1; - T T1 = n - n2 * nth; - n_my = ith < T1 ? n1 : n2; - n_start = ith <= T1 ? 
ith*n1 : T1 * n1 + (ith - T1) * n2; - } - n_end += n_start; -#else - // pytorch aten partition pattern - T n_my = div_up(n, nth); - n_start = ith * n_my; - n_end = std::min(n_start + n_my, n); -#endif -} - -template -inline void parallel_for(int nth, int n, const func_t& f) { -#if defined(_OPENMP) -#pragma omp parallel num_threads(nth) -{ - //int nth = omp_get_num_threads(); - int ith = omp_get_thread_num(); - int tbegin, tend; - balance211(n, nth, ith, tbegin, tend); - f(tbegin, tend); -} -#else - f(0, n); - - GGML_UNUSED(nth); -#endif -} - -// quantized types that have AMX support -inline bool qtype_has_amx_kernels(const enum ggml_type type) { - // TODO: fix padding for vnni format - return (type == GGML_TYPE_Q4_0) || - (type == GGML_TYPE_Q4_1); - //(type == GGML_TYPE_Q8_0) || - //(type == GGML_TYPE_Q4_K) || - //(type == GGML_TYPE_Q5_K) || - //(type == GGML_TYPE_Q6_K) || - //(type == GGML_TYPE_IQ4_XS); -} - -// ggml backend context -struct ggml_backend_amx_context { - int n_threads = GGML_DEFAULT_N_THREADS; - std::unique_ptr work_data; - size_t work_size = 0; -}; diff --git a/ggml/src/ggml-amx/ggml-amx.cpp b/ggml/src/ggml-amx/ggml-amx.cpp deleted file mode 100644 index 8568e796..00000000 --- a/ggml/src/ggml-amx/ggml-amx.cpp +++ /dev/null @@ -1,446 +0,0 @@ -#include "ggml-amx.h" -#include "ggml-amx/common.h" -#include "ggml-amx/mmq.h" -#include "ggml-backend-impl.h" -#include "ggml-impl.h" - -#if defined(__gnu_linux__) -#include -#include -#endif - -#include -#include -#include - -#if defined(__AMX_INT8__) - -// AMX buffer interface -static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) { - free(buffer->context); -} - -static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) { - return (void *)(buffer->context); -} - -static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { - memset((char *)tensor->data + offset, value, size); - - GGML_UNUSED(buffer); -} - -static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - if (qtype_has_amx_kernels(tensor->type)) { - ggml_backend_amx_convert_weight(tensor, data, offset, size); - } else { - memcpy((char *)tensor->data + offset, data, size); - } - - GGML_UNUSED(buffer); -} - -static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { - GGML_ASSERT(!qtype_has_amx_kernels(tensor->type)); - memcpy(data, (const char *)tensor->data + offset, size); - - GGML_UNUSED(buffer); -} - -static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { - if (ggml_backend_buffer_is_host(src->buffer)) { - if (qtype_has_amx_kernels(src->type)) { - ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_backend_amx_get_alloc_size(dst)); - } else { - memcpy(dst->data, src->data, ggml_nbytes(src)); - } - return true; - } - return false; - - GGML_UNUSED(buffer); -} - -static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - memset(buffer->context, value, buffer->size); -} - -static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = { - /* .free_buffer = */ ggml_backend_amx_buffer_free_buffer, - /* .get_base = */ ggml_backend_amx_buffer_get_base, - /* .init_tensor = */ NULL, // no initialization required - /* .memset_tensor = */ 
ggml_backend_amx_buffer_memset_tensor, - /* .set_tensor = */ ggml_backend_amx_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_amx_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_amx_buffer_cpy_tensor, - /* .clear = */ ggml_backend_amx_buffer_clear, - /* .reset = */ NULL, -}; - -static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - return "AMX"; - - GGML_UNUSED(buft); -} - -static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - void * data = aligned_alloc(TENSOR_ALIGNMENT, size); - if (data == NULL) { - fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size); - return NULL; - } - - return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size); -} - -static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - return TENSOR_ALIGNMENT; - - GGML_UNUSED(buft); -} - -static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) { - return ggml_backend_amx_get_alloc_size(tensor); - - GGML_UNUSED(buft); -} - -static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) { - return false; - - GGML_UNUSED(buft); -} - -ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() { - static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = { - /* .iface = */ { - /* .get_name = */ ggml_backend_amx_buffer_type_get_name, - /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size, - /* .is_host = */ ggml_backend_amx_buffer_type_is_host, - }, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0), - /* .context = */ NULL, - }; - - return &ggml_backend_buffer_type_amx; -} - -// backend interface - -static const char * ggml_backend_amx_name(ggml_backend_t backend) { - return "AMX"; - - GGML_UNUSED(backend); -} - -static void ggml_backend_amx_free(ggml_backend_t backend) { - ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context; - delete ctx; - delete backend; -} - -static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context; - - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; - - switch (node->op) { - case GGML_OP_MUL_MAT: - ggml_backend_amx_mul_mat(ctx, node); - break; - - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - break; - - default: - fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node)); - GGML_ASSERT(false); - } - } - - return GGML_STATUS_SUCCESS; - - GGML_UNUSED(backend); -} - -static struct ggml_backend_i ggml_backend_amx_i = { - /* .get_name = */ ggml_backend_amx_name, - /* .free = */ ggml_backend_amx_free, - /* .set_tensor_async = */ NULL, - /* .get_tensor_async = */ NULL, - /* .cpy_tensor_async = */ NULL, - /* .synchronize = */ NULL, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ ggml_backend_amx_graph_compute, - /* .event_record = */ NULL, - /* .event_wait = */ NULL, -}; - -static ggml_guid_t 
ggml_backend_amx_guid() { - static ggml_guid guid = { 0x13, 0xb8, 0xa4, 0xc4, 0xba, 0xfe, 0x51, 0x67, 0x87, 0x44, 0x55, 0x15, 0xb2, 0x35, 0x62, 0x3e }; - return &guid; -} - -#define ARCH_GET_XCOMP_PERM 0x1022 -#define ARCH_REQ_XCOMP_PERM 0x1023 -#define XFEATURE_XTILECFG 17 -#define XFEATURE_XTILEDATA 18 - -static bool ggml_amx_init() { -#if defined(__gnu_linux__) - if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) { - fprintf(stderr, "AMX is not ready to be used!\n"); - return false; - } - return true; -#elif defined(_WIN32) - return true; -#endif -} - -ggml_backend_t ggml_backend_amx_init() { - - // invoke a Linux system call to request access to AMX features - ggml_amx_init(); - - // backend context - ggml_backend_amx_context * ctx = new ggml_backend_amx_context; - - // ggml amx backend - ggml_backend_t backend = new ggml_backend { - /* .guid = */ ggml_backend_amx_guid(), - /* .interface = */ ggml_backend_amx_i, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0), - /* .context = */ ctx, - }; - - return backend; -} - -bool ggml_backend_is_amx(ggml_backend_t backend) { - return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_amx_guid()); -} - -void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) { - GGML_ASSERT(ggml_backend_is_amx(backend_amx)); - - ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend_amx->context; - ctx->n_threads = n_threads; -} - -// device interface - -static const char * ggml_backend_amx_device_get_name(ggml_backend_dev_t dev) { - return "AMX"; - - GGML_UNUSED(dev); -} - -static const char * ggml_backend_amx_device_get_description(ggml_backend_dev_t dev) { - return "Intel Advanced Matrix Extensions"; - - GGML_UNUSED(dev); -} - -static void ggml_backend_amx_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - // TODO - *free = 0; - *total = 0; - - GGML_UNUSED(dev); -} - -static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) { - return GGML_BACKEND_DEVICE_TYPE_ACCEL; - - GGML_UNUSED(dev); -} - -static void ggml_backend_amx_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { - props->name = ggml_backend_amx_device_get_name(dev); - props->description = ggml_backend_amx_device_get_description(dev); - props->type = ggml_backend_amx_device_get_type(dev); - ggml_backend_amx_device_get_memory(dev, &props->memory_free, &props->memory_total); - - // `buffer_from_host_ptr` is intended to be used in mmap, when memory layout unchanged - props->caps = { - /* .async = */ false, - /* .host_buffer = */ false, - /* .buffer_from_host_ptr = */ false, - /* .events = */ false, - }; -} - -static ggml_backend_t ggml_backend_amx_device_init(ggml_backend_dev_t dev, const char * params) { - return ggml_backend_amx_init(); - - GGML_UNUSED(dev); - GGML_UNUSED(params); -} - -static ggml_backend_buffer_type_t ggml_backend_amx_device_get_buffer_type(ggml_backend_dev_t dev) { - return ggml_backend_amx_buffer_type(); - - GGML_UNUSED(dev); -} - -static bool ggml_backend_amx_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { - - // handle only 2d gemm for now - auto is_contiguous_2d = [](const struct ggml_tensor * t) { - return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1; - }; - - switch (op->op) { - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - return true; - - case GGML_OP_MUL_MAT: { - const struct ggml_tensor 
* src0 = op->src[0]; - const struct ggml_tensor * src1 = op->src[1]; - - const enum ggml_type type = src0->type; - const int64_t ne0 = op->ne[0]; - - // amx kernels enables for Q4_0, Q4_1, Q8_0, F16 - // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256 - bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16); - - bool can_use_amx = - is_contiguous_2d(src0) && // src0 must be contiguous - is_contiguous_2d(src1) && // src1 must be contiguous - src1->type == GGML_TYPE_F32 && // src1 must be float32 - has_amx_kernels && // with amx kernel impls - ne0 % (TILE_N * 2) == 0; // out_features is 32x - - return can_use_amx; - } - default: - return false; - } - - GGML_UNUSED(dev); -} - -static bool ggml_backend_amx_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name; - - GGML_UNUSED(dev); -} - -static const struct ggml_backend_device_i ggml_backend_amx_device_i = { - /* .get_name = */ ggml_backend_amx_device_get_name, - /* .get_description = */ ggml_backend_amx_device_get_description, - /* .get_memory = */ ggml_backend_amx_device_get_memory, - /* .get_type = */ ggml_backend_amx_device_get_type, - /* .get_props = */ ggml_backend_amx_device_get_props, - /* .init_backend = */ ggml_backend_amx_device_init, - /* .get_buffer_type = */ ggml_backend_amx_device_get_buffer_type, - /* .get_host_buffer_type = */ NULL, - /* .buffer_from_host_ptr = */ NULL, - /* .supports_op = */ ggml_backend_amx_device_supports_op, - /* .supports_buft = */ ggml_backend_amx_device_supports_buft, - /* .offload_op = */ NULL, - /* .event_new = */ NULL, - /* .event_free = */ NULL, - /* .event_synchronize = */ NULL, -}; - -// backend reg interface - -static const char * ggml_backend_amx_reg_get_name(ggml_backend_reg_t reg) { - return "AMX"; - - GGML_UNUSED(reg); -} - -static size_t ggml_backend_amx_reg_get_device_count(ggml_backend_reg_t reg) { - return 1; - - GGML_UNUSED(reg); -} - -static ggml_backend_dev_t ggml_backend_amx_reg_get_device(ggml_backend_reg_t reg, size_t index) { - GGML_ASSERT(index == 0); - - static ggml_backend_device ggml_backend_amx_device = { - /* .iface = */ ggml_backend_amx_device_i, - /* .reg = */ reg, - /* .context = */ nullptr, - }; - - return &ggml_backend_amx_device; - - GGML_UNUSED(reg); - GGML_UNUSED(index); -} - -static void * ggml_backend_amx_get_proc_address(ggml_backend_reg_t reg, const char * name) { - if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) { - return (void *)ggml_backend_amx_set_n_threads; - } - return NULL; - - GGML_UNUSED(reg); - GGML_UNUSED(name); -} - -static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = { - /* .get_name = */ ggml_backend_amx_reg_get_name, - /* .get_device_count = */ ggml_backend_amx_reg_get_device_count, - /* .get_device = */ ggml_backend_amx_reg_get_device, - /* .get_proc_address = */ ggml_backend_amx_get_proc_address, -}; - -ggml_backend_reg_t ggml_backend_amx_reg(void) { - static struct ggml_backend_reg ggml_backend_amx_reg = { - /* .iface = */ ggml_backend_amx_reg_i, - /* .context = */ NULL, - }; - - return &ggml_backend_amx_reg; -} - -#else // if defined(__AMX_INT8__) - -ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void) { - return nullptr; -} - -bool ggml_backend_is_amx(ggml_backend_t backend) { - GGML_UNUSED(backend); - return false; -} - -ggml_backend_t ggml_backend_amx_init(void) { - fprintf(stderr, "GGML is not compiled with AMX support!\n"); - return nullptr; -} - -void 
ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) { - fprintf(stderr, "GGML is not compiled with AMX support!\n"); - - GGML_UNUSED(backend_amx); - GGML_UNUSED(n_threads); -} - -ggml_backend_reg_t ggml_backend_amx_reg(void) { - return nullptr; -} - -#endif diff --git a/ggml/src/ggml-amx/mmq.cpp b/ggml/src/ggml-amx/mmq.cpp deleted file mode 100644 index 529bee25..00000000 --- a/ggml/src/ggml-amx/mmq.cpp +++ /dev/null @@ -1,2510 +0,0 @@ - -#if defined(__GNUC__) -#pragma GCC diagnostic ignored "-Wpedantic" -#pragma GCC diagnostic ignored "-Wunused-local-typedefs" -#endif - -#include "mmq.h" -#include "ggml-impl.h" -#include "ggml-quants.h" -#include -#include - -#if defined(__gnu_linux__) -#include -#include -#endif - -#if defined(_OPENMP) -#include -#endif - -#if (defined(_WIN32) || defined(_WIN64)) -#define RESTRICT __restrict -#else -#define RESTRICT __restrict__ -#endif - -#if (defined(_WIN32) || defined(_WIN64)) -#define ALWAYS_INLINE __forceinline -#elif __has_attribute(always_inline) || defined(__GNUC__) -#define ALWAYS_INLINE __attribute__((__always_inline__)) inline -#else -#define ALWAYS_INLINE inline -#endif - -#if defined(__AMX_INT8__) - -namespace { - -// Forced unrolling -template -struct Unroll { - template - ALWAYS_INLINE void operator()(const Func& f, Args... args) const { - Unroll{}(f, args...); - f(std::integral_constant{}, args...); - } -}; - -template <> -struct Unroll<1> { - template - ALWAYS_INLINE void operator()(const Func& f, Args... args) const { - f(std::integral_constant{}, args...); - } -}; - -// type traits -template struct PackedTypes {}; -template <> struct PackedTypes { using type = int8_t; }; -template <> struct PackedTypes { using type = uint8_t; }; -template <> struct PackedTypes { using type = int8_t; }; -template using packed_B_type = typename PackedTypes::type; - -template -struct do_compensate : std::integral_constant::value> {}; - -template -struct do_unpack : std::integral_constant::value || - std::is_same::value> {}; - -template -struct is_type_qkk : std::integral_constant::value || - std::is_same::value || - std::is_same::value || - std::is_same::value> {}; - -#define GGML_DISPATCH_FLOATING_TYPES(TYPE, ...) \ - [&] { \ - switch (TYPE) { \ - case GGML_TYPE_F16: { \ - using type = ggml_fp16_t; \ - constexpr int blck_size = 16; \ - return __VA_ARGS__(); \ - } \ - case GGML_TYPE_BF16: { \ - using type = ggml_bf16_t; \ - constexpr int blck_size = 32; \ - return __VA_ARGS__(); \ - } \ - default: \ - fprintf(stderr, "Unsupported floating data type\n"); \ - } \ - }() - -#define GGML_DISPATCH_QTYPES(QT, ...) 
\ - [&] { \ - switch (QT) { \ - case GGML_TYPE_Q4_0: { \ - using type = block_q4_0; \ - using vec_dot_type = block_q8_0; \ - constexpr int blck_size = QK4_0; \ - return __VA_ARGS__(); \ - } \ - case GGML_TYPE_Q4_1: { \ - using type = block_q4_1; \ - using vec_dot_type = block_q8_1; \ - constexpr int blck_size = QK4_1; \ - return __VA_ARGS__(); \ - } \ - case GGML_TYPE_Q8_0: { \ - using type = block_q8_0; \ - using vec_dot_type = block_q8_0; \ - constexpr int blck_size = QK8_0; \ - return __VA_ARGS__(); \ - } \ - case GGML_TYPE_Q4_K: { \ - using type = block_q4_K; \ - using vec_dot_type = block_q8_K; \ - constexpr int blck_size = QK_K; \ - return __VA_ARGS__(); \ - } \ - case GGML_TYPE_Q5_K: { \ - using type = block_q5_K; \ - using vec_dot_type = block_q8_K; \ - constexpr int blck_size = QK_K; \ - return __VA_ARGS__(); \ - } \ - case GGML_TYPE_Q6_K: { \ - using type = block_q6_K; \ - using vec_dot_type = block_q8_K; \ - constexpr int blck_size = QK_K; \ - return __VA_ARGS__(); \ - } \ - case GGML_TYPE_IQ4_XS: { \ - using type = block_iq4_xs; \ - using vec_dot_type = block_q8_K; \ - constexpr int blck_size = QK_K; \ - return __VA_ARGS__(); \ - } \ - default: \ - fprintf(stderr, "Unsupported quantized data type: %d\n", int(TYPE)); \ - } \ - }() - -#define GGML_DISPATCH_BOOL(BOOL_V, BOOL_NAME, ...) \ - [&] { \ - if (BOOL_V) { \ - constexpr bool BOOL_NAME = true; \ - return __VA_ARGS__(); \ - } else { \ - constexpr bool BOOL_NAME = false; \ - return __VA_ARGS__(); \ - } \ - }() - -// define amx tile config data structure -struct tile_config_t{ - uint8_t palette_id = 0; - uint8_t start_row = 0; - uint8_t reserved_0[14] = {0}; - uint16_t colsb[16] = {0}; - uint8_t rows[16] = {0}; -}; - -// Notes: amx tile config -// -// Typically, TMUL calculates A and B of size 16 x 64 containing INT8 values, -// and accumulate the result to a 16 x 16 matrix C containing INT32 values, -// -// As many GGUF quantized types as `block_size` of 32, so a 16-16-32 config is used -// instead of the normally used 16-16-64 config. -// -// Block A: {16, 32}, dtype = int8_t -// Block B: {16, 32}, dtype = uint8_t/int8_t -// Block C: {16, 16}, dtype = int32_t -// -// Block B needs to be prepacked to vnni format before feeding into TMUL: -// packed_B: from {n, k} to {k/vnni_blk, n, vnni_blck}, viewed in 2d, we get {8, 64} -// -// Therefore, we get tileconfig: -// A B C -// rows 16 8 16 -// colsb 32 64 16 -// -// For tile distribution, follow a 2-2-4 pattern, e.g. A used TMM2-TMM3, B used TMM0-TMM1, -// C used TMM4-TMM7: -// B TMM0 B TMM1 -// A TMM2 C TMM4 C TMM6 -// A TMM3 C TMM5 C TMM7 -// -// Each `amx` kernel handles 4 blocks at a time: 2MB * 2NB, when m < 2 * BLOCK_M, unpack A -// will be needed. -// -// Here another commonly used pattern 1-3-3 is skipped, as it is mostly used when m <=16; -// and the sinlge batch gemm (m=1) has a special fast path with `avx512-vnni`. 
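//
// Worked example (illustrative, using the constants above): with TILE_M = 16, TILE_N = 16,
// TILE_K = 32 and VNNI_BLK = 4, an A tile holds 16 rows x 32 int8 values (one 32-wide quant
// block per row), packed_B is viewed as {TILE_K / VNNI_BLK, TILE_N * VNNI_BLK} = {8, 64}
// bytes, and each TMUL (e.g. _tile_dpbusd / _tile_dpbssd) accumulates into a 16 x 16 int32
// C tile (16 rows, 64 bytes per row), matching the ggml_tile_config_init() settings below.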
-// -// ref: https://www.intel.com/content/www/us/en/developer/articles/code-sample/ -// advanced-matrix-extensions-intrinsics-functions.html -// - -#define TC_CONFIG_TILE(i, r, cb) tc.rows[i] = r; tc.colsb[i] = cb -void ggml_tile_config_init(void) { - static thread_local bool is_first_time = true; - - if (!is_first_time) { - return; - } - - static thread_local tile_config_t tc; - tile_config_t current_tc; - _tile_storeconfig(¤t_tc); - - // load only when config changes - if (tc.palette_id == 0 || (memcmp(¤t_tc.colsb, &tc.colsb, sizeof(uint16_t) * 8) != 0 && - memcmp(¤t_tc.rows, &tc.rows, sizeof(uint8_t) * 8) != 0)) { - tc.palette_id = 1; - tc.start_row = 0; - TC_CONFIG_TILE(TMM0, 8, 64); - TC_CONFIG_TILE(TMM1, 8, 64); - TC_CONFIG_TILE(TMM2, 16, 32); - TC_CONFIG_TILE(TMM3, 16, 32); - TC_CONFIG_TILE(TMM4, 16, 64); - TC_CONFIG_TILE(TMM5, 16, 64); - TC_CONFIG_TILE(TMM6, 16, 64); - TC_CONFIG_TILE(TMM7, 16, 64); - _tile_loadconfig(&tc); - } - - is_first_time = false; -} - -// we need an extra 16 * 4B (TILE_N * int32_t) for each NB/KB block for compensation. -// See the notes `s8s8 igemm compensation in avx512-vnni` for detail. -template -int get_tile_size() { - int tile_size = TILE_N * sizeof(TB); - if (do_compensate::value) { - tile_size += TILE_N * sizeof(int32_t); - } - if (std::is_same::value || - std::is_same::value) { - tile_size += TILE_N * 4; - } - if (std::is_same::value) { - tile_size += TILE_N * 2; - } - return tile_size; -} - -template -int get_row_size(int K) { - int KB = K / BLOCK_K; - int row_size = KB * sizeof(TB); - if (do_compensate::value) { - row_size += KB * sizeof(int32_t); - } - if (std::is_same::value || - std::is_same::value) { - row_size += KB * 4; - } - if (std::is_same::value) { - row_size += KB * 2; - } - return row_size; -} - -// vectorized dtype conversion -inline float FP16_TO_FP32(ggml_half val) { - __m256i v = _mm256_setr_epi16( - val, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - __m512 o = _mm512_cvtph_ps(v); - return _mm512_cvtss_f32(o); -} - -inline __m512 FP16_TO_FP32_VEC(ggml_half val) { - __m256i v = _mm256_set1_epi16(val); - return _mm512_cvtph_ps(v); -} - -// horizontal reduce -inline float _mm512_reduce_max_ps(const __m512 x) { - __m512 v = x; - __m512 v1 = _mm512_shuffle_f32x4(v, v, 0x4E); - v = _mm512_max_ps(v, v1); - v1 = _mm512_shuffle_f32x4(v, v, 0xB1); - v = _mm512_max_ps(v, v1); - v1 = _mm512_shuffle_ps(v, v, 0x4E); - v = _mm512_max_ps(v, v1); - v1 = _mm512_shuffle_ps(v, v, 0xB1); - v = _mm512_max_ps(v, v1); - return _mm512_cvtss_f32(v); -} - -// transpose utils -#define SHUFFLE_EPI32(a, b, mask) \ - _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), mask)) -inline void transpose_8x8_32bit(__m256i * v, __m256i * v1) { - // unpacking and 32-bit elements - v1[0] = _mm256_unpacklo_epi32(v[0], v[1]); - v1[1] = _mm256_unpackhi_epi32(v[0], v[1]); - v1[2] = _mm256_unpacklo_epi32(v[2], v[3]); - v1[3] = _mm256_unpackhi_epi32(v[2], v[3]); - v1[4] = _mm256_unpacklo_epi32(v[4], v[5]); - v1[5] = _mm256_unpackhi_epi32(v[4], v[5]); - v1[6] = _mm256_unpacklo_epi32(v[6], v[7]); - v1[7] = _mm256_unpackhi_epi32(v[6], v[7]); - - // shuffling the 32-bit elements - v[0] = SHUFFLE_EPI32(v1[0], v1[2], 0x44); - v[1] = SHUFFLE_EPI32(v1[0], v1[2], 0xee); - v[2] = SHUFFLE_EPI32(v1[4], v1[6], 0x44); - v[3] = SHUFFLE_EPI32(v1[4], v1[6], 0xee); - v[4] = SHUFFLE_EPI32(v1[1], v1[3], 0x44); - v[5] = SHUFFLE_EPI32(v1[1], v1[3], 0xee); - v[6] = SHUFFLE_EPI32(v1[5], v1[7], 0x44); - v[7] = SHUFFLE_EPI32(v1[5], v1[7], 0xee); - - // shuffling 
128-bit elements - v1[0] = _mm256_permute2f128_si256(v[2], v[0], 0x02); - v1[1] = _mm256_permute2f128_si256(v[3], v[1], 0x02); - v1[2] = _mm256_permute2f128_si256(v[6], v[4], 0x02); - v1[3] = _mm256_permute2f128_si256(v[7], v[5], 0x02); - v1[4] = _mm256_permute2f128_si256(v[2], v[0], 0x13); - v1[5] = _mm256_permute2f128_si256(v[3], v[1], 0x13); - v1[6] = _mm256_permute2f128_si256(v[6], v[4], 0x13); - v1[7] = _mm256_permute2f128_si256(v[7], v[5], 0x13); -} - -inline void transpose_16x4_32bit(__m512i * r, __m512i * d) { - - static const __m512i index1 = _mm512_set_epi32( - 0x0f, 0x0b, 0x07, 0x03, - 0x0e, 0x0a, 0x06, 0x02, - 0x0d, 0x09, 0x05, 0x01, - 0x0c, 0x08, 0x04, 0x00); - - d[0] = _mm512_permutexvar_epi32(index1, r[0]); - d[1] = _mm512_permutexvar_epi32(index1, r[1]); - d[2] = _mm512_permutexvar_epi32(index1, r[2]); - d[3] = _mm512_permutexvar_epi32(index1, r[3]); - - r[0] = _mm512_shuffle_i32x4(d[0], d[1], 0x44); - r[1] = _mm512_shuffle_i32x4(d[0], d[1], 0xee); - r[2] = _mm512_shuffle_i32x4(d[2], d[3], 0x44); - r[3] = _mm512_shuffle_i32x4(d[2], d[3], 0xee); - - d[0] = _mm512_shuffle_i32x4(r[0], r[2], 0x88); - d[1] = _mm512_shuffle_i32x4(r[0], r[2], 0xdd); - d[2] = _mm512_shuffle_i32x4(r[1], r[3], 0x88); - d[3] = _mm512_shuffle_i32x4(r[1], r[3], 0xdd); -} - -inline void transpose_16x16_32bit(__m512i * v) { - __m512i v1[16]; - v1[0] = _mm512_unpacklo_epi32(v[0], v[1]); - v1[1] = _mm512_unpackhi_epi32(v[0], v[1]); - v1[2] = _mm512_unpacklo_epi32(v[2], v[3]); - v1[3] = _mm512_unpackhi_epi32(v[2], v[3]); - v1[4] = _mm512_unpacklo_epi32(v[4], v[5]); - v1[5] = _mm512_unpackhi_epi32(v[4], v[5]); - v1[6] = _mm512_unpacklo_epi32(v[6], v[7]); - v1[7] = _mm512_unpackhi_epi32(v[6], v[7]); - v1[8] = _mm512_unpacklo_epi32(v[8], v[9]); - v1[9] = _mm512_unpackhi_epi32(v[8], v[9]); - v1[10] = _mm512_unpacklo_epi32(v[10], v[11]); - v1[11] = _mm512_unpackhi_epi32(v[10], v[11]); - v1[12] = _mm512_unpacklo_epi32(v[12], v[13]); - v1[13] = _mm512_unpackhi_epi32(v[12], v[13]); - v1[14] = _mm512_unpacklo_epi32(v[14], v[15]); - v1[15] = _mm512_unpackhi_epi32(v[14], v[15]); - - v[0] = _mm512_unpacklo_epi64(v1[0], v1[2]); - v[1] = _mm512_unpackhi_epi64(v1[0], v1[2]); - v[2] = _mm512_unpacklo_epi64(v1[1], v1[3]); - v[3] = _mm512_unpackhi_epi64(v1[1], v1[3]); - v[4] = _mm512_unpacklo_epi64(v1[4], v1[6]); - v[5] = _mm512_unpackhi_epi64(v1[4], v1[6]); - v[6] = _mm512_unpacklo_epi64(v1[5], v1[7]); - v[7] = _mm512_unpackhi_epi64(v1[5], v1[7]); - v[8] = _mm512_unpacklo_epi64(v1[8], v1[10]); - v[9] = _mm512_unpackhi_epi64(v1[8], v1[10]); - v[10] = _mm512_unpacklo_epi64(v1[9], v1[11]); - v[11] = _mm512_unpackhi_epi64(v1[9], v1[11]); - v[12] = _mm512_unpacklo_epi64(v1[12], v1[14]); - v[13] = _mm512_unpackhi_epi64(v1[12], v1[14]); - v[14] = _mm512_unpacklo_epi64(v1[13], v1[15]); - v[15] = _mm512_unpackhi_epi64(v1[13], v1[15]); - - v1[0] = _mm512_shuffle_i32x4(v[0], v[4], 0x88); - v1[1] = _mm512_shuffle_i32x4(v[1], v[5], 0x88); - v1[2] = _mm512_shuffle_i32x4(v[2], v[6], 0x88); - v1[3] = _mm512_shuffle_i32x4(v[3], v[7], 0x88); - v1[4] = _mm512_shuffle_i32x4(v[0], v[4], 0xdd); - v1[5] = _mm512_shuffle_i32x4(v[1], v[5], 0xdd); - v1[6] = _mm512_shuffle_i32x4(v[2], v[6], 0xdd); - v1[7] = _mm512_shuffle_i32x4(v[3], v[7], 0xdd); - v1[8] = _mm512_shuffle_i32x4(v[8], v[12], 0x88); - v1[9] = _mm512_shuffle_i32x4(v[9], v[13], 0x88); - v1[10] = _mm512_shuffle_i32x4(v[10], v[14], 0x88); - v1[11] = _mm512_shuffle_i32x4(v[11], v[15], 0x88); - v1[12] = _mm512_shuffle_i32x4(v[8], v[12], 0xdd); - v1[13] = _mm512_shuffle_i32x4(v[9], v[13], 
0xdd); - v1[14] = _mm512_shuffle_i32x4(v[10], v[14], 0xdd); - v1[15] = _mm512_shuffle_i32x4(v[11], v[15], 0xdd); - - v[0] = _mm512_shuffle_i32x4(v1[0], v1[8], 0x88); - v[1] = _mm512_shuffle_i32x4(v1[1], v1[9], 0x88); - v[2] = _mm512_shuffle_i32x4(v1[2], v1[10], 0x88); - v[3] = _mm512_shuffle_i32x4(v1[3], v1[11], 0x88); - v[4] = _mm512_shuffle_i32x4(v1[4], v1[12], 0x88); - v[5] = _mm512_shuffle_i32x4(v1[5], v1[13], 0x88); - v[6] = _mm512_shuffle_i32x4(v1[6], v1[14], 0x88); - v[7] = _mm512_shuffle_i32x4(v1[7], v1[15], 0x88); - v[8] = _mm512_shuffle_i32x4(v1[0], v1[8], 0xdd); - v[9] = _mm512_shuffle_i32x4(v1[1], v1[9], 0xdd); - v[10] = _mm512_shuffle_i32x4(v1[2], v1[10], 0xdd); - v[11] = _mm512_shuffle_i32x4(v1[3], v1[11], 0xdd); - v[12] = _mm512_shuffle_i32x4(v1[4], v1[12], 0xdd); - v[13] = _mm512_shuffle_i32x4(v1[5], v1[13], 0xdd); - v[14] = _mm512_shuffle_i32x4(v1[6], v1[14], 0xdd); - v[15] = _mm512_shuffle_i32x4(v1[7], v1[15], 0xdd); -} - -void quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, int64_t k) { - assert(k % QK_K == 0); - const int KB = k / QK_K; - constexpr int kVecs = QK_K / 16; - - block_q8_K * y = reinterpret_cast(vy); - - // hold 16 float vecs from x - __m512 v[kVecs]; - - // hold the quants vecs - __m512i vq[kVecs / 4]; - - // hold the packed quants vecs - __m512i vq_packed[kVecs / 4]; - - const __m512 signBit = _mm512_set1_ps(-0.f); - - for (int i = 0; i < KB; ++i) { - // Compute max(abs(e)) for the block - __m512 vamax = _mm512_set1_ps(0.f); - for (int j = 0; j < kVecs; ++j) { - v[j] = _mm512_loadu_ps(x); x += 16; - vamax = _mm512_max_ps(vamax, _mm512_andnot_ps(signBit, v[j])); - } - const float amax = _mm512_reduce_max_ps(vamax); - - // Quantize these floats - const float iscale = 127.f / amax; - y[i].d = GGML_FP32_TO_FP16(1 / iscale); - const float id = ( amax != 0.0f ) ? 
iscale : 0.f; - const __m512 vscale = _mm512_set1_ps(id); - - // Apply multiplier and round to nearest integer - for (int j = 0; j < kVecs; ++j) { - v[j] = _mm512_mul_ps(v[j], vscale); - v[j] = _mm512_roundscale_ps(v[j], (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); - } - - // Pack to epi8 vecs - for (int j = 0; j < kVecs / 4; ++j) { - __m128i q8_0 = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(v[j * 4 + 0])); - __m128i q8_1 = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(v[j * 4 + 1])); - __m128i q8_2 = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(v[j * 4 + 2])); - __m128i q8_3 = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(v[j * 4 + 3])); - - __m256i q8_01 = _mm256_insertf128_si256(_mm256_castsi128_si256(q8_0), (q8_1), 1); - __m256i q8_23 = _mm256_insertf128_si256(_mm256_castsi128_si256(q8_2), (q8_3), 1); - - vq[j] = _mm512_inserti32x8(_mm512_castsi256_si512(q8_01), q8_23, 1); - _mm512_storeu_si512((__m512i *)(y[i].qs + j * 64), vq[j]); - } - - // Compute the bsums with vnni - transpose_16x4_32bit(vq, vq_packed); - - const __m512i one = _mm512_set1_epi8(1); - __m512i sum = _mm512_setzero_si512(); - for (int k = 0; k < 4; ++k) { - sum = _mm512_dpbusd_epi32(sum, one, vq_packed[k]); - } - _mm256_storeu_si256((__m256i *)(y[i].bsums), _mm512_cvtepi32_epi16(sum)); - } -} - -// quantize A from float to `vec_dot_type` -template -inline void from_float(const float * x, char * vy, int64_t k); - -template <> -inline void from_float(const float * x, char * vy, int64_t k) { - // FIXME: using unoptimized reference impl until moved to CPU backend - quantize_row_q8_0_ref(x, (block_q8_0 *)vy, k); -} - -template <> -inline void from_float(const float * x, char * vy, int64_t k) { - quantize_row_q8_1_ref(x, (block_q8_1 *)vy, k); -} - -template <> -inline void from_float(const float * x, char * vy, int64_t k) { -#if 1 - // TODO: this is reference impl! 
- quantize_row_q8_K_ref(x, (block_q8_K *)vy, k); -#else - quantize_row_q8_K_vnni(x, vy, k); -#endif -} - -// load A from memory to array when nrows can not fill in whole tile -void unpack_A(int8_t * RESTRICT tile, const block_q8_0 * RESTRICT A, int lda, int nr) { - assert(nr != TILE_M); - for (int m = 0; m < nr; ++m) { - const __m256i v = _mm256_loadu_si256((const __m256i *)(A[m * lda].qs)); - _mm256_storeu_si256((__m256i *)(tile + m * TILE_K), v); - } -} - -void unpack_A(int8_t * RESTRICT tile, const block_q8_1 * RESTRICT A, int lda, int nr) { - assert(nr != TILE_M); - for (int m = 0; m < nr; ++m) { - const __m256i v = _mm256_loadu_si256((const __m256i *)(A[m * lda].qs)); - _mm256_storeu_si256((__m256i *)(tile + m * TILE_K), v); - } -} - -template -void unpack_A(int8_t * RESTRICT tile, const block_q8_K * RESTRICT A, int lda, int k, int nr) { - assert(nr <= TILE_M); - for (int m = 0; m < nr; ++m) { - const __m256i v = _mm256_loadu_si256((const __m256i *)(A[m * lda].qs + k * 32)); - _mm256_storeu_si256((__m256i *)(tile + m * TILE_K), v); - } -} - -template <> -void unpack_A(int8_t * RESTRICT tile, const block_q8_K * RESTRICT A, int lda, int k, int nr) { - assert(nr <= TILE_M); - // zero padding k from 16 to 32, so that we don't have to re-config amx - const __m128i zero = _mm_setzero_si128(); - for (int m = 0; m < nr; ++m) { - const __m128i v = _mm_loadu_si128((const __m128i *)(A[m * lda].qs + k * 16)); - const __m256i r = _mm256_insertf128_si256(_mm256_castsi128_si256(v), zero, 1); - _mm256_storeu_si256((__m256i *)(tile + m * TILE_K), r); - } -} - -#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) -inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) { - const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); - const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp); - const __m256i lowMask = _mm256_set1_epi8(0xF); - return _mm256_and_si256(lowMask, bytes); -} - -// used for block_q4_K -inline __m512i bytes_from_nibbles_64(const uint8_t * rsi) { - const __m256i tmp = _mm256_loadu_si256((const __m256i *)rsi); - const __m256i lowMask = _mm256_set1_epi8(0xF); - const __m256i q4l = _mm256_and_si256(tmp, lowMask); - const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(tmp, 4), lowMask); - return _mm512_inserti32x8(_mm512_castsi256_si512(q4l), q4h, 1); -} - -// used for block_q5_K -inline __m512i bytes_from_nibbles_64(const uint8_t * qs, const uint8_t * qh, int k) { - const __m256i lowMask = _mm256_set1_epi8(0xF); - __m256i hmask = _mm256_set1_epi8(1); - hmask = _mm256_slli_epi16(hmask, k); - - const __m256i q5bits = _mm256_loadu_si256((const __m256i *)qs); - const __m256i hbits = _mm256_loadu_si256((const __m256i *)qh); - - const __m256i q5l_0 = _mm256_and_si256(q5bits, lowMask); - const __m256i q5h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), k + 0), 4); - const __m256i q5_0 = _mm256_add_epi8(q5l_0, q5h_0); - hmask = _mm256_slli_epi16(hmask, 1); - - const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), lowMask); - const __m256i q5h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), k + 1), 4); - const __m256i q5_1 = _mm256_add_epi8(q5l_1, q5h_1); - - return _mm512_inserti32x8(_mm512_castsi256_si512(q5_0), q5_1, 1); -} - -// used for block_q6_K -inline void bytes_from_nibbles_128(__m512i& r0, __m512i& r1, const uint8_t * qs, const uint8_t * qh) { - const __m256i m4 = _mm256_set1_epi8(0xF); - const __m256i m2 = _mm256_set1_epi8(0x3); - - const __m256i q6bits1 = 
_mm256_loadu_si256((const __m256i *)qs); - const __m256i q6bits2 = _mm256_loadu_si256((const __m256i *)(qs + 32)); - const __m256i q6bitsH = _mm256_loadu_si256((const __m256i *)qh); - - const __m256i q6h_0 = _mm256_slli_epi16(_mm256_and_si256( q6bitsH, m2), 4); - const __m256i q6h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q6bitsH, 2), m2), 4); - const __m256i q6h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q6bitsH, 4), m2), 4); - const __m256i q6h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q6bitsH, 6), m2), 4); - - const __m256i q6_0 = _mm256_or_si256(_mm256_and_si256(q6bits1, m4), q6h_0); - const __m256i q6_1 = _mm256_or_si256(_mm256_and_si256(q6bits2, m4), q6h_1); - const __m256i q6_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q6bits1, 4), m4), q6h_2); - const __m256i q6_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q6bits2, 4), m4), q6h_3); - - r0 = _mm512_inserti32x8(_mm512_castsi256_si512(q6_0), q6_1, 1); - r1 = _mm512_inserti32x8(_mm512_castsi256_si512(q6_2), q6_3, 1); -} - -inline __m512i packNibbles(__m512i r0, __m512i r1) { - return _mm512_or_si512(r0, _mm512_slli_epi16(r1, 4)); -} - -template -inline void pack_qs(void * RESTRICT packed_B, const TB * RESTRICT B, int KB) { - int8_t tmp[8 * 64]; - __m256i v[8], v2[8]; - for (int n = 0; n < 8; ++n) { - v[n] = bytes_from_nibbles_32(B[n * KB].qs); - } - transpose_8x8_32bit(v, v2); - for (int n = 0; n < 8; ++n) { - _mm256_storeu_si256((__m256i *)(tmp + n * 64), v2[n]); - } - for (int n = 0; n < 8; ++n) { - v[n] = bytes_from_nibbles_32(B[(n + 8) * KB].qs); - } - transpose_8x8_32bit(v, v2); - for (int n = 0; n < 8; ++n) { - _mm256_storeu_si256((__m256i *)(tmp + n * 64 + 32), v2[n]); - } - - // pack again with 128 to fully utilize vector length - for (int n = 0; n < 8; n += 2) { - __m512i r0 = _mm512_loadu_si512((const __m512i *)(tmp + n * 64)); - __m512i r1 = _mm512_loadu_si512((const __m512i *)(tmp + n * 64 + 64)); - __m512i r1r0 = packNibbles(r0, r1); - _mm512_storeu_si512((__m512i *)((char *)packed_B + n * 32), r1r0); - } -} - -template <> -inline void pack_qs(void * RESTRICT packed_B, const block_q8_0 * RESTRICT B, int KB) { - __m256i v[8], v2[8]; - for (int n = 0; n < 8; ++n) { - v[n] = _mm256_loadu_si256((const __m256i *)(B[n * KB].qs)); - } - transpose_8x8_32bit(v, v2); - for (int n = 0; n < 8; ++n) { - _mm256_storeu_si256((__m256i *)((char *)packed_B + n * 64), v2[n]); - } - for (int n = 0; n < 8; ++n) { - v[n] = _mm256_loadu_si256((const __m256i *)(B[(n + 8) * KB].qs)); - } - transpose_8x8_32bit(v, v2); - for (int n = 0; n < 8; ++n) { - _mm256_storeu_si256((__m256i *)((char *)packed_B + n * 64 + 32), v2[n]); - } -} - -template <> -inline void pack_qs(void * RESTRICT packed_B, const block_q4_K * RESTRICT B, int KB) { - __m512i v[16]; - // QK_K 256 with 8 groups, handle 2 groups at a time - char * pb = (char *)packed_B; - for (int k = 0; k < QK_K / 64; ++k) { - // pack 2 groups { n, g, k} to {g, k/4, 4n} - // e.g. 
{16, 2, 32} to {2, 8, 64} - for (int n = 0; n < TILE_N; ++n) { - v[n] = bytes_from_nibbles_64(B[n * KB].qs + k * 32); - } - - transpose_16x16_32bit(v); - - // pack again with 128 to fully utilize vector length - for (int n = 0; n < TILE_N; n += 2) { - _mm512_storeu_si512((__m512i *)pb, packNibbles(v[n], v[n + 1])); - pb += 64; - } - } -} - -template <> -inline void pack_qs(void * RESTRICT packed_B, const block_q5_K * RESTRICT B, int KB) { - __m512i v[16]; - const __m512i lowMask = _mm512_set1_epi8(0xF); - // QK_K 256 with 8 groups, handle 2 groups at a time - char * pb = (char *)packed_B; - char * ph = (char *)packed_B + (QK_K / 2) * TILE_N; - for (int k = 0; k < QK_K / 64; ++k) { - // pack 2 groups { n, g, k} to {g, k/4, 4n} - // e.g. {16, 2, 32} to {2, 8, 64} - for (int n = 0; n < TILE_N; ++n) { - v[n] = bytes_from_nibbles_64(B[n * KB].qs + k * 32, B[n * KB].qh, /* group */2 * k); - } - - transpose_16x16_32bit(v); - - // 1. pack lower 4bits with 2 groups - for (int n = 0; n < TILE_N; n += 2) { - // get lower 4 bits - const __m512i r0 = _mm512_and_si512(v[n], lowMask); - const __m512i r1 = _mm512_and_si512(v[n + 1], lowMask); - _mm512_storeu_si512((__m512i *)pb, packNibbles(r0, r1)); pb += 64; - } - - // 2. pack higher 1bit with 2 groups - const __m512i hmask = _mm512_set1_epi8(0x10); - for (int g = 0; g < 2; ++g) { - __m512i hbits = _mm512_setzero_si512(); - hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 8 + 0], hmask), 4)); - hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 8 + 1], hmask), 3)); - hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 8 + 2], hmask), 2)); - hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 8 + 3], hmask), 1)); - hbits = _mm512_add_epi8(hbits, _mm512_and_si512(v[g * 8 + 4], hmask) ); - hbits = _mm512_add_epi8(hbits, _mm512_slli_epi16(_mm512_and_si512(v[g * 8 + 5], hmask), 1)); - hbits = _mm512_add_epi8(hbits, _mm512_slli_epi16(_mm512_and_si512(v[g * 8 + 6], hmask), 2)); - hbits = _mm512_add_epi8(hbits, _mm512_slli_epi16(_mm512_and_si512(v[g * 8 + 7], hmask), 3)); - _mm512_storeu_si512((__m512i *)ph, hbits); ph += 64; - } - } -} - -template <> -inline void pack_qs(void * RESTRICT packed_B, const block_q6_K * RESTRICT B, int KB) { - __m512i v[32]; - const __m512i lowMask = _mm512_set1_epi8(0xF); - // QK_K 256 with 8 groups, handle 4 groups at a time - char * pb = (char *)packed_B; - char * ph = (char *)packed_B + (QK_K / 2) * TILE_N; - for (int k = 0; k < QK_K / 128; ++k) { - for (int n = 0; n < TILE_N; ++n) { - bytes_from_nibbles_128(v[n], v[n + 16], B[n * KB].ql + k * 64, B[n * KB].qh + k * 32); - } - - // top half: group 0,1 or 4,5; bottom half: group 2,3 or 6,7 - transpose_16x16_32bit(v); - transpose_16x16_32bit(v + 16); - - // 1. pack lower 4bits with 4 groups - for (int n = 0; n < 32; n += 2) { - const __m512i r0 = _mm512_and_si512(v[n], lowMask); - const __m512i r1 = _mm512_and_si512(v[n + 1], lowMask); - _mm512_storeu_si512((__m512i *)pb, packNibbles(r0, r1)); pb += 64; - } - - // 2. 
pack higher 2bit with 4 groups - const __m512i hmask = _mm512_set1_epi8(0x30); - for (int g = 0; g < 8; ++g) { - __m512i hbits = _mm512_setzero_si512(); - hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 4 + 0], hmask), 4)); - hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 4 + 1], hmask), 2)); - hbits = _mm512_add_epi8(hbits, _mm512_and_si512(v[g * 4 + 2], hmask) ); - hbits = _mm512_add_epi8(hbits, _mm512_slli_epi16(_mm512_and_si512(v[g * 4 + 3], hmask), 2)); - _mm512_storeu_si512((__m512i *)ph, hbits); ph += 64; - } - } -} - -template <> -inline void pack_qs(void * RESTRICT packed_B, const block_iq4_xs * RESTRICT B, int KB) { - __m512i v[16]; - char * pb = (char *)packed_B; - for (int k = 0; k < QK_K / 64; ++k) { - for (int n = 0; n < TILE_N; ++n) { - __m256i r0 = bytes_from_nibbles_32(B[n * KB].qs + k * 32 + 0); - __m256i r1 = bytes_from_nibbles_32(B[n * KB].qs + k * 32 + 16); - v[n] = _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); - } - - transpose_16x16_32bit(v); - - // pack again with 128 to fully utilize vector length - for (int n = 0; n < TILE_N; n += 2) { - _mm512_storeu_si512((__m512i *)pb, packNibbles(v[n], v[n + 1])); - pb += 64; - } - } -} - -// pack B to vnni formats in 4bits or 8 bits -void pack_B(void * RESTRICT packed_B, const block_q4_0 * RESTRICT B, int KB) { - pack_qs(packed_B, B, KB); - ggml_half * d0 = reinterpret_cast((char *)packed_B + TILE_N * TILE_K / 2); - for (int n = 0; n < TILE_N; ++n) { - d0[n] = B[n * KB].d; - } -} - -void pack_B(void * RESTRICT packed_B, const block_q4_1 * RESTRICT B, int KB) { - pack_qs(packed_B, B, KB); - ggml_half * d0 = reinterpret_cast((char *)packed_B + TILE_N * TILE_K / 2); - ggml_half * m0 = d0 + TILE_N; - for (int n = 0; n < TILE_N; ++n) { - d0[n] = B[n * KB].d; - m0[n] = B[n * KB].m; - } -} - -inline void s8s8_compensation(void * RESTRICT packed_B) { - // packed_B layout: - // quants {TILE_N, TILEK} int8_t - // d0 {TILE_N} ggml_half - // comp {TILE_N} int32_t - const int offset = TILE_N * TILE_K + TILE_N * sizeof(ggml_half); - __m512i vcomp = _mm512_setzero_si512(); - const __m512i off = _mm512_set1_epi8(static_cast(0x80)); - for (int k = 0; k < 8; ++k) { - __m512i vb = _mm512_loadu_si512((const __m512i *)((const char *)packed_B + k * 64)); - vcomp = _mm512_dpbusd_epi32(vcomp, off, vb); - } - _mm512_storeu_si512((__m512i *)((char *)(packed_B) + offset), vcomp); -} - -void pack_B(void * RESTRICT packed_B, const block_q8_0 * RESTRICT B, int KB) { - pack_qs(packed_B, B, KB); - ggml_half * d0 = reinterpret_cast((char *)packed_B + TILE_N * TILE_K); - for (int n = 0; n < TILE_N; ++n) { - d0[n] = B[n * KB].d; - } - s8s8_compensation(packed_B); -} - -// convert 8 * {min, scale} from int6 to int8 -inline void unpack_mins_and_scales(const uint8_t * scales, uint32_t * utmp) { - const uint32_t kmask1 = 0x3f3f3f3f; - const uint32_t kmask2 = 0x0f0f0f0f; - const uint32_t kmask3 = 0x03030303; - - memcpy(utmp, scales, 12); - utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); - const uint32_t uaux = utmp[1] & kmask1; - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[2] = uaux; - utmp[0] &= kmask1; -} - -// packed_B layout: -// quants {8, TILE_N, 16} uint8 -// scales {8, TILE_N} uint8 -// mins {8, TILE_N} uint8 -// d {TILE_N} ggml_half -// dmin {TILE_N} ggml_half -void pack_B(void * RESTRICT packed_B, const block_q4_K * RESTRICT B, int KB) { - pack_qs(packed_B, B, KB); - - uint8_t * scales = reinterpret_cast((char *)packed_B + (QK_K / 2) * TILE_N); - 
uint8_t * mins = scales + 8 * TILE_N; - ggml_half * d = reinterpret_cast(mins + 8 * TILE_N); - ggml_half * dmin = d + TILE_N; - - union { - uint32_t u32[4]; - uint8_t u8[16]; - } s; - - for (int n = 0; n < TILE_N; ++n) { - unpack_mins_and_scales(B[n * KB].scales, s.u32); - for (int k = 0; k < 8; ++k) { - scales[k * TILE_N + n] = s.u8[k]; - mins[(k >> 1) * TILE_N * 2 + n * 2 + (k & 0x1)] = s.u8[k + 8]; - } - d[n] = B[n * KB].d; - dmin[n] = B[n * KB].dmin; - } -} - -// packed_B layout: -// quants {8, TILE_N, 16} uint8 -// qh {8, TILE_N, 4} uint8 -// scales {8, TILE_N} uint8 -// mins {8, TILE_N} uint8 -// d {TILE_N} ggml_half -// dmin {TILE_N} ggml_half -void pack_B(void * RESTRICT packed_B, const block_q5_K * RESTRICT B, int KB) { - pack_qs(packed_B, B, KB); - - uint8_t * scales = reinterpret_cast((char *)packed_B + (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N); - uint8_t * mins = scales + 8 * TILE_N; - ggml_half * d = reinterpret_cast(mins + 8 * TILE_N); - ggml_half * dmin = d + TILE_N; - - union { - uint32_t u32[4]; - uint8_t u8[16]; - } s; - - for (int n = 0; n < TILE_N; ++n) { - unpack_mins_and_scales(B[n * KB].scales, s.u32); - for (int k = 0; k < 8; ++k) { - scales[k * TILE_N + n] = s.u8[k]; - mins[(k >> 1) * TILE_N * 2 + n * 2 + (k & 0x1)] = s.u8[k + 8]; - } - d[n] = B[n * KB].d; - dmin[n] = B[n * KB].dmin; - } -} - -// packed_B layout: -// quants {16, TILE_N, 8} uint8 -// qh {16, TILE_N, 4} uint8 -// scales {16, TILE_N} uint8 -// d {TILE_N} ggml_half -void pack_B(void * RESTRICT packed_B, const block_q6_K * RESTRICT B, int KB) { - pack_qs(packed_B, B, KB); - - uint8_t * scales = reinterpret_cast((char *)packed_B + (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N); - ggml_half * d = reinterpret_cast(scales + 16 * TILE_N); - for (int n = 0; n < TILE_N; ++n) { - const int8_t * ps = B[n * KB].scales; - for (int k = 0; k < 16; ++k) { - scales[k * TILE_N + n] = ps[k]; - } - d[n] = B[n * KB].d; - } -} - -// packed_B layout: -// quants {8, TILE_N, 16} uint8 -// scales {8, TILE_N} int8 -// d {TILE_N} ggml_half -void pack_B(void * RESTRICT packed_B, const block_iq4_xs * RESTRICT B, int KB) { - pack_qs(packed_B, B, KB); - - int8_t * scales = reinterpret_cast((char *)packed_B + (QK_K / 2) * TILE_N); - ggml_half * d = reinterpret_cast(scales + 8 * TILE_N); - - // pack the scales - for (int n = 0; n < TILE_N; ++n) { - uint16_t sh = B[n * KB].scales_h; - for (int k = 0; k < 8; k += 2) { - const int16_t ls1 = ((B[n * KB].scales_l[k / 2] & 0xf) | ((sh << 4) & 0x30)) - 32; - const int16_t ls2 = ((B[n * KB].scales_l[k / 2] >> 4) | ((sh << 2) & 0x30)) - 32; - scales[(k + 0) * TILE_N + n] = ls1; - scales[(k + 1) * TILE_N + n] = ls2; - sh >>= 4; - } - d[n] = B[n * KB].d; - } -} - -template> -void unpack_B(packed_B_t * RESTRICT tile, const void * RESTRICT packed_B) { - GGML_UNUSED(tile); - GGML_UNUSED(packed_B); -}; - -template <> -void unpack_B(int8_t * RESTRICT tile, const void * RESTRICT packed_B) { - const __m512i off = _mm512_set1_epi8(8); - const __m512i lowMask = _mm512_set1_epi8(0xF); - for (int n = 0; n < 8; n += 2) { - __m512i bytes = _mm512_loadu_si512((const __m512i *)((const char *)packed_B + n * 32)); - const __m512i r0 = _mm512_sub_epi8(_mm512_and_si512(bytes, lowMask), off); - const __m512i r1 = _mm512_sub_epi8(_mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask), off); - _mm512_storeu_si512((__m512i *)(tile + n * 64 + 0), r0); - _mm512_storeu_si512((__m512i *)(tile + n * 64 + 64), r1); - } -} - -template <> -void unpack_B(uint8_t * RESTRICT tile, const void * RESTRICT packed_B) { - const __m512i 
lowMask = _mm512_set1_epi8(0xF); - for (int n = 0; n < 8; n += 2) { - __m512i bytes = _mm512_loadu_si512((const __m512i *)((const char *)packed_B + n * 32)); - const __m512i r0 = _mm512_and_si512(bytes, lowMask); - const __m512i r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask); - _mm512_storeu_si512((__m512i *)(tile + n * 64 + 0), r0); - _mm512_storeu_si512((__m512i *)(tile + n * 64 + 64), r1); - } -} - -// packed_B_t for QKK is int8_t -template -void unpack_B(int8_t * RESTRICT tile, const void * RESTRICT packed_B, int k) { - const int packed_B_group_size = QK_K / 2 * TILE_N / 8; - const char * packed_B_group = (const char *)packed_B + k * packed_B_group_size; - const __m512i lowMask = _mm512_set1_epi8(0xF); - for (int n = 0; n < 8; n += 2) { - __m512i bytes = _mm512_loadu_si512(packed_B_group + n * 32); - const __m512i r0 = _mm512_and_si512(bytes, lowMask); - const __m512i r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask); - _mm512_storeu_si512((__m512i *)(tile + n * 64 + 0), r0); - _mm512_storeu_si512((__m512i *)(tile + n * 64 + 64), r1); - } -} - -template <> -void unpack_B(int8_t * RESTRICT tile, const void * RESTRICT packed_B, int k) { - // lower 4bits, stride 256 bytes - const int packed_l4_group_size = QK_K / 2 * TILE_N / 8; - const char * pb = (const char *)packed_B + k * packed_l4_group_size; - - // higher 1bit, stride 64 bytes - const int packed_h1_group_size = QK_K / 8 * TILE_N / 8; - const char * ph = (const char *)packed_B + (QK_K / 2) * TILE_N + k * packed_h1_group_size; - const __m512i hbits = _mm512_loadu_si512(ph); - - const __m512i lowMask = _mm512_set1_epi8(0xF); - __m512i hmask0 = _mm512_set1_epi8(0x1); - __m512i hmask1 = _mm512_set1_epi8(0x2); - - for (int n = 0; n < 8; n += 2) { - __m512i bytes = _mm512_loadu_si512(pb + n * 32); - __m512i r0 = _mm512_and_si512(bytes, lowMask); - __m512i r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask); - __m512i h0 = _mm512_slli_epi16(_mm512_srli_epi16(_mm512_and_si512(hbits, hmask0), n), 4); - __m512i h1 = _mm512_slli_epi16(_mm512_srli_epi16(_mm512_and_si512(hbits, hmask1), n + 1), 4); - - hmask0 = _mm512_slli_epi16(hmask0, 2); - hmask1 = _mm512_slli_epi16(hmask1, 2); - r0 = _mm512_add_epi8(r0, h0); - r1 = _mm512_add_epi8(r1, h1); - _mm512_storeu_si512((__m512i *)(tile + n * 64 + 0), r0); - _mm512_storeu_si512((__m512i *)(tile + n * 64 + 64), r1); - } -} - -template <> -void unpack_B(int8_t * RESTRICT tile, const void * RESTRICT packed_B, int k) { - // lower 4bits, stride 128 bytes - const int packed_l4_group_size = QK_K / 2 * TILE_N / 16; - const char * pb = (const char *)packed_B + k * packed_l4_group_size; - - // higher 2bits, stride 64 bytes - const int packed_h2_group_size = QK_K / 4 * TILE_N / 16; - const char * ph = (const char *)packed_B + (QK_K / 2) * TILE_N + k * packed_h2_group_size; - const __m512i hbits = _mm512_loadu_si512(ph); - - const __m512i off = _mm512_set1_epi8(32); - const __m512i lowMask = _mm512_set1_epi8(0xF); - __m512i hmask0 = _mm512_set1_epi8(0x3); // 0011 - __m512i hmask1 = _mm512_set1_epi8(0xC); // 1100 - - // notes: skip zero padding from row4 to row7 as we have done so in `unpack_A` - __m512i bytes = _mm512_loadu_si512(pb); - __m512i r0 = _mm512_and_si512(bytes, lowMask); - __m512i r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask); - __m512i h0 = _mm512_slli_epi16(_mm512_and_si512(hbits, hmask0), 4); - __m512i h1 = _mm512_slli_epi16(_mm512_and_si512(hbits, hmask1), 2); - _mm512_storeu_si512((__m512i *)(tile + 0), _mm512_sub_epi8(_mm512_add_epi8(r0, h0), off)); 
- _mm512_storeu_si512((__m512i *)(tile + 64), _mm512_sub_epi8(_mm512_add_epi8(r1, h1), off)); - - hmask0 = _mm512_slli_epi16(hmask0, 4); - hmask1 = _mm512_slli_epi16(hmask1, 4); - - bytes = _mm512_loadu_si512(pb + 64); - r0 = _mm512_and_si512(bytes, lowMask); - r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask); - h0 = _mm512_and_si512(hbits, hmask0); - h1 = _mm512_srli_epi16(_mm512_and_si512(hbits, hmask1), 2); - _mm512_storeu_si512((__m512i *)(tile + 128), _mm512_sub_epi8(_mm512_add_epi8(r0, h0), off)); - _mm512_storeu_si512((__m512i *)(tile + 192), _mm512_sub_epi8(_mm512_add_epi8(r1, h1), off)); -} - -template <> -void unpack_B(int8_t * RESTRICT tile, const void * RESTRICT packed_B, int k) { - static const __m512i values128 = _mm512_set_epi8( - 113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127, - 113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127, - 113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127, - 113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127 - ); - - const int packed_B_group_size = QK_K / 2 * TILE_N / 8; - const char * pb = (const char *)packed_B + k * packed_B_group_size; - const __m512i lowMask = _mm512_set1_epi8(0xF); - - for (int n = 0; n < 8; n += 2) { - __m512i bytes = _mm512_loadu_si512(pb + n * 32); - const __m512i r0 = _mm512_shuffle_epi8(values128, _mm512_and_si512(bytes, lowMask)); - const __m512i r1 = _mm512_shuffle_epi8(values128, _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask)); - _mm512_storeu_si512((__m512i *)(tile + n * 64 + 0), r0); - _mm512_storeu_si512((__m512i *)(tile + n * 64 + 64), r1); - } -} - -template -struct acc_C {}; - -template -struct acc_C { - static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_0 * A, int lda, const void * packed_B, int nr) { - const int offset = TILE_N * TILE_K / 2; - const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset))); - - for (int m = 0; m < nr; ++m) { - const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d)); - const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); - - __m512 vsum; - if (is_acc) { - vsum = _mm512_loadu_ps(C + m * ldc); - } else { - vsum = _mm512_set1_ps(0.f); - } - vsum = _mm512_fmadd_ps(vtile, _mm512_mul_ps(vd0, vd1), vsum); - _mm512_storeu_ps(C + m * ldc, vsum); - } - } -}; - -template -struct acc_C { - static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_1 * A, int lda, const void * packed_B, int nr) { - const int offset = TILE_N * TILE_K / 2; - const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset))); - const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(ggml_half)))); - - for (int m = 0; m < nr; ++m) { - const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d)); - const __m512 vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].s)); - const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); - - __m512 vsum; - if (is_acc) { - vsum = _mm512_loadu_ps(C + m * ldc); - } else { - vsum = _mm512_set1_ps(0.f); - } - vsum = _mm512_fmadd_ps(vtile, _mm512_mul_ps(vd0, vd1), vsum); - vsum = _mm512_fmadd_ps(vm0, vs1, vsum); - _mm512_storeu_ps(C + m * ldc, vsum); - } - } -}; - -template -struct acc_C { - static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_0 * 
A, int lda, const void * packed_B, int nr) { - const int offset = TILE_N * TILE_K; - const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset))); - - for (int m = 0; m < nr; ++m) { - const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d)); - const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); - - __m512 vsum; - if (is_acc) { - vsum = _mm512_loadu_ps(C + m * ldc); - } else { - vsum = _mm512_set1_ps(0.f); - } - vsum = _mm512_fmadd_ps(vtile, _mm512_mul_ps(vd0, vd1), vsum); - _mm512_storeu_ps(C + m * ldc, vsum); - } - } -}; - -template -struct acc_C { - static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_K * A, int lda, const void * packed_B, int nr) { - const uint8_t * scales = reinterpret_cast((const char *)packed_B + (QK_K / 2) * TILE_N); - const uint8_t * mins = scales + 8 * TILE_N; - const ggml_half * d0 = reinterpret_cast(mins + 8 * TILE_N); - const ggml_half * dmin = d0 + TILE_N; - - const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)d0)); - const __m512 vdmin = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)dmin)); - - for (int m = 0; m < nr; ++m) { - const float d1 = A[m * lda].d; - const __m512 vd = _mm512_mul_ps(_mm512_set1_ps(d1), vd0); - const __m512 vdm = _mm512_mul_ps(_mm512_set1_ps(-d1), vdmin); - const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); - - __m512 vsum; - if (is_acc) { - vsum = _mm512_loadu_ps(C + m * ldc); - } else { - vsum = _mm512_set1_ps(0.f); - } - - const __m256i q8sums = _mm256_loadu_si256((const __m256i *)A[m * lda].bsums); - const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1)); - - __m512i acc_m = _mm512_setzero_si512(); - for (int k = 0; k < 4; ++k) { - __m512i vmask = _mm512_set1_epi32(k); - __m512i va = _mm512_permutexvar_epi32(vmask, _mm512_castsi128_si512(q8s)); - __m512i vb = _mm512_cvtepi8_epi16(_mm256_loadu_si256((const __m256i *)(mins + k * 32))); - acc_m = _mm512_dpwssds_epi32(acc_m, va, vb); - } - - vsum = _mm512_fmadd_ps(vtile, vd, vsum); - vsum = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc_m), vdm, vsum); - _mm512_storeu_ps(C + m * ldc, vsum); - } - } -}; - -template -struct acc_C { - static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_K * A, int lda, const void * packed_B, int nr) { - const uint8_t * scales = reinterpret_cast((const char *)packed_B + (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N); - const uint8_t * mins = scales + 8 * TILE_N; - const ggml_half * d0 = reinterpret_cast(mins + 8 * TILE_N); - const ggml_half * dmin = d0 + TILE_N; - - const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)d0)); - const __m512 vdmin = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)dmin)); - - for (int m = 0; m < nr; ++m) { - const float d1 = A[m * lda].d; - const __m512 vd = _mm512_mul_ps(_mm512_set1_ps(d1), vd0); - const __m512 vdm = _mm512_mul_ps(_mm512_set1_ps(-d1), vdmin); - const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); - - __m512 vsum; - if (is_acc) { - vsum = _mm512_loadu_ps(C + m * ldc); - } else { - vsum = _mm512_set1_ps(0.f); - } - - const __m256i q8sums = _mm256_loadu_si256((const __m256i *)A[m * lda].bsums); - const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1)); - - __m512i acc_m = _mm512_setzero_si512(); - for (int k = 0; k < 4; ++k) { - __m512i vmask = 
_mm512_set1_epi32(k); - __m512i va = _mm512_permutexvar_epi32(vmask, _mm512_castsi128_si512(q8s)); - __m512i vb = _mm512_cvtepi8_epi16(_mm256_loadu_si256((const __m256i *)(mins + k * 32))); - acc_m = _mm512_dpwssds_epi32(acc_m, va, vb); - } - - vsum = _mm512_fmadd_ps(vtile, vd, vsum); - vsum = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc_m), vdm, vsum); - _mm512_storeu_ps(C + m * ldc, vsum); - } - } -}; - -template -struct acc_C { - static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_K * A, int lda, const void * packed_B, int nr) { - const uint8_t * scales = reinterpret_cast((const char *)packed_B + (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N); - const ggml_half * d0 = reinterpret_cast(scales + 16 * TILE_N); - - const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)d0)); - - for (int m = 0; m < nr; ++m) { - const float d1 = A[m * lda].d; - const __m512 vd = _mm512_mul_ps(_mm512_set1_ps(d1), vd0); - const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); - - __m512 vsum; - if (is_acc) { - vsum = _mm512_loadu_ps(C + m * ldc); - } else { - vsum = _mm512_set1_ps(0.f); - } - - vsum = _mm512_fmadd_ps(vtile, vd, vsum); - _mm512_storeu_ps(C + m * ldc, vsum); - } - } -}; - -template -struct acc_C { - static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_K * A, int lda, const void * packed_B, int nr) { - const int8_t * scales = reinterpret_cast((const char *)packed_B + (QK_K / 2) * TILE_N); - const ggml_half * d0 = reinterpret_cast(scales + 8 * TILE_N); - - const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)d0)); - - for (int m = 0; m < nr; ++m) { - const float d1 = A[m * lda].d; - const __m512 vd = _mm512_mul_ps(_mm512_set1_ps(d1), vd0); - const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); - - __m512 vsum; - if (is_acc) { - vsum = _mm512_loadu_ps(C + m * ldc); - } else { - vsum = _mm512_set1_ps(0.f); - } - - vsum = _mm512_fmadd_ps(vtile, vd, vsum); - _mm512_storeu_ps(C + m * ldc, vsum); - } - } -}; - -template constexpr int get_quants_size(); -template <> constexpr int get_quants_size() { return (QK_K / 2) * TILE_N; } -template <> constexpr int get_quants_size() { return (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N; } -template <> constexpr int get_quants_size() { return (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N; } -template <> constexpr int get_quants_size() { return (QK_K / 2) * TILE_N; } - -// used for QKK format -template ::value, int>::type = 0> -inline void scale_C(const int32_t * RESTRICT tile, int32_t * RESTRICT sumi, const void * packed_B, int k, int nr) { - const uint8_t * scales = reinterpret_cast((const char *)packed_B + get_quants_size()); - const __m512i vscale = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i *)(scales + k * TILE_N))); - - for (int m = 0; m < nr; ++m) { - __m512i vsumi; - if (is_acc) { - vsumi = _mm512_loadu_si512(sumi + m * TILE_N); - } else { - vsumi = _mm512_setzero_si512(); - } - __m512i vtile = _mm512_loadu_si512(tile + m * TILE_N); - vsumi = _mm512_add_epi32(vsumi, _mm512_mullo_epi32(vtile, vscale)); - _mm512_storeu_si512((__m512i *)(sumi + m * TILE_N), vsumi); - } -} - -template -struct tinygemm_kernel_avx { - static void apply(int K, const TA * RESTRICT A, const TB * RESTRICT B, TC * RESTRICT C, int ldc) { - GGML_UNUSED(K); - GGML_UNUSED(A); - GGML_UNUSED(B); - GGML_UNUSED(C); - GGML_UNUSED(ldc); - } -}; - -template -struct tinygemm_kernel_avx { - static void apply(int K, const float * RESTRICT A, 
const ggml_fp16_t * RESTRICT B, float * RESTRICT C, int ldc) { - constexpr int ROWS = BLOCK_M; - constexpr int COLS = BLOCK_N; - assert(BLOCK_K == 16); - - __m512 va; - __m512 vb[COLS]; - __m512 vc[ROWS * COLS]; - - auto loadc = [&](int idx) { - vc[idx] = _mm512_setzero_ps(); - }; - Unroll{}(loadc); - - auto compute = [&](int idx, int k) { - // TODO: use `constexpr` here to get rid of interger div - // when upgraded to C++17 - const int row = idx / COLS; - const int col = idx % COLS; - - if (col == 0) { - va = _mm512_loadu_ps(A + row * K + k); - } - if (row == 0) { - vb[col] = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(B + col * K + k))); - } - vc[idx] = _mm512_fmadd_ps(va, vb[col], vc[idx]); - }; - - for (int k = 0; k < K; k += 16) { - Unroll{}(compute, k); - } - - auto storec = [&](int idx) { - const int row = idx / COLS; - const int col = idx % COLS; - C[row * ldc + col] = _mm512_reduce_add_ps(vc[idx]); - }; - Unroll{}(storec); - } -}; - -#define LAUNCH_TINYGEMM_KERNEL_AVX(MB_SIZE, NB_SIZE) \ - tinygemm_kernel_avx::apply( \ - K, (const float *)src1->data + mb_start * K, \ - (const type *)src0->data + nb_start * K, \ - (float *)dst->data + mb_start * ldc + nb_start, ldc); - - -// re-organize in the format {NB, KB, TILE_SIZE}: -#define PACKED_INDEX(n, k, KB, tile_size) (n * KB + k) * tile_size - -template -void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K, int n_threads) { - const int NB = N / TILE_N; - const int KB = K / BLOCK_K; - const int TILE_SIZE = get_tile_size(); - - // parallel on NB should be enough - parallel_for(n_threads, NB, [&](int begin, int end) { - for (int n = begin; n < end; ++n) { - for (int k = 0; k < KB; ++k) { - int n0 = n * TILE_N; - pack_B((char *)packed_B + PACKED_INDEX(n, k, KB, TILE_SIZE), &B[n0 * KB + k], KB); - } - } - }); -} - -template -struct tinygemm_kernel_vnni {}; - -template -struct tinygemm_kernel_vnni { - static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) { - - constexpr int COLS = BLOCK_N / 16; - const int TILE_SIZE = TILE_N * sizeof(block_q4_0); - - const block_q8_0 * RESTRICT A = static_cast(_A); - const char * RESTRICT B = static_cast(_B); - - __m512i va[8]; - __m512 vc[COLS]; - __m512 vd1; - - // sum of offsets, shared across COLS - // - // avx512-vnni does not have `_mm512_dpbssd_epi32`, - // need to transfrom ss to us: - // a * (b - 8) is equavilent to b * a - 8 * a - // s u u u s u s - // - __m512i vcomp; - - const __m512i off = _mm512_set1_epi8(8); - const __m512i lowMask = _mm512_set1_epi8(0xF); - - auto loadc = [&](int col) { - vc[col] = _mm512_setzero_ps(); - }; - Unroll{}(loadc); - - auto compute = [&](int col, int i) { - // load a and compute compensation - if (col == 0) { - const int32_t * a_ptr = reinterpret_cast(A[0 * KB + i].qs); - vcomp = _mm512_setzero_si512(); - for (int k = 0; k < 8; ++k) { - va[k] = _mm512_set1_epi32(a_ptr[k]); - vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]); - } - vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d)); - } - - // load b - __m512i vsum = _mm512_setzero_si512(); - const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE); - for (int k = 0; k < 8; k += 2) { - __m512i bytes = _mm512_loadu_si512((const __m512i *)(b_ptr + k * 32)); - __m512i vb0 = _mm512_and_si512(bytes, lowMask); - vsum = _mm512_dpbusd_epi32(vsum, vb0, va[k + 0]); - __m512i vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask); - vsum = _mm512_dpbusd_epi32(vsum, vb1, va[k + 1]); - } - const int offset = TILE_N * 
TILE_K / 2; - const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset))); - vsum = _mm512_sub_epi32(vsum, vcomp); - - vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(vsum), _mm512_mul_ps(vd0, vd1), vc[col]); - }; - - for (int i = 0; i < KB; ++i) { - Unroll{}(compute, i); - } - - //store to C - auto storec = [&](int col) { - _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); - }; - Unroll{}(storec); - } -}; - -template -struct tinygemm_kernel_vnni { - static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) { - - constexpr int COLS = BLOCK_N / 16; - const int TILE_SIZE = TILE_N * sizeof(block_q4_1); - - const block_q8_1 * RESTRICT A = static_cast(_A); - const char * RESTRICT B = static_cast(_B); - - __m512i va[8]; - __m512i vb[8]; - __m512 vc[COLS]; - __m512 vd1, vs1; - - const __m512i lowMask = _mm512_set1_epi8(0xF); - - auto loadc = [&](int col) { - vc[col] = _mm512_setzero_ps(); - }; - Unroll{}(loadc); - - auto compute = [&](int col, int i) { - // load a - if (col == 0) { - const int32_t * a_ptr = reinterpret_cast(A[0 * KB + i].qs); - for (int k = 0; k < 8; ++k) { - va[k] = _mm512_set1_epi32(a_ptr[k]); - } - vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d)); - vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].s)); - } - - // load b - const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE); - for (int k = 0; k < 8; k += 2) { - __m512i bytes = _mm512_loadu_si512((const __m512i *)(b_ptr + k * 32)); - vb[k + 0] = _mm512_and_si512(bytes, lowMask); - vb[k + 1] = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask); - } - const int offset = TILE_N * TILE_K / 2; - const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset))); - const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset + TILE_N * sizeof(ggml_half)))); - - __m512i vsum = _mm512_setzero_si512(); - for (int k = 0; k < 8; ++k) { - vsum = _mm512_dpbusd_epi32(vsum, vb[k], va[k]); - } - - vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(vsum), _mm512_mul_ps(vd0, vd1), vc[col]); - vc[col] = _mm512_fmadd_ps(vm0, vs1, vc[col]); - }; - - for (int i = 0; i < KB; ++i) { - Unroll{}(compute, i); - } - - //store to C - auto storec = [&](int col) { - _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); - }; - Unroll{}(storec); - } -}; - -template -struct tinygemm_kernel_vnni { - static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) { - - constexpr int COLS = BLOCK_N / 16; - const int TILE_SIZE = TILE_N * sizeof(block_q8_0) + TILE_N * sizeof(int32_t); - - const block_q8_0 * RESTRICT A = static_cast(_A); - const char * RESTRICT B = static_cast(_B); - - __m512i va[8]; - __m512i vb[8]; - __m512 vc[COLS]; - __m512 vd1; - - // Notes: s8s8 igemm compensation in avx512-vnni - // change s8s8 to u8s8 with compensate - // a * b = (a + 128) * b - 128 * b - // s s u s u s - // - // (128 * b is pre-computed when packing B to vnni formats) - // - const __m512i off = _mm512_set1_epi8(static_cast(0x80)); - - auto loadc = [&](int col) { - vc[col] = _mm512_setzero_ps(); - }; - Unroll{}(loadc); - - auto compute = [&](int col, int i) { - // load a and add offset 128 - if (col == 0) { - const int32_t * a_ptr = reinterpret_cast(A[0 * KB + i].qs); - for (int k = 0; k < 8; ++k) { - va[k] = _mm512_set1_epi32(a_ptr[k]); - va[k] = _mm512_add_epi8(va[k], off); - } - vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d)); - } - - // load b - const char * 
b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE); - for (int k = 0; k < 8; ++k) { - vb[k] = _mm512_loadu_si512((const __m512i *)(b_ptr + k * 64)); - } - const int offset = TILE_N * TILE_K; - const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset))); - const int offset2 = TILE_N * TILE_K + TILE_N * sizeof(ggml_half); - const __m512i vcomp = _mm512_loadu_si512((const __m512i *)(b_ptr + offset2)); - - __m512i vsum = _mm512_setzero_si512(); - for (int k = 0; k < 8; ++k) { - vsum = _mm512_dpbusd_epi32(vsum, va[k], vb[k]); - } - vsum = _mm512_sub_epi32(vsum, vcomp); - - vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(vsum), _mm512_mul_ps(vd0, vd1), vc[col]); - }; - - for (int i = 0; i < KB; ++i) { - Unroll{}(compute, i); - } - - //store to C - auto storec = [&](int col) { - _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); - }; - Unroll{}(storec); - } -}; - -template -struct tinygemm_kernel_vnni { - static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) { - - constexpr int COLS = BLOCK_N / 16; - const int TILE_SIZE = TILE_N * sizeof(block_q4_K) + TILE_N * 4; - - const block_q8_K * RESTRICT A = static_cast(_A); - const char * RESTRICT B = static_cast(_B); - - // a.qs: 8 groups, 32 bytes each group (m256i) - __m512i va[8]; - // a.bsum: 8 groups, 2 bytes each group (m128i) - __m512i va_bsum; - __m512 vc[COLS]; - __m512 vd1; - - // packed_B: - const int offset_scales = (QK_K / 2) * TILE_N; - const int offset_mins = (QK_K / 2) * TILE_N + 8 * TILE_N; - const int offset_d0 = (QK_K / 2) * TILE_N + 16 * TILE_N; - const int offset_dmin = (QK_K / 2) * TILE_N + 16 * TILE_N + TILE_N * sizeof(ggml_half); - - const __m512i lowMask = _mm512_set1_epi8(0xF); - - auto loadc = [&](int col) { - vc[col] = _mm512_setzero_ps(); - }; - Unroll{}(loadc); - - // Notes: vnni formats in QK_K - // a) quants vnni format - // int8 {k/4, n, 4}, viewed as 2d {k/4, 4n}, k = 32 - // from {16, 32} to {8, 64} - // - // b) min vnni format - // int16 {k/2, n, 2}, viewed as 2d {k/2, 2n}, k = 8 - // from {16, 8} to {4, 32} - // - auto compute = [&](int col, int i) { - // load a - if (col == 0) { - for (int k_group = 0; k_group < QK_K / 32; ++k_group) { - va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32))); - } - const __m256i q8sums = _mm256_loadu_si256((const __m256i *)A[0 * KB + i].bsums); - const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1)); - va_bsum = _mm512_castsi128_si512(q8s); - vd1 = _mm512_set1_ps(A[0 * KB + i].d); - } - - // step 1: accumultate the quants - __m512i acc = _mm512_setzero_si512(); - const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE); - const char * b_qs = b_ptr; - for (int k_group = 0; k_group < QK_K / 32; ++k_group) { - __m512i vsum = _mm512_setzero_si512(); - for (int k = 0; k < 8; k += 2) { - __m512i va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(k + 0), va[k_group]); - __m512i va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(k + 1), va[k_group]); - - __m512i bytes = _mm512_loadu_si512((const __m512i *)b_qs); - __m512i vb0 = _mm512_and_si512(bytes, lowMask); - vsum = _mm512_dpbusd_epi32(vsum, vb0, va0); - __m512i vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask); - vsum = _mm512_dpbusd_epi32(vsum, vb1, va1); - - b_qs += 64; - } - // vacc += scale * (q8 @ q4) - const __m512i vscale = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i *)(b_ptr + offset_scales + k_group * TILE_N))); - 
acc = _mm512_add_epi32(acc, _mm512_mullo_epi32(vsum, vscale)); - } - const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_d0))); - vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc), _mm512_mul_ps(vd0, vd1), vc[col]); - - // step 2: accumulate the mins - __m512i acc_m = _mm512_setzero_si512(); - for (int k = 0; k < 4; ++k) { - __m512i vmask = _mm512_set1_epi32(k); - __m512i va = _mm512_permutexvar_epi32(vmask, va_bsum); - __m512i vb = _mm512_cvtepi8_epi16(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_mins + k * 32))); - acc_m = _mm512_dpwssds_epi32(acc_m, va, vb); - } - const __m512 vdmin = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_dmin))); - vc[col] = _mm512_fnmadd_ps(_mm512_cvtepi32_ps(acc_m), _mm512_mul_ps(vdmin, vd1), vc[col]); - }; - - for (int i = 0; i < KB; ++i) { - Unroll{}(compute, i); - } - - //store to C - auto storec = [&](int col) { - _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); - }; - Unroll{}(storec); - } -}; - -template -struct tinygemm_kernel_vnni { - static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) { - - constexpr int COLS = BLOCK_N / 16; - const int TILE_SIZE = TILE_N * sizeof(block_q5_K) + TILE_N * 4; - - const block_q8_K * RESTRICT A = static_cast(_A); - const char * RESTRICT B = static_cast(_B); - - // a.qs: 8 groups, 32 bytes each group (m256i) - __m512i va[8]; - // a.bsum: 8 groups, 2 bytes each group (m128i) - __m512i va_bsum; - __m512 vc[COLS]; - __m512 vd1; - - // packed_B: - const int offset_qh = (QK_K / 2) * TILE_N; - const int offset_scales = (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N; - const int offset_mins = (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N + 8 * TILE_N; - const int offset_d0 = (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N + 16 * TILE_N; - const int offset_dmin = (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N + 16 * TILE_N + TILE_N * sizeof(ggml_half); - - const __m512i lowMask = _mm512_set1_epi8(0xF); - - auto loadc = [&](int col) { - vc[col] = _mm512_setzero_ps(); - }; - Unroll{}(loadc); - - // Q5_K and Q4_K shares the same vnni formats, refer to notes above. 
- auto compute = [&](int col, int i) { - // load a - if (col == 0) { - for (int k_group = 0; k_group < QK_K / 32; ++k_group) { - va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32))); - } - const __m256i q8sums = _mm256_loadu_si256((const __m256i *)A[0 * KB + i].bsums); - const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1)); - va_bsum = _mm512_castsi128_si512(q8s); - vd1 = _mm512_set1_ps(A[0 * KB + i].d); - } - - // step 1: accumultate the quants - __m512i acc = _mm512_setzero_si512(); - const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE); - const char * b_qs = b_ptr; - const char * b_qh = b_ptr + offset_qh; - for (int k_group = 0; k_group < QK_K / 32; ++k_group) { - __m512i vsum = _mm512_setzero_si512(); - __m512i hmask0 = _mm512_set1_epi8(0x1); - __m512i hmask1 = _mm512_set1_epi8(0x2); - __m512i hbits = _mm512_loadu_si512((const __m512i *)(b_qh + k_group * 64)); - for (int k = 0; k < 8; k += 2) { - __m512i va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(k + 0), va[k_group]); - __m512i va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(k + 1), va[k_group]); - - __m512i bytes = _mm512_loadu_si512((const __m512i *)b_qs); - __m512i vb0 = _mm512_and_si512(bytes, lowMask); - __m512i vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask); - - __m512i vh0 = _mm512_slli_epi16(_mm512_srli_epi16(_mm512_and_si512(hbits, hmask0), k), 4); - __m512i vh1 = _mm512_slli_epi16(_mm512_srli_epi16(_mm512_and_si512(hbits, hmask1), k + 1), 4); - - hmask0 = _mm512_slli_epi16(hmask0, 2); - hmask1 = _mm512_slli_epi16(hmask1, 2); - vb0 = _mm512_add_epi8(vb0, vh0); - vb1 = _mm512_add_epi8(vb1, vh1); - - vsum = _mm512_dpbusd_epi32(vsum, vb0, va0); - vsum = _mm512_dpbusd_epi32(vsum, vb1, va1); - - b_qs += 64; - } - // vacc += scale * (q8 @ q5) - const __m512i vscale = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i *)(b_ptr + offset_scales + k_group * TILE_N))); - acc = _mm512_add_epi32(acc, _mm512_mullo_epi32(vsum, vscale)); - } - const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_d0))); - vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc), _mm512_mul_ps(vd0, vd1), vc[col]); - - // step 2: accumulate the mins - __m512i acc_m = _mm512_setzero_si512(); - for (int k = 0; k < 4; ++k) { - __m512i vmask = _mm512_set1_epi32(k); - __m512i va = _mm512_permutexvar_epi32(vmask, va_bsum); - __m512i vb = _mm512_cvtepi8_epi16(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_mins + k * 32))); - acc_m = _mm512_dpwssds_epi32(acc_m, va, vb); - } - const __m512 vdmin = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_dmin))); - vc[col] = _mm512_fnmadd_ps(_mm512_cvtepi32_ps(acc_m), _mm512_mul_ps(vdmin, vd1), vc[col]); - }; - - for (int i = 0; i < KB; ++i) { - Unroll{}(compute, i); - } - - //store to C - auto storec = [&](int col) { - _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); - }; - Unroll{}(storec); - } -}; - -template -struct tinygemm_kernel_vnni { - static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) { - - constexpr int COLS = BLOCK_N / 16; - const int TILE_SIZE = TILE_N * sizeof(block_q6_K); - - const block_q8_K * RESTRICT A = static_cast(_A); - const char * RESTRICT B = static_cast(_B); - - // load the 256 bytes from A to 4 avx512 vectors - __m512i va[4]; - __m512 vc[COLS]; - __m512 vd1; - - // packed_B: - const int offset_qh = (QK_K / 2) * TILE_N; - const int offset_scales = 
(QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N; - const int offset_d0 = (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N + 16 * TILE_N; - - // compensation - __m512i vcomp; - - const __m512i m32s = _mm512_set1_epi32(32); - const __m512i lowMask = _mm512_set1_epi8(0xF); - - auto loadc = [&](int col) { - vc[col] = _mm512_setzero_ps(); - }; - Unroll{}(loadc); - - auto compute = [&](int col, int i) { - if (col == 0) { - // load a - va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 0)); - va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 64)); - va[2] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 128)); - va[3] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 192)); - - const __m256i q8sums = _mm256_loadu_si256((const __m256i *)A[0 * KB + i].bsums); - vcomp = _mm512_mullo_epi32(_mm512_cvtepi16_epi32(q8sums), m32s); - vd1 = _mm512_set1_ps(A[0 * KB + i].d); - } - - // accmulate the quants - __m512i acc = _mm512_setzero_si512(); - const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE); - const char * b_qs = b_ptr; - const char * b_qh = b_ptr + offset_qh; - int mask = 0; - for (int k_group = 0; k_group < QK_K / 16; ++k_group) { - int r = k_group >> 2; - __m512i va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]); - __m512i va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]); - - __m512i vsum = _mm512_setzero_si512(); - __m512i hmask = _mm512_set1_epi8(0x3); - - __m512i bytes = _mm512_loadu_si512(b_qs); - __m512i hbits = _mm512_loadu_si512(b_qh); - __m512i vb0 = _mm512_and_si512(bytes, lowMask); - __m512i vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask); - __m512i vh0 = _mm512_slli_epi16(_mm512_and_si512(hbits, hmask), 4); - __m512i vh1 = _mm512_slli_epi16(_mm512_and_si512(hbits, _mm512_slli_epi16(hmask, 2)), 2); - - vb0 = _mm512_add_epi8(vb0, vh0); - vb1 = _mm512_add_epi8(vb1, vh1); - vsum = _mm512_dpbusd_epi32(vsum, vb0, va0); - vsum = _mm512_dpbusd_epi32(vsum, vb1, va1); - b_qs += 64; - - va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]); - va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]); - - bytes = _mm512_loadu_si512(b_qs); - vb0 = _mm512_and_si512(bytes, lowMask); - vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask); - vh0 = _mm512_and_si512(hbits, _mm512_slli_epi16(hmask, 4)); - vh1 = _mm512_srli_epi16(_mm512_and_si512(hbits, _mm512_slli_epi16(hmask, 6)), 2); - vb0 = _mm512_add_epi8(vb0, vh0); - vb1 = _mm512_add_epi8(vb1, vh1); - vsum = _mm512_dpbusd_epi32(vsum, vb0, va0); - vsum = _mm512_dpbusd_epi32(vsum, vb1, va1); - b_qs += 64; - b_qh += 64; - - // B * A - 32 * A - __m512i vmask = _mm512_set1_epi32(k_group); - vsum = _mm512_sub_epi32(vsum, _mm512_permutexvar_epi32(vmask, vcomp)); - - // vacc += scale * (q8 @ q6) - const __m512i vscale = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i *)(b_ptr + offset_scales + k_group * TILE_N))); - acc = _mm512_add_epi32(acc, _mm512_mullo_epi32(vsum, vscale)); - } - const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_d0))); - vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc), _mm512_mul_ps(vd0, vd1), vc[col]); - }; - - for (int i = 0; i < KB; ++i) { - Unroll{}(compute, i); - } - - //store to C - auto storec = [&](int col) { - _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); - }; - Unroll{}(storec); - } -}; - -template -struct tinygemm_kernel_vnni { - static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) { - - constexpr int COLS = 
BLOCK_N / 16; - const int TILE_SIZE = TILE_N * sizeof(block_iq4_xs) + TILE_N * 2; - - const block_q8_K * RESTRICT A = static_cast(_A); - const char * RESTRICT B = static_cast(_B); - - // load the 256 bytes from A to 4 avx512 vectors - __m512i va[4]; - __m512 vc[COLS]; - __m512 vd1; - - // packed_B: - const int offset_scales = (QK_K / 2) * TILE_N ; - const int offset_d0 = (QK_K / 2) * TILE_N + 8 * TILE_N; - - // compensation - __m512i vcomp; - - const __m256i m128s = _mm256_set1_epi16(128); - const __m512i lowMask = _mm512_set1_epi8(0xF); - - const __m512i values128 = _mm512_set_epi8( - 113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127, - 113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127, - 113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127, - 113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127 - ); - const __m512i off = _mm512_set1_epi8(static_cast(0x80)); - const __m512i values256 = _mm512_add_epi8(values128, off); - - auto loadc = [&](int col) { - vc[col] = _mm512_setzero_ps(); - }; - Unroll{}(loadc); - - auto compute = [&](int col, int i) { - if (col == 0) { - // load a - va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 0)); - va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 64)); - va[2] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 128)); - va[3] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 192)); - - // compensation: 128 * A - const __m256i q8sums = _mm256_loadu_si256((const __m256i *)A[0 * KB + i].bsums); - vcomp = _mm512_castsi256_si512(_mm256_madd_epi16(q8sums, m128s)); - vd1 = _mm512_set1_ps(A[0 * KB + i].d); - } - - // accmulate the quants - __m512i acc = _mm512_setzero_si512(); - const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE); - const char * b_qs = b_ptr; - int mask = 0; - for (int k_group = 0; k_group < QK_K / 32; ++k_group) { - int r = k_group >> 1; - __m512i vmask = _mm512_set1_epi32(k_group); - __m512i vsum = _mm512_setzero_si512(); - for (int k = 0; k < 8; k += 2) { - __m512i va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]); - __m512i va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]); - - __m512i bytes = _mm512_loadu_si512(b_qs); - __m512i vb0 = _mm512_shuffle_epi8(values256, _mm512_and_si512(bytes, lowMask)); - __m512i vb1 = _mm512_shuffle_epi8(values256, _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask)); - - vsum = _mm512_dpbusd_epi32(vsum, vb0, va0); - vsum = _mm512_dpbusd_epi32(vsum, vb1, va1); - b_qs += 64; - } - // (B + 128) * A - 128 * A - vsum = _mm512_sub_epi32(vsum, _mm512_permutexvar_epi32(vmask, vcomp)); - - // vacc += scale * (q8 @ q4) - const __m512i vscale = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i *)(b_ptr + offset_scales + k_group * TILE_N))); - acc = _mm512_add_epi32(acc, _mm512_mullo_epi32(vsum, vscale)); - } - const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_d0))); - vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc), _mm512_mul_ps(vd0, vd1), vc[col]); - }; - - for (int i = 0; i < KB; ++i) { - Unroll{}(compute, i); - } - - //store to C - auto storec = [&](int col) { - _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); - }; - Unroll{}(storec); - } -}; - -#define LAUNCH_TINYGEMM_KERNEL_VNNI(NB_SIZE) \ - tinygemm_kernel_vnni::apply( \ - KB, (const char *)wdata + 0 * row_size_A, \ - (const char *)src0->data + PACKED_INDEX(nb * kTilesN, 0, KB, TILE_SIZE), \ - (float *) dst->data + 0 * N + nb_start, ldc) - 
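The VNNI kernels above repeatedly rely on a compensation trick because AVX512-VNNI only offers an unsigned x signed byte dot product (`_mm512_dpbusd_epi32`): the Q8_0 path biases the signed activations by 128 and subtracts the `128 * B` term precomputed by `s8s8_compensation` at pack time, while the Q4_0 path keeps the nibbles unsigned and subtracts `8 * A` instead. The scalar sketch below (illustrative only: made-up values, and one 4-byte VNNI group instead of the real 512-bit vectors) shows why the identity holds.

#include <cassert>
#include <cstdint>

// One VNNI "group": dot product of 4 unsigned-by-signed byte pairs, accumulated
// into an int32, mirroring a single 32-bit lane of _mm512_dpbusd_epi32.
static int32_t dpbusd_group(const uint8_t a_u8[4], const int8_t b_s8[4], int32_t acc) {
    for (int k = 0; k < 4; ++k) {
        acc += int32_t(a_u8[k]) * int32_t(b_s8[k]); // u8 x s8 -> s32
    }
    return acc;
}

int main() {
    const int8_t a[4] = { -7, 42, -128, 5 };  // signed activations (as in q8_0)
    const int8_t b[4] = { 13, -6, 100, -90 }; // signed packed weights (as in q8_0)

    // reference: plain signed x signed dot product
    int32_t ref = 0;
    for (int k = 0; k < 4; ++k) ref += int32_t(a[k]) * int32_t(b[k]);

    // u8s8 route: (a + 128) * b, then subtract the compensation 128 * sum(b)
    uint8_t a_biased[4];
    int32_t comp = 0;
    for (int k = 0; k < 4; ++k) {
        a_biased[k] = uint8_t(int32_t(a[k]) + 128); // same bit pattern as _mm512_add_epi8 with 0x80
        comp += 128 * int32_t(b[k]);                // what s8s8_compensation stores per column
    }
    const int32_t got = dpbusd_group(a_biased, b, /*acc=*/0) - comp;

    assert(got == ref); // a*b == (a + 128)*b - 128*b
    return 0;
}

The AMX path below does not need this dance: `_tile_dpbssd` multiplies signed bytes by signed bytes directly, which is why `tinygemm_kernel_amx` loads `A[i].qs` unmodified.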
-template ::value, int>::type = 0> -void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const void * RESTRICT _B, TC * RESTRICT C, int ldc) { - using packed_B_t = packed_B_type; - const int TILE_SIZE = get_tile_size(); - const bool need_unpack = do_unpack::value; - - GGML_ASSERT(M <= 2 * TILE_M && N == 2 * TILE_N); - const TA * RESTRICT A = static_cast(_A); - const char * RESTRICT B = static_cast(_B); - - const int m0 = std::min(M, TILE_M); - const int m1 = std::max(M - TILE_M, 0); - const int lda = KB * sizeof(TA); - //const int ldb = KB * sizeof(TB); - - static thread_local packed_B_t Tile0[TILE_N * TILE_K]; - static thread_local packed_B_t Tile1[TILE_N * TILE_K]; - static thread_local int8_t Tile23[TILE_M * TILE_K]; - - static thread_local int32_t TileC0[TILE_M * TILE_N * 4]; - static thread_local int32_t TileC1[TILE_M * TILE_N * 4]; - - // double buffering C to interleave avx512 and amx - int32_t * C_cur = TileC0; - int32_t * C_pre = TileC1; - - auto Tile4 = [&](int32_t * base) { return base; }; - auto Tile5 = [&](int32_t * base) { return base + TILE_M * TILE_N; }; - auto Tile6 = [&](int32_t * base) { return base + 2 * TILE_M * TILE_N; }; - auto Tile7 = [&](int32_t * base) { return base + 3 * TILE_M * TILE_N; }; - - if (M == 2 * TILE_M) { - // i = 0 - const char * B_blk0 = B + PACKED_INDEX(0, 0, KB, TILE_SIZE); - const char * B_blk1 = B + PACKED_INDEX(1, 0, KB, TILE_SIZE); - if (need_unpack) { - unpack_B(Tile0, B_blk0); - _tile_loadd(TMM0, Tile0, TILE_N * VNNI_BLK); - } else { - _tile_loadd(TMM0, B_blk0, TILE_N * VNNI_BLK); - } - - _tile_zero(TMM4); - _tile_loadd(TMM2, A[0].qs, lda); - _tile_dpbssd(TMM4, TMM2, TMM0); - _tile_stored(TMM4, Tile4(C_pre), TILE_N * sizeof(int32_t)); - - _tile_zero(TMM5); - _tile_loadd(TMM3, A[TILE_M * KB + 0].qs, lda); - _tile_dpbssd(TMM5, TMM3, TMM0); - _tile_stored(TMM5, Tile5(C_pre), TILE_N * sizeof(int32_t)); - - if (need_unpack) { - unpack_B(Tile1, B_blk0); - _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK); - } else { - _tile_loadd(TMM1, B_blk1, TILE_N * VNNI_BLK); - } - - _tile_zero(TMM6); - _tile_dpbssd(TMM6, TMM2, TMM1); - _tile_stored(TMM6, Tile6(C_pre), TILE_N * sizeof(int32_t)); - - _tile_zero(TMM7); - _tile_dpbssd(TMM7, TMM3, TMM1); - _tile_stored(TMM7, Tile7(C_pre), TILE_N * sizeof(int32_t)); - - for (int i = 1; i < KB; ++i) { - // index of previous iter - const int ii = i - 1; - const char * B_blk0 = B + PACKED_INDEX(0, i, KB, TILE_SIZE); - const char * B_blk1 = B + PACKED_INDEX(1, i, KB, TILE_SIZE); - GGML_DISPATCH_BOOL(ii > 0, is_acc, [&] { - if (need_unpack) { - unpack_B(Tile0, B_blk0); - _tile_loadd(TMM0, Tile0, TILE_N * VNNI_BLK); - } else { - _tile_loadd(TMM0, B_blk0, TILE_N * VNNI_BLK); - } - _tile_zero(TMM4); - _tile_loadd(TMM2, A[i].qs, lda); - acc_C::apply(C, ldc, Tile4(C_pre), &A[ii], KB, B + PACKED_INDEX(0, ii, KB, TILE_SIZE), TILE_M); - - _tile_dpbssd(TMM4, TMM2, TMM0); - _tile_stored(TMM4, Tile4(C_cur), TILE_N * sizeof(int32_t)); - - _tile_zero(TMM5); - _tile_loadd(TMM3, A[TILE_M * KB + i].qs, lda); - acc_C::apply(C + TILE_M * ldc, ldc, Tile5(C_pre), &A[TILE_M * KB + ii], KB, B + PACKED_INDEX(0, ii, KB, TILE_SIZE), TILE_M); - - _tile_dpbssd(TMM5, TMM3, TMM0); - _tile_stored(TMM5, Tile5(C_cur), TILE_N * sizeof(int32_t)); - - if (need_unpack) { - unpack_B(Tile1, B_blk1); - _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK); - } else { - _tile_loadd(TMM1, B_blk1, TILE_N * VNNI_BLK); - } - _tile_zero(TMM6); - acc_C::apply(C + TILE_N, ldc, Tile6(C_pre), &A[ii], KB, B + PACKED_INDEX(1, ii, KB, TILE_SIZE), TILE_M); - - 
_tile_dpbssd(TMM6, TMM2, TMM1); - _tile_stored(TMM6, Tile6(C_cur), TILE_N * sizeof(int32_t)); - - _tile_zero(TMM7); - acc_C::apply(C + TILE_M * ldc + TILE_N, ldc, Tile7(C_pre), &A[TILE_M * KB + ii], KB, B + PACKED_INDEX(1, ii, KB, TILE_SIZE), TILE_M); - - _tile_dpbssd(TMM7, TMM3, TMM1); - _tile_stored(TMM7, Tile7(C_cur), TILE_N * sizeof(int32_t)); - - std::swap(C_cur, C_pre); - }); - } - // final accumulation - { - int ii = KB - 1; - acc_C::apply(C, ldc, Tile4(C_pre), &A[ii], KB, B + PACKED_INDEX(0, ii, KB, TILE_SIZE), TILE_M); - acc_C::apply(C + TILE_M * ldc, ldc, Tile5(C_pre), &A[TILE_M * KB + ii], KB, B + PACKED_INDEX(0, ii, KB, TILE_SIZE), TILE_M); - acc_C::apply(C + TILE_N, ldc, Tile6(C_pre), &A[ii], KB, B + PACKED_INDEX(1, ii, KB, TILE_SIZE), TILE_M); - acc_C::apply(C + TILE_M * ldc + TILE_N, ldc, Tile7(C_pre), &A[TILE_M * KB + ii], KB, B + PACKED_INDEX(1, ii, KB, TILE_SIZE), TILE_M); - } - } else { - for (int i = 0; i < KB; ++i) { - _tile_zero(TMM4); - _tile_zero(TMM6); - if (m1 != 0) { - _tile_zero(TMM5); - _tile_zero(TMM7); - } - - const char * B_blk0 = B + PACKED_INDEX(0, i, KB, TILE_SIZE); - const char * B_blk1 = B + PACKED_INDEX(1, i, KB, TILE_SIZE); - if (need_unpack) { - unpack_B(Tile0, B_blk0); - _tile_loadd(TMM0, Tile0, TILE_N * VNNI_BLK); - } else { - _tile_loadd(TMM0, B_blk0, TILE_N * VNNI_BLK); - } - - if (need_unpack) { - unpack_B(Tile1, B_blk1); - _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK); - } else { - _tile_loadd(TMM1, B_blk1, TILE_N * VNNI_BLK); - } - - if (m0 == TILE_M) { - _tile_loadd(TMM2, A[i].qs, lda); - } else { - unpack_A(Tile23, &A[i], KB, m0); - _tile_loadd(TMM2, Tile23, TILE_K); - } - - _tile_dpbssd(TMM4, TMM2, TMM0); - _tile_dpbssd(TMM6, TMM2, TMM1); - - _tile_stored(TMM4, Tile4(C_cur), TILE_N * sizeof(int32_t)); - _tile_stored(TMM6, Tile6(C_cur), TILE_N * sizeof(int32_t)); - - GGML_DISPATCH_BOOL(i > 0, is_acc, [&] { - acc_C::apply(C, ldc, Tile4(C_cur), &A[i], KB, B + PACKED_INDEX(0, i, KB, TILE_SIZE), m0); - acc_C::apply(C + TILE_N, ldc, Tile6(C_cur), &A[i], KB, B + PACKED_INDEX(1, i, KB, TILE_SIZE), m0); - }); - - if (m1 != 0) { - unpack_A(Tile23, &A[TILE_M * KB + i], KB, m1); - _tile_loadd(TMM3, Tile23, TILE_K); - - _tile_dpbssd(TMM5, TMM3, TMM0); - _tile_dpbssd(TMM7, TMM3, TMM1); - _tile_stored(TMM5, Tile5(C_cur), TILE_N * sizeof(int32_t)); - _tile_stored(TMM7, Tile7(C_cur), TILE_N * sizeof(int32_t)); - GGML_DISPATCH_BOOL(i > 0, is_acc, [&] { - acc_C::apply(C + TILE_M * ldc, ldc, Tile5(C_cur), &A[TILE_M * KB + i], KB, B + PACKED_INDEX(0, i, KB, TILE_SIZE), m1); - acc_C::apply(C + TILE_M * ldc + TILE_N, ldc, Tile7(C_cur), &A[TILE_M * KB + i], KB, B + PACKED_INDEX(1, i, KB, TILE_SIZE), m1); - }); - } - } - } - return; -} - -template ::value, int>::type = 0> -void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) { - static_assert(std::is_same::value); - const int TILE_SIZE = get_tile_size(); - - GGML_ASSERT(M <= 2 * TILE_M && N == 2 * TILE_N); - const TA * RESTRICT A = static_cast(_A); - const char * RESTRICT B = static_cast(_B); - - const int m0 = std::min(M, TILE_M); - const int m1 = std::max(M - TILE_M, 0); - //const int lda = KB * sizeof(TA); - - static thread_local int8_t Tile0[TILE_N * TILE_K]; - static thread_local int8_t Tile1[TILE_N * TILE_K]; - static thread_local int8_t Tile23[TILE_M * TILE_K]; - - // mat mul result for each group - static thread_local int32_t Tile4[TILE_M * TILE_N]; - static thread_local int32_t Tile5[TILE_M * TILE_N]; - static thread_local int32_t 
Tile6[TILE_M * TILE_N]; - static thread_local int32_t Tile7[TILE_M * TILE_N]; - - // sum of each QK_K block, contains 8 groups, int32 - static thread_local int32_t Sumi4[TILE_M * TILE_N]; - static thread_local int32_t Sumi5[TILE_M * TILE_N]; - static thread_local int32_t Sumi6[TILE_M * TILE_N]; - static thread_local int32_t Sumi7[TILE_M * TILE_N]; - - const int k_group_size = std::is_same::value ? 16 : 32; - for (int i = 0; i < KB; ++i) { - // step 1: accumulate the quants across 8 groups, each group with 32 - for (int k = 0; k < QK_K / k_group_size; ++k) { - GGML_DISPATCH_BOOL(k > 0, is_acc, [&] { - _tile_zero(TMM4); - _tile_zero(TMM6); - - unpack_B(Tile0, B + PACKED_INDEX(0, i, KB, TILE_SIZE), k); - _tile_loadd(TMM0, Tile0, TILE_N * VNNI_BLK); - - unpack_B(Tile1, B + PACKED_INDEX(1, i, KB, TILE_SIZE), k); - _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK); - - unpack_A(Tile23, &A[i], KB, k, m0); - _tile_loadd(TMM2, Tile23, TILE_K); - - _tile_dpbssd(TMM4, TMM2, TMM0); - _tile_dpbssd(TMM6, TMM2, TMM1); - - _tile_stored(TMM4, Tile4, TILE_N * sizeof(int32_t)); - _tile_stored(TMM6, Tile6, TILE_N * sizeof(int32_t)); - - scale_C(Tile4, Sumi4, B + PACKED_INDEX(0, i, KB, TILE_SIZE), k, m0); - scale_C(Tile6, Sumi6, B + PACKED_INDEX(1, i, KB, TILE_SIZE), k, m0); - - if (m1 != 0) { - _tile_zero(TMM5); - _tile_zero(TMM7); - - unpack_A(Tile23, &A[TILE_M * KB + i], KB, k, m1); - _tile_loadd(TMM3, Tile23, TILE_K); - - _tile_dpbssd(TMM5, TMM3, TMM0); - _tile_dpbssd(TMM7, TMM3, TMM1); - - _tile_stored(TMM5, Tile5, TILE_N * sizeof(int32_t)); - _tile_stored(TMM7, Tile7, TILE_N * sizeof(int32_t)); - - scale_C(Tile5, Sumi5, B + PACKED_INDEX(0, i, KB, TILE_SIZE), k, m1); - scale_C(Tile7, Sumi7, B + PACKED_INDEX(1, i, KB, TILE_SIZE), k, m1); - } - }); - } - - // step 2: accmulate the mins - GGML_DISPATCH_BOOL(i > 0, is_acc, [&] { - acc_C::apply(C, ldc, Sumi4, &A[i], KB, B + PACKED_INDEX(0, i, KB, TILE_SIZE), m0); - acc_C::apply(C + TILE_N, ldc, Sumi6, &A[i], KB, B + PACKED_INDEX(1, i, KB, TILE_SIZE), m0); - if (m1 != 0) { - acc_C::apply(C + TILE_M * ldc, ldc, Sumi5, &A[TILE_M * KB + i], KB, B + PACKED_INDEX(0, i, KB, TILE_SIZE), m1); - acc_C::apply(C + TILE_M * ldc + TILE_N, ldc, Sumi7, &A[TILE_M * KB + i], KB, B + PACKED_INDEX(1, i, KB, TILE_SIZE), m1); - } - }); - } - return; -} - -} // anonymous namespace - -// get the packed tensor size for quantized weights -size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor) { - const enum ggml_type TYPE = tensor->type; - - const int K = tensor->ne[0]; // ne0: in_features - const int N = tensor->ne[1]; // ne1: out_features - - auto get_tensor_size = [&] { - size_t row_size_B{0}; - GGML_DISPATCH_QTYPES(TYPE, [&] { - row_size_B = get_row_size(K); - }); - return N * row_size_B; - }; - - if (qtype_has_amx_kernels(TYPE)) { - return get_tensor_size(); - } else { - // for f16, bf16 we don't do packing - return ggml_nbytes(tensor); - } -} - -// pack weight to vnni format -void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - - size_t alloc_size = ggml_backend_amx_get_alloc_size(tensor); - GGML_ASSERT(alloc_size == size); - - const enum ggml_type TYPE = tensor->type; - - const int K = tensor->ne[0]; // ne0: in_features - const int N = tensor->ne[1]; // ne1: out_features - -#if defined(_OPENMP) - // the buffer ctx is not initialized when .set_tensor is called - int n_threads = omp_get_num_threads(); -#else - int n_threads = 1; -#endif - - GGML_DISPATCH_QTYPES(TYPE, [&] { - convert_B_packed_format((void 
*)((char *)tensor->data + offset), (const type *)data, N, K, n_threads); - }); -} - -// NB: mixed dtype gemm with Advanced Matrix Extensions (Intel AMX) -// -// src0: weight in shape of {N, K}, quantized -// src1: input in shape of {M, K}, float32 -// dst: output in shape of {M, N}, float32 -// -// the function performs: dst = src1 @ src0.T -// -void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst) { - struct ggml_tensor * src0 = dst->src[0]; - struct ggml_tensor * src1 = dst->src[1]; - - const enum ggml_type TYPE = src0->type; - - const int n_threads = ctx->n_threads; - - // f16 only has avx512 kernels for now, - // amx kernels will be added once 6th gen xeon is released. - const bool is_floating_type = TYPE == GGML_TYPE_F16; - - const int M = dst->ne[1]; - const int N = dst->ne[0]; - const int K = src0->ne[0]; - const int ldc = dst->nb[1] / dst->nb[0]; - - if (is_floating_type) { - constexpr int BLOCK_M = 4; - constexpr int BLOCK_N = 6; - const int MB = div_up(M, BLOCK_M); - const int NB = div_up(N, BLOCK_N); - - parallel_for(n_threads, MB * NB, [&](int begin, int end) { - GGML_DISPATCH_FLOATING_TYPES(TYPE, [&] { - for (int i = begin; i < end; ++i) { - int mb = i / NB; - int nb = i % NB; - - int mb_start = mb * BLOCK_M; - int mb_size = std::min(BLOCK_M, M - mb_start); - int nb_start = nb * BLOCK_N; - int nb_size = std::min(BLOCK_N, N - nb_start); - - switch (mb_size << 4 | nb_size) { - case 0x12: LAUNCH_TINYGEMM_KERNEL_AVX(1, 2); break; - case 0x14: LAUNCH_TINYGEMM_KERNEL_AVX(1, 4); break; - case 0x16: LAUNCH_TINYGEMM_KERNEL_AVX(1, 6); break; - case 0x22: LAUNCH_TINYGEMM_KERNEL_AVX(2, 2); break; - case 0x24: LAUNCH_TINYGEMM_KERNEL_AVX(2, 4); break; - case 0x26: LAUNCH_TINYGEMM_KERNEL_AVX(2, 6); break; - case 0x32: LAUNCH_TINYGEMM_KERNEL_AVX(3, 2); break; - case 0x34: LAUNCH_TINYGEMM_KERNEL_AVX(3, 4); break; - case 0x36: LAUNCH_TINYGEMM_KERNEL_AVX(3, 6); break; - case 0x42: LAUNCH_TINYGEMM_KERNEL_AVX(4, 2); break; - case 0x44: LAUNCH_TINYGEMM_KERNEL_AVX(4, 4); break; - case 0x46: LAUNCH_TINYGEMM_KERNEL_AVX(4, 6); break; - default: fprintf(stderr, "Unexpected block size!\n"); - } - } - }); - }); - return; - } - - // pointer to work space, used convert A from float to quantized type - void * wdata = nullptr; - - //TODO: performance improvement: merge quant A - GGML_DISPATCH_QTYPES(TYPE, [&] { - const size_t row_size_A = K / blck_size * sizeof(vec_dot_type); - const size_t desired_wsize = M * row_size_A; - if (ctx->work_size < desired_wsize) { - ctx->work_data.reset(new char[desired_wsize]); - ctx->work_size = desired_wsize; - } - wdata = ctx->work_data.get(); - - // Q4_0, Q4_1, Q8_0 handles 1 TILE_K per blck_size - // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size - GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size); - - const float * A_data = static_cast(src1->data); - for (int m = 0; m < M; ++m) { - from_float(A_data + m * K, (char *)wdata + m * row_size_A, K); - } - }); - - if (M == 1) { - // MB = 1 and handle 8 tiles in each block - constexpr int kTilesN = 4; - constexpr int BLOCK_N = TILE_N * kTilesN; - const int NB = div_up(N, BLOCK_N); - - parallel_for(n_threads, NB, [&](int begin, int end) { - GGML_DISPATCH_QTYPES(TYPE, [&] { - const int KB = K / blck_size; - const int TILE_SIZE = get_tile_size(); - const int row_size_A = KB * sizeof(vec_dot_type); - for (int i = begin; i < end; ++i) { - int nb = i; - int nb_start = nb * BLOCK_N; - int nb_size = std::min(BLOCK_N, N - nb_start); // 32, 64, 96 - - switch (nb_size) { - //case 160: 
LAUNCH_TINYGEMM_KERNEL_VNNI(160); break; - case 128: LAUNCH_TINYGEMM_KERNEL_VNNI(128); break; - case 96: LAUNCH_TINYGEMM_KERNEL_VNNI(96); break; - case 64: LAUNCH_TINYGEMM_KERNEL_VNNI(64); break; - case 32: LAUNCH_TINYGEMM_KERNEL_VNNI(32); break; - default: fprintf(stderr, "Unexpected n block size!\n"); - } - } - }); - }); - return; - } - - // handle 4 tiles at a tile - constexpr int BLOCK_M = TILE_M * 2; - constexpr int BLOCK_N = TILE_N * 2; - const int MB = div_up(M, BLOCK_M); - const int NB = div_up(N, BLOCK_N); - - parallel_for(n_threads, MB * NB, [&](int begin, int end) { - // init tile config for each thread - ggml_tile_config_init(); - - GGML_DISPATCH_QTYPES(TYPE, [&] { - const int KB = K / blck_size; - const int TILE_SIZE = get_tile_size(); - const int row_size_A = KB * sizeof(vec_dot_type); - - for (int i = begin; i < end; ++i) { - int mb = i / NB; - int nb = i % NB; - - int mb_start = mb * BLOCK_M; - int mb_size = std::min(BLOCK_M, M - mb_start); - int nb_start = nb * BLOCK_N; - int nb_size = BLOCK_N; - - tinygemm_kernel_amx( - mb_size, nb_size, KB, - (const char *)wdata + mb_start * row_size_A, - (const char *)src0->data + PACKED_INDEX(nb * 2, 0, KB, TILE_SIZE), - (float *) dst->data + mb_start * N + nb_start, ldc); - } - }); - }); -} - -#else // if defined(__AMX_INT8__) - -void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst) { - fprintf(stderr, "GGML is not compiled with AMX support!\n"); - - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -#endif // if defined(__AMX_INT8__) diff --git a/ggml/src/ggml-amx/mmq.h b/ggml/src/ggml-amx/mmq.h deleted file mode 100644 index cf092062..00000000 --- a/ggml/src/ggml-amx/mmq.h +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once -#include "common.h" -#include - -#ifdef __cplusplus -extern "C" { -#endif - -size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor); - -void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - -void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst); - -#ifdef __cplusplus -} -#endif diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h deleted file mode 100644 index c36c12d6..00000000 --- a/ggml/src/ggml-backend-impl.h +++ /dev/null @@ -1,255 +0,0 @@ -#pragma once - -// ggml-backend internal header - -#include "ggml-backend.h" - -#ifdef __cplusplus -extern "C" { -#endif - - #define GGML_BACKEND_API_VERSION 1 - - // - // Backend buffer type - // - - struct ggml_backend_buffer_type_i { - const char * (*get_name) (ggml_backend_buffer_type_t buft); - // allocate a buffer of this type - ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); - // tensor alignment - size_t (*get_alignment) (ggml_backend_buffer_type_t buft); - // (optional) max buffer size that can be allocated (defaults to SIZE_MAX) - size_t (*get_max_size) (ggml_backend_buffer_type_t buft); - // (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes) - size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); - // (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false) - bool (*is_host) (ggml_backend_buffer_type_t buft); - }; - - struct ggml_backend_buffer_type { - struct ggml_backend_buffer_type_i iface; - ggml_backend_dev_t device; - void * context; - }; - - // - // Backend buffer - // - - struct ggml_backend_buffer_i { - // (optional) free the 
buffer - void (*free_buffer) (ggml_backend_buffer_t buffer); - // base address of the buffer - void * (*get_base) (ggml_backend_buffer_t buffer); - // (optional) initialize a tensor in the buffer (eg. add tensor extras) - enum ggml_status (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - // tensor data access - void (*memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size); - void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - // (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported) - bool (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); - // clear the entire buffer - void (*clear) (ggml_backend_buffer_t buffer, uint8_t value); - // (optional) reset any internal state due to tensor initialization, such as tensor extras - void (*reset) (ggml_backend_buffer_t buffer); - }; - - struct ggml_backend_buffer { - struct ggml_backend_buffer_i iface; - ggml_backend_buffer_type_t buft; - void * context; - size_t size; - enum ggml_backend_buffer_usage usage; - }; - - GGML_API ggml_backend_buffer_t ggml_backend_buffer_init( - ggml_backend_buffer_type_t buft, - struct ggml_backend_buffer_i iface, - void * context, - size_t size); - - // do not use directly, use ggml_backend_tensor_copy instead - GGML_API bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst); - - // multi-buffer - // buffer that contains a collection of buffers - GGML_API ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers); - GGML_API bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer); - GGML_API void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); - - // - // Backend (stream) - // - - struct ggml_backend_i { - const char * (*get_name)(ggml_backend_t backend); - - void (*free)(ggml_backend_t backend); - - // (optional) asynchronous tensor data access - void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst); - - // (optional) complete all pending operations (required if the backend supports async operations) - void (*synchronize)(ggml_backend_t backend); - - // (optional) graph plans (not used currently) - // compute graph with a plan - ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); - void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); - // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology - void (*graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph); - // compute the graph with the plan - enum ggml_status (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); 
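        // The graph_plan_* callbacks above form an optional pre-compiled execution path and
        // may be left unimplemented (they are currently unused). graph_compute below is the
        // entry point the rest of ggml-backend relies on; since it may run asynchronously,
        // callers such as ggml_backend_graph_compute() pair it with synchronize().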
- - // compute graph (always async if supported by the backend) - enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); - - // (optional) event synchronization - // record an event on this stream - void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event); - // wait for an event on on a different stream - void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event); - }; - - struct ggml_backend { - ggml_guid_t guid; - struct ggml_backend_i iface; - ggml_backend_dev_t device; - void * context; - }; - - struct ggml_backend_event { - struct ggml_backend_device * device; - void * context; - }; - - // - // Backend device - // - - // Note: if additional properties are needed, we should add a struct with all of them - // the current functions to obtain the properties can remain, since they are more convenient for often used properties - struct ggml_backend_device_i { - // device name: short identifier for this device, such as "CPU" or "CUDA0" - const char * (*get_name)(ggml_backend_dev_t dev); - - // device description: short informative description of the device, could be the model name - const char * (*get_description)(ggml_backend_dev_t dev); - - // device memory in bytes - void (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total); - - // device type - enum ggml_backend_dev_type (*get_type)(ggml_backend_dev_t dev); - - // device properties - void (*get_props)(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props); - - // backend (stream) initialization - ggml_backend_t (*init_backend)(ggml_backend_dev_t dev, const char * params); - - // preferred buffer type - ggml_backend_buffer_type_t (*get_buffer_type)(ggml_backend_dev_t dev); - - // (optional) host buffer type (in system memory, typically this is a pinned memory buffer for faster transfers between host and device) - ggml_backend_buffer_type_t (*get_host_buffer_type)(ggml_backend_dev_t dev); - - // (optional) buffer from pointer: create a buffer from a host pointer (useful for memory mapped models and importing data from other libraries) - ggml_backend_buffer_t (*buffer_from_host_ptr)(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size); - - // check if the backend can compute an operation - bool (*supports_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op); - - // check if the backend can use tensors allocated in a buffer type - bool (*supports_buft)(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft); - - // (optional) check if the backend wants to run an operation, even if the weights are allocated in an incompatible buffer - // these should be expensive operations that may benefit from running on this backend instead of the CPU backend - bool (*offload_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op); - - // (optional) event synchronization - ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev); - void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event); - void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event); - }; - - struct ggml_backend_device { - struct ggml_backend_device_i iface; - ggml_backend_reg_t reg; - void * context; - }; - - // - // Backend (reg) - // - - struct ggml_backend_reg_i { - const char * (*get_name)(ggml_backend_reg_t reg); - - // enumerate available devices - size_t (*get_device_count)(ggml_backend_reg_t reg); - ggml_backend_dev_t (*get_device)(ggml_backend_reg_t reg, size_t index); - - // (optional) get a pointer to a function in the backend 
- // backends can add custom functions that are not part of the standard ggml-backend interface - void * (*get_proc_address)(ggml_backend_reg_t reg, const char * name); - }; - - struct ggml_backend_reg { - int api_version; // initialize to GGML_BACKEND_API_VERSION - struct ggml_backend_reg_i iface; - void * context; - }; - - // Internal backend registry API - GGML_API void ggml_backend_register(ggml_backend_reg_t reg); - - // Add backend dynamic loading support to the backend - - // Initialize the backend - typedef ggml_backend_reg_t (*ggml_backend_init_t)(void); - // Optional: obtain a score for the backend based on the system configuration - // Higher scores are preferred, 0 means the backend is not supported in the current system - typedef int (*ggml_backend_score_t)(void); - -#ifdef GGML_BACKEND_DL -# ifdef __cplusplus -# define GGML_BACKEND_DL_IMPL(reg_fn) \ - extern "C" { \ - GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \ - } \ - ggml_backend_reg_t ggml_backend_init(void) { \ - return reg_fn(); \ - } -# define GGML_BACKEND_DL_SCORE_IMPL(score_fn) \ - extern "C" { \ - GGML_BACKEND_API int ggml_backend_score(void); \ - } \ - int ggml_backend_score(void) { \ - return score_fn(); \ - } -# else -# define GGML_BACKEND_DL_IMPL(reg_fn) \ - GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \ - ggml_backend_reg_t ggml_backend_init(void) { \ - return reg_fn(); \ - } -# define GGML_BACKEND_DL_SCORE_IMPL(score_fn) \ - GGML_BACKEND_API int ggml_backend_score(void); \ - int ggml_backend_score(void) { \ - return score_fn(); \ - } -# endif -#else -# define GGML_BACKEND_DL_IMPL(reg_fn) -# define GGML_BACKEND_DL_SCORE_IMPL(score_fn) -#endif - -#ifdef __cplusplus -} -#endif diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp deleted file mode 100644 index 405d8e31..00000000 --- a/ggml/src/ggml-backend-reg.cpp +++ /dev/null @@ -1,586 +0,0 @@ -#include "ggml-backend-impl.h" -#include "ggml-backend.h" -#include "ggml-impl.h" -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef _WIN32 -# define WIN32_LEAN_AND_MEAN -# ifndef NOMINMAX -# define NOMINMAX -# endif -# include -#elif defined(__APPLE__) -# include -# include -#else -# include -# include -#endif - -// Backend registry -#ifdef GGML_USE_CPU -#include "ggml-cpu.h" -#endif - -#ifdef GGML_USE_CUDA -#include "ggml-cuda.h" -#endif - -#ifdef GGML_USE_METAL -#include "ggml-metal.h" -#endif - -#ifdef GGML_USE_SYCL -#include "ggml-sycl.h" -#endif - -#ifdef GGML_USE_VULKAN -#include "ggml-vulkan.h" -#endif - -#ifdef GGML_USE_OPENCL -#include "ggml-opencl.h" -#endif - -#ifdef GGML_USE_BLAS -#include "ggml-blas.h" -#endif - -#ifdef GGML_USE_RPC -#include "ggml-rpc.h" -#endif - -#ifdef GGML_USE_CANN -#include "ggml-cann.h" -#endif - -#ifdef GGML_USE_KOMPUTE -#include "ggml-kompute.h" -#endif - -// disable C++17 deprecation warning for std::codecvt_utf8 -#if defined(__clang__) -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wdeprecated-declarations" -#endif - -namespace fs = std::filesystem; - -static std::string path_str(const fs::path & path) { - std::string u8path; - try { -#if defined(__cpp_lib_char8_t) - // C++20 and later: u8string() returns std::u8string - std::u8string u8str = path.u8string(); - u8path = std::string(reinterpret_cast(u8str.c_str())); -#else - // C++17: u8string() returns std::string - u8path = path.u8string(); -#endif - } catch (...) 
{ - } - return u8path; -} - -#if defined(__clang__) -# pragma clang diagnostic pop -#endif - -#ifdef _WIN32 - -using dl_handle = std::remove_pointer_t; - -struct dl_handle_deleter { - void operator()(HMODULE handle) { - FreeLibrary(handle); - } -}; - -static dl_handle * dl_load_library(const fs::path & path) { - // suppress error dialogs for missing DLLs - DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); - SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); - - HMODULE handle = LoadLibraryW(path.wstring().c_str()); - - SetErrorMode(old_mode); - - return handle; -} - -static void * dl_get_sym(dl_handle * handle, const char * name) { - DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); - SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); - - void * p = (void *) GetProcAddress(handle, name); - - SetErrorMode(old_mode); - - return p; -} - -#else - -using dl_handle = void; - -struct dl_handle_deleter { - void operator()(void * handle) { - dlclose(handle); - } -}; - -static void * dl_load_library(const fs::path & path) { - dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL); - - return handle; -} - -static void * dl_get_sym(dl_handle * handle, const char * name) { - return dlsym(handle, name); -} - -#endif - -using dl_handle_ptr = std::unique_ptr; - -struct ggml_backend_reg_entry { - ggml_backend_reg_t reg; - dl_handle_ptr handle; -}; - -struct ggml_backend_registry { - std::vector backends; - std::vector devices; - - ggml_backend_registry() { -#ifdef GGML_USE_CUDA - register_backend(ggml_backend_cuda_reg()); -#endif -#ifdef GGML_USE_METAL - register_backend(ggml_backend_metal_reg()); -#endif -#ifdef GGML_USE_SYCL - register_backend(ggml_backend_sycl_reg()); -#endif -#ifdef GGML_USE_VULKAN - register_backend(ggml_backend_vk_reg()); -#endif -#ifdef GGML_USE_OPENCL - register_backend(ggml_backend_opencl_reg()); -#endif -#ifdef GGML_USE_CANN - register_backend(ggml_backend_cann_reg()); -#endif -#ifdef GGML_USE_BLAS - register_backend(ggml_backend_blas_reg()); -#endif -#ifdef GGML_USE_RPC - register_backend(ggml_backend_rpc_reg()); -#endif -#ifdef GGML_USE_KOMPUTE - register_backend(ggml_backend_kompute_reg()); -#endif -#ifdef GGML_USE_CPU - register_backend(ggml_backend_cpu_reg()); -#endif - } - - ~ggml_backend_registry() { - // FIXME: backends cannot be safely unloaded without a function to destroy all the backend resources, - // since backend threads may still be running and accessing resources from the dynamic library - for (auto & entry : backends) { - if (entry.handle) { - entry.handle.release(); // NOLINT - } - } - } - - void register_backend(ggml_backend_reg_t reg, dl_handle_ptr handle = nullptr) { - if (!reg) { - return; - } - -#ifndef NDEBUG - GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", - __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); -#endif - backends.push_back({ reg, std::move(handle) }); - for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) { - register_device(ggml_backend_reg_dev_get(reg, i)); - } - } - - void register_device(ggml_backend_dev_t device) { -#ifndef NDEBUG - GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device)); -#endif - devices.push_back(device); - } - - ggml_backend_reg_t load_backend(const fs::path & path, bool silent) { - dl_handle_ptr handle { dl_load_library(path) }; - if (!handle) { - if (!silent) { - GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(path).c_str()); - } - return nullptr; - } - - auto 
score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); - if (score_fn && score_fn() == 0) { - if (!silent) { - GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path_str(path).c_str()); - } - return nullptr; - } - - auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init"); - if (!backend_init_fn) { - if (!silent) { - GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path_str(path).c_str()); - } - return nullptr; - } - - ggml_backend_reg_t reg = backend_init_fn(); - if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) { - if (!silent) { - if (!reg) { - GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", - __func__, path_str(path).c_str()); - } else { - GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n", - __func__, path_str(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION); - } - } - return nullptr; - } - - GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_str(path).c_str()); - - register_backend(reg, std::move(handle)); - - return reg; - } - - void unload_backend(ggml_backend_reg_t reg, bool silent) { - auto it = std::find_if(backends.begin(), backends.end(), - [reg](const ggml_backend_reg_entry & entry) { return entry.reg == reg; }); - - if (it == backends.end()) { - if (!silent) { - GGML_LOG_ERROR("%s: backend not found\n", __func__); - } - return; - } - - if (!silent) { - GGML_LOG_DEBUG("%s: unloading %s backend\n", __func__, ggml_backend_reg_name(reg)); - } - - // remove devices - devices.erase( - std::remove_if(devices.begin(), devices.end(), - [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }), - devices.end()); - - // remove backend - backends.erase(it); - } -}; - -static ggml_backend_registry & get_reg() { - static ggml_backend_registry reg; - return reg; -} - -// Internal API -void ggml_backend_register(ggml_backend_reg_t reg) { - get_reg().register_backend(reg); -} - -void ggml_backend_device_register(ggml_backend_dev_t device) { - get_reg().register_device(device); -} - -// Backend (reg) enumeration -static bool striequals(const char * a, const char * b) { - for (; *a && *b; a++, b++) { - if (std::tolower(*a) != std::tolower(*b)) { - return false; - } - } - return *a == *b; -} - -size_t ggml_backend_reg_count() { - return get_reg().backends.size(); -} - -ggml_backend_reg_t ggml_backend_reg_get(size_t index) { - GGML_ASSERT(index < ggml_backend_reg_count()); - return get_reg().backends[index].reg; -} - -ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) { - for (size_t i = 0; i < ggml_backend_reg_count(); i++) { - ggml_backend_reg_t reg = ggml_backend_reg_get(i); - if (striequals(ggml_backend_reg_name(reg), name)) { - return reg; - } - } - return nullptr; -} - -// Device enumeration -size_t ggml_backend_dev_count() { - return get_reg().devices.size(); -} - -ggml_backend_dev_t ggml_backend_dev_get(size_t index) { - GGML_ASSERT(index < ggml_backend_dev_count()); - return get_reg().devices[index]; -} - -ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) { - for (size_t i = 0; i < ggml_backend_dev_count(); i++) { - ggml_backend_dev_t dev = ggml_backend_dev_get(i); - if (striequals(ggml_backend_dev_name(dev), name)) { - return dev; - } - } - return nullptr; -} - -ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) { - for (size_t i = 0; i 
< ggml_backend_dev_count(); i++) { - ggml_backend_dev_t dev = ggml_backend_dev_get(i); - if (ggml_backend_dev_type(dev) == type) { - return dev; - } - } - return nullptr; -} - -// Convenience functions -ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) { - ggml_backend_dev_t dev = ggml_backend_dev_by_name(name); - if (!dev) { - return nullptr; - } - return ggml_backend_dev_init(dev, params); -} - -ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) { - ggml_backend_dev_t dev = ggml_backend_dev_by_type(type); - if (!dev) { - return nullptr; - } - return ggml_backend_dev_init(dev, params); -} - -ggml_backend_t ggml_backend_init_best(void) { - ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU); - if (!dev) { - dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); - } - if (!dev) { - return nullptr; - } - return ggml_backend_dev_init(dev, nullptr); -} - -// Dynamic loading -ggml_backend_reg_t ggml_backend_load(const char * path) { - return get_reg().load_backend(path, false); -} - -void ggml_backend_unload(ggml_backend_reg_t reg) { - get_reg().unload_backend(reg, true); -} - -static fs::path get_executable_path() { -#if defined(__APPLE__) - // get executable path - std::vector path; - uint32_t size; - while (true) { - size = path.size(); - if (_NSGetExecutablePath(path.data(), &size) == 0) { - break; - } - path.resize(size); - } - std::string base_path(path.data(), size); - // remove executable name - auto last_slash = base_path.find_last_of('/'); - if (last_slash != std::string::npos) { - base_path = base_path.substr(0, last_slash); - } - return base_path + "/"; -#elif defined(__linux__) || defined(__FreeBSD__) - std::string base_path = "."; - std::vector path(1024); - while (true) { - // get executable path -# if defined(__linux__) - ssize_t len = readlink("/proc/self/exe", path.data(), path.size()); -# elif defined(__FreeBSD__) - ssize_t len = readlink("/proc/curproc/file", path.data(), path.size()); -# endif - if (len == -1) { - break; - } - if (len < (ssize_t) path.size()) { - base_path = std::string(path.data(), len); - // remove executable name - auto last_slash = base_path.find_last_of('/'); - if (last_slash != std::string::npos) { - base_path = base_path.substr(0, last_slash); - } - break; - } - path.resize(path.size() * 2); - } - - return base_path + "/"; -#elif defined(_WIN32) - std::vector path(MAX_PATH); - DWORD len = GetModuleFileNameW(NULL, path.data(), path.size()); - if (len == 0) { - return {}; - } - std::wstring base_path(path.data(), len); - // remove executable name - auto last_slash = base_path.find_last_of('\\'); - if (last_slash != std::string::npos) { - base_path = base_path.substr(0, last_slash); - } - return base_path + L"\\"; -#else - return {}; -#endif -} - -static fs::path backend_filename_prefix() { -#ifdef _WIN32 - return fs::u8path("ggml-"); -#else - return fs::u8path("libggml-"); -#endif -} - -static fs::path backend_filename_extension() { -#ifdef _WIN32 - return fs::u8path(".dll"); -#else - return fs::u8path(".so"); -#endif -} - -static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) { - // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths - const fs::path name_path = fs::u8path(name); - const fs::path file_prefix = backend_filename_prefix().native() + name_path.native() + fs::u8path("-").native(); - const fs::path file_extension = backend_filename_extension(); - 
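    // Selection strategy implemented below: candidate files must start with file_prefix
    // ("ggml-<name>-" on Windows, "libggml-<name>-" elsewhere) and end with the platform
    // extension (".dll" / ".so"). Unless the caller supplies a search path, the executable's
    // directory and the current directory are scanned. Each match is loaded, its
    // ggml_backend_score() is queried (0 means unsupported on this system), and the
    // highest-scoring variant is registered. If no scored variant is found, the plain
    // base library (e.g. libggml-<name>.so) is tried as a fallback.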
- std::vector search_paths; - if (user_search_path == nullptr) { - // default search paths: executable directory, current directory - search_paths.push_back(get_executable_path()); - search_paths.push_back(fs::current_path()); - } else { - search_paths.push_back(fs::u8path(user_search_path)); - } - - int best_score = 0; - fs::path best_path; - - for (const auto & search_path : search_paths) { - if (!fs::exists(search_path)) { - GGML_LOG_DEBUG("%s: search path %s does not exist\n", __func__, path_str(search_path).c_str()); - continue; - } - fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied); - for (const auto & entry : dir_it) { - if (entry.is_regular_file()) { - auto filename = entry.path().filename(); - auto ext = entry.path().extension(); - if (filename.native().find(file_prefix) == 0 && ext == file_extension) { - dl_handle_ptr handle { dl_load_library(entry) }; - if (!handle && !silent) { - GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(entry.path()).c_str()); - } - if (handle) { - auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); - if (score_fn) { - int s = score_fn(); -#ifndef NDEBUG - GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, path_str(entry.path()).c_str(), s); -#endif - if (s > best_score) { - best_score = s; - best_path = entry.path(); - } - } else { - if (!silent) { - GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, path_str(entry.path()).c_str()); - } - } - } - } - } - } - } - - if (best_score == 0) { - // try to load the base backend - for (const auto & search_path : search_paths) { - fs::path filename = backend_filename_prefix().native() + name_path.native() + backend_filename_extension().native(); - fs::path path = search_path / filename; - if (fs::exists(path)) { - return get_reg().load_backend(path, silent); - } - } - return nullptr; - } - - return get_reg().load_backend(best_path, silent); -} - -void ggml_backend_load_all() { - ggml_backend_load_all_from_path(nullptr); -} - -void ggml_backend_load_all_from_path(const char * dir_path) { -#ifdef NDEBUG - bool silent = true; -#else - bool silent = false; -#endif - - ggml_backend_load_best("blas", silent, dir_path); - ggml_backend_load_best("cann", silent, dir_path); - ggml_backend_load_best("cuda", silent, dir_path); - ggml_backend_load_best("hip", silent, dir_path); - ggml_backend_load_best("kompute", silent, dir_path); - ggml_backend_load_best("metal", silent, dir_path); - ggml_backend_load_best("rpc", silent, dir_path); - ggml_backend_load_best("sycl", silent, dir_path); - ggml_backend_load_best("vulkan", silent, dir_path); - ggml_backend_load_best("opencl", silent, dir_path); - ggml_backend_load_best("musa", silent, dir_path); - ggml_backend_load_best("cpu", silent, dir_path); - // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend - const char * backend_path = std::getenv("GGML_BACKEND_PATH"); - if (backend_path) { - ggml_backend_load(backend_path); - } -} diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp deleted file mode 100644 index 273075f4..00000000 --- a/ggml/src/ggml-backend.cpp +++ /dev/null @@ -1,2004 +0,0 @@ -// Note: porting this file to C++ is a work in progress - -#ifdef _WIN32 -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX -# define NOMINMAX -#endif -#include -#endif - -#include "ggml-backend.h" -#include "ggml-backend-impl.h" -#include "ggml-alloc.h" -#include "ggml-impl.h" - -#include -#include -#include -#include -#include -#include -#include 
-#include -#include - -#ifdef __APPLE__ -#include -#include -#endif - - -// backend buffer type - -const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) { - return buft->iface.get_name(buft); -} - -ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - if (size == 0) { - // return a dummy buffer for zero-sized allocations - return ggml_backend_buffer_init(buft, {}, NULL, 0); - } - - return buft->iface.alloc_buffer(buft, size); -} - -size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) { - return buft->iface.get_alignment(buft); -} - -size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) { - // get_max_size is optional, defaults to SIZE_MAX - if (buft->iface.get_max_size) { - return buft->iface.get_max_size(buft); - } - return SIZE_MAX; -} - -size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) { - // get_alloc_size is optional, defaults to ggml_nbytes - if (buft->iface.get_alloc_size) { - size_t size = buft->iface.get_alloc_size(buft, tensor); - assert(size >= ggml_nbytes(tensor)); - return size; - } - return ggml_nbytes(tensor); -} - -bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) { - if (buft->iface.is_host) { - return buft->iface.is_host(buft); - } - return false; -} - -ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft) { - return buft->device; -} - -// backend buffer - -ggml_backend_buffer_t ggml_backend_buffer_init( - ggml_backend_buffer_type_t buft, - struct ggml_backend_buffer_i iface, - void * context, - size_t size) { - ggml_backend_buffer_t buffer = new ggml_backend_buffer { - /* .interface = */ iface, - /* .buft = */ buft, - /* .context = */ context, - /* .size = */ size, - /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY - }; - - return buffer; -} - -const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) { - return ggml_backend_buft_name(ggml_backend_buffer_get_type(buffer)); -} - -void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { - if (buffer == NULL) { - return; - } - - if (buffer->iface.free_buffer != NULL) { - buffer->iface.free_buffer(buffer); - } - delete buffer; -} - -size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) { - return buffer->size; -} - -void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { - // get_base is optional if the buffer is zero-sized - if (buffer->size == 0) { - return NULL; - } - - void * base = buffer->iface.get_base(buffer); - - GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL"); - - return base; -} - -enum ggml_status ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { - // init_tensor is optional - if (buffer->iface.init_tensor) { - return buffer->iface.init_tensor(buffer, tensor); - } - return GGML_STATUS_SUCCESS; -} - -void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - // clear is optional if the buffer is zero-sized - if (buffer->size == 0) { - return; - } - - buffer->iface.clear(buffer, value); -} - -size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) { - return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer)); -} - -size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) { - return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer)); -} - -size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { - 
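    // Delegates to the tensor's buffer type: ggml_backend_buft_get_alloc_size() above uses
    // the type's get_alloc_size callback when present (asserting the result is at least
    // ggml_nbytes(tensor)) and otherwise falls back to ggml_nbytes().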
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor); -} - -bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) { - return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer)); -} - -void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) { - buffer->usage = usage; - - // FIXME: add a generic callback to the buffer interface - if (ggml_backend_buffer_is_multi_buffer(buffer)) { - ggml_backend_multi_buffer_set_usage(buffer, usage); - } -} - -enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) { - return buffer->usage; -} - -ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) { - return buffer->buft; -} - -void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) { - if (buffer->iface.reset) { - buffer->iface.reset(buffer); - } -} - -bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) { - ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer; - if (dst_buf->iface.cpy_tensor) { - return dst_buf->iface.cpy_tensor(dst_buf, src, dst); - } - return false; -} - -// backend - -ggml_guid_t ggml_backend_guid(ggml_backend_t backend) { - if (backend == NULL) { - return NULL; - } - return backend->guid; -} - -const char * ggml_backend_name(ggml_backend_t backend) { - if (backend == NULL) { - return "NULL"; - } - return backend->iface.get_name(backend); -} - -void ggml_backend_free(ggml_backend_t backend) { - if (backend == NULL) { - return; - } - - backend->iface.free(backend); -} - -ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) { - return ggml_backend_dev_buffer_type(backend->device); -} - -ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) { - return ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), size); -} - -size_t ggml_backend_get_alignment(ggml_backend_t backend) { - return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend)); -} - -size_t ggml_backend_get_max_size(ggml_backend_t backend) { - return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend)); -} - -void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); - - if (backend->iface.set_tensor_async == NULL) { - ggml_backend_tensor_set(tensor, data, offset, size); - } else { - backend->iface.set_tensor_async(backend, tensor, data, offset, size); - } -} - -void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); - - if (backend->iface.get_tensor_async == NULL) { - ggml_backend_tensor_get(tensor, data, offset, size); - } else { - backend->iface.get_tensor_async(backend, tensor, data, offset, size); - } -} - -void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - GGML_ASSERT(tensor); - ggml_backend_buffer_t buf = tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; - - if (size == 0) { - return; - } - - GGML_ASSERT(buf != NULL && "tensor buffer not set"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); - - buf->iface.set_tensor(buf, tensor, data, offset, size); -} - -void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { - GGML_ASSERT(tensor); - ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; - - if (size == 0) { - return; - } - - GGML_ASSERT(buf != NULL && "tensor buffer not set"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); - - buf->iface.get_tensor(buf, tensor, data, offset, size); -} - -void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { - ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; - - if (size == 0) { - return; - } - - GGML_ASSERT(buf != NULL && "tensor buffer not set"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); - GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer"); - - buf->iface.memset_tensor(buf, tensor, value, offset, size); -} - -void ggml_backend_synchronize(ggml_backend_t backend) { - if (backend->iface.synchronize == NULL) { - return; - } - - backend->iface.synchronize(backend); -} - -ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - GGML_ASSERT(backend->iface.graph_plan_create != NULL); - - return backend->iface.graph_plan_create(backend, cgraph); -} - -void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { - GGML_ASSERT(backend->iface.graph_plan_free != NULL); - - backend->iface.graph_plan_free(backend, plan); -} - -enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { - GGML_ASSERT(backend->iface.graph_plan_compute != NULL); - - return backend->iface.graph_plan_compute(backend, plan); -} - -enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph); - ggml_backend_synchronize(backend); - return err; -} - -enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - return backend->iface.graph_compute(backend, cgraph); -} - -bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { - return ggml_backend_dev_supports_op(backend->device, op); -} - -bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { - return ggml_backend_dev_supports_buft(backend->device, buft); -} - -bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) { - return ggml_backend_dev_offload_op(backend->device, op); -} - -ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) { - return backend->device; -} - -// backend copy - -static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { - if (a->type != b->type) { - return false; - } - for (int i = 0; i < GGML_MAX_DIMS; i++) { - if (a->ne[i] != b->ne[i]) { - return false; - } - if (a->nb[i] != 
b->nb[i]) { - return false; - } - } - return true; -} - -void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts"); - - if (src == dst) { - return; - } - - if (ggml_backend_buffer_is_host(src->buffer)) { - ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src)); - } else if (ggml_backend_buffer_is_host(dst->buffer)) { - ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); - } else if (!ggml_backend_buffer_copy_tensor(src, dst)) { -#ifndef NDEBUG - GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer)); -#endif - size_t nbytes = ggml_nbytes(src); - void * data = malloc(nbytes); - ggml_backend_tensor_get(src, data, 0, nbytes); - ggml_backend_tensor_set(dst, data, 0, nbytes); - free(data); - } -} - -void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts"); - - if (src == dst) { - return; - } - - if (backend_dst->iface.cpy_tensor_async != NULL) { - if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) { - return; - } - } - - // an async copy would normally happen after all the queued operations on both backends are completed - // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy - ggml_backend_synchronize(backend_src); - ggml_backend_synchronize(backend_dst); - ggml_backend_tensor_copy(src, dst); -} - -// events - -ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) { - // null device is allowed for the transition period to the device interface - if (device == NULL || device->iface.event_new == NULL) { - return NULL; - } - return device->iface.event_new(device); -} - -void ggml_backend_event_free(ggml_backend_event_t event) { - if (event == NULL) { - return; - } - event->device->iface.event_free(event->device, event); -} - -void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) { - GGML_ASSERT(backend->iface.event_record != NULL); - - backend->iface.event_record(backend, event); -} - -void ggml_backend_event_synchronize(ggml_backend_event_t event) { - GGML_ASSERT(event->device->iface.event_synchronize); - - event->device->iface.event_synchronize(event->device, event); -} - -void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { - GGML_ASSERT(backend->iface.event_wait != NULL); - - backend->iface.event_wait(backend, event); -} - -// Backend device - -const char * ggml_backend_dev_name(ggml_backend_dev_t device) { - return device->iface.get_name(device); -} - -const char * ggml_backend_dev_description(ggml_backend_dev_t device) { - return device->iface.get_description(device); -} - -void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { - device->iface.get_memory(device, free, total); -} - -enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) { - return device->iface.get_type(device); -} - -void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) { - memset(props, 0, sizeof(*props)); - device->iface.get_props(device, props); -} - -ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) { - return device->reg; -} - -ggml_backend_t 
ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) { - return device->iface.init_backend(device, params); -} - -ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) { - return device->iface.get_buffer_type(device); -} - -ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) { - if (device->iface.get_host_buffer_type == NULL) { - return NULL; - } - - return device->iface.get_host_buffer_type(device); -} - -ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) { - return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size); -} - -bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) { - return device->iface.supports_op(device, op); -} - -bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) { - return device->iface.supports_buft(device, buft); -} - -bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) { - if (device->iface.offload_op != NULL) { - return device->iface.offload_op(device, op); - } - - return false; -} - -// Backend (reg) - -const char * ggml_backend_reg_name(ggml_backend_reg_t reg) { - return reg->iface.get_name(reg); -} - -size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) { - return reg->iface.get_device_count(reg); -} - -ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) { - return reg->iface.get_device(reg, index); -} - -void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { - if (!reg->iface.get_proc_address) { - return NULL; - } - return reg->iface.get_proc_address(reg, name); -} - -// multi-buffer buffer - -struct ggml_backend_multi_buffer_context { - ggml_backend_buffer_t * buffers; - size_t n_buffers; -}; - -static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context; - for (size_t i = 0; i < ctx->n_buffers; i++) { - ggml_backend_buffer_free(ctx->buffers[i]); - } - - free(ctx->buffers); - free(ctx); -} - -static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context; - for (size_t i = 0; i < ctx->n_buffers; i++) { - ggml_backend_buffer_clear(ctx->buffers[i], value); - } -} - -static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = { - /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer, - /* .get_base = */ NULL, - /* .init_tensor = */ NULL, - /* .memset_tensor = */ NULL, - /* .set_tensor = */ NULL, - /* .get_tensor = */ NULL, - /* .cpy_tensor = */ NULL, - /* .clear = */ ggml_backend_multi_buffer_clear, - /* .reset = */ NULL, -}; - -ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) { - ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) malloc(sizeof(struct ggml_backend_multi_buffer_context)); - ctx->n_buffers = n_buffers; - ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t)); - - GGML_ASSERT(ctx->buffers != NULL); - - size_t total_size = 0; - for (size_t i = 0; i < n_buffers; i++) { - ctx->buffers[i] = buffers[i]; - total_size += ggml_backend_buffer_get_size(buffers[i]); - } - - return 
ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_i, ctx, total_size); -} - -bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) { - return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer; -} - -void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) { - GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer)); - ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context; - for (size_t i = 0; i < ctx->n_buffers; i++) { - ggml_backend_buffer_set_usage(ctx->buffers[i], usage); - } -} - -// creates a copy of the tensor with the same memory layout -static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) { - struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor); - for (int i = 0; i < GGML_MAX_DIMS; i++) { - dup->nb[i] = tensor->nb[i]; - } - return dup; -} - -static bool ggml_is_view_op(enum ggml_op op) { - return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE; -} - -// scheduler - -#ifndef GGML_SCHED_MAX_BACKENDS -#define GGML_SCHED_MAX_BACKENDS 16 -#endif - -#ifndef GGML_SCHED_MAX_SPLIT_INPUTS -#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC -#endif - -#ifndef GGML_SCHED_MAX_COPIES -#define GGML_SCHED_MAX_COPIES 4 -#endif - -struct ggml_backend_sched_split { - int backend_id; - int i_start; - int i_end; - struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS]; - int n_inputs; - // graph view of this split - struct ggml_cgraph graph; -}; - -struct ggml_backend_sched { - bool is_reset; // true if the scheduler has been reset since the last graph split - bool is_alloc; - - int n_backends; - - ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS]; - ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS]; - ggml_gallocr_t galloc; - - // hash map of the nodes in the graph - struct ggml_hash_set hash_set; - int * hv_tensor_backend_ids; // [hash_set.size] - struct ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies] - - int * node_backend_ids; // [graph_size] - int * leaf_backend_ids; // [graph_size] - - int * prev_node_backend_ids; // [graph_size] - int * prev_leaf_backend_ids; // [graph_size] - - // copy of the graph with modified inputs - struct ggml_cgraph graph; - - // graph splits - struct ggml_backend_sched_split * splits; - int n_splits; - int splits_capacity; - - // pipeline parallelism support - int n_copies; - int cur_copy; - ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES]; - struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS]; - int n_graph_inputs; - - struct ggml_context * ctx; - - ggml_backend_sched_eval_callback callback_eval; - void * callback_eval_user_data; - - char * context_buffer; - size_t context_buffer_size; - - int debug; -}; - -#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor) -#define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)] -#define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)] -#define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id) - -// returns the priority of the backend, lower id is higher priority -static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) { - for (int i = 0; i < sched->n_backends; i++) { - if 
(sched->backends[i] == backend) { - return i; - } - } - return -1; -} - -static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) { - ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; - if (buffer == NULL) { - return -1; - } - - // find highest prio backend that supports the buffer type and the op - for (int i = 0; i < sched->n_backends; i++) { - if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) && - ggml_backend_supports_op(sched->backends[i], op)) { - return i; - } - } - -#ifndef NDEBUG - GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n", - __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name); -#endif - - return -1; -} - -#if 0 -#define GGML_SCHED_MAX_SPLITS_DEBUG 4096 -static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only -#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__) -#define GET_CAUSE(node) causes[hash_id(node)] -#else -#define SET_CAUSE(node, ...) -#define GET_CAUSE(node) "" -#endif - -// returns the backend that should be used for the node based on the current locations -static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) { - // assign pre-allocated nodes to their backend - int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor); - if (cur_backend_id != -1) { - SET_CAUSE(tensor, "1.dst"); - return cur_backend_id; - } - - // view_src - if (tensor->view_src != NULL) { - cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor); - if (cur_backend_id != -1) { - SET_CAUSE(tensor, "1.vsrc"); - return cur_backend_id; - } - } - - if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) { - // since the tensor is pre-allocated, it cannot be moved to another backend - ggml_backend_buffer_t buffer = tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; - GGML_ABORT("pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)", tensor->name, ggml_backend_buffer_name(buffer), ggml_op_name(tensor->op)); - } - - // graph input - if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { - cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU) - SET_CAUSE(tensor, "1.inp"); - return cur_backend_id; - } - - // operations with weights are preferably run on the same backend as the weights - for (int i = 0; i < GGML_MAX_SRC; i++) { - const struct ggml_tensor * src = tensor->src[i]; - if (src == NULL) { - continue; - } - // skip ROPE since the rope freqs tensor is too small to choose a backend based on it - // not an ideal solution - if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { - int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); - // check if a backend with higher prio wants to offload the op - if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { - for (int b = 0; b < src_backend_id; b++) { - if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) { - SET_CAUSE(tensor, "1.off"); - return b; - } - } - } - SET_CAUSE(tensor, "1.wgt%d", i); - return src_backend_id; - } - } - - return -1; -} - -static char * fmt_size(size_t size) { - static char buffer[128]; - if (size >= 1024*1024) { - snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024); - } else { - snprintf(buffer, sizeof(buffer), "%zuK", size/1024); - } - return buffer; -} - -static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - int cur_split = 0; - for (int i = 0; i < graph->n_nodes; i++) { - if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) { - ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id]; - GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, ggml_backend_name(split_backend), - sched->splits[cur_split].n_inputs); - for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) { - if (j == 0) { - GGML_LOG_DEBUG(": "); - } - GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name, - fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j]))); - } - GGML_LOG_DEBUG("\n"); - cur_split++; - } - struct ggml_tensor * node = graph->nodes[i]; - if (ggml_is_view_op(node->op)) { - continue; - } - if (sched->debug > 1) { - ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node); - GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name, - fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node)); - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { - continue; - } - ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src); - GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name, - fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src)); - } - GGML_LOG_DEBUG("\n"); - } - } -} - -static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) { - ggml_backend_buffer_t buf = t->view_src ? 
t->view_src->buffer : t->buffer; - ggml_backend_buffer_type_t buft = NULL; - - if (buf) { - // the tensor is already allocated - buft = buf->buft; - } else { - // see if the tensor already has a backend assigned, and use the buffer type of that backend - int tensor_backend_id = tensor_backend_id(t); - if (tensor_backend_id == -1 && t->view_src) { - tensor_backend_id = tensor_backend_id(t->view_src); - } - if (tensor_backend_id != -1) { - buft = sched->bufts[tensor_backend_id]; - } - } - - return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft); -} - -static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) { - if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) { - *node_backend_id = cur_backend_id; - SET_CAUSE(node, "2.sup"); - } -} - -// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend -static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - // reset splits - sched->n_splits = 0; - sched->n_graph_inputs = 0; - sched->is_reset = false; - - struct ggml_init_params params = { - /* .mem_size = */ sched->context_buffer_size, - /* .mem_buffer = */ sched->context_buffer, - /* .no_alloc = */ true - }; - - ggml_free(sched->ctx); - - sched->ctx = ggml_init(params); - if (sched->ctx == NULL) { - GGML_ABORT("%s: failed to initialize context\n", __func__); - } - - // pass 1: assign backends to ops with pre-allocated inputs - for (int i = 0; i < graph->n_leafs; i++) { - struct ggml_tensor * leaf = graph->leafs[i]; - int * leaf_backend_id = &tensor_backend_id(leaf); - // do not overwrite user assignments - if (*leaf_backend_id == -1) { - *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf); - } - } - - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - int * node_backend_id = &tensor_backend_id(node); - // do not overwrite user assignments - if (*node_backend_id == -1) { - *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node); - -#if 0 - // src - if (node->op == GGML_OP_NONE) { - continue; - } - - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { - continue; - } - int * src_backend_id = &tensor_backend_id(src); - if (*src_backend_id == -1) { - *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src); - } - } -#endif - } - } - - // pass 2: expand current backend assignments - // assign the same backend to adjacent nodes - // expand gpu backends (i.e. 
non last prio) up and down, ignoring cpu (the lowest priority backend) - // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops - // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known - // expand gpu down - { - int cur_backend_id = -1; - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - if (ggml_is_view_op(node->op)) { - continue; - } - int * node_backend_id = &tensor_backend_id(node); - if (*node_backend_id != -1) { - if (*node_backend_id == sched->n_backends - 1) { - // skip cpu (lowest prio backend) - cur_backend_id = -1; - } else { - cur_backend_id = *node_backend_id; - } - } else if (cur_backend_id != -1) { - ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); - } - } - } - // expand gpu up - { - int cur_backend_id = -1; - for (int i = graph->n_nodes - 1; i >= 0; i--) { - struct ggml_tensor * node = graph->nodes[i]; - if (ggml_is_view_op(node->op)) { - continue; - } - int * node_backend_id = &tensor_backend_id(node); - if (*node_backend_id != -1) { - if (*node_backend_id == sched->n_backends - 1) { - // skip cpu (lowest prio backend) - cur_backend_id = -1; - } else { - cur_backend_id = *node_backend_id; - } - } else if (cur_backend_id != -1) { - ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); - } - } - } - // expand rest down - { - int cur_backend_id = -1; - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - if (ggml_is_view_op(node->op)) { - continue; - } - int * node_backend_id = &tensor_backend_id(node); - if (*node_backend_id != -1) { - cur_backend_id = *node_backend_id; - } else if (cur_backend_id != -1) { - ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); - } - } - } - // expand rest up - { - int cur_backend_id = -1; - for (int i = graph->n_nodes - 1; i >= 0; i--) { - struct ggml_tensor * node = graph->nodes[i]; - if (ggml_is_view_op(node->op)) { - continue; - } - int * node_backend_id = &tensor_backend_id(node); - if (*node_backend_id != -1) { - cur_backend_id = *node_backend_id; - } else if (cur_backend_id != -1) { - ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); - } - } - } - - // pass 3: upgrade nodes to higher prio backends with compatible buffer types - // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there - // however, we also need to verify that the sources are in compatible buffer types - // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph - // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same - // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. 
BLAS and CPU) - // additionally, set remaining unassigned nodes to the backend with the most supported inputs - // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - if (ggml_is_view_op(node->op)) { - continue; - } - int * node_backend_id = &tensor_backend_id(node); - if (*node_backend_id == -1) { - // unassigned node: find the backend with the most supported inputs - int n_supported_best = -1; - for (int b = 0; b < sched->n_backends; b++) { - if (ggml_backend_supports_op(sched->backends[b], node)) { - int n_supported = 0; - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { - continue; - } - if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) { - n_supported++; - } - } - if (n_supported > n_supported_best) { - n_supported_best = n_supported; - *node_backend_id = b; - SET_CAUSE(node, "3.best"); - } - } - } - } else { - // assigned node: upgrade to higher prio backend if possible - for (int b = 0; b < *node_backend_id; b++) { - if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) { - bool supported = true; - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { - continue; - } - if (!ggml_backend_sched_buffer_supported(sched, src, b)) { - supported = false; - break; - } - } - if (supported) { - *node_backend_id = b; - SET_CAUSE(node, "3.upg"); - break; - } - } - } - } - } - - // pass 4: assign backends to remaining src from dst and view_src - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - int * cur_backend_id = &tensor_backend_id(node); - if (node->view_src != NULL && *cur_backend_id == -1) { - *cur_backend_id = tensor_backend_id(node->view_src); - SET_CAUSE(node, "4.vsrc"); - } - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { - continue; - } - int * src_backend_id = &tensor_backend_id(src); - if (*src_backend_id == -1) { - if (src->view_src != NULL) { - // views are always on the same backend as the source - *src_backend_id = tensor_backend_id(src->view_src); - SET_CAUSE(src, "4.vsrc"); - } else { - *src_backend_id = *cur_backend_id; - SET_CAUSE(src, "4.cur"); - } - } - } - } - - // pass 5: split graph, find tensors that need to be copied - { - int i_split = 0; - struct ggml_backend_sched_split * split = &sched->splits[0]; - // find the backend of the first split, skipping view ops - int i = 0; - for (; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - if (!ggml_is_view_op(node->op)) { - split->backend_id = tensor_backend_id(node); - break; - } - } - split->i_start = 0; - split->n_inputs = 0; - int cur_backend_id = split->backend_id; - for (; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - - if (ggml_is_view_op(node->op)) { - continue; - } - - const int node_backend_id = tensor_backend_id(node); - - assert(node_backend_id != -1); // all nodes should be assigned by now - - // check if we should start a new split based on the sources of the current node - bool need_new_split = false; - if (node_backend_id == cur_backend_id && split->n_inputs > 0) { - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = 
node->src[j]; - if (src == NULL) { - continue; - } - // check if a weight is on a different and incompatible backend - // by starting a new split, the memory of the previously offloaded weights can be reused - if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { - int src_backend_id = tensor_backend_id(src); - if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) { - need_new_split = true; - break; - } - } - // check if the split has too many inputs - // FIXME: count the number of inputs instead of only checking when full - if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) { - const size_t id = hash_id(src); - int src_backend_id = sched->hv_tensor_backend_ids[id]; - bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id); - if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) { - need_new_split = true; - break; - } - } - } - } - - if (node_backend_id != cur_backend_id || need_new_split) { - split->i_end = i; - i_split++; - if (i_split >= sched->splits_capacity) { - sched->splits_capacity *= 2; - sched->splits = (ggml_backend_sched_split *) - realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split)); - GGML_ASSERT(sched->splits != NULL); - } - split = &sched->splits[i_split]; - split->backend_id = node_backend_id; - split->i_start = i; - split->n_inputs = 0; - cur_backend_id = node_backend_id; - } - - // find inputs that are not on the same backend - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { - continue; - } - - size_t src_id = hash_id(src); - const int src_backend_id = sched->hv_tensor_backend_ids[src_id]; - assert(src_backend_id != -1); // all inputs should be assigned by now - - if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) { - if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) { - ggml_backend_t backend = sched->backends[src_backend_id]; - for (int c = 0; c < sched->n_copies; c++) { - struct ggml_tensor * tensor_copy; - if (c == sched->cur_copy) { - tensor_copy = src; // use the original tensor as the current copy - } else { - tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); - ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c); - } - if (sched->n_copies > 1) { - ggml_set_input(tensor_copy); - ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor - } - tensor_id_copy(src_id, src_backend_id, c) = tensor_copy; - SET_CAUSE(tensor_copy, "4.cpy"); - } - int n_graph_inputs = sched->n_graph_inputs++; - GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); - sched->graph_inputs[n_graph_inputs] = src; - } - } - - if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) { - // create a copy of the input in the split's backend - if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) { - ggml_backend_t backend = sched->backends[cur_backend_id]; - for (int c = 0; c < sched->n_copies; c++) { - struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); - ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c); - if (sched->n_copies > 1) { - ggml_set_input(tensor_copy); - ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor - } - tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy; - SET_CAUSE(tensor_copy, "4.cpy"); - } - int n_inputs = 
split->n_inputs++; - GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); - split->inputs[n_inputs] = src; - } - node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy); - } - } - } - split->i_end = graph->n_nodes; - sched->n_splits = i_split + 1; - } - - if (sched->debug) { - ggml_backend_sched_print_assignments(sched, graph); - } - - // swap node_backend_ids and leaf _backend_ids with prevs - { - int * tmp = sched->node_backend_ids; - sched->node_backend_ids = sched->prev_node_backend_ids; - sched->prev_node_backend_ids = tmp; - - tmp = sched->leaf_backend_ids; - sched->leaf_backend_ids = sched->prev_leaf_backend_ids; - sched->prev_leaf_backend_ids = tmp; - } - - int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies; - if (sched->graph.size < graph_size) { - sched->graph.size = graph_size; - sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *)); - sched->graph.leafs = (ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *)); - GGML_ASSERT(sched->graph.nodes != NULL); - GGML_ASSERT(sched->graph.leafs != NULL); - } - sched->graph.n_nodes = 0; - sched->graph.n_leafs = 0; - - struct ggml_cgraph * graph_copy = &sched->graph; - - for (int i = 0; i < sched->n_splits; i++) { - struct ggml_backend_sched_split * split = &sched->splits[i]; - split->graph = ggml_graph_view(graph, split->i_start, split->i_end); - - // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split - for (int j = 0; j < split->n_inputs; j++) { - assert(graph_copy->size > (graph_copy->n_nodes + 1)); - - struct ggml_tensor * input = split->inputs[j]; - const size_t input_id = hash_id(input); - struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy); - - // add a dependency to the input source so that it is not freed before the copy is done - struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input); - input_dep->src[0] = input; - sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id]; - graph_copy->nodes[graph_copy->n_nodes++] = input_dep; - - // add a dependency to the input copy so that it is allocated at the start of the split - sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id; - graph_copy->nodes[graph_copy->n_nodes++] = input_cpy; - } - - for (int j = split->i_start; j < split->i_end; j++) { - assert(graph_copy->size > graph_copy->n_nodes); - sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]); - graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j]; - } - } - - if (sched->n_copies > 1) { - // add input copies as leafs so that they are allocated first - for (int i = 0; i < sched->n_graph_inputs; i++) { - struct ggml_tensor * input = sched->graph_inputs[i]; - size_t id = hash_id(input); - int backend_id = tensor_backend_id(input); - for (int c = 0; c < sched->n_copies; c++) { - struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c); - sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id; - assert(graph_copy->size > graph_copy->n_leafs); - graph_copy->leafs[graph_copy->n_leafs++] = input_cpy; - } - } - - for (int i = 0; i < sched->n_splits; i++) { - struct ggml_backend_sched_split * split = &sched->splits[i]; - int backend_id = split->backend_id; - for (int j = 0; j < split->n_inputs; j++) { - struct ggml_tensor * input = split->inputs[j]; - size_t id = hash_id(input); - for 
(int c = 0; c < sched->n_copies; c++) { - struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c); - sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id; - assert(graph_copy->size > graph_copy->n_leafs); - graph_copy->leafs[graph_copy->n_leafs++] = input_cpy; - } - } - } - } - - // add leafs from the original graph - for (int i = 0; i < graph->n_leafs; i++) { - struct ggml_tensor * leaf = graph->leafs[i]; - sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf); - assert(graph_copy->size > graph_copy->n_leafs); - graph_copy->leafs[graph_copy->n_leafs++] = leaf; - } -} - -static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { - bool backend_ids_changed = false; - for (int i = 0; i < sched->graph.n_nodes; i++) { - if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] && - sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) { - backend_ids_changed = true; - break; - } - } - if (!backend_ids_changed) { - for (int i = 0; i < sched->graph.n_leafs; i++) { - if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] && - sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) { - backend_ids_changed = true; - break; - } - } - } - - // allocate graph - if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) { - // the re-allocation may cause the split inputs to be moved to a different address - ggml_backend_sched_synchronize(sched); -#ifndef NDEBUG - GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed); -#endif - ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids); - if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) { - GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__); - return false; - } - } - - return true; -} - -static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { - struct ggml_backend_sched_split * splits = sched->splits; - - for (int i = 0; i < sched->n_splits; i++) { - struct ggml_backend_sched_split * split = &splits[i]; - int split_backend_id = split->backend_id; - ggml_backend_t split_backend = sched->backends[split_backend_id]; - - // copy the input tensors to the split backend - for (int j = 0; j < split->n_inputs; j++) { - ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]); - struct ggml_tensor * input = split->inputs[j]; - struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy); - - if (input->flags & GGML_TENSOR_FLAG_INPUT) { - // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done - if (sched->events[split_backend_id][sched->cur_copy] != NULL) { - ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]); - } else { - ggml_backend_synchronize(split_backend); - } - ggml_backend_tensor_copy(input, input_cpy); - } else { - // wait for the split backend to finish using the input before overwriting it - if (sched->events[split_backend_id][sched->cur_copy] != NULL) { - ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]); - } else { - ggml_backend_synchronize(split_backend); - } - // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events - 
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface - if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) { - ggml_backend_synchronize(input_backend); - if (sched->events[split_backend_id][sched->cur_copy] != NULL) { - ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]); - } else { - ggml_backend_synchronize(split_backend); - } - ggml_backend_tensor_copy(input, input_cpy); - } - } - } - - if (!sched->callback_eval) { - enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph); - if (ec != GGML_STATUS_SUCCESS) { - return ec; - } - } else { - // similar to ggml_backend_compare_graph_backend - for (int j0 = 0; j0 < split->graph.n_nodes; j0++) { - struct ggml_tensor * t = split->graph.nodes[j0]; - - // check if the user needs data from this node - bool need = sched->callback_eval(t, true, sched->callback_eval_user_data); - - int j1 = j0; - - // determine the range [j0, j1] of nodes that can be computed together - while (!need && j1 < split->graph.n_nodes - 1) { - t = split->graph.nodes[++j1]; - need = sched->callback_eval(t, true, sched->callback_eval_user_data); - } - - struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); - - enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv); - if (ec != GGML_STATUS_SUCCESS) { - return ec; - } - - // TODO: pass backend to the callback, then the user can decide if they want to synchronize - ggml_backend_synchronize(split_backend); - - if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) { - break; - } - - j0 = j1; - } - } - - // record the event of this copy - if (split->n_inputs > 0) { - if (sched->events[split_backend_id][sched->cur_copy] != NULL) { - ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend); - } - } - } - - sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies; - - return GGML_STATUS_SUCCESS; -} - -ggml_backend_sched_t ggml_backend_sched_new( - ggml_backend_t * backends, - ggml_backend_buffer_type_t * bufts, - int n_backends, - size_t graph_size, - bool parallel) { - GGML_ASSERT(n_backends > 0); - GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); - GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU); - - struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched)); - - const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG"); - sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0; - sched->n_backends = n_backends; - sched->n_copies = parallel ? 
GGML_SCHED_MAX_COPIES : 1; - - // initialize hash table - // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead) - sched->hash_set = ggml_hash_set_new(graph_size); - sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0])); - sched->hv_tensor_copies = (ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *)); - - const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph - const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2; - sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0])); - sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0])); - sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0])); - sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0])); - - sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false); - sched->context_buffer = (char *) malloc(sched->context_buffer_size); - - const int initial_splits_capacity = 16; - sched->splits = (ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0])); - sched->splits_capacity = initial_splits_capacity; - - for (int b = 0; b < n_backends; b++) { - sched->backends[b] = backends[b]; - sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]); - GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b])); - - if (sched->n_copies > 1) { - for (int c = 0; c < sched->n_copies; c++) { - sched->events[b][c] = ggml_backend_event_new(backends[b]->device); - } - } - } - - sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); - - ggml_backend_sched_reset(sched); - - return sched; -} - -void ggml_backend_sched_free(ggml_backend_sched_t sched) { - if (sched == NULL) { - return; - } - for (int b = 0; b < sched->n_backends; b++) { - for (int c = 0; c < sched->n_copies; c++) { - ggml_backend_event_free(sched->events[b][c]); - } - } - ggml_gallocr_free(sched->galloc); - ggml_free(sched->ctx); - ggml_hash_set_free(&sched->hash_set); - free(sched->splits); - free(sched->hv_tensor_backend_ids); - free(sched->hv_tensor_copies); - free(sched->node_backend_ids); - free(sched->leaf_backend_ids); - free(sched->prev_node_backend_ids); - free(sched->prev_leaf_backend_ids); - free(sched->context_buffer); - free(sched->graph.nodes); - free(sched->graph.leafs); - free(sched); -} - -void ggml_backend_sched_reset(ggml_backend_sched_t sched) { - // reset state for the next run - if (!sched->is_reset) { - ggml_hash_set_reset(&sched->hash_set); - memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0])); - memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *)); - sched->is_reset = true; - } - sched->is_alloc = false; -} - -bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { - GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs); - - ggml_backend_sched_split_graph(sched, measure_graph); - - ggml_backend_sched_synchronize(sched); - - if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, 
sched->leaf_backend_ids)) { - return false; - } - - ggml_backend_sched_reset(sched); - - return true; -} - -bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs); - - ggml_backend_sched_split_graph(sched, graph); - - - if (!ggml_backend_sched_alloc_splits(sched)) { - return false; - } - - sched->is_alloc = true; - - return true; -} - -enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph); - ggml_backend_sched_synchronize(sched); - return err; -} - -enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - if (!sched->is_reset && !sched->is_alloc) { - ggml_backend_sched_reset(sched); - } - - if (!sched->is_alloc) { - if (!ggml_backend_sched_alloc_graph(sched, graph)) { - return GGML_STATUS_ALLOC_FAILED; - } - } - - return ggml_backend_sched_compute_splits(sched); -} - -void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) { - for (int i = 0; i < sched->n_backends; i++) { - ggml_backend_synchronize(sched->backends[i]); - } -} - -void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) { - sched->callback_eval = callback; - sched->callback_eval_user_data = user_data; -} - -int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) { - return sched->n_splits; -} - -int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) { - return sched->n_copies; -} - -int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) { - return sched->n_backends; -} - -ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) { - GGML_ASSERT(i >= 0 && i < sched->n_backends); - return sched->backends[i]; -} - -size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) { - int backend_index = ggml_backend_sched_backend_id(sched, backend); - GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); - - return ggml_gallocr_get_buffer_size(sched->galloc, backend_index); -} - -void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) { - int backend_index = ggml_backend_sched_backend_id(sched, backend); - GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); - tensor_backend_id(node) = backend_index; - SET_CAUSE(node, "usr"); - sched->is_reset = false; -} - -ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) { - int backend_index = tensor_backend_id(node); - if (backend_index == -1) { - return NULL; - } - return sched->backends[backend_index]; -} - -// utils - -enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) { - GGML_ASSERT(tensor->buffer == NULL); - GGML_ASSERT(tensor->view_src != NULL); - GGML_ASSERT(tensor->view_src->buffer != NULL); - GGML_ASSERT(tensor->view_src->data != NULL); - - tensor->buffer = tensor->view_src->buffer; - tensor->data = (char *)tensor->view_src->data + tensor->view_offs; - return ggml_backend_buffer_init_tensor(tensor->buffer, tensor); -} - -enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) { - GGML_ASSERT(tensor->buffer == NULL); - GGML_ASSERT(tensor->data == NULL); - GGML_ASSERT(tensor->view_src == NULL); 
- GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer)); - GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <= - (char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer)); - - tensor->buffer = buffer; - tensor->data = addr; - return ggml_backend_buffer_init_tensor(buffer, tensor); -} - -static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, - struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) { - - GGML_ASSERT(src != NULL); - GGML_ASSERT(src->data && "graph must be allocated"); - - size_t id = ggml_hash_insert(&hash_set, src); - if (id == GGML_HASHSET_ALREADY_EXISTS) { - return node_copies[ggml_hash_find(&hash_set, src)]; - } - - struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src); - if (src->view_src != NULL) { - dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src); - dst->view_offs = src->view_offs; - } - dst->op = src->op; - memcpy(dst->op_params, src->op_params, sizeof(dst->op_params)); - ggml_set_name(dst, src->name); - - // copy src - for (int i = 0; i < GGML_MAX_SRC; i++) { - struct ggml_tensor * s = src->src[i]; - if (s == NULL) { - continue; - } - dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s); - } - - node_copies[id] = dst; - return dst; -} - -static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) { - size_t id = ggml_hash_find(hash_set, src); - if (node_init[id]) { - return; - } - node_init[id] = true; - - struct ggml_tensor * dst = node_copies[id]; - if (dst->view_src != NULL) { - graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src); - enum ggml_status status = ggml_backend_view_init(dst); - GGML_ASSERT(status == GGML_STATUS_SUCCESS); - } - else { - ggml_backend_tensor_copy(src, dst); - } - - // init src - for (int i = 0; i < GGML_MAX_SRC; i++) { - struct ggml_tensor * s = src->src[i]; - if (s == NULL) { - continue; - } - graph_copy_init_tensor(hash_set, node_copies, node_init, s); - } -} - -struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) { - struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size); - struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT - bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0])); - - struct ggml_init_params params = { - /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false), - /* .mem_buffer = */ NULL, - /* .no_alloc = */ true - }; - - struct ggml_context * ctx_allocated = ggml_init(params); - struct ggml_context * ctx_unallocated = ggml_init(params); - - if (ctx_allocated == NULL || ctx_unallocated == NULL) { - GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__); - ggml_hash_set_free(&hash_set); - free(node_copies); - free(node_init); - ggml_free(ctx_allocated); - ggml_free(ctx_unallocated); - return { - /* .buffer = */ NULL, - /* .ctx_allocated = */ NULL, - /* .ctx_unallocated = */ NULL, - /* .graph = */ NULL, - }; - } - - // dup nodes - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - graph_copy_dup_tensor(hash_set, node_copies, 
ctx_allocated, ctx_unallocated, node); - } - - // allocate nodes - ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend); - if (buffer == NULL) { - GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__); - ggml_hash_set_free(&hash_set); - free(node_copies); - free(node_init); - ggml_free(ctx_allocated); - ggml_free(ctx_unallocated); - return { - /* .buffer = */ NULL, - /* .ctx_allocated = */ NULL, - /* .ctx_unallocated = */ NULL, - /* .graph = */ NULL, - }; - } - - //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024); - - // copy data and init views - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - graph_copy_init_tensor(&hash_set, node_copies, node_init, node); - } - - // build graph copy - struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false); - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - struct ggml_tensor * node_copy = node_copies[ggml_hash_find(&hash_set, node)]; - graph_copy->nodes[i] = node_copy; - } - graph_copy->n_nodes = graph->n_nodes; - - ggml_hash_set_free(&hash_set); - free(node_copies); - free(node_init); - - return { - /* .buffer = */ buffer, - /* .ctx_allocated = */ ctx_allocated, - /* .ctx_unallocated = */ ctx_unallocated, - /* .graph = */ graph_copy, - }; -} - -void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) { - ggml_backend_buffer_free(copy.buffer); - ggml_free(copy.ctx_allocated); - ggml_free(copy.ctx_unallocated); -} - -bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) { - struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph); - if (copy.buffer == NULL) { - return false; - } - - struct ggml_cgraph * g1 = graph; - struct ggml_cgraph * g2 = copy.graph; - - assert(g1->n_nodes == g2->n_nodes); - - for (int i = 0; i < g1->n_nodes; i++) { - struct ggml_tensor * t1 = g1->nodes[i]; - struct ggml_tensor * t2 = g2->nodes[i]; - - assert(t1->op == t2->op && ggml_are_same_layout(t1, t2)); - - struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1); - struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1); - - ggml_backend_graph_compute(backend1, &g1v); - ggml_backend_graph_compute(backend2, &g2v); - - if (ggml_is_view_op(t1->op)) { - continue; - } - - // compare results, calculate rms etc - if (!callback(i, t1, t2, user_data)) { - break; - } - } - - ggml_backend_graph_copy_free(copy); - - return true; -} - -// CPU backend - buffer - -static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { - uintptr_t data = (uintptr_t)buffer->context; - - // align the buffer - if (data % TENSOR_ALIGNMENT != 0) { - data = GGML_PAD(data, TENSOR_ALIGNMENT); - } - - return (void *)data; -} - -static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_aligned_free(buffer->context, buffer->size); -} - -static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { - memset((char *)tensor->data + offset, value, size); - - GGML_UNUSED(buffer); -} - -static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - memcpy((char *)tensor->data + offset, data, size); - - GGML_UNUSED(buffer); -} - 
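// Note (illustrative, not part of the deleted file): the CPU buffer callbacks around this point
// are normally reached through the generic tensor I/O API rather than called directly. A minimal
// sketch, assuming a context holding a single F32 tensor allocated from the CPU buffer type shown
// below, of how ggml_backend_tensor_set/get dispatch to ggml_backend_cpu_buffer_set_tensor and
// ggml_backend_cpu_buffer_get_tensor through the buffer interface:
//
//     #include "ggml.h"
//     #include "ggml-alloc.h"
//     #include "ggml-backend.h"
//
//     static void cpu_buffer_roundtrip(void) {
//         // no_alloc context: only tensor metadata lives here, data goes into a backend buffer
//         struct ggml_init_params ip = { 2*ggml_tensor_overhead(), NULL, /*no_alloc=*/true };
//         struct ggml_context * ctx = ggml_init(ip);
//         struct ggml_tensor  * t   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
//
//         // allocate t in a buffer backed by the CPU buffer type defined in this file
//         ggml_backend_buffer_t buf =
//             ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
//
//         float data[8] = {0};
//         ggml_backend_tensor_set(t, data, 0, sizeof(data)); // -> ggml_backend_cpu_buffer_set_tensor
//         ggml_backend_tensor_get(t, data, 0, sizeof(data)); // -> ggml_backend_cpu_buffer_get_tensor
//
//         ggml_backend_buffer_free(buf);
//         ggml_free(ctx);
//     }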
-static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { - memcpy(data, (const char *)tensor->data + offset, size); - - GGML_UNUSED(buffer); -} - -static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { - if (ggml_backend_buffer_is_host(src->buffer)) { - memcpy(dst->data, src->data, ggml_nbytes(src)); - return true; - } - return false; - - GGML_UNUSED(buffer); -} - -static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - memset(buffer->context, value, buffer->size); -} - -static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = { - /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer, - /* .get_base = */ ggml_backend_cpu_buffer_get_base, - /* .init_tensor = */ NULL, // no initialization required - /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor, - /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor, - /* .clear = */ ggml_backend_cpu_buffer_clear, - /* .reset = */ NULL, -}; - -static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = { - /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed - /* .get_base = */ ggml_backend_cpu_buffer_get_base, - /* .init_tensor = */ NULL, // no initialization required - /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor, - /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor, - /* .clear = */ ggml_backend_cpu_buffer_clear, - /* .reset = */ NULL, -}; - -// CPU backend buffer type - -// this buffer type is defined here to make it available to all backends - -static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - return "CPU"; - - GGML_UNUSED(buft); -} - -static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - void * data = ggml_aligned_malloc(size); - - if (data == NULL) { - GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size); - return NULL; - } - - return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size); -} - -static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - return TENSOR_ALIGNMENT; - - GGML_UNUSED(buft); -} - -static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) { - return true; - - GGML_UNUSED(buft); -} - -ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { - static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = { - /* .iface = */ { - /* .get_name = */ ggml_backend_cpu_buffer_type_get_name, - /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, - }, - /* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), - /* .context = */ NULL, - }; - - return &ggml_backend_cpu_buffer_type; -} - -static const char * ggml_backend_cpu_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) { - return 
"CPU_Mapped"; - - GGML_UNUSED(buft); -} - -static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) { - static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = { - /* .iface = */ { - /* .get_name = */ ggml_backend_cpu_buffer_from_ptr_type_get_name, - /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, - }, - /* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), - /* .context = */ NULL, - }; - - return &ggml_backend_cpu_buffer_type; -} - -ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { - GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned"); - return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size); -} diff --git a/ggml/src/ggml-blas/CMakeLists.txt b/ggml/src/ggml-blas/CMakeLists.txt deleted file mode 100644 index 0bf3c05d..00000000 --- a/ggml/src/ggml-blas/CMakeLists.txt +++ /dev/null @@ -1,87 +0,0 @@ -if (GGML_STATIC) - set(BLA_STATIC ON) -endif() -#if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22) -# set(BLA_SIZEOF_INTEGER 8) -#endif() - -set(BLA_VENDOR ${GGML_BLAS_VENDOR}) -find_package(BLAS) - -if (BLAS_FOUND) - message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}") - - ggml_add_backend_library(ggml-blas - ggml-blas.cpp - ) - - if (${GGML_BLAS_VENDOR} MATCHES "Apple") - add_compile_definitions(ACCELERATE_NEW_LAPACK) - add_compile_definitions(ACCELERATE_LAPACK_ILP64) - add_compile_definitions(GGML_BLAS_USE_ACCELERATE) - elseif ("${BLAS_INCLUDE_DIRS}" STREQUAL "") - # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake. 
- # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268 - find_package(PkgConfig REQUIRED) - if (${GGML_BLAS_VENDOR} MATCHES "Generic") - pkg_check_modules(DepBLAS blas) - elseif (${GGML_BLAS_VENDOR} MATCHES "OpenBLAS") - # As of openblas v0.3.22, the 64-bit is named openblas64.pc - pkg_check_modules(DepBLAS openblas64) - if (NOT DepBLAS_FOUND) - pkg_check_modules(DepBLAS openblas) - endif() - elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME") - add_compile_definitions(GGML_BLAS_USE_BLIS) - pkg_check_modules(DepBLAS blis) - elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS") - pkg_check_modules(DepBLAS blas-atlas) - elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS") - pkg_check_modules(DepBLAS flexiblas_api) - elseif (${GGML_BLAS_VENDOR} MATCHES "Intel") - add_compile_definitions(GGML_BLAS_USE_MKL) - # all Intel* libraries share the same include path - pkg_check_modules(DepBLAS mkl-sdl) - elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC") - # this doesn't provide pkg-config - # suggest to assign BLAS_INCLUDE_DIRS on your own - if ("${NVHPC_VERSION}" STREQUAL "") - message(WARNING "Better to set NVHPC_VERSION") - else() - set(DepBLAS_FOUND ON) - set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include") - endif() - endif() - if (DepBLAS_FOUND) - set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS}) - else() - message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically" - " detected by pkgconfig, trying to find cblas.h from possible paths...") - find_path(BLAS_INCLUDE_DIRS - NAMES cblas.h - HINTS - /usr/include - /usr/local/include - /usr/include/openblas - /opt/homebrew/opt/openblas/include - /usr/local/opt/openblas/include - /usr/include/x86_64-linux-gnu/openblas/include - ) - endif() - endif() - - message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") - - target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS}) - - if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel")) - add_compile_definitions(GGML_BLAS_USE_MKL) - endif() - - target_link_libraries (ggml-blas PRIVATE ${BLAS_LIBRARIES}) - target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS}) -else() - message(ERROR "BLAS not found, please refer to " - "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" - " to set correct GGML_BLAS_VENDOR") -endif() diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp deleted file mode 100644 index ec158dfa..00000000 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ /dev/null @@ -1,517 +0,0 @@ -#include "ggml-impl.h" -#include "ggml-blas.h" -#include "ggml-backend-impl.h" - -#include -#include -#include - -#if defined(GGML_BLAS_USE_ACCELERATE) -# include -#elif defined(GGML_BLAS_USE_MKL) -# include -#elif defined(GGML_BLAS_USE_BLIS) -# include -#elif defined(GGML_BLAS_USE_NVPL) -# include -#else -# include -#endif - -struct ggml_backend_blas_context { - int n_threads = GGML_DEFAULT_N_THREADS; - std::unique_ptr work_data; - size_t work_size = 0; -#ifndef GGML_USE_OPENMP - std::vector> tasks; -#endif -}; - -static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_TENSOR_BINARY_OP_LOCALS - - const enum ggml_type type = src0->type; - - GGML_ASSERT(ne0 == ne01); - GGML_ASSERT(ne1 == ne11); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); - - // we don't 
support permuted src0 or src1 - GGML_ASSERT(nb00 == ggml_type_size(type)); - GGML_ASSERT(nb10 == ggml_type_size(src1->type)); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - - // broadcast factors - const int64_t r2 = ne12/ne02; - const int64_t r3 = ne13/ne03; - - const int64_t ne_plane = ne01*ne00; - const size_t desired_wsize = type == GGML_TYPE_F32 ? 0 : ne03*ne02*ne_plane*sizeof(float); - - if (ctx->work_size < desired_wsize) { - ctx->work_data.reset(new char[desired_wsize]); - ctx->work_size = desired_wsize; - } - void * wdata = ctx->work_data.get(); - - // convert src0 to float - if (type != GGML_TYPE_F32) { - const auto * type_traits = ggml_get_type_traits(type); - ggml_to_float_t const to_float = type_traits->to_float; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const void * x = (char *) src0->data + i02*nb02 + i03*nb03; - float * const wplane = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; - - const int min_cols_per_thread = 4096; - const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1); - const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1); - -#ifdef GGML_USE_OPENMP - #pragma omp parallel for num_threads(n_threads) - for (int64_t i01 = 0; i01 < ne01; i01++) { - to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); - } -#else - for (int i = 1; i < n_threads; i++) { - const int64_t start = i*ne01/n_threads; - const int64_t end = (i + 1)*ne01/n_threads; - if (start < end) { - ctx->tasks.push_back(std::async(std::launch::async, [=]() { - for (int64_t i01 = start; i01 < end; i01++) { - to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); - } - })); - } - } - { - // reuse the current thread for the first task - const int64_t start = 0; - const int64_t end = ne01/n_threads; - for (int64_t i01 = start; i01 < end; i01++) { - to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); - } - } -#endif - } - } - -#ifndef GGML_USE_OPENMP - // wait for all tasks to finish - for (auto & task : ctx->tasks) { - task.get(); - } - ctx->tasks.clear(); -#endif - } - -#if defined(OPENBLAS_VERSION) - openblas_set_num_threads(ctx->n_threads); -#endif - -#if defined(GGML_BLAS_USE_BLIS) - bli_thread_set_num_threads(ctx->n_threads); -#endif - -#if defined(GGML_BLAS_USE_NVPL) - nvpl_blas_set_num_threads(ctx->n_threads); -#endif - - for (int64_t i13 = 0; i13 < ne13; i13++) { - for (int64_t i12 = 0; i12 < ne12; i12++) { - const int64_t i03 = i13/r3; - const int64_t i02 = i12/r2; - - const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); - const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - - if (type != GGML_TYPE_F32) { - x = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; - } - - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne1, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } -} - -static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT(ne0 == ne00); - GGML_ASSERT(ne1 == ne10); - GGML_ASSERT(ne2 == ne02); - GGML_ASSERT(ne02 == ne12); - GGML_ASSERT(ne3 == ne13); - GGML_ASSERT(ne03 == ne13); - - // we don't support permuted src0 or 
src1 - GGML_ASSERT(nb00 == sizeof(float)); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - // GGML_ASSERT(nb0 <= nb1); - // GGML_ASSERT(nb1 <= nb2); - // GGML_ASSERT(nb2 <= nb3); - - // Arguments to ggml_compute_forward_out_prod (expressed as major,minor) - // src0: (k,n) - // src1: (k,m) - // dst: (m,n) - // - // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f) - // Also expressed as (major,minor) - // a: (m,k): so src1 transposed - // b: (k,n): so src0 - // c: (m,n) - // - // However, if ggml_is_transposed(src1) is true, then - // src1->data already contains a transposed version, so sgemm mustn't - // transpose it further. - - int n = src0->ne[0]; - int k = src0->ne[1]; - int m = src1->ne[0]; - - CBLAS_TRANSPOSE transposeA; - int lda; - - if (!ggml_is_transposed(src1)) { - transposeA = CblasTrans; - lda = m; - } else { - transposeA = CblasNoTrans; - lda = k; - } - - float * a = (float *) ((char *) src1->data); - float * b = (float *) ((char *) src0->data); - float * c = (float *) ((char *) dst->data); - - cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n); - - GGML_UNUSED(ctx); -} - -// backend interface - -static const char * ggml_backend_blas_get_name(ggml_backend_t backend) { - return "BLAS"; - - GGML_UNUSED(backend); -} - -static void ggml_backend_blas_free(ggml_backend_t backend) { - ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; - delete ctx; - delete backend; -} - -static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; - - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; - - switch (node->op) { - case GGML_OP_MUL_MAT: - ggml_backend_blas_mul_mat(ctx, node); - break; - - case GGML_OP_OUT_PROD: - ggml_backend_blas_out_prod(ctx, node); - break; - - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - break; - - default: - GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); - } - } - - return GGML_STATUS_SUCCESS; - - GGML_UNUSED(backend); -} - -static struct ggml_backend_i blas_backend_i = { - /* .get_name = */ ggml_backend_blas_get_name, - /* .free = */ ggml_backend_blas_free, - /* .set_tensor_async = */ NULL, - /* .get_tensor_async = */ NULL, - /* .cpy_tensor_async = */ NULL, - /* .synchronize = */ NULL, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ ggml_backend_blas_graph_compute, - /* .event_record = */ NULL, - /* .event_wait = */ NULL, -}; - -static ggml_guid_t ggml_backend_blas_guid(void) { - static ggml_guid guid = { 0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d }; - return &guid; -} - -ggml_backend_t ggml_backend_blas_init(void) { - ggml_backend_blas_context * ctx = new ggml_backend_blas_context; - - ggml_backend_t backend = new ggml_backend { - /* .guid = */ ggml_backend_blas_guid(), - /* .interface = */ blas_backend_i, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0), - /* .context = */ ctx, - }; - -#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP) - if (openblas_get_parallel() != OPENBLAS_OPENMP) { - GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was 
compiled without OpenMP support\n", __func__); - } -#endif - -#if defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP) - GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__); -#endif - - return backend; -} - -bool ggml_backend_is_blas(ggml_backend_t backend) { - return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid()); -} - -void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads) { - GGML_ASSERT(ggml_backend_is_blas(backend_blas)); - - ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context; - ctx->n_threads = n_threads; -} - -// device interface - -static const char * ggml_backend_blas_device_get_name(ggml_backend_dev_t dev) { - return "BLAS"; - - GGML_UNUSED(dev); -} - -static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t dev) { - #if defined(GGML_BLAS_USE_ACCELERATE) - return "Accelerate"; - #elif defined(GGML_BLAS_USE_MKL) - return "MKL"; - #elif defined(GGML_BLAS_USE_BLIS) - return "BLIS"; - #elif defined(GGML_BLAS_USE_NVPL) - return "NVPL"; - #elif defined(OPENBLAS_VERSION) - return "OpenBLAS"; - #else - return "BLAS"; - #endif - - GGML_UNUSED(dev); -} - -static void ggml_backend_blas_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - // TODO - *free = 0; - *total = 0; - - GGML_UNUSED(dev); -} - -static enum ggml_backend_dev_type ggml_backend_blas_device_get_type(ggml_backend_dev_t dev) { - return GGML_BACKEND_DEVICE_TYPE_ACCEL; - - GGML_UNUSED(dev); -} - -static void ggml_backend_blas_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { - props->name = ggml_backend_blas_device_get_name(dev); - props->description = ggml_backend_blas_device_get_description(dev); - props->type = ggml_backend_blas_device_get_type(dev); - ggml_backend_blas_device_get_memory(dev, &props->memory_free, &props->memory_total); - props->caps = { - /* .async = */ false, - /* .host_buffer = */ false, - /* .buffer_from_host_ptr = */ true, - /* .events = */ false, - }; -} - -static ggml_backend_t ggml_backend_blas_device_init_backend(ggml_backend_dev_t dev, const char * params) { - return ggml_backend_blas_init(); - - GGML_UNUSED(dev); - GGML_UNUSED(params); -} - -static ggml_backend_buffer_type_t ggml_backend_blas_device_get_buffer_type(ggml_backend_dev_t dev) { - return ggml_backend_cpu_buffer_type(); - - GGML_UNUSED(dev); -} - -static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { - return ggml_backend_cpu_buffer_from_ptr(ptr, size); - - GGML_UNUSED(dev); - GGML_UNUSED(max_tensor_size); -} - -static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { - const struct ggml_tensor * src0 = op->src[0]; - const struct ggml_tensor * src1 = op->src[1]; - - switch (op->op) { - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - return true; - - case GGML_OP_MUL_MAT: - { - // BLAS usually is only faster for large matrices - const struct ggml_tensor * src0 = op->src[0]; - const struct ggml_tensor * src1 = op->src[1]; - - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = op->ne[0]; - const int64_t ne1 = op->ne[1]; - - // TODO: find the optimal value - const int64_t min_batch = 32; - - return ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && - src1->type 
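// The remaining conditions of this check require src1 to be F32, all of
// ne0/ne1/ne10 to reach min_batch (32), and src0 to either be F32 or have a
// to_float conversion (i.e. quantized weights are fine). In practice this
// routes only reasonably large GEMMs to BLAS: a contiguous 512x512 F32 matrix
// product (ne0 = ne1 = ne10 = 512) qualifies, for example, while a
// matrix-vector product (ne1 == 1) fails the size test and stays on the CPU backend.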
== GGML_TYPE_F32 && - (ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch) && - (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL); - } - - case GGML_OP_OUT_PROD: - return op->src[0]->type == GGML_TYPE_F32 && - op->src[1]->type == GGML_TYPE_F32 && - ggml_is_matrix(src0) && - ggml_is_matrix(src1) && - ggml_is_contiguous(src0) && - (ggml_is_contiguous(src1) || ggml_is_transposed(src1)) && - (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL); - - default: - return false; - - } - - GGML_UNUSED(dev); -} - -static bool ggml_backend_blas_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - return ggml_backend_buft_is_host(buft); - - GGML_UNUSED(dev); -} - -static const struct ggml_backend_device_i ggml_backend_blas_device_i = { - /* .get_name = */ ggml_backend_blas_device_get_name, - /* .get_description = */ ggml_backend_blas_device_get_description, - /* .get_memory = */ ggml_backend_blas_device_get_memory, - /* .get_type = */ ggml_backend_blas_device_get_type, - /* .get_props = */ ggml_backend_blas_device_get_props, - /* .init_backend = */ ggml_backend_blas_device_init_backend, - /* .get_buffer_type = */ ggml_backend_blas_device_get_buffer_type, - /* .get_host_buffer_type = */ NULL, - /* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_host_ptr, - /* .supports_op = */ ggml_backend_blas_device_supports_op, - /* .supports_buft = */ ggml_backend_blas_device_supports_buft, - /* .offload_op = */ NULL, - /* .event_new = */ NULL, - /* .event_free = */ NULL, - /* .event_synchronize = */ NULL, -}; - -// backend reg interface - -static const char * ggml_backend_blas_reg_get_name(ggml_backend_reg_t reg) { - return "BLAS"; - - GGML_UNUSED(reg); -} - -static size_t ggml_backend_blas_reg_get_device_count(ggml_backend_reg_t reg) { - return 1; - - GGML_UNUSED(reg); -} - -static ggml_backend_dev_t ggml_backend_blas_reg_get_device(ggml_backend_reg_t reg, size_t index) { - GGML_ASSERT(index == 0); - - static ggml_backend_device ggml_backend_blas_device = { - /* .iface = */ ggml_backend_blas_device_i, - /* .reg = */ reg, - /* .context = */ nullptr, - }; - - return &ggml_backend_blas_device; - - GGML_UNUSED(reg); - GGML_UNUSED(index); -} - -static void * ggml_backend_blas_get_proc_address(ggml_backend_reg_t reg, const char * name) { - if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) { - return (void *)ggml_backend_blas_set_n_threads; - } - return NULL; - - GGML_UNUSED(reg); - GGML_UNUSED(name); -} - -static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = { - /* .get_name = */ ggml_backend_blas_reg_get_name, - /* .get_device_count = */ ggml_backend_blas_reg_get_device_count, - /* .get_device = */ ggml_backend_blas_reg_get_device, - /* .get_proc_address = */ ggml_backend_blas_get_proc_address, -}; - -ggml_backend_reg_t ggml_backend_blas_reg(void) { - static struct ggml_backend_reg ggml_backend_blas_reg = { - /* .api_version = */ GGML_BACKEND_API_VERSION, - /* .iface = */ ggml_backend_blas_reg_i, - /* .context = */ NULL, - }; - - return &ggml_backend_blas_reg; -} - -GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg) diff --git a/ggml/src/ggml-cann/CMakeLists.txt b/ggml/src/ggml-cann/CMakeLists.txt deleted file mode 100644 index 0d8e483b..00000000 --- a/ggml/src/ggml-cann/CMakeLists.txt +++ /dev/null @@ -1,74 +0,0 @@ -if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME}) - set(CANN_INSTALL_DIR $ENV{ASCEND_TOOLKIT_HOME}) - message(STATUS "CANN: updated 
CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}") -endif() - -# Auto-detech Soc type and Soc version, if detect failed, will abort build -set(SOC_VERSION "") -function(detect_ascend_soc_type SOC_VERSION) - execute_process( - COMMAND bash -c "npu-smi info|awk -F' ' 'NF > 0 && NR==7 {print $3}'" - OUTPUT_VARIABLE npu_info - RESULT_VARIABLE npu_result - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - if("${npu_info}" STREQUAL "" OR ${npu_result}) - message(FATAL_ERROR "Auto-detech ascend soc type failed, please specify manually or check ascend device working normally.") - endif() - set(${SOC_VERSION} "Ascend${npu_info}" PARENT_SCOPE) -endfunction() - -if(NOT SOC_TYPE) - detect_ascend_soc_type(SOC_VERSION) - set(SOC_TYPE "${SOC_VERSION}") - message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}") -endif() - -string(TOLOWER ${SOC_TYPE} SOC_VERSION) # SOC_VERSION need lower - -# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND_310P. -string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}") -set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}") -string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION) - -if (CANN_INSTALL_DIR) - # Only Support Linux. - if (NOT UNIX) - message(FATAL_ERROR "CANN: CANN toolkit supports unix but not ${CMAKE_SYSTEM_NAME}") - endif() - - # Supported platforms: x86-64, arm64 - if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") - elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64") - else() - message(FATAL_ERROR "CANN: CANN toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}") - endif() - - # Set header and libs - set(CANN_INCLUDE_DIRS - ${CANN_INSTALL_DIR}/include - ${CANN_INSTALL_DIR}/include/aclnn - ${CANN_INSTALL_DIR}/acllib/include - ) - - list(APPEND CANN_LIBRARIES - ascendcl - nnopbase - opapi - acl_op_compiler - ) - - file(GLOB GGML_SOURCES_CANN "*.cpp") - - ggml_add_backend_library(ggml-cann ${GGML_SOURCES_CANN}) - target_link_libraries(ggml-cann PRIVATE ${CANN_LIBRARIES}) - target_include_directories(ggml-cann PRIVATE ${CANN_INCLUDE_DIRS}) - target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64) - - target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}") - - message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}") - message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}") -else() - message(FATAL_ERROR "CANN: Can't find CANN_INSTALL_DIR, did you forget to source set_var.sh?") -endif() diff --git a/ggml/src/ggml-cann/Doxyfile b/ggml/src/ggml-cann/Doxyfile deleted file mode 100644 index 3290a485..00000000 --- a/ggml/src/ggml-cann/Doxyfile +++ /dev/null @@ -1,2579 +0,0 @@ -# Doxyfile 1.8.17 - -# This file describes the settings to be used by the documentation system -# doxygen (www.doxygen.org) for a project. -# -# All text after a double hash (##) is considered a comment and is placed in -# front of the TAG it is preceding. -# -# All text after a single hash (#) is considered a comment and will be ignored. -# The format is: -# TAG = value [value, ...] -# For lists, items can also be appended using: -# TAG += value [value, ...] -# Values that contain spaces should be placed between quotes (\" \"). 
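As a minimal illustration of the tag syntax this header describes, using tag names that appear later in this file (PROJECT_NAME, PROJECT_BRIEF and OUTPUT_DIRECTORY show the values this file actually sets; the abbreviated FILE_PATTERNS lines are only there to illustrate list values and the += append form):

PROJECT_NAME     = "ggml"
PROJECT_BRIEF    = "Tensor library for machine learning"
OUTPUT_DIRECTORY = docs
# list-valued tags take several values and can be extended with +=
FILE_PATTERNS    = *.c *.h
FILE_PATTERNS   += *.cpp *.hpp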
- -#--------------------------------------------------------------------------- -# Project related configuration options -#--------------------------------------------------------------------------- - -# This tag specifies the encoding used for all characters in the configuration -# file that follow. The default is UTF-8 which is also the encoding used for all -# text before the first occurrence of this tag. Doxygen uses libiconv (or the -# iconv built into libc) for the transcoding. See -# https://www.gnu.org/software/libiconv/ for the list of possible encodings. -# The default value is: UTF-8. - -DOXYFILE_ENCODING = UTF-8 - -# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by -# double-quotes, unless you are using Doxywizard) that should identify the -# project for which the documentation is generated. This name is used in the -# title of most generated pages and in a few other places. -# The default value is: My Project. - -PROJECT_NAME = "ggml" - -# The PROJECT_NUMBER tag can be used to enter a project or revision number. This -# could be handy for archiving the generated documentation or if some version -# control system is used. - -PROJECT_NUMBER = - -# Using the PROJECT_BRIEF tag one can provide an optional one line description -# for a project that appears at the top of each page and should give viewer a -# quick idea about the purpose of the project. Keep the description short. - -PROJECT_BRIEF = "Tensor library for machine learning" - -# With the PROJECT_LOGO tag one can specify a logo or an icon that is included -# in the documentation. The maximum height of the logo should not exceed 55 -# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy -# the logo to the output directory. - -PROJECT_LOGO = - -# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path -# into which the generated documentation will be written. If a relative path is -# entered, it will be relative to the location where doxygen was started. If -# left blank the current directory will be used. - -OUTPUT_DIRECTORY = docs - -# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- -# directories (in 2 levels) under the output directory of each output format and -# will distribute the generated files over these directories. Enabling this -# option can be useful when feeding doxygen a huge amount of source files, where -# putting all generated files in the same directory would otherwise causes -# performance problems for the file system. -# The default value is: NO. - -CREATE_SUBDIRS = NO - -# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII -# characters to appear in the names of generated files. If set to NO, non-ASCII -# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode -# U+3044. -# The default value is: NO. - -ALLOW_UNICODE_NAMES = NO - -# The OUTPUT_LANGUAGE tag is used to specify the language in which all -# documentation generated by doxygen is written. Doxygen will use this -# information to generate all constant output in the proper language. 
-# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, -# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), -# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, -# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), -# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, -# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, -# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, -# Ukrainian and Vietnamese. -# The default value is: English. - -OUTPUT_LANGUAGE = English - -# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all -# documentation generated by doxygen is written. Doxygen will use this -# information to generate all generated output in the proper direction. -# Possible values are: None, LTR, RTL and Context. -# The default value is: None. - -OUTPUT_TEXT_DIRECTION = None - -# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member -# descriptions after the members that are listed in the file and class -# documentation (similar to Javadoc). Set to NO to disable this. -# The default value is: YES. - -BRIEF_MEMBER_DESC = YES - -# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief -# description of a member or function before the detailed description -# -# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the -# brief descriptions will be completely suppressed. -# The default value is: YES. - -REPEAT_BRIEF = YES - -# This tag implements a quasi-intelligent brief description abbreviator that is -# used to form the text in various listings. Each string in this list, if found -# as the leading text of the brief description, will be stripped from the text -# and the result, after processing the whole list, is used as the annotated -# text. Otherwise, the brief description is used as-is. If left blank, the -# following values are used ($name is automatically replaced with the name of -# the entity):The $name class, The $name widget, The $name file, is, provides, -# specifies, contains, represents, a, an and the. - -ABBREVIATE_BRIEF = "The $name class" \ - "The $name widget" \ - "The $name file" \ - is \ - provides \ - specifies \ - contains \ - represents \ - a \ - an \ - the - -# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then -# doxygen will generate a detailed section even if there is only a brief -# description. -# The default value is: NO. - -ALWAYS_DETAILED_SEC = NO - -# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all -# inherited members of a class in the documentation of that class as if those -# members were ordinary class members. Constructors, destructors and assignment -# operators of the base classes will not be shown. -# The default value is: NO. - -INLINE_INHERITED_MEMB = NO - -# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path -# before files name in the file list and in the header files. If set to NO the -# shortest path that makes the file name unique will be used -# The default value is: YES. - -FULL_PATH_NAMES = YES - -# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. -# Stripping is only done if one of the specified strings matches the left-hand -# part of the path. The tag can be used to show relative paths in the file list. -# If left blank the directory from which doxygen is run is used as the path to -# strip. 
-# -# Note that you can specify absolute paths here, but also relative paths, which -# will be relative from the directory where doxygen is started. -# This tag requires that the tag FULL_PATH_NAMES is set to YES. - -STRIP_FROM_PATH = - -# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the -# path mentioned in the documentation of a class, which tells the reader which -# header file to include in order to use a class. If left blank only the name of -# the header file containing the class definition is used. Otherwise one should -# specify the list of include paths that are normally passed to the compiler -# using the -I flag. - -STRIP_FROM_INC_PATH = - -# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but -# less readable) file names. This can be useful is your file systems doesn't -# support long names like on DOS, Mac, or CD-ROM. -# The default value is: NO. - -SHORT_NAMES = NO - -# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the -# first line (until the first dot) of a Javadoc-style comment as the brief -# description. If set to NO, the Javadoc-style will behave just like regular Qt- -# style comments (thus requiring an explicit @brief command for a brief -# description.) -# The default value is: NO. - -JAVADOC_AUTOBRIEF = NO - -# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line -# such as -# /*************** -# as being the beginning of a Javadoc-style comment "banner". If set to NO, the -# Javadoc-style will behave just like regular comments and it will not be -# interpreted by doxygen. -# The default value is: NO. - -JAVADOC_BANNER = NO - -# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first -# line (until the first dot) of a Qt-style comment as the brief description. If -# set to NO, the Qt-style will behave just like regular Qt-style comments (thus -# requiring an explicit \brief command for a brief description.) -# The default value is: NO. - -QT_AUTOBRIEF = NO - -# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a -# multi-line C++ special comment block (i.e. a block of //! or /// comments) as -# a brief description. This used to be the default behavior. The new default is -# to treat a multi-line C++ comment block as a detailed description. Set this -# tag to YES if you prefer the old behavior instead. -# -# Note that setting this tag to YES also means that rational rose comments are -# not recognized any more. -# The default value is: NO. - -MULTILINE_CPP_IS_BRIEF = NO - -# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the -# documentation from any documented member that it re-implements. -# The default value is: YES. - -INHERIT_DOCS = YES - -# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new -# page for each member. If set to NO, the documentation of a member will be part -# of the file/class/namespace that contains it. -# The default value is: NO. - -SEPARATE_MEMBER_PAGES = NO - -# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen -# uses this value to replace tabs by spaces in code fragments. -# Minimum value: 1, maximum value: 16, default value: 4. - -TAB_SIZE = 4 - -# This tag can be used to specify a number of aliases that act as commands in -# the documentation. 
An alias has the form: -# name=value -# For example adding -# "sideeffect=@par Side Effects:\n" -# will allow you to put the command \sideeffect (or @sideeffect) in the -# documentation, which will result in a user-defined paragraph with heading -# "Side Effects:". You can put \n's in the value part of an alias to insert -# newlines (in the resulting output). You can put ^^ in the value part of an -# alias to insert a newline as if a physical newline was in the original file. -# When you need a literal { or } or , in the value part of an alias you have to -# escape them by means of a backslash (\), this can lead to conflicts with the -# commands \{ and \} for these it is advised to use the version @{ and @} or use -# a double escape (\\{ and \\}) - -ALIASES = - -# This tag can be used to specify a number of word-keyword mappings (TCL only). -# A mapping has the form "name=value". For example adding "class=itcl::class" -# will allow you to use the command class in the itcl::class meaning. - -TCL_SUBST = - -# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources -# only. Doxygen will then generate output that is more tailored for C. For -# instance, some of the names that are used will be different. The list of all -# members will be omitted, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_FOR_C = NO - -# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or -# Python sources only. Doxygen will then generate output that is more tailored -# for that language. For instance, namespaces will be presented as packages, -# qualified scopes will look different, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_JAVA = NO - -# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran -# sources. Doxygen will then generate output that is tailored for Fortran. -# The default value is: NO. - -OPTIMIZE_FOR_FORTRAN = NO - -# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL -# sources. Doxygen will then generate output that is tailored for VHDL. -# The default value is: NO. - -OPTIMIZE_OUTPUT_VHDL = NO - -# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice -# sources only. Doxygen will then generate output that is more tailored for that -# language. For instance, namespaces will be presented as modules, types will be -# separated into more groups, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_SLICE = NO - -# Doxygen selects the parser to use depending on the extension of the files it -# parses. With this tag you can assign which parser to use for a given -# extension. Doxygen has a built-in mapping, but you can override or extend it -# using this tag. The format is ext=language, where ext is a file extension, and -# language is one of the parsers supported by doxygen: IDL, Java, JavaScript, -# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, -# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: -# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser -# tries to guess whether the code is fixed or free formatted code, this is the -# default for Fortran type files), VHDL, tcl. For instance to make doxygen treat -# .inc files as Fortran files (default is PHP), and .f files as C (default is -# Fortran), use: inc=Fortran f=C. -# -# Note: For files without extension you can use no_extension as a placeholder. 
-# -# Note that for custom extensions you also need to set FILE_PATTERNS otherwise -# the files are not read by doxygen. - -EXTENSION_MAPPING = - -# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments -# according to the Markdown format, which allows for more readable -# documentation. See https://daringfireball.net/projects/markdown/ for details. -# The output of markdown processing is further processed by doxygen, so you can -# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in -# case of backward compatibilities issues. -# The default value is: YES. - -MARKDOWN_SUPPORT = YES - -# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up -# to that level are automatically included in the table of contents, even if -# they do not have an id attribute. -# Note: This feature currently applies only to Markdown headings. -# Minimum value: 0, maximum value: 99, default value: 5. -# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. - -TOC_INCLUDE_HEADINGS = 5 - -# When enabled doxygen tries to link words that correspond to documented -# classes, or namespaces to their corresponding documentation. Such a link can -# be prevented in individual cases by putting a % sign in front of the word or -# globally by setting AUTOLINK_SUPPORT to NO. -# The default value is: YES. - -AUTOLINK_SUPPORT = YES - -# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want -# to include (a tag file for) the STL sources as input, then you should set this -# tag to YES in order to let doxygen match functions declarations and -# definitions whose arguments contain STL classes (e.g. func(std::string); -# versus func(std::string) {}). This also make the inheritance and collaboration -# diagrams that involve STL classes more complete and accurate. -# The default value is: NO. - -BUILTIN_STL_SUPPORT = NO - -# If you use Microsoft's C++/CLI language, you should set this option to YES to -# enable parsing support. -# The default value is: NO. - -CPP_CLI_SUPPORT = NO - -# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen -# will parse them like normal C++ but will assume all classes use public instead -# of private inheritance when no explicit protection keyword is present. -# The default value is: NO. - -SIP_SUPPORT = NO - -# For Microsoft's IDL there are propget and propput attributes to indicate -# getter and setter methods for a property. Setting this option to YES will make -# doxygen to replace the get and set methods by a property in the documentation. -# This will only work if the methods are indeed getting or setting a simple -# type. If this is not the case, or you want to show the methods anyway, you -# should set this option to NO. -# The default value is: YES. - -IDL_PROPERTY_SUPPORT = YES - -# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC -# tag is set to YES then doxygen will reuse the documentation of the first -# member in the group (if any) for the other members of the group. By default -# all members of a group must be documented explicitly. -# The default value is: NO. - -DISTRIBUTE_GROUP_DOC = NO - -# If one adds a struct or class to a group and this option is enabled, then also -# any nested class or struct is added to the same group. By default this option -# is disabled and one has to add nested compounds explicitly via \ingroup. -# The default value is: NO. 
- -GROUP_NESTED_COMPOUNDS = NO - -# Set the SUBGROUPING tag to YES to allow class member groups of the same type -# (for instance a group of public functions) to be put as a subgroup of that -# type (e.g. under the Public Functions section). Set it to NO to prevent -# subgrouping. Alternatively, this can be done per class using the -# \nosubgrouping command. -# The default value is: YES. - -SUBGROUPING = YES - -# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions -# are shown inside the group in which they are included (e.g. using \ingroup) -# instead of on a separate page (for HTML and Man pages) or section (for LaTeX -# and RTF). -# -# Note that this feature does not work in combination with -# SEPARATE_MEMBER_PAGES. -# The default value is: NO. - -INLINE_GROUPED_CLASSES = NO - -# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions -# with only public data fields or simple typedef fields will be shown inline in -# the documentation of the scope in which they are defined (i.e. file, -# namespace, or group documentation), provided this scope is documented. If set -# to NO, structs, classes, and unions are shown on a separate page (for HTML and -# Man pages) or section (for LaTeX and RTF). -# The default value is: NO. - -INLINE_SIMPLE_STRUCTS = NO - -# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or -# enum is documented as struct, union, or enum with the name of the typedef. So -# typedef struct TypeS {} TypeT, will appear in the documentation as a struct -# with name TypeT. When disabled the typedef will appear as a member of a file, -# namespace, or class. And the struct will be named TypeS. This can typically be -# useful for C code in case the coding convention dictates that all compound -# types are typedef'ed and only the typedef is referenced, never the tag name. -# The default value is: NO. - -TYPEDEF_HIDES_STRUCT = NO - -# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This -# cache is used to resolve symbols given their name and scope. Since this can be -# an expensive process and often the same symbol appears multiple times in the -# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small -# doxygen will become slower. If the cache is too large, memory is wasted. The -# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range -# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 -# symbols. At the end of a run doxygen will report the cache usage and suggest -# the optimal cache size from a speed point of view. -# Minimum value: 0, maximum value: 9, default value: 0. - -LOOKUP_CACHE_SIZE = 0 - -#--------------------------------------------------------------------------- -# Build related configuration options -#--------------------------------------------------------------------------- - -# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in -# documentation are documented, even if no documentation was available. Private -# class members and static file members will be hidden unless the -# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. -# Note: This will also disable the warnings about undocumented members that are -# normally produced when WARNINGS is set to YES. -# The default value is: NO. - -EXTRACT_ALL = YES - -# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will -# be included in the documentation. -# The default value is: NO. 
- -EXTRACT_PRIVATE = YES - -# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual -# methods of a class will be included in the documentation. -# The default value is: NO. - -EXTRACT_PRIV_VIRTUAL = YES - -# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal -# scope will be included in the documentation. -# The default value is: NO. - -EXTRACT_PACKAGE = YES - -# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be -# included in the documentation. -# The default value is: NO. - -EXTRACT_STATIC = YES - -# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined -# locally in source files will be included in the documentation. If set to NO, -# only classes defined in header files are included. Does not have any effect -# for Java sources. -# The default value is: YES. - -EXTRACT_LOCAL_CLASSES = YES - -# This flag is only useful for Objective-C code. If set to YES, local methods, -# which are defined in the implementation section but not in the interface are -# included in the documentation. If set to NO, only methods in the interface are -# included. -# The default value is: NO. - -EXTRACT_LOCAL_METHODS = YES - -# If this flag is set to YES, the members of anonymous namespaces will be -# extracted and appear in the documentation as a namespace called -# 'anonymous_namespace{file}', where file will be replaced with the base name of -# the file that contains the anonymous namespace. By default anonymous namespace -# are hidden. -# The default value is: NO. - -EXTRACT_ANON_NSPACES = NO - -# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all -# undocumented members inside documented classes or files. If set to NO these -# members will be included in the various overviews, but no documentation -# section is generated. This option has no effect if EXTRACT_ALL is enabled. -# The default value is: NO. - -HIDE_UNDOC_MEMBERS = NO - -# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all -# undocumented classes that are normally visible in the class hierarchy. If set -# to NO, these classes will be included in the various overviews. This option -# has no effect if EXTRACT_ALL is enabled. -# The default value is: NO. - -HIDE_UNDOC_CLASSES = NO - -# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend -# declarations. If set to NO, these declarations will be included in the -# documentation. -# The default value is: NO. - -HIDE_FRIEND_COMPOUNDS = NO - -# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any -# documentation blocks found inside the body of a function. If set to NO, these -# blocks will be appended to the function's detailed documentation block. -# The default value is: NO. - -HIDE_IN_BODY_DOCS = NO - -# The INTERNAL_DOCS tag determines if documentation that is typed after a -# \internal command is included. If the tag is set to NO then the documentation -# will be excluded. Set it to YES to include the internal documentation. -# The default value is: NO. - -INTERNAL_DOCS = NO - -# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file -# names in lower-case letters. If set to YES, upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows -# (including Cygwin) ands Mac users are advised to set this option to NO. -# The default value is: system dependent. 
- -CASE_SENSE_NAMES = YES - -# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with -# their full class and namespace scopes in the documentation. If set to YES, the -# scope will be hidden. -# The default value is: NO. - -HIDE_SCOPE_NAMES = NO - -# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will -# append additional text to a page's title, such as Class Reference. If set to -# YES the compound reference will be hidden. -# The default value is: NO. - -HIDE_COMPOUND_REFERENCE= NO - -# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of -# the files that are included by a file in the documentation of that file. -# The default value is: YES. - -SHOW_INCLUDE_FILES = YES - -# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each -# grouped member an include statement to the documentation, telling the reader -# which file to include in order to use the member. -# The default value is: NO. - -SHOW_GROUPED_MEMB_INC = NO - -# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include -# files with double quotes in the documentation rather than with sharp brackets. -# The default value is: NO. - -FORCE_LOCAL_INCLUDES = NO - -# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the -# documentation for inline members. -# The default value is: YES. - -INLINE_INFO = YES - -# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the -# (detailed) documentation of file and class members alphabetically by member -# name. If set to NO, the members will appear in declaration order. -# The default value is: YES. - -SORT_MEMBER_DOCS = YES - -# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief -# descriptions of file, namespace and class members alphabetically by member -# name. If set to NO, the members will appear in declaration order. Note that -# this will also influence the order of the classes in the class list. -# The default value is: NO. - -SORT_BRIEF_DOCS = NO - -# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the -# (brief and detailed) documentation of class members so that constructors and -# destructors are listed first. If set to NO the constructors will appear in the -# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. -# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief -# member documentation. -# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting -# detailed member documentation. -# The default value is: NO. - -SORT_MEMBERS_CTORS_1ST = NO - -# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy -# of group names into alphabetical order. If set to NO the group names will -# appear in their defined order. -# The default value is: NO. - -SORT_GROUP_NAMES = NO - -# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by -# fully-qualified names, including namespaces. If set to NO, the class list will -# be sorted only by class name, not including the namespace part. -# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. -# Note: This option applies only to the class list, not to the alphabetical -# list. -# The default value is: NO. 
- -SORT_BY_SCOPE_NAME = NO - -# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper -# type resolution of all parameters of a function it will reject a match between -# the prototype and the implementation of a member function even if there is -# only one candidate or it is obvious which candidate to choose by doing a -# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still -# accept a match between prototype and implementation in such cases. -# The default value is: NO. - -STRICT_PROTO_MATCHING = NO - -# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo -# list. This list is created by putting \todo commands in the documentation. -# The default value is: YES. - -GENERATE_TODOLIST = YES - -# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test -# list. This list is created by putting \test commands in the documentation. -# The default value is: YES. - -GENERATE_TESTLIST = YES - -# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug -# list. This list is created by putting \bug commands in the documentation. -# The default value is: YES. - -GENERATE_BUGLIST = YES - -# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) -# the deprecated list. This list is created by putting \deprecated commands in -# the documentation. -# The default value is: YES. - -GENERATE_DEPRECATEDLIST= YES - -# The ENABLED_SECTIONS tag can be used to enable conditional documentation -# sections, marked by \if ... \endif and \cond -# ... \endcond blocks. - -ENABLED_SECTIONS = - -# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the -# initial value of a variable or macro / define can have for it to appear in the -# documentation. If the initializer consists of more lines than specified here -# it will be hidden. Use a value of 0 to hide initializers completely. The -# appearance of the value of individual variables and macros / defines can be -# controlled using \showinitializer or \hideinitializer command in the -# documentation regardless of this setting. -# Minimum value: 0, maximum value: 10000, default value: 30. - -MAX_INITIALIZER_LINES = 30 - -# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at -# the bottom of the documentation of classes and structs. If set to YES, the -# list will mention the files that were used to generate the documentation. -# The default value is: YES. - -SHOW_USED_FILES = YES - -# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This -# will remove the Files entry from the Quick Index and from the Folder Tree View -# (if specified). -# The default value is: YES. - -SHOW_FILES = YES - -# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces -# page. This will remove the Namespaces entry from the Quick Index and from the -# Folder Tree View (if specified). -# The default value is: YES. - -SHOW_NAMESPACES = YES - -# The FILE_VERSION_FILTER tag can be used to specify a program or script that -# doxygen should invoke to get the current version for each file (typically from -# the version control system). Doxygen will invoke the program by executing (via -# popen()) the command command input-file, where command is the value of the -# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided -# by doxygen. Whatever the program writes to standard output is used as the file -# version. For an example see the documentation. 
- -FILE_VERSION_FILTER = - -# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed -# by doxygen. The layout file controls the global structure of the generated -# output files in an output format independent way. To create the layout file -# that represents doxygen's defaults, run doxygen with the -l option. You can -# optionally specify a file name after the option, if omitted DoxygenLayout.xml -# will be used as the name of the layout file. -# -# Note that if you run doxygen from a directory containing a file called -# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE -# tag is left empty. - -LAYOUT_FILE = - -# The CITE_BIB_FILES tag can be used to specify one or more bib files containing -# the reference definitions. This must be a list of .bib files. The .bib -# extension is automatically appended if omitted. This requires the bibtex tool -# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. -# For LaTeX the style of the bibliography can be controlled using -# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the -# search path. See also \cite for info how to create references. - -CITE_BIB_FILES = - -#--------------------------------------------------------------------------- -# Configuration options related to warning and progress messages -#--------------------------------------------------------------------------- - -# The QUIET tag can be used to turn on/off the messages that are generated to -# standard output by doxygen. If QUIET is set to YES this implies that the -# messages are off. -# The default value is: NO. - -QUIET = NO - -# The WARNINGS tag can be used to turn on/off the warning messages that are -# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES -# this implies that the warnings are on. -# -# Tip: Turn warnings on while writing the documentation. -# The default value is: YES. - -WARNINGS = YES - -# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate -# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag -# will automatically be disabled. -# The default value is: YES. - -WARN_IF_UNDOCUMENTED = YES - -# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for -# potential errors in the documentation, such as not documenting some parameters -# in a documented function, or documenting parameters that don't exist or using -# markup commands wrongly. -# The default value is: YES. - -WARN_IF_DOC_ERROR = YES - -# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that -# are documented, but have no documentation for their parameters or return -# value. If set to NO, doxygen will only warn about wrong or incomplete -# parameter documentation, but not about the absence of documentation. If -# EXTRACT_ALL is set to YES then this flag will automatically be disabled. -# The default value is: NO. - -WARN_NO_PARAMDOC = NO - -# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when -# a warning is encountered. -# The default value is: NO. - -WARN_AS_ERROR = NO - -# The WARN_FORMAT tag determines the format of the warning messages that doxygen -# can produce. The string should contain the $file, $line, and $text tags, which -# will be replaced by the file and line number from which the warning originated -# and the warning text. 
Optionally the format may contain $version, which will -# be replaced by the version of the file (if it could be obtained via -# FILE_VERSION_FILTER) -# The default value is: $file:$line: $text. - -WARN_FORMAT = "$file:$line: $text" - -# The WARN_LOGFILE tag can be used to specify a file to which warning and error -# messages should be written. If left blank the output is written to standard -# error (stderr). - -WARN_LOGFILE = - -#--------------------------------------------------------------------------- -# Configuration options related to the input files -#--------------------------------------------------------------------------- - -# The INPUT tag is used to specify the files and/or directories that contain -# documented source files. You may enter file names like myfile.cpp or -# directories like /usr/src/myproject. Separate the files or directories with -# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING -# Note: If this tag is empty the current directory is searched. - -INPUT = - -# This tag can be used to specify the character encoding of the source files -# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses -# libiconv (or the iconv built into libc) for the transcoding. See the libiconv -# documentation (see: https://www.gnu.org/software/libiconv/) for the list of -# possible encodings. -# The default value is: UTF-8. - -INPUT_ENCODING = UTF-8 - -# If the value of the INPUT tag contains directories, you can use the -# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and -# *.h) to filter out the source-files in the directories. -# -# Note that for custom extensions or not directly supported extensions you also -# need to set EXTENSION_MAPPING for the extension otherwise the files are not -# read by doxygen. -# -# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, -# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, -# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, -# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment), -# *.doc (to be provided as doxygen C comment), *.txt (to be provided as doxygen -# C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f, *.for, *.tcl, *.vhd, -# *.vhdl, *.ucf, *.qsf and *.ice. - -FILE_PATTERNS = *.c \ - *.cc \ - *.cxx \ - *.cpp \ - *.c++ \ - *.java \ - *.ii \ - *.ixx \ - *.ipp \ - *.i++ \ - *.inl \ - *.idl \ - *.ddl \ - *.odl \ - *.h \ - *.hh \ - *.hxx \ - *.hpp \ - *.h++ \ - *.cs \ - *.d \ - *.php \ - *.php4 \ - *.php5 \ - *.phtml \ - *.inc \ - *.m \ - *.markdown \ - *.md \ - *.mm \ - *.dox \ - *.doc \ - *.txt \ - *.py \ - *.pyw \ - *.f90 \ - *.f95 \ - *.f03 \ - *.f08 \ - *.f \ - *.for \ - *.tcl \ - *.vhd \ - *.vhdl \ - *.ucf \ - *.qsf \ - *.ice - -# The RECURSIVE tag can be used to specify whether or not subdirectories should -# be searched for input files as well. -# The default value is: NO. - -RECURSIVE = YES - -# The EXCLUDE tag can be used to specify files and/or directories that should be -# excluded from the INPUT source files. This way you can easily exclude a -# subdirectory from a directory tree whose root is specified with the INPUT tag. -# -# Note that relative paths are relative to the directory from which doxygen is -# run. - -EXCLUDE = - -# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or -# directories that are symbolic links (a Unix file system feature) are excluded -# from the input. -# The default value is: NO. 
- -EXCLUDE_SYMLINKS = NO - -# If the value of the INPUT tag contains directories, you can use the -# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude -# certain files from those directories. -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories for example use the pattern */test/* - -EXCLUDE_PATTERNS = - -# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names -# (namespaces, classes, functions, etc.) that should be excluded from the -# output. The symbol name can be a fully qualified name, a word, or if the -# wildcard * is used, a substring. Examples: ANamespace, AClass, -# AClass::ANamespace, ANamespace::*Test -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories use the pattern */test/* - -EXCLUDE_SYMBOLS = - -# The EXAMPLE_PATH tag can be used to specify one or more files or directories -# that contain example code fragments that are included (see the \include -# command). - -EXAMPLE_PATH = - -# If the value of the EXAMPLE_PATH tag contains directories, you can use the -# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and -# *.h) to filter out the source-files in the directories. If left blank all -# files are included. - -EXAMPLE_PATTERNS = * - -# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be -# searched for input files to be used with the \include or \dontinclude commands -# irrespective of the value of the RECURSIVE tag. -# The default value is: NO. - -EXAMPLE_RECURSIVE = NO - -# The IMAGE_PATH tag can be used to specify one or more files or directories -# that contain images that are to be included in the documentation (see the -# \image command). - -IMAGE_PATH = - -# The INPUT_FILTER tag can be used to specify a program that doxygen should -# invoke to filter for each input file. Doxygen will invoke the filter program -# by executing (via popen()) the command: -# -# <filter> <input-file> -# -# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the -# name of an input file. Doxygen will then use the output that the filter -# program writes to standard output. If FILTER_PATTERNS is specified, this tag -# will be ignored. -# -# Note that the filter must not add or remove lines; it is applied before the -# code is scanned, but not when the output code is generated. If lines are added -# or removed, the anchors will not be placed correctly. -# -# Note that for custom extensions or not directly supported extensions you also -# need to set EXTENSION_MAPPING for the extension otherwise the files are not -# properly processed by doxygen. - -INPUT_FILTER = - -# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern -# basis. Doxygen will compare the file name with each pattern and apply the -# filter if there is a match. The filters are a list of the form: pattern=filter -# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how -# filters are used. If the FILTER_PATTERNS tag is empty or if none of the -# patterns match the file name, INPUT_FILTER is applied. -# -# Note that for custom extensions or not directly supported extensions you also -# need to set EXTENSION_MAPPING for the extension otherwise the files are not -# properly processed by doxygen. 
- -FILTER_PATTERNS = - -# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using -# INPUT_FILTER) will also be used to filter the input files that are used for -# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). -# The default value is: NO. - -FILTER_SOURCE_FILES = NO - -# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file -# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and -# it is also possible to disable source filtering for a specific pattern using -# *.ext= (so without naming a filter). -# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. - -FILTER_SOURCE_PATTERNS = - -# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that -# is part of the input, its contents will be placed on the main page -# (index.html). This can be useful if you have a project on for instance GitHub -# and want to reuse the introduction page also for the doxygen output. - -USE_MDFILE_AS_MAINPAGE = - -#--------------------------------------------------------------------------- -# Configuration options related to source browsing -#--------------------------------------------------------------------------- - -# If the SOURCE_BROWSER tag is set to YES then a list of source files will be -# generated. Documented entities will be cross-referenced with these sources. -# -# Note: To get rid of all source code in the generated output, make sure that -# also VERBATIM_HEADERS is set to NO. -# The default value is: NO. - -SOURCE_BROWSER = NO - -# Setting the INLINE_SOURCES tag to YES will include the body of functions, -# classes and enums directly into the documentation. -# The default value is: NO. - -INLINE_SOURCES = NO - -# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any -# special comment blocks from generated source code fragments. Normal C, C++ and -# Fortran comments will always remain visible. -# The default value is: YES. - -STRIP_CODE_COMMENTS = YES - -# If the REFERENCED_BY_RELATION tag is set to YES then for each documented -# entity all documented functions referencing it will be listed. -# The default value is: NO. - -REFERENCED_BY_RELATION = NO - -# If the REFERENCES_RELATION tag is set to YES then for each documented function -# all documented entities called/used by that function will be listed. -# The default value is: NO. - -REFERENCES_RELATION = NO - -# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set -# to YES then the hyperlinks from functions in REFERENCES_RELATION and -# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will -# link to the documentation. -# The default value is: YES. - -REFERENCES_LINK_SOURCE = YES - -# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the -# source code will show a tooltip with additional information such as prototype, -# brief description and links to the definition and documentation. Since this -# will make the HTML file larger and loading of large files a bit slower, you -# can opt to disable this feature. -# The default value is: YES. -# This tag requires that the tag SOURCE_BROWSER is set to YES. - -SOURCE_TOOLTIPS = YES - -# If the USE_HTAGS tag is set to YES then the references to source code will -# point to the HTML generated by the htags(1) tool instead of doxygen built-in -# source browser. The htags tool is part of GNU's global source tagging system -# (see https://www.gnu.org/software/global/global.html). 
You will need version -# 4.8.6 or higher. -# -# To use it do the following: -# - Install the latest version of global -# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file -# - Make sure the INPUT points to the root of the source tree -# - Run doxygen as normal -# -# Doxygen will invoke htags (and that will in turn invoke gtags), so these -# tools must be available from the command line (i.e. in the search path). -# -# The result: instead of the source browser generated by doxygen, the links to -# source code will now point to the output of htags. -# The default value is: NO. -# This tag requires that the tag SOURCE_BROWSER is set to YES. - -USE_HTAGS = NO - -# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a -# verbatim copy of the header file for each class for which an include is -# specified. Set to NO to disable this. -# See also: Section \class. -# The default value is: YES. - -VERBATIM_HEADERS = YES - -# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the -# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the -# cost of reduced performance. This can be particularly helpful with template -# rich C++ code for which doxygen's built-in parser lacks the necessary type -# information. -# Note: The availability of this option depends on whether or not doxygen was -# generated with the -Duse_libclang=ON option for CMake. -# The default value is: NO. - -CLANG_ASSISTED_PARSING = NO - -# If clang assisted parsing is enabled you can provide the compiler with command -# line options that you would normally use when invoking the compiler. Note that -# the include paths will already be set by doxygen for the files and directories -# specified with INPUT and INCLUDE_PATH. -# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. - -CLANG_OPTIONS = - -# If clang assisted parsing is enabled you can provide the clang parser with the -# path to the compilation database (see: -# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files -# were built. This is equivalent to specifying the "-p" option to a clang tool, -# such as clang-check. These options will then be passed to the parser. -# Note: The availability of this option depends on whether or not doxygen was -# generated with the -Duse_libclang=ON option for CMake. - -CLANG_DATABASE_PATH = - -#--------------------------------------------------------------------------- -# Configuration options related to the alphabetical class index -#--------------------------------------------------------------------------- - -# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all -# compounds will be generated. Enable this if the project contains a lot of -# classes, structs, unions or interfaces. -# The default value is: YES. - -ALPHABETICAL_INDEX = YES - -# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in -# which the alphabetical index list will be split. -# Minimum value: 1, maximum value: 20, default value: 5. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. - -COLS_IN_ALPHA_INDEX = 5 - -# In case all classes in a project start with a common prefix, all classes will -# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag -# can be used to specify a prefix (or a list of prefixes) that should be ignored -# while generating the index headers. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. 
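Since virtually every public symbol in ggml carries the ggml_ prefix, a typical use of this tag would be the following illustrative line (this file leaves the tag blank below):

IGNORE_PREFIX = ggml_

which would, for instance, list a struct such as ggml_backend_blas_context under "B" in the alphabetical index instead of grouping every compound under "G".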
- -IGNORE_PREFIX = - -#--------------------------------------------------------------------------- -# Configuration options related to the HTML output -#--------------------------------------------------------------------------- - -# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output -# The default value is: YES. - -GENERATE_HTML = YES - -# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a -# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of -# it. -# The default directory is: html. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_OUTPUT = html - -# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each -# generated HTML page (for example: .htm, .php, .asp). -# The default value is: .html. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_FILE_EXTENSION = .html - -# The HTML_HEADER tag can be used to specify a user-defined HTML header file for -# each generated HTML page. If the tag is left blank doxygen will generate a -# standard header. -# -# To get valid HTML the header file that includes any scripts and style sheets -# that doxygen needs, which is dependent on the configuration options used (e.g. -# the setting GENERATE_TREEVIEW). It is highly recommended to start with a -# default header using -# doxygen -w html new_header.html new_footer.html new_stylesheet.css -# YourConfigFile -# and then modify the file new_header.html. See also section "Doxygen usage" -# for information on how to generate the default header that doxygen normally -# uses. -# Note: The header is subject to change so you typically have to regenerate the -# default header when upgrading to a newer version of doxygen. For a description -# of the possible markers and block names see the documentation. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_HEADER = - -# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each -# generated HTML page. If the tag is left blank doxygen will generate a standard -# footer. See HTML_HEADER for more information on how to generate a default -# footer and what special commands can be used inside the footer. See also -# section "Doxygen usage" for information on how to generate the default footer -# that doxygen normally uses. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_FOOTER = - -# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style -# sheet that is used by each HTML page. It can be used to fine-tune the look of -# the HTML output. If left blank doxygen will generate a default style sheet. -# See also section "Doxygen usage" for information on how to generate the style -# sheet that doxygen normally uses. -# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as -# it is more robust and this tag (HTML_STYLESHEET) will in the future become -# obsolete. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_STYLESHEET = - -# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined -# cascading style sheets that are included after the standard style sheets -# created by doxygen. Using this option one can overrule certain style aspects. -# This is preferred over using HTML_STYLESHEET since it does not replace the -# standard style sheet and is therefore more robust against future updates. -# Doxygen will copy the style sheet files to the output directory. 
-# Note: The order of the extra style sheet files is of importance (e.g. the last -# style sheet in the list overrules the setting of the previous ones in the -# list). For an example see the documentation. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_EXTRA_STYLESHEET = - -# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or -# other source files which should be copied to the HTML output directory. Note -# that these files will be copied to the base HTML output directory. Use the -# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these -# files. In the HTML_STYLESHEET file, use the file name only. Also note that the -# files will be copied as-is; there are no commands or markers available. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_EXTRA_FILES = - -# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen -# will adjust the colors in the style sheet and background images according to -# this color. Hue is specified as an angle on a colorwheel, see -# https://en.wikipedia.org/wiki/Hue for more information. For instance the value -# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 -# purple, and 360 is red again. -# Minimum value: 0, maximum value: 359, default value: 220. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_HUE = 220 - -# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors -# in the HTML output. For a value of 0 the output will use grayscales only. A -# value of 255 will produce the most vivid colors. -# Minimum value: 0, maximum value: 255, default value: 100. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_SAT = 100 - -# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the -# luminance component of the colors in the HTML output. Values below 100 -# gradually make the output lighter, whereas values above 100 make the output -# darker. The value divided by 100 is the actual gamma applied, so 80 represents -# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not -# change the gamma. -# Minimum value: 40, maximum value: 240, default value: 80. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_GAMMA = 80 - -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. Setting this -# to YES can help to show when doxygen was last run and thus if the -# documentation is up to date. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_TIMESTAMP = NO - -# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML -# documentation will contain a main index with vertical navigation menus that -# are dynamically created via JavaScript. If disabled, the navigation index will -# consists of multiple levels of tabs that are statically embedded in every HTML -# page. Disable this option to support browsers that do not have JavaScript, -# like the Qt help browser. -# The default value is: YES. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_DYNAMIC_MENUS = YES - -# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML -# documentation will contain sections that can be hidden and shown after the -# page has loaded. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. 
- -HTML_DYNAMIC_SECTIONS = NO - -# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries -# shown in the various tree structured indices initially; the user can expand -# and collapse entries dynamically later on. Doxygen will expand the tree to -# such a level that at most the specified number of entries are visible (unless -# a fully collapsed tree already exceeds this amount). So setting the number of -# entries 1 will produce a full collapsed tree by default. 0 is a special value -# representing an infinite number of entries and will result in a full expanded -# tree by default. -# Minimum value: 0, maximum value: 9999, default value: 100. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_INDEX_NUM_ENTRIES = 100 - -# If the GENERATE_DOCSET tag is set to YES, additional index files will be -# generated that can be used as input for Apple's Xcode 3 integrated development -# environment (see: https://developer.apple.com/xcode/), introduced with OSX -# 10.5 (Leopard). To create a documentation set, doxygen will generate a -# Makefile in the HTML output directory. Running make will produce the docset in -# that directory and running make install will install the docset in -# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at -# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy -# genXcode/_index.html for more information. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_DOCSET = NO - -# This tag determines the name of the docset feed. A documentation feed provides -# an umbrella under which multiple documentation sets from a single provider -# (such as a company or product suite) can be grouped. -# The default value is: Doxygen generated docs. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_FEEDNAME = "Doxygen generated docs" - -# This tag specifies a string that should uniquely identify the documentation -# set bundle. This should be a reverse domain-name style string, e.g. -# com.mycompany.MyDocSet. Doxygen will append .docset to the name. -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_BUNDLE_ID = org.doxygen.Project - -# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify -# the documentation publisher. This should be a reverse domain-name style -# string, e.g. com.mycompany.MyDocSet.documentation. -# The default value is: org.doxygen.Publisher. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_PUBLISHER_ID = org.doxygen.Publisher - -# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. -# The default value is: Publisher. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_PUBLISHER_NAME = Publisher - -# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three -# additional HTML index files: index.hhp, index.hhc, and index.hhk. The -# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on -# Windows. -# -# The HTML Help Workshop contains a compiler that can convert all HTML output -# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML -# files are now used as the Windows 98 help format, and will replace the old -# Windows help format (.hlp) on all Windows platforms in the future. 
Compressed -# HTML files also contain an index, a table of contents, and you can search for -# words in the documentation. The HTML workshop also contains a viewer for -# compressed HTML files. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_HTMLHELP = NO - -# The CHM_FILE tag can be used to specify the file name of the resulting .chm -# file. You can add a path in front of the file if the result should not be -# written to the html output directory. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -CHM_FILE = - -# The HHC_LOCATION tag can be used to specify the location (absolute path -# including file name) of the HTML help compiler (hhc.exe). If non-empty, -# doxygen will try to run the HTML help compiler on the generated index.hhp. -# The file has to be specified with full path. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -HHC_LOCATION = - -# The GENERATE_CHI flag controls if a separate .chi index file is generated -# (YES) or that it should be included in the master .chm file (NO). -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -GENERATE_CHI = NO - -# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) -# and project file content. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -CHM_INDEX_ENCODING = - -# The BINARY_TOC flag controls whether a binary table of contents is generated -# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it -# enables the Previous and Next buttons. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -BINARY_TOC = NO - -# The TOC_EXPAND flag can be set to YES to add extra items for group members to -# the table of contents of the HTML help documentation and to the tree view. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -TOC_EXPAND = NO - -# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and -# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that -# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help -# (.qch) of the generated HTML documentation. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_QHP = NO - -# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify -# the file name of the resulting .qch file. The path specified is relative to -# the HTML output folder. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QCH_FILE = - -# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help -# Project output. For more information please see Qt Help Project / Namespace -# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_NAMESPACE = org.doxygen.Project - -# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt -# Help Project output. For more information please see Qt Help Project / Virtual -# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual- -# folders). -# The default value is: doc. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_VIRTUAL_FOLDER = doc - -# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom -# filter to add. 
For more information please see Qt Help Project / Custom -# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- -# filters). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_CUST_FILTER_NAME = - -# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the -# custom filter to add. For more information please see Qt Help Project / Custom -# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- -# filters). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_CUST_FILTER_ATTRS = - -# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this -# project's filter section matches. Qt Help Project / Filter Attributes (see: -# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_SECT_FILTER_ATTRS = - -# The QHG_LOCATION tag can be used to specify the location of Qt's -# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the -# generated .qhp file. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHG_LOCATION = - -# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be -# generated, together with the HTML files, they form an Eclipse help plugin. To -# install this plugin and make it available under the help contents menu in -# Eclipse, the contents of the directory containing the HTML and XML files needs -# to be copied into the plugins directory of eclipse. The name of the directory -# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. -# After copying Eclipse needs to be restarted before the help appears. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_ECLIPSEHELP = NO - -# A unique identifier for the Eclipse help plugin. When installing the plugin -# the directory name containing the HTML and XML files should also have this -# name. Each documentation set should have its own identifier. -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. - -ECLIPSE_DOC_ID = org.doxygen.Project - -# If you want full control over the layout of the generated HTML pages it might -# be necessary to disable the index and replace it with your own. The -# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top -# of each HTML page. A value of NO enables the index and the value YES disables -# it. Since the tabs in the index contain the same information as the navigation -# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -DISABLE_INDEX = NO - -# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index -# structure should be generated to display hierarchical information. If the tag -# value is set to YES, a side panel will be generated containing a tree-like -# index structure (just like the one that is generated for HTML Help). For this -# to work a browser that supports JavaScript, DHTML, CSS and frames is required -# (i.e. any modern browser). Windows users are probably better off using the -# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can -# further fine-tune the look of the index. 
As an example, the default style -# sheet generated by doxygen has an example that shows how to put an image at -# the root of the tree instead of the PROJECT_NAME. Since the tree basically has -# the same information as the tab index, you could consider setting -# DISABLE_INDEX to YES when enabling this option. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_TREEVIEW = NO - -# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that -# doxygen will group on one line in the generated HTML documentation. -# -# Note that a value of 0 will completely suppress the enum values from appearing -# in the overview section. -# Minimum value: 0, maximum value: 20, default value: 4. -# This tag requires that the tag GENERATE_HTML is set to YES. - -ENUM_VALUES_PER_LINE = 4 - -# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used -# to set the initial width (in pixels) of the frame in which the tree is shown. -# Minimum value: 0, maximum value: 1500, default value: 250. -# This tag requires that the tag GENERATE_HTML is set to YES. - -TREEVIEW_WIDTH = 250 - -# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to -# external symbols imported via tag files in a separate window. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -EXT_LINKS_IN_WINDOW = NO - -# Use this tag to change the font size of LaTeX formulas included as images in -# the HTML documentation. When you change the font size after a successful -# doxygen run you need to manually remove any form_*.png images from the HTML -# output directory to force them to be regenerated. -# Minimum value: 8, maximum value: 50, default value: 10. -# This tag requires that the tag GENERATE_HTML is set to YES. - -FORMULA_FONTSIZE = 10 - -# Use the FORMULA_TRANSPARENT tag to determine whether or not the images -# generated for formulas are transparent PNGs. Transparent PNGs are not -# supported properly for IE 6.0, but are supported on all modern browsers. -# -# Note that when changing this option you need to delete any form_*.png files in -# the HTML output directory before the changes have effect. -# The default value is: YES. -# This tag requires that the tag GENERATE_HTML is set to YES. - -FORMULA_TRANSPARENT = YES - -# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands -# to create new LaTeX commands to be used in formulas as building blocks. See -# the section "Including formulas" for details. - -FORMULA_MACROFILE = - -# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see -# https://www.mathjax.org) which uses client side JavaScript for the rendering -# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX -# installed or if you want to formulas look prettier in the HTML output. When -# enabled you may also need to install MathJax separately and configure the path -# to it using the MATHJAX_RELPATH option. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -USE_MATHJAX = YES - -# When MathJax is enabled you can set the default output format to be used for -# the MathJax output. See the MathJax site (see: -# http://docs.mathjax.org/en/latest/output.html) for more details. -# Possible values are: HTML-CSS (which is slower, but has the best -# compatibility), NativeMML (i.e. MathML) and SVG. -# The default value is: HTML-CSS. 
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_FORMAT = HTML-CSS
-
-# When MathJax is enabled you need to specify the location relative to the HTML
-# output directory using the MATHJAX_RELPATH option. The destination directory
-# should contain the MathJax.js script. For instance, if the mathjax directory
-# is located at the same level as the HTML output directory, then
-# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
-# Content Delivery Network so you can quickly see the result without installing
-# MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from https://www.mathjax.org before deployment.
-# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_RELPATH = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/
-
-# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
-# extension names that should be enabled during MathJax rendering. For example
-# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_EXTENSIONS =
-
-# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
-# of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
-# example see the documentation.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_CODEFILE =
-
-# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
-# the HTML output. The underlying search engine uses javascript and DHTML and
-# should work on any modern browser. Note that when using HTML help
-# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
-# there is already a search function so this one should typically be disabled.
-# For large projects the javascript based search engine can be slow, then
-# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
-# search using the keyboard; to jump to the search box use <access key> + S
-# (what the <access key> is depends on the OS and browser, but it is typically
-# , /