diff --git a/ggml/include/ggml-amx.h b/ggml/include/ggml-amx.h
deleted file mode 100644
index 042d6d91..00000000
--- a/ggml/include/ggml-amx.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// buffer_type API
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
-
-GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend);
-
-// backend API
-GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void);
-
-GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
-
-GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void);
-
-#ifdef __cplusplus
-}
-#endif
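Context for the removal above: ggml-amx.h declared the public entry points of the AMX backend. A minimal host-side sketch of how that API was used before this change (illustrative only, assuming a tree where the header still exists; not part of the patch):

```cuda
// Sketch of the removed AMX backend API (hypothetical usage, not ggml docs).
#include "ggml-amx.h"

int main(void) {
    ggml_backend_t backend = ggml_backend_amx_init();   // create the AMX backend
    if (backend == NULL) {
        return 1;                                       // AMX not available in this build
    }
    if (ggml_backend_is_amx(backend)) {
        ggml_backend_amx_set_n_threads(backend, 8);     // configure worker threads
    }
    // ... allocate tensors from ggml_backend_amx_buffer_type() and evaluate
    //     graphs with ggml_backend_graph_compute(backend, graph) ...
    ggml_backend_free(backend);
    return 0;
}
```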
diff --git a/ggml/src/ggml-cuda/ggml/CMakeLists.txt b/ggml/src/ggml-cuda/ggml/CMakeLists.txt
deleted file mode 100644
index 14761650..00000000
--- a/ggml/src/ggml-cuda/ggml/CMakeLists.txt
+++ /dev/null
@@ -1,152 +0,0 @@
-cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
-
-find_package(CUDAToolkit)
-
-if (CUDAToolkit_FOUND)
-    message(STATUS "CUDA Toolkit found")
-
-    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-        # native == GPUs available at build time
-        # 52     == Maxwell, lowest CUDA 12 standard
-        # 60     == P100, FP16 CUDA intrinsics
-        # 61     == Pascal, __dp4a instruction (per-byte integer dot product)
-        # 70     == V100, FP16 tensor cores
-        # 75     == Turing, int8 tensor cores
-        if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
-            set(CMAKE_CUDA_ARCHITECTURES "native")
-        elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
-            set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75")
-        else()
-            set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75")
-        endif()
-    endif()
-    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
-
-    enable_language(CUDA)
-
-    file(GLOB   GGML_HEADERS_CUDA "*.cuh")
-    list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h")
-
-    file(GLOB   GGML_SOURCES_CUDA "*.cu")
-    file(GLOB   SRCS "template-instances/fattn-wmma*.cu")
-    list(APPEND GGML_SOURCES_CUDA ${SRCS})
-    file(GLOB   SRCS "template-instances/mmq*.cu")
-    list(APPEND GGML_SOURCES_CUDA ${SRCS})
-
-    if (GGML_CUDA_FA_ALL_QUANTS)
-        file(GLOB   SRCS "template-instances/fattn-vec*.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})
-        add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
-    else()
-        file(GLOB   SRCS "template-instances/fattn-vec*q4_0-q4_0.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})
-        file(GLOB   SRCS "template-instances/fattn-vec*q8_0-q8_0.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})
-        file(GLOB   SRCS "template-instances/fattn-vec*f16-f16.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})
-    endif()
-
-    ggml_add_backend_library(ggml-cuda
-                             ${GGML_HEADERS_CUDA}
-                             ${GGML_SOURCES_CUDA}
-                            )
-
-    add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
-
-    if (GGML_CUDA_GRAPHS)
-        add_compile_definitions(GGML_CUDA_USE_GRAPHS)
-    endif()
-
-    if (GGML_CUDA_FORCE_MMQ)
-        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
-    endif()
-
-    if (GGML_CUDA_FORCE_CUBLAS)
-        add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
-    endif()
-
-    if (GGML_CUDA_NO_VMM)
-        add_compile_definitions(GGML_CUDA_NO_VMM)
-    endif()
-
-    if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
-        add_compile_definitions(GGML_CUDA_F16)
-    endif()
-
-    if (GGML_CUDA_NO_PEER_COPY)
-        add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
-    endif()
-
-    if (GGML_STATIC)
-        if (WIN32)
-            # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
-            target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
-        else ()
-            target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
-        endif()
-    else()
-        target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt)
-    endif()
-
-    if (GGML_CUDA_NO_VMM)
-        # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
-    else()
-        target_link_libraries(ggml-cuda PRIVATE CUDA::cuda_driver)
-    endif()
-
-    set(CUDA_CXX_FLAGS "")
-
-    set(CUDA_FLAGS -use_fast_math)
-
-    if (GGML_FATAL_WARNINGS)
-        list(APPEND CUDA_FLAGS -Werror all-warnings)
-    endif()
-
-    if (GGML_ALL_WARNINGS AND NOT MSVC)
-        set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
-        if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
-            list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
-        endif()
-
-        execute_process(
-            COMMAND ${NVCC_CMD} -Xcompiler --version
-            OUTPUT_VARIABLE CUDA_CCFULLVER
-            ERROR_QUIET
-        )
-
-        if (NOT CUDA_CCFULLVER MATCHES clang)
-            set(CUDA_CCID "GNU")
-            execute_process(
-                COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
-                OUTPUT_VARIABLE CUDA_CCVER
-                ERROR_QUIET
-            )
-        else()
-            if (CUDA_CCFULLVER MATCHES Apple)
-                set(CUDA_CCID "AppleClang")
-            else()
-                set(CUDA_CCID "Clang")
-            endif()
-            string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
-        endif()
-
-        message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
-
-        ggml_get_flags(${CUDA_CCID} ${CUDA_CCVER})
-        list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
-    endif()
-
-    if (NOT MSVC)
-        list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
-    endif()
-
-    list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
-
-    if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
-        list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
-    endif()
-
-    target_compile_options(ggml-cuda PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
-else()
-    message(FATAL_ERROR "CUDA Toolkit not found")
-endif()
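Aside on the architecture table above: compute capability 6.1 is listed for `__dp4a`, the per-byte integer dot product that ggml's quantized kernels lean on. A standalone sketch of the intrinsic, assuming an sm_61+ device and `nvcc -arch=sm_61` (illustrative only, not ggml code):

```cuda
// __dp4a computes a0*b0 + a1*b1 + a2*b2 + a3*b3 + acc over the four
// signed 8-bit lanes packed into each 32-bit operand (sm_61 and newer).
#include <cstdio>

__global__ void dp4a_demo(const int * a, const int * b, int * out) {
    *out = __dp4a(*a, *b, 0);
}

int main(void) {
    int ha = 0x01020304, hb = 0x01010101, hout = 0; // lanes 4,3,2,1 each times 1
    int *da, *db, *dout;
    cudaMalloc(&da, sizeof(int));
    cudaMalloc(&db, sizeof(int));
    cudaMalloc(&dout, sizeof(int));
    cudaMemcpy(da, &ha, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(db, &hb, sizeof(int), cudaMemcpyHostToDevice);
    dp4a_demo<<<1, 1>>>(da, db, dout);
    cudaMemcpy(&hout, dout, sizeof(int), cudaMemcpyDeviceToHost);
    printf("dp4a = %d\n", hout); // expect 1+2+3+4 = 10
    return 0;
}
```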
diff --git a/ggml/src/ggml-cuda/rwkv-wkv.cu b/ggml/src/ggml-cuda/rwkv-wkv.cu
deleted file mode 100644
index 098e92d3..00000000
--- a/ggml/src/ggml-cuda/rwkv-wkv.cu
+++ /dev/null
@@ -1,89 +0,0 @@
-#include "common.cuh"
-#include "rwkv-wkv.cuh"
-
-static __global__ void rwkv_wkv_f32(const int B, const int T, const int C, const int H, const float * k, const float * v, const float * r, const float * tf, const float * td, const float * s, float * dst) {
-    const int tid = threadIdx.x;
-    const int bid = blockIdx.x;
-
-    const int head_size = CUDA_WKV_BLOCK_SIZE;
-    const int batch_i = bid / H;
-    const int head_i = bid % H;
-    const int state_size = C * head_size;
-    const int n_seq_tokens = T / B;
-
-    float state[head_size];
-    __shared__ float _k[head_size], _r[head_size], _tf[head_size], _td[head_size];
-
-    #pragma unroll
-    for (int i = 0; i < head_size; i++) {
-        state[i] = s[batch_i * state_size + head_i * head_size * head_size + i * head_size + tid];
-    }
-
-    __syncthreads();
-    _tf[tid] = tf[head_i * head_size + tid];
-    __syncthreads();
-
-    for (int t = batch_i * n_seq_tokens * C + head_i * head_size + tid; t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid; t += C) {
-        __syncthreads();
-        _k[tid] = k[t];
-        _r[tid] = r[t];
-        _td[tid] = td[t];
-        __syncthreads();
-
-        const float _v = v[t];
-        float y = 0;
-        for (int j = 0; j < head_size; j += 4) {
-            const float4& k = (float4&)(_k[j]);
-            const float4& r = (float4&)(_r[j]);
-            const float4& tf = (float4&)(_tf[j]);
-            const float4& td = (float4&)(_td[j]);
-            float4& s = (float4&)(state[j]);
-            float4 kv;
-
-            kv.x = k.x * _v;
-            kv.y = k.y * _v;
-            kv.z = k.z * _v;
-            kv.w = k.w * _v;
-
-            y += r.x * (tf.x * kv.x + s.x);
-            y += r.y * (tf.y * kv.y + s.y);
-            y += r.z * (tf.z * kv.z + s.z);
-            y += r.w * (tf.w * kv.w + s.w);
-
-            s.x = s.x * td.x + kv.x;
-            s.y = s.y * td.y + kv.y;
-            s.z = s.z * td.z + kv.z;
-            s.w = s.w * td.w + kv.w;
-        }
-        dst[t] = y;
-    }
-
-    #pragma unroll
-    for (int i = 0; i < head_size; i++) {
-        dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i];
-    }
-}
-
-void ggml_cuda_op_rwkv_wkv(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const float * k_d  = (const float *)dst->src[0]->data;
-    const float * v_d  = (const float *)dst->src[1]->data;
-    const float * r_d  = (const float *)dst->src[2]->data;
-    const float * tf_d = (const float *)dst->src[3]->data;
-    const float * td_d = (const float *)dst->src[4]->data;
-    const float * s_d  = (const float *)dst->src[5]->data;
-
-    const int64_t B = dst->src[5]->ne[1];
-    const int64_t T = dst->src[0]->ne[3];
-    const int64_t C = dst->ne[0];
-    const int64_t H = dst->src[0]->ne[2];
-
-    float * dst_d = (float *)dst->data;
-
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(dst->src[5]->type == GGML_TYPE_F32);
-    GGML_ASSERT(C % H == 0);
-    GGML_ASSERT(C / H == CUDA_WKV_BLOCK_SIZE);
-
-    rwkv_wkv_f32<<<B * H, C / H, 0, stream>>>(B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d);
-}
diff --git a/ggml/src/ggml-cuda/rwkv-wkv.cuh b/ggml/src/ggml-cuda/rwkv-wkv.cuh
deleted file mode 100644
index 13795247..00000000
--- a/ggml/src/ggml-cuda/rwkv-wkv.cuh
+++ /dev/null
@@ -1,5 +0,0 @@
-#include "common.cuh"
-
-#define CUDA_WKV_BLOCK_SIZE 64
-
-void ggml_cuda_op_rwkv_wkv(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
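To make the deleted kernel easier to follow: each CUDA block handles one (batch, head) pair and each thread owns one output channel, with the inner `float4` loop running the per-head WKV recurrence over the state. A plain scalar sketch of the same update (illustrative names, not ggml's API; `state` is laid out `[head_size][head_size]` per head, matching the `i * head_size + tid` indexing above):

```cuda
// Scalar reference of the recurrence the deleted kernel vectorizes.
// For one head and one token, with kv = k[j] * v[i]:
//   y[i]        += r[j] * (tf[j] * kv + state[j][i])
//   state[j][i]  = state[j][i] * td[j] + kv
void wkv_reference(int head_size, const float * k, const float * v,
                   const float * r, const float * tf, const float * td,
                   float * state, float * y) {
    for (int i = 0; i < head_size; i++) {     // output channel (thread tid above)
        float acc = 0.0f;
        for (int j = 0; j < head_size; j++) { // reduction (the float4 loop above)
            const float kv = k[j] * v[i];
            acc += r[j] * (tf[j] * kv + state[j * head_size + i]);
            state[j * head_size + i] = state[j * head_size + i] * td[j] + kv;
        }
        y[i] = acc;
    }
}
```

In the deleted launcher this corresponds to a grid of `B * H` blocks with `C / H` (= `CUDA_WKV_BLOCK_SIZE` = 64) threads each, walking the sequence one token at a time.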
diff --git a/ggml/src/ggml-musa/ggml/CMakeLists.txt b/ggml/src/ggml-musa/ggml/CMakeLists.txt
deleted file mode 100644
index 415b2b2e..00000000
--- a/ggml/src/ggml-musa/ggml/CMakeLists.txt
+++ /dev/null
@@ -1,107 +0,0 @@
-if (NOT EXISTS $ENV{MUSA_PATH})
-    if (NOT EXISTS /opt/musa)
-        set(MUSA_PATH /usr/local/musa)
-    else()
-        set(MUSA_PATH /opt/musa)
-    endif()
-else()
-    set(MUSA_PATH $ENV{MUSA_PATH})
-endif()
-
-set(CMAKE_C_COMPILER   "${MUSA_PATH}/bin/clang")
-set(CMAKE_C_EXTENSIONS OFF)
-set(CMAKE_CXX_COMPILER "${MUSA_PATH}/bin/clang++")
-set(CMAKE_CXX_EXTENSIONS OFF)
-
-list(APPEND CMAKE_MODULE_PATH "${MUSA_PATH}/cmake")
-
-find_package(MUSAToolkit)
-
-if (MUSAToolkit_FOUND)
-    message(STATUS "MUSA Toolkit found")
-
-    if (NOT DEFINED MUSA_ARCHITECTURES)
-        set(MUSA_ARCHITECTURES "21;22")
-    endif()
-    message(STATUS "Using MUSA architectures: ${MUSA_ARCHITECTURES}")
-
-    file(GLOB   GGML_HEADERS_MUSA "../ggml-cuda/*.cuh")
-    list(APPEND GGML_HEADERS_MUSA "../../include/ggml-cuda.h")
-
-    file(GLOB   GGML_SOURCES_MUSA "../ggml-cuda/*.cu")
-    file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-wmma*.cu")
-    list(APPEND GGML_SOURCES_MUSA ${SRCS})
-    file(GLOB   SRCS "../ggml-cuda/template-instances/mmq*.cu")
-    list(APPEND GGML_SOURCES_MUSA ${SRCS})
-
-    if (GGML_CUDA_FA_ALL_QUANTS)
-        file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
-        list(APPEND GGML_SOURCES_MUSA ${SRCS})
-        add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
-    else()
-        file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
-        list(APPEND GGML_SOURCES_MUSA ${SRCS})
-        file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
-        list(APPEND GGML_SOURCES_MUSA ${SRCS})
-        file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
-        list(APPEND GGML_SOURCES_MUSA ${SRCS})
-    endif()
-
-    set_source_files_properties(${GGML_SOURCES_MUSA} PROPERTIES LANGUAGE CXX)
-    foreach(SOURCE ${GGML_SOURCES_MUSA})
-        set(COMPILE_FLAGS "-x musa -mtgpu")
-        foreach(ARCH ${MUSA_ARCHITECTURES})
-            set(COMPILE_FLAGS "${COMPILE_FLAGS} --cuda-gpu-arch=mp_${ARCH}")
-        endforeach()
-        set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS ${COMPILE_FLAGS})
-    endforeach()
-
-    ggml_add_backend_library(ggml-musa
-                             ${GGML_HEADERS_MUSA}
-                             ${GGML_SOURCES_MUSA}
-                            )
-
-    # TODO: do not use CUDA definitions for MUSA
-    target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
-
-    add_compile_definitions(GGML_USE_MUSA)
-    add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
-
-    if (GGML_CUDA_GRAPHS)
-        add_compile_definitions(GGML_CUDA_USE_GRAPHS)
-    endif()
-
-    if (GGML_CUDA_FORCE_MMQ)
-        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
-    endif()
-
-    if (GGML_CUDA_FORCE_CUBLAS)
-        add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
-    endif()
-
-    if (GGML_CUDA_NO_VMM)
-        add_compile_definitions(GGML_CUDA_NO_VMM)
-    endif()
-
-    if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
-        add_compile_definitions(GGML_CUDA_F16)
-    endif()
-
-    if (GGML_CUDA_NO_PEER_COPY)
-        add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
-    endif()
-
-    if (GGML_STATIC)
-        target_link_libraries(ggml-musa PRIVATE MUSA::musart_static MUSA::mublas_static)
-    else()
-        target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas)
-    endif()
-
-    if (GGML_CUDA_NO_VMM)
-        # No VMM requested, no need to link directly with the musa driver lib (libmusa.so)
-    else()
-        target_link_libraries(ggml-musa PRIVATE MUSA::musa_driver)
-    endif()
-else()
-    message(FATAL_ERROR "MUSA Toolkit not found")
-endif()
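Note on the per-source flag loop in the deleted MUSA build file: with the default `MUSA_ARCHITECTURES` of `21;22`, every shared `.cu` file is compiled as C++ with `-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22`, which is how the common CUDA sources were retargeted to MUSA GPUs without duplicating the kernels.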