SYCL: Rename oneMKL to oneMath (llama/12192)

* Rename oneMKL Interface to oneMath

* Use oneMath for Intel vendor

* Rename occurences to mkl

* clang-format

* Silence verbose warnings

* Set oneMath HIP_TARGETS

* Fix silence warnings

* Remove step to build oneMath from build instructions

* Use fixed oneMath version

* Remove INTEL_CPU

* Fold CMake oneDNN conditions

* Use Intel oneMKL for Intel devices

* Improve CMake message

* Link against MKL::MKL_SYCL::BLAS only

* Move oneMath documentation to Nvidia and AMD sections
This commit is contained in:
Romain Biessy 2025-04-01 10:24:29 +02:00 committed by Georgi Gerganov
parent 0d42097fd3
commit ddf7e6a15d
4 changed files with 182 additions and 149 deletions

View File

@ -23,6 +23,23 @@ ggml_add_backend_library(ggml-sycl
../../include/ggml-sycl.h ../../include/ggml-sycl.h
) )
file(GLOB GGML_HEADERS_SYCL "*.hpp")
file(GLOB GGML_SOURCES_SYCL "*.cpp")
target_sources(ggml-sycl PRIVATE ${GGML_HEADERS_SYCL} ${GGML_SOURCES_SYCL})
find_package(IntelSYCL)
if (IntelSYCL_FOUND)
# Use oneAPI CMake when possible
target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX)
else()
# Fallback to the simplest way of enabling SYCL when using intel/llvm nightly for instance
target_compile_options(ggml-sycl PRIVATE "-fsycl")
target_link_options(ggml-sycl PRIVATE "-fsycl")
endif()
target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing")
# Link against oneDNN
find_package(DNNL) find_package(DNNL)
set(GGML_SYCL_DNNL 0) set(GGML_SYCL_DNNL 0)
if(DNNL_FOUND) if(DNNL_FOUND)
@ -62,8 +79,6 @@ if (GGML_SYCL_F16)
add_compile_definitions(GGML_SYCL_F16) add_compile_definitions(GGML_SYCL_F16)
endif() endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
if (GGML_SYCL_TARGET STREQUAL "NVIDIA") if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
add_compile_definitions(GGML_SYCL_WARP_SIZE=32) add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
elseif (GGML_SYCL_TARGET STREQUAL "AMD") elseif (GGML_SYCL_TARGET STREQUAL "AMD")
@ -76,34 +91,84 @@ else()
add_compile_definitions(GGML_SYCL_WARP_SIZE=16) add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
endif() endif()
file(GLOB GGML_HEADERS_SYCL "*.hpp") if (GGML_SYCL_GRAPH)
file(GLOB GGML_SOURCES_SYCL "*.cpp") target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GRAPH)
target_sources(ggml-sycl PRIVATE ${GGML_HEADERS_SYCL} ${GGML_SOURCES_SYCL}) endif()
# Link against Intel oneMKL or oneMath
if (WIN32) if (GGML_SYCL_TARGET STREQUAL "INTEL")
find_package(IntelSYCL REQUIRED) # Intel devices use Intel oneMKL directly instead of oneMath to avoid the limitation of linking Intel oneMKL statically
# See https://github.com/uxlfoundation/oneMath/issues/654
find_package(MKL REQUIRED) find_package(MKL REQUIRED)
target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS)
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_USE_INTEL_ONEMKL)
else() else()
if (GGML_SYCL_GRAPH) find_package(oneMath QUIET)
add_compile_definitions(GGML_SYCL_GRAPH) if (NOT oneMath_FOUND)
message(STATUS "oneMath not found: oneMath will be automatically downloaded")
# Use FetchContent to automatically pull and build oneMath
include(FetchContent)
set(BUILD_FUNCTIONAL_TESTS False)
set(BUILD_EXAMPLES False)
set(TARGET_DOMAINS blas)
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
set(ENABLE_MKLCPU_BACKEND False)
set(ENABLE_MKLGPU_BACKEND False)
set(ENABLE_CUBLAS_BACKEND True)
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
set(ENABLE_MKLCPU_BACKEND False)
set(ENABLE_MKLGPU_BACKEND False)
set(ENABLE_ROCBLAS_BACKEND True)
# Ensure setting a string variable here is not overriden by oneMath CACHE variables
cmake_policy(SET CMP0126 NEW)
# Setting the device architecture is only needed and useful for AMD devices in oneMath
set(HIP_TARGETS ${GGML_SYCL_DEVICE_ARCH} CACHE STRING "oneMath HIP target" FORCE)
endif()
FetchContent_Declare(
ONEMATH
GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git
GIT_TAG c255b1b4c41e2ee3059455c1f96a965d6a62568a
)
FetchContent_MakeAvailable(ONEMATH)
# Create alias to match with find_package targets name
function(onemath_alias target)
if (TARGET ${target}_obj)
# Silence verbose warnings from external libraries
target_compile_options(${target}_obj PRIVATE -w)
endif()
if (TARGET ${target})
add_library(ONEMATH::${target} ALIAS ${target})
endif()
endfunction()
onemath_alias(onemath)
onemath_alias(onemath_blas_mklcpu)
onemath_alias(onemath_blas_mklgpu)
onemath_alias(onemath_blas_cublas)
onemath_alias(onemath_blas_rocblas)
endif() endif()
if (GGML_SYCL_TARGET STREQUAL "INTEL")
target_link_libraries(ggml-sycl PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) # Below oneMath compile-time dispatching is used for better performance
elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA") if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_cublas)
add_compile_definitions(GGML_SYCL_NVIDIA) target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda")
target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl_blas_cublas) target_link_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda")
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_NVIDIA)
elseif (GGML_SYCL_TARGET STREQUAL "AMD") elseif (GGML_SYCL_TARGET STREQUAL "AMD")
if (NOT GGML_SYCL_DEVICE_ARCH) if (NOT GGML_SYCL_DEVICE_ARCH)
message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.") message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.")
endif() endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=amdgcn-amd-amdhsa") target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_rocblas)
target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl) target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
target_link_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_AMD)
else()
# Fallback to oneMath runtime dispatcher
target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath)
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GENERIC)
endif() endif()
endif()
if (GGML_SYCL_DEVICE_ARCH)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH}") if (GGML_SYCL_DEVICE_ARCH)
endif() target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
endif() endif()

View File

@ -16,9 +16,18 @@
#include <sycl/sycl.hpp> #include <sycl/sycl.hpp>
#include <sycl/half_type.hpp> #include <sycl/half_type.hpp>
#include <syclcompat/math.hpp> #include <syclcompat/math.hpp>
#include <oneapi/mkl.hpp>
#include <map> #include <map>
#ifdef GGML_SYCL_USE_INTEL_ONEMKL
#include <oneapi/mkl.hpp>
// Allow to use the same namespace for Intel oneMKL and oneMath
namespace oneapi {
namespace math = mkl;
}
#else
#include <oneapi/math.hpp>
#endif
#include "ggml.h" #include "ggml.h"
#if defined(__linux__) #if defined(__linux__)
@ -83,13 +92,32 @@ inline std::string get_device_backend_and_type(const sycl::device &device) {
} }
template <typename Ts> struct matrix_info_t { template <typename Ts> struct matrix_info_t {
oneapi::mkl::transpose transpose_info[2]; oneapi::math::transpose transpose_info[2];
Ts value_info[2]; Ts value_info[2];
std::int64_t size_info[3]; std::int64_t size_info[3];
std::int64_t ld_info[3]; std::int64_t ld_info[3];
std::int64_t groupsize_info; std::int64_t groupsize_info;
}; };
inline auto get_onemath_backend(sycl::queue& queue)
#if defined(GGML_SYCL_GENERIC) || defined(GGML_SYCL_USE_INTEL_ONEMKL)
-> sycl::queue&
#endif
{
// If the backend is known at compile-time, use oneMath backend_selector to use
// compile-time dispatching and avoid the need to dlopen libraries. Otherwise
// fallback to runtime dispatching.
#if defined(GGML_SYCL_NVIDIA)
return oneapi::math::backend_selector<oneapi::math::backend::cublas>{ queue };
#elif defined(GGML_SYCL_AMD)
return oneapi::math::backend_selector<oneapi::math::backend::rocblas>{ queue };
#elif defined(GGML_SYCL_GENERIC) || defined(GGML_SYCL_USE_INTEL_ONEMKL)
return queue;
#else
static_assert(false, "Unsupported backend");
#endif
}
namespace dpct namespace dpct
{ {
typedef sycl::queue *queue_ptr; typedef sycl::queue *queue_ptr;
@ -1686,26 +1714,18 @@ namespace dpct
namespace detail namespace detail
{ {
template <class Ta, class Tb, class Tc, class Ts> template <class Ta, class Tb, class Tc, class Ts>
inline void gemm_impl(sycl::queue &q, oneapi::mkl::transpose a_trans, inline void gemm_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m,
oneapi::mkl::transpose b_trans, int m, int n, int k, int n, int k, const void * alpha, const void * a, int lda, const void * b, int ldb,
const void *alpha, const void *a, int lda, const void *b, const void * beta, void * c, int ldc) {
int ldb, const void *beta, void *c, int ldc) Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
{ Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q); auto data_a = get_memory<const Ta>(a);
Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q); auto data_b = get_memory<const Tb>(b);
auto data_a = get_memory<const Ta>(a); auto data_c = get_memory<Tc>(c);
auto data_b = get_memory<const Tb>(b); oneapi::math::blas::column_major::gemm(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value, data_a,
auto data_c = get_memory<Tc>(c); lda, data_b, ldb, beta_value, data_c, ldc);
#ifdef GGML_SYCL_NVIDIA }
oneapi::mkl::blas::column_major::gemm(oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q },
a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb,
beta_value, data_c, ldc);
#else
oneapi::mkl::blas::column_major::gemm(q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb,
beta_value, data_c, ldc);
#endif
}
template <typename VecT, class BinaryOperation, class = void> template <typename VecT, class BinaryOperation, class = void>
class vectorized_binary class vectorized_binary
@ -1735,7 +1755,7 @@ namespace dpct
}; };
template <class Ta, class Tb, class Tc, class Ts> template <class Ta, class Tb, class Tc, class Ts>
inline void gemm_batch_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, inline void gemm_batch_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans,
int m, int n, int k, const void * alpha, const void ** a, int lda, const void ** b, int m, int n, int k, const void * alpha, const void ** a, int lda, const void ** b,
int ldb, const void * beta, void ** c, int ldc, int batch_size, int ldb, const void * beta, void ** c, int ldc, int batch_size,
matrix_info_t<float> * matrix_info) { matrix_info_t<float> * matrix_info) {
@ -1754,48 +1774,28 @@ namespace dpct
matrix_info->ld_info[2] = ldc; matrix_info->ld_info[2] = ldc;
matrix_info->groupsize_info = batch_size; matrix_info->groupsize_info = batch_size;
#ifdef GGML_SYCL_NVIDIA sycl::event e = oneapi::math::blas::column_major::gemm_batch(
sycl::event e = oneapi::mkl::blas::column_major::gemm_batch( get_onemath_backend(q), matrix_info->transpose_info, matrix_info->transpose_info + 1,
oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q }, matrix_info->transpose_info, matrix_info->size_info, matrix_info->size_info + 1, matrix_info->size_info + 2,
matrix_info->transpose_info + 1, matrix_info->size_info, matrix_info->size_info + 1, reinterpret_cast<Ts *>(matrix_info->value_info), reinterpret_cast<const Ta **>(a), matrix_info->ld_info,
matrix_info->size_info + 2, reinterpret_cast<Ts *>(matrix_info->value_info), reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
reinterpret_cast<const Ta **>(a), matrix_info->ld_info, reinterpret_cast<const Tb **>(b), reinterpret_cast<Ts *>(matrix_info->value_info + 1), reinterpret_cast<Tc **>(c),
matrix_info->ld_info + 1, reinterpret_cast<Ts *>(matrix_info->value_info + 1), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
#else
sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
q, matrix_info->transpose_info, matrix_info->transpose_info + 1, matrix_info->size_info,
matrix_info->size_info + 1, matrix_info->size_info + 2, reinterpret_cast<Ts *>(matrix_info->value_info),
reinterpret_cast<const Ta **>(a), matrix_info->ld_info, reinterpret_cast<const Tb **>(b),
matrix_info->ld_info + 1, reinterpret_cast<Ts *>(matrix_info->value_info + 1),
reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
#endif
} }
template <class Ta, class Tb, class Tc, class Ts> template <class Ta, class Tb, class Tc, class Ts>
inline void inline void gemm_batch_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans,
gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans, int m, int n, int k, const void * alpha, const void * a, int lda,
oneapi::mkl::transpose b_trans, int m, int n, long long int stride_a, const void * b, int ldb, long long int stride_b,
int k, const void *alpha, const void *a, int lda, const void * beta, void * c, int ldc, long long int stride_c, int batch_size) {
long long int stride_a, const void *b, int ldb,
long long int stride_b, const void *beta, void *c,
int ldc, long long int stride_c, int batch_size)
{
Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q); Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q); Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
auto data_a = get_memory<const Ta>(a); auto data_a = get_memory<const Ta>(a);
auto data_b = get_memory<const Tb>(b); auto data_b = get_memory<const Tb>(b);
auto data_c = get_memory<Tc>(c); auto data_c = get_memory<Tc>(c);
#ifdef GGML_SYCL_NVIDIA oneapi::math::blas::column_major::gemm_batch(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value,
oneapi::mkl::blas::column_major::gemm_batch( data_a, lda, stride_a, data_b, ldb, stride_b, beta_value,
oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q }, a_trans, b_trans, m, n, k, data_c, ldc, stride_c, batch_size);
alpha_value, data_a, lda, stride_a, data_b, ldb, stride_b, beta_value, data_c, ldc, stride_c,
batch_size);
#else
oneapi::mkl::blas::column_major::gemm_batch(q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
stride_a, data_b, ldb, stride_b, beta_value, data_c, ldc,
stride_c, batch_size);
#endif
} }
} // namespace detail } // namespace detail
@ -2259,13 +2259,10 @@ namespace dpct
sycl::range<3>(x, y, 1), direction); sycl::range<3>(x, y, 1), direction);
} }
inline void gemm(sycl::queue &q, oneapi::mkl::transpose a_trans, inline void gemm(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m, int n,
oneapi::mkl::transpose b_trans, int m, int n, int k, int k, const void * alpha, const void * a, library_data_t a_type, int lda, const void * b,
const void *alpha, const void *a, library_data_t a_type, library_data_t b_type, int ldb, const void * beta, void * c, library_data_t c_type, int ldc,
int lda, const void *b, library_data_t b_type, int ldb, library_data_t scaling_type) {
const void *beta, void *c, library_data_t c_type, int ldc,
library_data_t scaling_type)
{
if (scaling_type == library_data_t::real_float && if (scaling_type == library_data_t::real_float &&
c_type == library_data_t::complex_float) c_type == library_data_t::complex_float)
{ {
@ -2329,9 +2326,8 @@ namespace dpct
library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_float, library_data_t::real_float): library_data_t::real_float, library_data_t::real_float):
{ {
detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, detail::gemm_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, float, float>(
float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
ldb, beta, c, ldc);
break; break;
} }
case detail::get_type_combination_id( case detail::get_type_combination_id(
@ -2369,8 +2365,7 @@ namespace dpct
library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_bfloat16, library_data_t::real_float): library_data_t::real_bfloat16, library_data_t::real_float):
{ {
detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, detail::gemm_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, oneapi::math::bfloat16, float>(
oneapi::mkl::bfloat16, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
break; break;
} }
@ -2390,7 +2385,7 @@ namespace dpct
default: default:
throw std::runtime_error("the combination of data type is unsupported"); throw std::runtime_error("the combination of data type is unsupported");
} }
} // gemm() } // gemm()
/// Computes a batch of matrix-matrix product with general matrices. /// Computes a batch of matrix-matrix product with general matrices.
/// \param [in] q The queue where the routine should be executed. /// \param [in] q The queue where the routine should be executed.
@ -2412,7 +2407,7 @@ namespace dpct
/// \param [in] ldc Leading dimension of C. /// \param [in] ldc Leading dimension of C.
/// \param [in] batch_size Specifies the number of matrix multiply operations to perform. /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
/// \param [in] scaling_type Data type of the scaling factors. /// \param [in] scaling_type Data type of the scaling factors.
inline void gemm_batch(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m, inline void gemm_batch(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m,
int n, int k, const void * alpha, const void * a[], library_data_t a_type, int lda, int n, int k, const void * alpha, const void * a[], library_data_t a_type, int lda,
const void * b[], library_data_t b_type, int ldb, const void * beta, void * c[], const void * b[], library_data_t b_type, int ldb, const void * beta, void * c[],
library_data_t c_type, int ldc, int batch_size, library_data_t scaling_type, library_data_t c_type, int ldc, int batch_size, library_data_t scaling_type,
@ -2450,7 +2445,7 @@ namespace dpct
library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_bfloat16, library_data_t::real_float): library_data_t::real_bfloat16, library_data_t::real_float):
{ {
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float>( detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, oneapi::math::bfloat16, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info); q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
break; break;
} }
@ -2458,7 +2453,7 @@ namespace dpct
library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_float, library_data_t::real_float): library_data_t::real_float, library_data_t::real_float):
{ {
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, float>( detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, float, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info); q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
break; break;
} }
@ -2534,15 +2529,11 @@ namespace dpct
/// \param [in] stride_c Stride between the different C matrices. /// \param [in] stride_c Stride between the different C matrices.
/// \param [in] batch_size Specifies the number of matrix multiply operations to perform. /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
/// \param [in] scaling_type Data type of the scaling factors. /// \param [in] scaling_type Data type of the scaling factors.
inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans, inline void gemm_batch(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m,
oneapi::mkl::transpose b_trans, int m, int n, int k, int n, int k, const void * alpha, const void * a, library_data_t a_type, int lda,
const void *alpha, const void *a, library_data_t a_type, long long int stride_a, const void * b, library_data_t b_type, int ldb,
int lda, long long int stride_a, const void *b, long long int stride_b, const void * beta, void * c, library_data_t c_type, int ldc,
library_data_t b_type, int ldb, long long int stride_b, long long int stride_c, int batch_size, library_data_t scaling_type) {
const void *beta, void *c, library_data_t c_type,
int ldc, long long int stride_c, int batch_size,
library_data_t scaling_type)
{
if (scaling_type == library_data_t::real_float && if (scaling_type == library_data_t::real_float &&
c_type == library_data_t::complex_float) c_type == library_data_t::complex_float)
{ {
@ -2611,20 +2602,18 @@ namespace dpct
library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_bfloat16, library_data_t::real_float): library_data_t::real_bfloat16, library_data_t::real_float):
{ {
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, oneapi::math::bfloat16, float>(
oneapi::mkl::bfloat16, float>( q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size);
beta, c, ldc, stride_c, batch_size);
break; break;
} }
case detail::get_type_combination_id( case detail::get_type_combination_id(
library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_float, library_data_t::real_float): library_data_t::real_float, library_data_t::real_float):
{ {
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, float, float>(
float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
stride_a, b, ldb, stride_b, beta, c, ldc, batch_size);
stride_c, batch_size);
break; break;
} }
#endif #endif

View File

@ -2059,8 +2059,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
const sycl::half alpha_f16 = 1.0f; const sycl::half alpha_f16 = 1.0f;
const sycl::half beta_f16 = 0.0f; const sycl::half beta_f16 = 0.0f;
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm( SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
*stream, oneapi::mkl::transpose::trans, *stream, oneapi::math::transpose::trans,
oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, oneapi::math::transpose::nontrans, row_diff, src1_ncols, ne10,
&alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00, &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16, src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
dst_f16.get(), dpct::library_data_t::real_half, ldc, dst_f16.get(), dpct::library_data_t::real_half, ldc,
@ -2097,17 +2097,10 @@ inline void ggml_sycl_op_mul_mat_sycl(
#if !GGML_SYCL_DNNL #if !GGML_SYCL_DNNL
const float alpha = 1.0f; const float alpha = 1.0f;
const float beta = 0.0f; const float beta = 0.0f;
# ifdef GGML_SYCL_NVIDIA SYCL_CHECK(CHECK_TRY_ERROR(oneapi::math::blas::column_major::gemm(
SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm( get_onemath_backend(*stream), oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, row_diff,
oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ *stream }, oneapi::mkl::transpose::trans, src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
# else
SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
*stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream),
dst_dd_i, ldc)));
# endif
#else #else
DnnlGemmWrapper::row_gemm(ctx, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i, DnnlGemmWrapper::row_gemm(ctx, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i,
DnnlGemmWrapper::to_dt<float>(), src0_ddf_i, DnnlGemmWrapper::to_dt<float>(), DnnlGemmWrapper::to_dt<float>(), src0_ddf_i, DnnlGemmWrapper::to_dt<float>(),
@ -2836,14 +2829,10 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) { if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
// there is no broadcast and src0, src1 are contiguous across dims 2, 3 // there is no broadcast and src0, src1 are contiguous across dims 2, 3
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch( SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
*main_stream, oneapi::mkl::transpose::trans, *main_stream, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, (const char *) src0_as_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
(const char *)src0_as_f16, dpct::library_data_t::real_half, (const char *) src1_f16, dpct::library_data_t::real_half, nb11 / nb10, nb12 / nb10, beta, (char *) dst_t,
nb01 / nb00, nb02 / nb00, cu_data_type, ne01, nb2 / nb0, ne12 * ne13, cu_compute_type)));
(const char *)src1_f16, dpct::library_data_t::real_half,
nb11 / nb10, nb12 / nb10, beta,
(char *)dst_t, cu_data_type, ne01, nb2 / nb0,
ne12 * ne13, cu_compute_type)));
} else { } else {
const int ne23 = ne12*ne13; const int ne23 = ne12*ne13;
@ -2878,7 +2867,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
}); });
} }
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch( SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
*main_stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, *main_stream, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
(const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00, (const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
(const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, nb11 / nb10, beta, (const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, nb11 / nb10, beta,
(void **) (ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23, cu_compute_type, matrix_info.get()))); (void **) (ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23, cu_compute_type, matrix_info.get())));

View File

@ -1,8 +1,5 @@
#include <sycl/sycl.hpp>
#include <oneapi/mkl.hpp>
#include "outprod.hpp" #include "outprod.hpp"
void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
const ggml_tensor *src0 = dst->src[0]; const ggml_tensor *src0 = dst->src[0];
const ggml_tensor *src1 = dst->src[1]; const ggml_tensor *src1 = dst->src[1];
@ -34,20 +31,13 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
// Handle transposition of src1 // Handle transposition of src1
const bool src1_T = ggml_is_transposed(src1); const bool src1_T = ggml_is_transposed(src1);
const oneapi::mkl::transpose src1_op = const oneapi::math::transpose src1_op = src1_T ? oneapi::math::transpose::nontrans : oneapi::math::transpose::trans;
src1_T ? oneapi::mkl::transpose::nontrans : oneapi::mkl::transpose::trans;
const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float); const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
try { try {
// Perform matrix multiplication using oneMKL GEMM // Perform matrix multiplication using oneMath GEMM
#ifdef GGML_SYCL_NVIDIA oneapi::math::blas::column_major::gemm(get_onemath_backend(*stream), oneapi::math::transpose::nontrans, src1_op,
oneapi::mkl::blas::column_major::gemm(oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ *stream }, ne0, ne1, ne01, alpha, src0_d, ne00, src1_d, ldb, beta, dst_d, ne0);
oneapi::mkl::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha, src0_d,
ne00, src1_d, ldb, beta, dst_d, ne0);
#else
oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha,
src0_d, ne00, src1_d, ldb, beta, dst_d, ne0);
#endif
} }
catch (sycl::exception const& exc) { catch (sycl::exception const& exc) {
std::cerr << exc.what() << std::endl; std::cerr << exc.what() << std::endl;