mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-06-22 16:38:58 +00:00
ggml : sync sycl (skip) (#0)
This commit is contained in:
@ -17,6 +17,7 @@
|
||||
#include <iostream>
|
||||
|
||||
#include "dpct/helper.hpp"
|
||||
#include "ggml-sycl.h"
|
||||
#include "presets.hpp"
|
||||
|
||||
#define GGML_COMMON_DECL_SYCL
|
||||
@ -46,10 +47,6 @@ static int g_ggml_sycl_debug = 0;
|
||||
} \
|
||||
}()
|
||||
|
||||
// #define DEBUG_SYCL_MALLOC
|
||||
|
||||
static int g_work_group_size = 0;
|
||||
// typedef sycl::half ggml_fp16_t;
|
||||
|
||||
#define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
|
||||
#define VER_4VEC 610 // todo for hardward optimize.
|
||||
@ -192,6 +189,8 @@ struct ggml_sycl_device_info {
|
||||
sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};
|
||||
|
||||
std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
|
||||
|
||||
int max_work_group_sizes[GGML_SYCL_MAX_DEVICES] = {0};
|
||||
};
|
||||
|
||||
const ggml_sycl_device_info & ggml_sycl_info();
|
||||
@ -294,5 +293,57 @@ struct ggml_backend_sycl_context {
|
||||
}
|
||||
};
|
||||
|
||||
// common device functions
|
||||
|
||||
static __dpct_inline__ float warp_reduce_sum(float x,
|
||||
const sycl::nd_item<3>& item_ct1) {
|
||||
#pragma unroll
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
/*
|
||||
DPCT1096:98: The right-most dimension of the work-group used in the SYCL
|
||||
kernel that calls this function may be less than "32". The function
|
||||
"dpct::permute_sub_group_by_xor" may return an unexpected result on the
|
||||
CPU device. Modify the size of the work-group to ensure that the value
|
||||
of the right-most dimension is a multiple of "32".
|
||||
*/
|
||||
x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask);
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
static __dpct_inline__ sycl::float2
|
||||
warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3>& item_ct1) {
|
||||
#pragma unroll
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(),
|
||||
mask);
|
||||
a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(),
|
||||
mask);
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
static __dpct_inline__ float warp_reduce_max(float x,
|
||||
const sycl::nd_item<3>& item_ct1) {
|
||||
#pragma unroll
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
/*
|
||||
DPCT1096:97: The right-most dimension of the work-group used in the SYCL
|
||||
kernel that calls this function may be less than "32". The function
|
||||
"dpct::permute_sub_group_by_xor" may return an unexpected result on the
|
||||
CPU device. Modify the size of the work-group to ensure that the value
|
||||
of the right-most dimension is a multiple of "32".
|
||||
*/
|
||||
x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
|
||||
item_ct1.get_sub_group(), x, mask));
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
// Helper for vec loading aligned data
|
||||
template <typename Tp, int n>
|
||||
inline sycl::vec<Tp, n> vec_aligned_load(const Tp* aligned_ptr) {
|
||||
return *reinterpret_cast<const sycl::vec<Tp, n>*>(aligned_ptr);
|
||||
}
|
||||
|
||||
#endif // GGML_SYCL_COMMON_HPP
|
||||
|
Reference in New Issue
Block a user