sycl: Use syclcompat::dp4a (llama/10267)

* sycl: Use syclcompat::dp4a

* Using the syclcompat version allow the compiler to optimize the
  operation with native function

* Update news section

* Update CI Windows oneAPI version to 2025.0

* Reword doc

* Call syclcompat::dp4a inside dpct::dp4a

This reverts commit 90cb61d692d61360b46954a1c7f780bd2e569b73.
This commit is contained in:
Romain Biessy 2024-11-15 04:09:12 +01:00 committed by Georgi Gerganov
parent 3298916e5e
commit 2c0484ebf7
2 changed files with 6 additions and 26 deletions

View File

@ -15,6 +15,7 @@
#include <sycl/sycl.hpp>
#include <sycl/half_type.hpp>
#include <syclcompat/math.hpp>
#include <oneapi/mkl.hpp>
#include <map>
@ -1830,31 +1831,10 @@ namespace dpct
: id);
}
template <typename T>
sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val)
{
return sycl::vec<T, 1>(val)
.template as<sycl::vec<
std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>()
.template convert<T>();
}
template <typename T1, typename T2>
using dot_product_acc_t =
std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
uint32_t, int32_t>;
template <typename T1, typename T2, typename T3>
inline auto dp4a(T1 a, T2 b, T3 c)
{
dot_product_acc_t<T1, T2> res = c;
auto va = extract_and_sign_or_zero_extend4(a);
auto vb = extract_and_sign_or_zero_extend4(b);
res += va[0] * vb[0];
res += va[1] * vb[1];
res += va[2] * vb[2];
res += va[3] * vb[3];
return res;
return syclcompat::dp4a(a, b, c);
}
struct sub_sat

View File

@ -968,8 +968,8 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
grid1[0] ^ signs[0], signs[0], std::minus<>());
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
grid2[0] ^ signs[1], signs[1], std::minus<>());
sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
sumi = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi);
sumi = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi);
q8 += 8;
aux32 >>= 7;
}
@ -1009,8 +1009,8 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
grid1[0] ^ signs0, signs0, std::minus<>());
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
grid2[0] ^ signs1, signs1, std::minus<>());
sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
sumi = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi);
sumi = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi);
q8 += 8;
}
const float d =