diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index db3c24f6..b8388149 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -3038,6 +3038,10 @@ typedef float dfloat; // dequantize float typedef sycl::float2 dfloat2; #endif //GGML_SYCL_F16 +#define MMVQ_MAX_BATCH_SIZE 8 + +static const int8_t kvalues_iq4nl[16]={-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; + bool ggml_sycl_loaded(void); void * ggml_sycl_host_malloc(size_t size); void ggml_sycl_host_free(void * ptr); @@ -4473,6 +4477,32 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest } +template +__dpct_inline__ static void +dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_group(2); + const block_iq2_s * x = (const block_iq2_s *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300))); + const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; + const uint8_t signs = x[i].qs[QK_K/8+4*ib+il]; +#pragma unroll + for (int j = 0; j < 8; ++j) + y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); +#else + assert(false); + +#endif + +} + template static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy, const sycl::nd_item<3> &item_ct1, @@ -4505,26 +4535,26 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res } -template -static void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy, - const sycl::nd_item<3> &item_ct1, - const uint32_t *iq3s_grid, - const uint8_t *ksigns_iq2xs, - const uint8_t *kmask_iq2xs) { +template +__dpct_inline__ static void +dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint8_t *kmask_iq2xs, const uint32_t *iq3s_grid) { const int i = item_ct1.get_group(2); - const block_iq3_s * x = (const block_iq3_s *) vx; + const block_iq3_s * x = (const block_iq3_s *) vx; const int tid = item_ct1.get_local_id(2); #if QK_K == 256 const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint8_t * qs = x[i].qs + 8*ib; - const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + qs[2*il+0]); - const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + qs[2*il+1]); + const uint8_t * qs = x[i].qs + 8*ib; + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256))); const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)); const uint8_t signs = x[i].signs[4*ib + il]; +#pragma unroll for (int j = 0; j < 4; ++j) { y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? 
-1.f : 1.f); @@ -4535,12 +4565,12 @@ static void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restr } -template -static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy, - const sycl::nd_item<3> &item_ct1, - const uint32_t *iq1s_grid, - const uint8_t *ksigns_iq2xs, - const uint8_t *kmask_iq2xs) { +template +__dpct_inline__ static void +dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint32_t *iq1s_grid_gpu) { + const int i = item_ct1.get_group(2); const block_iq1_s * x = (const block_iq1_s *) vx; @@ -4549,14 +4579,15 @@ static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restr const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint8_t * qs = x[i].qs + 8*ib; - const uint8_t * grid1 = (const uint8_t *)(iq1s_grid + qs[2*il+0]); - const uint8_t * grid2 = (const uint8_t *)(iq1s_grid + qs[2*il+1]); - const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 0xf) + 1); - const uint8_t signs = ksigns_iq2xs[(x[i].qh[ib] >> 3*il) & 7]; - for (int j = 0; j < 4; ++j) { - y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); - y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); + const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA; + const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1); + uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32; + grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)]; + grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; + grid32[0] &= 0x0f0f0f0f; +#pragma unroll + for (int j = 0; j < 8; ++j) { + y[j] = d * (q[j] + delta); } #else assert(false); @@ -4564,6 +4595,85 @@ static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restr } +template +__dpct_inline__ static void +dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint32_t *iq1s_grid_gpu) { + + const int i = item_ct1.get_group(2); + const block_iq1_m * x = (const block_iq1_m *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint16_t * sc = (const uint16_t *)x[i].scales; + iq1m_scale_t scale; + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4); + const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1); + const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? 
-1 - IQ1M_DELTA : -1 + IQ1M_DELTA; + uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32; + grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)]; + grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; + grid32[0] &= 0x0f0f0f0f; +#pragma unroll + for (int j = 0; j < 8; ++j) { + y[j] = d * (q[j] + delta); + } +#else + assert(false); +#endif + +} + +template +__dpct_inline__ static void +dequantize_block_iq4_nl(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_group(2); + const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL); + + const int tid = item_ct1.get_local_id(2); + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 4*il; + const uint8_t * q4 = x[ib].qs + 4*il; + const float d = (float)x[ib].d; +#pragma unroll + for (int j = 0; j < 4; ++j) { + y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; + y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; + } + +} + + +template +__dpct_inline__ static void +dequantize_block_iq4_xs(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_group(2); + const block_iq4_xs * x = (const block_iq4_xs *)vx; + + const int tid = item_ct1.get_local_id(2); + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 4*il; + const uint8_t * q4 = x[i].qs + 16*ib + 4*il; + const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32); +#pragma unroll + for (int j = 0; j < 4; ++j) { + y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; + y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; + } +} + + + /* DPCT1110:4: The total declared local variable size in device function dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register @@ -7370,6 +7480,58 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq, #endif } +static __dpct_inline__ float +vec_dot_iq2_s_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { +#if QK_K == 256 + const block_iq2_s * bq2 = (const block_iq2_s *) vbq; + + const int ib32 = iqs; + const int8_t * q8 = bq8_1[ib32].qs; + const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32; + const uint8_t ls1 = bq2->scales[ib32] & 0xf; + const uint8_t ls2 = bq2->scales[ib32] >> 4; + int sumi1 = 0; + for (int l = 0; l < 2; ++l) { + const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300))); + const uint32_t signs0 = dpct::vectorized_binary( + ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, + std::equal_to<>()); + const uint32_t signs1 = dpct::vectorized_binary( + ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201, + std::equal_to<>()); + const int grid_l = dpct::vectorized_binary( + grid[0] ^ signs0, signs0, std::minus<>()); + const int grid_h = dpct::vectorized_binary( + grid[1] ^ signs1, signs1, std::minus<>()); + sumi1 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi1); + sumi1 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi1); + q8 += 8; + } + int sumi2 = 0; + for (int l = 2; l < 4; ++l) { + const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300))); + const uint32_t signs0 = dpct::vectorized_binary( + ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, + std::equal_to<>()); + const uint32_t signs1 = dpct::vectorized_binary( + ((signs[l] >> 4) * 0x01010101) & 0x08040201, 
0x08040201, + std::equal_to<>()); + const int grid_l = dpct::vectorized_binary( + grid[0] ^ signs0, signs0, std::minus<>()); + const int grid_h = dpct::vectorized_binary( + grid[1] ^ signs1, signs1, std::minus<>()); + sumi2 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi2); + sumi2 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi2); + q8 += 8; + } + const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f; + return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); +#else + assert(false); +#endif +} + static __dpct_inline__ float vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq, const block_q8_1 *__restrict__ bq8_1, const int &iqs, @@ -7412,10 +7574,8 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq, static __dpct_inline__ float vec_dot_iq3_s_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs, - const uint32_t *iq3s_grid, const uint64_t *ksigns64) { -#if DPCT_COMPATIBILITY_TEMP >= \ - MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const uint32_t *iq3s_grid) { #if QK_K == 256 const block_iq3_s * bq2 = (const block_iq3_s *) vbq; @@ -7427,9 +7587,11 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq, const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)); const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)); uint32_t signs0 = dpct::vectorized_binary( - ((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>()); + ((bq2->signs[4 * ib32 + l] & 0xf) * 0x01010101) & 0x08040201, + 0x08040201, std::equal_to<>()); uint32_t signs1 = dpct::vectorized_binary( - ((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>()); + ((bq2->signs[4 * ib32 + l] >> 4) * 0x01010101) & 0x08040201, + 0x08040201, std::equal_to<>()); const int grid_l = dpct::vectorized_binary( grid1[0] ^ signs0, signs0, std::minus<>()); const int grid_h = dpct::vectorized_binary( @@ -7438,45 +7600,142 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq, sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi); q8 += 8; } - const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * bq8_1[ib32].ds[0]; + const float d = + (float)bq2->d * + (1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) * + bq8_1[ib32].ds[0]; return d * sumi; #else assert(false); - return 0.f; -#endif -#else - assert(false); - return 0.f; #endif } static __dpct_inline__ float vec_dot_iq1_s_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs, - const uint32_t *iq1s_grid, const uint64_t *ksigns64) { + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const uint32_t *iq1s_grid_gpu) { #if QK_K == 256 const block_iq1_s * bq1 = (const block_iq1_s *) vbq; const int ib32 = iqs; - const uint8_t * qs = bq1->qs + 4*ib32; - const int8_t * q8 = bq8_1[ib32].qs; int sumi = 0; + const int * q8 = (const int *)bq8_1[ib32].qs; for (int l = 0; l < 4; ++l) { - const uint32_t * grid = (const uint32_t *)(iq1s_grid + qs[l]); - const uint32_t * signs = (const uint32_t *)(ksigns64 + (qs[l] >> 8)); - const int grid_l = dpct::vectorized_binary( - grid[0] ^ signs[0], signs[0], std::minus<>()); - const int grid_h = dpct::vectorized_binary( - grid[1] ^ signs[1], signs[1], std::minus<>()); - sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi); - sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi); - q8 += 8; + const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | 
(((bq1->qh[ib32] >> 3*l) & 7) << 8))); + int grid0 = grid[0] & 0x0f0f0f0f; + int grid1 = (grid[0] >> 4) & 0x0f0f0f0f; + sumi = dpct::dp4a(q8[2 * l + 1], grid1, + dpct::dp4a(q8[2 * l + 0], grid0, sumi)); } - const float d = (float)bq1->d * bq8_1[ib32].ds[0] * 0.25f; - return d * sumi; + + const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA; + const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1); + const float d = d1q * bq8_1[ib32].ds[0]; + const float m = d1q * bq8_1[ib32].ds[1]; + return d * sumi + m * delta; +#else + assert(false); +#endif +} + +static __dpct_inline__ float +vec_dot_iq1_m_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { +#if QK_K == 256 + const block_iq1_m * bq1 = (const block_iq1_m *) vbq; + + const int ib32 = iqs; + int sumi[2] = {0, 0}; + float sumf[2] = {0.f, 0.f}; + + const int * q8 = (const int *)bq8_1[ib32].qs; + for (int l = 0; l < 4; ++l) { + const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 7) << 8))); + int grid0 = grid[0] & 0x0f0f0f0f; + int grid1 = (grid[0] >> 4) & 0x0f0f0f0f; + sumi[l / 2] = dpct::dp4a(q8[2 * l + 1], grid1, + dpct::dp4a(q8[2 * l + 0], grid0, sumi[l / 2])); + const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA; + const int sumy = dpct::dp4a(q8[2 * l + 1], 0x01010101, + dpct::dp4a(q8[2 * l + 0], 0x01010101, 0)); + sumf[l/2] += delta*sumy; + } + + iq1m_scale_t scale; + const uint16_t * sc = (const uint16_t *)bq1->scales; + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + const float d = (float)scale.f16 * bq8_1[ib32].ds[0]; + return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1)); +#else + assert(false); +#endif +} + +static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4, + const uint8_t *values, + int &val1, int &val2) { + + uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32; + aux32 = q4 & 0x0f0f0f0f; + uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8); + uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8); + val1 = v1 | (v2 << 16); + aux32 = (q4 >> 4) & 0x0f0f0f0f; + v1 = values[q8[0]] | (values[q8[1]] << 8); + v2 = values[q8[2]] | (values[q8[3]] << 8); + val2 = v1 | (v2 << 16); +} + + +static __dpct_inline__ float +vec_dot_iq4_nl_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_iq4_nl * bq = (const block_iq4_nl *) vbq; + + const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs; + const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs; + + const uint8_t * values = (const uint8_t *)kvalues_iq4nl; + + int v1, v2; + int sumi1 = 0, sumi2 = 0; + for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) { + const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16); + get_int_from_table_16(aux, values, v1, v2); + sumi1 = dpct::dp4a(v1, q8[l + 0], sumi1); + sumi2 = dpct::dp4a(v2, q8[l + 4], sumi2); + } + + const float d = (float)bq->d * bq8_1->ds[0]; + return d * (sumi1 + sumi2); +} + + +static __dpct_inline__ float +vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + +#if QK_K == 256 + const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq; + const uint8_t * values = (const uint8_t *)kvalues_iq4nl; + + // iqs is 0...7 + const int ib32 = iqs; + const int32_t * q8 = (const int *)bq8_1[ib32].qs; + const uint32_t * 
q4 = (const uint32_t *)bq4->qs + 4*ib32; + const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4); + const float d = (float)bq4->d * (ls - 32) * bq8_1[ib32].ds[0]; + int v1, v2; + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < 4; ++j) { + get_int_from_table_16(q4[j], values, v1, v2); + sumi1 = dpct::dp4a(v1, q8[j + 0], sumi1); + sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2); + } + return d * (sumi1 + sumi2); #else assert(false); - return 0.f; #endif } @@ -8061,8 +8320,7 @@ template static void template static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, - const sycl::nd_item<3> &item_ct1, - const uint32_t *iq3xxs_grid_ptr=nullptr, const uint64_t *ksigns64_ptr=nullptr) { + const sycl::nd_item<3> &item_ct1) { const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); @@ -8106,10 +8364,11 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_ } template -static void mul_mat_vec_q_iq2_xxs_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, - const sycl::nd_item<3> &item_ct1, - const uint64_t *iq2xxs_grid_ptr, const uint8_t *ksigns_iq2xs_ptr, - const uint8_t *kmask_iq2xs_ptr ) { +static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); @@ -8137,7 +8396,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void * __restrict__ vx, const void (item_ct1.get_local_id(2) % (qi / vdr)); // x block quant index when casting the quants to int - tmp += vec_dot_iq2_xxs_q8_1(&x[ibx], &y[iby], iqs, iq2xxs_grid_ptr, ksigns_iq2xs_ptr, kmask_iq2xs_ptr); + tmp += vec_dot_iq2_xxs_q8_1(&x[ibx], &y[iby], iqs, iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs); } // sum up partial sums and write back result @@ -8153,9 +8412,11 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void * __restrict__ vx, const void } template -static void mul_mat_vec_q_iq2_xs_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, - const sycl::nd_item<3> &item_ct1, - const uint64_t *iq2xs_grid_ptr, const uint64_t *ksigns64_ptr ) { +static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); @@ -8183,7 +8444,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void * __restrict__ vx, const void * (item_ct1.get_local_id(2) % (qi / vdr)); // x block quant index when casting the quants to int - tmp += vec_dot_iq2_xs_q8_1(&x[ibx], &y[iby], iqs, iq2xs_grid_ptr, ksigns64_ptr); + tmp += vec_dot_iq2_xs_q8_1(&x[ibx], &y[iby], iqs, iq2xs_grid, ksigns64); } // sum up partial sums and write back result @@ -8199,9 +8460,11 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void * __restrict__ vx, const void * } template -static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, - const sycl::nd_item<3> &item_ct1, - const uint32_t *iq3xxs_grid_ptr, const uint64_t 
*ksigns64_ptr ) { +static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); @@ -8229,7 +8492,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void (item_ct1.get_local_id(2) % (qi / vdr)); // x block quant index when casting the quants to int - tmp += vec_dot_iq3_xxs_q8_1(&x[ibx], &y[iby], iqs, iq3xxs_grid_ptr, ksigns64_ptr); + tmp += vec_dot_iq2_s_q8_1(&x[ibx], &y[iby], iqs); } // sum up partial sums and write back result @@ -8245,9 +8508,11 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void } template -static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, - const sycl::nd_item<3> &item_ct1, - const uint32_t *iq3s_grid_ptr, const uint64_t *ksigns64_ptr ) { +static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); @@ -8275,7 +8540,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void * (item_ct1.get_local_id(2) % (qi / vdr)); // x block quant index when casting the quants to int - tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid_ptr, ksigns64_ptr); + tmp += vec_dot_iq3_xxs_q8_1(&x[ibx], &y[iby], iqs, iq3xxs_grid, ksigns64); } // sum up partial sums and write back result @@ -8291,9 +8556,11 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void * } template -static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, - const sycl::nd_item<3> &item_ct1, - const uint32_t *iq1s_grid_ptr, const uint64_t *ksigns64_ptr ) { +static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); @@ -8321,7 +8588,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void * (item_ct1.get_local_id(2) % (qi / vdr)); // x block quant index when casting the quants to int - tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_ptr, ksigns64_ptr); + tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid); } // sum up partial sums and write back result @@ -8336,6 +8603,200 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void * } } +template +static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = 
item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_gpu); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +template +static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq1_m_q8_1(&x[ibx], &y[iby], iqs); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +template +static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq4_nl_q8_1(&x[ibx], &y[iby], iqs); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + + +template +static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + 
+ if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq4_xs_q8_1(&x[ibx], &y[iby], iqs); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + + template static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows, const sycl::nd_item<3> &item_ct1) { @@ -8897,64 +9358,71 @@ static void k_sum_rows_f32(const float * x, float * dst, const int ncols, } } + template -static inline void swap(T & a, T & b) { +static inline void ggml_sycl_swap(T & a, T & b) { T tmp = a; a = b; b = tmp; } -template -static void k_argsort_f32_i32(const float * x, int * dst, const int ncols, - const sycl::nd_item<3> &item_ct1) { +template +__dpct_inline__ static void +k_argsort_f32_i32(const float *x, int *dst, const int ncols, int ncols_pad, + const sycl::nd_item<3> &item_ct1, uint8_t *dpct_local) { // bitonic sort int col = item_ct1.get_local_id(2); int row = item_ct1.get_group(1); - if (col >= ncols) return; + if (col >= ncols_pad) { + return; + } const float * x_row = x + row * ncols; - int * dst_row = dst + row * ncols; + auto dst_row = (int *)dpct_local; // initialize indices - if (col < ncols) { - dst_row[col] = col; - } - /* - DPCT1065:58: Consider replacing sycl::nd_item::barrier() with - sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better - performance if there is no access to global memory. - */ - item_ct1.barrier(); + dst_row[col] = col; - for (int k = 2; k <= ncols; k *= 2) { + item_ct1.barrier(sycl::access::fence_space::local_space); + + for (int k = 2; k <= ncols_pad; k *= 2) { for (int j = k / 2; j > 0; j /= 2) { int ixj = col ^ j; if (ixj > col) { if ((col & k) == 0) { - if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { - swap(dst_row[col], dst_row[ixj]); + if (dst_row[col] >= ncols || + (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ? + x_row[dst_row[col]] > x_row[dst_row[ixj]] : + x_row[dst_row[col]] < x_row[dst_row[ixj]])) + ) { + ggml_sycl_swap(dst_row[col], dst_row[ixj]); } } else { - if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { - swap(dst_row[col], dst_row[ixj]); + if (dst_row[ixj] >= ncols || + (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ? + x_row[dst_row[col]] < x_row[dst_row[ixj]] : + x_row[dst_row[col]] > x_row[dst_row[ixj]])) + ) { + ggml_sycl_swap(dst_row[col], dst_row[ixj]); } } } /* - DPCT1118:11: SYCL group functions and algorithms must be encountered + DPCT1118:1: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code. 
*/ - /* - DPCT1065:59: Consider replacing sycl::nd_item::barrier() with - sycl::nd_item::barrier(sycl::access::fence_space::local_space) for - better performance if there is no access to global memory. - */ - item_ct1.barrier(); + item_ct1.barrier(sycl::access::fence_space::local_space); } } + + // copy the result to dst without the padding + if (col < ncols) { + dst[row * ncols + col] = dst_row[col]; + } } + static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past, const sycl::nd_item<3> &item_ct1) { const int col = item_ct1.get_local_range(1) * item_ct1.get_group(1) + @@ -9933,28 +10401,64 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k, #endif } +template +static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq1_s( + vx, y, item_ct1, iq1s_grid_gpu + ); + }); + }); + } +} + +template +static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq1_m( + vx, y, item_ct1, iq1s_grid_gpu + ); + }); + }); + } +} template static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k, dpct::queue_ptr stream) { const int nb = k / QK_K; { - dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); stream->submit([&](sycl::handler &cgh) { - auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0]; - auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0]; - auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0]; - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq2_xxs( - vx, y, item_ct1, iq2xxs_grid_ptr_ct1, - ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1); + vx, y, item_ct1, iq2xxs_grid, + ksigns_iq2xs, kmask_iq2xs); }); }); } @@ -9965,48 +10469,58 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k, dpct::queue_ptr stream) { const int nb = k / QK_K; { - dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); stream->submit([&](sycl::handler &cgh) { - auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0]; - auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0]; - auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0]; - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq2_xs( - vx, y, item_ct1, iq2xs_grid_ptr_ct1, - ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1); + vx, y, item_ct1, iq2xs_grid, + ksigns_iq2xs, kmask_iq2xs); }); }); } } template -static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k, - dpct::queue_ptr stream) { +static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { const int nb = k / QK_K; { - dpct::has_capability_or_fail(stream->get_device(), 
{sycl::aspect::fp16}); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0]; - auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0]; - auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0]; + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq2_s(vx, y, item_ct1); + }); + }); + } +} + +template +static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq3_xxs( - vx, y, item_ct1, iq3xxs_grid_ptr_ct1, - ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1); + vx, y, item_ct1, iq3xxs_grid, + ksigns_iq2xs, kmask_iq2xs); }); }); } @@ -10017,53 +10531,68 @@ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k, dpct::queue_ptr stream) { const int nb = k / QK_K; { - dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); stream->submit([&](sycl::handler &cgh) { - auto iq3s_grid_ptr_ct1 = &iq3s_grid[0]; - auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0]; - auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0]; - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq3_s( - vx, y, item_ct1, iq3s_grid_ptr_ct1, - ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1); + vx, y, item_ct1, kmask_iq2xs, iq3s_grid); }); }); } } template -static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k, - dpct::queue_ptr stream) { - const int nb = k / QK_K; - { +static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = (k + QK_K - 1) / QK_K; +#if QK_K == 64 + dequantize_row_iq4_nl_sycl(vx, y, k, stream); +#else + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0]; - auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0]; - auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0]; - - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq1_s( - vx, y, item_ct1, iq1s_grid_ptr_ct1, - ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1); - }); - }); - } + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq4_xs(vx, y, item_ct1); + }); + }); + } +#endif } + +template +static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = (k + QK_K - 1) / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq4_nl(vx, y, item_ct1); + }); + }); + } +} + + + template static void 
convert_unary_sycl(const void *__restrict__ vx, dst_t *__restrict__ y, const int k, @@ -10108,16 +10637,24 @@ static to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) try { return dequantize_row_q5_K_sycl; case GGML_TYPE_Q6_K: return dequantize_row_q6_K_sycl; + case GGML_TYPE_IQ1_S: + return dequantize_row_iq1_s_sycl; + case GGML_TYPE_IQ1_M: + return dequantize_row_iq1_m_sycl; case GGML_TYPE_IQ2_XXS: return dequantize_row_iq2_xxs_sycl; case GGML_TYPE_IQ2_XS: return dequantize_row_iq2_xs_sycl; + case GGML_TYPE_IQ2_S: + return dequantize_row_iq2_s_sycl; case GGML_TYPE_IQ3_XXS: return dequantize_row_iq3_xxs_sycl; case GGML_TYPE_IQ3_S: return dequantize_row_iq3_s_sycl; - case GGML_TYPE_IQ1_S: - return dequantize_row_iq1_s_sycl; + case GGML_TYPE_IQ4_XS: + return dequantize_row_iq4_xs_sycl; + case GGML_TYPE_IQ4_NL: + return dequantize_row_iq4_nl_sycl; case GGML_TYPE_F32: return convert_unary_sycl; default: @@ -10152,16 +10689,24 @@ static to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) { return dequantize_row_q5_K_sycl; case GGML_TYPE_Q6_K: return dequantize_row_q6_K_sycl; + case GGML_TYPE_IQ1_S: + return dequantize_row_iq1_s_sycl; + case GGML_TYPE_IQ1_M: + return dequantize_row_iq1_m_sycl; case GGML_TYPE_IQ2_XXS: return dequantize_row_iq2_xxs_sycl; case GGML_TYPE_IQ2_XS: return dequantize_row_iq2_xs_sycl; + case GGML_TYPE_IQ2_S: + return dequantize_row_iq2_s_sycl; case GGML_TYPE_IQ3_XXS: return dequantize_row_iq3_xxs_sycl; case GGML_TYPE_IQ3_S: return dequantize_row_iq3_s_sycl; - case GGML_TYPE_IQ1_S: - return dequantize_row_iq1_s_sycl; + case GGML_TYPE_IQ4_XS: + return dequantize_row_iq4_xs_sycl; + case GGML_TYPE_IQ4_NL: + return dequantize_row_iq4_nl_sycl; case GGML_TYPE_F16: return convert_unary_sycl; default: @@ -10624,19 +11169,13 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - stream->submit([&](sycl::handler &cgh) { - auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0]; - auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0]; - auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0]; - cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q_iq2_xxs_q8_1( - vx, vy, dst, ncols, nrows, item_ct1, - iq2xxs_grid_ptr_ct1, ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -10661,8 +11200,32 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy, [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q_iq2_xs_q8_1( - vx, vy, dst, ncols, nrows, item_ct1, - iq2xs_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0]; + auto ksigns64_ptr_ct1 = &ksigns64[0]; + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q_iq2_s_q8_1( + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ 
-10687,8 +11250,7 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy, [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q_iq3_xxs_q8_1( - vx, vy, dst, ncols, nrows, item_ct1, - iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -10706,15 +11268,13 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy, stream->submit([&](sycl::handler &cgh) { auto iq3s_grid_ptr_ct1 = &iq3s_grid[0]; - auto ksigns64_ptr_ct1 = &ksigns64[0]; cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q_iq3_s_q8_1( - vx, vy, dst, ncols, nrows, item_ct1, - iq3s_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -10739,8 +11299,72 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy, [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q_iq1_s_q8_1( - vx, vy, dst, ncols, nrows, item_ct1, - iq1s_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q_iq1_m_q8_1( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK4_NL == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q_iq4_nl_q8_1( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q_iq4_xs_q8_1( + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -12364,36 +12988,54 @@ static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols, }); } +static int next_power_of_2(int x) { + int n = 1; + while (n < x) { + n *= 2; + } + return n; +} + static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols, const int nrows, ggml_sort_order order, dpct::queue_ptr stream) { // bitonic sort requires ncols to be 
power of 2 - GGML_ASSERT((ncols & (ncols - 1)) == 0); + const int ncols_pad = next_power_of_2(ncols); - const sycl::range<3> block_dims(1, 1, ncols); + const sycl::range<3> block_dims(1, 1, ncols_pad); const sycl::range<3> block_nums(1, nrows, 1); + const size_t shared_mem = ncols_pad * sizeof(int); + + // GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb); + if (order == GGML_SORT_ORDER_ASC) { - /* - DPCT1049:44: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. - */ - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - k_argsort_f32_i32(x, dst, ncols, item_ct1); - }); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor dpct_local_acc_ct1( + sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_argsort_f32_i32( + x, dst, ncols, ncols_pad, item_ct1, + dpct_local_acc_ct1.get_multi_ptr() + .get()); + }); + }); } else if (order == GGML_SORT_ORDER_DESC) { - /* - DPCT1049:45: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. - */ - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - k_argsort_f32_i32(x, dst, ncols, item_ct1); - }); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor dpct_local_acc_ct1( + sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_argsort_f32_i32( + x, dst, ncols, ncols_pad, item_ct1, + dpct_local_acc_ct1.get_multi_ptr() + .get()); + }); + }); } else { GGML_ASSERT(false); } @@ -13521,8 +14163,12 @@ static int64_t get_row_rounding(ggml_type type, const std::array= VER_GEN9 ? 128 : 64; case GGML_TYPE_IQ3_S: return max_compute_capability >= VER_GEN9 ? 128 : 64; @@ -13541,11 +14187,20 @@ inline void ggml_sycl_op_mul_mat_vec_q( const int64_t src1_ncols, const int64_t src1_padded_row_size, const dpct::queue_ptr &stream) { - GGML_ASSERT(ggml_nrows(src1) == 1); + const int64_t ne10 = src1->ne[0]; + GGML_ASSERT(ne10 % QK8_1 == 0); const int64_t ne00 = src0->ne[0]; const int64_t row_diff = row_high - row_low; + int id; + SYCL_CHECK( + CHECK_TRY_ERROR(id = get_current_device_id())); + + // the main device has a larger memory buffer to hold the results from all GPUs + // nrows_dst == nrows of the matrix that the kernel writes into + const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? 
ne00 : row_diff; + switch (src0->type) { case GGML_TYPE_Q4_0: mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); @@ -13577,20 +14232,32 @@ inline void ggml_sycl_op_mul_mat_vec_q( case GGML_TYPE_Q6_K: mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); break; + case GGML_TYPE_IQ1_S: + mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ1_M: + mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_IQ2_XXS: mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); break; case GGML_TYPE_IQ2_XS: mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); break; + case GGML_TYPE_IQ2_S: + mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_IQ3_XXS: mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); break; case GGML_TYPE_IQ3_S: mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); break; - case GGML_TYPE_IQ1_S: - mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + case GGML_TYPE_IQ4_NL: + mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ4_XS: + mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); break; default: GGML_ASSERT(false); @@ -13672,6 +14339,7 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec( convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); break; default: + printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type); GGML_ASSERT(false); break; } @@ -14526,8 +15194,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, src1_padded_col_size = (i0 * ne11 + src1_col_0) * ne10; } // do the computation - op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i, - dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream); + SYCL_CHECK(CHECK_TRY_ERROR(op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i, + dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream))); /* DPCT1010:93: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. 
You need to @@ -15108,7 +15776,14 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 #ifdef GGML_SYCL_FORCE_DMMV const bool use_mul_mat_vec_q = false; #else - const bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1; + bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1; + use_mul_mat_vec_q = use_mul_mat_vec_q || + (src0->type == GGML_TYPE_IQ2_XXS) || (src0->type == GGML_TYPE_IQ2_XS) || (src0->type == GGML_TYPE_IQ2_S) || + (src0->type == GGML_TYPE_IQ3_XXS) || (src0->type == GGML_TYPE_IQ3_S) || + (src0->type == GGML_TYPE_IQ4_NL) || (src0->type == GGML_TYPE_IQ4_XS) || + (src0->type == GGML_TYPE_IQ1_S) || (src0->type == GGML_TYPE_IQ1_M); + + #endif // GGML_SYCL_FORCE_DMMV if (use_mul_mat_vec_q) { @@ -16968,9 +17643,14 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons return false; } ggml_type a_type = a->type; - if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ2_S || - a_type == GGML_TYPE_IQ4_XS) { - return false; + if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ4_XS || + a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ3_S || + a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ2_S || + a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ1_M + ) { + if (b->ne[1] == 1 && ggml_nrows(b) > 1) { + return false; + } } return true; } break;
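
Reviewer note (illustrative, not part of the patch): the IQ4_NL/IQ4_XS paths above all funnel through the new kvalues_iq4nl codebook and the get_int_from_table_16() helper, which expands one 32-bit word holding eight packed 4-bit indices into two 32-bit words of int8_t codebook values so they can be fed straight into dpct::dp4a(). Below is a minimal host-side C++ sketch of that expansion, useful for sanity-checking the byte/nibble ordering; the _host suffix, the test word and the main() driver are ours, only the table and the bit manipulation are taken from the patch.

#include <cstdint>
#include <cstdio>

// codebook copied from the patch
static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10,
                                            1,   13,  25,  38,  53,  69,  89, 113};

// host mirror of get_int_from_table_16(): the low nibble of each input byte selects
// the codebook entries packed into val1, the high nibble those packed into val2
static void get_int_from_table_16_host(uint32_t q4, const uint8_t * values,
                                       int & val1, int & val2) {
    uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
    aux32 = q4 & 0x0f0f0f0f;
    uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
    uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
    val1 = (int)(v1 | ((uint32_t)v2 << 16));
    aux32 = (q4 >> 4) & 0x0f0f0f0f;
    v1 = values[q8[0]] | (values[q8[1]] << 8);
    v2 = values[q8[2]] | (values[q8[3]] << 8);
    val2 = (int)(v1 | ((uint32_t)v2 << 16));
}

int main() {
    const uint32_t q4 = 0x76543210;   // bytes 0x10,0x32,0x54,0x76 -> nibble indices 0..7
    int v1, v2;
    get_int_from_table_16_host(q4, (const uint8_t *)kvalues_iq4nl, v1, v2);
    const int8_t * a = (const int8_t *)&v1;   // low nibbles 0,2,4,6 -> -127 -83 -49 -22
    const int8_t * b = (const int8_t *)&v2;   // high nibbles 1,3,5,7 -> -104 -65 -35 -10
    printf("val1: %d %d %d %d\n", a[0], a[1], a[2], a[3]);
    printf("val2: %d %d %d %d\n", b[0], b[1], b[2], b[3]);
    return 0;
}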
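
Reviewer note (illustrative, not part of the patch): the argsort rework above pads ncols up to next_power_of_2(ncols) because a bitonic network needs a power-of-two length, keeps the index permutation in local memory, treats indices >= ncols as padding that compares past all real elements, and copies only the first ncols indices back to dst. Below is a small host-side C++ model of that padded compare-exchange rule for the ascending case; next_power_of_2() and the swap conditions mirror the patch, while argsort_asc_padded(), the serial loops and the test row are ours.

#include <cstdio>
#include <utility>
#include <vector>

static int next_power_of_2(int x) {   // same helper as in the patch
    int n = 1;
    while (n < x) n *= 2;
    return n;
}

// host model of the GGML_SORT_ORDER_ASC kernel path: one GPU work-item per column
// becomes one loop iteration; padding indices (>= ncols) sink to the tail and are dropped
static std::vector<int> argsort_asc_padded(const std::vector<float> & x) {
    const int ncols     = (int) x.size();
    const int ncols_pad = next_power_of_2(ncols);

    std::vector<int> idx(ncols_pad);
    for (int col = 0; col < ncols_pad; ++col) idx[col] = col;

    for (int k = 2; k <= ncols_pad; k *= 2) {
        for (int j = k / 2; j > 0; j /= 2) {
            for (int col = 0; col < ncols_pad; ++col) {
                const int ixj = col ^ j;
                if (ixj <= col) continue;                    // each pair handled once
                const bool col_pad = idx[col] >= ncols;
                const bool ixj_pad = idx[ixj] >= ncols;
                bool do_swap;
                if ((col & k) == 0) {                        // ascending sub-sequence
                    do_swap = col_pad || (!ixj_pad && x[idx[col]] > x[idx[ixj]]);
                } else {                                     // descending sub-sequence
                    do_swap = ixj_pad || (!col_pad && x[idx[col]] < x[idx[ixj]]);
                }
                if (do_swap) std::swap(idx[col], idx[ixj]);
            }
        }
    }
    idx.resize(ncols);   // drop the padding, like the final "copy without padding" step
    return idx;
}

int main() {
    const std::vector<float> row = {0.3f, -1.0f, 2.5f, 0.0f, 1.5f};   // ncols = 5 -> padded to 8
    for (int i : argsort_asc_padded(row)) printf("%d ", i);           // expected: 1 3 0 4 2
    printf("\n");
    return 0;
}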