opencl : remove obsolete files (skip) (ggml/1200)

2025-06-06 17:21:33 +00:00 · 2025-04-24 18:41:17 +03:00 · 2025-04-24 18:41:17 +03:00 · 337becefb9
commit 337becefb9
parent 11ae30c19e
10 changed files with 0 additions and 5475 deletions
--- a/ggml/src/ggml-opencl/kernels/ggml-opencl.cl
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl.cl
--- a/ggml/src/ggml-opencl/kernels/ggml-opencl_cvt.cl
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_cvt.cl
@ -1,106 +0,0 @@
-//------------------------------------------------------------------------------
-// This file is contains additional kernels for data conversion.
-// These kernels are used when loading the model, so its performance is less
-// important.
-//------------------------------------------------------------------------------
-#ifdef cl_khr_fp16
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#elif defined(cl_amd_fp16)
-#pragma OPENCL EXTENSION cl_amd_fp16 : enable
-#else
-#error "Half precision floating point not supportedby OpenCL implementation on your device."
-#endif
-
-#ifdef cl_khr_subgroups
-#pragma OPENCL EXTENSION cl_khr_subgroups : enable
-#elif defined(cl_intel_subgroups)
-#pragma OPENCL EXTENSION cl_intel_subgroups : enable
-#else
-#error "Subgroup not supported on your device."
-#endif
-
-#ifdef cl_intel_required_subgroup_size
-// Always use subgroup size of 32 on Intel.
-#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
-#define INTEL_GPU 1
-#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
-#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
-#elif defined(cl_qcom_reqd_sub_group_size)
-// Always use subgroups size of 64 on Adreno.
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
-#define ADRENO_GPU 1
-#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
-#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
-#else
-// TODO: do not know how to choose subgroup size on other GPUs.
-#error "Selecting subgroup size is not supported on your device."
-#endif
-
-#define QK4_0                   32
-#define QR4_0                   2
-#define QK4_1                   32
-#define QR4_1                   2
-#define QK5_0                   32
-#define QR5_0                   2
-#define QK5_1                   32
-#define QR5_1                   2
-#define QK8_0                   32
-#define QR8_0                   1
-#define QK_K                    256
-#define K_QUANTS_PER_ITERATION  2
-
-typedef char int8_t;
-typedef uchar uint8_t;
-typedef short int16_t;
-typedef ushort uint16_t;
-typedef int int32_t;
-typedef uint uint32_t;
-
-//------------------------------------------------------------------------------
-// block_q4_0
-//------------------------------------------------------------------------------
-struct block_q4_0
-{
-    half d;
-    uint8_t qs[QK4_0 / 2];
-};
-
-//------------------------------------------------------------------------------
-// mul_vec_q_n_f32_flat_noshuffle
-//
-// This variation uses flat arrays (struct of arrays, SOA) representation for
-// quant tensors. It also uses non shuffled bit order for weights.
-//
-// The shuffled version is kept in the original file because moving it here
-// seems to result in worse performance for adreno.
-//------------------------------------------------------------------------------
-
-kernel void kernel_convert_block_q4_0_noshuffle(
-    global struct block_q4_0 * src0,
-    global uchar * dst_q,
-    global half  * dst_d
-) {
-    global struct block_q4_0 * b = (global struct block_q4_0 *) src0 + get_global_id(0);
-    global uchar * q = (global uchar *) dst_q + QK4_0/2*get_global_id(0);
-    global half  * d = (global half *) dst_d + get_global_id(0);
-
-    *d = b->d;
-    for (int i = 0; i < QK4_0/4; ++i) {
-        uchar x0 = b->qs[2*i + 0];
-        uchar x1 = b->qs[2*i + 1];
-
-        q[i + 0      ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
-        q[i + QK4_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
-
-#ifdef ADRENO_GPU
-        // Workaround for adreno - must have the following printf statement for
-        // the kernel to work properly. Otherwise it produces incorrect result.
-        // convert_uchar above also seems necessary.
-        // Compare against a large number so that it does not print anything.
-        // get_sub_group_local_id() also works.
-        if (get_global_id(0) == 65536*4096) {
-            printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
-        }
-#endif
-    }
-}
--- a/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl
@ -1,268 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#pragma OPENCL EXTENSION cl_khr_subgroups : enable
-
-#ifdef cl_qcom_reqd_sub_group_size
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
-#define ADRENO_GPU 1
-#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
-#endif
-
-// assume
-#define QK4_0 32
-#define N_SIMDGROUP 4
-
-#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
-    float shared_y; \
-    shared_y = sub_group_broadcast(y.s0, 0); \
-    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s1, 0); \
-    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s2, 0); \
-    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s3, 0); \
-    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s4, 0); \
-    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s5, 0); \
-    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s6, 0); \
-    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s7, 0); \
-    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s0, 1); \
-    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s1, 1); \
-    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s2, 1); \
-    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s3, 1); \
-    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s4, 1); \
-    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s5, 1); \
-    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s6, 1); \
-    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s7, 1); \
-    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-
-
-#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
-    shared_y = sub_group_broadcast(y.s0, 2); \
-    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s1, 2); \
-    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s2, 2); \
-    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s3, 2); \
-    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s4, 2); \
-    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s5, 2); \
-    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s6, 2); \
-    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s7, 2); \
-    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s0, 3); \
-    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s1, 3); \
-    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s2, 3); \
-    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s3, 3); \
-    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s4, 3); \
-    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s5, 3); \
-    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s6, 3); \
-    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s7, 3); \
-    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-
-
-#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
-    float8 shared_y; \
-    shared_y = sub_group_broadcast(y, 0); \
-    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
-    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
-    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
-    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
-    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
-    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
-    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
-    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
-    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
-    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
-    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
-    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
-    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
-    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
-    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
-    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
-    shared_y = sub_group_broadcast(y, 1); \
-    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
-    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
-    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
-    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
-    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
-    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
-    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
-    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
-    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
-    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
-    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
-    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
-    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
-    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
-    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
-    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
-
-
-#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
-    shared_y = sub_group_broadcast(y, 2); \
-    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
-    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
-    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
-    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
-    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
-    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
-    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
-    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
-    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
-    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
-    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
-    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
-    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
-    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
-    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
-    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
-    shared_y = sub_group_broadcast(y, 3); \
-    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
-    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
-    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
-    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
-    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
-    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
-    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
-    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
-    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
-    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
-    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
-    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
-    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
-    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
-    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
-    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
-
-#ifdef ADRENO_GPU
-REQD_SUBGROUP_SIZE_64
-#endif
-__kernel void kernel_gemv_noshuffle(
-        __read_only  image1d_buffer_t src0_q,  // quantized A
-        global half2  * src0_d,  // A scales
-        __read_only  image1d_buffer_t src1,    // B
-        ulong offset1,            // offset to B (0)
-        global float * dst,     // C
-        ulong offsetd,            // offset to C (0)
-        uint K,               // K
-        int ne01,               // M
-        int ne02,               // 1
-        int ne10,               // K
-        int ne12,               // 1
-        int ne0,                // M
-        int ne1,                // N
-        int r2,                 // 1
-        int r3)
-{
-    uint groupId = get_local_id(1);
-    uint gid     = get_global_id(0);
-    ushort slid    = get_sub_group_local_id();
-
-    __private uint4     regA;
-    __private half2     regS;
-    __private float8    regB;
-
-    __private float2 totalSum = (float2)(0.0f);
-
-    // loop along K in block granularity, skip 4 blocks every iter
-    for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
-        regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
-        // first 4 fibers in each wave load 8 B values to its private scope
-        if (slid < 4) {
-            regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
-            regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
-        }
-
-        // load half weights for two blocks in consecutive rows
-        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
-        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
-        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
-        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
-#ifdef VECTOR_SUB_GROUP_BROADCAT
-        dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
-#else
-        dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
-#endif // VECTOR_SUB_GROUP_BROADCAT
-
-        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
-        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
-        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
-        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
-#ifdef VECTOR_SUB_GROUP_BROADCAT
-        dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
-#else
-        dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
-#endif // VECTOR_SUB_GROUP_BROADCAT
-    }
-
-    // reduction in local memory, assumes #wave=4
-    __local float2 reduceLM[SIMDGROUP_WIDTH * 3];
-    if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
-    if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
-    if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
-    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
-    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
-
-    // 2 outputs per fiber in wave 0
-    if (groupId == 0) {
-        dst = (global float*)((global char*)dst + offsetd);
-        vstore2(totalSum, 0, &(dst[gid * 2]));
-    }
-
-}
--- a/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle_general.cl
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle_general.cl
@ -1,274 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#pragma OPENCL EXTENSION cl_khr_subgroups : enable
-
-#ifdef cl_qcom_reqd_sub_group_size
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
-#define ADRENO_GPU 1
-#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
-#endif
-
-// assume
-#define QK4_0 32
-#define N_SIMDGROUP 4
-
-#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
-    float shared_y; \
-    shared_y = sub_group_broadcast(y.s0, 0); \
-    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s1, 0); \
-    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s2, 0); \
-    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s3, 0); \
-    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s4, 0); \
-    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s5, 0); \
-    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s6, 0); \
-    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s7, 0); \
-    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s0, 1); \
-    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s1, 1); \
-    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s2, 1); \
-    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s3, 1); \
-    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s4, 1); \
-    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s5, 1); \
-    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s6, 1); \
-    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s7, 1); \
-    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-
-
-#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
-    shared_y = sub_group_broadcast(y.s0, 2); \
-    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s1, 2); \
-    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s2, 2); \
-    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s3, 2); \
-    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s4, 2); \
-    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s5, 2); \
-    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s6, 2); \
-    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s7, 2); \
-    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s0, 3); \
-    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s1, 3); \
-    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s2, 3); \
-    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s3, 3); \
-    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s4, 3); \
-    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s5, 3); \
-    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s6, 3); \
-    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
-    shared_y = sub_group_broadcast(y.s7, 3); \
-    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
-    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
-
-
-#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
-    float8 shared_y; \
-    shared_y = sub_group_broadcast(y, 0); \
-    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
-    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
-    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
-    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
-    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
-    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
-    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
-    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
-    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
-    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
-    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
-    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
-    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
-    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
-    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
-    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
-    shared_y = sub_group_broadcast(y, 1); \
-    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
-    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
-    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
-    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
-    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
-    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
-    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
-    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
-    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
-    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
-    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
-    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
-    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
-    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
-    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
-    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
-
-
-#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
-    shared_y = sub_group_broadcast(y, 2); \
-    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
-    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
-    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
-    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
-    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
-    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
-    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
-    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
-    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
-    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
-    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
-    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
-    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
-    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
-    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
-    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
-    shared_y = sub_group_broadcast(y, 3); \
-    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
-    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
-    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
-    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
-    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
-    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
-    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
-    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
-    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
-    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
-    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
-    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
-    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
-    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
-    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
-    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
-
-#ifdef ADRENO_GPU
-REQD_SUBGROUP_SIZE_64
-#endif
-__kernel void kernel_gemv_noshuffle(
-        __read_only  image1d_buffer_t src0_q,  // quantized A
-        global half2  * src0_d,  // A scales
-        __read_only  image1d_buffer_t src1,    // B
-        ulong offset1,            // offset to B (0)
-        global float * dst,     // C
-        ulong offsetd,            // offset to C (0)
-        int ne00,               // K
-        int ne01,               // M
-        int ne02,               // 1
-        int ne10,               // K
-        int ne12,               // 1
-        int ne0,                // M
-        int ne1,                // N
-        int r2,                 // 1
-        int r3)
-{
-    uint groupId = get_local_id(1);
-    uint gid     = get_global_id(0);
-    ushort slid    = get_sub_group_local_id();
-
-    uint K = ne00;
-    uint M = ne01;
-
-    uint LINE_STRIDE_A = M / 2;
-    uint BLOCK_STRIDE_A = N_SIMDGROUP * M;
-
-    __private uint4     regA;
-    __private half2     regS;
-    __private float8    regB;
-
-    __private float2 totalSum = (float2)(0.0f);
-
-    // loop along K in block granularity, skip 4 blocks every iter
-    for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
-        regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
-        // first 4 fibers in each wave load 8 B values to its private scope
-        if (slid < 4) {
-            regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
-            regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
-        }
-
-        // load half weights for two blocks in consecutive rows
-        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
-        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
-        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
-        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
-#ifdef VECTOR_SUB_GROUP_BROADCAT
-        dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
-#else
-        dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
-#endif // VECTOR_SUB_GROUP_BROADCAT
-
-        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
-        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
-        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
-        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
-#ifdef VECTOR_SUB_GROUP_BROADCAT
-        dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
-#else
-        dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
-#endif // VECTOR_SUB_GROUP_BROADCAT
-    }
-
-    // reduction in local memory, assumes #wave=4
-    __local float2 reduceLM[SIMDGROUP_WIDTH * 3];
-    if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
-    if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
-    if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
-    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
-    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
-
-    // 2 outputs per fiber in wave 0
-    if (groupId == 0) {
-        dst = (global float*)((global char*)dst + offsetd);
-        vstore2(totalSum, 0, &(dst[gid * 2]));
-    }
-
-}
--- a/ggml/src/ggml-opencl/kernels/ggml-opencl_im2col.cl
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_im2col.cl
@ -1,146 +0,0 @@
-#ifdef cl_khr_fp16
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#elif defined(cl_amd_fp16)
-#pragma OPENCL EXTENSION cl_amd_fp16 : enable
-#else
-#error "Half precision floating point not supportedby OpenCL implementation on your device."
-#endif
-
-#ifdef cl_khr_subgroups
-#pragma OPENCL EXTENSION cl_khr_subgroups : enable
-#elif defined(cl_intel_subgroups)
-#pragma OPENCL EXTENSION cl_intel_subgroups : enable
-#else
-#error "Subgroup not supported on your device."
-#endif
-
-#ifdef cl_intel_required_subgroup_size
-// Always use subgroup size of 32 on Intel.
-#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
-#define INTEL_GPU 1
-#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
-#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
-#elif defined(cl_qcom_reqd_sub_group_size)
-// Always use subgroups size of 64 on Adreno.
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
-#define ADRENO_GPU 1
-#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
-#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
-#else
-// TODO: do not know how to choose subgroup size on other GPUs.
-#error "Selecting subgroup size is not supported on your device."
-#endif
-
-kernel void kernel_im2col_f32(
-        global float * src1,
-        ulong offset1,
-        global float * dst,
-        ulong offsetd,
-        ulong batch_offset,
-        ulong delta_offset,
-        long IW,
-        long IH,
-        long IC,
-        long OW,
-        long OH,
-        long KW,
-        long KH,
-        long pelements,
-        long CHW,
-        int  s0,
-        int  s1,
-        int  p0,
-        int  p1,
-        int  d0,
-        int  d1
-) {
-    // threadIdx.x + blockIdx.x * blockDim.x
-    long i = get_global_id(0);
-    if (i >= pelements) {
-        return;
-    }
-
-    src1 = (global float*)((global char*)src1 + offset1);
-    dst = (global float*)((global char*)dst + offsetd);
-
-    long  ksize = OW * (KH > 1 ? KW : 1);
-    long  kx = i / ksize;
-    long  kd = kx * ksize;
-    long  ky = (i - kd) / OW;
-    long  ix = i % OW;
-
-    long  oh = get_group_id(1);
-    long  batch = get_group_id(2) / IC;
-    long  ic = get_group_id(2) % IC;
-
-    long iiw = ix * s0 + kx * d0 - p0;
-    long iih = oh * s1 + ky * d1 - p1;
-
-    long offset_dst =
-        ((batch * OH + oh) * OW + ix) * CHW +
-        (ic * (KW * KH) + ky * KW + kx);
-
-    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-        dst[offset_dst] = 0.0f;
-    } else {
-        long offset_src = ic * delta_offset + batch * batch_offset;
-        dst[offset_dst] = src1[offset_src + iih * IW + iiw];
-    }
-}
-
-kernel void kernel_im2col_f16(
-        global float * src1,
-        ulong offset1,
-        global half  * dst,
-        ulong offsetd,
-        ulong batch_offset,
-        ulong delta_offset,
-        long IW,
-        long IH,
-        long IC,
-        long OW,
-        long OH,
-        long KW,
-        long KH,
-        long pelements,
-        long CHW,
-        int  s0,
-        int  s1,
-        int  p0,
-        int  p1,
-        int  d0,
-        int  d1
-) {
-    long i = get_global_id(0);
-
-    if (i >= pelements) {
-        return;
-    }
-
-    src1 = (global float*)((global char*)src1 + offset1);
-    dst = (global half*)((global char*)dst + offsetd);
-
-    long  ksize = OW * (KH > 1 ? KW : 1);
-    long  kx = i / ksize;
-    long  kd = kx * ksize;
-    long  ky = (i - kd) / OW;
-    long  ix = i % OW;
-
-    long  oh = get_group_id(1);
-    long  batch = get_group_id(2) / IC;
-    long  ic = get_group_id(2) % IC;
-
-    long iiw = ix * s0 + kx * d0 - p0;
-    long iih = oh * s1 + ky * d1 - p1;
-
-    long offset_dst =
-        ((batch * OH + oh) * OW + ix) * CHW +
-        (ic * (KW * KH) + ky * KW + kx);
-
-    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-        dst[offset_dst] = 0.0f;
-    } else {
-        long offset_src = ic * delta_offset + batch * batch_offset;
-        dst[offset_dst] = src1[offset_src + iih * IW + iiw];
-    }
-}
--- a/ggml/src/ggml-opencl/kernels/ggml-opencl_mm.cl
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_mm.cl
--- a/ggml/src/ggml-opencl/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
@ -1,139 +0,0 @@
-// src0_q, src0_d, src1 are transposed as a preprocessing step
-// 4-bit weights are transposed in groups of 4 (unsigned short int)
-// consider weights originally "next to each other", now "on top of each other"
-// each fiber computes a 8x4 tile of output elements
-// using unshuffled weights
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
-
-#ifdef cl_qcom_reqd_sub_group_size
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
-#define ADRENO_GPU 1
-#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
-#endif
-
-#ifdef ADRENO_GPU
-REQD_SUBGROUP_SIZE_128
-#endif
-
-kernel void kernel_mul_mat_Ab_Bi_8x4(
-        global const ushort * src0_q,       // quantized A
-        global const half  * src0_d,        // A scales
-        __read_only image1d_buffer_t src1,  // B (1d image)
-        global float * dst,                 // C
-        int m,                              // M
-        int n,                              // N with padding
-        int k,                              // K
-        int n_no_padding                    // N without padding
-) {
-
-    int m_4 = m >> 2;
-    int n_4 = n >> 2;
-
-    int gy = get_global_id(0);
-    int gx = get_global_id(1);
-    int gx_2 = gx << 2;
-
-    half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0; // 8x4 output elements
-    half8 B; // registers for activations
-    half4 dequantized_weights; // registers for dequantized weights
-    __global const ushort* weight_ptr = src0_q + gx_2; // pointer for weights
-    __global const half* scale_ptr = src0_d + gx_2; // pointer for scales
-
-    for(int i=0; i<k; i+=4){ //loop through K dimension
-
-        B.s0123 = read_imageh(src1, gy*2 + (i)*(n_4));
-        B.s4567 = read_imageh(src1, gy*2 + (i)*(n_4)+1);
-
-        // keep (i/4) and (i/32) in parenthesis, rounds down
-        // load 4 consecutive groups of 4 weights
-        ushort4 bits4 = vload4(0, weight_ptr + (i/4)*(m)); // (i/4) because weights grouped in 4s
-
-        // load 4 consecutive scales
-        half4 scale = vload4(0, scale_ptr + (i/32)*(m));// (i/32) because 1 scale per 32 elements
-
-        // j=0
-        dequantized_weights.s0 = ((bits4.s0 & (0x000F)) - 8) * scale.s0; // dequantize a row of the 16 weights
-        dequantized_weights.s1 = ((bits4.s1 & (0x000F)) - 8) * scale.s1;
-        dequantized_weights.s2 = ((bits4.s2 & (0x000F)) - 8) * scale.s2;
-        dequantized_weights.s3 = ((bits4.s3 & (0x000F)) - 8) * scale.s3;
-        c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
-        c1 += B * dequantized_weights.s1;
-        c2 += B * dequantized_weights.s2;
-        c3 += B * dequantized_weights.s3;
-
-        // j=1
-        B.s0123 = read_imageh(src1, gy*2 + (i+1)*(n_4));
-        B.s4567 = read_imageh(src1, gy*2 + (i+1)*(n_4)+1);
-        dequantized_weights.s0 = (((bits4.s0 & (0x00F0)) >> 4) - 8) * scale.s0; // dequantize a row of the 16 weights
-        dequantized_weights.s1 = (((bits4.s1 & (0x00F0)) >> 4) - 8) * scale.s1;
-        dequantized_weights.s2 = (((bits4.s2 & (0x00F0)) >> 4) - 8) * scale.s2;
-        dequantized_weights.s3 = (((bits4.s3 & (0x00F0)) >> 4) - 8) * scale.s3;
-        c0 += B * dequantized_weights.s0; //vector-scalar multiplication to accumulate
-        c1 += B * dequantized_weights.s1;
-        c2 += B * dequantized_weights.s2;
-        c3 += B * dequantized_weights.s3;
-
-        // j=2
-        B.s0123 = read_imageh(src1, gy*2 + (i+2)*(n_4));
-        B.s4567 = read_imageh(src1, gy*2 + (i+2)*(n_4)+1);
-        dequantized_weights.s0 = (((bits4.s0 & (0x0F00)) >> 8) - 8) * scale.s0; // dequantize a row of the 16 weights
-        dequantized_weights.s1 = (((bits4.s1 & (0x0F00)) >> 8) - 8) * scale.s1;
-        dequantized_weights.s2 = (((bits4.s2 & (0x0F00)) >> 8) - 8) * scale.s2;
-        dequantized_weights.s3 = (((bits4.s3 & (0x0F00)) >> 8) - 8) * scale.s3;
-        c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
-        c1 += B * dequantized_weights.s1;
-        c2 += B * dequantized_weights.s2;
-        c3 += B * dequantized_weights.s3;
-
-        // j=3
-        B.s0123 = read_imageh(src1, gy*2 + (i+3)*(n_4));
-        B.s4567 = read_imageh(src1, gy*2 + (i+3)*(n_4)+1);
-        dequantized_weights.s0 = (((bits4.s0 & (0xF000)) >> 12) - 8) * scale.s0; // dequantize a row of the 16 weights
-        dequantized_weights.s1 = (((bits4.s1 & (0xF000)) >> 12) - 8) * scale.s1;
-        dequantized_weights.s2 = (((bits4.s2 & (0xF000)) >> 12) - 8) * scale.s2;
-        dequantized_weights.s3 = (((bits4.s3 & (0xF000)) >> 12) - 8) * scale.s3;
-        c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
-        c1 += B * dequantized_weights.s1;
-        c2 += B * dequantized_weights.s2;
-        c3 += B * dequantized_weights.s3;
-    }
-
-    int idx = (gy<<3)*m + (gx<<2); // vectorized store 16 elements
-
-    // conditional check if store is to a valid location. Required when N is not a multiple of 8
-    // if statements allow registers to be reused for each store
-    // provides a performance boost due to reduced register footprint, which increases number of concurrent waves
-    if(idx+3 < m*n_no_padding){
-        vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
-        idx += m;
-    }
-    if(idx+3 < m*n_no_padding){
-        vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
-        idx += m;
-    }
-    if(idx+3 < m*n_no_padding){
-        vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
-        idx += m;
-    }
-    if(idx+3 < m*n_no_padding){
-        vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
-        idx += m;
-    }
-    if(idx+3 < m*n_no_padding){
-        vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
-        idx += m;
-    }
-    if(idx+3 < m*n_no_padding){
-        vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
-        idx += m;
-    }
-    if(idx+3 < m*n_no_padding){
-        vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
-        idx += m;
-    }
-    if(idx+3 < m*n_no_padding){
-        vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
-    }
-}
--- a/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_16.cl
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_16.cl
@ -1,26 +0,0 @@
-// 16-bit transpose, loading/storing a 4x4 tile of elements
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-kernel void kernel_transpose_16(
-    __read_only image1d_buffer_t input,
-    __write_only image1d_buffer_t output,
-    const uint rows,
-    const uint cols
-) {
-
-    const int i = get_global_id(0);
-    const int j = get_global_id(1);
-    const int i_2 = i<<2;
-    const int j_2 = j<<2;
-
-    half4 temp0 = read_imageh(input, (j_2+0)*cols+i);
-    half4 temp1 = read_imageh(input, (j_2+1)*cols+i);
-    half4 temp2 = read_imageh(input, (j_2+2)*cols+i);
-    half4 temp3 = read_imageh(input, (j_2+3)*cols+i);
-
-    write_imageh(output, (i_2+0)*rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0));
-    write_imageh(output, (i_2+1)*rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
-    write_imageh(output, (i_2+2)*rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
-    write_imageh(output, (i_2+3)*rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
-}
--- a/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32.cl
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32.cl
@ -1,25 +0,0 @@
-// 32-bit transpose, loading/storing a 4x4 tile of elements
-
-kernel void kernel_transpose_32(
-    __read_only image1d_buffer_t input,
-    __write_only image1d_buffer_t output,
-    const uint rows,
-    const uint cols
-) {
-
-    const int i = get_global_id(0);
-    const int j = get_global_id(1);
-    const int i_2 = i<<2;
-    const int j_2 = j<<2;
-
-    float4 temp0 = read_imagef(input, (j_2+0)*cols+i);
-    float4 temp1 = read_imagef(input, (j_2+1)*cols+i);
-    float4 temp2 = read_imagef(input, (j_2+2)*cols+i);
-    float4 temp3 = read_imagef(input, (j_2+3)*cols+i);
-
-    write_imagef(output, (i_2+0)*rows+j, (float4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0));
-    write_imagef(output, (i_2+1)*rows+j, (float4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
-    write_imagef(output, (i_2+2)*rows+j, (float4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
-    write_imagef(output, (i_2+3)*rows+j, (float4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
-
-}
--- a/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32_16.cl
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32_16.cl
@ -1,35 +0,0 @@
-// 32-bit transpose, loading/storing a 4x4 tile of elements
-// Only used for activations
-// converts to FP16
-// also adds zero padding for non multiple of 8 prompt lengths
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-kernel void kernel_transpose_32_16(__read_only image1d_buffer_t input, __write_only image1d_buffer_t output, const uint rows, const uint cols, const uint padded_rows) {
-
-    const int i = get_global_id(0);
-    const int j = get_global_id(1);
-    const int i_2 = i<<2;
-    const int j_2 = j<<2;
-    half4 temp0 = {0,0,0,0}; // initialize outputs to 0
-    half4 temp1 = {0,0,0,0};
-    half4 temp2 = {0,0,0,0};
-    half4 temp3 = {0,0,0,0};
-
-    if((j_2+0)*cols+i*4+3 < rows*cols*16){ // only load from a valid location. Otherwise keep register data as 0
-        temp0 = read_imageh(input, (j_2+0)*cols+i);
-    }
-    if((j_2+1)*cols+i*4+3 < rows*cols*16){
-        temp1 = read_imageh(input, (j_2+1)*cols+i);
-    }
-    if((j_2+2)*cols+i*4+3 < rows*cols*16){
-        temp2 = read_imageh(input, (j_2+2)*cols+i);
-    }
-    if((j_2+3)*cols+i*4+3 < rows*cols*16){
-        temp3 = read_imageh(input, (j_2+3)*cols+i);
-    }
-
-    write_imageh(output, (i_2+0)*padded_rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0)); // no conditionals for output, includes zero padding
-    write_imageh(output, (i_2+1)*padded_rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
-    write_imageh(output, (i_2+2)*padded_rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
-    write_imageh(output, (i_2+3)*padded_rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
-}