diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 913a440f..a230bf9a 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -357,9 +357,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name) # Fetch KleidiAI sources: include(FetchContent) - set(KLEIDIAI_COMMIT_TAG "v1.3.0") + set(KLEIDIAI_COMMIT_TAG "v1.5.0") set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz") - set(KLEIDIAI_ARCHIVE_MD5 "060bd2dc64642b091f461cc8dd7426d9") + set(KLEIDIAI_ARCHIVE_MD5 "ea22e1aefb800e9bc8c74d91633cc58e") if (POLICY CMP0135) cmake_policy(SET CMP0135 NEW) diff --git a/ggml/src/ggml-cpu/kleidiai/kernels.cpp b/ggml/src/ggml-cpu/kleidiai/kernels.cpp index a8a59a88..aacc2bb5 100644 --- a/ggml/src/ggml-cpu/kleidiai/kernels.cpp +++ b/ggml/src/ggml-cpu/kleidiai/kernels.cpp @@ -51,11 +51,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot, }, /* .lhs_info = */ { - /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32, - /* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32, + /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon, + /* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32_neon, /* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon, /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32_neon, - /* .require_aligned_m_idx = */ true, }, /* .rhs_info = */ { /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon, @@ -100,7 +99,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32, /* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32, /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32, - /* 
.require_aligned_m_idx = */ false, }, /* .rhs_info = */ { /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, @@ -144,7 +142,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32, /* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32, /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32, - /* .require_aligned_m_idx = */ false, }, /* .rhs_info = */ { /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, @@ -189,7 +186,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32, /* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32, /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32, - /* .require_aligned_m_idx = */ false, }, /* .rhs_info = */ { /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, @@ -233,7 +229,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32, /* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32, /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32, - /* .require_aligned_m_idx = */ false, }, /* .rhs_info = */ { /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, diff --git a/ggml/src/ggml-cpu/kleidiai/kernels.h b/ggml/src/ggml-cpu/kleidiai/kernels.h index a0b0d149..2ffe97eb 100644 --- a/ggml/src/ggml-cpu/kleidiai/kernels.h +++ b/ggml/src/ggml-cpu/kleidiai/kernels.h @@ -40,7 +40,6 @@ struct lhs_packing_info { size_t (*packed_size)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr); void (*pack_func)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs, size_t lhs_stride, void* lhs_packed); - bool require_aligned_m_idx; }; struct 
rhs_packing_info { diff --git a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp index 4dff5c67..4e89ca0f 100644 --- a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +++ b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp @@ -124,8 +124,7 @@ class tensor_traits : public ggml::cpu::tensor_traits { size_t sr = kernel->get_sr(); // Calculate number of columns to be processed per thread - const bool use_multithread = lhs_info->require_aligned_m_idx && m <= mr ? false : true; - const size_t num_m_per_thread = use_multithread ? kai_roundup(m, nth) / nth : m; + const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth; const size_t m_start = ith * num_m_per_thread; size_t m_to_process = num_m_per_thread; if ((m_start + m_to_process) > m) { @@ -135,11 +134,11 @@ class tensor_traits : public ggml::cpu::tensor_traits { if(m_start < m) { // Transform LHS const size_t src_stride = src1->nb[1]; - const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(0, dst->src[1]->nb[1])); + const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, dst->src[1]->nb[1])); const size_t lhs_packed_offset = lhs_info->get_packed_offset(m_start, k, QK4_0, mr, kr, sr); void * lhs_packed_ptr = static_cast<void *>(lhs_packed + lhs_packed_offset); - lhs_info->pack_func(m_to_process, k, QK4_0, mr, kr, sr, m_start, src_ptr, src_stride, lhs_packed_ptr); + lhs_info->pack_func(m_to_process, k, QK4_0, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr); } ggml_barrier(params->threadpool);