From 0001ec075f361c704ed817d532092ee1b5599024 Mon Sep 17 00:00:00 2001 From: amritahs-ibm Date: Fri, 28 Mar 2025 13:13:22 +0530 Subject: [PATCH] llamafile : ppc64le GEMV forwarding for FP32. (llama/12594) This patch enables usage of MMA when one of the dimensions of the matrix(ie either M or N) is 1. This is useful in case of token generation where N < 2. The concept of 'GEMV Forwarding' is used where when one of the matrix has a single row/column, the elements are broadcasted, instead of using packing routine to prepack the matrix elements. This change results in 5% - 15% improvement in total speed(ie all tokens/total time), across various batch sizes. This is in comparision with the corresponding dot product implementation. The patch is tested with FP32 models of Meta-Lllama-3-8B, Mistral-7B, Llama-2-7B-chat-hf on a IBM POWER10 machine. Signed-off-by: Amrita H S --- ggml/src/ggml-cpu/llamafile/sgemm.cpp | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index 92dfbc2d..f6374f78 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -2680,13 +2680,25 @@ class tinyBLAS_PPC { __builtin_mma_xxsetaccz(&acc_0); vec_t vec_A[4] {0}, vec_B[4] = {0}; for (int l=0; l= 4 && RM == 1) { + /* 'GEMV Forwarding' concept is used in first two conditional loops. + * when one of the matrix has a single row/column, the elements are + * broadcasted, instead of using packing routine to prepack the + * matrix elements. + */ + if (RM == 1) { TA* a = const_cast(A+(ii)*lda+l); - packTranspose(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B); + packTranspose(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B); vec_A[0] = (vec_t)vec_xl(0,a); vec_A[1] = (vec_t)vec_splats(*((TA*)&vec_A+1)); vec_A[2] = (vec_t)vec_splats(*((TA*)&vec_A+2)); vec_A[3] = (vec_t)vec_splats(*((TA*)&vec_A+3)); + } else if (RN == 1) { + packTranspose(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A); + TB* b = const_cast(B+(jj)*ldb+l); + vec_B[0] = (vec_t)vec_xl(0,b); + vec_B[1] = (vec_t)vec_splats(*((TB*)&vec_B+1)); + vec_B[2] = (vec_t)vec_splats(*((TB*)&vec_B+2)); + vec_B[3] = (vec_t)vec_splats(*((TB*)&vec_B+3)); } else { packTranspose(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A); packTranspose(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B); @@ -2790,8 +2802,10 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64 assert(params->ith < params->nth); // only enable sgemm for prompt processing +#if !defined(__MMA__) if (n < 2) return false; +#endif if (Ctype != GGML_TYPE_F32) return false;