ggml : optimize llamafile cpu matrix multiplication for ppc64le (llama/10156)

This change upstreams llamafile's cpu matrix multiplication kernels for ppc64le using MMA builtins for FP32 datatype. This change results in a consistent 90% improvement in input processing time, and 20% to 80% improvement in output processing time, across various batch sizes. The patch is tested with Meta-Llama-3-8B, Mistral-7B, Llama-2-7B-chat-hf models on an IBM POWER10 machine. Signed-off-by: Amrita H S <amritahs@linux.vnet.ibm.com>
2025-05-08 11:38:26 +00:00 · 2024-11-09 12:47:50 +05:30 · 2024-11-09 12:47:50 +05:30 · b7b38f7d68
commit b7b38f7d68
parent 9f67aab211
1 changed files with 7 additions and 2 deletions
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@ -1265,7 +1265,12 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
    endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
    message(STATUS "PowerPC detected")
-    if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
+    execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1"
                   OUTPUT_VARIABLE POWER10_M)  # empty when the grep finds no POWER10 line
    string(FIND "${POWER10_M}" "POWER10" substring_index)  # quoted: an empty POWER10_M must yield -1, not drop the argument and abort configure
    if(${substring_index} GREATER_EQUAL 0)
       list(APPEND ARCH_FLAGS -mcpu=power10)
    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
       list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
    else()
        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)