mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2024-12-20 13:13:07 +00:00
ggml : optimize llamafile cpu matrix multiplication for ppc64le (llama/10156)
This change upstreams llamafile's cpu matrix multiplication kernels for ppc64le using MMA builtins for FP32 datatype. This change results in a consistent 90% improvement in input processing time, and 20% to 80% improvement in output processing time, across various batch sizes. The patch is tested with Meta-Llama-3-8B, Mistral-7B, Llama-2-7B-chat-hf models on an IBM POWER10 machine. Signed-off-by: Amrita H S <amritahs@linux.vnet.ibm.com>
This commit is contained in:
parent
9f67aab211
commit
b7b38f7d68
@ -1265,7 +1265,12 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
|
|||||||
endif()
|
endif()
|
||||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
|
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
|
||||||
message(STATUS "PowerPC detected")
|
message(STATUS "PowerPC detected")
|
||||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
|
execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1"
|
||||||
|
OUTPUT_VARIABLE POWER10_M)
|
||||||
|
string(FIND ${POWER10_M} "POWER10" substring_index)
|
||||||
|
if(${substring_index} GREATER_EQUAL 0)
|
||||||
|
list(APPEND ARCH_FLAGS -mcpu=power10)
|
||||||
|
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
|
||||||
list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
|
list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
|
||||||
else()
|
else()
|
||||||
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
|
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
|
||||||
|
Loading…
Reference in New Issue
Block a user