From be4ea0826b58b5f450993d7509b4e519b101a0bd Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Thu, 19 Jun 2025 17:48:54 +0800 Subject: [PATCH] llamafile : support s390x SIMD instruction set (llama/14273) --- ggml/src/ggml-cpu/llamafile/sgemm.cpp | 55 ++++++++++++++++++++++++++- ggml/src/ggml-cpu/llamafile/sgemm.h | 5 +++ 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index 1c545f80..7ed3874a 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -62,7 +62,7 @@ #define NOINLINE __attribute__((__noinline__)) #endif -#if defined(__ARM_NEON) || defined(__AVX512F__) +#if defined(__ARM_NEON) || defined(__AVX512F__) || defined(__VXE__) || defined(__VXE2__) #define VECTOR_REGISTERS 32 #else #define VECTOR_REGISTERS 16 @@ -109,6 +109,12 @@ inline float16x8_t sub(float16x8_t x, float16x8_t y) { return vsubq_f16(x, y); } inline float16x8_t mul(float16x8_t x, float16x8_t y) { return vmulq_f16(x, y); } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if defined(__VXE__) || defined(__VXE2__) +inline float32x4_t add(float32x4_t x, float32x4_t y) { return vec_add(x, y); } +inline float32x4_t sub(float32x4_t x, float32x4_t y) { return vec_sub(x, y); } +inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vec_mul(x, y); } +#endif + #if defined(__MMA__) typedef vector unsigned char vec_t; typedef __vector_quad acc_t; @@ -162,6 +168,13 @@ inline float16x8_t madd(float16x8_t a, float16x8_t b, float16x8_t c) { #endif #endif +#if defined(__VXE__) || defined(__VXE2__) +template <> +inline float32x4_t madd(float32x4_t a, float32x4_t b, float32x4_t c) { + return vec_madd(a, b, c); +} +#endif + //////////////////////////////////////////////////////////////////////////////////////////////////// // VECTORIZED HORIZONTAL SUM @@ -178,6 +191,13 @@ inline float hsum(float16x8_t x) { } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if defined(__VXE__) || defined(__VXE2__) +inline float hsum(float32x4_t x) { + float32x4_t tmp = x + vec_reve(x); + return tmp[0] + tmp[1]; +} +#endif + #if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) inline float hsum(__m128 x) { #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) @@ -227,6 +247,21 @@ template <> inline float32x4_t load(const ggml_fp16_t *p) { #endif // _MSC_VER #endif // __ARM_NEON +#if defined(__VXE__) || defined(__VXE2__) +template <> inline float32x4_t load(const ggml_fp16_t * p) { + float tmp[4]; + + for (int i = 0; i < 4; i++) { + tmp[i] = GGML_FP16_TO_FP32(p[i]); + } + + return vec_xl(0, (const float *)(tmp)); +} +template <> inline float32x4_t load(const float * p) { + return vec_xl(0, p); +} +#endif + #if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) template <> inline __m128 load(const float *p) { return _mm_loadu_ps(p); @@ -3319,6 +3354,14 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64 (const float *)B, ldb, (float *)C, ldc}; return tb.matmul(m, n); +#elif defined(__VXE__) || defined(__VXE2__) + if (n < 4) + return false; + tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ params, + k, (const float *)A, lda, + (const float *)B, ldb, + (float *)C, ldc}; + return tb.matmul(m, n); #elif defined(__MMA__) if (k % 8) return false; @@ -3410,6 +3453,16 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64 (float *)C, ldc}; return tb.matmul(m, n); } +#elif defined(__VXE__) || defined(__VXE2__) + if (n < 4) + return false; + if (Btype == GGML_TYPE_F16) { + tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params, + k, (const ggml_fp16_t *)A, lda, + (const ggml_fp16_t *)B, ldb, + (float *)C, ldc}; + return tb.matmul(m, n); + } #endif return false; } diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.h b/ggml/src/ggml-cpu/llamafile/sgemm.h index 3d290951..729e8853 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.h +++ b/ggml/src/ggml-cpu/llamafile/sgemm.h @@ -1,6 +1,11 @@ #pragma once #include #include + +#if defined(__VXE__) || defined(__VXE2__) +#include +#endif + #ifdef __cplusplus extern "C" { #endif