mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-06-22 08:30:07 +00:00
llamafile : support s390x SIMD instruction set (llama/14273)
This commit is contained in:
committed by
Georgi Gerganov
parent
1aca7b5c8a
commit
be4ea0826b
@ -62,7 +62,7 @@
|
|||||||
#define NOINLINE __attribute__((__noinline__))
|
#define NOINLINE __attribute__((__noinline__))
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__ARM_NEON) || defined(__AVX512F__)
|
#if defined(__ARM_NEON) || defined(__AVX512F__) || defined(__VXE__) || defined(__VXE2__)
|
||||||
#define VECTOR_REGISTERS 32
|
#define VECTOR_REGISTERS 32
|
||||||
#else
|
#else
|
||||||
#define VECTOR_REGISTERS 16
|
#define VECTOR_REGISTERS 16
|
||||||
@ -109,6 +109,12 @@ inline float16x8_t sub(float16x8_t x, float16x8_t y) { return vsubq_f16(x, y); }
|
|||||||
inline float16x8_t mul(float16x8_t x, float16x8_t y) { return vmulq_f16(x, y); }
|
inline float16x8_t mul(float16x8_t x, float16x8_t y) { return vmulq_f16(x, y); }
|
||||||
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
|
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
|
||||||
|
|
||||||
|
#if defined(__VXE__) || defined(__VXE2__)
|
||||||
|
// s390x VXE overload: returns vec_add(x, y) for two float32x4_t operands.
inline float32x4_t add(float32x4_t x, float32x4_t y) {
    return vec_add(x, y);
}
|
||||||
|
// s390x VXE overload: returns vec_sub(x, y) for two float32x4_t operands.
inline float32x4_t sub(float32x4_t x, float32x4_t y) {
    return vec_sub(x, y);
}
|
||||||
|
// s390x VXE overload: returns vec_mul(x, y) for two float32x4_t operands.
inline float32x4_t mul(float32x4_t x, float32x4_t y) {
    return vec_mul(x, y);
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(__MMA__)
|
#if defined(__MMA__)
|
||||||
typedef vector unsigned char vec_t;
|
typedef vector unsigned char vec_t;
|
||||||
typedef __vector_quad acc_t;
|
typedef __vector_quad acc_t;
|
||||||
@ -162,6 +168,13 @@ inline float16x8_t madd(float16x8_t a, float16x8_t b, float16x8_t c) {
|
|||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__VXE__) || defined(__VXE2__)
|
||||||
|
template <>
|
||||||
|
inline float32x4_t madd(float32x4_t a, float32x4_t b, float32x4_t c) {
|
||||||
|
return vec_madd(a, b, c);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// VECTORIZED HORIZONTAL SUM
|
// VECTORIZED HORIZONTAL SUM
|
||||||
|
|
||||||
@ -178,6 +191,13 @@ inline float hsum(float16x8_t x) {
|
|||||||
}
|
}
|
||||||
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
|
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
|
||||||
|
|
||||||
|
#if defined(__VXE__) || defined(__VXE2__)
|
||||||
|
// Horizontal sum of a float32x4_t on s390x VXE.
// Adds x to its element-reversed copy and returns the sum of the first two
// lanes of that result. Assuming float32x4_t holds four floats, lane 0 is
// x[0] + x[3] and lane 1 is x[1] + x[2], so the two lanes cover every input.
inline float hsum(float32x4_t x) {
    const float32x4_t mirrored = vec_reve(x);
    const float32x4_t folded   = x + mirrored;
    return folded[0] + folded[1];
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
|
#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
|
||||||
inline float hsum(__m128 x) {
|
inline float hsum(__m128 x) {
|
||||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
|
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
|
||||||
@ -227,6 +247,21 @@ template <> inline float32x4_t load(const ggml_fp16_t *p) {
|
|||||||
#endif // _MSC_VER
|
#endif // _MSC_VER
|
||||||
#endif // __ARM_NEON
|
#endif // __ARM_NEON
|
||||||
|
|
||||||
|
#if defined(__VXE__) || defined(__VXE2__)
|
||||||
|
// s390x VXE specialization: load four ggml_fp16_t values as a float32x4_t.
// Each of p[0..3] is converted with GGML_FP16_TO_FP32 into a stack buffer,
// which is then read back as one vector with vec_xl at byte offset 0.
template <> inline float32x4_t load(const ggml_fp16_t * p) {
    float widened[4] = {
        GGML_FP16_TO_FP32(p[0]),
        GGML_FP16_TO_FP32(p[1]),
        GGML_FP16_TO_FP32(p[2]),
        GGML_FP16_TO_FP32(p[3]),
    };

    const float * src = widened;
    return vec_xl(0, src);
}
|
||||||
|
// s390x VXE specialization: read a float32x4_t starting at p using vec_xl
// with a byte offset of 0.
template <> inline float32x4_t load(const float * p) {
    const float32x4_t loaded = vec_xl(0, p);
    return loaded;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
|
#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
|
||||||
template <> inline __m128 load(const float *p) {
|
template <> inline __m128 load(const float *p) {
|
||||||
return _mm_loadu_ps(p);
|
return _mm_loadu_ps(p);
|
||||||
@ -3319,6 +3354,14 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
|
|||||||
(const float *)B, ldb,
|
(const float *)B, ldb,
|
||||||
(float *)C, ldc};
|
(float *)C, ldc};
|
||||||
return tb.matmul(m, n);
|
return tb.matmul(m, n);
|
||||||
|
#elif defined(__VXE__) || defined(__VXE2__)
|
||||||
|
if (n < 4)
|
||||||
|
return false;
|
||||||
|
tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ params,
|
||||||
|
k, (const float *)A, lda,
|
||||||
|
(const float *)B, ldb,
|
||||||
|
(float *)C, ldc};
|
||||||
|
return tb.matmul(m, n);
|
||||||
#elif defined(__MMA__)
|
#elif defined(__MMA__)
|
||||||
if (k % 8)
|
if (k % 8)
|
||||||
return false;
|
return false;
|
||||||
@ -3410,6 +3453,16 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
|
|||||||
(float *)C, ldc};
|
(float *)C, ldc};
|
||||||
return tb.matmul(m, n);
|
return tb.matmul(m, n);
|
||||||
}
|
}
|
||||||
|
#elif defined(__VXE__) || defined(__VXE2__)
|
||||||
|
if (n < 4)
|
||||||
|
return false;
|
||||||
|
if (Btype == GGML_TYPE_F16) {
|
||||||
|
tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
|
||||||
|
k, (const ggml_fp16_t *)A, lda,
|
||||||
|
(const ggml_fp16_t *)B, ldb,
|
||||||
|
(float *)C, ldc};
|
||||||
|
return tb.matmul(m, n);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,11 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
|
|
||||||
|
#if defined(__VXE__) || defined(__VXE2__)
|
||||||
|
#include <vecintrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
Reference in New Issue
Block a user