ggml : simplify Arm fp16 CPU logic (ggml/1177)

* ggml : simlpify Arm fp16 CPU logic

ggml-ci

* cont : bring back CUDA/MUSA checks

ggml-ci
This commit is contained in:
Georgi Gerganov 2025-04-07 12:25:15 +03:00
parent 06ce8f83e6
commit 1e9c2f87f1
3 changed files with 23 additions and 42 deletions

View File

@ -4,13 +4,13 @@
#include "ggml.h" #include "ggml.h"
#include "ggml-impl.h" #include "ggml-impl.h"
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/ #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
//#include <stddef.h> //#include <stddef.h>
#include <stdbool.h> #include <stdbool.h>
#include <string.h> // memcpy #include <string.h> // memcpy
#include <math.h> // fabsf #include <math.h> // fabsf
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
@ -69,33 +69,16 @@ struct ggml_compute_params {
#endif #endif
#if defined(__ARM_FEATURE_SVE) #if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#include <sys/prctl.h> #include <sys/prctl.h>
#endif #endif
// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
#if defined(__ARM_NEON) #if defined(__ARM_NEON)
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example: // ref: https://github.com/ggml-org/llama.cpp/pull/5404
//
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
#include <arm_neon.h>
#ifdef _MSC_VER #ifdef _MSC_VER
typedef uint16_t ggml_fp16_internal_t;
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) } #define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
#else #else
typedef __fp16 ggml_fp16_internal_t;
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) } #define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
#endif // _MSC_VER #endif // _MSC_VER
#if !defined(__aarch64__) #if !defined(__aarch64__)

View File

@ -71,7 +71,7 @@
#define GGML_F16x8 float16x8_t #define GGML_F16x8 float16x8_t
#define GGML_F16x8_ZERO vdupq_n_f16(0.0f) #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
#define GGML_F16x8_SET1(x) vdupq_n_f16(x) #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
#define GGML_F16x8_LOAD(x) vld1q_f16((const ggml_fp16_internal_t *)(x)) #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
#define GGML_F16x8_STORE vst1q_f16 #define GGML_F16x8_STORE vst1q_f16
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
#define GGML_F16x8_ADD vaddq_f16 #define GGML_F16x8_ADD vaddq_f16
@ -99,7 +99,7 @@
#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
#define GGML_F16_VEC_SET1 GGML_F16x8_SET1 #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i]) #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
#define GGML_F16_VEC_FMA GGML_F16x8_FMA #define GGML_F16_VEC_FMA GGML_F16x8_FMA
#define GGML_F16_VEC_ADD GGML_F16x8_ADD #define GGML_F16_VEC_ADD GGML_F16x8_ADD
#define GGML_F16_VEC_MUL GGML_F16x8_MUL #define GGML_F16_VEC_MUL GGML_F16x8_MUL
@ -114,7 +114,7 @@
#define GGML_F32Cx4 float32x4_t #define GGML_F32Cx4 float32x4_t
#define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
#define GGML_F32Cx4_SET1(x) vdupq_n_f32(x) #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x))) #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
#define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
#define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
#define GGML_F32Cx4_ADD vaddq_f32 #define GGML_F32Cx4_ADD vaddq_f32
@ -125,7 +125,7 @@
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((ggml_fp16_internal_t *)(p), r[i]) #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL

View File

@ -16,14 +16,6 @@
#include <arm_sve.h> #include <arm_sve.h>
#endif // __ARM_FEATURE_SVE #endif // __ARM_FEATURE_SVE
#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
#include <arm_neon.h>
#endif
#if defined(__F16C__) #if defined(__F16C__)
#include <immintrin.h> #include <immintrin.h>
#endif #endif
@ -311,29 +303,35 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
// FP16 to FP32 conversion // FP16 to FP32 conversion
#if defined(__ARM_NEON) // 16-bit float
#if defined(_MSC_VER) || (defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) // on Arm, we use __fp16
typedef uint16_t ggml_fp16_internal_t; // on x86, we use uint16_t
#else //
typedef __fp16 ggml_fp16_internal_t; // for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
#endif // for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
#endif //
#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
#include <arm_neon.h>
#if defined(__ARM_NEON) && !defined(_MSC_VER) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
ggml_fp16_internal_t tmp; __fp16 tmp;
memcpy(&tmp, &h, sizeof(ggml_fp16_t)); memcpy(&tmp, &h, sizeof(ggml_fp16_t));
return (float)tmp; return (float)tmp;
} }
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
ggml_fp16_t res; ggml_fp16_t res;
ggml_fp16_internal_t tmp = f; __fp16 tmp = f;
memcpy(&res, &tmp, sizeof(ggml_fp16_t)); memcpy(&res, &tmp, sizeof(ggml_fp16_t));
return res; return res;
} }
@ -485,7 +483,7 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER) #endif // defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
// precomputed f32 table for f16 (256 KB) // precomputed f32 table for f16 (256 KB)
// defined in ggml.c, initialized in ggml_init() // defined in ggml.c, initialized in ggml_init()