Mirror of https://github.com/ggerganov/whisper.cpp.git (synced 2025-04-28 14:59:43 +00:00)
metal : fix floating-point range of attention scores in FA kernels (llama/13090)
ggml-ci
Parent: cf3eb291ab
Commit: 01e1600edd
@@ -3192,7 +3192,7 @@ kernel void kernel_flash_attn_ext(
     {
         float S[Q] = { [0 ... Q-1] = 0.0f };
-        float M[Q] = { [0 ... Q-1] = -__FLT16_MAX__/2 };
+        float M[Q] = { [0 ... Q-1] = -__FLT_MAX__/2 };
 
         // thread indices inside the simdgroup
         // TODO: see if we can utilize quad-group functions for better performance
@@ -3452,7 +3452,7 @@ kernel void kernel_flash_attn_ext(
     // reduce the warps sequentially
     for (ushort sg = 1; sg < nsg; ++sg) {
         float S = { 0.0f };
-        float M = { -__FLT16_MAX__/2 };
+        float M = { -__FLT_MAX__/2 };
 
         threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -3699,7 +3699,7 @@ kernel void kernel_flash_attn_ext_vec(
     {
         float S = 0.0f;
-        float M = -__FLT16_MAX__/2;
+        float M = -__FLT_MAX__/2;
 
         // thread indices inside the simdgroup
         const short tx = tiisg%NL;
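The change is a one-liner repeated in three places: the running maximum M of the online softmax is an fp32 accumulator, so it is now seeded with -__FLT_MAX__/2 instead of the half-precision -__FLT16_MAX__/2 (about -32752), matching the full range of the scores it tracks. Below is a minimal sketch of the idea in plain C, not the Metal kernel code; the online_softmax helper and the sample scores are made up for illustration.

// Standalone sketch (plain C): online softmax over a stream of fp32 attention
// scores. The running maximum M must start at a sentinel lower than any score
// the fp32 accumulator can hold; seeding it with the half-precision maximum
// (about -32752) would put the sentinel above scores that are legitimately
// more negative than the fp16 range, so they would be handled incorrectly.
#include <float.h>
#include <math.h>
#include <stdio.h>

static void online_softmax(const float *s, int n, float *out) {
    float M = -FLT_MAX/2; // running maximum (sentinel covers the fp32 range)
    float S = 0.0f;       // running sum of exp(s[i] - M)

    for (int i = 0; i < n; ++i) {
        const float m_new = fmaxf(M, s[i]);
        // rescale the previous sum to the new maximum, then add this score
        S = S*expf(M - m_new) + expf(s[i] - m_new);
        M = m_new;
    }

    for (int i = 0; i < n; ++i) {
        out[i] = expf(s[i] - M)/S;
    }
}

int main(void) {
    // the first score lies below -__FLT16_MAX__/2; with an fp32-range sentinel
    // it is still tracked correctly by the running maximum
    const float s[4] = { -40000.0f, 1.0f, 2.0f, 3.0f };
    float p[4];
    online_softmax(s, 4, p);
    for (int i = 0; i < 4; ++i) {
        printf("p[%d] = %f\n", i, p[i]);
    }
    return 0;
}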