CUDA: fix race conditions in FlashAttention kernels (llama/13438)

2025-06-19 23:33:43 +00:00 · 2025-05-10 22:22:48 +02:00
parent 16f3546f38
commit 6db0e01db6
2 changed files with 3 additions and 0 deletions
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -874,6 +874,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
            }
        }

+        __syncthreads();
+
        // Write back combined meta data:
 #pragma unroll
        for (int imeta = 0; imeta < nmeta; ++imeta) {
--- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
@@ -168,6 +168,7 @@ static __global__ void flash_attn_vec_ext_f16(
    for (int j = 0; j < ncols; ++j) {
        KQ[j*D + tid] = -HALF_MAX_HALF;
    }
+    __syncthreads();

    half2 VKQ[ncols] = {{0.0f, 0.0f}};