From 7564f5e6f17da0946cd4e14ef6f436f5a3e6b45b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sun, 4 May 2025 13:58:38 +0200 Subject: [PATCH] CUDA: fix race condition in MMQ ids_dst (llama/13294) --- ggml/src/ggml-cuda/mmq.cuh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 8c93e832..fc6ce008 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -2636,6 +2636,7 @@ static __global__ void mul_mat_q( ids_dst_shared[j] = j; } + __syncthreads(); // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead: #if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA @@ -2664,6 +2665,7 @@ static __global__ void mul_mat_q( return; } + // __syncthreads(); // There is no previous tile that could cause a race condition. #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; @@ -2674,6 +2676,7 @@ static __global__ void mul_mat_q( ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j]; } + __syncthreads(); } offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int)); @@ -2740,6 +2743,7 @@ static __global__ void mul_mat_q( continue; } + __syncthreads(); #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; @@ -2750,6 +2754,7 @@ static __global__ void mul_mat_q( ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j]; } + __syncthreads(); } offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int)); @@ -2805,6 +2810,7 @@ static __global__ void mul_mat_q( } // The memory layout for the fixup buffer is always contiguous, therefore reset ids: + __syncthreads(); #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; @@ -2815,6 +2821,7 @@ static __global__ void mul_mat_q( ids_dst_shared[j] = j; } + __syncthreads(); } offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));