CUDA: fix race condition in MMQ stream-k fixup (llama/13299)

This commit is contained in:
Johannes Gäßler 2025-05-04 14:16:39 +02:00 committed by Georgi Gerganov
parent 7564f5e6f1
commit 7fa8bb303f

View File

@@ -2958,6 +2958,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
for (int j = threadIdx.y*WARP_SIZE + threadIdx.x; j < mmq_x; j += nwarps*WARP_SIZE) { for (int j = threadIdx.y*WARP_SIZE + threadIdx.x; j < mmq_x; j += nwarps*WARP_SIZE) {
ids_dst_shared[j] = ids_dst[col_low + j]; ids_dst_shared[j] = ids_dst[col_low + j];
} }
__syncthreads();
const int offset_dst = it*mmq_y; const int offset_dst = it*mmq_y;
dst += offset_dst; dst += offset_dst;