diff --git a/ggml-cuda/mmvq.cu b/ggml-cuda/mmvq.cu
index 65cc1bca..8516252c 100644
--- a/ggml-cuda/mmvq.cu
+++ b/ggml-cuda/mmvq.cu
@@ -75,7 +75,7 @@ static __global__ void mul_mat_vec_q(
             tmp[j][i] = warp_reduce_sum(tmp[j][i]);
         }
 
-        if (threadIdx.x < rows_per_cuda_block) {
+        if (threadIdx.x < rows_per_cuda_block && row0 + threadIdx.x < nrows_dst) {
             dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
         }
     }