mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-05-29 13:34:13 +00:00
CUDA: fix misaligned synchronization in FA (llama/13469)
This commit is contained in:
parent
250bcc041a
commit
866f685bbc
@@ -895,6 +895,11 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
|
||||
float2 * dstk_fixup_meta = dstk_fixup + (gridDim.x + blockIdx.x)*ncols;
|
||||
dstk_fixup_meta[(threadIdx.y/np)*cols_per_warp + threadIdx.x] = make_float2(KQ_cmn, KQ_crs);
|
||||
}
|
||||
} else if (np > 1) {
|
||||
// Warps with threadIdx.y % np == 0 execute a __syncthreads() in the if branch.
|
||||
// Therefore, all other warps also need to execute a __syncthreads().
|
||||
// Otherwise the points at which warps synchronize with each other would become misaligned.
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
|
Loading…
x
Reference in New Issue
Block a user