Mirror of https://github.com/ggerganov/whisper.cpp.git, synced 2025-05-25 19:44:13 +00:00.
CUDA: fix FlashAttention on Turing (llama/13415)
commit 16f3546f38 (parent a04b329ad1)
@@ -546,7 +546,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
         const int i0_stop = i0_start + 2*c::nbatch_V2 < DV ? i0_start + 2*c::nbatch_V2 : DV;
         const int i0_diff = i0_stop - i0_start;

-        if (nstages == 1) {
+        if (nstages <= 1) {
             constexpr bool use_cp_async = nstages == 1;
             flash_attn_ext_f16_load_tile<stride_tile_V, nwarps, c::nbatch_fa, use_cp_async>
                 (V_h2 + k_VKQ_0*stride_V + i0_start/2, tile_V, i0_diff/2, stride_V);
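For context (not part of the commit): below is a minimal, hypothetical sketch of what the guard change does, under the assumption that nstages is a compile-time constant where 1 means the V tile is loaded through a cp.async pipeline, 0 means no pipelining (the path a GPU without cp.async, such as Turing, would take), and larger values mean the tile was prefetched by an earlier stage. It only mirrors the branch logic; it is not the actual flash_attn_ext_f16_iter kernel.

// Hypothetical illustration of the guard change; only the branch selection is
// reproduced, not the real kernel or its tile-loading helpers.
#include <cstdio>

template <int nstages>
void maybe_load_V_tile() {
    // Old guard "nstages == 1": the V tile was only loaded on the cp.async
    // path, so with nstages == 0 (no pipelining) the load was skipped.
    if (nstages <= 1) {  // new guard: also covers nstages == 0
        // Asynchronous copy is only used when a pipeline stage exists.
        constexpr bool use_cp_async = nstages == 1;
        std::printf("load V tile now, use_cp_async = %d\n", (int) use_cp_async);
    } else {
        // Assumption: deeper pipelining prefetches the tile before this point.
        std::printf("V tile already prefetched by an earlier stage\n");
    }
}

int main() {
    maybe_load_V_tile<0>();  // synchronous load path (the case this commit fixes)
    maybe_load_V_tile<1>();  // load via cp.async
    maybe_load_V_tile<2>();  // tile assumed to be prefetched earlier
    return 0;
}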