mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-05-02 08:43:02 +00:00
vulkan: enable coopmat2 FA gqa and split_k optimizations more often (llama/12931)
The grouped query attention optimization doesn't require a power-of-two ratio; the only thing relying on it was the modulo operation written as a bitwise &. split_k need not depend on gqa_ratio - enable it any time there's only one workgroup in the X dimension. The shader gets the split index from the x coord, and multiple workgroups in the X dimension (pre-split) indicate a larger FA operation that wouldn't need splitting.
This commit is contained in:
parent
be42a19eab
commit
7db8f278f0
@ -5531,7 +5531,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
|||||||
uint32_t workgroups_y = (uint32_t)neq2;
|
uint32_t workgroups_y = (uint32_t)neq2;
|
||||||
uint32_t workgroups_z = (uint32_t)neq3;
|
uint32_t workgroups_z = (uint32_t)neq3;
|
||||||
|
|
||||||
if (N == 1 && qk_ratio > 1 && is_pow2(qk_ratio) && gqa_ratio <= flash_attention_num_small_rows &&
|
if (N == 1 && qk_ratio > 1 && gqa_ratio <= flash_attention_num_small_rows &&
|
||||||
qk_ratio * nek2 == neq2 && nek2 == nev2 && neq3 == 1 && nek3 == 1 && nev3 == 1) {
|
qk_ratio * nek2 == neq2 && nek2 == nev2 && neq3 == 1 && nek3 == 1 && nev3 == 1) {
|
||||||
// grouped query attention - make the N dimension equal to gqa_ratio, reduce
|
// grouped query attention - make the N dimension equal to gqa_ratio, reduce
|
||||||
// workgroups proportionally in y dimension. The shader will detect gqa_ratio > 1
|
// workgroups proportionally in y dimension. The shader will detect gqa_ratio > 1
|
||||||
@ -5544,8 +5544,8 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
|||||||
uint32_t split_kv = KV;
|
uint32_t split_kv = KV;
|
||||||
uint32_t split_k = 1;
|
uint32_t split_k = 1;
|
||||||
|
|
||||||
if (gqa_ratio > 1 && ctx->device->shader_core_count > 0) {
|
// Try to use split_k when KV is large enough to be worth the overhead
|
||||||
GGML_ASSERT(workgroups_x == 1);
|
if (workgroups_x == 1 && ctx->device->shader_core_count > 0 && KV >= 512) {
|
||||||
// Try to run two workgroups per SM.
|
// Try to run two workgroups per SM.
|
||||||
split_k = ctx->device->shader_core_count * 2 / workgroups_y;
|
split_k = ctx->device->shader_core_count * 2 / workgroups_y;
|
||||||
if (split_k > 1) {
|
if (split_k > 1) {
|
||||||
|
@ -131,7 +131,7 @@ ACC_TYPE perElemOpStoreCol0(const in uint32_t r, const in uint32_t c, const in A
|
|||||||
// Load the slope matrix, indexed by Q's dimension 2.
|
// Load the slope matrix, indexed by Q's dimension 2.
|
||||||
ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t iq2)
|
ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t iq2)
|
||||||
{
|
{
|
||||||
const uint32_t h = iq2 + (r & (p.gqa_ratio - 1));
|
const uint32_t h = iq2 + (r % p.gqa_ratio);
|
||||||
|
|
||||||
const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1);
|
const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1);
|
||||||
const int exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1);
|
const int exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user