mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2024-12-24 06:46:37 +00:00
vulkan: get the first command buffer submitted sooner (llama/10499)
This is an incremental improvement over #9118 to get work to the GPU a bit sooner. The first part is to start with a smaller number of nodes before the first submit, and ramp it up to the current 100 nodes/submit. The second part is to reduce the dryrun overhead for all the nodes that just need to request descriptor space. With these changes I get around 1-2% speedup on RTX 4070 combined with my old Haswell-era CPU.
This commit is contained in:
parent
276b08d8f0
commit
a753a82462
@ -5672,6 +5672,48 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|||||||
} else {
|
} else {
|
||||||
compute_ctx = ctx->compute_ctx.lock();
|
compute_ctx = ctx->compute_ctx.lock();
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
switch (node->op) {
|
||||||
|
case GGML_OP_REPEAT:
|
||||||
|
case GGML_OP_ACC:
|
||||||
|
case GGML_OP_GET_ROWS:
|
||||||
|
case GGML_OP_ADD:
|
||||||
|
case GGML_OP_MUL:
|
||||||
|
case GGML_OP_DIV:
|
||||||
|
case GGML_OP_CONCAT:
|
||||||
|
case GGML_OP_UPSCALE:
|
||||||
|
case GGML_OP_SCALE:
|
||||||
|
case GGML_OP_SQR:
|
||||||
|
case GGML_OP_SIN:
|
||||||
|
case GGML_OP_COS:
|
||||||
|
case GGML_OP_CLAMP:
|
||||||
|
case GGML_OP_PAD:
|
||||||
|
case GGML_OP_CPY:
|
||||||
|
case GGML_OP_CONT:
|
||||||
|
case GGML_OP_DUP:
|
||||||
|
case GGML_OP_NORM:
|
||||||
|
case GGML_OP_GROUP_NORM:
|
||||||
|
case GGML_OP_RMS_NORM:
|
||||||
|
case GGML_OP_UNARY:
|
||||||
|
case GGML_OP_DIAG_MASK_INF:
|
||||||
|
case GGML_OP_SOFT_MAX:
|
||||||
|
case GGML_OP_ROPE:
|
||||||
|
case GGML_OP_ARGSORT:
|
||||||
|
case GGML_OP_SUM_ROWS:
|
||||||
|
case GGML_OP_IM2COL:
|
||||||
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||||
|
case GGML_OP_POOL_2D:
|
||||||
|
case GGML_OP_LEAKY_RELU:
|
||||||
|
{
|
||||||
|
// These operations all go through ggml_vk_op_f32, so short-circuit and
|
||||||
|
// do the only thing needed for the dryrun.
|
||||||
|
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, node, node->op);
|
||||||
|
ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
switch (node->op) {
|
switch (node->op) {
|
||||||
@ -6401,16 +6443,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||||||
bool first_node_in_batch = true; // true if next node will be first node in a batch
|
bool first_node_in_batch = true; // true if next node will be first node in a batch
|
||||||
int submit_node_idx = 0; // index to first node in a batch
|
int submit_node_idx = 0; // index to first node in a batch
|
||||||
|
|
||||||
// submit work every submit_count node to overlap CPU cmdbuffer generation with GPU execution
|
// Submit work every nodes_per_submit nodes to overlap CPU cmdbuffer generation with GPU execution.
|
||||||
constexpr int submit_count = 100;
|
// Start with a smaller count to get work submitted right away, and increase it after each submit.
|
||||||
|
int nodes_per_submit = 20;
|
||||||
int submitted_nodes = 0;
|
int submitted_nodes = 0;
|
||||||
|
int submit_count = 0;
|
||||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||||
if (first_node_in_batch) {
|
if (first_node_in_batch) {
|
||||||
submit_node_idx = i;
|
submit_node_idx = i;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool submit = (submitted_nodes >= submit_count) || (i == last_node);
|
bool submit = (submitted_nodes >= nodes_per_submit) || (i == last_node);
|
||||||
|
|
||||||
|
|
||||||
bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
|
bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
|
||||||
|
|
||||||
@ -6427,6 +6470,15 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||||||
if (submit) {
|
if (submit) {
|
||||||
first_node_in_batch = true;
|
first_node_in_batch = true;
|
||||||
submitted_nodes = 0;
|
submitted_nodes = 0;
|
||||||
|
switch (submit_count) {
|
||||||
|
case 0:
|
||||||
|
nodes_per_submit = 50;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
nodes_per_submit = 100;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
submit_count++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user