mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2024-12-19 04:37:51 +00:00
vulkan: Optimize contiguous copies (llama/10254)
* tests: Fix memory bandwidth calculation for perf tests
  Add a flops calculation for flash attention.
  Add one GGML_OP_CPY perf test.
* vulkan: Optimize contiguous copies
  Add a variant of the copy shader for when the tensors are contiguous. Avoid the complex addressing calculations, and do four elements per invocation to hide some other overhead.
  Apply similar changes to the scale shader, since scale is always contiguous.
  Add a "progress bar" for shader compiles.
This commit is contained in:
parent
b54ce5edc5
commit
21b01a21b6
@ -196,6 +196,7 @@ struct vk_device_struct {
|
|||||||
vk_pipeline pipeline_pad_f32;
|
vk_pipeline pipeline_pad_f32;
|
||||||
vk_pipeline pipeline_repeat_f32;
|
vk_pipeline pipeline_repeat_f32;
|
||||||
vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
|
vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
|
||||||
|
vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16;
|
||||||
vk_pipeline pipeline_norm_f32;
|
vk_pipeline pipeline_norm_f32;
|
||||||
vk_pipeline pipeline_group_norm_f32;
|
vk_pipeline pipeline_group_norm_f32;
|
||||||
vk_pipeline pipeline_rms_norm_f32;
|
vk_pipeline pipeline_rms_norm_f32;
|
||||||
@ -722,6 +723,12 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
|||||||
std::lock_guard<std::mutex> guard(compile_count_mutex);
|
std::lock_guard<std::mutex> guard(compile_count_mutex);
|
||||||
assert(compile_count > 0);
|
assert(compile_count > 0);
|
||||||
compile_count--;
|
compile_count--;
|
||||||
|
|
||||||
|
// "Progress bar" for shader compiles
|
||||||
|
static uint32_t total_compile_count = 0;
|
||||||
|
if ((total_compile_count++ % 10) == 0) {
|
||||||
|
std::cerr << ".";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
compile_count_cond.notify_all();
|
compile_count_cond.notify_all();
|
||||||
}
|
}
|
||||||
@ -1200,6 +1207,8 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events
|
|||||||
static void ggml_vk_load_shaders(vk_device& device) {
|
static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
|
VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
|
||||||
|
|
||||||
|
std::cerr << "ggml_vulkan: Compiling shaders";
|
||||||
|
|
||||||
// mulmat
|
// mulmat
|
||||||
std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
|
std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
|
||||||
std::initializer_list<uint32_t> warptile_m = { 128, 64, 64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
|
std::initializer_list<uint32_t> warptile_m = { 128, 64, 64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
|
||||||
@ -1759,6 +1768,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||||
|
|
||||||
|
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f32, "contig_cpy_f32_f32", contig_cpy_f32_f32_len, contig_cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||||
|
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||||
|
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||||
|
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||||
|
|
||||||
@ -1817,6 +1830,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||||||
for (auto &c : compiles) {
|
for (auto &c : compiles) {
|
||||||
c.wait();
|
c.wait();
|
||||||
}
|
}
|
||||||
|
std::cerr << "Done!" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
static vk_device ggml_vk_get_device(size_t idx) {
|
static vk_device ggml_vk_get_device(size_t idx) {
|
||||||
@ -3061,18 +3075,34 @@ static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
|
|||||||
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
||||||
}
|
}
|
||||||
|
|
||||||
static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_type from, ggml_type to) {
|
static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to) {
|
||||||
if (from == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
|
|
||||||
|
// Choose "contiguous copy" shader if src/dst are contiguous
|
||||||
|
bool contig = ggml_is_contiguous(src) && (!dst || ggml_is_contiguous(dst));
|
||||||
|
|
||||||
|
if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
|
||||||
|
if (contig) {
|
||||||
|
return ctx->device->pipeline_contig_cpy_f32_f32;
|
||||||
|
} else {
|
||||||
return ctx->device->pipeline_cpy_f32_f32;
|
return ctx->device->pipeline_cpy_f32_f32;
|
||||||
}
|
}
|
||||||
if (from == GGML_TYPE_F32 && to == GGML_TYPE_F16) {
|
}
|
||||||
|
if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F16) {
|
||||||
|
if (contig) {
|
||||||
|
return ctx->device->pipeline_contig_cpy_f32_f16;
|
||||||
|
} else {
|
||||||
return ctx->device->pipeline_cpy_f32_f16;
|
return ctx->device->pipeline_cpy_f32_f16;
|
||||||
}
|
}
|
||||||
if (from == GGML_TYPE_F16 && to == GGML_TYPE_F16) {
|
}
|
||||||
|
if (src->type == GGML_TYPE_F16 && to == GGML_TYPE_F16) {
|
||||||
|
if (contig) {
|
||||||
|
return ctx->device->pipeline_contig_cpy_f16_f16;
|
||||||
|
} else {
|
||||||
return ctx->device->pipeline_cpy_f16_f16;
|
return ctx->device->pipeline_cpy_f16_f16;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl;
|
std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl;
|
||||||
GGML_ABORT("fatal error");
|
GGML_ABORT("fatal error");
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -3082,6 +3112,15 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
|
|||||||
const int tensor_type_size = ggml_type_size(tensor->type);
|
const int tensor_type_size = ggml_type_size(tensor->type);
|
||||||
|
|
||||||
const uint32_t ne = ggml_nelements(tensor);
|
const uint32_t ne = ggml_nelements(tensor);
|
||||||
|
std::array<uint32_t, 3> elements;
|
||||||
|
|
||||||
|
if (ne > 262144) {
|
||||||
|
elements = { 512, 512, CEIL_DIV(ne, 262144) };
|
||||||
|
} else if (ne > 512) {
|
||||||
|
elements = { 512, CEIL_DIV(ne, 512), 1 };
|
||||||
|
} else {
|
||||||
|
elements = { ne, 1, 1 };
|
||||||
|
}
|
||||||
|
|
||||||
const vk_op_unary_push_constants pc = {
|
const vk_op_unary_push_constants pc = {
|
||||||
(uint32_t)ne,
|
(uint32_t)ne,
|
||||||
@ -3091,7 +3130,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
|
|||||||
0.0f, 0.0f,
|
0.0f, 0.0f,
|
||||||
};
|
};
|
||||||
ggml_vk_sync_buffers(subctx);
|
ggml_vk_sync_buffers(subctx);
|
||||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, { ne, 1, 1 });
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, elements);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
||||||
@ -3176,12 +3215,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
|
|||||||
vk_pipeline to_fp16_vk_1 = nullptr;
|
vk_pipeline to_fp16_vk_1 = nullptr;
|
||||||
|
|
||||||
if (x_non_contig) {
|
if (x_non_contig) {
|
||||||
to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
|
to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16);
|
||||||
} else {
|
} else {
|
||||||
to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
|
to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
|
||||||
}
|
}
|
||||||
if (y_non_contig) {
|
if (y_non_contig) {
|
||||||
to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
|
to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, GGML_TYPE_F16);
|
||||||
} else {
|
} else {
|
||||||
to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
|
to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
|
||||||
}
|
}
|
||||||
@ -3361,10 +3400,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|||||||
vk_pipeline to_fp16_vk_0 = nullptr;
|
vk_pipeline to_fp16_vk_0 = nullptr;
|
||||||
vk_pipeline to_fp16_vk_1 = nullptr;
|
vk_pipeline to_fp16_vk_1 = nullptr;
|
||||||
if (x_non_contig) {
|
if (x_non_contig) {
|
||||||
to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
|
to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type);
|
||||||
}
|
}
|
||||||
if (y_non_contig) {
|
if (y_non_contig) {
|
||||||
to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
|
to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type);
|
||||||
} else {
|
} else {
|
||||||
to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
|
to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
|
||||||
}
|
}
|
||||||
@ -3745,12 +3784,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|||||||
vk_pipeline to_fp16_vk_1 = nullptr;
|
vk_pipeline to_fp16_vk_1 = nullptr;
|
||||||
|
|
||||||
if (x_non_contig) {
|
if (x_non_contig) {
|
||||||
to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
|
to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16);
|
||||||
} else {
|
} else {
|
||||||
to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
|
to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
|
||||||
}
|
}
|
||||||
if (y_non_contig) {
|
if (y_non_contig) {
|
||||||
to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
|
to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, GGML_TYPE_F16);
|
||||||
} else {
|
} else {
|
||||||
to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
|
to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
|
||||||
}
|
}
|
||||||
@ -3938,10 +3977,10 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
|
|||||||
vk_pipeline to_fp16_vk_0 = nullptr;
|
vk_pipeline to_fp16_vk_0 = nullptr;
|
||||||
vk_pipeline to_fp16_vk_1 = nullptr;
|
vk_pipeline to_fp16_vk_1 = nullptr;
|
||||||
if (x_non_contig) {
|
if (x_non_contig) {
|
||||||
to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
|
to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type);
|
||||||
}
|
}
|
||||||
if (y_non_contig) {
|
if (y_non_contig) {
|
||||||
to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
|
to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type);
|
||||||
} else {
|
} else {
|
||||||
to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
|
to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
|
||||||
}
|
}
|
||||||
@ -4148,7 +4187,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
|||||||
case GGML_OP_CPY:
|
case GGML_OP_CPY:
|
||||||
case GGML_OP_CONT:
|
case GGML_OP_CONT:
|
||||||
case GGML_OP_DUP:
|
case GGML_OP_DUP:
|
||||||
return ggml_vk_get_cpy_pipeline(ctx, src0->type, dst->type);
|
return ggml_vk_get_cpy_pipeline(ctx, src0, dst, dst->type);
|
||||||
case GGML_OP_NORM:
|
case GGML_OP_NORM:
|
||||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||||
return ctx->device->pipeline_norm_f32;
|
return ctx->device->pipeline_norm_f32;
|
||||||
@ -4281,7 +4320,6 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
|
|||||||
case GGML_OP_DIV:
|
case GGML_OP_DIV:
|
||||||
case GGML_OP_CONCAT:
|
case GGML_OP_CONCAT:
|
||||||
case GGML_OP_UPSCALE:
|
case GGML_OP_UPSCALE:
|
||||||
case GGML_OP_SCALE:
|
|
||||||
case GGML_OP_SQR:
|
case GGML_OP_SQR:
|
||||||
case GGML_OP_SIN:
|
case GGML_OP_SIN:
|
||||||
case GGML_OP_COS:
|
case GGML_OP_COS:
|
||||||
|
@ -3,6 +3,8 @@
|
|||||||
#include "types.comp"
|
#include "types.comp"
|
||||||
#include "generic_unary_head.comp"
|
#include "generic_unary_head.comp"
|
||||||
|
|
||||||
|
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
const uint idx = get_idx();
|
const uint idx = get_idx();
|
||||||
|
|
||||||
|
42
ggml/src/vulkan-shaders/contig_copy.comp
Normal file
42
ggml/src/vulkan-shaders/contig_copy.comp
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
#version 450
|
||||||
|
|
||||||
|
#include "types.comp"
|
||||||
|
#include "generic_unary_head.comp"
|
||||||
|
|
||||||
|
#extension GL_EXT_control_flow_attributes : require
|
||||||
|
|
||||||
|
const uint num_threads = 128;
|
||||||
|
|
||||||
|
layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
|
void main() {
|
||||||
|
uint idx = get_idx();
|
||||||
|
|
||||||
|
// num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
|
||||||
|
const uint num_iter = 4;
|
||||||
|
|
||||||
|
// fast path for when all four iterations are in-bounds
|
||||||
|
if (idx + (num_iter-1)*num_threads < p.ne) {
|
||||||
|
[[unroll]] for (uint i = 0; i < num_iter; ++i) {
|
||||||
|
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
||||||
|
data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
|
||||||
|
#else
|
||||||
|
data_d[p.d_offset + idx] = data_a[idx];
|
||||||
|
#endif
|
||||||
|
idx += num_threads;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
[[unroll]] for (uint i = 0; i < num_iter; ++i) {
|
||||||
|
if (idx >= p.ne) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
||||||
|
data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
|
||||||
|
#else
|
||||||
|
data_d[p.d_offset + idx] = data_a[idx];
|
||||||
|
#endif
|
||||||
|
idx += num_threads;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -3,6 +3,8 @@
|
|||||||
#include "types.comp"
|
#include "types.comp"
|
||||||
#include "generic_unary_head.comp"
|
#include "generic_unary_head.comp"
|
||||||
|
|
||||||
|
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
const uint idx = get_idx();
|
const uint idx = get_idx();
|
||||||
|
|
||||||
|
@ -3,6 +3,8 @@
|
|||||||
#include "types.comp"
|
#include "types.comp"
|
||||||
#include "generic_unary_head.comp"
|
#include "generic_unary_head.comp"
|
||||||
|
|
||||||
|
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
const uint idx = get_idx();
|
const uint idx = get_idx();
|
||||||
|
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
#extension GL_EXT_shader_16bit_storage : require
|
#extension GL_EXT_shader_16bit_storage : require
|
||||||
|
#extension GL_EXT_control_flow_attributes : require
|
||||||
|
|
||||||
layout (push_constant) uniform parameter
|
layout (push_constant) uniform parameter
|
||||||
{
|
{
|
||||||
@ -9,8 +10,6 @@ layout (push_constant) uniform parameter
|
|||||||
float param1; float param2;
|
float param1; float param2;
|
||||||
} p;
|
} p;
|
||||||
|
|
||||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
|
||||||
|
|
||||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||||
|
|
||||||
|
@ -3,6 +3,8 @@
|
|||||||
#include "types.comp"
|
#include "types.comp"
|
||||||
#include "generic_unary_head.comp"
|
#include "generic_unary_head.comp"
|
||||||
|
|
||||||
|
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||||
|
|
||||||
|
@ -3,6 +3,8 @@
|
|||||||
#include "types.comp"
|
#include "types.comp"
|
||||||
#include "generic_unary_head.comp"
|
#include "generic_unary_head.comp"
|
||||||
|
|
||||||
|
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
uint src0_idx_mod(uint idx) {
|
uint src0_idx_mod(uint idx) {
|
||||||
const uint i13 = idx / (p.ne12*p.ne11*p.ne10);
|
const uint i13 = idx / (p.ne12*p.ne11*p.ne10);
|
||||||
const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
|
const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
|
||||||
|
@ -3,12 +3,22 @@
|
|||||||
#include "types.comp"
|
#include "types.comp"
|
||||||
#include "generic_unary_head.comp"
|
#include "generic_unary_head.comp"
|
||||||
|
|
||||||
|
const uint num_threads = 128;
|
||||||
|
|
||||||
|
layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
const uint idx = get_idx();
|
uint idx = get_idx();
|
||||||
|
|
||||||
|
// num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
|
||||||
|
const uint num_iter = 4;
|
||||||
|
|
||||||
|
[[unroll]] for (uint i = 0; i < num_iter; ++i) {
|
||||||
if (idx >= p.ne) {
|
if (idx >= p.ne) {
|
||||||
return;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(p.param1));
|
data_d[p.d_offset + idx] = D_TYPE(FLOAT_TYPE(data_a[idx]) * FLOAT_TYPE(p.param1));
|
||||||
|
idx += num_threads;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -3,6 +3,8 @@
|
|||||||
#include "types.comp"
|
#include "types.comp"
|
||||||
#include "generic_unary_head.comp"
|
#include "generic_unary_head.comp"
|
||||||
|
|
||||||
|
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
const uint idx = get_idx();
|
const uint idx = get_idx();
|
||||||
|
|
||||||
|
@ -3,6 +3,8 @@
|
|||||||
#include "types.comp"
|
#include "types.comp"
|
||||||
#include "generic_unary_head.comp"
|
#include "generic_unary_head.comp"
|
||||||
|
|
||||||
|
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
const uint idx = get_idx();
|
const uint idx = get_idx();
|
||||||
|
|
||||||
|
@ -350,6 +350,9 @@ void process_shaders() {
|
|||||||
string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||||
string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
|
string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
|
||||||
string_to_spv("cpy_f16_f16", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
string_to_spv("cpy_f16_f16", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||||
|
string_to_spv("contig_cpy_f32_f32", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||||
|
string_to_spv("contig_cpy_f32_f16", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
|
||||||
|
string_to_spv("contig_cpy_f16_f16", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||||
|
|
||||||
string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||||
string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
|
string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
|
||||||
|
Loading…
Reference in New Issue
Block a user