From eeb259909e150bd4a5c5a8d88159b1a8b705b812 Mon Sep 17 00:00:00 2001
From: Neo Zhang Jianyu
Date: Fri, 25 Apr 2025 17:37:51 +0800
Subject: [PATCH] change the reorder tensor from init to execute OP (llama/13003)

---
 ggml/src/ggml-sycl/common.hpp    |   1 -
 ggml/src/ggml-sycl/ggml-sycl.cpp | 125 +++++++++++++++----------------
 2 files changed, 61 insertions(+), 65 deletions(-)

diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index 96becabc..0ab0fb0a 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -313,7 +313,6 @@ struct ggml_backend_sycl_context {
     int device;
     std::string name;
     optimize_feature opt_feature;
-    bool optimized_graph=false;
 
     queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
 
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 8081a77b..548f2d0a 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -192,7 +192,7 @@ static void ggml_check_sycl() try {
 
     if (!initialized) {
         g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
-        g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1);
+        g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
         g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
         GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
         GGML_LOG_INFO("Running with Environment Variables:\n");
@@ -2852,6 +2852,64 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     }
 }
 
+static void reorder_qw(char *data_device, const int ncols, const int nrows,
+                       size_t size, size_t offset, dpct::queue_ptr stream) {
+    auto tmp_buf = sycl::malloc_shared(size, *stream);
+    SYCL_CHECK(
+        CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
+                            .wait()));
+    GGML_ASSERT((size % sizeof(block_q4_0) == 0));
+    GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
+    int offset_blks = offset / sizeof(block_q4_0);
+    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;
+    auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
+
+    stream->parallel_for(
+        size / sizeof(block_q4_0),
+        [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+            const block_q4_0* x = (const block_q4_0*)tmp_buf;
+            const int ib = i;
+
+            for (int j = 0; j < QK4_0/2; j++) {
+                *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
+            }
+            *(d_ptr + ib) = x[ib].d;
+        });
+
+    sycl::free(tmp_buf, *stream);
+}
+
+static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
+    char * data_device = (char *) src0->data;
+    size_t ncols = src0->ne[0];
+    size_t nrows = src0->ne[1];
+    size_t size = ggml_nbytes(src0);
+
+    reorder_qw(data_device, ncols, nrows, size, 0, stream);
+}
+
+/*
+* This function could be called when the OP (mul_mat) function supports the reorder optimization.
+*/
+static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1,
+                            ggml_tensor * dst) {
+    if (!g_ggml_sycl_disable_optimize &&  // optimization is enabled, controlled by $GGML_SYCL_DISABLE_OPT
+        ctx->opt_feature.reorder &&       // allow this device due to good perf; skip devices with bad perf
+        dst->op == GGML_OP_MUL_MAT &&     // limited to some supported Q4_0 cases; more cases are TODO
+        src0->type == GGML_TYPE_Q4_0 &&
+        src1->ne[2]==1 && src1->ne[3]==1) {
+
+        ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra;
+        if (!extra) return; // can only happen in the CI/UT permute case
+
+        if (extra->optimized_feature.reorder) return; // skip tensors that have already been reordered
+
+        reorder_qw(src0, ctx->stream());
+        extra->optimized_feature.reorder = true; // flag used by decode/dequantize in later steps
+    }
+}
+
 static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
 
@@ -2914,6 +2972,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
         // KQ + KQV multi-batch
         ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
+        opt_for_reorder(&ctx, src0, src1, dst); // the OP function in this branch supports reorder
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
         // save_tensor_txt("1/dst_1.txt", (float*) dst->data, src0->ne[1], sizeof(float), ctx.stream());
     } else if (use_mul_mat_vec_q) {
@@ -2921,6 +2980,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
     } else if (use_mul_mat_q) {
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
     } else {
+        opt_for_reorder(&ctx, src0, src1, dst); // the OP function in this branch supports reorder
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
     }
 }
@@ -3545,71 +3605,8 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void reorder_qw(char *data_device, const int ncols, const int nrows,
-                       size_t size, size_t offset, dpct::queue_ptr stream) {
-    auto tmp_buf = sycl::malloc_shared(size, *stream);
-    SYCL_CHECK(
-        CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
-                            .wait()));
-    GGML_ASSERT((size % sizeof(block_q4_0) == 0));
-    GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
-    int offset_blks = offset / sizeof(block_q4_0);
-    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;;
-    auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
-
-    stream->parallel_for(
-        size / sizeof(block_q4_0),
-        [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-            const block_q4_0* x = (const block_q4_0*)tmp_buf;
-            const int ib = i;
-
-            for (int j = 0; j < QK4_0/2; j ++)
-            {
-                *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
-            }
-            *(d_ptr + ib) = x[ib].d;
-        });
-
-    sycl::free(tmp_buf, *stream);
-}
-
-static void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
-    char*data_device = (char*)src0->data;
-    size_t ncols = src0->ne[0];
-    size_t nrows = src0->ne[1];
-    size_t size = ggml_nbytes(src0);
-
-    reorder_qw(data_device, ncols, nrows, size, 0, stream);
-}
-
-static void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
-    ggml_tensor *src0 = dst->src[0];
-    ggml_tensor *src1 = dst->src[1];
-
-    if (dst->op == GGML_OP_MUL_MAT && src0->type == GGML_TYPE_Q4_0 &&
-        src1->ne[2]==1 && src1->ne[3]==1) {
-        reorder_qw(src0, stream);
-        ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra;
-        GGML_ASSERT(extra);
-        extra->optimized_feature.reorder = true; //used to decode/dequan in next steps.
- } -} - -static void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) { - dpct::queue_ptr stream = ctx->stream(); - if (ctx->optimized_graph) { - return; - } - ctx->optimized_graph = true; - - for (int i = 0; i < cgraph->n_nodes; i++) { - if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream); - } -} - static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) { ggml_sycl_set_main_device(sycl_ctx->device); - if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx); for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i];