Mirror of https://github.com/ggerganov/whisper.cpp.git, synced 2025-06-22 16:38:58 +00:00.
Optimize mul_mat for Q4_0 on Intel GPU (llama/12035)
* Optimize performance by reordering Q4_0 data for Intel GPUs
* Detect the hardware type, save the optimization feature, and print it
* Correct naming
* Optimize the graph once when computing the graph, record the optimization status in tensor->extra, make CI pass
* Add the environment variable GGML_SYCL_DISABLE_OPT for debugging
* Use syclex::architecture instead of the custom hardware define; update the guide for GGML_SYCL_DISABLE_OPT
* Add performance data
* Move the getrows functions to separate files
* Fix global variables

---------

Co-authored-by: arthw <14088817+arthw@users.noreply.github.com>
Committed by: Georgi Gerganov
Parent: ac301a7d9b
Commit: e3cb412a59
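The reorder optimization described in the commit message rewrites Q4_0 weight buffers from the usual array-of-structs block layout (scale followed by 16 quant bytes, per block) into a struct-of-arrays layout: all quant bytes first, then all scales. The snippet below is a CPU-side illustration of that layout change only; it is not code from this commit, and the names QK4_0_ILLUST, block_q4_0_illust and reorder_q4_0_blocks are made up for the example. At runtime the optimization added here can be disabled with GGML_SYCL_DISABLE_OPT=1.

// Illustrative sketch (not from the commit): rewrite n_blocks Q4_0-style blocks
// in place from [d, qs][d, qs]... into [all qs][all d].
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

constexpr int QK4_0_ILLUST = 32;               // elements per block, two 4-bit quants per byte

struct block_q4_0_illust {
    uint16_t d;                                // fp16 scale, kept as raw bits for the example
    uint8_t  qs[QK4_0_ILLUST / 2];             // 16 bytes of packed 4-bit quants
};

void reorder_q4_0_blocks(uint8_t * data, size_t n_blocks) {
    // keep a copy of the original blocks, then overwrite the buffer in the new layout
    std::vector<uint8_t> tmp(data, data + n_blocks * sizeof(block_q4_0_illust));
    const auto * blocks = reinterpret_cast<const block_q4_0_illust *>(tmp.data());

    uint8_t  * qs_out = data;                                                      // quant region at offset 0
    uint16_t * d_out  = reinterpret_cast<uint16_t *>(data + n_blocks * QK4_0_ILLUST / 2); // scales after all quants

    for (size_t ib = 0; ib < n_blocks; ++ib) {
        std::memcpy(qs_out + ib * QK4_0_ILLUST / 2, blocks[ib].qs, QK4_0_ILLUST / 2);
        d_out[ib] = blocks[ib].d;
    }
}

With this layout, a mul_mat kernel can read a work-group's quant bytes from one contiguous region and the scales from another, which is what the reorder_qw() device kernel added near the end of the diff below does in a single pass over the blocks.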
@@ -39,9 +39,12 @@
#include "ggml-sycl/backend.hpp"
#include "ggml-sycl/presets.hpp"
#include "ggml-sycl/gemm.hpp"
#include "ggml-sycl/sycl_hw.hpp"
#include "ggml-sycl/getrows.hpp"

static bool g_sycl_loaded = false;
int g_ggml_sycl_debug = 0;
int g_ggml_sycl_disable_optimize = 0;

static ggml_sycl_device_info ggml_sycl_init() {
    ggml_sycl_device_info info = {};
@@ -64,14 +67,18 @@ static ggml_sycl_device_info ggml_sycl_init() {
    for (int i = 0; i < info.device_count; ++i) {
        info.devices[i].vmm = 0;
        dpct::device_info prop;
        sycl::device device = dpct::dev_mgr::instance().get_device(i);

        SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
            prop, dpct::dev_mgr::instance().get_device(i))));
            prop, device)));

        info.default_tensor_split[i] = total_vram;
        total_vram += prop.get_global_mem_size();

        info.devices[i].cc =
            100 * prop.get_major_version() + 10 * prop.get_minor_version();
        info.devices[i].hw_info = get_device_hw_info(&device);
        info.devices[i].opt_feature = check_gpu_optimize_feature(info.devices[i].hw_info.arch);

        info.max_work_group_sizes[i] = prop.get_max_work_group_size();
    }
@@ -110,6 +117,27 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
              global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
}

void print_device_opt_feature(int device_count) {
    GGML_LOG_INFO("SYCL Optimization Feature:\n");
    GGML_LOG_INFO(
        "|ID| Device Type|Reorder|\n");
    GGML_LOG_INFO(
        "|--|-------------------|-------|\n");
    std::map<std::string, size_t> DeviceNums;
    for (int id = 0; id < device_count; ++id) {
        sycl::device device = dpct::dev_mgr::instance().get_device(id);
        std::string backend_type = get_device_backend_and_type(device);
        int type_id = DeviceNums[backend_type]++;
        std::stringstream device_type;
        device_type << "[" << backend_type << ":" << std::to_string(type_id)
                    << "]";
        std::string device_type_s = device_type.str();
        device_type_s = std::regex_replace(device_type_s, std::regex("ext_oneapi_"), "");
        GGML_LOG_INFO("|%2d|%19s|%7s|\n", id, device_type_s.c_str(),
                      ggml_sycl_info().devices[id].opt_feature.reorder ? "Y": "N");
    }

}
void ggml_backend_sycl_print_sycl_devices() {
    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
    int device_count = dpct::dev_mgr::instance().device_count();
@@ -138,6 +166,8 @@ void ggml_backend_sycl_print_sycl_devices() {
                    << "]";
        print_device_detail(id, device, device_type.str());
    }

    print_device_opt_feature(device_count);
}

static inline int get_sycl_env(const char *env_name, int default_val) {
@@ -159,17 +189,21 @@ static void ggml_check_sycl() try {

    if (!initialized) {
        g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
        g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
        GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
        GGML_LOG_INFO("GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
        GGML_LOG_INFO("Running with Environment Variables:\n");
        GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
        GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
        GGML_LOG_INFO("Build with Macros:\n");
#if defined(GGML_SYCL_FORCE_MMQ)
        GGML_LOG_INFO("GGML_SYCL_FORCE_MMQ: yes\n");
        GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: yes\n");
#else
        GGML_LOG_INFO("GGML_SYCL_FORCE_MMQ: no\n");
        GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: no\n");
#endif
#if defined(GGML_SYCL_F16)
        GGML_LOG_INFO("GGML_SYCL_F16: yes\n");
        GGML_LOG_INFO(" GGML_SYCL_F16: yes\n");
#else
        GGML_LOG_INFO("GGML_SYCL_F16: no\n");
        GGML_LOG_INFO(" GGML_SYCL_F16: no\n");
#endif

/* NOT REMOVE, keep it for next optimize for XMX.
@@ -241,19 +275,27 @@ struct ggml_backend_sycl_buffer_context {
    void * dev_ptr = nullptr;
    queue_ptr stream;
    std::string name;
    optimize_feature opt_feature;
    std::vector<ggml_tensor_extra_gpu *> tensor_extras;

    ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) :
    ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) :
        device(device), dev_ptr(dev_ptr), stream(stream) {
        check_allow_gpu_index(device);
        name = (GGML_SYCL_NAME + std::to_string(device));
        opt_feature = ggml_sycl_info().devices[device].opt_feature;
    }


    ~ggml_backend_sycl_buffer_context() {
        if (dev_ptr != nullptr) {
            ggml_sycl_set_device(device);
            SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream)));
        }

        //release extra used by tensors
        for (ggml_tensor_extra_gpu * extra : tensor_extras) {
            release_extra_gpu(extra);
        }

    }
};
@@ -291,6 +333,9 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
        return;
    }

    ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
    tensor->extra = extra;
    ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx.

    if (ggml_is_quantized(tensor->type)) {
        // initialize padding to 0 to avoid possible NaN values
@@ -316,7 +361,6 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
                                                size_t size) try {

    ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;

    ggml_sycl_set_device(ctx->device);
    auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
    SYCL_CHECK(
@@ -660,32 +704,7 @@ struct ggml_backend_sycl_split_buffer_type_context {
struct ggml_backend_sycl_split_buffer_context {
    ~ggml_backend_sycl_split_buffer_context() try {
        for (ggml_tensor_extra_gpu * extra : tensor_extras) {
            for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
                for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
                    if (extra->events[i][is] != nullptr) {
                        /*
                        DPCT1009:206: SYCL uses exceptions to report errors and
                        does not use the error codes. The original code was
                        commented out and a warning string was inserted. You
                        need to rewrite this code.
                        */
                        SYCL_CHECK(CHECK_TRY_ERROR(
                            dpct::destroy_event(extra->events[i][is])));
                    }
                }
                if (extra->data_device[i] != nullptr) {
                    /*
                    DPCT1009:207: SYCL uses exceptions to report errors and does
                    not use the error codes. The original code was commented out
                    and a warning string was inserted. You need to rewrite this
                    code.
                    */
                    ggml_sycl_set_device(i);
                    SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(
                        extra->data_device[i], *(streams[i]))));
                }
            }
            delete extra;
            release_extra_gpu(extra, streams);
        }
    }
    catch (sycl::exception const &exc) {
@@ -723,7 +742,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
    ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};

    ctx->tensor_extras.push_back(extra);
    ctx->streams.push_back(&(dpct::get_current_device().default_queue()));
    ctx->streams.push_back(&(dpct::get_current_device().default_queue()));

    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
        int64_t row_low, row_high;
@@ -1337,83 +1356,6 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy,
    reinterpret_cast<sycl::half &>(y[ib].ds.y()) = sum;
}

template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static void k_get_rows(
            const void * src0, const int32_t * src1, dst_t * dst,
            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
            size_t s10, size_t s11, size_t s12,
            const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {

    const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) +
                     item_ct1.get_local_id(2)) *
                    2;
    const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
                    item_ct1.get_local_id(1);
    const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
                     item_ct1.get_local_id(0)) /
                    ne12;
    const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
                     item_ct1.get_local_id(0)) %
                    ne12;

    if (i00 >= ne00) {
        return;
    }

    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];

    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;

    const int ib = i00/qk; // block index
    const int iqs = (i00%qk)/qr; // quant index
    const int iybs = i00 - i00%qk; // dst block start index
    const int y_offset = qr == 1 ? 1 : qk/2;

    // dequantize
    dfloat2 v;
    dequantize_kernel(src0_row, ib, iqs, v);

    dst_row[iybs + iqs + 0] = v.x();
    dst_row[iybs + iqs + y_offset] = v.y();
}

template<typename src0_t, typename dst_t>
static void k_get_rows_float(
            const src0_t * src0, const int32_t * src1, dst_t * dst,
            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
            size_t s10, size_t s11, size_t s12,
            const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {

    const int i00 = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
                    item_ct1.get_local_id(2);
    const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
                    item_ct1.get_local_id(1);
    const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
                     item_ct1.get_local_id(0)) /
                    ne12;
    const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
                     item_ct1.get_local_id(0)) %
                    ne12;

    if (i00 >= ne00) {
        return;
    }

    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];

    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);

    dst_row[i00] = src0_row[i00];
}

static void mul_mat_p021_f16_f32(
    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y,
@@ -1896,81 +1838,6 @@ static void pool2d_nchw_kernel(
        o_ptr[cur_oh * ow + cur_ow] = res;
}

template <int qk, int qr, dequantize_kernel_t dq>
static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
                          ggml_tensor *dst, const void *src0_dd,
                          const int32_t *src1_dd, float *dst_dd,
                          queue_ptr stream) {

    GGML_TENSOR_BINARY_OP_LOCALS

    const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE);
    const int block_num_x = (ne00 + 2*SYCL_GET_ROWS_BLOCK_SIZE - 1) / (2*SYCL_GET_ROWS_BLOCK_SIZE);
    const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);

    // strides in elements
    //const size_t s0 = nb0 / ggml_element_size(dst);
    const size_t s1 = nb1 / ggml_element_size(dst);
    const size_t s2 = nb2 / ggml_element_size(dst);
    const size_t s3 = nb3 / ggml_element_size(dst);

    const size_t s10 = nb10 / ggml_element_size(src1);
    const size_t s11 = nb11 / ggml_element_size(src1);
    const size_t s12 = nb12 / ggml_element_size(src1);
    //const size_t s13 = nb13 / ggml_element_size(src1);

    GGML_ASSERT(ne00 % 2 == 0);

    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                         [=](sycl::nd_item<3> item_ct1) {
                             k_get_rows<qk, qr, dq>(
                                 src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
                                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
                         });

    GGML_UNUSED(dst);
    GGML_UNUSED(ctx);
}

template <typename src0_t>
static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                const ggml_tensor *src1, ggml_tensor *dst,
                                const src0_t *src0_dd, const int32_t *src1_dd,
                                float *dst_dd, queue_ptr stream) {

    GGML_TENSOR_BINARY_OP_LOCALS

    const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE);
    const int block_num_x = (ne00 + SYCL_GET_ROWS_BLOCK_SIZE - 1) / SYCL_GET_ROWS_BLOCK_SIZE;
    const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);

    // strides in elements
    //const size_t s0 = nb0 / ggml_element_size(dst);
    const size_t s1 = nb1 / ggml_element_size(dst);
    const size_t s2 = nb2 / ggml_element_size(dst);
    const size_t s3 = nb3 / ggml_element_size(dst);

    const size_t s10 = nb10 / ggml_element_size(src1);
    const size_t s11 = nb11 / ggml_element_size(src1);
    const size_t s12 = nb12 / ggml_element_size(src1);
    //const size_t s13 = nb13 / ggml_element_size(src1);

    {
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

        stream->parallel_for(
            sycl::nd_range<3>(block_nums * block_dims, block_dims),
            [=](sycl::nd_item<3> item_ct1) {
                k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
                                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
            });
    }

    GGML_UNUSED(dst);
    GGML_UNUSED(ctx);
}

static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
                                   const int ky, const int kx_padded,
                                   queue_ptr stream) {
@@ -2494,52 +2361,6 @@ catch (sycl::exception const &exc) {
    std::exit(1);
}

static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                  const ggml_tensor *src1, ggml_tensor *dst,
                                  const float *src0_d, const float *src1_d,
                                  float *dst_d, const queue_ptr &stream) {

    GGML_ASSERT(src1->type == GGML_TYPE_I32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));

    const int32_t * src1_i32 = (const int32_t *) src1_d;

    switch (src0->type) {
        case GGML_TYPE_F16:
            get_rows_sycl_float(ctx, src0, src1, dst, (const sycl::half *)src0_d,
                                src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_F32:
            get_rows_sycl_float(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q4_0:
            get_rows_sycl<QK4_0, QR4_0, dequantize_q4_0>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q4_1:
            get_rows_sycl<QK4_1, QR4_1, dequantize_q4_1>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q5_0:
            get_rows_sycl<QK5_0, QR5_0, dequantize_q5_0>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q5_1:
            get_rows_sycl<QK5_1, QR5_1, dequantize_q5_1>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q8_0:
            get_rows_sycl<QK8_0, QR8_0, dequantize_q8_0>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        default:
            // TODO: k-quants
            GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
            GGML_ABORT("fatal error");
            break;
    }
}


static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                const ggml_tensor *src1, ggml_tensor *dst,
                                const float *src0_d, const float *src1_d,
@@ -2589,11 +2410,10 @@ inline void ggml_sycl_op_mul_mat_sycl(
    if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
        use_fp16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1] &&
        dst->op_params[0] == GGML_PREC_DEFAULT) {

        // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp16 path\n");
        ggml_sycl_pool_alloc<sycl::half> src0_as_f16(ctx.pool());
        if (src0->type != GGML_TYPE_F16) {
            const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src0->type);
            const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src0->type, dst);
            GGML_ASSERT(to_fp16_sycl != nullptr);
            size_t ne = row_diff*ne00;
            src0_as_f16.alloc(ne);
@@ -2605,7 +2425,7 @@ inline void ggml_sycl_op_mul_mat_sycl(

        ggml_sycl_pool_alloc<sycl::half> src1_as_f16(ctx.pool());
        if (src1->type != GGML_TYPE_F16) {
            const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
            const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
            GGML_ASSERT(to_fp16_sycl != nullptr);
            size_t ne = src1_ncols*ne10;
            src1_as_f16.alloc(ne);
@@ -2626,13 +2446,13 @@ inline void ggml_sycl_op_mul_mat_sycl(
            src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
            dst_f16.get(), dpct::library_data_t::real_half, ldc,
            dpct::library_data_t::real_half)));
        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
        to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
#else
        auto dnnl_stream = ctx.stream_dnnl(stream);
        DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
            src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(), dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>());
        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
        to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
#endif
    }
@@ -2641,13 +2461,13 @@ inline void ggml_sycl_op_mul_mat_sycl(
        ggml_sycl_pool_alloc<float> src0_ddq_as_f32(ctx.pool());
        ggml_sycl_pool_alloc<float> src1_ddq_as_f32(ctx.pool());
        if (src0->type != GGML_TYPE_F32) {
            const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type);
            const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type, dst);
            GGML_ASSERT(to_fp32_sycl != nullptr);
            src0_ddq_as_f32.alloc(row_diff*ne00);
            to_fp32_sycl(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
        }
        if (src1->type != GGML_TYPE_F32) {
            const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src1->type);
            const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src1->type, dst);
            GGML_ASSERT(to_fp32_sycl != nullptr);
            src1_ddq_as_f32.alloc(src1_ncols*ne10);
            to_fp32_sycl(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
@@ -3085,7 +2905,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
    for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
        const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_SYCL_MAX_STREAMS : 0;
        const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;

        for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
            if ((!split && i != ctx.device) || dev[i].row_low == dev[i].row_high) {
                continue;
@@ -3393,7 +3212,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
    // convert src1 to fp16
    ggml_sycl_pool_alloc<sycl::half> src1_f16_alloc(ctx.pool());
    if (src1->type != GGML_TYPE_F16) {
        const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
        const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
        const int64_t ne_src1 = ggml_nelements(src1);
        src1_f16_alloc.alloc(ne_src1);
        GGML_ASSERT(to_fp16_sycl != nullptr);
@@ -3509,6 +3328,7 @@ bool ggml_sycl_supports_dmmv(enum ggml_type type) {
}

static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

    const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
    int64_t min_compute_capability = INT_MAX;
@@ -3570,6 +3390,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
        ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
    } else if (use_dequantize_mul_mat_vec) {
        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
        // save_tensor_txt("1/dst_1.txt", (float*) dst->data, src0->ne[1], sizeof(float), ctx.stream());
    } else if (use_mul_mat_vec_q) {
        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
    } else if (use_mul_mat_q) {
@@ -4251,10 +4072,72 @@
    std::exit(1);
}

void reorder_qw(char *data_device, const int ncols, const int nrows,
                size_t size, size_t offset, dpct::queue_ptr stream) {
    auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
    SYCL_CHECK(
        CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
            .wait()));
    GGML_ASSERT((size % sizeof(block_q4_0) == 0));
    GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
    int offset_blks = offset / sizeof(block_q4_0);
    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;;
    auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;

    stream->parallel_for(
        size / sizeof(block_q4_0),
        [=](auto i) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
            const block_q4_0* x = (const block_q4_0*)tmp_buf;
            const int ib = i;

            for (int j = 0; j < QK4_0/2; j ++)
            {
                *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
            }
            *(d_ptr + ib) = x[ib].d;
        });

    sycl::free(tmp_buf, *stream);
}

void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
    char*data_device = (char*)src0->data;
    size_t ncols = src0->ne[0];
    size_t nrows = src0->ne[1];
    size_t size = ggml_nbytes(src0);

    reorder_qw(data_device, ncols, nrows, size, 0, stream);
}

void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
    ggml_tensor *src0 = dst->src[0];
    ggml_tensor *src1 = dst->src[1];

    if (dst->op == GGML_OP_MUL_MAT && src0->type == GGML_TYPE_Q4_0 &&
        src1->ne[2]==1 && src1->ne[3]==1) {
        reorder_qw(src0, stream);
        ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra;
        GGML_ASSERT(extra);
        extra->optimized_feature.reorder = true; //used to decode/dequan in next steps.
    }
}

void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
    dpct::queue_ptr stream = ctx->stream();
    if (ctx->optimized_graph) {
        return;
    }
    ctx->optimized_graph = true;

    for (int i = 0; i < cgraph->n_nodes; i++) {
        if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream);
    }
}
static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
    ggml_sycl_set_main_device(sycl_ctx->device);

    if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx);

    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_tensor * node = cgraph->nodes[i];