Optimize mul_mat for Q4_0 on Intel GPU (llama/12035)
* Optimize performance by reordering data for Intel GPUs
* Detect the hardware type, save the optimization feature, and print it
* Correct a name
* Optimize the graph once when computing the graph, recording the optimization status in tensor->extra; make CI pass
* Add the environment variable GGML_SYCL_DISABLE_OPT for debugging
* Use syclex::architecture instead of the custom hardware defines; update the guide for GGML_SYCL_DISABLE_OPT
* Add performance data
* Move the getrows functions to separate files
* Fix global variables

---------

Co-authored-by: arthw <14088817+arthw@users.noreply.github.com>
Committed by: Georgi Gerganov
Parent: ac301a7d9b
Commit: e3cb412a59
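The commit message mentions a debug switch, GGML_SYCL_DISABLE_OPT, that gates the new reorder optimization. Below is a minimal, self-contained sketch of how such a flag is commonly wired up, assuming the usual getenv pattern; the commit's actual parsing and logging may differ, and the local global here stands in for the `extern int g_ggml_sycl_disable_optimize;` declared in the header diff that follows.

#include <cstdio>
#include <cstdlib>

// stand-in for the header's `extern int g_ggml_sycl_disable_optimize;`
static int g_ggml_sycl_disable_optimize = 0;

static void ggml_sycl_init_env_flags() {
    // assumed semantics: any value other than "0" disables the optimization
    const char * v = std::getenv("GGML_SYCL_DISABLE_OPT");
    g_ggml_sycl_disable_optimize = (v != nullptr && v[0] != '0') ? 1 : 0;
    std::fprintf(stderr, "GGML_SYCL_DISABLE_OPT=%d\n", g_ggml_sycl_disable_optimize);
}

Reading the variable once at initialization keeps the hot compute path down to a cheap integer test instead of a per-call getenv.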
@@ -19,6 +19,9 @@
#include "dpct/helper.hpp"
#include "ggml-sycl.h"
#include "presets.hpp"
#include "sycl_hw.hpp"


#if GGML_SYCL_DNNL
#include "dnnl.hpp"
#include "dnnl_sycl.hpp"
@@ -35,7 +38,10 @@
void* ggml_sycl_host_malloc(size_t size);
void ggml_sycl_host_free(void* ptr);


extern int g_ggml_sycl_debug;
extern int g_ggml_sycl_disable_optimize;

#define GGML_SYCL_DEBUG(...)       \
    do {                           \
        if (g_ggml_sycl_debug)     \
@@ -182,18 +188,24 @@ inline dpct::err0 ggml_sycl_set_device(const int device) try {
}

//////////////////////
struct optimize_feature {
    bool reorder=false;
};

struct sycl_device_info {
    int cc;          // compute capability
    // int nsm;      // number of streaming multiprocessors
    // size_t smpb;  // max. shared memory per block
    bool vmm;        // virtual memory support
    size_t total_vram;
    sycl_hw_info hw_info;
    optimize_feature opt_feature;
};


struct ggml_sycl_device_info {
    int device_count;

    struct sycl_device_info {
        int cc;          // compute capability
        // int nsm;      // number of streaming multiprocessors
        // size_t smpb;  // max. shared memory per block
        bool vmm;        // virtual memory support
        size_t total_vram;
    };

    sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};

    std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
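The new `optimize_feature` struct carries a single per-device flag that downstream code can branch on. A self-contained, runnable illustration of that dispatch idea; `choose_mul_mat_path` is a hypothetical helper, not part of the commit:

#include <cstdio>

// same shape as the struct introduced in the hunk above
struct optimize_feature {
    bool reorder = false;
};

// hypothetical helper: pick a kernel path based on the per-device flag
static const char * choose_mul_mat_path(const optimize_feature & f) {
    return f.reorder ? "reordered Q4_0 kernel" : "generic Q4_0 kernel";
}

int main() {
    optimize_feature f;
    f.reorder = true; // e.g. set when running on a supported Intel GPU
    std::printf("dispatch: %s\n", choose_mul_mat_path(f));
}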
@@ -260,17 +272,46 @@ struct ggml_tensor_extra_gpu {
    // tensors
    dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
                          [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
    optimize_feature optimized_feature;
};

void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams={});

inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) {
    optimize_feature opt;

    opt.reorder =
        (arch == syclex::architecture::intel_gpu_dg1 ||
         arch == syclex::architecture::intel_gpu_acm_g10 ||
         arch == syclex::architecture::intel_gpu_acm_g11 ||
         arch == syclex::architecture::intel_gpu_acm_g12 ||
         arch == syclex::architecture::intel_gpu_pvc ||
         arch == syclex::architecture::intel_gpu_pvc_vg ||
         arch == syclex::architecture::intel_gpu_mtl_u ||
         arch == syclex::architecture::intel_gpu_mtl_s ||
         arch == syclex::architecture::intel_gpu_mtl_h ||
         arch == syclex::architecture::intel_gpu_arl_u ||
         arch == syclex::architecture::intel_gpu_arl_s ||
         arch == syclex::architecture::intel_gpu_arl_h ||
         arch == syclex::architecture::intel_gpu_bmg_g21 ||
         arch == syclex::architecture::intel_gpu_lnl_m
        );

    return opt;
}

struct ggml_backend_sycl_context {
    int device;
    std::string name;
    optimize_feature opt_feature;
    bool optimized_graph=false;

    queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };

    explicit ggml_backend_sycl_context(int device) :
        device(device),
        name(GGML_SYCL_NAME + std::to_string(device)) {
        opt_feature = ggml_sycl_info().devices[device].opt_feature;
    }

    queue_ptr stream(int device, int stream) {
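`check_gpu_optimize_feature()` takes a `syclex::architecture` value, which the commit message says replaces the custom hardware defines. A sketch of how it might be fed, assuming the DPC++ sycl_ext_oneapi_device_architecture extension (this is the standard way to query the architecture; the commit's actual call site may differ):

#include <sycl/sycl.hpp>

namespace syclex = sycl::ext::oneapi::experimental;

// assumes the header above is included so check_gpu_optimize_feature is visible
optimize_feature query_device_opt_feature(sycl::device & dev) {
    // e.g. returns syclex::architecture::intel_gpu_pvc on Ponte Vecchio
    syclex::architecture arch = dev.get_info<syclex::info::device::architecture>();
    return check_gpu_optimize_feature(arch);
}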
@@ -680,5 +721,4 @@ bool gpu_has_xmx(sycl::device &dev);
void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                          const ggml_tensor *src1, ggml_tensor *dst,
                          const ggml_sycl_op_flatten_t op);

#endif // GGML_SYCL_COMMON_HPP
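Finally, the commit message describes optimizing the graph once at compute time and recording the status in tensor->extra. A hypothetical sketch of that pattern using the declarations from this header; `ensure_graph_optimized` is an illustrative name, and it assumes ggml.h's `ggml_cgraph` with its `n_nodes`/`nodes` fields:

// not the commit's code: an illustration of the optimize-once flow
static void ensure_graph_optimized(ggml_backend_sycl_context * ctx, ggml_cgraph * graph) {
    if (g_ggml_sycl_disable_optimize || ctx->optimized_graph) {
        return; // disabled via env var, or already optimized on a previous pass
    }
    for (int i = 0; i < graph->n_nodes; ++i) {
        ggml_tensor * node = graph->nodes[i];
        auto * extra = static_cast<ggml_tensor_extra_gpu *>(node->extra);
        if (extra != nullptr && ctx->opt_feature.reorder) {
            extra->optimized_feature.reorder = true; // record opt status in tensor->extra
        }
    }
    ctx->optimized_graph = true; // later compute passes skip this work
}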