whisper : adapt to latest ggml (skip) (#0)

Georgi Gerganov 2024-10-05 13:14:03 +03:00
parent 0b1b094a67
commit 941912467d
13 changed files with 368 additions and 2585 deletions


@@ -904,10 +904,10 @@ ggml/src/ggml-alloc.o: \
 	$(CC)  $(CFLAGS)   -c $< -o $@

 ggml/src/ggml-backend.o: \
-	ggml/src/ggml-backend.c \
+	ggml/src/ggml-backend.cpp \
 	ggml/include/ggml.h \
 	ggml/include/ggml-backend.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
+	$(CXX) $(CXXFLAGS) -c $< -o $@

 ggml/src/ggml-quants.o: \
 	ggml/src/ggml-quants.c \


@@ -34,7 +34,7 @@ let package = Package(
                 "src/whisper.cpp",
                 "ggml/src/ggml-aarch64.c",
                 "ggml/src/ggml-alloc.c",
-                "ggml/src/ggml-backend.c",
+                "ggml/src/ggml-backend.cpp",
                 "ggml/src/ggml-quants.c",
                 "ggml/src/ggml-metal.m"
             ],


@@ -11,7 +11,7 @@ system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.h')} ."
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.c')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend-impl.h')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.c')} .")
+system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.cpp')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-common.h')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.h')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.c')} .")


@@ -12,9 +12,7 @@
 #  include "ggml-rpc.h"
 #endif
-#ifdef GGML_USE_CUDA
-#  include "ggml-cuda.h"
-#elif defined(GGML_USE_VULKAN)
+#if defined(GGML_USE_VULKAN)
 #  include "ggml-vulkan.h"
 #elif defined(GGML_USE_SYCL)
 #  include "ggml-sycl.h"
@@ -610,7 +608,7 @@ enum llm_tensor {
     LLM_TENSOR_CLS_OUT,
 };

-static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
+static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
     {
         LLM_ARCH_LLAMA,
         {
@@ -1566,32 +1564,32 @@ struct LLM_TN {
         return LLM_TENSOR_NAMES.at(arch).at(tensor);
     }

-    std::string operator()(llm_tensor tensor, const std::string & suffix) const {
+    std::string operator()(llm_tensor tensor, const char * suffix) const {
         if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
             return "__missing__";
         }
-        return LLM_TENSOR_NAMES.at(arch).at(tensor) + "." + suffix;
+        return std::string(LLM_TENSOR_NAMES.at(arch).at(tensor)) + "." + suffix;
     }

     std::string operator()(llm_tensor tensor, int bid) const {
         if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
             return "__missing__";
         }
-        return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid);
+        return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid);
     }

-    std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
+    std::string operator()(llm_tensor tensor, const char * suffix, int bid) const {
         if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
             return "__missing__";
         }
-        return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid) + "." + suffix;
+        return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid) + "." + suffix;
     }

-    std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
+    std::string operator()(llm_tensor tensor, const char * suffix, int bid, int xid) const {
         if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
             return "__missing__";
         }
-        return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix;
+        return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid, xid) + "." + suffix;
     }
 };
@@ -2264,59 +2262,16 @@ static std::string llama_token_to_piece(const struct llama_model * model, llama_
     return piece;
 }

-static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
-    ggml_backend_buffer_type_t buft = nullptr;
-
-#if defined(GGML_USE_CUDA)
-    // host buffers should only be used when data is expected to be copied to/from the GPU
-    if (host_buffer) {
-        buft = ggml_backend_cuda_host_buffer_type();
-    }
-#elif defined(GGML_USE_SYCL)
-    if (host_buffer) {
-        buft = ggml_backend_sycl_host_buffer_type();
-    }
-#elif defined(GGML_USE_CANN)
-    if (host_buffer) {
-        buft = ggml_backend_cann_host_buffer_type();
-    }
-#elif defined(GGML_USE_CPU_HBM)
-    buft = ggml_backend_cpu_hbm_buffer_type();
-#elif defined(GGML_USE_VULKAN)
-    if (host_buffer) {
-        buft = ggml_backend_vk_host_buffer_type();
-    }
-#endif
-
-    if (buft == nullptr) {
-        buft = ggml_backend_cpu_buffer_type();
-    }
-    return buft;
-
-    GGML_UNUSED(host_buffer);
-}
-
 //
 // globals
 //

-struct llama_state {
-    llama_state() {
-#ifdef GGML_USE_METAL
-        ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
-#elif defined(GGML_USE_CUDA)
-        ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
-#elif defined(GGML_USE_CANN)
-        ggml_backend_cann_log_set_callback(log_callback, log_callback_user_data);
-#endif
-    }
-
-    // We save the log callback globally
+struct llama_logger_state {
     ggml_log_callback log_callback = llama_log_callback_default;
     void * log_callback_user_data = nullptr;
 };
-static llama_state g_state;
+
+static llama_logger_state g_logger_state;

 // available llama models
 enum e_model {
@@ -2920,14 +2875,17 @@ struct llama_model {
     std::vector<llama_layer> layers;

+    // gguf metadata
+    std::unordered_map<std::string, std::string> gguf_kv;
+
     llama_split_mode split_mode;
     int main_gpu;
     int n_gpu_layers;

-    std::vector<std::string> rpc_servers;
+    // list of devices used in this model
+    std::vector<ggml_backend_dev_t> devices;

-    // gguf metadata
-    std::unordered_map<std::string, std::string> gguf_kv;
+    std::vector<std::string> rpc_servers;

     // layer -> buffer type mapping
     struct layer_buft {
@@ -2970,11 +2928,6 @@ struct llama_model {
             ggml_free(ctx);
         }
         for (ggml_backend_buffer_t buf : bufs) {
-#ifdef GGML_USE_CUDA
-            if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
-                ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
-            }
-#endif
             ggml_backend_buffer_free(buf);
         }
         while (!lora_adapters.empty()) {
@@ -3460,72 +3413,116 @@ struct llama_lora_adapter {
     }
 };

-static size_t llama_get_device_count(const llama_model & model) {
-    size_t count = 1;
-#if defined(GGML_USE_CUDA)
-    count = ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-    count = ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-    count = ggml_backend_vk_get_device_count();
-#elif defined(GGML_USE_CANN)
-    return ggml_backend_cann_get_device_count();
-#endif
+static int llama_get_device_count(const llama_model & model) {
+    int count = (int) model.devices.size();
+
 #if defined(GGML_USE_RPC)
-    count += model.rpc_servers.size();
+    count += (int) model.rpc_servers.size();
 #endif
+
+#if defined(GGML_USE_METAL)
+    count += 1;
+#elif defined(GGML_USE_SYCL)
+    count += ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+    count += ggml_backend_vk_get_device_count();
+#elif defined(GGML_USE_CANN)
+    count += ggml_backend_cann_get_device_count();
+#endif
+
     return count;
+
     GGML_UNUSED(model);
 }

-static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
+static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_model & model, bool host_buffer) {
     ggml_backend_buffer_type_t buft = nullptr;
-#ifdef GGML_USE_RPC
-    int rpc_count = (int)model.rpc_servers.size();
-#else
-    int rpc_count = 0;
-#endif
-    int local_gpu = gpu - rpc_count;
-#if defined(GGML_USE_RPC)
-    if (gpu < rpc_count) {
-        const char * endpoint = model.rpc_servers[gpu].c_str();
-        return ggml_backend_rpc_buffer_type(endpoint);
+
+    if (host_buffer) {
+        for (auto * dev : model.devices) {
+            buft = ggml_backend_dev_host_buffer_type(dev);
+            if (buft != nullptr) {
+                break;
+            }
+        }
     }
-#endif
-#if defined(GGML_USE_METAL)
-    buft = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_CUDA)
-    buft = ggml_backend_cuda_buffer_type(local_gpu);
-#elif defined(GGML_USE_VULKAN)
-    buft = ggml_backend_vk_buffer_type(local_gpu);
-#elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_buffer_type(local_gpu);
-#elif defined(GGML_USE_KOMPUTE)
-    buft = ggml_backend_kompute_buffer_type(local_gpu);
-    if (buft == nullptr) {
-        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, local_gpu);
+
+#if defined(GGML_USE_SYCL)
+    if (host_buffer) {
+        buft = ggml_backend_sycl_host_buffer_type();
     }
 #elif defined(GGML_USE_CANN)
-    buft = ggml_backend_cann_buffer_type(local_gpu);
+    if (host_buffer) {
+        buft = ggml_backend_cann_host_buffer_type();
+    }
+#elif defined(GGML_USE_CPU_HBM)
+    buft = ggml_backend_cpu_hbm_buffer_type();
+#elif defined(GGML_USE_VULKAN)
+    if (host_buffer) {
+        buft = ggml_backend_vk_host_buffer_type();
+    }
 #endif

     if (buft == nullptr) {
-        buft = llama_default_buffer_type_cpu(true);
+        buft = ggml_backend_cpu_buffer_type();
     }
     return buft;
+
+    GGML_UNUSED(host_buffer);
+}
+
+static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int device) {
+    ggml_backend_buffer_type_t buft = nullptr;
+
+#if defined(GGML_USE_RPC)
+    int rpc_count = (int)model.rpc_servers.size();
+    if (device < rpc_count) {
+        const char * endpoint = model.rpc_servers[device].c_str();
+        return ggml_backend_rpc_buffer_type(endpoint);
+    }
+    device -= rpc_count;
+#endif
+
+    if (device < (int)model.devices.size()) {
+        return ggml_backend_dev_buffer_type(model.devices[device]);
+    }
+    device -= (int)model.devices.size();
+
+#if defined(GGML_USE_METAL)
+    buft = ggml_backend_metal_buffer_type();
+#elif defined(GGML_USE_VULKAN)
+    buft = ggml_backend_vk_buffer_type(device);
+#elif defined(GGML_USE_SYCL)
+    buft = ggml_backend_sycl_buffer_type(device);
+#elif defined(GGML_USE_KOMPUTE)
+    buft = ggml_backend_kompute_buffer_type(device);
+#elif defined(GGML_USE_CANN)
+    buft = ggml_backend_cann_buffer_type(device);
+#endif
+
+    if (buft == nullptr) {
+        buft = llama_default_buffer_type_cpu(model, true);
+    }
+    return buft;
+
     GGML_UNUSED(model);
-    GGML_UNUSED(local_gpu);
 }

 static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
     ggml_backend_buffer_type_t buft = nullptr;

-#ifdef GGML_USE_CUDA
-    if (ggml_backend_cuda_get_device_count() > 1) {
-        buft = ggml_backend_cuda_split_buffer_type(tensor_split);
+    // find a backend that supports split buffers
+    for (size_t i = 0; i < ggml_backend_reg_count(); ++i) {
+        ggml_backend_reg_t reg = ggml_backend_reg_get(i);
+
+        auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
+        if (ggml_backend_split_buffer_type_fn) {
+            buft = ggml_backend_split_buffer_type_fn(tensor_split);
+            if (buft != nullptr) {
+                break;
+            }
+        }
     }
-#endif

 #ifdef GGML_USE_SYCL
     if (ggml_backend_sycl_get_device_count() > 1) {
@@ -3542,13 +3539,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
 }

 static size_t llama_get_device_memory(const llama_model & model, int device) {
-#ifdef GGML_USE_RPC
-    int rpc_count = (int)model.rpc_servers.size();
-#else
-    int rpc_count = 0;
-#endif
-    int local_device = device - rpc_count;
 #if defined(GGML_USE_RPC)
+    int rpc_count = (int)model.rpc_servers.size();
     if (device < rpc_count) {
         size_t total;
         size_t free;
@@ -3556,32 +3548,37 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
         ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
         return free;
     }
+    device = device - rpc_count;
 #endif
-#if defined(GGML_USE_CUDA)
+
+    if (device < (int)model.devices.size()) {
+        ggml_backend_dev_t dev = model.devices[device];
+        size_t total;
+        size_t free;
+        ggml_backend_dev_memory(dev, &free, &total);
+        return free;
+    }
+
+#if defined(GGML_USE_SYCL)
     size_t total;
     size_t free;
-    ggml_backend_cuda_get_device_memory(local_device, &free, &total);
-    return free;
-#elif defined(GGML_USE_SYCL)
-    size_t total;
-    size_t free;
-    ggml_backend_sycl_get_device_memory(local_device, &free, &total);
+    ggml_backend_sycl_get_device_memory(device, &free, &total);
     return free;
 #elif defined(GGML_USE_VULKAN)
     size_t total;
     size_t free;
-    ggml_backend_vk_get_device_memory(local_device, &free, &total);
+    ggml_backend_vk_get_device_memory(device, &free, &total);
     return free;
 #elif defined(GGML_USE_CANN)
     size_t total;
     size_t free;
-    ggml_backend_cann_get_device_memory(local_device, &free, &total);
+    ggml_backend_cann_get_device_memory(device, &free, &total);
     return free;
 #else
     return 1;
 #endif
+
     GGML_UNUSED(model);
-    GGML_UNUSED(local_device);
+    GGML_UNUSED(device);
 }

 //
@@ -3624,7 +3621,7 @@ static bool llama_kv_cache_init(
             buft_layer_count[model.buft_layer[i].buft]++;
         }
     } else {
-        buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
+        buft_layer_count[llama_default_buffer_type_cpu(model, true)] = n_layer;
     }

     // create a context for each buffer type
@@ -4916,7 +4913,7 @@ struct llama_model_loader {
     static const int TENSOR_NOT_REQUIRED = 1;
     static const int TENSOR_DUPLICATED   = 2;

-    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags = 0) {
         const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));

         if (cur == NULL) {
@@ -4926,7 +4923,7 @@ struct llama_model_loader {
         return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
     }

-    struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
+    struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true) {
         const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);

         if (cur == NULL) {
@@ -4939,7 +4936,7 @@ struct llama_model_loader {
         std::array<int64_t, GGML_MAX_DIMS> dims;
         for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
-            dims[i] = i < ne.size() ? ne[i] : 1;
+            dims[i] = i < ne.size() ? ne.begin()[i] : 1;
         }

         struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
@@ -5037,7 +5034,7 @@ struct llama_model_loader {
     // Returns false if cancelled by progress_callback
     bool load_all_data(
             struct ggml_context * ctx,
-            llama_buf_map & bufs_mmap,
+            llama_buf_map & bufs,
             llama_mlocks * lmlocks,
             llama_progress_callback progress_callback,
             void * progress_callback_user_data) {
@@ -5046,43 +5043,94 @@ struct llama_model_loader {
         std::vector<no_init<uint8_t>> read_buf;
         std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;

-#if defined(GGML_USE_CUDA)
         // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
         // NVMe raid configurations might require more / larger buffers.
         constexpr size_t n_buffers = 4;
         constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB

         std::vector<ggml_backend_buffer_t> host_buffers;
-        std::vector<void*> host_ptrs;
         std::vector<ggml_backend_event_t> events;
+        std::vector<void *> host_ptrs;
         size_t buffer_idx = 0; // buffer to use for async loads

-        ggml_backend_t cuda_backend = nullptr;
-        if (!use_mmap && !check_tensors) {
+        ggml_backend_t upload_backend = [&](const char * fn) -> ggml_backend_t {
+            if (use_mmap || check_tensors) {
+                return nullptr;
+            }
             // When not using mmaped io use async uploads from pinned memory to GPU memory.
-            // First determine if the CUDA backend is active, and if so, determine the device ID.
-            ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr;
-            if (buf) {
-                ggml_backend_buffer_type_t buffer_type = ggml_backend_buffer_get_type(buf);
-                for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
-                    auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i);
-                    if (buffer_type == cuda_buffer_type) {
-                        cuda_backend = ggml_backend_cuda_init(i);
-                        break;
-                    }
-                }
+            // First determine if the backend supports the necessary features for async uploads.
+            auto * buf = bufs.count(0) ? bufs.at(0) : nullptr;
+            if (!buf) {
+                LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", fn);
+                return nullptr;
             }

-            // If the cuda backend is active create pinned memory buffers and events for synchronisation.
-            if (cuda_backend) {
-                for (size_t idx = 0; idx < n_buffers; ++idx) {
-                    host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size));
-                    host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
-                    events.emplace_back(ggml_backend_event_new(cuda_backend));
-                }
+            auto * buft = ggml_backend_buffer_get_type(buf);
+            auto * dev = ggml_backend_buft_get_device(buft);
+            if (!dev) {
+                LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", fn,
+                    ggml_backend_buft_name(buft));
+                return nullptr;
             }
+
+            if (buft != ggml_backend_dev_buffer_type(dev)) {
+                LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", fn,
+                    ggml_backend_buft_name(buft), ggml_backend_dev_name(dev));
+                return nullptr;
+            }
+
+            ggml_backend_dev_props props;
+            ggml_backend_dev_get_props(dev, &props);
+            if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) {
+                LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", fn,
+                    ggml_backend_dev_name(dev));
+                return nullptr;
+            }
+
+            auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
+            if (!host_buft) {
+                LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", fn,
+                    ggml_backend_dev_name(dev));
+                return nullptr;
+            }
+
+            // If the backend is supported, create pinned memory buffers and events for synchronisation.
+            for (size_t idx = 0; idx < n_buffers; ++idx) {
+                auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
+                if (!buf) {
+                    LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", fn,
+                        ggml_backend_dev_name(dev));
+                    return nullptr;
+                }
+
+                host_buffers.emplace_back(buf);
+                host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf));
+
+                auto * event = ggml_backend_event_new(dev);
+                if (!event) {
+                    LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", fn,
+                        ggml_backend_dev_name(dev));
+                    return nullptr;
+                }
+
+                events.emplace_back(event);
+            }
+
+            ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+            if (!backend) {
+                LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", fn,
+                    ggml_backend_dev_name(dev));
+                return nullptr;
+            }
+
+            return backend;
+        }(__func__);
+
+        if (upload_backend) {
+            LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__,
+                ggml_backend_dev_name(ggml_backend_get_device(upload_backend)),
+                ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))),
+                ggml_backend_name(upload_backend));
         }
-#endif

         for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             const auto * weight = get_weight(ggml_get_name(cur));
@@ -5102,8 +5150,8 @@ struct llama_model_loader {
             if (use_mmap) {
                 const auto & mapping = mappings.at(weight->idx);
                 ggml_backend_buffer_t buf_mmap = nullptr;
-                if (bufs_mmap.count(weight->idx)) {
-                    buf_mmap = bufs_mmap.at(weight->idx);
+                if (bufs.count(weight->idx)) {
+                    buf_mmap = bufs.at(weight->idx);
                 }
                 uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
@@ -5139,9 +5187,8 @@ struct llama_model_loader {
                     }));
                 }
             } else {
-#if defined(GGML_USE_CUDA)
-                // If cuda_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
-                if (cuda_backend) {
+                // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
+                if (upload_backend) {
                     file->seek(weight->offs, SEEK_SET);

                     size_t bytes_read = 0;
@@ -5151,17 +5198,14 @@ struct llama_model_loader {
                         ggml_backend_event_synchronize(events[buffer_idx]);
                         file->read_raw(host_ptrs[buffer_idx], read_iteration);
-                        ggml_backend_tensor_set_async(cuda_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
-                        ggml_backend_event_record(events[buffer_idx]);
+                        ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+                        ggml_backend_event_record(events[buffer_idx], upload_backend);

                         bytes_read += read_iteration;
                         ++buffer_idx;
                         buffer_idx %= n_buffers;
                     }
-                }
-                else
-#endif
-                {
+                } else {
                     read_buf.resize(n_size);
                     file->seek(weight->offs, SEEK_SET);
                     file->read_raw(read_buf.data(), n_size);
@@ -5176,17 +5220,15 @@ struct llama_model_loader {
             size_done += n_size;
         }

-#if defined(GGML_USE_CUDA)
-        // free temporary resources used for async cuda uploads
-        if (cuda_backend) {
-            for (size_t idx = 0; idx < n_buffers;++idx) {
-                ggml_backend_event_synchronize(events[idx]);
-                ggml_backend_event_free(events[idx]);
-                ggml_backend_buffer_free(host_buffers[idx]);
-            }
-            ggml_backend_free(cuda_backend);
+        // free temporary resources used for async uploads
+        for (auto * event : events) {
+            ggml_backend_event_synchronize(event);
+            ggml_backend_event_free(event);
         }
-#endif
+        for (auto * buf : host_buffers) {
+            ggml_backend_buffer_free(buf);
+        }
+        ggml_backend_free(upload_backend);

         // check validation results
         bool validation_failed = false;
@@ -6922,6 +6964,13 @@ static bool llm_load_tensors(
     void * progress_callback_user_data) {
     auto & hparams = model.hparams;

+    // check if the value of main_gpu is valid
+    if (llama_get_device_count(model) > 0 &&
+        split_mode != LLAMA_SPLIT_MODE_LAYER &&
+        (main_gpu < 0 || main_gpu >= llama_get_device_count(model))) {
+        throw std::runtime_error(format("invalid value for main_gpu: %d (available devices: %d)", main_gpu, llama_get_device_count(model)));
+    }
+
     model.split_mode   = split_mode;
     model.main_gpu     = main_gpu;
     model.n_gpu_layers = n_gpu_layers;
@@ -6931,14 +6980,14 @@ static bool llm_load_tensors(
     bool use_mmap_buffer = true;

     // there is very little benefit to offloading the input layer, so always keep it on the CPU
-    model.buft_input = llama_default_buffer_type_cpu(true);
+    model.buft_input = llama_default_buffer_type_cpu(model, true);
     //model.buft_input = llama_default_buffer_type_offload(main_gpu);

     model.buft_layer.resize(n_layer);

     // assign cpu layers
     for (int i = 0; i < i_gpu_start; ++i) {
-        model.buft_layer[i] = llama_default_buffer_type_cpu(true);
+        model.buft_layer[i] = llama_default_buffer_type_cpu(model, true);
     }

     if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
@@ -6976,7 +7025,7 @@ static bool llm_load_tensors(
             int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
             model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
         } else {
-            model.buft_output = llama_default_buffer_type_cpu(true);
+            model.buft_output = llama_default_buffer_type_cpu(model, true);
         }
     } else {
         ggml_backend_buffer_type_t split_buft;
@@ -7000,7 +7049,7 @@ static bool llm_load_tensors(
                 llama_default_buffer_type_offload(model, main_gpu)
             };
         } else {
-            model.buft_output = llama_default_buffer_type_cpu(true);
+            model.buft_output = llama_default_buffer_type_cpu(model, true);
         }
     }
@@ -8872,7 +8921,7 @@ static bool llm_load_tensors(
         // only the mmap region containing the tensors in the model is mapped to the backend buffer
         // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
         // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
-        if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) {
+        if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(model, true)) {
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 void * addr = nullptr;
                 size_t first, last;
@@ -8886,13 +8935,6 @@ static bool llm_load_tensors(
                 }
                 model.bufs.push_back(buf);
                 bufs.emplace(idx, buf);
-#ifdef GGML_USE_CUDA
-                if (n_layer >= n_gpu_layers) {
-                    ggml_backend_cuda_register_host_buffer(
-                        ggml_backend_buffer_get_base(buf),
-                        ggml_backend_buffer_get_size(buf));
-                }
-#endif
             }
         }
 #ifdef GGML_USE_METAL
@@ -16956,7 +16998,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
         lctx.embd = nullptr;
     }

-    lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
+    lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(lctx.model, true), new_size);
     if (lctx.buf_output == nullptr) {
         LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
         return 0;
@@ -18987,21 +19029,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 }

 size_t llama_max_devices(void) {
-#if defined(GGML_USE_RPC)
-    return GGML_RPC_MAX_SERVERS;
-#elif defined(GGML_USE_METAL)
-    return 1;
-#elif defined(GGML_USE_CUDA)
-    return GGML_CUDA_MAX_DEVICES;
-#elif defined(GGML_USE_SYCL)
-    return GGML_SYCL_MAX_DEVICES;
-#elif defined(GGML_USE_VULKAN)
-    return GGML_VK_MAX_DEVICES;
-#elif defined(GGML_USE_CANN)
-    return GGML_CANN_MAX_DEVICES;
-#else
-    return 1;
-#endif
+    return 16;
 }

 bool llama_supports_mmap(void) {
@@ -19013,12 +19041,13 @@ bool llama_supports_mlock(void) {
 }

 bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+#if defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
     defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
 #else
-    return false;
+    return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
+           ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr;
 #endif
 }
@@ -19083,17 +19112,30 @@ struct llama_model * llama_load_model_from_file(
                 return true;
             };
     }

     if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
         // split the servers set them into model->rpc_servers
         std::string servers(params.rpc_servers);
         size_t pos = 0;
-        while ((pos = servers.find(",")) != std::string::npos) {
+        while ((pos = servers.find(',')) != std::string::npos) {
             std::string server = servers.substr(0, pos);
             model->rpc_servers.push_back(server);
             servers.erase(0, pos + 1);
         }
         model->rpc_servers.push_back(servers);
     }

+    // create list of devices to use with this model
+    // currently, we use all available devices
+    // TODO: rework API to give user more control over device selection
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        // skip the CPU backend since it is handled separately
+        if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU_FULL) {
+            model->devices.push_back(dev);
+        }
+    }
+
     int status = llama_model_load(path_model, *model, params);
     GGML_ASSERT(status <= 0);
     if (status < 0) {
@@ -19255,6 +19297,36 @@ struct llama_context * llama_new_context_with_model(
     if (!hparams.vocab_only) {
         // initialize backends
+        int main_gpu = model->main_gpu;
+
+        // with registry
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            if (main_gpu >= 0 && main_gpu < (int)model->devices.size()) {
+                ggml_backend_dev_t main_dev = model->devices[main_gpu];
+                ggml_backend_t backend = ggml_backend_dev_init(main_dev, nullptr);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(main_dev));
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
+        } else {
+            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
+            for (auto * dev : model->devices) {
+                ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
+        }
+        if (main_gpu >= (int)model->devices.size()) {
+            main_gpu -= (int)model->devices.size();
+        }
+
 #if defined(GGML_USE_RPC)
         if (model->n_gpu_layers > 0) {
             for (const auto & endpoint : model->rpc_servers) {
@@ -19267,6 +19339,9 @@ struct llama_context * llama_new_context_with_model(
                 ctx->backends.push_back(backend);
             }
         }
+        if (main_gpu >= (int)model->rpc_servers.size()) {
+            main_gpu -= (int)model->rpc_servers.size();
+        }
 #endif

 #if defined(GGML_USE_METAL)
@@ -19279,28 +19354,6 @@ struct llama_context * llama_new_context_with_model(
             }
             ctx->backends.push_back(ctx->backend_metal);
         }
-#elif defined(GGML_USE_CUDA)
-        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
-            ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        } else {
-            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
-            for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
-                ggml_backend_t backend = ggml_backend_cuda_init(device);
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
-        }
 #elif defined(GGML_USE_VULKAN)
         if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
             LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
@@ -19308,7 +19361,7 @@ struct llama_context * llama_new_context_with_model(
             return nullptr;
         }
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
-            ggml_backend_t backend = ggml_backend_vk_init(model->main_gpu);
+            ggml_backend_t backend = ggml_backend_vk_init(main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
                 llama_free(ctx);
@@ -19329,9 +19382,9 @@ struct llama_context * llama_new_context_with_model(
 #elif defined(GGML_USE_SYCL)
         // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
+            ggml_backend_t backend = ggml_backend_sycl_init(main_gpu);
             if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
+                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, main_gpu);
                 llama_free(ctx);
                 return nullptr;
             }
@@ -19350,7 +19403,7 @@ struct llama_context * llama_new_context_with_model(
         }
 #elif defined(GGML_USE_KOMPUTE)
         if (model->n_gpu_layers > 0) {
-            auto * backend = ggml_backend_kompute_init(model->main_gpu);
+            auto * backend = ggml_backend_kompute_init(main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
                 llama_free(ctx);
@@ -19359,29 +19412,29 @@ struct llama_context * llama_new_context_with_model(
             ctx->backends.push_back(backend);
         }
 #elif defined(GGML_USE_CANN)
         // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
         // TODO: ggml_backend_cann is not support split tensor now, just leave code here.
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            ggml_backend_t backend = ggml_backend_cann_init(model->main_gpu);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, model->main_gpu);
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        } else {
-            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
-            // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
-            for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) {
-                ggml_backend_t backend = ggml_backend_cann_init(device);
+            ggml_backend_t backend = ggml_backend_cann_init(main_gpu);
             if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
+                LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, main_gpu);
                 llama_free(ctx);
                 return nullptr;
             }
             ctx->backends.push_back(backend);
+        } else {
+            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
+            // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
+            for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) {
+                ggml_backend_t backend = ggml_backend_cann_init(device);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
         }
-        }
 #endif
 #ifdef GGML_USE_BLAS
@@ -19446,7 +19499,7 @@ struct llama_context * llama_new_context_with_model(
         for (auto * backend : ctx->backends) {
             if (ggml_backend_is_cpu(backend)) {
                 // use host buffers for the CPU backend compute buffer
-                backend_buft.push_back(llama_default_buffer_type_cpu(true));
+                backend_buft.push_back(llama_default_buffer_type_cpu(*model, true));
             } else {
                 backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
             }
@@ -19457,17 +19510,37 @@ struct llama_context * llama_new_context_with_model(
         // buffer used to store the computation graph and the tensor meta data
         ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));

+        // TODO: move these checks to ggml_backend_sched
         // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
         bool pipeline_parallel =
             llama_get_device_count(*model) > 1 &&
             model->n_gpu_layers > (int)model->hparams.n_layer &&
             model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
             params.offload_kqv;
-#ifndef GGML_USE_CUDA
-        // pipeline parallelism requires support for async compute and events
-        // currently this is only implemented in the CUDA backend
-        pipeline_parallel = false;
-#endif
+
+        // pipeline parallelism requires support for async compute and events in all devices
+        if (pipeline_parallel) {
+            for (auto * backend : ctx->backends) {
+                if (ggml_backend_is_cpu(backend)) {
+                    // ignore CPU backend
+                    continue;
+                }
+                auto * dev = ggml_backend_get_device(backend);
+                if (!dev) {
+                    // backend is using old interface, not supported
+                    pipeline_parallel = false;
+                    break;
+                }
+                ggml_backend_dev_props props;
+                ggml_backend_dev_get_props(dev, &props);
+                if (!props.caps.async || !props.caps.events) {
+                    // device does not support async compute or events
+                    pipeline_parallel = false;
+                    break;
+                }
+            }
+        }
+
         ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel);

         if (pipeline_parallel) {
@@ -21772,15 +21845,9 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
 }

 void llama_log_set(ggml_log_callback log_callback, void * user_data) {
-    g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
-    g_state.log_callback_user_data = user_data;
-#ifdef GGML_USE_METAL
-    ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
-#elif defined(GGML_USE_CUDA)
-    ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
-#elif defined(GGML_USE_CANN)
-    ggml_backend_cann_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
-#endif
+    ggml_log_set(log_callback, user_data);
+    g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
+    g_logger_state.log_callback_user_data = user_data;
 }

 static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
@@ -21789,12 +21856,12 @@ static void llama_log_internal_v(ggml_log_level level, const char * format, va_l
     char buffer[128];
     int len = vsnprintf(buffer, 128, format, args);
     if (len < 128) {
-        g_state.log_callback(level, buffer, g_state.log_callback_user_data);
+        g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
     } else {
         char * buffer2 = new char[len + 1];
         vsnprintf(buffer2, len + 1, format, args_copy);
         buffer2[len] = 0;
-        g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
+        g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
         delete[] buffer2;
     }
     va_end(args_copy);


@@ -7,7 +7,7 @@
 #include <unordered_map>
 #include <unordered_set>

-const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = {  // start, flags // last=next_start-1
+const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = {  // start, flags // last=next_start-1
 {0x000000, 0x0080},
 {0x000020, 0x0008},
 {0x000021, 0x0020},
@@ -2311,7 +2311,8 @@ const std::unordered_set<uint32_t> unicode_set_whitespace = {
 0x003000,
 };

-const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {
+// list is always in ascending order, to enable binary searh
+const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase = {
 {0x000041, 0x000061},
 {0x000042, 0x000062},
 {0x000043, 0x000063},
@@ -3747,7 +3748,8 @@ const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {
 {0x01E921, 0x01E943},
 };

-const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {
+// list is always in ascending order, to enable binary searh
+const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase = {
 {0x000061, 0x000041},
 {0x000062, 0x000042},
 {0x000063, 0x000043},
@@ -5200,7 +5202,7 @@ const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {
 {0x01E943, 0x01E921},
 };

-const std::vector<range_nfd> unicode_ranges_nfd = {  // start, last, nfd
+const std::initializer_list<range_nfd> unicode_ranges_nfd = {  // start, last, nfd
 {0x000000, 0x000000, 0x000000},
 {0x0000C0, 0x0000C5, 0x000041},
 {0x0000C7, 0x0000C7, 0x000043},


@@ -13,8 +13,8 @@ struct range_nfd {
 static const uint32_t MAX_CODEPOINTS = 0x110000;

-extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
+extern const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
 extern const std::unordered_set<uint32_t> unicode_set_whitespace;
-extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
-extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
-extern const std::vector<range_nfd> unicode_ranges_nfd;
+extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase;
+extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase;
+extern const std::initializer_list<range_nfd> unicode_ranges_nfd;


@@ -123,11 +123,11 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
 static std::vector<codepoint_flags> unicode_cpt_flags_array() {
     std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);

-    assert (unicode_ranges_flags.front().first == 0);
-    assert (unicode_ranges_flags.back().first == MAX_CODEPOINTS);
+    assert (unicode_ranges_flags.begin()[0].first == 0);
+    assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS);
     for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) {
-        const auto range_ini = unicode_ranges_flags[i-1];  // codepoint_ini, flags
-        const auto range_end = unicode_ranges_flags[i];    // codepoint_end, flags
+        const auto range_ini = unicode_ranges_flags.begin()[i-1];  // codepoint_ini, flags
+        const auto range_end = unicode_ranges_flags.begin()[i];    // codepoint_end, flags
         for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) {
             cpt_flags[cpt] = range_ini.second;
         }
@@ -597,7 +597,7 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
     std::vector<uint32_t> result(cpts.size());
     for (size_t i = 0; i < cpts.size(); ++i) {
         const uint32_t cpt = cpts[i];
-        auto it = std::upper_bound(unicode_ranges_nfd.cbegin(), unicode_ranges_nfd.cend(), cpt, comp) - 1;
+        auto it = std::upper_bound(unicode_ranges_nfd.begin(), unicode_ranges_nfd.end(), cpt, comp) - 1;
         result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt;
     }
     return result;
@@ -639,8 +639,15 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
 }

 uint32_t unicode_tolower(uint32_t cp) {
-    auto it = unicode_map_lowercase.find(cp);
-    return it == unicode_map_lowercase.end() ? cp : it->second;
+    // binary search
+    auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cp,
+        [](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
+            return pair.first < value;
+        });
+    if (it != unicode_map_lowercase.end() && it->first == cp) {
+        return it->second;
+    }
+    return cp;  // Return the original code point if no lowercase mapping is found
 }

 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {


@@ -9,7 +9,7 @@ set(SOURCE_FILES
     ${WHISPER_LIB_DIR}/ggml/src/ggml.c
     ${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
     ${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
-    ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.c
+    ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp
     ${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
     ${WHISPER_LIB_DIR}/src/whisper.cpp
     ${CMAKE_SOURCE_DIR}/jni.c


@@ -21,7 +21,7 @@ if (NOT GGML_HOME)
        ${WHISPER_LIB_DIR}/ggml/src/ggml.c
        ${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
        ${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
-       ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.c
+       ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp
        ${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
        )
 endif()


@@ -22,7 +22,7 @@
 		18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; };
 		18A276062C2A98A5001C8D37 /* ggml-metal.metal in Copy Files */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
 		18A2760B2C2A9B43001C8D37 /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
-		18ABE15A2AF556340044A204 /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.c */; };
+		18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.cpp */; };
 		18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1592AF556340044A204 /* ggml-quants.c */; };
 		7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; };
 		7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */; };
@@ -73,7 +73,7 @@
 		18ABE1542AF556340044A204 /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../../ggml/src/ggml-quants.h"; sourceTree = "<group>"; };
 		18ABE1552AF556340044A204 /* ggml-backend.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../../ggml/include/ggml-backend.h"; sourceTree = "<group>"; };
 		18ABE1562AF556340044A204 /* ggml-backend-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend-impl.h"; path = "../../../ggml/src/ggml-backend-impl.h"; sourceTree = "<group>"; };
-		18ABE1572AF556340044A204 /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../../ggml/src/ggml-backend.c"; sourceTree = "<group>"; };
+		18ABE1572AF556340044A204 /* ggml-backend.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.cpp"; path = "../../../ggml/src/ggml-backend.cpp"; sourceTree = "<group>"; };
 		18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; };
 		18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; };
 		7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-encoder-impl.m"; sourceTree = "<group>"; };
@@ -120,7 +120,7 @@
 				18A275FF2C2A9563001C8D37 /* ggml-common.h */,
 				18A275FE2C2A94DE001C8D37 /* ggml-metal.h */,
 				18ABE1562AF556340044A204 /* ggml-backend-impl.h */,
-				18ABE1572AF556340044A204 /* ggml-backend.c */,
+				18ABE1572AF556340044A204 /* ggml-backend.cpp */,
 				18ABE1552AF556340044A204 /* ggml-backend.h */,
 				18ABE1582AF556340044A204 /* ggml-impl.h */,
 				18ABE1592AF556340044A204 /* ggml-quants.c */,
@@ -248,7 +248,7 @@
 				18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */,
 				7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */,
 				1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */,
-				18ABE15A2AF556340044A204 /* ggml-backend.c in Sources */,
+				18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */,
 				18627C8C29052BE000BD2A04 /* main.m in Sources */,
 				18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */,
 				1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */,

File diff suppressed because it is too large.


@@ -1 +1 @@
-6ebf0cf75db1739b6c8b26ccca3f5029ab35fe4a
+e7fd7deec20ef1ced3eebe38802f3c2126fddfa4


@@ -1239,6 +1239,8 @@ static size_t aheads_masks_nbytes(struct whisper_aheads_masks & aheads_masks) {
 static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & params) {
     ggml_backend_t result = NULL;

+    ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);
+
 #ifdef GGML_USE_CUDA
     if (params.use_gpu) {
         WHISPER_LOG_INFO("%s: using CUDA backend\n", __func__);
@@ -1252,7 +1254,6 @@ static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & pa
 #ifdef GGML_USE_METAL
     if (params.use_gpu) {
         WHISPER_LOG_INFO("%s: using Metal backend\n", __func__);
-        ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
         result = ggml_backend_metal_init();
         if (!result) {
             WHISPER_LOG_ERROR("%s: ggml_backend_metal_init() failed\n", __func__);