Mirror of https://github.com/ggerganov/whisper.cpp.git
Synced 2024-12-18 20:27:53 +00:00

whisper : adapt to latest ggml (skip) (#0)

parent 0b1b094a67
commit 941912467d

Makefile (4 lines changed)
@@ -904,10 +904,10 @@ ggml/src/ggml-alloc.o: \
 	$(CC) $(CFLAGS) -c $< -o $@

 ggml/src/ggml-backend.o: \
-	ggml/src/ggml-backend.c \
+	ggml/src/ggml-backend.cpp \
 	ggml/include/ggml.h \
 	ggml/include/ggml-backend.h
-	$(CC) $(CFLAGS) -c $< -o $@
+	$(CXX) $(CXXFLAGS) -c $< -o $@

 ggml/src/ggml-quants.o: \
 	ggml/src/ggml-quants.c \
@@ -34,7 +34,7 @@ let package = Package(
             "src/whisper.cpp",
             "ggml/src/ggml-aarch64.c",
             "ggml/src/ggml-alloc.c",
-            "ggml/src/ggml-backend.c",
+            "ggml/src/ggml-backend.cpp",
             "ggml/src/ggml-quants.c",
             "ggml/src/ggml-metal.m"
         ],
@@ -11,7 +11,7 @@ system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.h')} ."
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.c')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend-impl.h')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.c')} .")
+system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.cpp')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-common.h')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.h')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.c')} .")
@@ -12,9 +12,7 @@
 # include "ggml-rpc.h"
 #endif

-#ifdef GGML_USE_CUDA
-# include "ggml-cuda.h"
-#elif defined(GGML_USE_VULKAN)
+#if defined(GGML_USE_VULKAN)
 # include "ggml-vulkan.h"
 #elif defined(GGML_USE_SYCL)
 # include "ggml-sycl.h"
@@ -610,7 +608,7 @@ enum llm_tensor {
     LLM_TENSOR_CLS_OUT,
 };

-static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
+static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
     {
         LLM_ARCH_LLAMA,
         {
@@ -1566,32 +1564,32 @@ struct LLM_TN {
         return LLM_TENSOR_NAMES.at(arch).at(tensor);
     }

-    std::string operator()(llm_tensor tensor, const std::string & suffix) const {
+    std::string operator()(llm_tensor tensor, const char * suffix) const {
         if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
             return "__missing__";
         }
-        return LLM_TENSOR_NAMES.at(arch).at(tensor) + "." + suffix;
+        return std::string(LLM_TENSOR_NAMES.at(arch).at(tensor)) + "." + suffix;
     }

     std::string operator()(llm_tensor tensor, int bid) const {
         if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
             return "__missing__";
         }
-        return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid);
+        return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid);
     }

-    std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
+    std::string operator()(llm_tensor tensor, const char * suffix, int bid) const {
         if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
             return "__missing__";
         }
-        return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid) + "." + suffix;
+        return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid) + "." + suffix;
     }

-    std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
+    std::string operator()(llm_tensor tensor, const char * suffix, int bid, int xid) const {
         if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
             return "__missing__";
         }
-        return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix;
+        return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid, xid) + "." + suffix;
     }
 };

@@ -2264,59 +2262,16 @@ static std::string llama_token_to_piece(const struct llama_model * model, llama_
     return piece;
 }

-static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
-    ggml_backend_buffer_type_t buft = nullptr;
-
-#if defined(GGML_USE_CUDA)
-    // host buffers should only be used when data is expected to be copied to/from the GPU
-    if (host_buffer) {
-        buft = ggml_backend_cuda_host_buffer_type();
-    }
-#elif defined(GGML_USE_SYCL)
-    if (host_buffer) {
-        buft = ggml_backend_sycl_host_buffer_type();
-    }
-#elif defined(GGML_USE_CANN)
-    if (host_buffer) {
-        buft = ggml_backend_cann_host_buffer_type();
-    }
-#elif defined(GGML_USE_CPU_HBM)
-    buft = ggml_backend_cpu_hbm_buffer_type();
-#elif defined(GGML_USE_VULKAN)
-    if (host_buffer) {
-        buft = ggml_backend_vk_host_buffer_type();
-    }
-#endif
-
-    if (buft == nullptr) {
-        buft = ggml_backend_cpu_buffer_type();
-    }
-    return buft;
-
-    GGML_UNUSED(host_buffer);
-}
-
 //
 // globals
 //

-struct llama_state {
-    llama_state() {
-#ifdef GGML_USE_METAL
-        ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
-#elif defined(GGML_USE_CUDA)
-        ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
-#elif defined(GGML_USE_CANN)
-        ggml_backend_cann_log_set_callback(log_callback, log_callback_user_data);
-#endif
-    }
-
-    // We save the log callback globally
+struct llama_logger_state {
     ggml_log_callback log_callback = llama_log_callback_default;
     void * log_callback_user_data = nullptr;
 };

-static llama_state g_state;
+static llama_logger_state g_logger_state;

 // available llama models
 enum e_model {
@@ -2920,14 +2875,17 @@ struct llama_model {

     std::vector<llama_layer> layers;

+    // gguf metadata
+    std::unordered_map<std::string, std::string> gguf_kv;
+
     llama_split_mode split_mode;
     int main_gpu;
     int n_gpu_layers;

-    std::vector<std::string> rpc_servers;
+    // list of devices used in this model
+    std::vector<ggml_backend_dev_t> devices;

-    // gguf metadata
-    std::unordered_map<std::string, std::string> gguf_kv;
+    std::vector<std::string> rpc_servers;

     // layer -> buffer type mapping
     struct layer_buft {
@@ -2970,11 +2928,6 @@ struct llama_model {
             ggml_free(ctx);
         }
         for (ggml_backend_buffer_t buf : bufs) {
-#ifdef GGML_USE_CUDA
-            if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
-                ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
-            }
-#endif
             ggml_backend_buffer_free(buf);
         }
         while (!lora_adapters.empty()) {
@@ -3460,72 +3413,116 @@ struct llama_lora_adapter {
     }
 };

-static size_t llama_get_device_count(const llama_model & model) {
-    size_t count = 1;
-#if defined(GGML_USE_CUDA)
-    count = ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-    count = ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-    count = ggml_backend_vk_get_device_count();
-#elif defined(GGML_USE_CANN)
-    return ggml_backend_cann_get_device_count();
-#endif
+static int llama_get_device_count(const llama_model & model) {
+    int count = (int) model.devices.size();
 #if defined(GGML_USE_RPC)
-    count += model.rpc_servers.size();
+    count += (int) model.rpc_servers.size();
 #endif

+#if defined(GGML_USE_METAL)
+    count += 1;
+#elif defined(GGML_USE_SYCL)
+    count += ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+    count += ggml_backend_vk_get_device_count();
+#elif defined(GGML_USE_CANN)
+    count += ggml_backend_cann_get_device_count();
+#endif
+
     return count;

     GGML_UNUSED(model);
 }

-static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
+static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_model & model, bool host_buffer) {
     ggml_backend_buffer_type_t buft = nullptr;

-#ifdef GGML_USE_RPC
-    int rpc_count = (int)model.rpc_servers.size();
-#else
-    int rpc_count = 0;
-#endif
-    int local_gpu = gpu - rpc_count;
-#if defined(GGML_USE_RPC)
-    if (gpu < rpc_count) {
-        const char * endpoint = model.rpc_servers[gpu].c_str();
-        return ggml_backend_rpc_buffer_type(endpoint);
+    if (host_buffer) {
+        for (auto * dev : model.devices) {
+            buft = ggml_backend_dev_host_buffer_type(dev);
+            if (buft != nullptr) {
+                break;
+            }
+        }
     }
-#endif
-#if defined(GGML_USE_METAL)
-    buft = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_CUDA)
-    buft = ggml_backend_cuda_buffer_type(local_gpu);
-#elif defined(GGML_USE_VULKAN)
-    buft = ggml_backend_vk_buffer_type(local_gpu);
-#elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_buffer_type(local_gpu);
-#elif defined(GGML_USE_KOMPUTE)
-    buft = ggml_backend_kompute_buffer_type(local_gpu);
-    if (buft == nullptr) {
-        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, local_gpu);
+
+#if defined(GGML_USE_SYCL)
+    if (host_buffer) {
+        buft = ggml_backend_sycl_host_buffer_type();
     }
 #elif defined(GGML_USE_CANN)
-    buft = ggml_backend_cann_buffer_type(local_gpu);
+    if (host_buffer) {
+        buft = ggml_backend_cann_host_buffer_type();
+    }
+#elif defined(GGML_USE_CPU_HBM)
+    buft = ggml_backend_cpu_hbm_buffer_type();
+#elif defined(GGML_USE_VULKAN)
+    if (host_buffer) {
+        buft = ggml_backend_vk_host_buffer_type();
+    }
 #endif

     if (buft == nullptr) {
-        buft = llama_default_buffer_type_cpu(true);
+        buft = ggml_backend_cpu_buffer_type();
     }
     return buft;

+    GGML_UNUSED(host_buffer);
+}
+
+static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int device) {
+    ggml_backend_buffer_type_t buft = nullptr;
+
+#if defined(GGML_USE_RPC)
+    int rpc_count = (int)model.rpc_servers.size();
+    if (device < rpc_count) {
+        const char * endpoint = model.rpc_servers[device].c_str();
+        return ggml_backend_rpc_buffer_type(endpoint);
+    }
+    device -= rpc_count;
+#endif
+
+    if (device < (int)model.devices.size()) {
+        return ggml_backend_dev_buffer_type(model.devices[device]);
+    }
+    device -= (int)model.devices.size();
+
+#if defined(GGML_USE_METAL)
+    buft = ggml_backend_metal_buffer_type();
+#elif defined(GGML_USE_VULKAN)
+    buft = ggml_backend_vk_buffer_type(device);
+#elif defined(GGML_USE_SYCL)
+    buft = ggml_backend_sycl_buffer_type(device);
+#elif defined(GGML_USE_KOMPUTE)
+    buft = ggml_backend_kompute_buffer_type(device);
+#elif defined(GGML_USE_CANN)
+    buft = ggml_backend_cann_buffer_type(device);
+#endif
+
+    if (buft == nullptr) {
+        buft = llama_default_buffer_type_cpu(model, true);
+    }
+    return buft;
+
     GGML_UNUSED(model);
-    GGML_UNUSED(local_gpu);
 }

 static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
     ggml_backend_buffer_type_t buft = nullptr;

-#ifdef GGML_USE_CUDA
-    if (ggml_backend_cuda_get_device_count() > 1) {
-        buft = ggml_backend_cuda_split_buffer_type(tensor_split);
+    // find a backend that supports split buffers
+    for (size_t i = 0; i < ggml_backend_reg_count(); ++i) {
+        ggml_backend_reg_t reg = ggml_backend_reg_get(i);
+
+        auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
+        if (ggml_backend_split_buffer_type_fn) {
+            buft = ggml_backend_split_buffer_type_fn(tensor_split);
+            if (buft != nullptr) {
+                break;
+            }
+        }
     }
-#endif

 #ifdef GGML_USE_SYCL
     if (ggml_backend_sycl_get_device_count() > 1) {
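For illustration only (not part of the commit above): a minimal standalone sketch of how the ggml device registry that llama_get_device_count() and llama_default_buffer_type_offload() now rely on can be enumerated. The program structure and the printed format are assumptions; the ggml_backend_dev_* calls are the ones used in the hunk above.

    // sketch: list all non-CPU devices known to the ggml backend registry
    #include "ggml-backend.h"
    #include <cstdio>

    int main() {
        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU_FULL) {
                continue; // the CPU device is handled separately by the loader
            }
            size_t free, total;
            ggml_backend_dev_memory(dev, &free, &total);
            printf("device %zu: %s (%zu / %zu bytes free)\n", i, ggml_backend_dev_name(dev), free, total);
        }
        return 0;
    }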
@@ -3542,13 +3539,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
 }

 static size_t llama_get_device_memory(const llama_model & model, int device) {
-#ifdef GGML_USE_RPC
-    int rpc_count = (int)model.rpc_servers.size();
-#else
-    int rpc_count = 0;
-#endif
-    int local_device = device - rpc_count;
 #if defined(GGML_USE_RPC)
+    int rpc_count = (int)model.rpc_servers.size();
     if (device < rpc_count) {
         size_t total;
         size_t free;
@@ -3556,32 +3548,37 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
         ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
         return free;
     }
+    device = device - rpc_count;
 #endif
-#if defined(GGML_USE_CUDA)
+
+    if (device < (int)model.devices.size()) {
+        ggml_backend_dev_t dev = model.devices[device];
+        size_t total;
+        size_t free;
+        ggml_backend_dev_memory(dev, &free, &total);
+        return free;
+    }
+
+#if defined(GGML_USE_SYCL)
     size_t total;
     size_t free;
-    ggml_backend_cuda_get_device_memory(local_device, &free, &total);
-    return free;
-#elif defined(GGML_USE_SYCL)
-    size_t total;
-    size_t free;
-    ggml_backend_sycl_get_device_memory(local_device, &free, &total);
+    ggml_backend_sycl_get_device_memory(device, &free, &total);
     return free;
 #elif defined(GGML_USE_VULKAN)
     size_t total;
     size_t free;
-    ggml_backend_vk_get_device_memory(local_device, &free, &total);
+    ggml_backend_vk_get_device_memory(device, &free, &total);
     return free;
 #elif defined(GGML_USE_CANN)
     size_t total;
     size_t free;
-    ggml_backend_cann_get_device_memory(local_device, &free, &total);
+    ggml_backend_cann_get_device_memory(device, &free, &total);
     return free;
 #else
     return 1;
 #endif
     GGML_UNUSED(model);
-    GGML_UNUSED(local_device);
+    GGML_UNUSED(device);
 }

 //
@@ -3624,7 +3621,7 @@ static bool llama_kv_cache_init(
             buft_layer_count[model.buft_layer[i].buft]++;
         }
     } else {
-        buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
+        buft_layer_count[llama_default_buffer_type_cpu(model, true)] = n_layer;
     }

     // create a context for each buffer type
@@ -4916,7 +4913,7 @@ struct llama_model_loader {
     static const int TENSOR_NOT_REQUIRED = 1;
     static const int TENSOR_DUPLICATED = 2;

-    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags = 0) {
         const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));

         if (cur == NULL) {
@@ -4926,7 +4923,7 @@ struct llama_model_loader {
         return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
     }

-    struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
+    struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true) {
         const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);

         if (cur == NULL) {
@@ -4939,7 +4936,7 @@ struct llama_model_loader {

         std::array<int64_t, GGML_MAX_DIMS> dims;
         for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
-            dims[i] = i < ne.size() ? ne[i] : 1;
+            dims[i] = i < ne.size() ? ne.begin()[i] : 1;
         }

         struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
@@ -5037,7 +5034,7 @@ struct llama_model_loader {
     // Returns false if cancelled by progress_callback
     bool load_all_data(
             struct ggml_context * ctx,
-            llama_buf_map & bufs_mmap,
+            llama_buf_map & bufs,
             llama_mlocks * lmlocks,
             llama_progress_callback progress_callback,
             void * progress_callback_user_data) {
@@ -5046,43 +5043,94 @@ struct llama_model_loader {
         std::vector<no_init<uint8_t>> read_buf;
         std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;

-#if defined(GGML_USE_CUDA)
         // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
         // NVMe raid configurations might require more / larger buffers.
         constexpr size_t n_buffers = 4;
         constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB

         std::vector<ggml_backend_buffer_t> host_buffers;
-        std::vector<void*> host_ptrs;
         std::vector<ggml_backend_event_t> events;
+        std::vector<void *> host_ptrs;
         size_t buffer_idx = 0; // buffer to use for async loads
-        ggml_backend_t cuda_backend = nullptr;
-        if (!use_mmap && !check_tensors) {
+        ggml_backend_t upload_backend = [&](const char * fn) -> ggml_backend_t {
+            if (use_mmap || check_tensors) {
+                return nullptr;
+            }
             // When not using mmaped io use async uploads from pinned memory to GPU memory.
-            // First determine if the CUDA backend is active, and if so, determine the device ID.
-            ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr;
-            if (buf) {
-                ggml_backend_buffer_type_t buffer_type = ggml_backend_buffer_get_type(buf);
-                for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
-                    auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i);
-                    if (buffer_type == cuda_buffer_type) {
-                        cuda_backend = ggml_backend_cuda_init(i);
-                        break;
-                    }
-                }
+            // First determine if the backend supports the necessary features for async uploads.
+            auto * buf = bufs.count(0) ? bufs.at(0) : nullptr;
+            if (!buf) {
+                LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", fn);
+                return nullptr;
             }

-            // If the cuda backend is active create pinned memory buffers and events for synchronisation.
-            if (cuda_backend) {
-                for (size_t idx = 0; idx < n_buffers; ++idx) {
-                    host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size));
-                    host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
-                    events.emplace_back(ggml_backend_event_new(cuda_backend));
-                }
+            auto * buft = ggml_backend_buffer_get_type(buf);
+            auto * dev = ggml_backend_buft_get_device(buft);
+            if (!dev) {
+                LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", fn,
+                    ggml_backend_buft_name(buft));
+                return nullptr;
             }

+            if (buft != ggml_backend_dev_buffer_type(dev)) {
+                LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", fn,
+                    ggml_backend_buft_name(buft), ggml_backend_dev_name(dev));
+                return nullptr;
+            }
+
+            ggml_backend_dev_props props;
+            ggml_backend_dev_get_props(dev, &props);
+            if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) {
+                LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", fn,
+                    ggml_backend_dev_name(dev));
+                return nullptr;
+            }
+
+            auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
+            if (!host_buft) {
+                LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", fn,
+                    ggml_backend_dev_name(dev));
+                return nullptr;
+            }
+
+            // If the backend is supported, create pinned memory buffers and events for synchronisation.
+            for (size_t idx = 0; idx < n_buffers; ++idx) {
+                auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
+                if (!buf) {
+                    LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", fn,
+                        ggml_backend_dev_name(dev));
+                    return nullptr;
+                }
+
+                host_buffers.emplace_back(buf);
+                host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf));
+
+                auto * event = ggml_backend_event_new(dev);
+                if (!event) {
+                    LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", fn,
+                        ggml_backend_dev_name(dev));
+                    return nullptr;
+                }
+
+                events.emplace_back(event);
+            }
+
+            ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+            if (!backend) {
+                LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", fn,
+                    ggml_backend_dev_name(dev));
+                return nullptr;
+            }
+
+            return backend;
+        }(__func__);
+
+        if (upload_backend) {
+            LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__,
+                ggml_backend_dev_name(ggml_backend_get_device(upload_backend)),
+                ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))),
+                ggml_backend_name(upload_backend));
         }
-#endif

         for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             const auto * weight = get_weight(ggml_get_name(cur));
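Not part of the commit: the capability test performed inside the upload_backend lambda above could be factored into a small helper. The helper name is an assumption; the ggml_backend_dev_props fields are the ones checked in the hunk.

    // sketch: does this device support what the loader needs for async uploads?
    #include "ggml-backend.h"

    static bool device_supports_async_upload(ggml_backend_dev_t dev) {
        ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
        // async compute, pinned host buffers and events are all required
        return props.caps.async && props.caps.host_buffer && props.caps.events;
    }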
@@ -5102,8 +5150,8 @@ struct llama_model_loader {
             if (use_mmap) {
                 const auto & mapping = mappings.at(weight->idx);
                 ggml_backend_buffer_t buf_mmap = nullptr;
-                if (bufs_mmap.count(weight->idx)) {
-                    buf_mmap = bufs_mmap.at(weight->idx);
+                if (bufs.count(weight->idx)) {
+                    buf_mmap = bufs.at(weight->idx);
                 }
                 uint8_t * data = (uint8_t *) mapping->addr + weight->offs;

@@ -5139,9 +5187,8 @@ struct llama_model_loader {
                     }));
                 }
             } else {
-#if defined(GGML_USE_CUDA)
-                // If cuda_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
-                if (cuda_backend) {
+                // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
+                if (upload_backend) {
                     file->seek(weight->offs, SEEK_SET);

                     size_t bytes_read = 0;
@@ -5151,17 +5198,14 @@ struct llama_model_loader {

                         ggml_backend_event_synchronize(events[buffer_idx]);
                         file->read_raw(host_ptrs[buffer_idx], read_iteration);
-                        ggml_backend_tensor_set_async(cuda_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
-                        ggml_backend_event_record(events[buffer_idx]);
+                        ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+                        ggml_backend_event_record(events[buffer_idx], upload_backend);

                         bytes_read += read_iteration;
                         ++buffer_idx;
                         buffer_idx %= n_buffers;
                     }
-                }
-                else
-#endif
-                {
+                } else {
                     read_buf.resize(n_size);
                     file->seek(weight->offs, SEEK_SET);
                     file->read_raw(read_buf.data(), n_size);
@@ -5176,17 +5220,15 @@ struct llama_model_loader {
             size_done += n_size;
         }

-#if defined(GGML_USE_CUDA)
-        // free temporary resources used for async cuda uploads
-        if (cuda_backend) {
-            for (size_t idx = 0; idx < n_buffers;++idx) {
-                ggml_backend_event_synchronize(events[idx]);
-                ggml_backend_event_free(events[idx]);
-                ggml_backend_buffer_free(host_buffers[idx]);
-            }
-            ggml_backend_free(cuda_backend);
+        // free temporary resources used for async uploads
+        for (auto * event : events) {
+            ggml_backend_event_synchronize(event);
+            ggml_backend_event_free(event);
         }
-#endif
+        for (auto * buf : host_buffers) {
+            ggml_backend_buffer_free(buf);
+        }
+        ggml_backend_free(upload_backend);

         // check validation results
         bool validation_failed = false;
@@ -6922,6 +6964,13 @@ static bool llm_load_tensors(
         void * progress_callback_user_data) {
     auto & hparams = model.hparams;

+    // check if the value of main_gpu is valid
+    if (llama_get_device_count(model) > 0 &&
+        split_mode != LLAMA_SPLIT_MODE_LAYER &&
+        (main_gpu < 0 || main_gpu >= llama_get_device_count(model))) {
+        throw std::runtime_error(format("invalid value for main_gpu: %d (available devices: %d)", main_gpu, llama_get_device_count(model)));
+    }
+
     model.split_mode = split_mode;
     model.main_gpu = main_gpu;
     model.n_gpu_layers = n_gpu_layers;
@@ -6931,14 +6980,14 @@ static bool llm_load_tensors(
     bool use_mmap_buffer = true;

     // there is very little benefit to offloading the input layer, so always keep it on the CPU
-    model.buft_input = llama_default_buffer_type_cpu(true);
+    model.buft_input = llama_default_buffer_type_cpu(model, true);
     //model.buft_input = llama_default_buffer_type_offload(main_gpu);

     model.buft_layer.resize(n_layer);

     // assign cpu layers
     for (int i = 0; i < i_gpu_start; ++i) {
-        model.buft_layer[i] = llama_default_buffer_type_cpu(true);
+        model.buft_layer[i] = llama_default_buffer_type_cpu(model, true);
     }

     if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
@@ -6976,7 +7025,7 @@ static bool llm_load_tensors(
             int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
             model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
         } else {
-            model.buft_output = llama_default_buffer_type_cpu(true);
+            model.buft_output = llama_default_buffer_type_cpu(model, true);
         }
     } else {
         ggml_backend_buffer_type_t split_buft;
@@ -7000,7 +7049,7 @@ static bool llm_load_tensors(
                 llama_default_buffer_type_offload(model, main_gpu)
             };
         } else {
-            model.buft_output = llama_default_buffer_type_cpu(true);
+            model.buft_output = llama_default_buffer_type_cpu(model, true);
         }
     }

@@ -8872,7 +8921,7 @@ static bool llm_load_tensors(
         // only the mmap region containing the tensors in the model is mapped to the backend buffer
         // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
         // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
-        if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) {
+        if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(model, true)) {
            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                void * addr = nullptr;
                size_t first, last;
@@ -8886,13 +8935,6 @@ static bool llm_load_tensors(
                 }
                 model.bufs.push_back(buf);
                 bufs.emplace(idx, buf);
-#ifdef GGML_USE_CUDA
-                if (n_layer >= n_gpu_layers) {
-                    ggml_backend_cuda_register_host_buffer(
-                        ggml_backend_buffer_get_base(buf),
-                        ggml_backend_buffer_get_size(buf));
-                }
-#endif
             }
         }
 #ifdef GGML_USE_METAL
@@ -16956,7 +16998,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
         lctx.embd = nullptr;
     }

-    lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
+    lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(lctx.model, true), new_size);
     if (lctx.buf_output == nullptr) {
         LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
         return 0;
@@ -18987,21 +19029,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 }

 size_t llama_max_devices(void) {
-#if defined(GGML_USE_RPC)
-    return GGML_RPC_MAX_SERVERS;
-#elif defined(GGML_USE_METAL)
-    return 1;
-#elif defined(GGML_USE_CUDA)
-    return GGML_CUDA_MAX_DEVICES;
-#elif defined(GGML_USE_SYCL)
-    return GGML_SYCL_MAX_DEVICES;
-#elif defined(GGML_USE_VULKAN)
-    return GGML_VK_MAX_DEVICES;
-#elif defined(GGML_USE_CANN)
-    return GGML_CANN_MAX_DEVICES;
-#else
-    return 1;
-#endif
+    return 16;
 }

 bool llama_supports_mmap(void) {
@@ -19013,12 +19041,13 @@ bool llama_supports_mlock(void) {
 }

 bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+#if defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
     defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
 #else
-    return false;
+    return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
+           ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr;
 #endif
 }

@@ -19083,17 +19112,30 @@ struct llama_model * llama_load_model_from_file(
                 return true;
             };
     }

    if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
        // split the servers set them into model->rpc_servers
        std::string servers(params.rpc_servers);
        size_t pos = 0;
-        while ((pos = servers.find(",")) != std::string::npos) {
+        while ((pos = servers.find(',')) != std::string::npos) {
            std::string server = servers.substr(0, pos);
            model->rpc_servers.push_back(server);
            servers.erase(0, pos + 1);
        }
        model->rpc_servers.push_back(servers);
    }

+    // create list of devices to use with this model
+    // currently, we use all available devices
+    // TODO: rework API to give user more control over device selection
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        // skip the CPU backend since it is handled separately
+        if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU_FULL) {
+            model->devices.push_back(dev);
+        }
+    }
+
    int status = llama_model_load(path_model, *model, params);
    GGML_ASSERT(status <= 0);
    if (status < 0) {
@@ -19255,6 +19297,36 @@ struct llama_context * llama_new_context_with_model(

     if (!hparams.vocab_only) {
         // initialize backends
+        int main_gpu = model->main_gpu;
+
+        // with registry
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            if (main_gpu >= 0 && main_gpu < (int)model->devices.size()) {
+                ggml_backend_dev_t main_dev = model->devices[main_gpu];
+                ggml_backend_t backend = ggml_backend_dev_init(main_dev, nullptr);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(main_dev));
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
+        } else {
+            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
+            for (auto * dev : model->devices) {
+                ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
+        }
+        if (main_gpu >= (int)model->devices.size()) {
+            main_gpu -= (int)model->devices.size();
+        }
+
 #if defined(GGML_USE_RPC)
         if (model->n_gpu_layers > 0) {
             for (const auto & endpoint : model->rpc_servers) {
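For illustration only (not part of the commit): the per-device backend initialization used by the LLAMA_SPLIT_MODE_LAYER path above, condensed into a standalone helper. The helper name is an assumption; ggml_backend_dev_init() comes from the hunk.

    // sketch: create one backend per device, skipping devices that fail to initialize
    #include "ggml-backend.h"
    #include <vector>

    static std::vector<ggml_backend_t> init_backends(const std::vector<ggml_backend_dev_t> & devices) {
        std::vector<ggml_backend_t> backends;
        for (ggml_backend_dev_t dev : devices) {
            ggml_backend_t backend = ggml_backend_dev_init(dev, /*params =*/ nullptr);
            if (backend != nullptr) {
                backends.push_back(backend);
            }
        }
        return backends;
    }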
@@ -19267,6 +19339,9 @@ struct llama_context * llama_new_context_with_model(
                 ctx->backends.push_back(backend);
             }
         }
+        if (main_gpu >= (int)model->rpc_servers.size()) {
+            main_gpu -= (int)model->rpc_servers.size();
+        }
 #endif

 #if defined(GGML_USE_METAL)
@@ -19279,28 +19354,6 @@ struct llama_context * llama_new_context_with_model(
             }
             ctx->backends.push_back(ctx->backend_metal);
         }
-#elif defined(GGML_USE_CUDA)
-        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
-            ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        } else {
-            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
-            for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
-                ggml_backend_t backend = ggml_backend_cuda_init(device);
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
-        }
 #elif defined(GGML_USE_VULKAN)
         if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
             LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
@@ -19308,7 +19361,7 @@ struct llama_context * llama_new_context_with_model(
             return nullptr;
         }
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
-            ggml_backend_t backend = ggml_backend_vk_init(model->main_gpu);
+            ggml_backend_t backend = ggml_backend_vk_init(main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
                 llama_free(ctx);
@@ -19329,9 +19382,9 @@ struct llama_context * llama_new_context_with_model(
 #elif defined(GGML_USE_SYCL)
         // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
+            ggml_backend_t backend = ggml_backend_sycl_init(main_gpu);
             if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
+                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, main_gpu);
                 llama_free(ctx);
                 return nullptr;
             }
@@ -19350,7 +19403,7 @@ struct llama_context * llama_new_context_with_model(
         }
 #elif defined(GGML_USE_KOMPUTE)
         if (model->n_gpu_layers > 0) {
-            auto * backend = ggml_backend_kompute_init(model->main_gpu);
+            auto * backend = ggml_backend_kompute_init(main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
                 llama_free(ctx);
@@ -19359,29 +19412,29 @@ struct llama_context * llama_new_context_with_model(
             ctx->backends.push_back(backend);
         }
 #elif defined(GGML_USE_CANN)
         // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
         // TODO: ggml_backend_cann is not support split tensor now, just leave code here.
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            ggml_backend_t backend = ggml_backend_cann_init(model->main_gpu);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, model->main_gpu);
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        } else {
-            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
-            // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
-            for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) {
-                ggml_backend_t backend = ggml_backend_cann_init(device);
+            ggml_backend_t backend = ggml_backend_cann_init(main_gpu);
             if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
+                LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, main_gpu);
                 llama_free(ctx);
                 return nullptr;
             }
             ctx->backends.push_back(backend);
+        } else {
+            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
+            // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
+            for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) {
+                ggml_backend_t backend = ggml_backend_cann_init(device);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
         }
-        }
 #endif

 #ifdef GGML_USE_BLAS
@@ -19446,7 +19499,7 @@ struct llama_context * llama_new_context_with_model(
         for (auto * backend : ctx->backends) {
             if (ggml_backend_is_cpu(backend)) {
                 // use host buffers for the CPU backend compute buffer
-                backend_buft.push_back(llama_default_buffer_type_cpu(true));
+                backend_buft.push_back(llama_default_buffer_type_cpu(*model, true));
             } else {
                 backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
             }
@@ -19457,17 +19510,37 @@ struct llama_context * llama_new_context_with_model(
         // buffer used to store the computation graph and the tensor meta data
         ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));

+        // TODO: move these checks to ggml_backend_sched
         // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
         bool pipeline_parallel =
             llama_get_device_count(*model) > 1 &&
             model->n_gpu_layers > (int)model->hparams.n_layer &&
             model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
             params.offload_kqv;
-#ifndef GGML_USE_CUDA
-        // pipeline parallelism requires support for async compute and events
-        // currently this is only implemented in the CUDA backend
-        pipeline_parallel = false;
-#endif
+
+        // pipeline parallelism requires support for async compute and events in all devices
+        if (pipeline_parallel) {
+            for (auto * backend : ctx->backends) {
+                if (ggml_backend_is_cpu(backend)) {
+                    // ignore CPU backend
+                    continue;
+                }
+                auto * dev = ggml_backend_get_device(backend);
+                if (!dev) {
+                    // backend is using old interface, not supported
+                    pipeline_parallel = false;
+                    break;
+                }
+                ggml_backend_dev_props props;
+                ggml_backend_dev_get_props(dev, &props);
+                if (!props.caps.async || !props.caps.events) {
+                    // device does not support async compute or events
+                    pipeline_parallel = false;
+                    break;
+                }
+            }
+        }
+
         ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel);

         if (pipeline_parallel) {
@@ -21772,15 +21845,9 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
 }

 void llama_log_set(ggml_log_callback log_callback, void * user_data) {
-    g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
-    g_state.log_callback_user_data = user_data;
-#ifdef GGML_USE_METAL
-    ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
-#elif defined(GGML_USE_CUDA)
-    ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
-#elif defined(GGML_USE_CANN)
-    ggml_backend_cann_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
-#endif
+    ggml_log_set(log_callback, user_data);
+    g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
+    g_logger_state.log_callback_user_data = user_data;
 }

 static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
@@ -21789,12 +21856,12 @@ static void llama_log_internal_v(ggml_log_level level, const char * format, va_l
     char buffer[128];
     int len = vsnprintf(buffer, 128, format, args);
     if (len < 128) {
-        g_state.log_callback(level, buffer, g_state.log_callback_user_data);
+        g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
     } else {
         char * buffer2 = new char[len + 1];
         vsnprintf(buffer2, len + 1, format, args_copy);
         buffer2[len] = 0;
-        g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
+        g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
         delete[] buffer2;
     }
     va_end(args_copy);
|
@ -7,7 +7,7 @@
|
|||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
#include <unordered_set>
|
#include <unordered_set>
|
||||||
|
|
||||||
const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1
|
const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1
|
||||||
{0x000000, 0x0080},
|
{0x000000, 0x0080},
|
||||||
{0x000020, 0x0008},
|
{0x000020, 0x0008},
|
||||||
{0x000021, 0x0020},
|
{0x000021, 0x0020},
|
||||||
@@ -2311,7 +2311,8 @@ const std::unordered_set<uint32_t> unicode_set_whitespace = {
 0x003000,
 };

-const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {
+// list is always in ascending order, to enable binary searh
+const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase = {
 {0x000041, 0x000061},
 {0x000042, 0x000062},
 {0x000043, 0x000063},
@@ -3747,7 +3748,8 @@ const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {
 {0x01E921, 0x01E943},
 };
 
-const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {
+// list is always in ascending order, to enable binary searh
+const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase = {
 {0x000061, 0x000041},
 {0x000062, 0x000042},
 {0x000063, 0x000043},
@@ -5200,7 +5202,7 @@ const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {
 {0x01E943, 0x01E921},
 };
 
-const std::vector<range_nfd> unicode_ranges_nfd = { // start, last, nfd
+const std::initializer_list<range_nfd> unicode_ranges_nfd = { // start, last, nfd
 {0x000000, 0x000000, 0x000000},
 {0x0000C0, 0x0000C5, 0x000041},
 {0x0000C7, 0x0000C7, 0x000043},
@@ -13,8 +13,8 @@ struct range_nfd {
 
 static const uint32_t MAX_CODEPOINTS = 0x110000;
 
-extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
+extern const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
 extern const std::unordered_set<uint32_t> unicode_set_whitespace;
-extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
-extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
-extern const std::vector<range_nfd> unicode_ranges_nfd;
+extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase;
+extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase;
+extern const std::initializer_list<range_nfd> unicode_ranges_nfd;
@@ -123,11 +123,11 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
 static std::vector<codepoint_flags> unicode_cpt_flags_array() {
     std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
 
-    assert (unicode_ranges_flags.front().first == 0);
-    assert (unicode_ranges_flags.back().first == MAX_CODEPOINTS);
+    assert (unicode_ranges_flags.begin()[0].first == 0);
+    assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS);
     for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) {
-        const auto range_ini = unicode_ranges_flags[i-1]; // codepoint_ini, flags
-        const auto range_end = unicode_ranges_flags[i]; // codepoint_end, flags
+        const auto range_ini = unicode_ranges_flags.begin()[i-1]; // codepoint_ini, flags
+        const auto range_end = unicode_ranges_flags.begin()[i]; // codepoint_end, flags
         for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) {
             cpt_flags[cpt] = range_ini.second;
         }
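The .begin()[i] indexing above is needed because std::initializer_list provides no operator[], front(), or back(); its begin() returns a plain const pointer into the backing array. A small sketch of the idiom, not part of this commit and using a made-up three-entry table:

    #include <cassert>
    #include <initializer_list>
    #include <utility>

    int main() {
        const std::initializer_list<std::pair<int, int>> table = {
            {0, 10}, {5, 20}, {9, 30},
        };
        // begin() yields a const pointer, so begin()[i] indexes the backing array.
        assert(table.begin()[0].first == 0);                // replaces table.front()
        assert(table.begin()[table.size() - 1].first == 9); // replaces table.back()
        return 0;
    }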
@@ -597,7 +597,7 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
     std::vector<uint32_t> result(cpts.size());
     for (size_t i = 0; i < cpts.size(); ++i) {
         const uint32_t cpt = cpts[i];
-        auto it = std::upper_bound(unicode_ranges_nfd.cbegin(), unicode_ranges_nfd.cend(), cpt, comp) - 1;
+        auto it = std::upper_bound(unicode_ranges_nfd.begin(), unicode_ranges_nfd.end(), cpt, comp) - 1;
         result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt;
     }
     return result;
@@ -639,8 +639,15 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
 }
 
 uint32_t unicode_tolower(uint32_t cp) {
-    auto it = unicode_map_lowercase.find(cp);
-    return it == unicode_map_lowercase.end() ? cp : it->second;
+    // binary search
+    auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cp,
+        [](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
+            return pair.first < value;
+        });
+    if (it != unicode_map_lowercase.end() && it->first == cp) {
+        return it->second;
+    }
+    return cp; // Return the original code point if no lowercase mapping is found
 }
 
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
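unicode_tolower() now performs a binary search with std::lower_bound over the sorted (codepoint, lowercase) pairs instead of an unordered_map lookup, which is what lets the tables above become flat initializer_lists. A self-contained sketch of the same lookup, not part of this commit; the three-entry table is a stand-in for unicode_map_lowercase:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <initializer_list>
    #include <utility>

    // Must be sorted ascending by .first for lower_bound to work.
    static const std::initializer_list<std::pair<uint32_t, uint32_t>> lowercase_map = {
        {0x41, 0x61}, // 'A' -> 'a'
        {0x42, 0x62}, // 'B' -> 'b'
        {0xC0, 0xE0}, // U+00C0 -> U+00E0
    };

    static uint32_t to_lower(uint32_t cp) {
        // Find the first entry whose key is not less than cp.
        auto it = std::lower_bound(lowercase_map.begin(), lowercase_map.end(), cp,
            [](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
                return pair.first < value;
            });
        return (it != lowercase_map.end() && it->first == cp) ? it->second : cp;
    }

    int main() {
        assert(to_lower(0x41) == 0x61); // mapped
        assert(to_lower(0x39) == 0x39); // digit '9' has no mapping, unchanged
        return 0;
    }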
@@ -9,7 +9,7 @@ set(SOURCE_FILES
     ${WHISPER_LIB_DIR}/ggml/src/ggml.c
     ${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
     ${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
-    ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.c
+    ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp
     ${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
     ${WHISPER_LIB_DIR}/src/whisper.cpp
     ${CMAKE_SOURCE_DIR}/jni.c
@@ -21,7 +21,7 @@ if (NOT GGML_HOME)
        ${WHISPER_LIB_DIR}/ggml/src/ggml.c
        ${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
        ${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
-       ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.c
+       ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp
        ${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
     )
 endif()
@@ -22,7 +22,7 @@
 18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; };
 18A276062C2A98A5001C8D37 /* ggml-metal.metal in Copy Files */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
 18A2760B2C2A9B43001C8D37 /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
-18ABE15A2AF556340044A204 /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.c */; };
+18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.cpp */; };
 18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1592AF556340044A204 /* ggml-quants.c */; };
 7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; };
 7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */; };
@@ -73,7 +73,7 @@
 18ABE1542AF556340044A204 /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../../ggml/src/ggml-quants.h"; sourceTree = "<group>"; };
 18ABE1552AF556340044A204 /* ggml-backend.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../../ggml/include/ggml-backend.h"; sourceTree = "<group>"; };
 18ABE1562AF556340044A204 /* ggml-backend-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend-impl.h"; path = "../../../ggml/src/ggml-backend-impl.h"; sourceTree = "<group>"; };
-18ABE1572AF556340044A204 /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../../ggml/src/ggml-backend.c"; sourceTree = "<group>"; };
+18ABE1572AF556340044A204 /* ggml-backend.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.cpp"; path = "../../../ggml/src/ggml-backend.cpp"; sourceTree = "<group>"; };
 18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; };
 18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; };
 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-encoder-impl.m"; sourceTree = "<group>"; };
@@ -120,7 +120,7 @@
 18A275FF2C2A9563001C8D37 /* ggml-common.h */,
 18A275FE2C2A94DE001C8D37 /* ggml-metal.h */,
 18ABE1562AF556340044A204 /* ggml-backend-impl.h */,
-18ABE1572AF556340044A204 /* ggml-backend.c */,
+18ABE1572AF556340044A204 /* ggml-backend.cpp */,
 18ABE1552AF556340044A204 /* ggml-backend.h */,
 18ABE1582AF556340044A204 /* ggml-impl.h */,
 18ABE1592AF556340044A204 /* ggml-quants.c */,
@@ -248,7 +248,7 @@
 18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */,
 7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */,
 1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */,
-18ABE15A2AF556340044A204 /* ggml-backend.c in Sources */,
+18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */,
 18627C8C29052BE000BD2A04 /* main.m in Sources */,
 18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */,
 1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */,
File diff suppressed because it is too large
@@ -1 +1 @@
-6ebf0cf75db1739b6c8b26ccca3f5029ab35fe4a
+e7fd7deec20ef1ced3eebe38802f3c2126fddfa4
@@ -1239,6 +1239,8 @@ static size_t aheads_masks_nbytes(struct whisper_aheads_masks & aheads_masks) {
 static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & params) {
     ggml_backend_t result = NULL;
 
+    ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);
+
 #ifdef GGML_USE_CUDA
     if (params.use_gpu) {
         WHISPER_LOG_INFO("%s: using CUDA backend\n", __func__);
@@ -1252,7 +1254,6 @@ static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & pa
 #ifdef GGML_USE_METAL
     if (params.use_gpu) {
         WHISPER_LOG_INFO("%s: using Metal backend\n", __func__);
-        ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
         result = ggml_backend_metal_init();
         if (!result) {
             WHISPER_LOG_ERROR("%s: ggml_backend_metal_init() failed\n", __func__);
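Because whisper_backend_init_gpu() now registers the whisper log callback with ggml_log_set() up front, backend messages flow through the same callback and the Metal-specific setter removed above is no longer needed. A minimal usage sketch, not part of this commit; it assumes the public whisper.h API and a local ggml-base.en.bin model file:

    #include <cstdio>
    #include "whisper.h"

    static void log_cb(ggml_log_level level, const char * text, void * user_data) {
        (void) level; (void) user_data;
        fputs(text, stderr); // forward everything, including GPU backend messages
    }

    int main() {
        // One callback for whisper itself and, via ggml_log_set() inside
        // whisper_backend_init_gpu(), for the GPU backends as well.
        whisper_log_set(log_cb, nullptr);

        struct whisper_context_params cparams = whisper_context_default_params();
        cparams.use_gpu = true; // backend init logs arrive through log_cb

        struct whisper_context * ctx = whisper_init_from_file_with_params("ggml-base.en.bin", cparams);
        if (ctx) {
            whisper_free(ctx);
        }
        return 0;
    }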