talk-llama : sync llama.cpp

This commit is contained in:
Georgi Gerganov 2024-01-12 22:04:51 +02:00
parent 1560288048
commit 40ae0962f4
2 changed files with 934 additions and 1470 deletions

File diff suppressed because it is too large

@@ -118,6 +118,12 @@ extern "C" {
         LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
     };
 
+    enum llama_split_mode {
+        LLAMA_SPLIT_NONE  = 0, // single GPU
+        LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
+        LLAMA_SPLIT_ROW   = 2, // split rows across GPUs
+    };
+
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit;    // log-odds of the token
@@ -180,8 +186,16 @@ extern "C" {
     struct llama_model_params {
         int32_t n_gpu_layers; // number of layers to store in VRAM
-        int32_t main_gpu;     // the GPU that is used for scratch and small tensors
-        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+
+        // main_gpu interpretation depends on split_mode:
+        // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
+        // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
+        // LLAMA_SPLIT_LAYER: ignored
+        int32_t main_gpu;
+
+        // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+        const float * tensor_split;
 
         // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
         // If the provided progress_callback returns true, model loading continues.
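
For reference, a minimal sketch of how a caller might fill in the new split fields when loading a model with this API. The model path and layer count are placeholders; llama_model_default_params, llama_load_model_from_file, and llama_free_model are part of llama.cpp's existing C API:

    #include <stdio.h>
    #include "llama.h"

    int main(void) {
        // Start from the library defaults, then choose a split strategy.
        struct llama_model_params params = llama_model_default_params();

        params.n_gpu_layers = 99;                // offload all layers to the GPUs
        params.split_mode   = LLAMA_SPLIT_LAYER; // split layers and KV across GPUs
        params.main_gpu     = 0;                 // ignored when split_mode is LLAMA_SPLIT_LAYER
        params.tensor_split = NULL;              // NULL lets the backend pick per-GPU proportions

        // "model.gguf" is a placeholder path
        struct llama_model * model = llama_load_model_from_file("model.gguf", params);
        if (model == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        llama_free_model(model);
        return 0;
    }

With LLAMA_SPLIT_ROW, main_gpu instead selects the device used for small tensors and intermediate results, and tensor_split may point to an array of LLAMA_MAX_DEVICES proportions describing how much of the model each GPU receives.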