mirror of https://github.com/ggerganov/whisper.cpp.git
talk-llama : sync llama.cpp
commit 40ae0962f4 (parent 1560288048)
File diff suppressed because it is too large
@@ -118,6 +118,12 @@ extern "C" {
         LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
     };
 
+    enum llama_split_mode {
+        LLAMA_SPLIT_NONE    = 0, // single GPU
+        LLAMA_SPLIT_LAYER   = 1, // split layers and KV across GPUs
+        LLAMA_SPLIT_ROW     = 2, // split rows across GPUs
+    };
+
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit;    // log-odds of the token
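A minimal sketch (not part of the commit) of how a caller might map a user-facing option string onto the new enum; the parse_split_mode helper is hypothetical:

    #include <string.h>
    #include "llama.h"

    // Hypothetical helper: map an option string to the new split mode enum.
    enum llama_split_mode parse_split_mode(const char * arg) {
        if (strcmp(arg, "none") == 0) return LLAMA_SPLIT_NONE; // single GPU
        if (strcmp(arg, "row")  == 0) return LLAMA_SPLIT_ROW;  // split rows across GPUs
        return LLAMA_SPLIT_LAYER; // default: split layers and KV across GPUs
    }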
@@ -180,8 +186,16 @@ extern "C" {
 
     struct llama_model_params {
         int32_t n_gpu_layers; // number of layers to store in VRAM
-        int32_t main_gpu;     // the GPU that is used for scratch and small tensors
-        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+
+        // main_gpu interpretation depends on split_mode:
+        // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
+        // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
+        // LLAMA_SPLIT_LAYER: ignored
+        int32_t main_gpu;
+
+        // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+        const float * tensor_split;
 
         // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
         // If the provided progress_callback returns true, model loading continues.
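A minimal usage sketch of the reworked llama_model_params, assuming llama_model_default_params(), llama_load_model_from_file(), llama_free_model(), and the llama_backend_init(bool numa) signature of this vintage; the model path is a placeholder:

    #include <stddef.h>
    #include "llama.h"

    int main(void) {
        llama_backend_init(false); // numa = false; signature as of this sync

        struct llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 99;              // offload all layers to VRAM
        mparams.split_mode   = LLAMA_SPLIT_ROW; // split rows across GPUs
        mparams.main_gpu     = 0;               // with LLAMA_SPLIT_ROW: GPU for small tensors
        mparams.tensor_split = NULL;            // or a float[LLAMA_MAX_DEVICES] of proportions

        struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model != NULL) {
            llama_free_model(model);
        }
        llama_backend_free();
        return model != NULL ? 0 : 1;
    }

With split_mode = LLAMA_SPLIT_ROW, main_gpu selects the device used for small tensors and intermediate results, per the comments added in this hunk; under LLAMA_SPLIT_LAYER it is ignored.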