Mirror of https://github.com/ggerganov/whisper.cpp.git (synced 2025-04-26 05:49:49 +00:00)
sync : ggml (ggml_scale, ggml_row_size, etc.) (#1677)
* sync : ggml
* sync : llama.cpp
* talk-llama : fix obsolete param
* ggml-alloc : fix ggml_tallocr_is_own
* talk.wasm : update to new ggml
* ggml : fix type punning in ggml_scale
* ggml : cuda jetson + arm quants warnings
parent d2ee117a0a
commit 3a5302108d
File diff suppressed because it is too large
@@ -39,10 +39,11 @@
 
 #define LLAMA_MAX_RNG_STATE (64*1024)
 
+#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 2
+#define LLAMA_SESSION_VERSION 3
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
@@ -126,7 +127,7 @@ extern "C" {
         bool sorted;
     } llama_token_data_array;
 
-    typedef void (*llama_progress_callback)(float progress, void *ctx);
+    typedef bool (*llama_progress_callback)(float progress, void *ctx);
 
     // Input data for llama_decode
     // A llama_batch object can contain input about one or many sequences
@@ -158,16 +159,38 @@ extern "C" {
         llama_seq_id all_seq_id; // used if seq_id == NULL
     } llama_batch;
 
+    enum llama_model_kv_override_type {
+        LLAMA_KV_OVERRIDE_INT,
+        LLAMA_KV_OVERRIDE_FLOAT,
+        LLAMA_KV_OVERRIDE_BOOL,
+    };
+
+    struct llama_model_kv_override {
+        char key[128];
+        enum llama_model_kv_override_type tag;
+        union {
+            int64_t int_value;
+            double float_value;
+            bool bool_value;
+        };
+    };
+
     struct llama_model_params {
         int32_t n_gpu_layers; // number of layers to store in VRAM
         int32_t main_gpu; // the GPU that is used for scratch and small tensors
         const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
-        // called with a progress value between 0 and 1, pass NULL to disable
+        // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+        // If the provided progress_callback returns true, model loading continues.
+        // If it returns false, model loading is immediately aborted.
        llama_progress_callback progress_callback;
 
         // context pointer passed to the progress callback
         void * progress_callback_user_data;
 
+        // override key-value pairs of the model meta data
+        const struct llama_model_kv_override * kv_overrides;
+
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap; // use mmap if possible
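The two behavioral additions in this hunk are the bool-returning progress callback (returning false aborts loading) and the kv_overrides pointer. A minimal caller-side sketch in C: the model path and the override key are placeholders, and it assumes the usual llama_model_default_params()/llama_load_model_from_file() entry points and that the loader stops at an override entry whose key is empty.

#include <stdio.h>
#include <string.h>
#include "llama.h"

// Progress callback: print progress and let the caller abort the load.
static bool my_progress(float progress, void * user_data) {
    const bool * cancel = (const bool *) user_data;
    fprintf(stderr, "loading: %3.0f%%\r", progress * 100.0f);
    return !*cancel; // returning false aborts llama_load_model_from_file()
}

static struct llama_model * load_with_override(const char * path, bool * cancel) {
    // One integer override plus an empty-key entry as terminator (assumption about the loader's convention).
    struct llama_model_kv_override kvo[2];
    memset(kvo, 0, sizeof(kvo));
    snprintf(kvo[0].key, sizeof(kvo[0].key), "some.int.key"); // placeholder key name
    kvo[0].tag       = LLAMA_KV_OVERRIDE_INT;
    kvo[0].int_value = 42;

    struct llama_model_params mparams = llama_model_default_params();
    mparams.progress_callback           = my_progress;
    mparams.progress_callback_user_data = cancel;
    mparams.kv_overrides                = kvo;

    return llama_load_model_from_file(path, mparams);
}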
@@ -185,17 +208,20 @@ extern "C" {
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float rope_freq_base; // RoPE base frequency, 0 = from model
         float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
-        float yarn_ext_factor; // YaRN extrapolation mix factor, NaN = from model
+        float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model
         float yarn_attn_factor; // YaRN magnitude scaling factor
         float yarn_beta_fast; // YaRN low correction dim
         float yarn_beta_slow; // YaRN high correction dim
         uint32_t yarn_orig_ctx; // YaRN original context size
 
+        enum ggml_type type_k; // data type for K cache
+        enum ggml_type type_v; // data type for V cache
+
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-        bool f16_kv; // use fp16 for KV cache, fp32 otherwise
-        bool logits_all; // the llama_eval() call computes all logits, not just the last one
+        bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embedding; // embedding mode only
+        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
     };
 
     // model quantization parameters
@@ -290,7 +316,9 @@ extern "C" {
 
     LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
 
-    LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
+    // TODO: become more consistent with returned int types across the API
+    LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
 
     LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
 
@@ -301,6 +329,23 @@ extern "C" {
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
 
+    // Functions to access the model's GGUF metadata scalar values
+    // - The functions return the length of the string on success, or -1 on failure
+    // - The output string is always null-terminated and cleared on failure
+    // - GGUF array values are not supported by these functions
+
+    // Get metadata value as a string by key name
+    LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
+
+    // Get the number of metadata key/value pairs
+    LLAMA_API int llama_model_meta_count(const struct llama_model * model);
+
+    // Get metadata key name by index
+    LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
+    // Get metadata value as a string by index
+    LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
     // Get a string describing the model type
     LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
 
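A short sketch of how these accessors compose, following the -1-on-failure convention documented above (the key queried at the end is just an example of a common GGUF key):

#include <stdio.h>
#include "llama.h"

// Dump all scalar GGUF metadata key/value pairs of a loaded model.
static void dump_model_meta(const struct llama_model * model) {
    char key[256];
    char val[256];

    const int n_kv = llama_model_meta_count(model);
    for (int i = 0; i < n_kv; i++) {
        if (llama_model_meta_key_by_index(model, i, key, sizeof(key)) < 0) {
            continue; // -1 on failure; the buffer is cleared
        }
        if (llama_model_meta_val_str_by_index(model, i, val, sizeof(val)) < 0) {
            continue; // arrays and other non-scalar values are not supported
        }
        printf("%s = %s\n", key, val);
    }

    // Lookup by key name; the return value is the string length or -1.
    if (llama_model_meta_val_str(model, "general.architecture", val, sizeof(val)) >= 0) {
        printf("architecture: %s\n", val);
    }
}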
@@ -344,9 +389,60 @@ extern "C" {
     // KV cache
     //
 
-    // Returns the number of tokens in the KV cache
-    LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
-            "avoid using this, it will be removed in the future, instead - count the tokens in user code");
+    // Information associated with an individual cell in the KV cache view.
+    struct llama_kv_cache_view_cell {
+        // The position for this cell. Takes KV cache shifts into account.
+        // May be negative if the cell is not populated.
+        llama_pos pos;
+    };
+
+    // An updateable view of the KV cache.
+    struct llama_kv_cache_view {
+        // Number of KV cache cells. This will be the same as the context size.
+        int32_t n_cells;
+
+        // Maximum number of sequences that can exist in a cell. It's not an error
+        // if there are more sequences in a cell than this value, however they will
+        // not be visible in the view cells_sequences.
+        int32_t n_max_seq;
+
+        // Number of tokens in the cache. For example, if there are two populated
+        // cells, the first with 1 sequence id in it and the second with 2 sequence
+        // ids then you'll have 3 tokens.
+        int32_t token_count;
+
+        // Number of populated cache cells.
+        int32_t used_cells;
+
+        // Maximum contiguous empty slots in the cache.
+        int32_t max_contiguous;
+
+        // Index to the start of the max_contiguous slot range. Can be negative
+        // when cache is full.
+        int32_t max_contiguous_idx;
+
+        // Information for an individual cell.
+        struct llama_kv_cache_view_cell * cells;
+
+        // The sequences for each cell. There will be n_max_seq items per cell.
+        llama_seq_id * cells_sequences;
+    };
+
+    // Create an empty KV cache view. (use only for debugging purposes)
+    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+
+    // Free a KV cache view. (use only for debugging purposes)
+    LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
+
+    // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+    LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
+
+    // Returns the number of tokens in the KV cache (slow, use only for debug)
+    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+    LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+
+    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
+    LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx);
+
     // Clear the KV cache
     LLAMA_API void llama_kv_cache_clear(
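The debug view is created once, refreshed as the cache evolves, and freed at the end. A minimal sketch (the n_max_seq value is arbitrary):

#include <stdio.h>
#include "llama.h"

// Print a quick occupancy summary of the KV cache (debugging only).
static void print_kv_cache_stats(const struct llama_context * ctx) {
    struct llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_max_seq =*/ 4);

    llama_kv_cache_view_update(ctx, &view);

    printf("cells: %d, used: %d, tokens: %d, max contiguous free: %d\n",
            view.n_cells, view.used_cells, view.token_count, view.max_contiguous);

    // Per-cell detail: only populated cells have a non-negative position.
    for (int i = 0; i < view.n_cells; i++) {
        if (view.cells[i].pos >= 0) {
            printf("cell %d: pos %d\n", i, (int) view.cells[i].pos);
        }
    }

    llama_kv_cache_view_free(&view);
}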
@@ -517,6 +613,12 @@ extern "C" {
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
 
+    // Returns -1 if unknown, 1 for true or 0 for false.
+    LLAMA_API int llama_add_bos_token(const struct llama_model * model);
+
+    // Returns -1 if unknown, 1 for true or 0 for false.
+    LLAMA_API int llama_add_eos_token(const struct llama_model * model);
+
     // codellama infill tokens
     LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
     LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
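One way a caller can interpret the tri-state result, falling back to a vocab-type heuristic when the model does not say (the fallback is an assumption for illustration, not part of this API):

#include "llama.h"

// Decide whether to prepend BOS before tokenizing user text.
static bool should_add_bos(const struct llama_model * model) {
    const int add_bos = llama_add_bos_token(model); // -1 unknown, 0 false, 1 true
    if (add_bos != -1) {
        return add_bos == 1;
    }
    return llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM; // heuristic fallback
}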
@@ -282,7 +282,6 @@ int main(int argc, char ** argv) {
     // tune these to your liking
     lcparams.n_ctx = 2048;
     lcparams.seed = 1;
-    lcparams.f16_kv = true;
     lcparams.n_threads = params.n_threads;
 
     struct llama_context * ctx_llama = llama_new_context_with_model(model_llama, lcparams);
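The dropped f16_kv flag has no direct replacement here; the closest equivalent is the type_k/type_v pair added to llama_context_params earlier in this sync. A hedged sketch of what an updated caller might set (F16 is also the default, so simply removing the line, as this hunk does, is equivalent):

#include "llama.h"

// Context setup without the removed flag; a 16-bit KV cache is what f16_kv = true used to request.
static struct llama_context_params make_ctx_params(void) {
    struct llama_context_params lcparams = llama_context_default_params();
    lcparams.n_ctx  = 2048;
    lcparams.type_k = GGML_TYPE_F16; // also the default
    lcparams.type_v = GGML_TYPE_F16;
    return lcparams;
}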
@@ -155,33 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
         const int n_ctx = hparams.n_ctx;
         const int n_vocab = hparams.n_vocab;
 
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
+        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
+        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
 
-        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
-        ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
-        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head
+        ctx_size += n_vocab*ggml_row_size(wtype, n_embd); // wte
+        ctx_size += n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe
+        ctx_size += n_vocab*ggml_row_size(wtype, n_embd); // lm_head
 
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
 
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
 
-        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
-        ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
+        ctx_size += n_layer*(ggml_row_size(wtype, 3*n_embd*n_embd)); // c_attn_attn_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd)); // c_attn_attn_b
 
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
-        ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
+        ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_attn_proj_b
 
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
-        ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
+        ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b
 
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
-        ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
+        ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_mlp_proj_b
 
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
+        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
+        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
 
         ctx_size += (6 + 12*n_layer)*256; // object overhead
 
@@ -524,8 +524,7 @@ bool gpt2_eval(
         struct ggml_tensor * KQ_scaled =
             ggml_scale(ctx0,
                     KQ,
-                    ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
-                    );
+                    1.0f/sqrt(float(n_embd)/n_head));
 
         // KQ_masked = mask_past(KQ_scaled)
         // [n_past + N, N, 12]
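The pattern in this hunk is mechanical: n*ggml_type_sizef(type) becomes ggml_row_size(type, n), which returns an exact size_t for one row of n elements, and ggml_scale now takes the scale factor by value instead of a 1-element tensor. A reduced sketch of both, assuming ggml.h and caller-provided dimensions as in the code above:

#include <math.h>
#include "ggml.h"

// Same accounting as above with the new helper: exact bytes for one row of n_embd elements.
static size_t head_weight_bytes(enum ggml_type wtype, int n_vocab, int n_embd) {
    size_t size = 0;
    size += ggml_row_size(GGML_TYPE_F32, n_embd);  // was: n_embd*ggml_type_sizef(GGML_TYPE_F32)
    size += n_vocab*ggml_row_size(wtype, n_embd);  // was: n_vocab*n_embd*ggml_type_sizef(wtype)
    return size;
}

// ggml_scale now takes the factor by value; no ggml_new_f32() scratch tensor is needed.
static struct ggml_tensor * scale_attn(struct ggml_context * ctx0, struct ggml_tensor * KQ,
                                       int n_embd, int n_head) {
    return ggml_scale(ctx0, KQ, 1.0f/sqrtf((float) n_embd/n_head));
}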
@@ -155,33 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
         const int n_ctx = hparams.n_ctx;
         const int n_vocab = hparams.n_vocab;
 
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
+        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
+        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
 
-        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
-        ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
-        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head
+        ctx_size += n_vocab*ggml_row_size(wtype, n_embd); // wte
+        ctx_size += n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe
+        ctx_size += n_vocab*ggml_row_size(wtype, n_embd); // lm_head
 
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
 
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
 
-        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
-        ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
+        ctx_size += n_layer*(ggml_row_size(wtype, 3*n_embd*n_embd)); // c_attn_attn_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd)); // c_attn_attn_b
 
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
-        ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
+        ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_attn_proj_b
 
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
-        ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
+        ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b
 
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
-        ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
+        ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_mlp_proj_b
 
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
+        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
+        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
 
         ctx_size += (6 + 12*n_layer)*256; // object overhead
 
@@ -525,8 +525,7 @@ bool gpt2_eval(
         struct ggml_tensor * KQ_scaled =
             ggml_scale(ctx0,
                    KQ,
-                    ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
-                    );
+                    1.0f/sqrt(float(n_embd)/n_head));
 
         // KQ_masked = mask_past(KQ_scaled)
         // [n_past + N, N, 12]
extra/sync-llama.sh | 5 (new executable file)
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+cp -rpv ../llama.cpp/llama.h ./examples/talk-llama/llama.h
+cp -rpv ../llama.cpp/llama.cpp ./examples/talk-llama/llama.cpp
+cp -rpv ../llama.cpp/unicode.h ./examples/talk-llama/unicode.h
ggml-alloc.c | 18
@@ -72,7 +72,7 @@ static void remove_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * t
 
 // check if a tensor is allocated by this buffer
 static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) {
-    return tensor->buffer == alloc->buffer;
+    return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer);
 }
 
 static bool ggml_is_view(struct ggml_tensor * t) {
@@ -449,11 +449,10 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd
     if (update_backend) {
         view->backend = view->view_src->backend;
     }
-    view->buffer = view->view_src->buffer;
+    // views are initialized in the alloc buffer rather than the view_src buffer
+    view->buffer = alloc->buffer;
     view->data = (char *)view->view_src->data + view->view_offs;
 
-    // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
-    // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
     assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
 
     if (!alloc->measure) {
@@ -736,6 +735,10 @@ void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
 }
 
 void ggml_allocr_free(ggml_allocr_t alloc) {
+    if (alloc == NULL) {
+        return;
+    }
+
     ggml_gallocr_free(alloc->galloc);
     ggml_tallocr_free(alloc->talloc);
     free(alloc);
@@ -775,7 +778,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
     }
 
     if (nbytes == 0) {
-        fprintf(stderr, "%s: no tensors to allocate\n", __func__);
+        // all the tensors in the context are already allocated
         return NULL;
     }
 
@@ -789,6 +792,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
             } else {
                 ggml_backend_view_init(buffer, t);
             }
+        } else {
+            if (t->view_src != NULL) {
+                // view of a pre-allocated tensor
+                ggml_backend_view_init(buffer, t);
+            }
         }
     }
 
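With the added guard, ggml_allocr_free() now tolerates a NULL handle, so cleanup paths can free unconditionally, in the spirit of free(NULL). A trivial sketch:

#include "ggml-alloc.h"

// Cleanup path that may run before the allocator was ever created.
static void cleanup_alloc(ggml_allocr_t alloc) {
    ggml_allocr_free(alloc); // safe no-op when alloc == NULL after this change
}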
@@ -20,6 +20,9 @@ extern "C" {
         size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
         size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
         bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        // check if tensor data is in host memory
+        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
+        bool (*is_host) (ggml_backend_buffer_type_t buft);
     };
 
     struct ggml_backend_buffer_type {
@@ -40,6 +43,7 @@ extern "C" {
         // (optional) copy tensor between different buffer-type, allow for single-copy tranfers
         void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
         void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
     };
 
     struct ggml_backend_buffer {
@@ -35,6 +35,13 @@ bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_ba
     return buft->iface.supports_backend(buft, backend);
 }
 
+bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
+    if (buft->iface.is_host) {
+        return buft->iface.is_host(buft);
+    }
+    return false;
+}
+
 // backend buffer
 
 ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -94,6 +101,14 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
     return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
 }
 
+void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    buffer->iface.clear(buffer, value);
+}
+
+bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
+    return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
+}
+
 ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
     return buffer->buft;
 }
@@ -378,7 +393,6 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
 
 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     free(buffer->context);
-    GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -411,6 +425,10 @@ static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer,
     GGML_UNUSED(buffer);
 }
 
+static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    memset(buffer->context, value, buffer->size);
+}
+
 static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
     /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
     /* .get_base = */ ggml_backend_cpu_buffer_get_base,
@@ -419,6 +437,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
     /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
     /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
     /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+    /* .clear = */ ggml_backend_cpu_buffer_clear,
 };
 
 // for buffers from ptr, free is not called
@@ -430,6 +449,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
     /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
     /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
     /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+    /* .clear = */ ggml_backend_cpu_buffer_clear,
 };
 
 static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
@@ -455,20 +475,70 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_ty
     GGML_UNUSED(buft);
 }
 
+static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    GGML_UNUSED(buft);
+}
+
 ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
-    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cpu = {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
         /* .iface = */ {
             /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
             /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+            /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
         },
         /* .context = */ NULL,
     };
 
-    return &ggml_backend_buffer_type_cpu;
+    return &ggml_backend_cpu_buffer_type;
 }
 
+#ifdef GGML_USE_CPU_HBM
+
+// buffer type HBM
+
+#include <hbwmalloc.h>
+
+static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    hbw_free(buffer->context);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    //void * ptr = hbw_malloc(size);
+    void * ptr;
+    int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
+    if (result != 0) {
+        fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
+        return NULL;
+    }
+
+    // FIXME: this is a hack to avoid having to implement a new buffer type
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
+
+    return buffer;
+}
+
+ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
+        /* .iface = */ {
+            /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
+            /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+            /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type_hbm;
+}
+#endif
+
 struct ggml_backend_cpu_context {
     int n_threads;
     void * work_data;
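When ggml is built with GGML_USE_CPU_HBM, the new buffer type can be used like any other host buffer type. A hedged sketch of the caller side, assuming the ggml_backend_buft_alloc_buffer() helper from ggml-backend.h:

#include "ggml.h"
#include "ggml-backend.h"

// Allocate host scratch memory, preferring high-bandwidth memory when available.
static ggml_backend_buffer_t alloc_host_scratch(size_t size) {
#ifdef GGML_USE_CPU_HBM
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_hbm_buffer_type();
#else
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
#endif
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, size);
    // both the plain CPU and the HBM buffer types report as host memory
    GGML_ASSERT(buf == NULL || ggml_backend_buffer_is_host(buf));
    return buf;
}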
@@ -505,7 +575,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
     struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
 
     cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
-    cpu_plan->cgraph = *cgraph;
+    cpu_plan->cgraph = *cgraph; // FIXME: deep copy
 
     if (cpu_plan->cplan.work_size > 0) {
         cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
@@ -1180,7 +1250,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
 // utils
 void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->buffer == NULL);
-    GGML_ASSERT(tensor->data == NULL);
+    //GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized
     GGML_ASSERT(tensor->view_src != NULL);
     GGML_ASSERT(tensor->view_src->buffer != NULL);
     GGML_ASSERT(tensor->view_src->data != NULL);
 
@@ -21,6 +21,7 @@ extern "C" {
     GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
     GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
     GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+    GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
 
     // buffer
     GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
@@ -29,6 +30,8 @@ extern "C" {
     GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
     GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
+    GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
     GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
 
     //
@@ -76,6 +79,10 @@ extern "C" {
 
     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
 
+#ifdef GGML_USE_CPU_HBM
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+#endif
+
     //
     // Backend registry
     //
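A small sketch of the new public buffer helpers, again assuming the ggml_backend_buft_alloc_buffer() allocator from this header:

#include "ggml-backend.h"

// Allocate a CPU buffer and zero-initialize it with the new clear API.
static ggml_backend_buffer_t alloc_zeroed_cpu_buffer(size_t size) {
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
    if (buf != NULL) {
        ggml_backend_buffer_clear(buf, 0); // memset(base, 0, size) for the CPU backend
    }
    return buf;
}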
ggml-cuda.cu | 438
@@ -31,6 +31,7 @@
 #define CUDA_R_16F HIPBLAS_R_16F
 #define CUDA_R_32F HIPBLAS_R_32F
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
 #define cublasGemmEx hipblasGemmEx
 #define cublasGemmBatchedEx hipblasGemmBatchedEx
@@ -40,6 +41,7 @@
 #define cublasSetStream hipblasSetStream
 #define cublasSgemm hipblasSgemm
 #define cublasStatus_t hipblasStatus_t
+#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
 #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
 #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
@@ -58,8 +60,13 @@
 #define cudaGetDeviceProperties hipGetDeviceProperties
 #define cudaGetErrorString hipGetErrorString
 #define cudaGetLastError hipGetLastError
+#ifdef GGML_HIP_UMA
+#define cudaMalloc hipMallocManaged
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
+#else
 #define cudaMalloc hipMalloc
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#endif
 #define cudaMemcpy hipMemcpy
 #define cudaMemcpy2DAsync hipMemcpy2DAsync
 #define cudaMemcpyAsync hipMemcpyAsync
@@ -78,10 +85,18 @@
 #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
 #define cudaStream_t hipStream_t
 #define cudaSuccess hipSuccess
+#define __trap abort
 #else
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
+// CUDA 10.2 does not have these macro definitions.
+#ifndef CUBLAS_TF32_TENSOR_OP_MATH
+#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
+#define CUBLAS_COMPUTE_16F CUDA_R_16F
+#define CUBLAS_COMPUTE_32F CUDA_R_32F
+#define cublasComputeType_t cudaDataType_t
+#endif
 #endif // defined(GGML_USE_HIPBLAS)
 
 #include "ggml-cuda.h"
@@ -510,6 +525,14 @@ static size_t g_scratch_offset = 0;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
+[[noreturn]]
+static __device__ void bad_arch() {
+    printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
+    __trap();
+
+    (void) bad_arch; // suppress unused function warning
+}
+
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
@@ -1970,8 +1993,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
     // second part effectively subtracts 8 from each quant value
     return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2008,8 +2030,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2044,8 +2065,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
     // second part effectively subtracts 16 from each quant value
     return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2090,8 +2110,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2112,8 +2131,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp
 
     return d8_0*d8_1 * sumi;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2143,8 +2161,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2179,8 +2196,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
 
     return dm2f.x*sumf_d - dm2f.y*sumf_m;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2217,8 +2233,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
 
     return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2258,8 +2273,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
 
     return d3 * sumf;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2284,8 +2298,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
 
     return d3*d8 * sumi;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2318,8 +2331,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2352,8 +2364,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2393,8 +2404,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
     return dm5f.x*sumf_d - dm5f.y*sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2427,8 +2437,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2458,8 +2467,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
 
     return d*sumf;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2490,8 +2498,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
     return d6 * sumf_d;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -3357,8 +3364,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     return dall * sumf_d - dmin * sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
 #endif
@@ -3541,8 +3547,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     return d * sumf_d;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
 #endif
@@ -3952,7 +3957,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_0_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4021,7 +4026,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_1_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4088,7 +4093,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_0_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4155,7 +4160,7 @@ mul_mat_q5_1(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_1_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4222,7 +4227,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q8_0_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4289,7 +4294,7 @@ mul_mat_q2_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q2_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4358,7 +4363,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q3_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4427,7 +4432,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4494,7 +4499,7 @@ mul_mat_q5_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4563,7 +4568,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q6_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4998,6 +5003,15 @@ static __global__ void rope_neox(
     const int ib = col / n_dims;
     const int ic = col % n_dims;
 
+    if (ib > 0) {
+        const int i = row*ncols + ib*n_dims + ic;
+
+        dst[i + 0] = x[i + 0];
+        dst[i + 1] = x[i + 1];
+
+        return;
+    }
+
     const int i  = row*ncols + ib*n_dims + ic/2;
     const int i2 = row/p_delta_rows;
 
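The branch added above makes the kernel pass columns beyond the rotated n_dims through unchanged instead of relying on the caller. A minimal CPU sketch of that partial-rotation idea; the pairing and frequency formula here are assumptions for illustration only, not taken from the kernel:

#include <cmath>
#include <cstdio>
#include <vector>

// CPU sketch: only the first n_dims values of a row are rotated, the tail is copied
// through unchanged, which is what the new "ib > 0" early-return branch does on the GPU.
static void rope_neox_row_sketch(const float * x, float * dst, int ncols, int n_dims, float theta_base) {
    for (int ic = 0; ic < n_dims; ic += 2) {
        const float theta = theta_base * std::pow(10000.0f, -(float)ic/n_dims); // assumed scaling
        const float c = std::cos(theta), s = std::sin(theta);
        const float x0 = x[ic/2], x1 = x[ic/2 + n_dims/2];
        dst[ic/2]            = x0*c - x1*s;
        dst[ic/2 + n_dims/2] = x0*s + x1*c;
    }
    for (int ic = n_dims; ic < ncols; ++ic) {
        dst[ic] = x[ic]; // pass-through for the non-rotated part of the row
    }
}

int main() {
    std::vector<float> x(8, 1.0f), dst(8, 0.0f);
    rope_neox_row_sketch(x.data(), dst.data(), 8, 4, 1.0f);
    for (float v : dst) std::printf("%.3f ", v);
    std::printf("\n");
}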
@@ -5259,17 +5273,17 @@ static __global__ void im2col_f32_f16(
     const int ky = (i - kd) / OW;
     const int ix = i % OW;
 
-    const int iiw = ix * s0 + kx * d0 - p0;
-    const int iih = blockIdx.y * s1 + ky * d1 - p1;
+    const int64_t iiw = ix * s0 + kx * d0 - p0;
+    const int64_t iih = blockIdx.y * s1 + ky * d1 - p1;
 
-    const int offset_dst =
+    const int64_t offset_dst =
         (blockIdx.y * OW + ix) * CHW +
         (blockIdx.z * (KW * KH) + ky * KW + kx);
 
     if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
         dst[offset_dst] = __float2half(0.0f);
     } else {
-        const int offset_src = blockIdx.z * offset_delta;
+        const int64_t offset_src = blockIdx.z * offset_delta;
         dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
     }
 }
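The switch to int64_t above keeps the flattened im2col offsets from being truncated to 32 bits on large tensors. A small self-contained illustration of the difference, with made-up sizes:

#include <cstdint>
#include <cstdio>

// The product row*OW*CHW easily exceeds INT32_MAX for large images/kernels; computing it
// in 64-bit and only then narrowing shows what a 32-bit offset would have stored.
int main() {
    const int64_t OW = 4096, CHW = 3*1024*1024; // hypothetical output width and C*KW*KH
    const int64_t row = 4096;

    const int64_t off64 = row*OW*CHW;      // 64-bit arithmetic, exact
    const int32_t off32 = (int32_t) off64; // truncated value a 32-bit offset would hold

    std::printf("64-bit offset: %lld\n", (long long) off64);
    std::printf("32-bit offset: %d (truncated)\n", off32);
}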
@@ -6814,6 +6828,7 @@ static void ggml_cuda_op_get_rows(
             break;
         default:
             // TODO: k-quants
+            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
             GGML_ASSERT(false);
             break;
     }
@@ -7057,6 +7072,7 @@ inline void ggml_cuda_op_upscale(
 
     (void) src1;
     (void) dst;
+    (void) src1_dd;
 }
 
 inline void ggml_cuda_op_pad(
@@ -7073,6 +7089,7 @@ inline void ggml_cuda_op_pad(
 
     (void) src1;
    (void) dst;
+    (void) src1_dd;
 }
 
 inline void ggml_cuda_op_rms_norm(
@@ -7376,7 +7393,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
 
     const int compute_capability = g_compute_capabilities[id];
 
-    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
+    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
         half * src0_as_f16 = nullptr;
         size_t src0_as = 0;
@@ -7690,17 +7707,10 @@ inline void ggml_cuda_op_scale(
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
     float scale;
-    // HACK: support for ggml backend interface
-    if (src1->backend == GGML_BACKEND_CPU) {
-        scale = ((float *) src1->data)[0];
-    } else {
-        // TODO: pass pointer to kernel instead of copying to host
-        CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
-    }
+    memcpy(&scale, dst->op_params, sizeof(float));
 
     scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
     CUDA_CHECK(cudaGetLastError());
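With this change the scale factor is read out of dst->op_params with memcpy instead of being fetched from a second source tensor. A sketch of that convention; the op_params layout is simplified here to a plain int32 array:

#include <cstdint>
#include <cstdio>
#include <cstring>

// The float is stored into the operator's int32 parameter block and read back with
// memcpy, which avoids the strict-aliasing/type-punning problems of pointer casts.
int main() {
    int32_t op_params[16] = {0};   // stand-in for a tensor's op_params block

    const float scale_in = 0.125f;
    std::memcpy(&op_params[0], &scale_in, sizeof(float));   // what the graph-building side would do

    float scale_out = 0.0f;
    std::memcpy(&scale_out, &op_params[0], sizeof(float));  // what the backend does above

    std::printf("scale = %f\n", scale_out);
}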
@@ -7747,8 +7757,6 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
     const bool  dst_on_device = dst->backend == GGML_BACKEND_GPU;
 
-    const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE;
-
     // dd = data device
     float * src0_ddf = nullptr;
     float * src1_ddf = nullptr;
@@ -7769,7 +7777,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
         CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
     }
 
-    if (use_src1 && !src1_stays_on_host) {
+    if (use_src1) {
         if (src1_on_device) {
             src1_ddf = (float *) src1_extra->data_device[g_main_device];
         } else {
@@ -7817,6 +7825,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
     }
 
 #ifdef NDEBUG
+    for (int id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(ggml_cuda_set_device(id));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
     for (int id = 0; id < g_device_count; ++id) {
         CUDA_CHECK(ggml_cuda_set_device(id));
 
@@ -7868,8 +7881,6 @@ static void ggml_cuda_op_mul_mat(
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
-    ggml_cuda_set_peer_access(ne11);
-
     GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
 
@@ -8300,27 +8311,27 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
 }
 
 static __global__ void k_compute_batched_ptrs(
-        const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
+        const half * src0_as_f16, const half * src1_as_f16, char * dst,
         const void ** ptrs_src, void ** ptrs_dst,
-        int ne12, int ne13,
-        int ne23,
-        int nb02, int nb03,
-        int nb12, int nb13,
-        int nb2, int nb3,
-        int r2, int r3) {
-    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
-    int i12 = blockIdx.y * blockDim.y + threadIdx.y;
+        int64_t ne12, int64_t ne13,
+        int64_t ne23,
+        size_t  nb02, size_t  nb03,
+        size_t  nb12, size_t  nb13,
+        size_t  nbd2, size_t  nbd3,
+        int64_t r2,   int64_t r3) {
+    int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
 
     if (i13 >= ne13 || i12 >= ne12) {
         return;
     }
 
-    int i03 = i13 / r3;
-    int i02 = i12 / r2;
+    int64_t i03 = i13 / r3;
+    int64_t i02 = i12 / r2;
 
     ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
     ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
-    ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
 }
 
 static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -8376,7 +8387,41 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
 
     size_t dst_as = 0;
-    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+
+    half * dst_f16 = nullptr;
+    char * dst_t   = nullptr;
+
+    cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
+    cudaDataType_t      cu_data_type    = CUDA_R_16F;
+
+    // dst strides
+    size_t nbd2 = dst->nb[2];
+    size_t nbd3 = dst->nb[3];
+
+    const half  alpha_f16 = 1.0f;
+    const half  beta_f16  = 0.0f;
+
+    const float alpha_f32 = 1.0f;
+    const float beta_f32  = 0.0f;
+
+    const void * alpha = &alpha_f16;
+    const void * beta  = &beta_f16;
+
+    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
+        dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+        dst_t   = (char *) dst_f16;
+
+        nbd2 /= sizeof(float) / sizeof(half);
+        nbd3 /= sizeof(float) / sizeof(half);
+    } else {
+        dst_t = (char *) dst_ddf;
+
+        cu_compute_type = CUBLAS_COMPUTE_32F;
+        cu_data_type    = CUDA_R_32F;
+
+        alpha = &alpha_f32;
+        beta  = &beta_f32;
+    }
 
     GGML_ASSERT(ne12 % ne02 == 0);
     GGML_ASSERT(ne13 % ne03 == 0);
@@ -8385,9 +8430,6 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     const int64_t r2 = ne12/ne02;
     const int64_t r3 = ne13/ne03;
 
-    const half alpha_f16 = 1.0f;
-    const half beta_f16  = 0.0f;
-
 #if 0
     // use cublasGemmEx
     {
@@ -8397,12 +8439,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
             int i02 = i12 / r2;
 
             CUBLAS_CHECK(
-            cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+            cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F,   nb01/sizeof(half),
+                alpha, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F,   nb01/sizeof(half),
                             (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F,   nb11/sizeof(float),
-                &beta_f16,  (      char *)     dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F,   ne01,
-                CUBLAS_COMPUTE_16F,
+                beta,  (      char *)       dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
+                cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
         }
     }
@@ -8414,11 +8456,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         CUBLAS_CHECK(
         cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F,   nb01/sizeof(half),  src0->nb[2]/sizeof(half),  // strideA
+                alpha, (const char *) src0_as_f16, CUDA_R_16F,   nb01/sizeof(half),  src0->nb[2]/sizeof(half),  // strideA
                             (const char *) src1_as_f16, CUDA_R_16F,   nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
-                &beta_f16,  (      char *)     dst_f16, CUDA_R_16F,   ne01,               dst->nb[2]/sizeof(float),  // strideC
+                beta,  (      char *)       dst_t, cu_data_type, ne01,               dst->nb[2]/sizeof(float),  // strideC
                 ne12*ne13,
-                CUBLAS_COMPUTE_16F,
+                cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     } else {
         // use cublasGemmBatchedEx
@@ -8435,24 +8477,24 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
         dim3 block_dims(ne13, ne12);
         k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
-                src0_as_f16, src1_as_f16, dst_f16,
+                src0_as_f16, src1_as_f16, dst_t,
                 ptrs_src, ptrs_dst,
                 ne12, ne13,
                 ne23,
                 nb02, nb03,
                 nb12, nb13,
-                dst->nb[2], dst->nb[3],
+                nbd2, nbd3,
                 r2, r3);
         CUDA_CHECK(cudaGetLastError());
 
         CUBLAS_CHECK(
         cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F,   nb01/sizeof(half),
+                alpha, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F,   nb01/sizeof(half),
                             (const void **) (ptrs_src + 1*ne23), CUDA_R_16F,   nb11/sizeof(float),
-                &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F,   ne01,
+                beta,  (      void **) (ptrs_dst + 0*ne23), cu_data_type, ne01,
                 ne23,
-                CUBLAS_COMPUTE_16F,
+                cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
         if (ptrs_src_s != 0) {
@@ -8464,13 +8506,16 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         }
 #endif
 
+    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
     const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
     to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
 
-    ggml_cuda_pool_free(src1_as_f16, src1_as);
     ggml_cuda_pool_free(dst_f16, dst_as);
     }
+
+    ggml_cuda_pool_free(src1_as_f16, src1_as);
+}
 
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool all_on_device =
         (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
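The batched cuBLAS path above now honours the precision stored in dst->op_params[0]: the default keeps f16 accumulation into a temporary buffer that is converted to f32 afterwards, while the f32 setting accumulates in f32 and writes straight into the destination. A schematic of that dispatch, using illustrative names rather than the real cuBLAS/ggml symbols:

#include <cstdio>

enum Precision { PREC_DEFAULT, PREC_F32 };   // stand-ins, not the ggml enum values

struct GemmConfig {
    const char * compute_type;
    const char * output_type;
    bool needs_f16_to_f32_copy;              // extra conversion pass after the GEMM
};

static GemmConfig select_gemm_config(Precision prec) {
    if (prec == PREC_DEFAULT) {
        return { "COMPUTE_16F", "R_16F", true  };
    }
    return     { "COMPUTE_32F", "R_32F", false };
}

int main() {
    for (Precision p : { PREC_DEFAULT, PREC_F32 }) {
        const GemmConfig c = select_gemm_config(p);
        std::printf("compute=%s output=%s extra_convert=%d\n",
                    c.compute_type, c.output_type, c.needs_f16_to_f32_copy);
    }
}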
@@ -8732,7 +8777,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     // TODO: mmq/mmv support
 #endif
 
-    GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb1  =  dst->nb[1];
 
     const struct ggml_tensor * ids = src0;
     const int32_t id = ((int32_t *) dst->op_params)[0];
@@ -8740,10 +8786,12 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
 
     std::vector<char> ids_host(ggml_nbytes(ids));
 
+    const cudaStream_t stream = g_cudaStreams[g_main_device][0];
+
     if (ids->backend == GGML_BACKEND_GPU) {
         const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
-        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-        CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
     } else {
         memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
     }
@@ -8757,18 +8805,20 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     ggml_tensor src1_row = *src1;
     ggml_tensor dst_row  = *dst;
 
-    src1_row.ne[1] = 1;
-    dst_row.ne[1]  = 1;
-
-    src1_row.nb[2] = src1_row.nb[1];
-    dst_row.nb[2]  =  dst_row.nb[1];
-
-    src1_row.nb[3] = src1_row.nb[1];
-    dst_row.nb[3]  =  dst_row.nb[1];
+    src1_row.backend = GGML_BACKEND_GPU;
+    dst_row.backend  = GGML_BACKEND_GPU;
 
     src1_row.extra = &src1_row_extra;
     dst_row.extra  = &dst_row_extra;
 
+    char * src1_original = src1->backend == GGML_BACKEND_CPU ?
+        (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
+    char * dst_original  =  dst->backend == GGML_BACKEND_CPU ?
+        (char *)  dst->data : (char *)  dst_extra->data_device[g_main_device];
+
+    if (src1->ne[1] == 1) {
+        GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+        GGML_ASSERT(dst->backend  == GGML_BACKEND_GPU);
+
     for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
         //int32_t row_id;
@@ -8781,14 +8831,85 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
 
         const struct ggml_tensor * src0_row = dst->src[row_id + 2];
 
-        src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1];
-        src1_row.data = (char *) src1->data + i01*src1->nb[1];
+        src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
+        src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
 
-        dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1];
-        dst_row.data = (char *) dst->data + i01*dst->nb[1];
+        dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
+        dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
 
         ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
     }
+    } else {
+        size_t as_src1, as_dst;
+        char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1);
+        char *  dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst),  &as_dst);
+
+        src1_row_extra.data_device[g_main_device] = src1_contiguous;
+        dst_row_extra.data_device[g_main_device]  =  dst_contiguous;
+
+        const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
+            cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
+        const cudaMemcpyKind dst_kind  =  dst->backend == GGML_BACKEND_CPU ?
+            cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;
+
+        for (int32_t row_id = 0; row_id < n_as; ++row_id) {
+            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+
+            int64_t num_src1_rows = 0;
+            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+                if (row_id_i != row_id) {
+                    continue;
+                }
+
+                GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+                CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
+                                        nb11, src1_kind, stream));
+                num_src1_rows++;
+            }
+
+            if (num_src1_rows == 0) {
+                continue;
+            }
+
+            src1_row.ne[1] = num_src1_rows;
+            dst_row.ne[1]  = num_src1_rows;
+
+            src1_row.nb[1] = nb11;
+            src1_row.nb[2] = num_src1_rows*nb11;
+            src1_row.nb[3] = num_src1_rows*nb11;
+
+            dst_row.nb[1] = nb1;
+            dst_row.nb[2] = num_src1_rows*nb1;
+            dst_row.nb[3] = num_src1_rows*nb1;
+
+            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+
+            num_src1_rows = 0;
+            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+                if (row_id_i != row_id) {
+                    continue;
+                }
+
+                GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+                CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
+                                        nb1, dst_kind, stream));
+                num_src1_rows++;
+            }
+        }
+
+        ggml_cuda_pool_free(src1_contiguous, as_src1);
+        ggml_cuda_pool_free(dst_contiguous,  as_dst);
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaStreamSynchronize(stream));
+    }
 }
 
 static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
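The new else-branch above batches the mixture-of-experts multiplication: src1 rows that pick the same expert are gathered into a contiguous buffer, multiplied once, and the results are scattered back to their original row positions. A CPU schematic of that gather/compute/scatter pattern, with the per-expert matrix multiplication reduced to a scalar transform for brevity:

#include <cstdio>
#include <vector>

int main() {
    const int n_as = 2;                                   // number of experts
    const std::vector<int>   row_ids = { 0, 1, 0, 1, 0 }; // expert chosen for each src1 row
    const std::vector<float> src1    = { 1, 2, 3, 4, 5 }; // one value per row, for simplicity
    std::vector<float>       dst(src1.size(), 0.0f);

    for (int expert = 0; expert < n_as; ++expert) {
        std::vector<float> gathered;
        std::vector<int>   origin;
        for (size_t i = 0; i < row_ids.size(); ++i) {
            if (row_ids[i] == expert) {
                gathered.push_back(src1[i]);              // "copy row into src1_contiguous"
                origin.push_back((int) i);
            }
        }
        if (gathered.empty()) continue;

        for (float & v : gathered) v *= (expert + 1);     // one batched "mul_mat" per expert

        for (size_t k = 0; k < gathered.size(); ++k) {
            dst[origin[k]] = gathered[k];                 // "copy row back to its dst position"
        }
    }

    for (float v : dst) std::printf("%.1f ", v);
    std::printf("\n");
}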
@@ -8898,6 +9019,12 @@ static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, gg
     (void) dst;
 }
 
+static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
+}
+
 void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     const int64_t nrows = ggml_nrows(tensor);
 
@@ -8947,8 +9074,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
 
         // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
         if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
-                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
         }
 
         char * buf;
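Both padding sites in this commit replace the hand-written type_size/block_size arithmetic with ggml_row_size. A sketch of what such a helper computes for a block-quantized row; the block parameters below are assumptions for illustration, not the authoritative ggml values:

#include <cstddef>
#include <cstdio>

// A row of n elements of a block-quantized type occupies n/block_elems blocks of
// block_bytes bytes each (n is assumed to be a multiple of the block size).
static size_t row_size(size_t n, size_t block_elems, size_t block_bytes) {
    return (n / block_elems) * block_bytes;
}

int main() {
    const size_t MATRIX_ROW_PADDING = 512;
    const size_t block_elems = 32, block_bytes = 18; // a Q4_0-like block, assumed layout

    const size_t ne0  = 4000;                        // row length, not a multiple of the padding
    size_t size = 128 /*rows*/ * row_size(ne0, block_elems, block_bytes);
    if (ne0 % MATRIX_ROW_PADDING != 0) {
        // mirror the padded allocation in the hunk above
        size += row_size(MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING, block_elems, block_bytes);
    }
    std::printf("padded allocation: %zu bytes\n", size);
}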
@@ -8975,7 +9101,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
 }
 
 void ggml_cuda_free_data(struct ggml_tensor * tensor) {
-    if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
+    if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
         return;
     }
 
@@ -9098,11 +9224,10 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
 
     ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
 
-    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
-        tensor->op == GGML_OP_VIEW;
+    const bool inplace = tensor->view_src != nullptr;
 
-    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
-        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+    if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
+        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t view_offset = 0;
         if (tensor->op == GGML_OP_VIEW) {
@@ -9182,14 +9307,14 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
         || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
 
-    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
+    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
         return false;
     }
 
     if (tensor->op == GGML_OP_MUL_MAT) {
         if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = " PRId64 ", src1->ne[3] = " PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
 #endif
             return false;
         }
@@ -9318,6 +9443,10 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             return false;
     }
 
+    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
+        ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
+    }
+
     if (params->ith != 0) {
         return true;
     }
@@ -9391,7 +9520,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
 
     if (tensor->view_src != NULL && tensor->view_offs == 0) {
-        assert(tensor->view_src->buffer->buft == buffer->buft); // TODO
+        assert(tensor->view_src->buffer->buft == buffer->buft);
         tensor->backend = tensor->view_src->backend;
         tensor->extra = tensor->view_src->extra;
         return;
@@ -9422,23 +9551,34 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
 }
 
 static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
 
-    UNUSED(buffer);
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
 }
 
 static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
 
-    UNUSED(buffer);
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
+}
+
+static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
 }
 
 static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
@@ -9449,6 +9589,7 @@ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
     /* .get_tensor      = */ ggml_backend_cuda_buffer_get_tensor,
     /* .cpy_tensor_from = */ NULL,
     /* .cpy_tensor_to   = */ NULL,
+    /* .clear           = */ ggml_backend_cuda_buffer_clear,
 };
 
 // cuda buffer type
@@ -9485,8 +9626,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
 
     if (ggml_is_quantized(tensor->type)) {
         if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
-                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
         }
     }
 
@@ -9501,35 +9641,36 @@ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_t
     UNUSED(buft);
 }
 
-static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = {
+static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
     /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
     /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
+    /* .is_host          = */ nullptr,
 };
 
 ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
-    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES];
-    static bool ggml_backend_buffer_type_cuda_initialized = false;
-    if (!ggml_backend_buffer_type_cuda_initialized) {
+    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
+
+    static bool ggml_backend_cuda_buffer_type_initialized = false;
+
+    if (!ggml_backend_cuda_buffer_type_initialized) {
         for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
-            ggml_backend_buffer_type_cuda[i] = {
-                /* .iface   = */ cuda_backend_buffer_type_interface,
+            ggml_backend_cuda_buffer_types[i] = {
+                /* .iface   = */ ggml_backend_cuda_buffer_type_interface,
                 /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
             };
         }
-        ggml_backend_buffer_type_cuda_initialized = true;
+        ggml_backend_cuda_buffer_type_initialized = true;
    }
 
-    return &ggml_backend_buffer_type_cuda[device];
+    return &ggml_backend_cuda_buffer_types[device];
 }
 
 // host buffer type
 
 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
-    CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
-    delete ctx;
+    CUDA_CHECK(cudaFreeHost(buffer->context));
 }
 
 static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@@ -9542,24 +9683,21 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
     buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
 
     return buffer;
-
-    UNUSED(buft);
 }
 
-struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
-    /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
-    /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
-    /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
-};
-
 ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
-    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = {
-        /* .iface   = */ cuda_backend_host_buffer_type_interface,
+    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
+        /* .iface    = */ {
+            /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
+            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
+        },
         /* .context  = */ nullptr,
     };
 
-    return &ggml_backend_buffer_type_cuda_host;
+    return &ggml_backend_cuda_buffer_type_host;
 }
 
 // backend
@@ -9591,8 +9729,6 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
     ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
 
     GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
     CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
@@ -9602,8 +9738,6 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
     ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
 
     GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
     CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
ggml-metal.h

@@ -98,7 +98,10 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);
 
 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
 
+GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
+
 GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
 
 GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
 
 // helper to check if the device supports a specific family
228 ggml-metal.m

@@ -180,7 +180,15 @@ struct ggml_metal_context {
 @implementation GGMLMetalClass
 @end
 
-ggml_log_callback ggml_metal_log_callback = NULL;
+static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+    fprintf(stderr, "%s", msg);
+
+    UNUSED(level);
+    UNUSED(user_data);
+}
+
+ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback;
 void * ggml_metal_log_user_data = NULL;
 
 void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
@@ -607,12 +615,24 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
 }
 
 // temporarily defined here for compatibility between ggml-backend and the old API
-struct ggml_backend_metal_buffer_context {
+
+struct ggml_backend_metal_buffer {
     void * data;
+    size_t size;
 
     id<MTLBuffer> metal;
 };
 
+struct ggml_backend_metal_buffer_context {
+    void * all_data;
+    size_t all_size;
+    bool owned;
+
+    // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
+    int n_buffers;
+    struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
+};
+
 // finds the Metal buffer that contains the tensor data on the GPU device
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
@@ -622,17 +642,29 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
 
     const int64_t tsize = ggml_nbytes(t);
 
+    ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
+
     // compatibility with ggml-backend
-    if (t->buffer && t->buffer->buft == ggml_backend_metal_buffer_type()) {
-        struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) t->buffer->context;
+    if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) {
+        struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
 
-        const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->data;
+        // find the view that contains the tensor fully
+        for (int i = 0; i < buf_ctx->n_buffers; ++i) {
+            const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
 
-        GGML_ASSERT(ioffs >= 0 && ioffs + tsize <= (int64_t) t->buffer->size);
+            //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
+            if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
                 *offs = (size_t) ioffs;
 
-        return buf_ctx->metal;
+                //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
+
+                return buf_ctx->buffers[i].metal;
+            }
+        }
+
+        GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
+
+        return nil;
     }
 
     // find the view that contains the tensor fully
@@ -1261,7 +1293,7 @@ void ggml_metal_graph_compute(
                 {
                     GGML_ASSERT(ggml_is_contiguous(src0));
 
-                    const float scale = *(const float *) src1->data;
+                    const float scale = *(const float *) dst->op_params;
 
                     int64_t n = ggml_nelements(dst);
 
@@ -2361,6 +2393,7 @@ void ggml_metal_graph_compute(
 
 // backend interface
 
+// default buffer
 static id<MTLDevice> g_backend_device = nil;
 static int g_backend_device_ref_count = 0;
 
@@ -2388,34 +2421,31 @@ static void ggml_backend_metal_free_device(void) {
 static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
-    return ctx->data;
+    return ctx->all_data;
 }
 
 static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
-    [ctx->metal release];
+    for (int i = 0; i < ctx->n_buffers; i++) {
+        [ctx->buffers[i].metal release];
+    }
     ggml_backend_metal_free_device();
 
-    free(ctx->data);
-    free(ctx);
+    if (ctx->owned) {
+        free(ctx->all_data);
+    }
 
-    UNUSED(buffer);
+    free(ctx);
 }
 
 static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
     memcpy((char *)tensor->data + offset, data, size);
 
     UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
     memcpy(data, (const char *)tensor->data + offset, size);
 
     UNUSED(buffer);
@@ -2433,7 +2463,13 @@ static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer
     UNUSED(buffer);
 }
 
-static struct ggml_backend_buffer_i metal_backend_buffer_i = {
+static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+
+    memset(ctx->all_data, value, ctx->all_size);
+}
+
+static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
     /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
     /* .get_base    = */ ggml_backend_metal_buffer_get_base,
     /* .init_tensor = */ NULL,
@@ -2441,8 +2477,11 @@ static struct ggml_backend_buffer_i metal_backend_buffer_i = {
     /* .get_tensor      = */ ggml_backend_metal_buffer_get_tensor,
     /* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from,
     /* .cpy_tensor_to   = */ ggml_backend_metal_buffer_cpy_tensor_to,
+    /* .clear           = */ ggml_backend_metal_buffer_clear,
 };
 
+// default buffer type
+
 static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
 
@@ -2453,13 +2492,46 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
         size_aligned += (size_page - (size_aligned % size_page));
     }
 
-    ctx->data  = ggml_metal_host_malloc(size);
-    ctx->metal = [ggml_backend_metal_get_device() newBufferWithBytesNoCopy:ctx->data
+    id<MTLDevice> device = ggml_backend_metal_get_device();
+
+    ctx->all_data = ggml_metal_host_malloc(size_aligned);
+    ctx->all_size = size_aligned;
+    ctx->owned = true;
+    ctx->n_buffers = 1;
+
+    ctx->buffers[0].data = ctx->all_data;
+    ctx->buffers[0].size = size;
+    ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
                     length:size_aligned
                     options:MTLResourceStorageModeShared
                     deallocator:nil];
 
-    return ggml_backend_buffer_init(buft, metal_backend_buffer_i, ctx, size);
+    if (ctx->buffers[0].metal == nil) {
+        GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+        free(ctx);
+        ggml_backend_metal_free_device();
+        return NULL;
+    }
+
+    GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
+
+#if TARGET_OS_OSX
+    GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
+            device.currentAllocatedSize / 1024.0 / 1024.0,
+            device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+    if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
+        GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
+    } else {
+        GGML_METAL_LOG_INFO("\n");
+    }
+#else
+    GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
+#endif
+
+    return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
 }
 
 static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
@@ -2470,7 +2542,13 @@ static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_t
 static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
 
-    GGML_UNUSED(buft);
+    UNUSED(buft);
+}
+
+static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    UNUSED(buft);
 }
 
 ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
@@ -2480,6 +2558,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
         /* .get_alignment    = */ ggml_backend_metal_buffer_type_get_alignment,
         /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
         /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
+        /* .is_host          = */ ggml_backend_metal_buffer_type_is_host,
     },
     /* .context = */ NULL,
 };
@@ -2487,6 +2566,87 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
     return &ggml_backend_buffer_type_metal;
 }
 
+// buffer from ptr
+
+ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
+    struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
+
+    ctx->all_data = data;
+    ctx->all_size = size;
+    ctx->owned = false;
+    ctx->n_buffers = 0;
+
+    const size_t size_page = sysconf(_SC_PAGESIZE);
+    size_t size_aligned = size;
+    if ((size_aligned % size_page) != 0) {
+        size_aligned += (size_page - (size_aligned % size_page));
+    }
+
+    id<MTLDevice> device = ggml_backend_metal_get_device();
+
+    // the buffer fits into the max buffer size allowed by the device
+    if (size_aligned <= device.maxBufferLength) {
+        ctx->buffers[ctx->n_buffers].data = data;
+        ctx->buffers[ctx->n_buffers].size = size;
+
+        ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+        if (ctx->buffers[ctx->n_buffers].metal == nil) {
+            GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+            return false;
+        }
+
+        GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
+
+        ++ctx->n_buffers;
+    } else {
+        // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
+        // one of the views
+        const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
+        const size_t size_step = device.maxBufferLength - size_ovlp;
+        const size_t size_view = device.maxBufferLength;
+
+        for (size_t i = 0; i < size; i += size_step) {
+            const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
+
+            ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
+            ctx->buffers[ctx->n_buffers].size = size_step_aligned;
+
+            ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+            if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
+                return false;
+            }
+
+            GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, offs = %12ld", __func__, size_step_aligned / 1024.0 / 1024.0, i);
+            if (i + size_step < size) {
+                GGML_METAL_LOG_INFO("\n");
+            }
+
+            ++ctx->n_buffers;
+        }
+    }
+
+#if TARGET_OS_OSX
+    GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
+            device.currentAllocatedSize / 1024.0 / 1024.0,
+            device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+    if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
+        GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
+    } else {
+        GGML_METAL_LOG_INFO("\n");
+    }
+#else
+    GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
+#endif
+
+    return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
+}
+
+// backend
+
 static const char * ggml_backend_metal_name(ggml_backend_t backend) {
     return "Metal";
 
@ -2499,10 +2659,6 @@ static void ggml_backend_metal_free(ggml_backend_t backend) {
|
|||||||
free(backend);
|
free(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
|
|
||||||
UNUSED(backend);
|
|
||||||
}
|
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
|
static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
|
||||||
return ggml_backend_metal_buffer_type();
|
return ggml_backend_metal_buffer_type();
|
||||||
|
|
||||||
@ -2529,25 +2685,15 @@ static struct ggml_backend_i metal_backend_i = {
|
|||||||
/* .get_tensor_async = */ NULL,
|
/* .get_tensor_async = */ NULL,
|
||||||
/* .cpy_tensor_from_async = */ NULL,
|
/* .cpy_tensor_from_async = */ NULL,
|
||||||
/* .cpy_tensor_to_async = */ NULL,
|
/* .cpy_tensor_to_async = */ NULL,
|
||||||
/* .synchronize = */ ggml_backend_metal_synchronize,
|
/* .synchronize = */ NULL,
|
||||||
/* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm
|
/* .graph_plan_create = */ NULL,
|
||||||
/* .graph_plan_free = */ NULL,
|
/* .graph_plan_free = */ NULL,
|
||||||
/* .graph_plan_compute = */ NULL,
|
/* .graph_plan_compute = */ NULL,
|
||||||
/* .graph_compute = */ ggml_backend_metal_graph_compute,
|
/* .graph_compute = */ ggml_backend_metal_graph_compute,
|
||||||
/* .supports_op = */ ggml_backend_metal_supports_op,
|
/* .supports_op = */ ggml_backend_metal_supports_op,
|
||||||
};
|
};
|
||||||
|
|
||||||
// TODO: make a common log callback for all backends in ggml-backend
|
|
||||||
static void ggml_backend_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
|
|
||||||
fprintf(stderr, "%s", msg);
|
|
||||||
|
|
||||||
UNUSED(level);
|
|
||||||
UNUSED(user_data);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_backend_t ggml_backend_metal_init(void) {
|
ggml_backend_t ggml_backend_metal_init(void) {
|
||||||
ggml_metal_log_set_callback(ggml_backend_log_callback, NULL);
|
|
||||||
|
|
||||||
struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
|
struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
|
||||||
|
|
||||||
if (ctx == NULL) {
|
if (ctx == NULL) {
|
||||||
|
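
For illustration only (not part of the commit): the multi-view path in ggml_backend_metal_buffer_from_ptr above splits a host allocation that exceeds device.maxBufferLength into overlapping Metal buffer views, so that every tensor up to max_size still fits inside a single view. A minimal standalone sketch of that split arithmetic in plain C, using made-up sizes instead of the real MTLDevice limits:

#include <stdio.h>
#include <stddef.h>

// Each view is at most max_buffer_len bytes; consecutive views overlap by at least
// max_tensor_size rounded up to whole pages (plus one page of slack), mirroring the diff.
static void plan_views(size_t size, size_t max_tensor_size, size_t max_buffer_len, size_t page) {
    const size_t size_ovlp = ((max_tensor_size + page - 1) / page + 1) * page;
    const size_t size_step = max_buffer_len - size_ovlp;
    const size_t size_view = max_buffer_len;

    for (size_t i = 0; i < size; i += size_step) {
        const size_t len = (i + size_view <= size) ? size_view : (size - i);
        printf("view: offs = %zu, len = %zu\n", i, len);
    }
}

int main(void) {
    // hypothetical numbers: 4 GiB of weights, 512 MiB largest tensor, 2 GiB buffer limit, 16 KiB pages
    plan_views((size_t)4096 << 20, (size_t)512 << 20, (size_t)2048 << 20, 16384);
    return 0;
}
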
ggml-metal.metal
@@ -1702,8 +1702,9 @@ kernel void kernel_rope(
dst_data[1] = x0*sin_theta + x1*cos_theta;
}
} else {
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) {
for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
if (ic < n_dims) {
const int64_t ib = 0;

// simplified from `(ib * n_dims + ic) * inv_ndims`
const float cur_rot = inv_ndims*ic - ib;
@@ -1722,6 +1723,14 @@ kernel void kernel_rope(
dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
} else {
const int64_t i0 = ic;

device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

dst_data[0] = src[0];
dst_data[1] = src[1];
}
}
}

ggml-quants.c
@@ -3677,7 +3677,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4);
const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
const ggml_int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))};
const ggml_int16x8x2_t mins16 = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}};
const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])),
vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0])));
const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])),
@@ -6626,7 +6626,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
const int8x16_t scales = vld1q_s8(scale);
const ggml_int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))};
const ggml_int16x8x2_t q6scales = {{vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}};
const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])),
vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),
468 ggml.c
@@ -1997,12 +1997,6 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
}

size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
}

int ggml_blck_size(enum ggml_type type) {
return type_traits[type].blck_size;
}
@@ -2011,8 +2005,13 @@ size_t ggml_type_size(enum ggml_type type) {
return type_traits[type].type_size;
}

float ggml_type_sizef(enum ggml_type type) {
size_t ggml_row_size(enum ggml_type type, int64_t ne) {
return ((float)(type_traits[type].type_size))/type_traits[type].blck_size;
assert(ne % ggml_blck_size(type) == 0);
return ggml_type_size(type)*ne/ggml_blck_size(type);
}

double ggml_type_sizef(enum ggml_type type) {
return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
}

const char * ggml_type_name(enum ggml_type type) {
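
For illustration only (not part of the commit): the new ggml_row_size returns the exact size in bytes of a row of ne elements for block-quantized types, where the old float-returning ggml_type_sizef could only give an average per-element size. A small standalone sketch, hard-coding the Q4_0 block parameters (32 elements stored in an 18-byte block) as an assumed example:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

// Q4_0 in ggml: blocks of 32 elements stored in 18 bytes (2-byte fp16 scale + 16 bytes of nibbles)
#define Q4_0_BLCK 32
#define Q4_0_SIZE 18

// same formula as the new ggml_row_size(), specialized to Q4_0 for this example
static size_t row_size_q4_0(int64_t ne) {
    assert(ne % Q4_0_BLCK == 0);
    return (size_t) (Q4_0_SIZE * ne / Q4_0_BLCK);
}

int main(void) {
    const int64_t ne = 4096; // row length in elements
    // old style: a float "bytes per element" times ne, with rounding left to the caller
    const double approx = (double) Q4_0_SIZE / Q4_0_BLCK * ne;
    printf("row_size: %zu bytes, type_sizef*ne: %.1f bytes\n", row_size_q4_0(ne), approx);
    return 0;
}
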
@@ -2049,24 +2048,37 @@ size_t ggml_element_size(const struct ggml_tensor * tensor) {
return ggml_type_size(tensor->type);
}

static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
bool ggml_is_scalar(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
}

static inline bool ggml_is_vector(const struct ggml_tensor * tensor) {
bool ggml_is_vector(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
}

static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) {
bool ggml_is_matrix(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

return tensor->ne[2] == 1 && tensor->ne[3] == 1;
}

bool ggml_is_3d(const struct ggml_tensor * tensor) {
return tensor->ne[3] == 1;
}

int ggml_n_dims(const struct ggml_tensor * tensor) {
for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
if (tensor->ne[i] > 1) {
return i + 1;
}
}
return 1;
}

static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

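
For illustration only (not part of the commit): throughout this sync the stored tensor->n_dims field is dropped and the dimensionality is derived on demand with the new ggml_n_dims(), which scans ne[] from the highest axis downward. A standalone sketch of the same logic over a plain ne[4] array:

#include <stdint.h>
#include <stdio.h>

#define MAX_DIMS 4

// same idea as ggml_n_dims(): the dimensionality is the highest axis with extent > 1 (at least 1)
static int n_dims(const int64_t ne[MAX_DIMS]) {
    for (int i = MAX_DIMS - 1; i >= 1; --i) {
        if (ne[i] > 1) {
            return i + 1;
        }
    }
    return 1;
}

int main(void) {
    const int64_t vec[MAX_DIMS]   = { 4096, 1, 1, 1 };  // reported as 1-D
    const int64_t mat[MAX_DIMS]   = { 4096, 32, 1, 1 }; // reported as 2-D
    const int64_t batch[MAX_DIMS] = { 64, 64, 8, 2 };   // reported as 4-D
    printf("%d %d %d\n", n_dims(vec), n_dims(mat), n_dims(batch));
    return 0;
}
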
@@ -2371,20 +2383,8 @@ size_t ggml_get_mem_size(const struct ggml_context * ctx) {
size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
size_t max_size = 0;

struct ggml_object * obj = ctx->objects_begin;
for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
max_size = MAX(max_size, ggml_nbytes(tensor));
while (obj != NULL) {
if (obj->type == GGML_OBJECT_TENSOR) {
struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);

const size_t size = ggml_nbytes(tensor);

if (max_size < size) {
max_size = size;
}
}

obj = obj->next;
}

return max_size;
@@ -2473,7 +2473,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
view_src = view_src->view_src;
}

size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
size_t data_size = ggml_row_size(type, ne[0]);
for (int i = 1; i < n_dims; i++) {
data_size *= ne[i];
}
@@ -2516,7 +2516,6 @@ static struct ggml_tensor * ggml_new_tensor_impl(
/*.type =*/ type,
/*.backend =*/ GGML_BACKEND_CPU,
/*.buffer =*/ NULL,
/*.n_dims =*/ n_dims,
/*.ne =*/ { 1, 1, 1, 1 },
/*.nb =*/ { 0, 0, 0, 0 },
/*.op =*/ GGML_OP_NONE,
@@ -2623,7 +2622,7 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
}

struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne);
return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
}

static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
@@ -3072,7 +3071,7 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *
struct ggml_tensor * ggml_view_tensor(
struct ggml_context * ctx,
struct ggml_tensor * src) {
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0);
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, GGML_MAX_DIMS, src->ne, src, 0);
ggml_format_name(result, "%s (view)", src->name);

for (int i = 0; i < GGML_MAX_DIMS; i++) {
@@ -3082,7 +3081,7 @@ struct ggml_tensor * ggml_view_tensor(
return result;
}

struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
struct ggml_object * obj = ctx->objects_begin;

char * const mem_buffer = ctx->mem_buffer;
@@ -3098,7 +3097,7 @@ struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
return NULL;
}

struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) {
struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
obj = obj->next;

@@ -3230,10 +3229,10 @@ static struct ggml_tensor * ggml_add_cast_impl(
is_node = true;
}

struct ggml_tensor * result = ggml_new_tensor(ctx, type, a->n_dims, a->ne);
struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);

result->op = GGML_OP_ADD;
result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne) : NULL;
result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne) : NULL;
result->src[0] = a;
result->src[1] = b;

@@ -3602,12 +3601,12 @@ struct ggml_tensor * ggml_sum_rows(
is_node = true;
}

int64_t ne[4] = {1,1,1,1};
int64_t ne[GGML_MAX_DIMS] = { 1 };
for (int i=1; i<a->n_dims; ++i) {
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
ne[i] = a->ne[i];
}

struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, a->n_dims, ne);
struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);

result->op = GGML_OP_SUM_ROWS;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3628,8 +3627,8 @@ struct ggml_tensor * ggml_mean(
is_node = true;
}

int64_t ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] };
int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, ne);
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

result->op = GGML_OP_MEAN;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3651,8 +3650,7 @@ struct ggml_tensor * ggml_argmax(
is_node = true;
}

int64_t ne[GGML_MAX_DIMS] = { a->ne[1], 1, 1, 1 };
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]);
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne);

result->op = GGML_OP_ARGMAX;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3675,7 +3673,7 @@ struct ggml_tensor * ggml_repeat(
is_node = true;
}

struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);

result->op = GGML_OP_REPEAT;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3702,7 +3700,7 @@ struct ggml_tensor * ggml_repeat_back(
return a;
}

struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);

result->op = GGML_OP_REPEAT_BACK;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4078,7 +4076,7 @@ struct ggml_tensor * ggml_mul_mat(
}

const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne);
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

result->op = GGML_OP_MUL_MAT;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4088,6 +4086,14 @@ struct ggml_tensor * ggml_mul_mat(
return result;
}

void ggml_mul_mat_set_prec(
struct ggml_tensor * a,
enum ggml_prec prec) {
const int32_t prec_i32 = (int32_t) prec;

ggml_set_op_params_i32(a, 0, prec_i32);
}

// ggml_mul_mat_id

struct ggml_tensor * ggml_mul_mat_id(
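
For illustration only (not part of the commit): the new ggml_mul_mat_set_prec stores the requested precision in the matmul node's op parameters so that a backend can choose a higher-precision accumulator. A hedged usage sketch, assuming the ggml headers from this repo are on the include path and that enum ggml_prec provides GGML_PREC_F32 as in the upstream header at the time of this sync:

#include "ggml.h" // assumed include path

int main(void) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 16*1024*1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 32); // 64 x 32
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 16); // 64 x 16

    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b); // 32 x 16 result node

    // ask the backend to accumulate this matmul in F32 (useful when the operands are F16)
    ggml_mul_mat_set_prec(c, GGML_PREC_F32);

    ggml_free(ctx);
    return 0;
}
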
@@ -4112,7 +4118,7 @@ struct ggml_tensor * ggml_mul_mat_id(
}

const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne);
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

ggml_set_op_params_i32(result, 0, id);
ggml_set_op_params_i32(result, 1, n_as);
@@ -4150,7 +4156,7 @@ struct ggml_tensor * ggml_out_prod(

// a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3]
const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne);
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

result->op = GGML_OP_OUT_PROD;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4165,23 +4171,23 @@ struct ggml_tensor * ggml_out_prod(
static struct ggml_tensor * ggml_scale_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
float s,
bool inplace) {
GGML_ASSERT(ggml_is_scalar(b));
GGML_ASSERT(ggml_is_padded_1d(a));

bool is_node = false;

if (a->grad || b->grad) {
if (a->grad) {
is_node = true;
}

struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

ggml_set_op_params(result, &s, sizeof(s));

result->op = GGML_OP_SCALE;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;

return result;
}
@@ -4189,15 +4195,15 @@ static struct ggml_tensor * ggml_scale_impl(
struct ggml_tensor * ggml_scale(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
float s) {
return ggml_scale_impl(ctx, a, b, false);
return ggml_scale_impl(ctx, a, s, false);
}

struct ggml_tensor * ggml_scale_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
float s) {
return ggml_scale_impl(ctx, a, b, true);
return ggml_scale_impl(ctx, a, s, true);
}

// ggml_set
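
For illustration only (not part of the commit): ggml_scale and ggml_scale_inplace now take the scale factor as a plain float instead of a one-element tensor, and the factor is carried in op_params rather than in src[1]. A hedged call-site migration sketch, assuming the ggml headers from this repo are on the include path:

#include <math.h>
#include "ggml.h" // assumed include path

int main(void) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 8*1024*1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);

    // before this sync the factor had to be wrapped in a scalar tensor:
    //     ggml_scale(ctx, x, ggml_new_f32(ctx, 1.0f/sqrtf(64.0f)));
    // after it, the float is passed directly and stored in the op's parameters:
    struct ggml_tensor * y = ggml_scale(ctx, x, 1.0f/sqrtf(64.0f));
    (void) y;

    ggml_free(ctx);
    return 0;
}
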
@@ -4435,7 +4441,7 @@ struct ggml_tensor * ggml_reshape(
//GGML_ASSERT(false);
}

struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0);
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
ggml_format_name(result, "%s (reshaped)", a->name);

result->op = GGML_OP_RESHAPE;
@@ -4813,7 +4819,7 @@ struct ggml_tensor * ggml_diag(
}

const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] };
struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, MAX(a->n_dims, 2), ne);
struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne);

result->op = GGML_OP_DIAG;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5460,7 +5466,7 @@ struct ggml_tensor * ggml_pool_1d(
is_node = true;
}

const int64_t ne[3] = {
const int64_t ne[2] = {
ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
a->ne[1],
};
@@ -5579,7 +5585,7 @@ struct ggml_tensor * ggml_argsort(
enum ggml_sort_order order) {
bool is_node = false;

struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, a->ne);
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);

ggml_set_op_params_i32(result, 0, (int32_t) order);

@@ -5626,7 +5632,7 @@ struct ggml_tensor * ggml_flash_attn(
}

//struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, q->n_dims, q->ne);
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne);

int32_t t = masked ? 1 : 0;
ggml_set_op_params(result, &t, sizeof(t));
@@ -5659,7 +5665,7 @@ struct ggml_tensor * ggml_flash_ff(
}

//struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne);
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne);

result->op = GGML_OP_FLASH_FF;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5775,7 +5781,6 @@ struct ggml_tensor * ggml_win_part(
const int np = npx*npy;

const int64_t ne[4] = { a->ne[0], w, w, np, };

struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

int32_t params[] = { npx, npy, w };
@@ -7759,10 +7764,10 @@ static void ggml_compute_forward_mul_f32(
const int ith = params->ith;
const int nth = params->nth;

// TODO: OpenCL kernel support broadcast
#ifdef GGML_USE_CLBLAST
if (src1->backend == GGML_BACKEND_GPU) {
GGML_ASSERT(ggml_are_same_shape(src0, src1));
// TODO: OpenCL kernel support full broadcast
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
if (ith == 0) {
ggml_cl_mul(src0, src1, dst);
}
@@ -9159,6 +9164,8 @@ static void ggml_compute_forward_norm_f32(
float eps;
memcpy(&eps, dst->op_params, sizeof(float));

GGML_ASSERT(eps > 0.0f);

// TODO: optimize
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -9228,6 +9235,8 @@ static void ggml_compute_forward_rms_norm_f32(
float eps;
memcpy(&eps, dst->op_params, sizeof(float));

GGML_ASSERT(eps > 0.0f);

// TODO: optimize
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -9571,16 +9580,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(
}
#endif

// off1 = offset in i11 and i1
// cne1 = ne11 and ne1
// in a normal matrix multiplication, off1 = 0 and cne1 = ne1
// during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1
static void ggml_compute_forward_mul_mat(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst,
struct ggml_tensor * dst) {
int64_t off1, int64_t cne1) {
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);

@@ -9649,8 +9653,8 @@ static void ggml_compute_forward_mul_mat(
const int64_t i02 = i12/r2;

const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13);
const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
float * d = (float *) ((char *) dst->data + off1*nb1 + i12*nb2 + i13*nb3);
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);

if (type != GGML_TYPE_F32) {
float * const wdata = params->wdata;
@@ -9667,7 +9671,7 @@ static void ggml_compute_forward_mul_mat(
}

cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
cne1, ne01, ne10,
ne1, ne01, ne10,
1.0f, y, ne10,
x, ne00,
0.0f, d, ne01);
@@ -9683,7 +9687,7 @@ static void ggml_compute_forward_mul_mat(
if (params->type == GGML_TASK_INIT) {
if (src1->type != vec_dot_type) {
char * wdata = params->wdata;
const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
const size_t row_size = ggml_row_size(vec_dot_type, ne10);

assert(params->wsize >= ne11*ne12*ne13*row_size);
assert(src1->type == GGML_TYPE_F32);
@@ -9706,10 +9710,10 @@ static void ggml_compute_forward_mul_mat(
}

const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
const size_t row_size = ggml_row_size(vec_dot_type, ne10);

const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = cne1*ne12*ne13; // src1 rows
const int64_t nr1 = ne1*ne12*ne13; // src1 rows

//printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);

@@ -9751,9 +9755,9 @@ static void ggml_compute_forward_mul_mat(
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
const int64_t i13 = (ir1/(ne12*cne1));
const int64_t i13 = (ir1/(ne12*ne1));
const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1;
const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);

// broadcast src0 into src1
const int64_t i03 = i13/r3;
@@ -9793,28 +9797,191 @@ static void ggml_compute_forward_mul_mat(

static void ggml_compute_forward_mul_mat_id(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * ids,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {

if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
// during GGML_TASK_INIT the entire src1 is converted to vec_dot_type
ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]);
return;
}

const struct ggml_tensor * ids = src0;
GGML_TENSOR_BINARY_OP_LOCALS

const int ith = params->ith;
const int nth = params->nth;

const enum ggml_type type = src0->type;

const bool src1_cont = ggml_is_contiguous(src1);

ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;

GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11);
GGML_ASSERT(ne2 == ne12);
GGML_ASSERT(ne3 == ne13);

// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == ggml_type_size(type));
GGML_ASSERT(nb10 == ggml_type_size(src1->type));

// dst cannot be transposed or permuted
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb0 <= nb1);
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);

// broadcast factors
const int64_t r2 = ne12/ne02;
const int64_t r3 = ne13/ne03;

// row groups
const int id = ggml_get_op_params_i32(dst, 0);
const int n_as = ggml_get_op_params_i32(dst, 1);

char * wdata_src1_end = (src1->type == vec_dot_type) ?
(char *) params->wdata :
(char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));

int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
int64_t * matrix_rows = matrix_row_counts + n_as; // [n_as][ne11]

#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]

if (params->type == GGML_TASK_INIT) {
char * wdata = params->wdata;
if (src1->type != vec_dot_type) {
const size_t row_size = ggml_row_size(vec_dot_type, ne10);

assert(params->wsize >= ne11*ne12*ne13*row_size);
assert(src1->type == GGML_TYPE_F32);

for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = 0; i11 < ne11; ++i11) {
from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
wdata += row_size;
}
}
}
}

// initialize matrix_row_counts
GGML_ASSERT(wdata == wdata_src1_end);
memset(matrix_row_counts, 0, n_as*sizeof(int64_t));

// group rows by src0 matrix
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);

GGML_ASSERT(row_id >= 0 && row_id < n_as);
MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
matrix_row_counts[row_id] += 1;
ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
}

return;
}

if (params->type == GGML_TASK_FINALIZE) {
return;
}

// compute each matrix multiplication in sequence
for (int cur_a = 0; cur_a < n_as; ++cur_a) {
const int64_t cne1 = matrix_row_counts[cur_a];

if (cne1 == 0) {
continue;
}

const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];

const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);

const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = cne1*ne12*ne13; // src1 rows

//printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);

// distribute the thread work across the inner or outer loop based on which one is larger

const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows

const int64_t ith0 = ith % nth0;
const int64_t ith1 = ith / nth0;

const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
const int64_t dr1 = (nr1 + nth1 - 1)/nth1;

const int64_t ir010 = dr0*ith0;
const int64_t ir011 = MIN(ir010 + dr0, nr0);

const int64_t ir110 = dr1*ith1;
const int64_t ir111 = MIN(ir110 + dr1, nr1);

//printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);

// threads with no work simply yield (not sure if it helps)
if (ir010 >= ir011 || ir110 >= ir111) {
sched_yield();
continue;
}

assert(ne12 % ne02 == 0);
assert(ne13 % ne03 == 0);

// block-tiling attempt
const int64_t blck_0 = 16;
const int64_t blck_1 = 16;

// attempt to reduce false-sharing (does not seem to make a difference)
float tmp[16];

for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
const int64_t i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix
const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);

// broadcast src0 into src1
const int64_t i03 = i13/r3;
const int64_t i02 = i12/r2;

const int64_t i1 = i11;
const int64_t i2 = i12;
const int64_t i3 = i13;

const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03);

// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
// the original src1 data pointer, so we should index using the indices directly
// TODO: this is a bit of a hack, we should probably have a better way to handle this
const char * src1_col = (const char *) wdata +
(src1_cont || src1->type != vec_dot_type
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
: (i11*nb11 + i12*nb12 + i13*nb13));

float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));

//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
//    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
//}

for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
}
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
}
}
}
}

#undef MMID_MATRIX_ROW
}

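
For illustration only (not part of the commit): the reworked ggml_compute_forward_mul_mat_id above no longer dispatches one matmul per token row; it first groups src1 rows by the expert (src0 matrix) selected for them and then runs one multi-row matmul per expert. A standalone sketch of just that grouping step, using hypothetical sizes:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define N_AS 4   // number of expert matrices (hypothetical)
#define NE11 8   // number of src1 rows / tokens (hypothetical)

int main(void) {
    // expert chosen for each src1 row (in ggml this comes from the "ids" tensor)
    const int32_t row_ids[NE11] = { 2, 0, 2, 1, 0, 2, 3, 0 };

    int64_t matrix_row_counts[N_AS];  // [n_as]
    int64_t matrix_rows[N_AS * NE11]; // [n_as][ne11]
    memset(matrix_row_counts, 0, sizeof(matrix_row_counts));

    // group rows by src0 matrix, as in the diff's MMID_MATRIX_ROW bookkeeping
    for (int64_t i01 = 0; i01 < NE11; i01++) {
        const int32_t row_id = row_ids[i01];
        matrix_rows[row_id*NE11 + matrix_row_counts[row_id]] = i01;
        matrix_row_counts[row_id] += 1;
    }

    // afterwards each expert processes only its own rows in a single pass
    for (int cur_a = 0; cur_a < N_AS; ++cur_a) {
        printf("expert %d:", cur_a);
        for (int64_t k = 0; k < matrix_row_counts[cur_a]; ++k) {
            printf(" %lld", (long long) matrix_rows[cur_a*NE11 + k]);
        }
        printf("\n");
    }
    return 0;
}
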
// ggml_compute_forward_out_prod
@@ -10158,19 +10325,18 @@ static void ggml_compute_forward_out_prod(
static void ggml_compute_forward_scale_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_is_scalar(src1));

if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}

// scale factor
const float v = *(float *) src1->data;
float v;
memcpy(&v, dst->op_params, sizeof(float));

const int ith = params->ith;
const int nth = params->nth;
@@ -10201,12 +10367,11 @@ static void ggml_compute_forward_scale_f32(
static void ggml_compute_forward_scale(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_scale_f32(params, src0, src1, dst);
ggml_compute_forward_scale_f32(params, src0, dst);
} break;
default:
{
@@ -11395,10 +11560,13 @@ static void ggml_compute_forward_rope_f32(
}
} else {
// TODO: this might be wrong for ne0 != n_dims - need double check
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
// it seems we have to rope just the first n_dims elements and do nothing with the rest
// ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
theta_base *= freq_scale;
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
for (int64_t ic = 0; ic < ne0; ic += 2) {
for (int64_t ic = 0; ic < n_dims; ic += 2) {
if (ic < n_dims) {
const int64_t ib = 0;

// simplified from `(ib * n_dims + ic) * inv_ndims`
float cur_rot = inv_ndims * ic - ib;

@@ -11421,6 +11589,14 @@ static void ggml_compute_forward_rope_f32(

dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
} else {
const int64_t i0 = ic;

const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

dst_data[0] = src[0];
dst_data[1] = src[1];
}
}
}
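
For illustration only (not part of the commit): both rope implementations are changed so that only the first n_dims components of each row are rotated and the remaining components are copied through unchanged (previously the loop iterated over ne0/n_dims blocks). A heavily simplified standalone sketch of that partial rotation on a plain float row, with hypothetical theta handling:

#include <math.h>
#include <stdio.h>

// Rotate the first n_dims components of a row of length ne0 in pairs (x[ic/2], x[ic/2 + n_dims/2]),
// leave the tail [n_dims, ne0) untouched - the same structure as the updated rope loops.
static void rope_partial(float * dst, const float * src, int ne0, int n_dims, float theta_base, float theta_scale) {
    float theta = theta_base;
    for (int ic = 0; ic < ne0; ic += 2) {
        if (ic < n_dims) {
            const float cos_theta = cosf(theta);
            const float sin_theta = sinf(theta);
            theta *= theta_scale;

            const float x0 = src[ic/2];
            const float x1 = src[ic/2 + n_dims/2];

            dst[ic/2]            = x0*cos_theta - x1*sin_theta;
            dst[ic/2 + n_dims/2] = x0*sin_theta + x1*cos_theta;
        } else {
            dst[ic + 0] = src[ic + 0];
            dst[ic + 1] = src[ic + 1];
        }
    }
}

int main(void) {
    float src[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    float dst[8] = { 0 };
    rope_partial(dst, src, 8, 4, 1.0f, 0.5f); // hypothetical parameters
    for (int i = 0; i < 8; ++i) printf("%g ", dst[i]);
    printf("\n");
    return 0;
}
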
@ -11548,10 +11724,13 @@ static void ggml_compute_forward_rope_f16(
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// TODO: this might be wrong for ne0 != n_dims - need double check
|
// TODO: this might be wrong for ne0 != n_dims - need double check
|
||||||
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
|
// it seems we have to rope just the first n_dims elements and do nothing with the rest
|
||||||
|
// ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
|
||||||
theta_base *= freq_scale;
|
theta_base *= freq_scale;
|
||||||
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
for (int64_t ic = 0; ic < ne0; ic += 2) {
|
||||||
for (int64_t ic = 0; ic < n_dims; ic += 2) {
|
if (ic < n_dims) {
|
||||||
|
const int64_t ib = 0;
|
||||||
|
|
||||||
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
||||||
float cur_rot = inv_ndims * ic - ib;
|
float cur_rot = inv_ndims * ic - ib;
|
||||||
|
|
||||||
@ -11574,6 +11753,14 @@ static void ggml_compute_forward_rope_f16(
|
|||||||
|
|
||||||
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
|
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
|
||||||
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
||||||
|
} else {
|
||||||
|
const int64_t i0 = ic;
|
||||||
|
|
||||||
|
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||||
|
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||||
|
|
||||||
|
dst_data[0] = src[0];
|
||||||
|
dst_data[1] = src[1];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -14182,7 +14369,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|||||||
} break;
|
} break;
|
||||||
case GGML_OP_MUL_MAT:
|
case GGML_OP_MUL_MAT:
|
||||||
{
|
{
|
||||||
ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]);
|
ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_MUL_MAT_ID:
|
case GGML_OP_MUL_MAT_ID:
|
||||||
{
|
{
|
||||||
@ -14194,7 +14381,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|||||||
} break;
|
} break;
|
||||||
case GGML_OP_SCALE:
|
case GGML_OP_SCALE:
|
||||||
{
|
{
|
||||||
ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor);
|
ggml_compute_forward_scale(params, tensor->src[0], tensor);
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_SET:
|
case GGML_OP_SET:
|
||||||
{
|
{
|
||||||
@ -14558,7 +14745,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
|
|||||||
return replacements->vals[i];
|
return replacements->vals[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne);
|
struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, GGML_MAX_DIMS, node->ne);
|
||||||
|
|
||||||
// insert clone into replacements
|
// insert clone into replacements
|
||||||
GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
|
GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
|
||||||
@ -14650,7 +14837,7 @@ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct gg
|
|||||||
|
|
||||||
static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
|
static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
|
||||||
if (ggml_hash_contains(zero_table, a)) {
|
if (ggml_hash_contains(zero_table, a)) {
|
||||||
struct ggml_tensor * a_zero = ggml_scale(ctx, a, ggml_new_f32(ctx, 0));
|
struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f);
|
||||||
return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
|
return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
|
||||||
} else {
|
} else {
|
||||||
return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
|
return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
|
||||||
@ -14786,7 +14973,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|||||||
src0->grad,
|
src0->grad,
|
||||||
ggml_scale(ctx,
|
ggml_scale(ctx,
|
||||||
ggml_mul(ctx, src0, tensor->grad),
|
ggml_mul(ctx, src0, tensor->grad),
|
||||||
ggml_new_f32(ctx, 2.0f)),
|
2.0f),
|
||||||
zero_table);
|
zero_table);
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
@ -14800,7 +14987,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|||||||
ggml_div(ctx,
|
ggml_div(ctx,
|
||||||
tensor->grad,
|
tensor->grad,
|
||||||
tensor),
|
tensor),
|
||||||
ggml_new_f32(ctx, 0.5f)),
|
0.5f),
|
||||||
zero_table);
|
zero_table);
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
@ -14966,17 +15153,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|||||||
{
|
{
|
||||||
// necessary for llama
|
// necessary for llama
|
||||||
if (src0->grad) {
|
if (src0->grad) {
|
||||||
|
float s;
|
||||||
|
memcpy(&s, tensor->op_params, sizeof(float));
|
||||||
|
|
||||||
src0->grad =
|
src0->grad =
|
||||||
ggml_add_or_set(ctx,
|
ggml_add_or_set(ctx,
|
||||||
src0->grad,
|
src0->grad,
|
||||||
ggml_scale_impl(ctx, tensor->grad, src1, false),
|
ggml_scale_impl(ctx, tensor->grad, s, false),
|
||||||
zero_table);
|
|
||||||
}
|
|
||||||
if (src1->grad) {
|
|
||||||
src1->grad =
|
|
||||||
ggml_add_or_set(ctx,
|
|
||||||
src1->grad,
|
|
||||||
ggml_sum(ctx, ggml_mul_impl(ctx, tensor->grad, src0, false)),
|
|
||||||
zero_table);
|
zero_table);
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
@@ -15154,6 +15337,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 const int n_past = ((int32_t *) tensor->op_params)[0];
                 src0->grad =
                     ggml_add_or_set(ctx, src0->grad,
+                        /* ggml_diag_mask_inf_impl() shouldn't be here */
+                        /* ref: https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
                         ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
                         zero_table);
             }
@@ -15982,7 +16167,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
-                // FIXME: blas
                 n_tasks = n_threads;
             } break;
         case GGML_OP_OUT_PROD:
@@ -16311,25 +16495,21 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } else
 #endif
                 if (node->src[1]->type != vec_dot_type) {
-                    cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
+                    cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
                 }
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
-                const struct ggml_tensor * a = node->src[2];
-                const struct ggml_tensor * b = node->src[1];
-                const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type;
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) {
-                    if (a->type != GGML_TYPE_F32) {
-                        // here we need memory just for single 2D matrix from src0
-                        cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
-                    }
-                } else
-#endif
-                if (b->type != vec_dot_type) {
-                    cur = ggml_type_size(vec_dot_type)*ggml_nelements(b)/ggml_blck_size(vec_dot_type);
+                const struct ggml_tensor * src0 = node->src[2];
+                const struct ggml_tensor * src1 = node->src[1];
+                const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
+                if (src1->type != vec_dot_type) {
+                    cur = ggml_row_size(vec_dot_type, ggml_nelements(src1));
                 }
+                const int n_as = ggml_get_op_params_i32(node, 1);
+                cur = GGML_PAD(cur, sizeof(int64_t));       // align
+                cur += n_as * sizeof(int64_t);               // matrix_row_counts
+                cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
             } break;
         case GGML_OP_OUT_PROD:
             {
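The ggml_graph_plan() changes above repeatedly swap the hand-written size expression for the new ggml_row_size() helper. A small sketch of the equivalence, assuming the element count is a whole number of blocks of the chosen type:

#include "ggml.h"

// Sketch: the expression pattern removed above and the helper that replaces it.
// Assumes ne is a multiple of ggml_blck_size(type), which holds for whole rows.
static size_t row_bytes_old(enum ggml_type type, int64_t ne) {
    return ggml_type_size(type)*ne/ggml_blck_size(type); // pre-change pattern
}

static size_t row_bytes_new(enum ggml_type type, int64_t ne) {
    return ggml_row_size(type, ne); // helper introduced in this sync
}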
@@ -16559,7 +16739,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
     fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
             ggml_type_name(tensor->type),
             ggml_op_name (tensor->op),
-            tensor->n_dims,
+            ggml_n_dims(tensor),
             ne[0], ne[1], ne[2], ne[3],
             nb[0], nb[1], nb[2], nb[3],
             tensor->data,
@@ -16574,7 +16754,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
             arg,
             ggml_type_name(tensor->type),
             ggml_op_name (tensor->op),
-            tensor->n_dims,
+            ggml_n_dims(tensor),
             ne[0], ne[1], ne[2], ne[3],
             nb[0], nb[1], nb[2], nb[3],
             tensor->data,
@@ -16664,11 +16844,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
 
             const uint32_t type = tensor->type;
             const uint32_t op = tensor->op;
-            const uint32_t n_dims = tensor->n_dims;
 
             fwrite(&type, sizeof(uint32_t), 1, fout);
             fwrite(&op, sizeof(uint32_t), 1, fout);
-            fwrite(&n_dims, sizeof(uint32_t), 1, fout);
 
             for (int j = 0; j < GGML_MAX_DIMS; ++j) {
                 const uint64_t ne = tensor->ne[j];
@@ -16698,11 +16876,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
 
             const uint32_t type = tensor->type;
             const uint32_t op = tensor->op;
-            const uint32_t n_dims = tensor->n_dims;
 
             fwrite(&type, sizeof(uint32_t), 1, fout);
             fwrite(&op, sizeof(uint32_t), 1, fout);
-            fwrite(&n_dims, sizeof(uint32_t), 1, fout);
 
             for (int j = 0; j < GGML_MAX_DIMS; ++j) {
                 const uint64_t ne = tensor->ne[j];
@@ -16874,12 +17050,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
         {
             uint32_t type;
             uint32_t op;
-            uint32_t n_dims;
 
             for (uint32_t i = 0; i < n_leafs; ++i) {
                 type = *(const uint32_t *) ptr; ptr += sizeof(type);
                 op = *(const uint32_t *) ptr; ptr += sizeof(op);
-                n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
 
                 int64_t ne[GGML_MAX_DIMS];
                 size_t nb[GGML_MAX_DIMS];
@@ -16895,7 +17069,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
                     nb[j] = nb_cur;
                 }
 
-                struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+                struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
 
                 tensor->op = (enum ggml_op) op;
 
@@ -16912,7 +17086,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
 
                 ptr += ggml_nbytes(tensor);
 
-                fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+                fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
             }
         }
 
@@ -16922,12 +17096,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
         {
             uint32_t type;
             uint32_t op;
-            uint32_t n_dims;
 
             for (uint32_t i = 0; i < n_nodes; ++i) {
                 type = *(const uint32_t *) ptr; ptr += sizeof(type);
                 op = *(const uint32_t *) ptr; ptr += sizeof(op);
-                n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
 
                 enum ggml_op eop = (enum ggml_op) op;
 
@@ -16998,7 +17170,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
                         } break;
                     default:
                         {
-                            tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+                            tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
 
                             tensor->op = eop;
                         } break;
@@ -17017,7 +17189,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
 
                 result->nodes[i] = tensor;
 
-                fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+                fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
             }
         }
     }
@@ -17155,7 +17327,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
                 fprintf(fp, "(%s)|", ggml_type_name(node->type));
             }
 
-            if (node->n_dims == 2) {
+            if (ggml_is_matrix(node)) {
                 fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
             } else {
                 fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
@@ -17422,7 +17594,7 @@ static enum ggml_opt_result ggml_opt_adam(
         int64_t i = 0;
         for (int p = 0; p < np; ++p) {
             const int64_t ne = ggml_nelements(ps[p]);
-            const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
+            const float p_decay = ((ggml_n_dims(ps[p]) >= decay_min_ndim) ? decay : 0.0f) * sched;
             for (int64_t j = 0; j < ne; ++j) {
                 float x = ggml_get_f32_1d(ps[p], j);
                 float g_ = g[i]*gnorm;
@@ -18696,7 +18868,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
             return NULL;
         }
 
-        const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);
+        const size_t size_cur = ggml_row_size(info->type, ne);
 
         ctx->size += GGML_PAD(size_cur, ctx->alignment);
     }
@@ -19025,6 +19197,10 @@ char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
     return ctx->infos[i].name.data;
 }
 
+enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) {
+    return ctx->infos[i].type;
+}
+
 // returns the index
 static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
     const int idx = gguf_find_key(ctx, key);
@@ -19200,8 +19376,8 @@ void gguf_add_tensor(
         ctx->infos[idx].ne[i] = 1;
     }
 
-    ctx->infos[idx].n_dims = tensor->n_dims;
-    for (int i = 0; i < tensor->n_dims; i++) {
+    ctx->infos[idx].n_dims = ggml_n_dims(tensor);
+    for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) {
         ctx->infos[idx].ne[i] = tensor->ne[i];
     }
 
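The gguf_get_tensor_type() accessor added above completes the per-tensor getters alongside gguf_get_tensor_name() and gguf_get_tensor_offset(). A usage sketch; it assumes gguf_get_n_tensors() as the existing count accessor and a gguf_context already initialized from a file:

#include <stdio.h>
#include "ggml.h"

// Sketch: list each tensor's name and type from an initialized gguf_context.
static void dump_tensor_types(const struct gguf_context * ctx) {
    const int n = gguf_get_n_tensors(ctx); // assumed existing count accessor
    for (int i = 0; i < n; ++i) {
        printf("%-32s %s\n",
               gguf_get_tensor_name(ctx, i),
               ggml_type_name(gguf_get_tensor_type(ctx, i))); // new in this sync
    }
}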
ggml.h (41 changes)
@@ -303,7 +303,7 @@ extern "C" {
 
 #if defined(__ARM_NEON) && defined(__CUDACC__)
     typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON)
+#elif defined(__ARM_NEON) && !defined(_MSC_VER)
     typedef __fp16 ggml_fp16_t;
 #else
     typedef uint16_t ggml_fp16_t;
@@ -343,6 +343,12 @@ extern "C" {
         GGML_TYPE_COUNT,
     };
 
+    // precision
+    enum ggml_prec {
+        GGML_PREC_DEFAULT,
+        GGML_PREC_F32,
+    };
+
     enum ggml_backend_type {
         GGML_BACKEND_CPU = 0,
         GGML_BACKEND_GPU = 10,
@@ -478,7 +484,8 @@ extern "C" {
     enum ggml_log_level {
         GGML_LOG_LEVEL_ERROR = 2,
         GGML_LOG_LEVEL_WARN = 3,
-        GGML_LOG_LEVEL_INFO = 4
+        GGML_LOG_LEVEL_INFO = 4,
+        GGML_LOG_LEVEL_DEBUG = 5
     };
 
     // ggml object
@@ -502,7 +509,6 @@ extern "C" {
 
         struct ggml_backend_buffer * buffer;
 
-        int n_dims;
         int64_t ne[GGML_MAX_DIMS]; // number of elements
         size_t nb[GGML_MAX_DIMS]; // stride in bytes:
                                   // nb[0] = ggml_type_size(type)
@@ -534,7 +540,7 @@ extern "C" {
 
         void * extra; // extra things e.g. for ggml-cuda.cu
 
-        char padding[12];
+        char padding[8];
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -639,11 +645,14 @@ extern "C" {
     GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
     GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
     GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
-    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
 
     GGML_API int    ggml_blck_size(enum ggml_type type);
     GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
-    GGML_API float  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+    GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+
+    GGML_DEPRECATED(
+    GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
+    "use ggml_row_size() instead");
 
     GGML_API const char * ggml_type_name(enum ggml_type type);
     GGML_API const char * ggml_op_name  (enum ggml_op   op);
@@ -662,6 +671,11 @@ extern "C" {
     GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_scalar    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_vector    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_matrix    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
+    GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
 
     GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
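The rank helpers declared above replace direct reads of the removed n_dims field; rank is now derived from the shape, with unused trailing dimensions equal to 1. A small sketch, assuming a valid ggml_context:

#include "ggml.h"

// Sketch: with the n_dims field removed from ggml_tensor, the effective rank
// comes from the shape itself (trailing dimensions of 1 are ignored).
static int rank_of_2d(struct ggml_context * ctx) {
    struct ggml_tensor * m = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 32);
    // ggml_is_matrix(m) holds here; ggml_n_dims(m) evaluates to 2
    return ggml_n_dims(m);
}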
@@ -722,8 +736,8 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
 
     // Context tensor enumeration and lookup
-    GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
-    GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx);
+    GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
 
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
@@ -1050,6 +1064,12 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
+    // change the precision of a matrix multiplication
+    // set to GGML_PREC_F32 for higher precision (useful for phi-2)
+    GGML_API void ggml_mul_mat_set_prec(
+            struct ggml_tensor * a,
+            enum   ggml_prec     prec);
+
     // indirect matrix multiplication
     //  ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
     GGML_API struct ggml_tensor * ggml_mul_mat_id(
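ggml_mul_mat_set_prec() lets a caller opt a single matmul node into F32 accumulation, per the phi-2 note in the comment above. A sketch of the intended call pattern; the weight and input tensors here are illustrative, not taken from the diff:

#include "ggml.h"

// Sketch: request F32 accumulation for one matmul node.
// `w` and `x` are illustrative weight/input tensors from a surrounding graph.
static struct ggml_tensor * mul_mat_f32_prec(struct ggml_context * ctx,
                                             struct ggml_tensor * w,
                                             struct ggml_tensor * x) {
    struct ggml_tensor * y = ggml_mul_mat(ctx, w, x);
    ggml_mul_mat_set_prec(y, GGML_PREC_F32); // default would be GGML_PREC_DEFAULT
    return y;
}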
@@ -1075,13 +1095,13 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_scale(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            float                 s);
 
     // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_scale_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            float                 s);
 
     // b -> view(a,offset,nb1,nb2,3), return modified a
     GGML_API struct ggml_tensor * ggml_set(
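With ggml_scale() and ggml_scale_inplace() now taking a float, callers no longer build a one-element tensor for the factor. A before/after sketch of a typical attention-score scaling call; the function and parameter names are illustrative, and the old form is kept in a comment for comparison only:

#include <math.h>
#include "ggml.h"

// Sketch: scaling attention scores by 1/sqrt(d) with the float-based API.
static struct ggml_tensor * scale_scores(struct ggml_context * ctx,
                                         struct ggml_tensor * kq, int n_embd_head) {
    // old API (removed): ggml_scale(ctx, kq, ggml_new_f32(ctx, 1.0f/sqrtf((float) n_embd_head)));
    return ggml_scale(ctx, kq, 1.0f/sqrtf((float) n_embd_head)); // new API
}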
@@ -2120,6 +2140,7 @@ extern "C" {
     GGML_API int    gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
     GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
     GGML_API char * gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
+    GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);
 
     // overrides existing values or adds a new one
     GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
whisper.cpp (38 changes)
@@ -1777,7 +1777,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
 
     ggml_cgraph * gf = ggml_new_graph_custom(ctx0, WHISPER_MAX_NODES, false);
 
-    ggml_allocr * alloc = wstate.alloc_encode.alloc;
+    //ggml_allocr * alloc = wstate.alloc_encode.alloc;
 
     //struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_ctx, n_state);
     //ggml_allocr_alloc(alloc, cur);
@@ -1787,13 +1787,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
     //}
     struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_conv);
 
-    struct ggml_tensor * KQscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-    ggml_allocr_alloc(alloc, KQscale);
-
-    if (!ggml_allocr_is_measure(alloc)) {
-        const float val = 1.0f/sqrtf(float(n_state)/n_head);
-        ggml_backend_tensor_set(KQscale, &val, 0, sizeof(float));
-    }
+    const float KQscale = 1.0f/sqrtf(float(n_state)/n_head);
 
     // ===================================================================
     // NOTE: experimenting with partial evaluation of the encoder (ignore)
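Since the scale is now a plain float, the encoder no longer allocates a one-element KQscale tensor or consults the allocator for it. A sketch of how such a float scale is typically consumed further down the attention block; the KQ tensor and the helper name are illustrative assumptions, not part of this hunk:

#include <math.h>
#include "ggml.h"

// Sketch only (illustrative, not taken from this hunk): applying a float KQ
// scale to the attention scores without any scratch tensor or allocator call.
static struct ggml_tensor * scale_kq(struct ggml_context * ctx0, struct ggml_tensor * KQ,
                                     int n_state, int n_head) {
    const float KQscale = 1.0f/sqrtf((float) n_state/n_head);
    return ggml_scale(ctx0, KQ, KQscale);
}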
@@ -1843,14 +1837,14 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
 
             Qcur = ggml_add(ctx0, Qcur, layer.attn_q_b);
 
-            //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            //Qcur = ggml_scale(ctx0, Qcur, pow(float(n_state)/n_head, -0.25));
 
             // note: no bias for Key
             struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
                     layer.attn_k_w,
                     cur);
 
-            //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            //Kcur = ggml_scale(ctx0, Kcur, pow(float(n_state)/n_head, -0.25));
 
             struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
                     layer.attn_v_w,
@@ -2032,7 +2026,7 @@ static struct ggml_cgraph * whisper_build_graph_cross(
 
     ggml_cgraph * gf = ggml_new_graph(ctx0);
 
-    ggml_allocr * alloc = wstate.alloc_cross.alloc;
+    //ggml_allocr * alloc = wstate.alloc_cross.alloc;
 
     //struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
     //ggml_allocr_alloc(alloc, cur);
@@ -2042,13 +2036,7 @@ static struct ggml_cgraph * whisper_build_graph_cross(
     //}
     struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_enc);
 
-    struct ggml_tensor * Kscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-    ggml_allocr_alloc(alloc, Kscale);
-
-    if (!ggml_allocr_is_measure(alloc)) {
-        const float val = pow(float(n_state) / n_head, -0.25);
-        ggml_backend_tensor_set(Kscale, &val, 0, sizeof(float));
-    }
+    const float Kscale = pow(float(n_state) / n_head, -0.25);
 
     for (int il = 0; il < model.hparams.n_text_layer; ++il) {
         auto & layer = model.layers_decoder[il];
@@ -2207,13 +2195,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
         }
     }
 
-    struct ggml_tensor * KQscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-    ggml_allocr_alloc(alloc, KQscale);
-
-    if (!ggml_allocr_is_measure(alloc)) {
-        const float val = pow(float(n_state)/n_head, -0.25);
-        ggml_backend_tensor_set(KQscale, &val, 0, sizeof(float));
-    }
+    const float KQscale = pow(float(n_state)/n_head, -0.25);
 
     struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
     ggml_allocr_alloc(alloc, KQ_mask);
@@ -6128,7 +6110,7 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
 
     // multi-thread
 
-    for (uint32_t k = 1; k <= n_threads; k++) {
+    for (int32_t k = 1; k <= n_threads; k++) {
         char * src = (char *) malloc(size);
         char * dst = (char *) malloc(size);
 
@@ -6152,13 +6134,13 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
         const int64_t t0 = ggml_time_us();
 
         std::vector<std::thread> threads(k - 1);
-        for (uint32_t th = 0; th < k - 1; ++th) {
+        for (int32_t th = 0; th < k - 1; ++th) {
             threads[th] = std::thread(helper, th);
         }
 
         helper(k - 1);
 
-        for (uint32_t th = 0; th < k - 1; ++th) {
+        for (int32_t th = 0; th < k - 1; ++th) {
             threads[th].join();
         }
 