mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-02-19 00:30:27 +00:00
Some checks failed
Bindings Tests (Ruby) / ubuntu-latest (push) Has been cancelled
CI / ubuntu-latest (linux/amd64) (push) Has been cancelled
CI / ubuntu-latest (linux/ppc64le) (push) Has been cancelled
CI / ubuntu-latest-arm64 (linux/arm64) (push) Has been cancelled
CI / ubuntu-latest-arm-v7 (linux/arm/v7) (push) Has been cancelled
CI / macOS-latest (push) Has been cancelled
CI / ubuntu-latest-gcc (linux/amd64, Debug) (push) Has been cancelled
CI / ubuntu-latest-gcc (linux/amd64, Release) (push) Has been cancelled
CI / ubuntu-latest-gcc (linux/ppc64le, Debug) (push) Has been cancelled
CI / ubuntu-latest-gcc (linux/ppc64le, Release) (push) Has been cancelled
CI / ubuntu-latest-gcc-arm64 (linux/arm64, Debug) (push) Has been cancelled
CI / ubuntu-latest-gcc-arm64 (linux/arm64, Release) (push) Has been cancelled
CI / ubuntu-latest-gcc-arm-v7 (linux/arm/v7, Debug) (push) Has been cancelled
CI / ubuntu-latest-gcc-arm-v7 (linux/arm/v7, Release) (push) Has been cancelled
CI / ubuntu-latest-clang (linux/amd64, Debug) (push) Has been cancelled
CI / ubuntu-latest-clang (linux/amd64, Release) (push) Has been cancelled
CI / ubuntu-latest-clang (linux/arm64, Debug) (push) Has been cancelled
CI / ubuntu-latest-clang (linux/arm64, Release) (push) Has been cancelled
CI / ubuntu-latest-clang (linux/ppc64le, Debug) (push) Has been cancelled
CI / ubuntu-latest-clang (linux/ppc64le, Release) (push) Has been cancelled
CI / ubuntu-latest-gcc-sanitized (linux/amd64, ADDRESS) (push) Has been cancelled
CI / ubuntu-latest-gcc-sanitized (linux/amd64, THREAD) (push) Has been cancelled
CI / ubuntu-latest-gcc-sanitized (linux/amd64, UNDEFINED) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl (linux/amd64, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl (linux/arm/v7, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl (linux/arm64, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl (linux/ppc64le, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl-fp16 (linux/amd64, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl-fp16 (linux/arm/v7, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl-fp16 (linux/arm64, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl-fp16 (linux/ppc64le, icx, icpx, ON) (push) Has been cancelled
CI / windows-msys2 (Release, clang-x86_64, CLANG64) (push) Has been cancelled
CI / windows-msys2 (Release, ucrt-x86_64, UCRT64) (push) Has been cancelled
CI / windows (Win32, Release, win32-x86, x86, 2.28.5, ON) (push) Has been cancelled
CI / windows (x64, Release, win32-x86-64, x64, 2.28.5, ON) (push) Has been cancelled
CI / windows-blas (Win32, ON, Release, x86, 2.28.5, ON) (push) Has been cancelled
CI / windows-blas (x64, ON, Release, x64, 2.28.5, ON) (push) Has been cancelled
CI / windows-cublas (x64, Release, ON, 11.8.0, ON, 2.28.5) (push) Has been cancelled
CI / windows-cublas (x64, Release, ON, 12.2.0, ON, 2.28.5) (push) Has been cancelled
CI / emscripten (Release) (push) Has been cancelled
CI / ios-xcode-build (Release) (push) Has been cancelled
CI / android (push) Has been cancelled
CI / quantize (push) Has been cancelled
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/main.Dockerfile platform:linux/amd64 tag:main]) (push) Has been cancelled
373 lines
11 KiB
C++
373 lines
11 KiB
C++
#pragma once
|
|
|
|
#include "llama.h"
|
|
#include "llama-arch.h"
|
|
#include "llama-hparams.h"
|
|
#include "llama-vocab.h"
|
|
|
|
#include <memory>
|
|
#include <string>
|
|
#include <unordered_map>
|
|
#include <vector>
|
|
|
|
struct llama_model_loader;
|
|
|
|
// available models
//
// Tag identifying the size/variant of a model. This is an unscoped enum with
// implicit numbering: LLM_TYPE_UNKNOWN is 0 and every following enumerator is
// the previous one plus 1, in declaration order.
// NOTE(review): LLM_TYPE_27B sits last, out of size order - presumably it was
// appended to avoid renumbering existing enumerators; confirm before reordering.
enum llm_type {
    LLM_TYPE_UNKNOWN,
    LLM_TYPE_14M,
    LLM_TYPE_17M,
    LLM_TYPE_22M,
    LLM_TYPE_33M,
    LLM_TYPE_60M,
    LLM_TYPE_70M,
    LLM_TYPE_80M,
    LLM_TYPE_109M,
    LLM_TYPE_137M,
    LLM_TYPE_160M,
    LLM_TYPE_220M,
    LLM_TYPE_250M,
    LLM_TYPE_270M,
    LLM_TYPE_335M,
    LLM_TYPE_410M,
    LLM_TYPE_450M,
    LLM_TYPE_770M,
    LLM_TYPE_780M,
    LLM_TYPE_0_5B,
    LLM_TYPE_1B,
    LLM_TYPE_1_3B,
    LLM_TYPE_1_4B,
    LLM_TYPE_1_5B,
    LLM_TYPE_1_6B,
    LLM_TYPE_2B,
    LLM_TYPE_2_8B,
    LLM_TYPE_3B,
    LLM_TYPE_4B,
    LLM_TYPE_6B,
    LLM_TYPE_6_9B,
    LLM_TYPE_7B,
    LLM_TYPE_8B,
    LLM_TYPE_9B,
    LLM_TYPE_11B,
    LLM_TYPE_12B,
    LLM_TYPE_13B,
    LLM_TYPE_14B,
    LLM_TYPE_15B,
    LLM_TYPE_16B,
    LLM_TYPE_20B,
    LLM_TYPE_30B,
    LLM_TYPE_32B,
    LLM_TYPE_34B,
    LLM_TYPE_35B,
    LLM_TYPE_40B,
    LLM_TYPE_65B,
    LLM_TYPE_70B,
    LLM_TYPE_236B,
    LLM_TYPE_314B,
    LLM_TYPE_671B,
    LLM_TYPE_SMALL,
    LLM_TYPE_MEDIUM,
    LLM_TYPE_LARGE,
    LLM_TYPE_XL,
    LLM_TYPE_A1_7B,
    LLM_TYPE_A2_7B,
    LLM_TYPE_8x7B,
    LLM_TYPE_8x22B,
    LLM_TYPE_16x12B,
    LLM_TYPE_16x3_8B,
    LLM_TYPE_10B_128x3_66B,
    LLM_TYPE_57B_A14B,
    LLM_TYPE_27B,
};
|
|
|
|
// Tensor handles for one "posnet" layer.
//
// Every member is a non-owning-looking raw pointer that defaults to nullptr;
// nothing in this struct allocates or frees the tensors.
// NOTE(review): the `_b` suffix presumably marks the bias paired with the
// tensor of the same name - confirm against the loader.
struct llama_layer_posnet {
    // resnet
    struct ggml_tensor * norm1   = nullptr;
    struct ggml_tensor * norm1_b = nullptr;

    struct ggml_tensor * conv1   = nullptr;
    struct ggml_tensor * conv1_b = nullptr;

    struct ggml_tensor * norm2   = nullptr;
    struct ggml_tensor * norm2_b = nullptr;

    struct ggml_tensor * conv2   = nullptr;
    struct ggml_tensor * conv2_b = nullptr;

    // attention
    struct ggml_tensor * attn_norm   = nullptr;
    struct ggml_tensor * attn_norm_b = nullptr;

    struct ggml_tensor * attn_q   = nullptr;
    struct ggml_tensor * attn_q_b = nullptr;

    struct ggml_tensor * attn_k   = nullptr;
    struct ggml_tensor * attn_k_b = nullptr;

    struct ggml_tensor * attn_v   = nullptr;
    struct ggml_tensor * attn_v_b = nullptr;

    struct ggml_tensor * attn_o   = nullptr;
    struct ggml_tensor * attn_o_b = nullptr;

    // normalize
    struct ggml_tensor * norm   = nullptr;
    struct ggml_tensor * norm_b = nullptr;
};
|
|
|
|
// Tensor handles for one "convnext" layer.
//
// All members are raw pointers defaulting to nullptr; the struct itself does
// no allocation or cleanup.
// NOTE(review): `dw` / `pw1` / `pw2` presumably stand for depthwise and
// pointwise convolutions, and `_b` for the matching bias - confirm.
struct llama_layer_convnext {
    struct ggml_tensor * dw   = nullptr;
    struct ggml_tensor * dw_b = nullptr;

    struct ggml_tensor * norm   = nullptr;
    struct ggml_tensor * norm_b = nullptr;

    struct ggml_tensor * pw1   = nullptr;
    struct ggml_tensor * pw1_b = nullptr;

    struct ggml_tensor * pw2   = nullptr;
    struct ggml_tensor * pw2_b = nullptr;

    struct ggml_tensor * gamma = nullptr;
};
|
|
|
|
struct llama_layer {
|
|
// normalization
|
|
struct ggml_tensor * attn_norm = nullptr;
|
|
struct ggml_tensor * attn_norm_b = nullptr;
|
|
struct ggml_tensor * attn_norm_2 = nullptr;
|
|
struct ggml_tensor * attn_norm_2_b = nullptr;
|
|
struct ggml_tensor * attn_q_norm = nullptr;
|
|
struct ggml_tensor * attn_q_norm_b = nullptr;
|
|
struct ggml_tensor * attn_k_norm = nullptr;
|
|
struct ggml_tensor * attn_k_norm_b = nullptr;
|
|
struct ggml_tensor * attn_out_norm = nullptr;
|
|
struct ggml_tensor * attn_out_norm_b = nullptr;
|
|
struct ggml_tensor * attn_q_a_norm = nullptr;
|
|
struct ggml_tensor * attn_kv_a_norm = nullptr;
|
|
struct ggml_tensor * attn_sub_norm = nullptr;
|
|
struct ggml_tensor * attn_post_norm = nullptr;
|
|
struct ggml_tensor * ffn_sub_norm = nullptr;
|
|
struct ggml_tensor * attn_norm_cross = nullptr;
|
|
struct ggml_tensor * attn_norm_enc = nullptr;
|
|
|
|
// attention
|
|
struct ggml_tensor * wq = nullptr;
|
|
struct ggml_tensor * wk = nullptr;
|
|
struct ggml_tensor * wv = nullptr;
|
|
struct ggml_tensor * wo = nullptr;
|
|
struct ggml_tensor * wqkv = nullptr;
|
|
struct ggml_tensor * wq_a = nullptr;
|
|
struct ggml_tensor * wq_b = nullptr;
|
|
struct ggml_tensor * wkv_a_mqa = nullptr;
|
|
struct ggml_tensor * wkv_b = nullptr;
|
|
struct ggml_tensor * wq_cross = nullptr;
|
|
struct ggml_tensor * wk_cross = nullptr;
|
|
struct ggml_tensor * wv_cross = nullptr;
|
|
struct ggml_tensor * wo_cross = nullptr;
|
|
struct ggml_tensor * wq_enc = nullptr;
|
|
struct ggml_tensor * wk_enc = nullptr;
|
|
struct ggml_tensor * wv_enc = nullptr;
|
|
struct ggml_tensor * wo_enc = nullptr;
|
|
|
|
// attention bias
|
|
struct ggml_tensor * bq = nullptr;
|
|
struct ggml_tensor * bk = nullptr;
|
|
struct ggml_tensor * bv = nullptr;
|
|
struct ggml_tensor * bo = nullptr;
|
|
struct ggml_tensor * bqkv = nullptr;
|
|
|
|
// relative position bias
|
|
struct ggml_tensor * attn_rel_b = nullptr;
|
|
struct ggml_tensor * attn_rel_b_enc = nullptr;
|
|
struct ggml_tensor * attn_rel_b_cross = nullptr;
|
|
|
|
// normalization
|
|
struct ggml_tensor * ffn_norm = nullptr;
|
|
struct ggml_tensor * ffn_norm_b = nullptr;
|
|
struct ggml_tensor * ffn_post_norm = nullptr;
|
|
struct ggml_tensor * layer_out_norm = nullptr;
|
|
struct ggml_tensor * layer_out_norm_b = nullptr;
|
|
struct ggml_tensor * ffn_norm_exps = nullptr;
|
|
struct ggml_tensor * ffn_norm_enc = nullptr;
|
|
|
|
// ff
|
|
struct ggml_tensor * ffn_gate = nullptr; // w1
|
|
struct ggml_tensor * ffn_down = nullptr; // w2
|
|
struct ggml_tensor * ffn_up = nullptr; // w3
|
|
struct ggml_tensor * ffn_gate_enc = nullptr;
|
|
struct ggml_tensor * ffn_down_enc = nullptr;
|
|
struct ggml_tensor * ffn_up_enc = nullptr;
|
|
|
|
// ff MoE
|
|
struct ggml_tensor * ffn_gate_inp = nullptr;
|
|
struct ggml_tensor * ffn_gate_exps = nullptr;
|
|
struct ggml_tensor * ffn_down_exps = nullptr;
|
|
struct ggml_tensor * ffn_up_exps = nullptr;
|
|
|
|
// ff shared expert (shexp)
|
|
struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
|
|
struct ggml_tensor * ffn_gate_shexp = nullptr;
|
|
struct ggml_tensor * ffn_down_shexp = nullptr;
|
|
struct ggml_tensor * ffn_up_shexp = nullptr;
|
|
|
|
// ff bias
|
|
struct ggml_tensor * ffn_gate_b = nullptr;
|
|
struct ggml_tensor * ffn_down_b = nullptr; // b2
|
|
struct ggml_tensor * ffn_up_b = nullptr; // b3
|
|
struct ggml_tensor * ffn_act = nullptr;
|
|
struct ggml_tensor * ffn_exp_probs_b = nullptr;
|
|
|
|
// mamba proj
|
|
struct ggml_tensor * ssm_in = nullptr;
|
|
struct ggml_tensor * ssm_x = nullptr;
|
|
struct ggml_tensor * ssm_dt = nullptr;
|
|
struct ggml_tensor * ssm_out = nullptr;
|
|
|
|
// mamba
|
|
struct ggml_tensor * ssm_conv1d = nullptr;
|
|
struct ggml_tensor * ssm_a = nullptr;
|
|
struct ggml_tensor * ssm_d = nullptr;
|
|
|
|
// mamba bias
|
|
struct ggml_tensor * ssm_conv1d_b = nullptr;
|
|
struct ggml_tensor * ssm_dt_b = nullptr;
|
|
|
|
// rwkv
|
|
struct ggml_tensor * time_mix_w1 = nullptr;
|
|
struct ggml_tensor * time_mix_w2 = nullptr;
|
|
struct ggml_tensor * time_mix_lerp_x = nullptr;
|
|
struct ggml_tensor * time_mix_lerp_w = nullptr;
|
|
struct ggml_tensor * time_mix_lerp_k = nullptr;
|
|
struct ggml_tensor * time_mix_lerp_v = nullptr;
|
|
struct ggml_tensor * time_mix_lerp_r = nullptr;
|
|
struct ggml_tensor * time_mix_lerp_g = nullptr;
|
|
struct ggml_tensor * time_mix_lerp_fused = nullptr;
|
|
|
|
struct ggml_tensor * time_mix_first = nullptr;
|
|
struct ggml_tensor * time_mix_decay = nullptr;
|
|
struct ggml_tensor * time_mix_decay_w1 = nullptr;
|
|
struct ggml_tensor * time_mix_decay_w2 = nullptr;
|
|
struct ggml_tensor * time_mix_key = nullptr;
|
|
struct ggml_tensor * time_mix_key_b = nullptr;
|
|
struct ggml_tensor * time_mix_value = nullptr;
|
|
struct ggml_tensor * time_mix_value_b = nullptr;
|
|
struct ggml_tensor * time_mix_receptance = nullptr;
|
|
struct ggml_tensor * time_mix_receptance_b = nullptr;
|
|
struct ggml_tensor * time_mix_gate = nullptr;
|
|
|
|
struct ggml_tensor * time_mix_ln = nullptr;
|
|
struct ggml_tensor * time_mix_ln_b = nullptr;
|
|
struct ggml_tensor * time_mix_output = nullptr;
|
|
|
|
struct ggml_tensor * channel_mix_lerp_k = nullptr;
|
|
struct ggml_tensor * channel_mix_lerp_r = nullptr;
|
|
|
|
struct ggml_tensor * channel_mix_key = nullptr;
|
|
struct ggml_tensor * channel_mix_receptance = nullptr;
|
|
struct ggml_tensor * channel_mix_value = nullptr;
|
|
|
|
// long rope factors
|
|
struct ggml_tensor * rope_long = nullptr;
|
|
struct ggml_tensor * rope_short = nullptr;
|
|
struct ggml_tensor * rope_freqs = nullptr;
|
|
|
|
// bitnet scale
|
|
struct ggml_tensor * wq_scale = nullptr;
|
|
struct ggml_tensor * wk_scale = nullptr;
|
|
struct ggml_tensor * wv_scale = nullptr;
|
|
struct ggml_tensor * wo_scale = nullptr;
|
|
struct ggml_tensor * ffn_gate_scale = nullptr;
|
|
struct ggml_tensor * ffn_up_scale = nullptr;
|
|
struct ggml_tensor * ffn_down_scale = nullptr;
|
|
|
|
struct llama_layer_posnet posnet;
|
|
|
|
struct llama_layer_convnext convnext;
|
|
};
|
|
|
|
struct llama_model {
|
|
llm_type type = LLM_TYPE_UNKNOWN;
|
|
llm_arch arch = LLM_ARCH_UNKNOWN;
|
|
|
|
std::string name = "n/a";
|
|
|
|
llama_hparams hparams = {};
|
|
llama_vocab vocab;
|
|
|
|
struct ggml_tensor * tok_embd = nullptr;
|
|
struct ggml_tensor * type_embd = nullptr;
|
|
struct ggml_tensor * pos_embd = nullptr;
|
|
struct ggml_tensor * tok_norm = nullptr;
|
|
struct ggml_tensor * tok_norm_b = nullptr;
|
|
|
|
struct ggml_tensor * output_norm = nullptr;
|
|
struct ggml_tensor * output_norm_b = nullptr;
|
|
struct ggml_tensor * output = nullptr;
|
|
struct ggml_tensor * output_b = nullptr;
|
|
struct ggml_tensor * output_norm_enc = nullptr;
|
|
|
|
// classifier
|
|
struct ggml_tensor * cls = nullptr;
|
|
struct ggml_tensor * cls_b = nullptr;
|
|
struct ggml_tensor * cls_out = nullptr;
|
|
struct ggml_tensor * cls_out_b = nullptr;
|
|
|
|
struct ggml_tensor * conv1d = nullptr;
|
|
struct ggml_tensor * conv1d_b = nullptr;
|
|
|
|
std::vector<llama_layer> layers;
|
|
|
|
llama_model_params params;
|
|
|
|
// gguf metadata
|
|
std::unordered_map<std::string, std::string> gguf_kv;
|
|
|
|
std::vector<std::string> rpc_servers;
|
|
|
|
// list of devices used in this model
|
|
std::vector<ggml_backend_dev_t> devices;
|
|
|
|
// for quantize-stats only
|
|
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
|
|
|
|
int64_t t_load_us = 0;
|
|
int64_t t_start_us = 0;
|
|
|
|
explicit llama_model(const struct llama_model_params & params);
|
|
~llama_model();
|
|
|
|
void load_stats (llama_model_loader & ml);
|
|
void load_arch (llama_model_loader & ml);
|
|
void load_hparams(llama_model_loader & ml);
|
|
void load_vocab (llama_model_loader & ml);
|
|
bool load_tensors(llama_model_loader & ml); // returns false if cancelled by progress_callback
|
|
|
|
std::string arch_name() const;
|
|
std::string type_name() const;
|
|
|
|
std::string desc() const;
|
|
|
|
size_t size() const;
|
|
size_t max_nodes() const;
|
|
size_t n_devices() const;
|
|
|
|
// total number of parameters in the model
|
|
uint64_t n_elements() const;
|
|
|
|
void print_info() const;
|
|
|
|
ggml_backend_dev_t dev_layer(int il) const;
|
|
ggml_backend_dev_t dev_output() const;
|
|
|
|
ggml_backend_buffer_type_t select_buft(int il) const;
|
|
|
|
const struct ggml_tensor * get_tensor(const char * name) const;
|
|
|
|
private:
|
|
struct impl;
|
|
std::unique_ptr<impl> pimpl;
|
|
};
|
|
|
|
// Maps an llm_type enumerator to a C string; only declared here, defined elsewhere.
// NOTE(review): presumably a human-readable size label backed by static
// storage - confirm the lifetime of the returned pointer at the definition.
const char * llm_type_name(llm_type type);
|