talk-llama : sync llama.cpp

ggml-ci
This commit is contained in:
Georgi Gerganov 2025-05-01 10:43:30 +03:00
parent 5cd59c9396
commit 0778b6ff5f
13 changed files with 159 additions and 74 deletions

examples/talk-llama/llama-arch.cpp

@@ -19,6 +19,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_REFACT, "refact" },
{ LLM_ARCH_BERT, "bert" },
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+ { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
{ LLM_ARCH_BLOOM, "bloom" },
{ LLM_ARCH_STABLELM, "stablelm" },
@@ -106,6 +107,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
{ LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
{ LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
+ { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
@@ -472,6 +474,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
+ {
+ LLM_ARCH_NOMIC_BERT_MOE,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ },
+ },
{
LLM_ARCH_JINA_BERT_V2,
{

examples/talk-llama/llama-arch.h

@@ -23,6 +23,7 @@ enum llm_arch {
LLM_ARCH_REFACT,
LLM_ARCH_BERT,
LLM_ARCH_NOMIC_BERT,
+ LLM_ARCH_NOMIC_BERT_MOE,
LLM_ARCH_JINA_BERT_V2,
LLM_ARCH_BLOOM,
LLM_ARCH_STABLELM,
@@ -110,6 +111,7 @@ enum llm_kv {
LLM_KV_EXPERT_WEIGHTS_SCALE,
LLM_KV_EXPERT_WEIGHTS_NORM,
LLM_KV_EXPERT_GATING_FUNC,
+ LLM_KV_MOE_EVERY_N_LAYERS,
LLM_KV_POOLING_TYPE,
LLM_KV_LOGIT_SCALE,
LLM_KV_DECODER_START_TOKEN_ID,

examples/talk-llama/llama-chat.cpp

@@ -50,8 +50,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
{ "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
{ "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
- { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
- { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
+ { "chatglm3", LLM_CHAT_TEMPLATE_CHATGLM_3 },
+ { "chatglm4", LLM_CHAT_TEMPLATE_CHATGLM_4 },
{ "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
{ "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
{ "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
@@ -122,6 +122,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
}
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
return LLM_CHAT_TEMPLATE_PHI_3;
+ } else if (tmpl_contains("[gMASK]<sop>")) {
+ return LLM_CHAT_TEMPLATE_CHATGLM_4;
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
} else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
@@ -154,9 +156,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_LLAMA_3;
} else if (tmpl_contains("[gMASK]sop")) {
// chatglm3-6b
- return LLM_CHAT_TEMPLATE_CHATGML_3;
- } else if (tmpl_contains("[gMASK]<sop>")) {
- return LLM_CHAT_TEMPLATE_CHATGML_4;
+ return LLM_CHAT_TEMPLATE_CHATGLM_3;
} else if (tmpl_contains(LU8("<用户>"))) {
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
return LLM_CHAT_TEMPLATE_MINICPM;
@@ -437,7 +437,7 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
}
- } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
+ } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) {
// chatglm3-6b
ss << "[gMASK]" << "sop";
for (auto message : chat) {
@@ -447,7 +447,7 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "<|assistant|>";
}
- } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
+ } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
ss << "[gMASK]" << "<sop>";
for (auto message : chat) {
std::string role(message->role);
@@ -456,14 +456,6 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "<|assistant|>";
}
- } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
- for (auto message : chat) {
- std::string role(message->role);
- ss << "<|" << role << "|>" << "\n" << message->content;
- }
- if (add_ass) {
- ss << "<|assistant|>";
- }
} else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
for (auto message : chat) {

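Note: the detection and application changes above collapse the former glmedge branch into the chatglm4 one. As a standalone illustration (not part of the commit — the struct and function names here are made up for the sketch), the unified branch renders a conversation like this:

#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

struct msg { std::string role; std::string content; };

// mirrors the merged CHATGLM_4 / GLMEDGE branch shown above
static std::string apply_chatglm4(const std::vector<msg> & chat, bool add_ass) {
    std::ostringstream ss;
    ss << "[gMASK]" << "<sop>";
    for (const auto & m : chat) {
        ss << "<|" << m.role << "|>" << "\n" << m.content;
    }
    if (add_ass) {
        ss << "<|assistant|>"; // open the assistant turn for generation
    }
    return ss.str();
}

int main() {
    const std::vector<msg> chat = { { "user", "Hello" } };
    printf("%s\n", apply_chatglm4(chat, true).c_str());
    // prints: [gMASK]<sop><|user|>
    // Hello<|assistant|>
}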
examples/talk-llama/llama-chat.h

@@ -29,8 +29,8 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_DEEPSEEK_3,
LLM_CHAT_TEMPLATE_COMMAND_R,
LLM_CHAT_TEMPLATE_LLAMA_3,
- LLM_CHAT_TEMPLATE_CHATGML_3,
- LLM_CHAT_TEMPLATE_CHATGML_4,
+ LLM_CHAT_TEMPLATE_CHATGLM_3,
+ LLM_CHAT_TEMPLATE_CHATGLM_4,
LLM_CHAT_TEMPLATE_GLMEDGE,
LLM_CHAT_TEMPLATE_MINICPM,
LLM_CHAT_TEMPLATE_EXAONE_3,

examples/talk-llama/llama-context.cpp

@@ -114,7 +114,7 @@ llama_context::llama_context(
}
if (n_ctx_per_seq > hparams.n_ctx_train) {
- LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
+ LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
__func__, n_ctx_per_seq, hparams.n_ctx_train);
}
@@ -469,8 +469,7 @@ ggml_tensor * llama_context::build_rope_shift(
ggml_tensor * shift,
ggml_tensor * factors,
float freq_base,
- float freq_scale,
- ggml_backend_buffer * bbuf) const {
+ float freq_scale) const {
const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
@@ -492,17 +491,7 @@ ggml_tensor * llama_context::build_rope_shift(
// dequantize to f32 -> RoPE -> quantize back
tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32);
- if (bbuf) {
- for (const auto & backend : backends) {
- // Figure out which backend KV cache belongs to
- if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
- ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get());
- break;
- }
- }
- }
- tmp = ggml_rope_ext_inplace(ctx0, tmp,
+ tmp = ggml_rope_ext(ctx0, tmp,
shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
@@ -582,7 +571,7 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
0);
- ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, kv_self->k_l[il]->buffer);
+ ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
ggml_build_forward_expand(gf, cur);
}
@@ -1547,8 +1536,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
// set all ids as invalid (negative)
std::fill(output_ids.begin(), output_ids.end(), -1);
ggml_backend_buffer_clear(buf_output.get(), 0);
- this->n_outputs = 0;
- this->n_outputs_max = n_outputs_max;

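Aside (not from the diff): the reason build_kv_self_shift can patch cached keys in place is that RoPE encodes position as a pure rotation, so shifting a key by d positions costs one extra rotation by d*theta per 2D pair instead of recomputing the key. A minimal numeric check of that identity:

#include <cmath>
#include <cstdio>

// rotate the 2D pair (x, y) by the given angle, as RoPE does per frequency
static void rope2(float & x, float & y, float angle) {
    const float c = std::cos(angle);
    const float s = std::sin(angle);
    const float nx = x*c - y*s;
    const float ny = x*s + y*c;
    x = nx; y = ny;
}

int main() {
    const float theta = 0.1f;
    float ax = 1.0f, ay = 0.0f;
    rope2(ax, ay, 7*theta);  // key cached at position 7
    rope2(ax, ay, -3*theta); // K-shift by -3
    float bx = 1.0f, by = 0.0f;
    rope2(bx, by, 4*theta);  // key encoded directly at position 4
    printf("shifted: (%.6f, %.6f)\n", ax, ay);
    printf("fresh:   (%.6f, %.6f)\n", bx, by); // matches the shifted pair
}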
examples/talk-llama/llama-context.h

@@ -170,8 +170,7 @@ private:
ggml_tensor * shift,
ggml_tensor * factors,
float freq_base,
- float freq_scale,
- ggml_backend_buffer * bbuf) const;
+ float freq_scale) const;
llm_graph_result_ptr build_kv_self_shift(
ggml_context * ctx0,

examples/talk-llama/llama-graph.cpp

@@ -55,7 +55,21 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
if (ubatch->pos && pos) {
const int64_t n_tokens = ubatch->n_tokens;
- ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos));
+ if (ubatch->token && n_pos_per_embd == 4) {
+ // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
+ // the first 3 dims are the same, and the 4th dim is all 0
+ std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
+ // copy the first dimension
+ for (int i = 0; i < n_tokens; ++i) {
+ pos_data[ i] = ubatch->pos[i];
+ pos_data[ n_tokens + i] = ubatch->pos[i];
+ pos_data[2 * n_tokens + i] = ubatch->pos[i];
+ pos_data[3 * n_tokens + i] = 0; // 4th dim is 0
+ }
+ ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
+ } else {
+ ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
+ }
}
}
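A standalone sketch (not part of the commit) of the buffer layout the new branch writes, assuming n_pos_per_embd == 4 as used for M-RoPE: the four position dimensions are stored back to back, each slice n_tokens long.

#include <cstdint>
#include <cstdio>
#include <vector>

using llama_pos = int32_t;

int main() {
    const int n_tokens = 3;
    const llama_pos pos[n_tokens] = {10, 11, 12}; // 1D text positions

    // layout: [dim0 | dim1 | dim2 | dim3], each slice n_tokens long
    std::vector<llama_pos> pos_data(4*n_tokens);
    for (int i = 0; i < n_tokens; ++i) {
        pos_data[0*n_tokens + i] = pos[i]; // three identical dims for text
        pos_data[1*n_tokens + i] = pos[i];
        pos_data[2*n_tokens + i] = pos[i];
        pos_data[3*n_tokens + i] = 0;      // unused 4th dim stays 0
    }

    for (int d = 0; d < 4; ++d) {
        for (int i = 0; i < n_tokens; ++i) {
            printf("%d ", pos_data[d*n_tokens + i]);
        }
        printf("\n");
    }
}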
@@ -71,7 +85,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
) * f_attn_temp_scale + 1.0;
}
- ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
+ ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));
}
}
@@ -592,7 +606,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
res (std::make_unique<llm_graph_result>()) {
}
- int64_t llm_graph_context::n_pos_per_token() const {
+ int64_t llm_graph_context::n_pos_per_embd() const {
return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
}
@@ -803,6 +817,10 @@ ggml_tensor * llm_graph_context::build_ffn(
if (down) {
cur = build_lora_mm(down, cur);
+ if (arch == LLM_ARCH_GLM4) {
+ // GLM4 seems to have numerical issues with half-precision accumulators
+ ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+ }
}
if (down_b) {
@@ -910,28 +928,35 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
cb(up, "ffn_moe_up", il);
- ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
- cb(gate, "ffn_moe_gate", il);
+ ggml_tensor * experts = nullptr;
+ if (gate_exps) {
+ cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+ cb(cur, "ffn_moe_gate", il);
+ } else {
+ cur = up;
+ }
switch (type_op) {
case LLM_FFN_SILU:
{
- gate = ggml_silu(ctx0, gate);
- cb(gate, "ffn_moe_silu", il);
+ cur = ggml_silu(ctx0, cur);
+ cb(cur, "ffn_moe_silu", il);
} break;
case LLM_FFN_GELU:
{
- gate = ggml_gelu(ctx0, gate);
- cb(gate, "ffn_moe_gelu", il);
+ cur = ggml_gelu(ctx0, cur);
+ cb(cur, "ffn_moe_gelu", il);
} break;
default:
GGML_ABORT("fatal error");
}
- ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens]
- cb(par, "ffn_moe_gate_par", il);
+ if (gate_exps) {
+ cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens]
+ cb(cur, "ffn_moe_gate_par", il);
+ }
- ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
+ experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
cb(experts, "ffn_moe_down", il);
if (!weight_before_ffn) {
@@ -1014,11 +1039,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
}
ggml_tensor * llm_graph_context::build_inp_pos() const {
- auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token());
+ auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd());
auto & cur = inp->pos;
- cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
+ cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd());
ggml_set_input(cur);
res->add_input(std::move(inp));
@@ -1027,11 +1052,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
}
ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
- auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+ auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
auto & cur = inp->attn_scale;
- cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
+ // this needs to be 1x1xN for broadcasting
+ cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
ggml_set_input(cur);
res->add_input(std::move(inp));

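Context for the build_moe_ffn change above (a scalar sketch, not from the commit): nomic-bert-moe experts carry only up/down projections, so when gate_exps is null the activation is applied directly to the up projection; when a gate is present, act(gate) is multiplied elementwise with up. Scalar stand-ins replace the ggml tensor ops here:

#include <cmath>
#include <cstdio>

static float silu(float x) { return x / (1.0f + std::exp(-x)); }
static float gelu(float x) { // tanh approximation
    return 0.5f * x * (1.0f + std::tanh(0.7978845608f * (x + 0.044715f * x*x*x)));
}

// one element of the expert FFN body, mirroring the branch structure above
static float moe_ffn_elem(float up, const float * gate, bool use_gelu) {
    float cur = gate ? *gate : up;          // cur = gate_exps ? gate : up
    cur = use_gelu ? gelu(cur) : silu(cur); // activation
    if (gate) {
        cur *= up;                          // gated case: act(gate) * up
    }
    return cur;                             // then goes through down_exps
}

int main() {
    const float up = 1.5f, gate = -0.3f;
    printf("gated (silu):   %.6f\n", moe_ffn_elem(up, &gate, false));
    printf("ungated (gelu): %.6f\n", moe_ffn_elem(up, nullptr, true));
}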
examples/talk-llama/llama-graph.h

@@ -90,29 +90,27 @@ public:
class llm_graph_input_pos : public llm_graph_input_i {
public:
- llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {}
+ llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
virtual ~llm_graph_input_pos() = default;
void set_input(const llama_ubatch * ubatch) override;
ggml_tensor * pos = nullptr; // I32 [n_batch]
- const int64_t n_pos_per_token = 1;
+ const int64_t n_pos_per_embd = 1;
};
// temperature tuning, used by llama4
class llm_graph_input_attn_temp : public llm_graph_input_i {
public:
- llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
- : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+ llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
+ : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
virtual ~llm_graph_input_attn_temp() = default;
void set_input(const llama_ubatch * ubatch) override;
ggml_tensor * attn_scale = nullptr; // F32 [n_batch]
- const int64_t n_pos_per_token = 1;
const uint32_t n_attn_temp_floor_scale;
const float f_attn_temp_scale;
};
@@ -419,7 +417,7 @@ struct llm_graph_context {
llm_graph_context(const llm_graph_params & params);
- int64_t n_pos_per_token() const;
+ int64_t n_pos_per_embd() const;
void cb(ggml_tensor * cur, const char * name, int il) const;

examples/talk-llama/llama-hparams.h

@@ -66,6 +66,7 @@ struct llama_hparams {
float expert_weights_scale = 0.0;
bool expert_weights_norm = false;
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
+ uint32_t moe_every_n_layers = 0;
float f_norm_eps;
float f_norm_rms_eps;

examples/talk-llama/llama-model.cpp

@@ -40,14 +40,17 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_335M: return "335M";
case LLM_TYPE_410M: return "410M";
case LLM_TYPE_450M: return "450M";
+ case LLM_TYPE_475M: return "475M";
case LLM_TYPE_770M: return "770M";
case LLM_TYPE_780M: return "780M";
case LLM_TYPE_0_5B: return "0.5B";
+ case LLM_TYPE_0_6B: return "0.6B";
case LLM_TYPE_1B: return "1B";
case LLM_TYPE_1_3B: return "1.3B";
case LLM_TYPE_1_4B: return "1.4B";
case LLM_TYPE_1_5B: return "1.5B";
case LLM_TYPE_1_6B: return "1.6B";
+ case LLM_TYPE_1_7B: return "1.7B";
case LLM_TYPE_1_8B: return "1.8B";
case LLM_TYPE_2B: return "2B";
case LLM_TYPE_2_8B: return "2.8B";
@@ -66,6 +69,7 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_15B: return "15B";
case LLM_TYPE_16B: return "16B";
case LLM_TYPE_20B: return "20B";
+ case LLM_TYPE_27B: return "27B";
case LLM_TYPE_30B: return "30B";
case LLM_TYPE_32B: return "32B";
case LLM_TYPE_34B: return "34B";
@@ -74,6 +78,7 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_65B: return "65B";
case LLM_TYPE_70B: return "70B";
case LLM_TYPE_236B: return "236B";
+ case LLM_TYPE_290B: return "290B";
case LLM_TYPE_314B: return "314B";
case LLM_TYPE_671B: return "671B";
case LLM_TYPE_SMALL: return "0.1B";
@@ -88,10 +93,10 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_16x3_8B: return "16x3.8B";
case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
case LLM_TYPE_57B_A14B: return "57B.A14B";
- case LLM_TYPE_27B: return "27B";
- case LLM_TYPE_290B: return "290B";
case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
+ case LLM_TYPE_30B_A3B: return "30B.A3B";
+ case LLM_TYPE_235B_A22B: return "235B.A22B";
default: return "?B";
}
}
@@ -695,13 +700,19 @@ void llama_model::load_hparams(llama_model_loader & ml) {
}
} break;
case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
+ if (arch == LLM_ARCH_NOMIC_BERT) {
type = LLM_TYPE_137M;
+ } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
+ type = LLM_TYPE_475M;
+ }
}
} break;
case LLM_ARCH_BLOOM:
@@ -791,6 +802,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
+ case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
+ case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
+ case 40: type = LLM_TYPE_14B; break;
+ case 64: type = LLM_TYPE_32B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
@@ -800,6 +815,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
+ case 48: type = LLM_TYPE_30B_A3B; break;
+ case 94: type = LLM_TYPE_235B_A22B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
@@ -2057,6 +2074,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
} break;
case LLM_ARCH_BERT:
case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
@@ -2090,21 +2108,32 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
}
+ if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+ }
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
+ if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ } else {
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- if (arch == LLM_ARCH_BERT) {
+ if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
} else {
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
}
+ }
}
layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
@@ -5730,6 +5759,11 @@ struct llm_build_bert : public llm_graph_context {
cur = build_lora_mm(model.layers[il].wqkv, cur);
cb(cur, "wqkv", il);
+ if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+ }
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
@@ -5782,13 +5816,29 @@ struct llm_build_bert : public llm_graph_context {
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
- if (model.arch == LLM_ARCH_BERT) {
+ if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
+ // MoE branch
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ nullptr,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ hparams.n_expert,
+ hparams.n_expert_used,
+ LLM_FFN_GELU,
+ false, false,
+ 0.0f,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
+ cb(cur, "ffn_moe_out", il);
+ } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
NULL, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
@@ -5796,6 +5846,7 @@ struct llm_build_bert : public llm_graph_context {
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
} else {
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
@@ -5803,8 +5854,8 @@ struct llm_build_bert : public llm_graph_context {
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
- }
cb(cur, "ffn_out", il);
+ }
// attentions bypass the intermediate layer
cur = ggml_add(ctx0, cur, ffn_inp);
@@ -12842,6 +12893,7 @@ llm_graph_result_ptr llama_model::build_graph(
case LLM_ARCH_BERT:
case LLM_ARCH_JINA_BERT_V2:
case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
{
llm = std::make_unique<llm_build_bert>(*this, params, gf);
} break;
@@ -13200,6 +13252,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_DBRX:
case LLM_ARCH_BERT:
case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
case LLM_ARCH_STABLELM:
case LLM_ARCH_BITNET:
case LLM_ARCH_QWEN:

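The tensor-loading and graph-building code above select MoE layers with the same test, i % moe_every_n_layers == 1. A tiny sketch (not from the commit) of which layers that picks for the 12-layer, moe_every_n_layers == 2 configuration that maps to LLM_TYPE_475M:

#include <cstdio>

int main() {
    const unsigned moe_every_n_layers = 2; // nomic-bert-moe setting
    const int n_layer = 12;
    for (int i = 0; i < n_layer; ++i) {
        // same predicate as load_tensors() and llm_build_bert above
        const bool is_moe = moe_every_n_layers > 0 && i % moe_every_n_layers == 1;
        printf("layer %2d: %s\n", i, is_moe ? "MoE FFN" : "dense FFN");
    }
    // odd layers 1, 3, 5, ... use the expert FFN; even layers stay dense
}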
examples/talk-llama/llama-model.h

@@ -36,14 +36,17 @@ enum llm_type {
LLM_TYPE_335M,
LLM_TYPE_410M,
LLM_TYPE_450M,
+ LLM_TYPE_475M,
LLM_TYPE_770M,
LLM_TYPE_780M,
LLM_TYPE_0_5B,
+ LLM_TYPE_0_6B,
LLM_TYPE_1B,
LLM_TYPE_1_3B,
LLM_TYPE_1_4B,
LLM_TYPE_1_5B,
LLM_TYPE_1_6B,
+ LLM_TYPE_1_7B,
LLM_TYPE_1_8B,
LLM_TYPE_2B,
LLM_TYPE_2_8B,
@@ -62,6 +65,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
+ LLM_TYPE_27B,
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_34B,
@@ -70,6 +74,7 @@ enum llm_type {
LLM_TYPE_65B,
LLM_TYPE_70B,
LLM_TYPE_236B,
+ LLM_TYPE_290B,
LLM_TYPE_314B,
LLM_TYPE_671B,
LLM_TYPE_SMALL,
@@ -84,10 +89,10 @@ enum llm_type {
LLM_TYPE_16x3_8B,
LLM_TYPE_10B_128x3_66B,
LLM_TYPE_57B_A14B,
- LLM_TYPE_27B,
- LLM_TYPE_290B,
LLM_TYPE_17B_16E, // llama4 Scout
LLM_TYPE_17B_128E, // llama4 Maverick
+ LLM_TYPE_30B_A3B,
+ LLM_TYPE_235B_A22B,
};
struct llama_layer_posnet {

examples/talk-llama/llama-sampling.cpp

@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
// }
if (k <= 0) {
- k = cur_p->size;
+ return;
}
k = std::min(k, (int) cur_p->size);
@@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
}
cur_p->sorted = true;
}
+ cur_p->size = k;
}

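A self-contained sketch (simplified to a plain logits vector rather than llama_token_data_array) of the corrected semantics above: k <= 0 now returns early and leaves the candidate list untouched, where the old code reset k to the full vocabulary size.

#include <algorithm>
#include <cstdio>
#include <functional>
#include <vector>

static void top_k(std::vector<float> & logits, int k) {
    if (k <= 0) {
        return; // noop, matching the new early return above
    }
    k = std::min<int>(k, (int) logits.size());
    std::partial_sort(logits.begin(), logits.begin() + k, logits.end(),
                      std::greater<float>());
    logits.resize(k); // cur_p->size = k now only runs for k > 0
}

int main() {
    std::vector<float> a = {0.1f, 2.3f, -1.0f, 0.7f};
    top_k(a, 2);
    printf("k=2 keeps %zu candidates, best %.1f\n", a.size(), a[0]); // 2, 2.3

    std::vector<float> b = {0.1f, 2.3f};
    top_k(b, 0); // noop
    printf("k=0 keeps %zu candidates\n", b.size()); // still 2
}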
examples/talk-llama/llama.h

@@ -1232,6 +1232,7 @@ extern "C" {
"will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+ /// Setting k <= 0 makes this a noop
LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
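A hedged usage sketch of the declaration above, wiring top-k into a sampler chain via the public llama.h API (the chain and dist calls are the standard ones, not part of this diff):

#include "llama.h"

// build a sampler chain that keeps the best 40 candidates, then samples
llama_sampler * make_chain() {
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
    return chain; // release with llama_sampler_free(chain)
}
// with k <= 0 the top-k stage now passes the candidates through unchanged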