From 0778b6ff5fb86439a4aeaec32359416413ecaa2d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 1 May 2025 10:43:30 +0300 Subject: [PATCH] talk-llama : sync llama.cpp ggml-ci --- examples/talk-llama/llama-arch.cpp | 20 +++++++ examples/talk-llama/llama-arch.h | 2 + examples/talk-llama/llama-chat.cpp | 22 +++----- examples/talk-llama/llama-chat.h | 4 +- examples/talk-llama/llama-context.cpp | 21 ++----- examples/talk-llama/llama-context.h | 3 +- examples/talk-llama/llama-graph.cpp | 58 +++++++++++++------ examples/talk-llama/llama-graph.h | 12 ++-- examples/talk-llama/llama-hparams.h | 1 + examples/talk-llama/llama-model.cpp | 77 ++++++++++++++++++++++---- examples/talk-llama/llama-model.h | 9 ++- examples/talk-llama/llama-sampling.cpp | 3 +- examples/talk-llama/llama.h | 1 + 13 files changed, 159 insertions(+), 74 deletions(-) diff --git a/examples/talk-llama/llama-arch.cpp b/examples/talk-llama/llama-arch.cpp index 62e1480b..f2bc8ca7 100644 --- a/examples/talk-llama/llama-arch.cpp +++ b/examples/talk-llama/llama-arch.cpp @@ -19,6 +19,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_REFACT, "refact" }, { LLM_ARCH_BERT, "bert" }, { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, + { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" }, { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" }, { LLM_ARCH_BLOOM, "bloom" }, { LLM_ARCH_STABLELM, "stablelm" }, @@ -106,6 +107,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" }, { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" }, { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" }, + { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" }, { LLM_KV_POOLING_TYPE, "%s.pooling_type" }, { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" }, @@ -472,6 +474,24 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_NOMIC_BERT_MOE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_TOKEN_TYPES, "token_types" }, + { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + }, + }, { LLM_ARCH_JINA_BERT_V2, { diff --git a/examples/talk-llama/llama-arch.h b/examples/talk-llama/llama-arch.h index 98ca00a1..41a023da 100644 --- a/examples/talk-llama/llama-arch.h +++ b/examples/talk-llama/llama-arch.h @@ -23,6 +23,7 @@ enum llm_arch { LLM_ARCH_REFACT, LLM_ARCH_BERT, LLM_ARCH_NOMIC_BERT, + LLM_ARCH_NOMIC_BERT_MOE, LLM_ARCH_JINA_BERT_V2, LLM_ARCH_BLOOM, LLM_ARCH_STABLELM, @@ -110,6 +111,7 @@ enum llm_kv { LLM_KV_EXPERT_WEIGHTS_SCALE, LLM_KV_EXPERT_WEIGHTS_NORM, LLM_KV_EXPERT_GATING_FUNC, + LLM_KV_MOE_EVERY_N_LAYERS, LLM_KV_POOLING_TYPE, LLM_KV_LOGIT_SCALE, LLM_KV_DECODER_START_TOKEN_ID, diff --git a/examples/talk-llama/llama-chat.cpp b/examples/talk-llama/llama-chat.cpp index 41f89e3a..735d2619 100644 --- a/examples/talk-llama/llama-chat.cpp +++ b/examples/talk-llama/llama-chat.cpp @@ -50,8 +50,8 @@ static const std::map LLM_CHAT_TEMPLATES = { { "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 }, { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R }, { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 }, - { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 }, - { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 }, + { "chatglm3", LLM_CHAT_TEMPLATE_CHATGLM_3 }, + { "chatglm4", LLM_CHAT_TEMPLATE_CHATGLM_4 }, { "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE }, { "minicpm", LLM_CHAT_TEMPLATE_MINICPM }, { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 }, @@ -122,6 +122,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { } } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) { return LLM_CHAT_TEMPLATE_PHI_3; + } else if (tmpl_contains("[gMASK]")) { + return LLM_CHAT_TEMPLATE_CHATGLM_4; } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) { return tmpl_contains("") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE; } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) { @@ -154,9 +156,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { return LLM_CHAT_TEMPLATE_LLAMA_3; } else if (tmpl_contains("[gMASK]sop")) { // chatglm3-6b - return LLM_CHAT_TEMPLATE_CHATGML_3; - } else if (tmpl_contains("[gMASK]")) { - return LLM_CHAT_TEMPLATE_CHATGML_4; + return LLM_CHAT_TEMPLATE_CHATGLM_3; } else if (tmpl_contains(LU8("<用户>"))) { // MiniCPM-3B-OpenHermes-2.5-v2-GGUF return LLM_CHAT_TEMPLATE_MINICPM; @@ -437,7 +437,7 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|start_header_id|>assistant<|end_header_id|>\n\n"; } - } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) { + } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) { // chatglm3-6b ss << "[gMASK]" << "sop"; for (auto message : chat) { @@ -447,7 +447,7 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|assistant|>"; } - } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) { + } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) { ss << "[gMASK]" << ""; for (auto message : chat) { std::string role(message->role); @@ -456,14 +456,6 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|assistant|>"; } - } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) { - for (auto message : chat) { - std::string role(message->role); - ss << "<|" << role << "|>" << "\n" << message->content; - } - if (add_ass) { - ss << "<|assistant|>"; - } } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) { // MiniCPM-3B-OpenHermes-2.5-v2-GGUF for (auto message : chat) { diff --git a/examples/talk-llama/llama-chat.h b/examples/talk-llama/llama-chat.h index dc30df71..3f584346 100644 --- a/examples/talk-llama/llama-chat.h +++ b/examples/talk-llama/llama-chat.h @@ -29,8 +29,8 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_DEEPSEEK_3, LLM_CHAT_TEMPLATE_COMMAND_R, LLM_CHAT_TEMPLATE_LLAMA_3, - LLM_CHAT_TEMPLATE_CHATGML_3, - LLM_CHAT_TEMPLATE_CHATGML_4, + LLM_CHAT_TEMPLATE_CHATGLM_3, + LLM_CHAT_TEMPLATE_CHATGLM_4, LLM_CHAT_TEMPLATE_GLMEDGE, LLM_CHAT_TEMPLATE_MINICPM, LLM_CHAT_TEMPLATE_EXAONE_3, diff --git a/examples/talk-llama/llama-context.cpp b/examples/talk-llama/llama-context.cpp index 983385f8..5a2eef9b 100644 --- a/examples/talk-llama/llama-context.cpp +++ b/examples/talk-llama/llama-context.cpp @@ -114,7 +114,7 @@ llama_context::llama_context( } if (n_ctx_per_seq > hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", + LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", __func__, n_ctx_per_seq, hparams.n_ctx_train); } @@ -469,8 +469,7 @@ ggml_tensor * llama_context::build_rope_shift( ggml_tensor * shift, ggml_tensor * factors, float freq_base, - float freq_scale, - ggml_backend_buffer * bbuf) const { + float freq_scale) const { const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; const auto & yarn_ext_factor = cparams.yarn_ext_factor; @@ -492,17 +491,7 @@ ggml_tensor * llama_context::build_rope_shift( // dequantize to f32 -> RoPE -> quantize back tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32); - if (bbuf) { - for (const auto & backend : backends) { - // Figure out which backend KV cache belongs to - if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) { - ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get()); - break; - } - } - } - - tmp = ggml_rope_ext_inplace(ctx0, tmp, + tmp = ggml_rope_ext(ctx0, tmp, shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); @@ -582,7 +571,7 @@ llm_graph_result_ptr llama_context::build_kv_self_shift( ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), 0); - ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, kv_self->k_l[il]->buffer); + ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l); ggml_build_forward_expand(gf, cur); } @@ -1547,8 +1536,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { // set all ids as invalid (negative) std::fill(output_ids.begin(), output_ids.end(), -1); - ggml_backend_buffer_clear(buf_output.get(), 0); - this->n_outputs = 0; this->n_outputs_max = n_outputs_max; diff --git a/examples/talk-llama/llama-context.h b/examples/talk-llama/llama-context.h index 04facb54..5457f077 100644 --- a/examples/talk-llama/llama-context.h +++ b/examples/talk-llama/llama-context.h @@ -170,8 +170,7 @@ private: ggml_tensor * shift, ggml_tensor * factors, float freq_base, - float freq_scale, - ggml_backend_buffer * bbuf) const; + float freq_scale) const; llm_graph_result_ptr build_kv_self_shift( ggml_context * ctx0, diff --git a/examples/talk-llama/llama-graph.cpp b/examples/talk-llama/llama-graph.cpp index a85e9728..fabb9ca2 100644 --- a/examples/talk-llama/llama-graph.cpp +++ b/examples/talk-llama/llama-graph.cpp @@ -55,7 +55,21 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) { if (ubatch->pos && pos) { const int64_t n_tokens = ubatch->n_tokens; - ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos)); + if (ubatch->token && n_pos_per_embd == 4) { + // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D + // the 3 first dims are the same, and 4th dim is all 0 + std::vector pos_data(n_tokens*n_pos_per_embd); + // copy the first dimension + for (int i = 0; i < n_tokens; ++i) { + pos_data[ i] = ubatch->pos[i]; + pos_data[ n_tokens + i] = ubatch->pos[i]; + pos_data[2 * n_tokens + i] = ubatch->pos[i]; + pos_data[3 * n_tokens + i] = 0; // 4th dim is 0 + } + ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos)); + } else { + ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos)); + } } } @@ -71,7 +85,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) { ) * f_attn_temp_scale + 1.0; } - ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale)); + ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale)); } } @@ -592,7 +606,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : res (std::make_unique()) { } -int64_t llm_graph_context::n_pos_per_token() const { +int64_t llm_graph_context::n_pos_per_embd() const { return arch == LLM_ARCH_QWEN2VL ? 4 : 1; } @@ -803,6 +817,10 @@ ggml_tensor * llm_graph_context::build_ffn( if (down) { cur = build_lora_mm(down, cur); + if (arch == LLM_ARCH_GLM4) { + // GLM4 seems to have numerical issues with half-precision accumulators + ggml_mul_mat_set_prec(cur, GGML_PREC_F32); + } } if (down_b) { @@ -910,28 +928,35 @@ ggml_tensor * llm_graph_context::build_moe_ffn( ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] cb(up, "ffn_moe_up", il); - ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] - cb(gate, "ffn_moe_gate", il); + ggml_tensor * experts = nullptr; + if (gate_exps) { + cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(cur, "ffn_moe_gate", il); + } else { + cur = up; + } switch (type_op) { case LLM_FFN_SILU: { - gate = ggml_silu(ctx0, gate); - cb(gate, "ffn_moe_silu", il); + cur = ggml_silu(ctx0, cur); + cb(cur, "ffn_moe_silu", il); } break; case LLM_FFN_GELU: { - gate = ggml_gelu(ctx0, gate); - cb(gate, "ffn_moe_gelu", il); + cur = ggml_gelu(ctx0, cur); + cb(cur, "ffn_moe_gelu", il); } break; default: GGML_ABORT("fatal error"); } - ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens] - cb(par, "ffn_moe_gate_par", il); + if (gate_exps) { + cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens] + cb(cur, "ffn_moe_gate_par", il); + } - ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] + experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens] cb(experts, "ffn_moe_down", il); if (!weight_before_ffn) { @@ -1014,11 +1039,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { } ggml_tensor * llm_graph_context::build_inp_pos() const { - auto inp = std::make_unique(n_pos_per_token()); + auto inp = std::make_unique(n_pos_per_embd()); auto & cur = inp->pos; - cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); + cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd()); ggml_set_input(cur); res->add_input(std::move(inp)); @@ -1027,11 +1052,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const { } ggml_tensor * llm_graph_context::build_inp_attn_scale() const { - auto inp = std::make_unique(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale); + auto inp = std::make_unique(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale); auto & cur = inp->attn_scale; - cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token()); + // this need to be 1x1xN for broadcasting + cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens); ggml_set_input(cur); res->add_input(std::move(inp)); diff --git a/examples/talk-llama/llama-graph.h b/examples/talk-llama/llama-graph.h index d192dc14..d0c8d321 100644 --- a/examples/talk-llama/llama-graph.h +++ b/examples/talk-llama/llama-graph.h @@ -90,29 +90,27 @@ public: class llm_graph_input_pos : public llm_graph_input_i { public: - llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {} + llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {} virtual ~llm_graph_input_pos() = default; void set_input(const llama_ubatch * ubatch) override; ggml_tensor * pos = nullptr; // I32 [n_batch] - const int64_t n_pos_per_token = 1; + const int64_t n_pos_per_embd = 1; }; // temperature tuning, used by llama4 class llm_graph_input_attn_temp : public llm_graph_input_i { public: - llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale) - : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {} + llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale) + : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {} virtual ~llm_graph_input_attn_temp() = default; void set_input(const llama_ubatch * ubatch) override; ggml_tensor * attn_scale = nullptr; // F32 [n_batch] - const int64_t n_pos_per_token = 1; - const uint32_t n_attn_temp_floor_scale; const float f_attn_temp_scale; }; @@ -419,7 +417,7 @@ struct llm_graph_context { llm_graph_context(const llm_graph_params & params); - int64_t n_pos_per_token() const; + int64_t n_pos_per_embd() const; void cb(ggml_tensor * cur, const char * name, int il) const; diff --git a/examples/talk-llama/llama-hparams.h b/examples/talk-llama/llama-hparams.h index 80fcd65d..7ee6a5b7 100644 --- a/examples/talk-llama/llama-hparams.h +++ b/examples/talk-llama/llama-hparams.h @@ -66,6 +66,7 @@ struct llama_hparams { float expert_weights_scale = 0.0; bool expert_weights_norm = false; uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE; + uint32_t moe_every_n_layers = 0; float f_norm_eps; float f_norm_rms_eps; diff --git a/examples/talk-llama/llama-model.cpp b/examples/talk-llama/llama-model.cpp index 6b7bfecf..51092a12 100644 --- a/examples/talk-llama/llama-model.cpp +++ b/examples/talk-llama/llama-model.cpp @@ -40,14 +40,17 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_335M: return "335M"; case LLM_TYPE_410M: return "410M"; case LLM_TYPE_450M: return "450M"; + case LLM_TYPE_475M: return "475M"; case LLM_TYPE_770M: return "770M"; case LLM_TYPE_780M: return "780M"; case LLM_TYPE_0_5B: return "0.5B"; + case LLM_TYPE_0_6B: return "0.6B"; case LLM_TYPE_1B: return "1B"; case LLM_TYPE_1_3B: return "1.3B"; case LLM_TYPE_1_4B: return "1.4B"; case LLM_TYPE_1_5B: return "1.5B"; case LLM_TYPE_1_6B: return "1.6B"; + case LLM_TYPE_1_7B: return "1.7B"; case LLM_TYPE_1_8B: return "1.8B"; case LLM_TYPE_2B: return "2B"; case LLM_TYPE_2_8B: return "2.8B"; @@ -66,6 +69,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_15B: return "15B"; case LLM_TYPE_16B: return "16B"; case LLM_TYPE_20B: return "20B"; + case LLM_TYPE_27B: return "27B"; case LLM_TYPE_30B: return "30B"; case LLM_TYPE_32B: return "32B"; case LLM_TYPE_34B: return "34B"; @@ -74,6 +78,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_65B: return "65B"; case LLM_TYPE_70B: return "70B"; case LLM_TYPE_236B: return "236B"; + case LLM_TYPE_290B: return "290B"; case LLM_TYPE_314B: return "314B"; case LLM_TYPE_671B: return "671B"; case LLM_TYPE_SMALL: return "0.1B"; @@ -88,10 +93,10 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_16x3_8B: return "16x3.8B"; case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B"; case LLM_TYPE_57B_A14B: return "57B.A14B"; - case LLM_TYPE_27B: return "27B"; - case LLM_TYPE_290B: return "290B"; case LLM_TYPE_17B_16E: return "17Bx16E (Scout)"; case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)"; + case LLM_TYPE_30B_A3B: return "30B.A3B"; + case LLM_TYPE_235B_A22B: return "235B.A22B"; default: return "?B"; } } @@ -695,13 +700,19 @@ void llama_model::load_hparams(llama_model_loader & ml) { } } break; case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); + ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0); if (hparams.n_layer == 12 && hparams.n_embd == 768) { - type = LLM_TYPE_137M; + if (arch == LLM_ARCH_NOMIC_BERT) { + type = LLM_TYPE_137M; + } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) { + type = LLM_TYPE_475M; + } } } break; case LLM_ARCH_BLOOM: @@ -791,6 +802,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { + case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break; + case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break; + case 40: type = LLM_TYPE_14B; break; + case 64: type = LLM_TYPE_32B; break; default: type = LLM_TYPE_UNKNOWN; } } break; @@ -800,6 +815,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { + case 48: type = LLM_TYPE_30B_A3B; break; + case 94: type = LLM_TYPE_235B_A22B; break; default: type = LLM_TYPE_UNKNOWN; } } break; @@ -2057,6 +2074,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } break; case LLM_ARCH_BERT: case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); @@ -2090,20 +2108,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0); } + if (arch == LLM_ARCH_NOMIC_BERT_MOE) { + layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0); + } + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); - - if (arch == LLM_ARCH_BERT) { + if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) { layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); - layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0); - layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0); + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); } else { - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); + + if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) { + layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0); + layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0); + } else { + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + } } layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0); @@ -5730,6 +5759,11 @@ struct llm_build_bert : public llm_graph_context { cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); + if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) { + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + } + Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); @@ -5782,13 +5816,29 @@ struct llm_build_bert : public llm_graph_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - if (model.arch == LLM_ARCH_BERT) { + if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) { + // MoE branch + cur = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + nullptr, + model.layers[il].ffn_down_exps, + nullptr, + hparams.n_expert, + hparams.n_expert_used, + LLM_FFN_GELU, + false, false, + 0.0f, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); + cb(cur, "ffn_moe_out", il); + } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) { cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, @@ -5796,6 +5846,7 @@ struct llm_build_bert : public llm_graph_context { model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); } else { cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, @@ -5803,8 +5854,8 @@ struct llm_build_bert : public llm_graph_context { model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); } - cb(cur, "ffn_out", il); // attentions bypass the intermediate layer cur = ggml_add(ctx0, cur, ffn_inp); @@ -12842,6 +12893,7 @@ llm_graph_result_ptr llama_model::build_graph( case LLM_ARCH_BERT: case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: { llm = std::make_unique(*this, params, gf); } break; @@ -13200,6 +13252,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_DBRX: case LLM_ARCH_BERT: case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: case LLM_ARCH_STABLELM: case LLM_ARCH_BITNET: case LLM_ARCH_QWEN: diff --git a/examples/talk-llama/llama-model.h b/examples/talk-llama/llama-model.h index fd82d106..34aac337 100644 --- a/examples/talk-llama/llama-model.h +++ b/examples/talk-llama/llama-model.h @@ -36,14 +36,17 @@ enum llm_type { LLM_TYPE_335M, LLM_TYPE_410M, LLM_TYPE_450M, + LLM_TYPE_475M, LLM_TYPE_770M, LLM_TYPE_780M, LLM_TYPE_0_5B, + LLM_TYPE_0_6B, LLM_TYPE_1B, LLM_TYPE_1_3B, LLM_TYPE_1_4B, LLM_TYPE_1_5B, LLM_TYPE_1_6B, + LLM_TYPE_1_7B, LLM_TYPE_1_8B, LLM_TYPE_2B, LLM_TYPE_2_8B, @@ -62,6 +65,7 @@ enum llm_type { LLM_TYPE_15B, LLM_TYPE_16B, LLM_TYPE_20B, + LLM_TYPE_27B, LLM_TYPE_30B, LLM_TYPE_32B, LLM_TYPE_34B, @@ -70,6 +74,7 @@ enum llm_type { LLM_TYPE_65B, LLM_TYPE_70B, LLM_TYPE_236B, + LLM_TYPE_290B, LLM_TYPE_314B, LLM_TYPE_671B, LLM_TYPE_SMALL, @@ -84,10 +89,10 @@ enum llm_type { LLM_TYPE_16x3_8B, LLM_TYPE_10B_128x3_66B, LLM_TYPE_57B_A14B, - LLM_TYPE_27B, - LLM_TYPE_290B, LLM_TYPE_17B_16E, // llama4 Scout LLM_TYPE_17B_128E, // llama4 Maverick + LLM_TYPE_30B_A3B, + LLM_TYPE_235B_A22B, }; struct llama_layer_posnet { diff --git a/examples/talk-llama/llama-sampling.cpp b/examples/talk-llama/llama-sampling.cpp index d1497985..c0a5f934 100644 --- a/examples/talk-llama/llama-sampling.cpp +++ b/examples/talk-llama/llama-sampling.cpp @@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) // } if (k <= 0) { - k = cur_p->size; + return; } k = std::min(k, (int) cur_p->size); @@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) } cur_p->sorted = true; } + cur_p->size = k; } diff --git a/examples/talk-llama/llama.h b/examples/talk-llama/llama.h index a13350e1..06c56395 100644 --- a/examples/talk-llama/llama.h +++ b/examples/talk-llama/llama.h @@ -1232,6 +1232,7 @@ extern "C" { "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)"); /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 + /// Setting k <= 0 makes this a noop LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k); /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751