diff --git a/Makefile b/Makefile
index 0ec85bc3..4392980b 100644
--- a/Makefile
+++ b/Makefile
@@ -8,7 +8,7 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=c05e8c9934f94fde49bc1bc9dc51eed282605150
+CPPLLAMA_VERSION?=924518e2e5726e81f3aeb2518fb85963a500e93a
 
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp
index 7632aebc..f0a16ffa 100644
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -428,6 +428,7 @@ struct llama_server_context
 {
     llama_model *model = nullptr;
     llama_context *ctx = nullptr;
+    const llama_vocab * vocab = nullptr;
 
     clip_ctx *clp_ctx = nullptr;
 
@@ -439,6 +440,7 @@ struct llama_server_context
     bool clean_kv_cache = true;
     bool all_slots_are_idle = false;
     bool add_bos_token = true;
+    bool has_eos_token = true;
 
     int32_t n_ctx; // total context for all clients / slots
 
@@ -502,7 +504,7 @@ struct llama_server_context
 
         if (multimodal) {
             const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
-            const int n_embd_llm = llama_n_embd(model);
+            const int n_embd_llm = llama_model_n_embd(model);
             if (n_embd_clip != n_embd_llm) {
                 LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
                 llama_free(ctx);
@@ -511,23 +513,15 @@ struct llama_server_context
             }
         }
 
+        vocab = llama_model_get_vocab(model);
         n_ctx = llama_n_ctx(ctx);
 
-        add_bos_token = llama_add_bos_token(model);
+        add_bos_token = llama_vocab_get_add_bos(vocab);
+        has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
 
         return true;
     }
 
-    void validate_model_chat_template(server_params & sparams) {
-        llama_chat_message chat[] = {{"user", "test"}};
-        std::vector<char> buf(1);
-        int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
-        if (res < 0) {
-            LOG_ERR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", __func__);
-            sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
-        }
-    }
-
     llama_client_slot* get_active_slot() {
         for (llama_client_slot& slot : slots) {
             // Check if the slot is currently processing
@@ -725,8 +719,8 @@ struct llama_server_context
             slot->prompt = "";
         }
 
-        if (json_value(data, "ignore_eos", false)) {
-            slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
+        if (json_value(data, "ignore_eos", false) && has_eos_token) {
+            slot->sparams.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});
         }
         /*
         slot->sparams.penalty_prompt_tokens.clear();
@@ -765,13 +759,13 @@ struct llama_server_context
             }
         }
         */
-
        slot->sparams.logit_bias.clear();
 
        const auto &logit_bias = data.find("logit_bias");
        if (logit_bias != data.end() && logit_bias->is_array())
        {
-           const int n_vocab = llama_n_vocab(model);
+           const llama_vocab * vocab = llama_model_get_vocab(model);
+           const int n_vocab = llama_vocab_n_tokens(vocab);
            for (const auto &el : *logit_bias)
            {
                if (el.is_array() && el.size() == 2)
@@ -800,7 +794,7 @@ struct llama_server_context
                    }
                    else if (el[0].is_string())
                    {
-                       auto toks = common_tokenize(model, el[0].get<std::string>(), false);
+                       auto toks = common_tokenize(vocab, el[0].get<std::string>(), false);
                        for (auto tok : toks)
                        {
                            slot->sparams.logit_bias.push_back({tok, bias});
@@ -1130,7 +1124,7 @@ struct llama_server_context
            slot.has_next_token = false;
        }
 
-       if (result.tok == llama_token_eos(model))
+       if (result.tok == llama_vocab_eos(vocab) || llama_vocab_is_eog(vocab, result.tok))
        {
            slot.stopped_eos = true;
            slot.has_next_token = false;
@@ -1325,7 +1319,7 @@ struct llama_server_context
        res.error = false;
        res.stop = true;
 
-       const int n_embd = llama_n_embd(model);
+       const int n_embd = llama_model_n_embd(model);
        if (!params.embedding)
        {
            LOG_WARNING("embedding disabled", {
@@ -1424,7 +1418,7 @@ struct llama_server_context
                    n_eval = n_batch;
                }
 
-               const int n_embd = llama_n_embd(model);
+               const int n_embd = llama_model_n_embd(model);
                float * embd = img.image_embedding + i * n_embd;
                llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, slot.n_past, 0);
                if (llama_decode(ctx, llava_batch.batch))
@@ -1705,11 +1699,11 @@ struct llama_server_context
                    suffix_tokens.erase(suffix_tokens.begin());
                }
 
-               prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
-               prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
-               prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
+               prefix_tokens.insert(prefix_tokens.begin(), llama_vocab_fim_pre(vocab));
+               prefix_tokens.insert(prefix_tokens.begin(), llama_vocab_bos(vocab)); // always add BOS
+               prefix_tokens.insert(prefix_tokens.end(), llama_vocab_fim_suf(vocab));
                prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
-               prefix_tokens.push_back(llama_token_middle(model));
+               prefix_tokens.push_back(llama_vocab_fim_mid(vocab));
                prompt_tokens = prefix_tokens;
            }
            else
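
For context, a minimal standalone sketch of the vocab-centric llama.cpp API this patch migrates to. The program and the "model.gguf" path are illustrative, not part of the patch; it only demonstrates the replacement calls the diff introduces.

```cpp
// Sketch only: token/vocab metadata is now queried through a
// const llama_vocab * obtained from the model, not through the model itself.
#include "llama.h"
#include <cstdio>

int main() {
    llama_model_params mparams = llama_model_default_params();
    // "model.gguf" is a placeholder path for this sketch.
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == nullptr) {
        return 1;
    }

    // One vocab handle replaces the old model-based helpers.
    const llama_vocab * vocab = llama_model_get_vocab(model);

    const bool add_bos = llama_vocab_get_add_bos(vocab);             // was llama_add_bos_token(model)
    const bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL; // some models have no EOS token
    const int  n_vocab = llama_vocab_n_tokens(vocab);                // was llama_n_vocab(model)
    const int  n_embd  = llama_model_n_embd(model);                  // was llama_n_embd(model)

    printf("add_bos=%d has_eos=%d n_vocab=%d n_embd=%d\n",
           add_bos, has_eos, n_vocab, n_embd);

    llama_model_free(model);
    return 0;
}
```

The `has_eos_token` flag added in the diff follows the same pattern: guarding `ignore_eos` logit biasing behind an EOS-token check keeps the server from pushing a bias for `LLAMA_TOKEN_NULL` on models whose vocab defines no EOS token.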