chore(deps): update llama.cpp (#3497)

* Apply llava patch

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Ettore Di Giacinto, 2024-09-12 20:55:27 +02:00, committed by GitHub
parent e35d8169b1
commit d51444d606
5 changed files with 70 additions and 86 deletions

View File

@@ -8,7 +8,7 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=815b1fb20a53e439882171757825bacb1350de04
+CPPLLAMA_VERSION?=e6b7801bd189d102d901d3e72035611a25456ef1

 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp

View File

@@ -17,11 +17,10 @@
 #include "common.h"
 #include "json.hpp"
 #include "llama.h"
-#include "grammar-parser.h"
 #include "backend.pb.h"
 #include "backend.grpc.pb.h"
 #include "utils.hpp"
+#include "sampling.h"
 // include std::regex
 #include <cstddef>
 #include <thread>
@@ -203,8 +202,8 @@ struct llama_client_slot
     std::string stopping_word;

     // sampling
-    struct llama_sampling_params sparams;
-    llama_sampling_context *ctx_sampling = nullptr;
+    struct gpt_sampler_params sparams;
+    gpt_sampler *ctx_sampling = nullptr;

     int32_t ga_i = 0;   // group-attention state
     int32_t ga_n = 1;   // group-attention factor
@@ -619,7 +618,7 @@ struct llama_server_context
     bool launch_slot_with_data(llama_client_slot* &slot, json data) {
         slot_params default_params;
-        llama_sampling_params default_sparams;
+        gpt_sampler_params default_sparams;

         slot->params.stream = json_value(data, "stream", false);
         slot->params.cache_prompt = json_value(data, "cache_prompt", false);
@@ -628,7 +627,7 @@ struct llama_server_context
         slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
         slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
         slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
-        slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
+        slot->sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p);
         slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
         slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
         slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
@@ -641,7 +640,7 @@ struct llama_server_context
         slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
         slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
         slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
-        slot->params.seed = json_value(data, "seed", default_params.seed);
+        slot->sparams.seed = json_value(data, "seed", default_sparams.seed);
         slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
         slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
         slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
@@ -665,6 +664,7 @@ struct llama_server_context
             slot->params.input_prefix = "";
         }
+
         if (data.count("input_suffix") != 0)
         {
             slot->params.input_suffix = data["input_suffix"];
@@ -683,6 +683,10 @@ struct llama_server_context
             slot->prompt = "";
         }

+        if (json_value(data, "ignore_eos", false)) {
+            slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
+        }
+        /*
         slot->sparams.penalty_prompt_tokens.clear();
         slot->sparams.use_penalty_prompt_tokens = false;
         const auto &penalty_prompt = data.find("penalty_prompt");
@@ -718,14 +722,10 @@ struct llama_server_context
                 slot->sparams.use_penalty_prompt_tokens = true;
             }
         }
+        */

         slot->sparams.logit_bias.clear();

-        if (json_value(data, "ignore_eos", false))
-        {
-            slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
-        }
-
         const auto &logit_bias = data.find("logit_bias");
         if (logit_bias != data.end() && logit_bias->is_array())
         {
@@ -753,7 +753,7 @@ struct llama_server_context
                     llama_token tok = el[0].get<llama_token>();
                     if (tok >= 0 && tok < n_vocab)
                     {
-                        slot->sparams.logit_bias[tok] = bias;
+                        slot->sparams.logit_bias.push_back({tok, bias});
                     }
                 }
                 else if (el[0].is_string())
@@ -761,7 +761,7 @@ struct llama_server_context
                     auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
                     for (auto tok : toks)
                     {
-                        slot->sparams.logit_bias[tok] = bias;
+                        slot->sparams.logit_bias.push_back({tok, bias});
                     }
                 }
             }
@@ -782,24 +782,22 @@ struct llama_server_context
             }
         }

-        const auto &samplers_sequence = data.find("samplers");
-        if (samplers_sequence != data.end() && samplers_sequence->is_array())
-        {
+        const auto & samplers = data.find("samplers");
+        if (samplers != data.end() && samplers->is_array()) {
             std::vector<std::string> sampler_names;
-            for (const auto &sampler_name : *samplers_sequence)
-            {
-                if (sampler_name.is_string())
-                {
-                    sampler_names.emplace_back(sampler_name);
+            for (const auto & name : *samplers) {
+                if (name.is_string()) {
+                    sampler_names.emplace_back(name);
                 }
             }
-            slot->sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
+            slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
         }
         else
         {
-            slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
+            slot->sparams.samplers = default_sparams.samplers;
        }

         if (multimodal)
         {
             const auto &images_data = data.find("image_data");
@@ -875,10 +873,10 @@ struct llama_server_context
         if (slot->ctx_sampling != nullptr)
         {
-            llama_sampling_free(slot->ctx_sampling);
+            gpt_sampler_free(slot->ctx_sampling);
         }
-        slot->ctx_sampling = llama_sampling_init(slot->sparams);
-        llama_set_rng_seed(ctx, slot->params.seed);
+        slot->ctx_sampling = gpt_sampler_init(model, slot->sparams);
+        //llama_set_rng_seed(ctx, slot->params.seed);
         slot->command = LOAD_PROMPT;

         all_slots_are_idle = false;
@@ -888,7 +886,7 @@ struct llama_server_context
             {"task_id", slot->task_id},
         });

-        LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
+        // LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());

         return true;
     }
@@ -1006,11 +1004,13 @@ struct llama_server_context
         slot.generated_text += token_str;
         slot.has_next_token = true;

+        /*
         if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
         {
             // we can change penalty_prompt_tokens because it is always created from scratch each request
             slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
         }
+        */

         // check if there is incomplete UTF-8 character at the end
         bool incomplete = false;
@@ -1144,13 +1144,11 @@ struct llama_server_context
     json get_formated_generation(llama_client_slot &slot)
     {
-        const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
-        const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
-                                eos_bias->second < 0.0f && std::isinf(eos_bias->second);
-        std::vector<std::string> samplers_sequence;
-        for (const auto &sampler_type : slot.sparams.samplers_sequence)
+        std::vector<std::string> samplers;
+        samplers.reserve(slot.sparams.samplers.size());
+        for (const auto & sampler : slot.sparams.samplers)
         {
-            samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
+            samplers.emplace_back(gpt_sampler_type_to_str(sampler));
         }

         return json {
@@ -1165,13 +1163,11 @@ struct llama_server_context
             {"top_p", slot.sparams.top_p},
             {"min_p", slot.sparams.min_p},
             {"tfs_z", slot.sparams.tfs_z},
-            {"typical_p", slot.sparams.typical_p},
+            {"typical_p", slot.sparams.typ_p},
             {"repeat_last_n", slot.sparams.penalty_last_n},
             {"repeat_penalty", slot.sparams.penalty_repeat},
             {"presence_penalty", slot.sparams.penalty_present},
             {"frequency_penalty", slot.sparams.penalty_freq},
-            {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens},
-            {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens},
             {"mirostat", slot.sparams.mirostat},
             {"mirostat_tau", slot.sparams.mirostat_tau},
             {"mirostat_eta", slot.sparams.mirostat_eta},
@@ -1179,13 +1175,13 @@ struct llama_server_context
             {"stop", slot.params.antiprompt},
             {"n_predict", slot.params.n_predict},
             {"n_keep", params.n_keep},
-            {"ignore_eos", ignore_eos},
+            {"ignore_eos", slot.sparams.ignore_eos},
             {"stream", slot.params.stream},
-            {"logit_bias", slot.sparams.logit_bias},
+            // {"logit_bias", slot.sparams.logit_bias},
             {"n_probs", slot.sparams.n_probs},
             {"min_keep", slot.sparams.min_keep},
             {"grammar", slot.sparams.grammar},
-            {"samplers", samplers_sequence}
+            {"samplers", samplers}
         };
     }
@@ -1714,7 +1710,7 @@ struct llama_server_context
                     if (!slot.params.cache_prompt)
                     {
-                        llama_sampling_reset(slot.ctx_sampling);
+                        gpt_sampler_reset(slot.ctx_sampling);

                         slot.n_past = 0;
                         slot.n_past_se = 0;
@@ -1726,7 +1722,7 @@ struct llama_server_context
                        // push the prompt into the sampling context (do not apply grammar)
                        for (auto &token : prompt_tokens)
                        {
-                           llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
+                           gpt_sampler_accept(slot.ctx_sampling, token, false);
                        }

                        slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
@@ -1934,9 +1930,9 @@ struct llama_server_context
            }

            completion_token_output result;
-           const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i);
+           const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);

-           llama_sampling_accept(slot.ctx_sampling, ctx, id, true);
+           gpt_sampler_accept(slot.ctx_sampling, id, true);

            slot.n_decoded += 1;
            if (slot.n_decoded == 1)
@@ -1946,19 +1942,14 @@ struct llama_server_context
                metrics.on_prompt_eval(slot);
            }

-           llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
            result.tok = id;
+           const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling);

-           const int32_t n_probs = slot.sparams.n_probs;
-           if (slot.sparams.temp <= 0 && n_probs > 0)
-           {
-               // for llama_sample_token_greedy we need to sort candidates
-               llama_sample_softmax(ctx, &cur_p);
-           }
-
-           for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
-           {
-               result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
+           for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
+               result.probs.push_back({
+                   cur_p->data[i].id,
+                   i >= cur_p->size ? 0.0f : cur_p->data[i].p,
+               });
            }

            if (!process_token(result, slot))
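
Note for reviewers: the hunks above track the llama.cpp sampling refactor, where the old llama_sampling_* context is replaced by the gpt_sampler object from common/sampling.h. The snippet below is a minimal, hypothetical sketch of the call sequence the updated grpc-server.cpp now relies on; it is not part of this commit, it only uses functions that already appear in the diff, and the helper name sample_one_token is invented for illustration.

    // Sketch: gpt_sampler lifecycle as used by the updated server code.
    // Assumes the common/sampling.h API at the pinned llama.cpp revision;
    // error handling and the surrounding server plumbing are omitted.
    #include "common.h"
    #include "sampling.h"
    #include "llama.h"

    #include <cmath>
    #include <vector>

    static llama_token sample_one_token(llama_model * model, llama_context * ctx,
                                        const std::vector<llama_token> & prompt_tokens,
                                        int32_t i_batch) {
        gpt_sampler_params sparams;                 // was: llama_sampling_params
        // "ignore_eos" is now a logit-bias entry instead of a map assignment
        sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});

        gpt_sampler * smpl = gpt_sampler_init(model, sparams);   // was: llama_sampling_init(sparams)

        // feed the cached prompt back into the sampler without applying the grammar
        for (const llama_token tok : prompt_tokens) {
            gpt_sampler_accept(smpl, tok, false);   // was: llama_sampling_accept(smpl, ctx, tok, false)
        }

        // sample from the logits at batch position i_batch, then accept with grammar checks
        const llama_token id = gpt_sampler_sample(smpl, ctx, i_batch);
        gpt_sampler_accept(smpl, id, true);

        // candidate list used above to fill result.probs for "n_probs"
        const llama_token_data_array * cur_p = gpt_sampler_get_candidates(smpl);
        (void) cur_p;

        gpt_sampler_free(smpl);                     // was: llama_sampling_free
        return id;
    }

The commented-out llama_set_rng_seed call in the diff is consistent with the seed now being read into sparams.seed rather than slot->params.seed.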

View File

@@ -0,0 +1,13 @@
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 342042ff..224db9b5 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2419,7 +2419,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_patches; i++) {
- patches_data[i] = i + 1;
+ patches_data[i] = i;
}
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
free(patches_data);

View File

@@ -1,5 +1,12 @@
 #!/bin/bash

+## Patches
+## Apply patches from the `patches` directory
+for patch in $(ls patches); do
+    echo "Applying patch $patch"
+    patch -d llama.cpp/ -p1 < patches/$patch
+done
+
 cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
 cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
 cp -rfv json.hpp llama.cpp/examples/grpc-server/

View File

@@ -481,30 +481,3 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
     return ret;
 }
-
-//
-// random string / id
-//
-
-static std::string random_string()
-{
-    static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
-
-    std::random_device rd;
-    std::mt19937 generator(rd());
-
-    std::string result(32, ' ');
-
-    for (int i = 0; i < 32; ++i) {
-        result[i] = str[generator() % str.size()];
-    }
-
-    return result;
-}
-
-static std::string gen_chatcmplid()
-{
-    std::stringstream chatcmplid;
-    chatcmplid << "chatcmpl-" << random_string();
-    return chatcmplid.str();
-}