chore(deps): update llama.cpp to 6262d13e0b2da91f230129a93a996609a2f5a2f2 (#3549)

chore(deps): update llama.cpp to 6262d13e0b2da91f230129a93a996609a2f5a2f2

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Ettore Di Giacinto committed via GitHub on 2024-09-16 10:29:20 +02:00
parent 3d3db1d74f
commit 25deb4ba95
2 changed files with 33 additions and 30 deletions

Makefile

@@ -8,7 +8,7 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=feff4aa8461da7c432d144c11da4802e41fef3cf
+CPPLLAMA_VERSION?=6262d13e0b2da91f230129a93a996609a2f5a2f2
 
 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp

backend/cpp/llama/grpc-server.cpp

@@ -13,6 +13,7 @@
 #include <getopt.h>
 #include "clip.h"
 #include "llava.h"
+#include "log.h"
 #include "stb_image.h"
 #include "common.h"
 #include "json.hpp"
@@ -448,7 +449,7 @@ struct llama_server_context
 LOG_INFO("Multi Modal Mode Enabled", {});
 clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
 if(clp_ctx == nullptr) {
-    LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
+    LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
     return false;
 }
@@ -462,7 +463,7 @@ struct llama_server_context
 ctx = llama_init.context;
 if (model == nullptr)
 {
-    LOG_ERROR("unable to load model", {{"model", params.model}});
+    LOG_ERR("unable to load model: %s", params.model.c_str());
     return false;
 }
@@ -470,7 +471,7 @@ struct llama_server_context
 const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
 const int n_embd_llm = llama_n_embd(model);
 if (n_embd_clip != n_embd_llm) {
-    LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
+    LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
     llama_free(ctx);
     llama_free_model(model);
     return false;
@@ -489,7 +490,7 @@ struct llama_server_context
 std::vector<char> buf(1);
 int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
 if (res < 0) {
-    LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
+    LOG_ERR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", __func__);
     sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
 }
 }
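
Note on the hunk above: the res < 0 probe is the usual way to detect whether a model ships a usable built-in chat template. llama_chat_apply_template is called once with a single dummy message and a throwaway buffer, and only the return code is inspected. A minimal sketch of that probe, assuming the llama.cpp API of this revision (not part of this diff):

// Sketch: probe whether the model's embedded chat template is supported.
// Assumes the llama.cpp API used elsewhere in this file; illustrative only.
#include <vector>
#include "llama.h"

static bool model_has_usable_template(const llama_model * model) {
    llama_chat_message chat[] = {{"user", "test"}};
    std::vector<char> buf(1); // contents are discarded; only the return code matters
    // tmpl == nullptr -> use the template stored in the model's metadata
    int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
    return res >= 0; // negative means unsupported; the caller falls back to chatml
}
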
@@ -812,10 +813,11 @@ struct llama_server_context
 img_sl.img_data = clip_image_u8_init();
 if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
 {
-    LOG_ERROR("failed to load image", {
-        {"slot_id", slot->id},
-        {"img_sl_id", img_sl.id}
-    });
+    LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d",
+        __func__,
+        slot->id,
+        img_sl.id
+    );
     return false;
 }
 LOG_VERBOSE("image loaded", {
@@ -853,12 +855,12 @@ struct llama_server_context
 }
 }
 if (!found) {
-    LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id);
+    LOG("ERROR: Image with id: %i, not found.\n", img_id);
     slot->images.clear();
     return false;
 }
 } catch (const std::invalid_argument& e) {
-    LOG_TEE("Invalid image number id in prompt\n");
+    LOG("Invalid image number id in prompt\n");
     slot->images.clear();
     return false;
 }
@@ -886,7 +888,7 @@ struct llama_server_context
 {"task_id", slot->task_id},
 });
-// LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
+// LOG("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
 return true;
 }
@@ -926,7 +928,7 @@ struct llama_server_context
 };
 if (llama_decode(ctx, batch_view) != 0)
 {
-    LOG_TEE("%s: llama_decode() failed\n", __func__);
+    LOG("%s: llama_decode() failed\n", __func__);
     return;
 }
 }
@@ -938,7 +940,7 @@ struct llama_server_context
 }
 }
-LOG_TEE("system prompt updated\n");
+LOG("system prompt updated\n");
 system_need_update = false;
 }
@@ -1120,7 +1122,7 @@ struct llama_server_context
 }
 if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
-    LOG_TEE("Error processing the given image");
+    LOG("Error processing the given image");
     return false;
 }
@@ -1132,7 +1134,7 @@ struct llama_server_context
 void send_error(task_server& task, const std::string &error)
 {
-    LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
+    LOG("task %i - error: %s\n", task.id, error.c_str());
     task_result res;
     res.id = task.id;
     res.multitask_id = task.multitask_id;
@@ -1371,7 +1373,7 @@ struct llama_server_context
 };
 if (llama_decode(ctx, batch_view))
 {
-    LOG_TEE("%s : failed to eval\n", __func__);
+    LOG("%s : failed to eval\n", __func__);
     return false;
 }
 }
@@ -1389,7 +1391,7 @@ struct llama_server_context
 llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, };
 if (llama_decode(ctx, batch_img))
 {
-    LOG_TEE("%s : failed to eval image\n", __func__);
+    LOG("%s : failed to eval image\n", __func__);
     return false;
 }
 slot.n_past += n_eval;
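
The hunk above sits in the loop that streams a llava image embedding into the context in n_batch-sized chunks: token ids are left null and the embedding pointer travels through the embd field of a hand-built llama_batch. A simplified sketch of that loop, mirroring the aggregate-initializer style of this revision (the llava_image_embed type is from llava.h; illustrative only):

// Sketch: decode a llava image embedding in n_batch-sized chunks.
#include <algorithm>
#include "llama.h"
#include "llava.h"

static bool eval_image_embed(llama_context * ctx, const llava_image_embed * embed,
                             int n_embd, int n_batch, int & n_past) {
    for (int i = 0; i < embed->n_image_pos; i += n_batch) {
        const int n_eval = std::min(n_batch, embed->n_image_pos - i);
        // token == nullptr, embd != nullptr: feed raw embeddings instead of token ids
        llama_batch batch = { n_eval, nullptr, embed->embed + i * n_embd,
                              nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
        if (llama_decode(ctx, batch)) {
            return false; // mirror the hunk's error path
        }
        n_past += n_eval;
    }
    return true;
}
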
@@ -1572,7 +1574,7 @@ struct llama_server_context
 slot.n_past = 0;
 slot.truncated = false;
 slot.has_next_token = true;
-LOG_TEE("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
+LOG("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
 continue;
 // END LOCALAI changes
@@ -1820,10 +1822,11 @@ struct llama_server_context
 if (has_images && !ingest_images(slot, n_batch))
 {
-    LOG_ERROR("failed processing images", {
-        "slot_id", slot.id,
-        "task_id", slot.task_id,
-    });
+    LOG_ERR("%s: failed processing images Slot id : %d, Task id: %d",
+        __func__,
+        slot.id,
+        slot.task_id
+    );
     // FIXME @phymbert: to be properly tested
     // early returning without changing the slot state will block the slot for ever
     // no one at the moment is checking the return value
@@ -1863,10 +1866,10 @@ struct llama_server_context
 const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
 const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
-LOG_TEE("\n");
-LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
-LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
-LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
+LOG("\n");
+LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
+LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
+LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
 llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
 llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
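
The three traces above log one pass of self-extend (group attention) over the KV cache. A worked example with assumed values: take ga_n = 4, ga_w = 512, ga_i = 0 and n_past_se = 1024, with ib computed upstream in this revision as (ga_n * ga_i) / ga_w = 0. Then bd = (512 / 4) * (4 - 1) = 384 and dd = 128 - 0 - 512 = -384, so the first shift over [0, 1024) moves by 0 (a no-op on the first pass), the div compresses the window [0, 512) into [0, 128), and the trailing shift moves [512, 1024) down by 384 to [128, 640); n_past_se then drops to 640 and ga_i advances to 128, which is what the next hunk's logging reflects.
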
@@ -1876,7 +1879,7 @@ struct llama_server_context
 slot.ga_i += slot.ga_w / slot.ga_n;
-LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
+LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
 }
 slot.n_past_se += n_tokens;
 }
@@ -1901,11 +1904,11 @@ struct llama_server_context
 if (n_batch == 1 || ret < 0)
 {
     // if you get here, it means the KV cache is full - try increasing it via the context size
-    LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
+    LOG("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
     return false;
 }
-LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
+LOG("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
 // retry with half the batch size to try to find a free slot in the KV cache
 n_batch /= 2;
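
This is the tail of the decode loop that slices the pending batch into n_batch-token views; on a recoverable failure it halves n_batch and rewinds the cursor so the same window is retried at the smaller size. A simplified sketch of that shape, using the llama_batch field layout this revision uses elsewhere (illustrative, not the verbatim loop):

// Sketch: decode in n_batch-sized views, halving the view when the KV cache is tight.
#include <algorithm>
#include "llama.h"

static bool decode_with_backoff(llama_context * ctx, const llama_batch & batch, int32_t n_batch) {
    for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
        const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
        llama_batch batch_view = {
            n_tokens,
            batch.token    + i,
            nullptr,            // no embeddings, plain token batch
            batch.pos      + i,
            batch.n_seq_id + i,
            batch.seq_id   + i,
            batch.logits   + i,
            0, 0, 0,            // unused legacy fields in this revision
        };
        const int ret = llama_decode(ctx, batch_view);
        if (ret != 0) {
            if (n_batch == 1 || ret < 0) {
                return false;   // KV cache full and no smaller retry is possible
            }
            // no free KV slot: halve the view size and rewind so the same
            // window is retried (the loop increment adds the new n_batch back)
            n_batch /= 2;
            i -= n_batch;
        }
    }
    return true;
}
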