fix(llama.cpp): disable infinite context shifting (#1704)
An infinite context loop can trigger an endless chain of context shifts if the model hallucinates and never stops answering. This has the unpleasant effect that the prediction never terminates, which is especially the case with small models, which tend to hallucinate. Works around https://github.com/mudler/LocalAI/issues/1333 by removing context shifting. See also the upstream issue: https://github.com/ggerganov/llama.cpp/issues/3969
This commit is contained in:
parent 2e61ff32ad
commit c56b6ddb1c
@@ -1387,30 +1387,20 @@ struct llama_server_context
         {
             if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
             {
-                // Shift context
-                const int n_left    = system_tokens.size() + slot.n_past - slot.params.n_keep - 1;
-                const int n_discard = n_left / 2;
-
-                LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
-                llama_kv_cache_seq_rm   (ctx, slot.id, slot.params.n_keep + 1            , slot.params.n_keep + n_discard + 1);
-                llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, system_tokens.size() + slot.n_past, -n_discard);
-
-                for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++)
-                {
-                    slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
-                }
-
-                slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
-
-                slot.n_past -= n_discard;
-
-                slot.truncated = true;
-                LOG_VERBOSE("context shift", {
-                    { "n_ctx", n_ctx },
-                    { "n_keep", params.n_keep },
-                    { "n_left", n_left },
-                });
+                // START LOCALAI changes
+                // Temporary disable context-shifting as it can lead to infinite loops (issue: https://github.com/ggerganov/llama.cpp/issues/3969)
+                // See: https://github.com/mudler/LocalAI/issues/1333
+                // Context is exhausted, release the slot
+                slot.release();
+                send_final_response(slot);
+                slot.cache_tokens.clear();
+                slot.n_past = 0;
+                slot.truncated = false;
+                slot.has_next_token = true;
+                LOG_TEE("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
+
+                continue;
+                // END LOCALAI changes
             }
         }
     }
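
For illustration only, a minimal standalone C++ sketch (toy types and names, hypothetical, not LocalAI or llama.cpp code) of why the old behaviour can loop forever when the model never emits a stop token, while releasing the slot bounds generation at one context window:

#include <cstdio>
#include <vector>

// Toy model of a decoding slot; hypothetical names, for illustration only.
struct ToySlot {
    std::vector<int> cache_tokens;
    int n_ctx  = 8;   // tiny context window
    int n_keep = 2;   // tokens always kept at the front
};

// Simulates a model that hallucinates and never produces a stop token.
static bool model_wants_to_stop() { return false; }

int main() {
    ToySlot slot;
    const int max_steps = 32; // safety bound so the demo itself terminates

    for (int step = 0; step < max_steps; ++step) {
        if ((int) slot.cache_tokens.size() >= slot.n_ctx) {
#ifdef CONTEXT_SHIFT
            // Old behaviour: discard half of the non-kept tokens and keep decoding.
            // If the model never stops, this branch is taken again and again.
            const int n_left    = (int) slot.cache_tokens.size() - slot.n_keep;
            const int n_discard = n_left / 2;
            slot.cache_tokens.erase(slot.cache_tokens.begin() + slot.n_keep,
                                    slot.cache_tokens.begin() + slot.n_keep + n_discard);
            std::printf("step %d: context shift, discarded %d tokens\n", step, n_discard);
#else
            // New behaviour: context is exhausted, stop and return what we have.
            std::printf("step %d: context exhausted, releasing slot (%zu tokens)\n",
                        step, slot.cache_tokens.size());
            return 0;
#endif
        }
        slot.cache_tokens.push_back(step); // "generate" one more token
        if (model_wants_to_stop()) {
            return 0;
        }
    }
    std::printf("hit safety bound after %d steps: generation never terminated\n", max_steps);
    return 0;
}

Built with -DCONTEXT_SHIFT the toy loop only stops because of the safety bound; without it, generation ends as soon as the window is full, mirroring the slot-release path in the diff above.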