fix(llama.cpp): disable infinite context shifting (#1704)

Infinite context loop might as well trigger an infinite loop of context
shifting if the model hallucinates and does not stop answering.
This has the unpleasant effect that the predicion never terminates,
which is the case especially on small models which tends to hallucinate.

Workarounds https://github.com/mudler/LocalAI/issues/1333 by removing
context-shifting.

See also upstream issue: https://github.com/ggerganov/llama.cpp/issues/3969
This commit is contained in:
Ettore Di Giacinto 2024-02-13 21:17:21 +01:00 committed by GitHub
parent 2e61ff32ad
commit c56b6ddb1c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1387,30 +1387,20 @@ struct llama_server_context
{
if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
{
// Shift context
const int n_left = system_tokens.size() + slot.n_past - slot.params.n_keep - 1;
const int n_discard = n_left / 2;
// START LOCALAI changes
// Temporary disable context-shifting as it can lead to infinite loops (issue: https://github.com/ggerganov/llama.cpp/issues/3969)
// See: https://github.com/mudler/LocalAI/issues/1333
// Context is exhausted, release the slot
slot.release();
send_final_response(slot);
slot.cache_tokens.clear();
slot.n_past = 0;
slot.truncated = false;
slot.has_next_token = true;
LOG_TEE("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
llama_kv_cache_seq_rm (ctx, slot.id, slot.params.n_keep + 1 , slot.params.n_keep + n_discard + 1);
llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, system_tokens.size() + slot.n_past, -n_discard);
for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++)
{
slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
}
slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
slot.n_past -= n_discard;
slot.truncated = true;
LOG_VERBOSE("context shift", {
{ "n_ctx", n_ctx },
{ "n_keep", params.n_keep },
{ "n_left", n_left },
});
continue;
// END LOCALAI changes
}
}
}