From 9e32fda30438c4b1e3d2b0cad7fc2efd75610fb9 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@users.noreply.github.com>
Date: Fri, 14 Feb 2025 14:55:03 +0100
Subject: [PATCH] fix(llama.cpp): improve context shift handling (#4820)

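Stop generation in process_token() as soon as slot.n_past reaches slot.n_ctx, marking the
slot as truncated and stopped_limit. The context-exhaustion check in the batch loop is kept
as a redundant safety net: when it triggers, release the slot and send the final response
instead of clearing the cache and resetting the slot state.
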
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---
 backend/cpp/llama/grpc-server.cpp | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp
index 4daf84c6..69a09c74 100644
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -1155,6 +1155,14 @@ struct llama_server_context
             slot.has_next_token = false;
         }
 
+        if (slot.n_past >= slot.n_ctx) {
+            slot.truncated      = true;
+            slot.stopped_limit  = true;
+            slot.has_next_token = false;
+
+            LOG_VERBOSE("stopped due to running out of context capacity", {});
+        }
+
         if (result.tok == llama_vocab_eos(vocab) || llama_vocab_is_eog(vocab, result.tok))
         {
             slot.stopped_eos = true;
@@ -1627,17 +1635,17 @@ struct llama_server_context
             {
                 if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
                 {
+                    // this check is redundant (kept intentionally as a safety net)
+                    // we should never get here, because generation should have already stopped in process_token()
+
                     // START LOCALAI changes
                     // Temporary disable context-shifting as it can lead to infinite loops (issue: https://github.com/ggerganov/llama.cpp/issues/3969)
                     // See: https://github.com/mudler/LocalAI/issues/1333
                     // Context is exhausted, release the slot
                     slot.release();
                     send_final_response(slot);
-                    slot.cache_tokens.clear();
-                    slot.n_past = 0;
-                    slot.truncated = false;
-                    slot.has_next_token = true;
-                    LOG("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
+                    slot.has_next_token = false;
+                    LOG_ERROR("context is exhausted, releasing the slot", {});
 
                     continue;
                     // END LOCALAI changes