From 20d637e7b70cf0e15e6bf255ab2e4c080ddde2b0 Mon Sep 17 00:00:00 2001
From: ok2sh <151505547+ok2sh@users.noreply.github.com>
Date: Tue, 21 Nov 2023 10:26:39 -0800
Subject: [PATCH] fix: ExLlama Backend Context Size & Rope Scaling (#1311)

* fix: context_size not propagated to exllama backend
* fix: exllama rope scaling
---
 backend/python/exllama/exllama.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/backend/python/exllama/exllama.py b/backend/python/exllama/exllama.py
index 25785aae..758e6f2d 100755
--- a/backend/python/exllama/exllama.py
+++ b/backend/python/exllama/exllama.py
@@ -63,6 +63,19 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 
         config = ExLlamaConfig(model_config_path)                # create config from config.json
         config.model_path = model_path                           # supply path to model weights file
+        if (request.ContextSize):
+            config.max_seq_len = request.ContextSize              # override max sequence length
+            config.max_attention_size = request.ContextSize**2    # Should be set to context_size^2.
+            # https://github.com/turboderp/exllama/issues/220#issuecomment-1720324163
+
+        # Set Rope scaling.
+        if (request.RopeFreqScale):
+            # Alpha value for Rope scaling.
+            # Higher value increases context but adds perplexity.
+            # alpha_value and compress_pos_emb are mutually exclusive.
+            # https://github.com/turboderp/exllama/issues/115
+            config.alpha_value = request.RopeFreqScale
+            config.calculate_rotary_embedding_base()
 
         model = ExLlama(config)                                  # create ExLlama instance and load the weights
         tokenizer = ExLlamaTokenizer(tokenizer_path)             # create tokenizer from tokenizer model file
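
For context (not part of the patch): a minimal, standalone sketch of the NTK-style "alpha" rope scaling that setting config.alpha_value and then calling config.calculate_rotary_embedding_base() is meant to apply. The formula base' = base * alpha ** (head_dim / (head_dim - 2)) and the head_dim/base values below are assumptions for illustration only.

```python
# Illustration only: how an NTK "alpha" value stretches the rotary embedding base,
# assuming the formula base' = base * alpha ** (head_dim / (head_dim - 2)).
# head_dim and base are hypothetical Llama-like defaults, not values from the patch.

def scaled_rope_base(base: float, alpha: float, head_dim: int) -> float:
    """Return the rotary embedding base after NTK alpha scaling (assumed formula)."""
    return base * alpha ** (head_dim / (head_dim - 2))

if __name__ == "__main__":
    head_dim = 128      # hypothetical attention head dimension
    base = 10000.0      # common default rotary embedding base
    for alpha in (1.0, 2.0, 4.0):
        print(f"alpha={alpha}: base {base:.0f} -> {scaled_rope_base(base, alpha, head_dim):.1f}")
```

A larger alpha raises the base, which slows the rotation of the positional frequencies and lets the model attend over longer contexts at some cost in perplexity, which is why the patch's comment notes the trade-off and why alpha_value and compress_pos_emb should not be combined.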