From 20d637e7b70cf0e15e6bf255ab2e4c080ddde2b0 Mon Sep 17 00:00:00 2001
From: ok2sh <151505547+ok2sh@users.noreply.github.com>
Date: Tue, 21 Nov 2023 10:26:39 -0800
Subject: [PATCH] fix: ExLlama Backend Context Size & Rope Scaling (#1311)

* fix: context_size not propagated to exllama backend
* fix: exllama rope scaling
---
 backend/python/exllama/exllama.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/backend/python/exllama/exllama.py b/backend/python/exllama/exllama.py
index 25785aae..758e6f2d 100755
--- a/backend/python/exllama/exllama.py
+++ b/backend/python/exllama/exllama.py
@@ -63,6 +63,19 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 
         config = ExLlamaConfig(model_config_path)                # create config from config.json
         config.model_path = model_path                           # supply path to model weights file
+        if (request.ContextSize):
+            config.max_seq_len = request.ContextSize              # override max sequence length
+            config.max_attention_size = request.ContextSize**2    # Should be set to context_size^2.
+            # https://github.com/turboderp/exllama/issues/220#issuecomment-1720324163
+
+        # Set Rope scaling.
+        if (request.RopeFreqScale):
+            # Alpha value for Rope scaling.
+            # Higher value increases context but adds perplexity.
+            # alpha_value and compress_pos_emb are mutually exclusive.
+            # https://github.com/turboderp/exllama/issues/115
+            config.alpha_value = request.RopeFreqScale
+            config.calculate_rotary_embedding_base()
 
         model = ExLlama(config)                                  # create ExLlama instance and load the weights
         tokenizer = ExLlamaTokenizer(tokenizer_path)             # create tokenizer from tokenizer model file
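
For context (not part of the patch): a minimal, standalone sketch of the NTK-style "alpha" rope scaling that setting config.alpha_value and then calling config.calculate_rotary_embedding_base() is meant to apply. The formula base' = base * alpha ** (head_dim / (head_dim - 2)) and the head_dim/base values below are assumptions for illustration only.

```python
# Illustration only: how an NTK "alpha" value stretches the rotary embedding base,
# assuming the formula base' = base * alpha ** (head_dim / (head_dim - 2)).
# head_dim and base are hypothetical Llama-like defaults, not values from the patch.

def scaled_rope_base(base: float, alpha: float, head_dim: int) -> float:
    """Return the rotary embedding base after NTK alpha scaling (assumed formula)."""
    return base * alpha ** (head_dim / (head_dim - 2))

if __name__ == "__main__":
    head_dim = 128      # hypothetical attention head dimension
    base = 10000.0      # common default rotary embedding base
    for alpha in (1.0, 2.0, 4.0):
        print(f"alpha={alpha}: base {base:.0f} -> {scaled_rope_base(base, alpha, head_dim):.1f}")
```

A larger alpha raises the base, which slows the rotation of the positional frequencies and lets the model attend over longer contexts at some cost in perplexity, which is why the patch's comment notes the trade-off and why alpha_value and compress_pos_emb should not be combined.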