From a9b0e264f2dec63c7bcd9c2d8e98194297cce065 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Fri, 13 Sep 2024 19:08:50 +0200
Subject: [PATCH] chore(exllama): drop exllama backend

For polishing and cleaning up it makes now sense to drop exllama which
is completely unmaintained, and was only supporting the llamav1
architecture (nowadays it's superseded by llamav1) .

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---
 Dockerfile                                    |   5 +-
 Makefile                                      |  13 +-
 backend/python/exllama/.gitignore             |   1 -
 backend/python/exllama/Makefile               |  25 ---
 backend/python/exllama/README.md              |   5 -
 backend/python/exllama/backend.py             | 159 ------------------
 backend/python/exllama/install.sh             |  13 --
 backend/python/exllama/requirements-cpu.txt   |   3 -
 .../python/exllama/requirements-cublas11.txt  |   4 -
 .../python/exllama/requirements-cublas12.txt  |   3 -
 backend/python/exllama/requirements.txt       |   4 -
 backend/python/exllama/run.sh                 |   7 -
 backend/python/exllama/test.sh                |   6 -
 13 files changed, 3 insertions(+), 245 deletions(-)
 delete mode 100644 backend/python/exllama/.gitignore
 delete mode 100644 backend/python/exllama/Makefile
 delete mode 100644 backend/python/exllama/README.md
 delete mode 100755 backend/python/exllama/backend.py
 delete mode 100755 backend/python/exllama/install.sh
 delete mode 100644 backend/python/exllama/requirements-cpu.txt
 delete mode 100644 backend/python/exllama/requirements-cublas11.txt
 delete mode 100644 backend/python/exllama/requirements-cublas12.txt
 delete mode 100644 backend/python/exllama/requirements.txt
 delete mode 100755 backend/python/exllama/run.sh
 delete mode 100755 backend/python/exllama/test.sh

diff --git a/Dockerfile b/Dockerfile
index b86cc706..f08cb9a0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -13,7 +13,7 @@ ARG TARGETARCH
 ARG TARGETVARIANT
 
 ENV DEBIAN_FRONTEND=noninteractive
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
 
 
 RUN apt-get update && \
@@ -418,9 +418,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAG
     ; fi && \
     if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
         make -C backend/python/transformers-musicgen \
-    ; fi && \
-    if [[ ( "${EXTRA_BACKENDS}" =~ "exllama1" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
-        make -C backend/python/exllama \
     ; fi
 
 RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
diff --git a/Makefile b/Makefile
index 3d9ea592..a3f0ffd0 100644
--- a/Makefile
+++ b/Makefile
@@ -534,10 +534,10 @@ protogen-go-clean:
 	$(RM) bin/*
 
 .PHONY: protogen-python
-protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen
+protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen
 
 .PHONY: protogen-python-clean
-protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean
+protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean  exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean
 
 .PHONY: autogptq-protogen
 autogptq-protogen:
@@ -571,14 +571,6 @@ diffusers-protogen:
 diffusers-protogen-clean:
 	$(MAKE) -C backend/python/diffusers protogen-clean
 
-.PHONY: exllama-protogen
-exllama-protogen:
-	$(MAKE) -C backend/python/exllama protogen
-
-.PHONY: exllama-protogen-clean
-exllama-protogen-clean:
-	$(MAKE) -C backend/python/exllama protogen-clean
-
 .PHONY: exllama2-protogen
 exllama2-protogen:
 	$(MAKE) -C backend/python/exllama2 protogen
@@ -675,7 +667,6 @@ prepare-extra-conda-environments: protogen-python
 	$(MAKE) -C backend/python/parler-tts
 	$(MAKE) -C backend/python/vall-e-x
 	$(MAKE) -C backend/python/openvoice
-	$(MAKE) -C backend/python/exllama
 	$(MAKE) -C backend/python/exllama2
 
 prepare-test-extra: protogen-python
diff --git a/backend/python/exllama/.gitignore b/backend/python/exllama/.gitignore
deleted file mode 100644
index 1d3a0654..00000000
--- a/backend/python/exllama/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-source
\ No newline at end of file
diff --git a/backend/python/exllama/Makefile b/backend/python/exllama/Makefile
deleted file mode 100644
index e6a67881..00000000
--- a/backend/python/exllama/Makefile
+++ /dev/null
@@ -1,25 +0,0 @@
-export CONDA_ENV_PATH = "exllama.yml"
-
-.PHONY: exllama
-exllama: protogen
-	bash install.sh ${CONDA_ENV_PATH}
-
-.PHONY: run
-run: protogen
-	@echo "Running exllama..."
-	bash run.sh
-	@echo "exllama run."
-
-.PHONY: protogen
-protogen: backend_pb2_grpc.py backend_pb2.py
-
-.PHONY: protogen-clean
-protogen-clean:
-	$(RM) backend_pb2_grpc.py backend_pb2.py
-
-backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
-
-.PHONY: clean
-clean: protogen-clean
-	$(RM) -r venv source __pycache__
\ No newline at end of file
diff --git a/backend/python/exllama/README.md b/backend/python/exllama/README.md
deleted file mode 100644
index f9ed5e9f..00000000
--- a/backend/python/exllama/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# Creating a separate environment for the exllama project
-
-```
-make exllama
-```
\ No newline at end of file
diff --git a/backend/python/exllama/backend.py b/backend/python/exllama/backend.py
deleted file mode 100755
index 58d1392c..00000000
--- a/backend/python/exllama/backend.py
+++ /dev/null
@@ -1,159 +0,0 @@
-#!/usr/bin/env python3
-import grpc
-from concurrent import futures
-import time
-import backend_pb2
-import backend_pb2_grpc
-import argparse
-import signal
-import sys
-import os, glob
-
-from pathlib import Path
-import torch
-import torch.nn.functional as F
-from torch import version as torch_version
-
-from source.tokenizer import ExLlamaTokenizer
-from source.generator import ExLlamaGenerator
-from source.model import ExLlama, ExLlamaCache, ExLlamaConfig
-
-_ONE_DAY_IN_SECONDS = 60 * 60 * 24
-
-# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
-MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
-
-# Implement the BackendServicer class with the service methods
-class BackendServicer(backend_pb2_grpc.BackendServicer):
-    def generate(self,prompt, max_new_tokens):
-        self.generator.end_beam_search()
-
-        # Tokenizing the input
-        ids = self.generator.tokenizer.encode(prompt)
-
-        self.generator.gen_begin_reuse(ids)
-        initial_len = self.generator.sequence[0].shape[0]
-        has_leading_space = False
-        decoded_text = ''
-        for i in range(max_new_tokens):
-            token = self.generator.gen_single_token()
-            if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'):
-                has_leading_space = True
-
-            decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:])
-            if has_leading_space:
-                decoded_text = ' ' + decoded_text
-
-            if token.item() == self.generator.tokenizer.eos_token_id:
-                break
-        return decoded_text
-    def Health(self, request, context):
-        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
-    def LoadModel(self, request, context):
-        try:
-            # https://github.com/turboderp/exllama/blob/master/example_cfg.py
-            model_directory = request.ModelFile
-
-            # Locate files we need within that directory
-            tokenizer_path = os.path.join(model_directory, "tokenizer.model")
-            model_config_path = os.path.join(model_directory, "config.json")
-            st_pattern = os.path.join(model_directory, "*.safetensors")
-            model_path = glob.glob(st_pattern)[0]
-
-            # Create config, model, tokenizer and generator
-
-            config = ExLlamaConfig(model_config_path)               # create config from config.json
-            config.model_path = model_path                          # supply path to model weights file
-            if (request.ContextSize):
-                config.max_seq_len = request.ContextSize            # override max sequence length
-                config.max_attention_size = request.ContextSize**2  # Should be set to context_size^2. 
-                # https://github.com/turboderp/exllama/issues/220#issuecomment-1720324163
-
-            # Set Rope scaling.
-            if (request.RopeFreqScale):
-                # Alpha value for Rope scaling. 
-                # Higher value increases context but adds perplexity.
-                # alpha_value and compress_pos_emb are mutually exclusive.
-                # https://github.com/turboderp/exllama/issues/115
-                config.alpha_value = request.RopeFreqScale
-                config.calculate_rotary_embedding_base()
-
-            model = ExLlama(config)                                 # create ExLlama instance and load the weights
-            tokenizer = ExLlamaTokenizer(tokenizer_path)            # create tokenizer from tokenizer model file
-
-            cache = ExLlamaCache(model, batch_size = 2)             # create cache for inference
-            generator = ExLlamaGenerator(model, tokenizer, cache)   # create generator
-
-            self.generator= generator
-            self.model = model
-            self.tokenizer = tokenizer
-            self.cache = cache
-        except Exception as err:
-            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
-        return backend_pb2.Result(message="Model loaded successfully", success=True)
-
-    def Predict(self, request, context):
-        penalty = 1.15
-        if request.Penalty != 0.0:
-            penalty = request.Penalty
-        self.generator.settings.token_repetition_penalty_max = penalty
-        self.generator.settings.temperature = request.Temperature
-        self.generator.settings.top_k = request.TopK
-        self.generator.settings.top_p = request.TopP
-
-        tokens = 512
-        if request.Tokens != 0:
-            tokens = request.Tokens
-
-        if self.cache.batch_size == 1:
-            del self.cache
-            self.cache = ExLlamaCache(self.model, batch_size=2)
-            self.generator = ExLlamaGenerator(self.model, self.tokenizer, self.cache)
-
-        t = self.generate(request.Prompt, tokens)
-
-        # Remove prompt from response if present
-        if request.Prompt in t:
-            t = t.replace(request.Prompt, "")
-
-        return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
-
-    def PredictStream(self, request, context):
-        # Implement PredictStream RPC
-        #for reply in some_data_generator():
-        #    yield reply
-        # Not implemented yet
-        return self.Predict(request, context)
-
-
-def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
-    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
-    server.add_insecure_port(address)
-    server.start()
-    print("Server started. Listening on: " + address, file=sys.stderr)
-
-    # Define the signal handler function
-    def signal_handler(sig, frame):
-        print("Received termination signal. Shutting down...")
-        server.stop(0)
-        sys.exit(0)
-
-    # Set the signal handlers for SIGINT and SIGTERM
-    signal.signal(signal.SIGINT, signal_handler)
-    signal.signal(signal.SIGTERM, signal_handler)
-
-    try:
-        while True:
-            time.sleep(_ONE_DAY_IN_SECONDS)
-    except KeyboardInterrupt:
-        server.stop(0)
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Run the gRPC server.")
-    parser.add_argument(
-        "--addr", default="localhost:50051", help="The address to bind the server to."
-    )
-    args = parser.parse_args()
-
-    serve(args.addr)
\ No newline at end of file
diff --git a/backend/python/exllama/install.sh b/backend/python/exllama/install.sh
deleted file mode 100755
index d33c4356..00000000
--- a/backend/python/exllama/install.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-set -e
-
-LIMIT_TARGETS="cublas"
-
-source $(dirname $0)/../common/libbackend.sh
-
-installRequirements
-
-git clone https://github.com/turboderp/exllama $MY_DIR/source
-uv pip install ${BUILD_ISOLATION_FLAG} --requirement ${MY_DIR}/source/requirements.txt
-
-cp -v ./*py $MY_DIR/source/
diff --git a/backend/python/exllama/requirements-cpu.txt b/backend/python/exllama/requirements-cpu.txt
deleted file mode 100644
index bbcdc8cd..00000000
--- a/backend/python/exllama/requirements-cpu.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-transformers
-accelerate
-torch
\ No newline at end of file
diff --git a/backend/python/exllama/requirements-cublas11.txt b/backend/python/exllama/requirements-cublas11.txt
deleted file mode 100644
index 1dfb5b98..00000000
--- a/backend/python/exllama/requirements-cublas11.txt
+++ /dev/null
@@ -1,4 +0,0 @@
---extra-index-url https://download.pytorch.org/whl/cu118
-torch
-transformers
-accelerate
\ No newline at end of file
diff --git a/backend/python/exllama/requirements-cublas12.txt b/backend/python/exllama/requirements-cublas12.txt
deleted file mode 100644
index 1ec544cd..00000000
--- a/backend/python/exllama/requirements-cublas12.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-torch
-transformers
-accelerate
\ No newline at end of file
diff --git a/backend/python/exllama/requirements.txt b/backend/python/exllama/requirements.txt
deleted file mode 100644
index b9c192d5..00000000
--- a/backend/python/exllama/requirements.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-grpcio==1.66.1
-protobuf
-certifi
-setuptools
\ No newline at end of file
diff --git a/backend/python/exllama/run.sh b/backend/python/exllama/run.sh
deleted file mode 100755
index 63119689..00000000
--- a/backend/python/exllama/run.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-LIMIT_TARGETS="cublas"
-BACKEND_FILE="${MY_DIR}/source/backend.py"
-
-source $(dirname $0)/../common/libbackend.sh
-
-startBackend $@
\ No newline at end of file
diff --git a/backend/python/exllama/test.sh b/backend/python/exllama/test.sh
deleted file mode 100755
index 6940b066..00000000
--- a/backend/python/exllama/test.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-set -e
-
-source $(dirname $0)/../common/libbackend.sh
-
-runUnittests