From a9b0e264f2dec63c7bcd9c2d8e98194297cce065 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 13 Sep 2024 19:08:50 +0200 Subject: [PATCH] chore(exllama): drop exllama backend For polishing and cleaning up it makes now sense to drop exllama which is completely unmaintained, and was only supporting the llamav1 architecture (nowadays it's superseded by llamav1) . Signed-off-by: Ettore Di Giacinto --- Dockerfile | 5 +- Makefile | 13 +- backend/python/exllama/.gitignore | 1 - backend/python/exllama/Makefile | 25 --- backend/python/exllama/README.md | 5 - backend/python/exllama/backend.py | 159 ------------------ backend/python/exllama/install.sh | 13 -- backend/python/exllama/requirements-cpu.txt | 3 - .../python/exllama/requirements-cublas11.txt | 4 - .../python/exllama/requirements-cublas12.txt | 3 - backend/python/exllama/requirements.txt | 4 - backend/python/exllama/run.sh | 7 - backend/python/exllama/test.sh | 6 - 13 files changed, 3 insertions(+), 245 deletions(-) delete mode 100644 backend/python/exllama/.gitignore delete mode 100644 backend/python/exllama/Makefile delete mode 100644 backend/python/exllama/README.md delete mode 100755 backend/python/exllama/backend.py delete mode 100755 backend/python/exllama/install.sh delete mode 100644 backend/python/exllama/requirements-cpu.txt delete mode 100644 backend/python/exllama/requirements-cublas11.txt delete mode 100644 backend/python/exllama/requirements-cublas12.txt delete mode 100644 backend/python/exllama/requirements.txt delete mode 100755 backend/python/exllama/run.sh delete mode 100755 backend/python/exllama/test.sh diff --git a/Dockerfile b/Dockerfile index b86cc706..f08cb9a0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,7 @@ ARG TARGETARCH ARG TARGETVARIANT ENV DEBIAN_FRONTEND=noninteractive -ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh" +ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh" RUN apt-get update && \ @@ -418,9 +418,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAG ; fi && \ if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ make -C backend/python/transformers-musicgen \ - ; fi && \ - if [[ ( "${EXTRA_BACKENDS}" =~ "exllama1" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ - make -C backend/python/exllama \ ; fi RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ diff --git a/Makefile b/Makefile index 3d9ea592..a3f0ffd0 100644 --- a/Makefile +++ b/Makefile @@ -534,10 +534,10 @@ protogen-go-clean: $(RM) bin/* .PHONY: protogen-python -protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen +protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen .PHONY: protogen-python-clean -protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean +protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean .PHONY: autogptq-protogen autogptq-protogen: @@ -571,14 +571,6 @@ diffusers-protogen: diffusers-protogen-clean: $(MAKE) -C backend/python/diffusers protogen-clean -.PHONY: exllama-protogen -exllama-protogen: - $(MAKE) -C backend/python/exllama protogen - -.PHONY: exllama-protogen-clean -exllama-protogen-clean: - $(MAKE) -C backend/python/exllama protogen-clean - .PHONY: exllama2-protogen exllama2-protogen: $(MAKE) -C backend/python/exllama2 protogen @@ -675,7 +667,6 @@ prepare-extra-conda-environments: protogen-python $(MAKE) -C backend/python/parler-tts $(MAKE) -C backend/python/vall-e-x $(MAKE) -C backend/python/openvoice - $(MAKE) -C backend/python/exllama $(MAKE) -C backend/python/exllama2 prepare-test-extra: protogen-python diff --git a/backend/python/exllama/.gitignore b/backend/python/exllama/.gitignore deleted file mode 100644 index 1d3a0654..00000000 --- a/backend/python/exllama/.gitignore +++ /dev/null @@ -1 +0,0 @@ -source \ No newline at end of file diff --git a/backend/python/exllama/Makefile b/backend/python/exllama/Makefile deleted file mode 100644 index e6a67881..00000000 --- a/backend/python/exllama/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -export CONDA_ENV_PATH = "exllama.yml" - -.PHONY: exllama -exllama: protogen - bash install.sh ${CONDA_ENV_PATH} - -.PHONY: run -run: protogen - @echo "Running exllama..." - bash run.sh - @echo "exllama run." - -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - -.PHONY: protogen-clean -protogen-clean: - $(RM) backend_pb2_grpc.py backend_pb2.py - -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto - -.PHONY: clean -clean: protogen-clean - $(RM) -r venv source __pycache__ \ No newline at end of file diff --git a/backend/python/exllama/README.md b/backend/python/exllama/README.md deleted file mode 100644 index f9ed5e9f..00000000 --- a/backend/python/exllama/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Creating a separate environment for the exllama project - -``` -make exllama -``` \ No newline at end of file diff --git a/backend/python/exllama/backend.py b/backend/python/exllama/backend.py deleted file mode 100755 index 58d1392c..00000000 --- a/backend/python/exllama/backend.py +++ /dev/null @@ -1,159 +0,0 @@ -#!/usr/bin/env python3 -import grpc -from concurrent import futures -import time -import backend_pb2 -import backend_pb2_grpc -import argparse -import signal -import sys -import os, glob - -from pathlib import Path -import torch -import torch.nn.functional as F -from torch import version as torch_version - -from source.tokenizer import ExLlamaTokenizer -from source.generator import ExLlamaGenerator -from source.model import ExLlama, ExLlamaCache, ExLlamaConfig - -_ONE_DAY_IN_SECONDS = 60 * 60 * 24 - -# If MAX_WORKERS are specified in the environment use it, otherwise default to 1 -MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) - -# Implement the BackendServicer class with the service methods -class BackendServicer(backend_pb2_grpc.BackendServicer): - def generate(self,prompt, max_new_tokens): - self.generator.end_beam_search() - - # Tokenizing the input - ids = self.generator.tokenizer.encode(prompt) - - self.generator.gen_begin_reuse(ids) - initial_len = self.generator.sequence[0].shape[0] - has_leading_space = False - decoded_text = '' - for i in range(max_new_tokens): - token = self.generator.gen_single_token() - if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'): - has_leading_space = True - - decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:]) - if has_leading_space: - decoded_text = ' ' + decoded_text - - if token.item() == self.generator.tokenizer.eos_token_id: - break - return decoded_text - def Health(self, request, context): - return backend_pb2.Reply(message=bytes("OK", 'utf-8')) - def LoadModel(self, request, context): - try: - # https://github.com/turboderp/exllama/blob/master/example_cfg.py - model_directory = request.ModelFile - - # Locate files we need within that directory - tokenizer_path = os.path.join(model_directory, "tokenizer.model") - model_config_path = os.path.join(model_directory, "config.json") - st_pattern = os.path.join(model_directory, "*.safetensors") - model_path = glob.glob(st_pattern)[0] - - # Create config, model, tokenizer and generator - - config = ExLlamaConfig(model_config_path) # create config from config.json - config.model_path = model_path # supply path to model weights file - if (request.ContextSize): - config.max_seq_len = request.ContextSize # override max sequence length - config.max_attention_size = request.ContextSize**2 # Should be set to context_size^2. - # https://github.com/turboderp/exllama/issues/220#issuecomment-1720324163 - - # Set Rope scaling. - if (request.RopeFreqScale): - # Alpha value for Rope scaling. - # Higher value increases context but adds perplexity. - # alpha_value and compress_pos_emb are mutually exclusive. - # https://github.com/turboderp/exllama/issues/115 - config.alpha_value = request.RopeFreqScale - config.calculate_rotary_embedding_base() - - model = ExLlama(config) # create ExLlama instance and load the weights - tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file - - cache = ExLlamaCache(model, batch_size = 2) # create cache for inference - generator = ExLlamaGenerator(model, tokenizer, cache) # create generator - - self.generator= generator - self.model = model - self.tokenizer = tokenizer - self.cache = cache - except Exception as err: - return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") - return backend_pb2.Result(message="Model loaded successfully", success=True) - - def Predict(self, request, context): - penalty = 1.15 - if request.Penalty != 0.0: - penalty = request.Penalty - self.generator.settings.token_repetition_penalty_max = penalty - self.generator.settings.temperature = request.Temperature - self.generator.settings.top_k = request.TopK - self.generator.settings.top_p = request.TopP - - tokens = 512 - if request.Tokens != 0: - tokens = request.Tokens - - if self.cache.batch_size == 1: - del self.cache - self.cache = ExLlamaCache(self.model, batch_size=2) - self.generator = ExLlamaGenerator(self.model, self.tokenizer, self.cache) - - t = self.generate(request.Prompt, tokens) - - # Remove prompt from response if present - if request.Prompt in t: - t = t.replace(request.Prompt, "") - - return backend_pb2.Result(message=bytes(t, encoding='utf-8')) - - def PredictStream(self, request, context): - # Implement PredictStream RPC - #for reply in some_data_generator(): - # yield reply - # Not implemented yet - return self.Predict(request, context) - - -def serve(address): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)) - backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) - server.add_insecure_port(address) - server.start() - print("Server started. Listening on: " + address, file=sys.stderr) - - # Define the signal handler function - def signal_handler(sig, frame): - print("Received termination signal. Shutting down...") - server.stop(0) - sys.exit(0) - - # Set the signal handlers for SIGINT and SIGTERM - signal.signal(signal.SIGINT, signal_handler) - signal.signal(signal.SIGTERM, signal_handler) - - try: - while True: - time.sleep(_ONE_DAY_IN_SECONDS) - except KeyboardInterrupt: - server.stop(0) - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Run the gRPC server.") - parser.add_argument( - "--addr", default="localhost:50051", help="The address to bind the server to." - ) - args = parser.parse_args() - - serve(args.addr) \ No newline at end of file diff --git a/backend/python/exllama/install.sh b/backend/python/exllama/install.sh deleted file mode 100755 index d33c4356..00000000 --- a/backend/python/exllama/install.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -set -e - -LIMIT_TARGETS="cublas" - -source $(dirname $0)/../common/libbackend.sh - -installRequirements - -git clone https://github.com/turboderp/exllama $MY_DIR/source -uv pip install ${BUILD_ISOLATION_FLAG} --requirement ${MY_DIR}/source/requirements.txt - -cp -v ./*py $MY_DIR/source/ diff --git a/backend/python/exllama/requirements-cpu.txt b/backend/python/exllama/requirements-cpu.txt deleted file mode 100644 index bbcdc8cd..00000000 --- a/backend/python/exllama/requirements-cpu.txt +++ /dev/null @@ -1,3 +0,0 @@ -transformers -accelerate -torch \ No newline at end of file diff --git a/backend/python/exllama/requirements-cublas11.txt b/backend/python/exllama/requirements-cublas11.txt deleted file mode 100644 index 1dfb5b98..00000000 --- a/backend/python/exllama/requirements-cublas11.txt +++ /dev/null @@ -1,4 +0,0 @@ ---extra-index-url https://download.pytorch.org/whl/cu118 -torch -transformers -accelerate \ No newline at end of file diff --git a/backend/python/exllama/requirements-cublas12.txt b/backend/python/exllama/requirements-cublas12.txt deleted file mode 100644 index 1ec544cd..00000000 --- a/backend/python/exllama/requirements-cublas12.txt +++ /dev/null @@ -1,3 +0,0 @@ -torch -transformers -accelerate \ No newline at end of file diff --git a/backend/python/exllama/requirements.txt b/backend/python/exllama/requirements.txt deleted file mode 100644 index b9c192d5..00000000 --- a/backend/python/exllama/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -grpcio==1.66.1 -protobuf -certifi -setuptools \ No newline at end of file diff --git a/backend/python/exllama/run.sh b/backend/python/exllama/run.sh deleted file mode 100755 index 63119689..00000000 --- a/backend/python/exllama/run.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash -LIMIT_TARGETS="cublas" -BACKEND_FILE="${MY_DIR}/source/backend.py" - -source $(dirname $0)/../common/libbackend.sh - -startBackend $@ \ No newline at end of file diff --git a/backend/python/exllama/test.sh b/backend/python/exllama/test.sh deleted file mode 100755 index 6940b066..00000000 --- a/backend/python/exllama/test.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -set -e - -source $(dirname $0)/../common/libbackend.sh - -runUnittests