diff --git a/Dockerfile b/Dockerfile
index b86cc706..f08cb9a0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -13,7 +13,7 @@ ARG TARGETARCH
 ARG TARGETVARIANT
 
 ENV DEBIAN_FRONTEND=noninteractive
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
 
 RUN apt-get update && \
@@ -418,9 +418,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAG
     ; fi && \
     if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
         make -C backend/python/transformers-musicgen \
-    ; fi && \
-    if [[ ( "${EXTRA_BACKENDS}" =~ "exllama1" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
-        make -C backend/python/exllama \
     ; fi
 
 RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
diff --git a/Makefile b/Makefile
index 3d9ea592..a3f0ffd0 100644
--- a/Makefile
+++ b/Makefile
@@ -534,10 +534,10 @@ protogen-go-clean:
 	$(RM) bin/*
 
 .PHONY: protogen-python
-protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen
+protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen
 
 .PHONY: protogen-python-clean
-protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean
+protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean
 
 .PHONY: autogptq-protogen
 autogptq-protogen:
@@ -571,14 +571,6 @@ diffusers-protogen:
 diffusers-protogen-clean:
 	$(MAKE) -C backend/python/diffusers protogen-clean
 
-.PHONY: exllama-protogen
-exllama-protogen:
-	$(MAKE) -C backend/python/exllama protogen
-
-.PHONY: exllama-protogen-clean
-exllama-protogen-clean:
-	$(MAKE) -C backend/python/exllama protogen-clean
-
 .PHONY: exllama2-protogen
 exllama2-protogen:
 	$(MAKE) -C backend/python/exllama2 protogen
@@ -675,7 +667,6 @@ prepare-extra-conda-environments: protogen-python
 	$(MAKE) -C backend/python/parler-tts
 	$(MAKE) -C backend/python/vall-e-x
 	$(MAKE) -C backend/python/openvoice
-	$(MAKE) -C backend/python/exllama
 	$(MAKE) -C backend/python/exllama2
 
 prepare-test-extra: protogen-python
diff --git a/backend/python/exllama/.gitignore b/backend/python/exllama/.gitignore
deleted file mode 100644
index 1d3a0654..00000000
--- a/backend/python/exllama/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-source
\ No newline at end of file
diff --git a/backend/python/exllama/Makefile b/backend/python/exllama/Makefile
deleted file mode 100644
index e6a67881..00000000
--- a/backend/python/exllama/Makefile
+++ /dev/null
@@ -1,25 +0,0 @@
-export CONDA_ENV_PATH = "exllama.yml"
-
-.PHONY: exllama
-exllama: protogen
-	bash install.sh ${CONDA_ENV_PATH}
-
-.PHONY: run
-run: protogen
-	@echo "Running exllama..."
-	bash run.sh
-	@echo "exllama run."
-
-.PHONY: protogen
-protogen: backend_pb2_grpc.py backend_pb2.py
-
-.PHONY: protogen-clean
-protogen-clean:
-	$(RM) backend_pb2_grpc.py backend_pb2.py
-
-backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
-
-.PHONY: clean
-clean: protogen-clean
-	$(RM) -r venv source __pycache__
\ No newline at end of file
diff --git a/backend/python/exllama/README.md b/backend/python/exllama/README.md
deleted file mode 100644
index f9ed5e9f..00000000
--- a/backend/python/exllama/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# Creating a separate environment for the exllama project
-
-```
-make exllama
-```
\ No newline at end of file
diff --git a/backend/python/exllama/backend.py b/backend/python/exllama/backend.py
deleted file mode 100755
index 58d1392c..00000000
--- a/backend/python/exllama/backend.py
+++ /dev/null
@@ -1,159 +0,0 @@
-#!/usr/bin/env python3
-import grpc
-from concurrent import futures
-import time
-import backend_pb2
-import backend_pb2_grpc
-import argparse
-import signal
-import sys
-import os, glob
-
-from pathlib import Path
-import torch
-import torch.nn.functional as F
-from torch import version as torch_version
-
-from source.tokenizer import ExLlamaTokenizer
-from source.generator import ExLlamaGenerator
-from source.model import ExLlama, ExLlamaCache, ExLlamaConfig
-
-_ONE_DAY_IN_SECONDS = 60 * 60 * 24
-
-# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
-MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
-
-# Implement the BackendServicer class with the service methods
-class BackendServicer(backend_pb2_grpc.BackendServicer):
-    def generate(self,prompt, max_new_tokens):
-        self.generator.end_beam_search()
-
-        # Tokenizing the input
-        ids = self.generator.tokenizer.encode(prompt)
-
-        self.generator.gen_begin_reuse(ids)
-        initial_len = self.generator.sequence[0].shape[0]
-        has_leading_space = False
-        decoded_text = ''
-        for i in range(max_new_tokens):
-            token = self.generator.gen_single_token()
-            if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'):
-                has_leading_space = True
-
-            decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:])
-            if has_leading_space:
-                decoded_text = ' ' + decoded_text
-
-            if token.item() == self.generator.tokenizer.eos_token_id:
-                break
-        return decoded_text
-    def Health(self, request, context):
-        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
-    def LoadModel(self, request, context):
-        try:
-            # https://github.com/turboderp/exllama/blob/master/example_cfg.py
-            model_directory = request.ModelFile
-
-            # Locate files we need within that directory
-            tokenizer_path = os.path.join(model_directory, "tokenizer.model")
-            model_config_path = os.path.join(model_directory, "config.json")
-            st_pattern = os.path.join(model_directory, "*.safetensors")
-            model_path = glob.glob(st_pattern)[0]
-
-            # Create config, model, tokenizer and generator
-
-            config = ExLlamaConfig(model_config_path)  # create config from config.json
-            config.model_path = model_path  # supply path to model weights file
-            if (request.ContextSize):
-                config.max_seq_len = request.ContextSize  # override max sequence length
-                config.max_attention_size = request.ContextSize**2  # Should be set to context_size^2.
-                # https://github.com/turboderp/exllama/issues/220#issuecomment-1720324163
-
-            # Set Rope scaling.
-            if (request.RopeFreqScale):
-                # Alpha value for Rope scaling.
-                # Higher value increases context but adds perplexity.
-                # alpha_value and compress_pos_emb are mutually exclusive.
-                # https://github.com/turboderp/exllama/issues/115
-                config.alpha_value = request.RopeFreqScale
-                config.calculate_rotary_embedding_base()
-
-            model = ExLlama(config)  # create ExLlama instance and load the weights
-            tokenizer = ExLlamaTokenizer(tokenizer_path)  # create tokenizer from tokenizer model file
-
-            cache = ExLlamaCache(model, batch_size = 2)  # create cache for inference
-            generator = ExLlamaGenerator(model, tokenizer, cache)  # create generator
-
-            self.generator= generator
-            self.model = model
-            self.tokenizer = tokenizer
-            self.cache = cache
-        except Exception as err:
-            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
-        return backend_pb2.Result(message="Model loaded successfully", success=True)
-
-    def Predict(self, request, context):
-        penalty = 1.15
-        if request.Penalty != 0.0:
-            penalty = request.Penalty
-        self.generator.settings.token_repetition_penalty_max = penalty
-        self.generator.settings.temperature = request.Temperature
-        self.generator.settings.top_k = request.TopK
-        self.generator.settings.top_p = request.TopP
-
-        tokens = 512
-        if request.Tokens != 0:
-            tokens = request.Tokens
-
-        if self.cache.batch_size == 1:
-            del self.cache
-            self.cache = ExLlamaCache(self.model, batch_size=2)
-            self.generator = ExLlamaGenerator(self.model, self.tokenizer, self.cache)
-
-        t = self.generate(request.Prompt, tokens)
-
-        # Remove prompt from response if present
-        if request.Prompt in t:
-            t = t.replace(request.Prompt, "")
-
-        return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
-
-    def PredictStream(self, request, context):
-        # Implement PredictStream RPC
-        #for reply in some_data_generator():
-        #    yield reply
-        # Not implemented yet
-        return self.Predict(request, context)
-
-
-def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
-    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
-    server.add_insecure_port(address)
-    server.start()
-    print("Server started. Listening on: " + address, file=sys.stderr)
-
-    # Define the signal handler function
-    def signal_handler(sig, frame):
-        print("Received termination signal. Shutting down...")
-        server.stop(0)
-        sys.exit(0)
-
-    # Set the signal handlers for SIGINT and SIGTERM
-    signal.signal(signal.SIGINT, signal_handler)
-    signal.signal(signal.SIGTERM, signal_handler)
-
-    try:
-        while True:
-            time.sleep(_ONE_DAY_IN_SECONDS)
-    except KeyboardInterrupt:
-        server.stop(0)
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Run the gRPC server.")
-    parser.add_argument(
-        "--addr", default="localhost:50051", help="The address to bind the server to."
-    )
-    args = parser.parse_args()
-
-    serve(args.addr)
\ No newline at end of file
diff --git a/backend/python/exllama/install.sh b/backend/python/exllama/install.sh
deleted file mode 100755
index d33c4356..00000000
--- a/backend/python/exllama/install.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-set -e
-
-LIMIT_TARGETS="cublas"
-
-source $(dirname $0)/../common/libbackend.sh
-
-installRequirements
-
-git clone https://github.com/turboderp/exllama $MY_DIR/source
-uv pip install ${BUILD_ISOLATION_FLAG} --requirement ${MY_DIR}/source/requirements.txt
-
-cp -v ./*py $MY_DIR/source/
diff --git a/backend/python/exllama/requirements-cpu.txt b/backend/python/exllama/requirements-cpu.txt
deleted file mode 100644
index bbcdc8cd..00000000
--- a/backend/python/exllama/requirements-cpu.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-transformers
-accelerate
-torch
\ No newline at end of file
diff --git a/backend/python/exllama/requirements-cublas11.txt b/backend/python/exllama/requirements-cublas11.txt
deleted file mode 100644
index 1dfb5b98..00000000
--- a/backend/python/exllama/requirements-cublas11.txt
+++ /dev/null
@@ -1,4 +0,0 @@
---extra-index-url https://download.pytorch.org/whl/cu118
-torch
-transformers
-accelerate
\ No newline at end of file
diff --git a/backend/python/exllama/requirements-cublas12.txt b/backend/python/exllama/requirements-cublas12.txt
deleted file mode 100644
index 1ec544cd..00000000
--- a/backend/python/exllama/requirements-cublas12.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-torch
-transformers
-accelerate
\ No newline at end of file
diff --git a/backend/python/exllama/requirements.txt b/backend/python/exllama/requirements.txt
deleted file mode 100644
index b9c192d5..00000000
--- a/backend/python/exllama/requirements.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-grpcio==1.66.1
-protobuf
-certifi
-setuptools
\ No newline at end of file
diff --git a/backend/python/exllama/run.sh b/backend/python/exllama/run.sh
deleted file mode 100755
index 63119689..00000000
--- a/backend/python/exllama/run.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-LIMIT_TARGETS="cublas"
-BACKEND_FILE="${MY_DIR}/source/backend.py"
-
-source $(dirname $0)/../common/libbackend.sh
-
-startBackend $@
\ No newline at end of file
diff --git a/backend/python/exllama/test.sh b/backend/python/exllama/test.sh
deleted file mode 100755
index 6940b066..00000000
--- a/backend/python/exllama/test.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-set -e
-
-source $(dirname $0)/../common/libbackend.sh
-
-runUnittests