mirror of
https://github.com/mudler/LocalAI.git
synced 2024-12-18 20:27:57 +00:00
exllama(v2): fix exllamav1, add exllamav2 (#1384)
* fix(exllama): fix exllama deps with anaconda Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(exllamav2): add exllamav2 backend Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
parent
563c5b7ea0
commit
2b2d6673ff
11
Dockerfile
11
Dockerfile
@ -12,7 +12,7 @@ ARG TARGETARCH
|
|||||||
ARG TARGETVARIANT
|
ARG TARGETVARIANT
|
||||||
|
|
||||||
ENV BUILD_TYPE=${BUILD_TYPE}
|
ENV BUILD_TYPE=${BUILD_TYPE}
|
||||||
ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh"
|
ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
|
||||||
ENV GALLERIES='[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]'
|
ENV GALLERIES='[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]'
|
||||||
ARG GO_TAGS="stablediffusion tts"
|
ARG GO_TAGS="stablediffusion tts"
|
||||||
|
|
||||||
@ -181,16 +181,13 @@ RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
|
|||||||
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
|
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
|
||||||
PATH=$PATH:/opt/conda/bin make -C backend/python/exllama \
|
PATH=$PATH:/opt/conda/bin make -C backend/python/exllama \
|
||||||
; fi
|
; fi
|
||||||
|
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
|
||||||
|
PATH=$PATH:/opt/conda/bin make -C backend/python/exllama2 \
|
||||||
|
; fi
|
||||||
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
|
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
|
||||||
PATH=$PATH:/opt/conda/bin make -C backend/python/petals \
|
PATH=$PATH:/opt/conda/bin make -C backend/python/petals \
|
||||||
; fi
|
; fi
|
||||||
|
|
||||||
# we also copy exllama libs over to resolve exllama import error
|
|
||||||
# TODO: check if this is still needed
|
|
||||||
RUN if [ -d /usr/local/lib/python3.9/dist-packages/exllama ]; then \
|
|
||||||
cp -rfv /usr/local/lib/python3.9/dist-packages/exllama backend/python/exllama/;\
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Define the health check command
|
# Define the health check command
|
||||||
HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
|
HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
|
||||||
CMD curl -f $HEALTHCHECK_ENDPOINT || exit 1
|
CMD curl -f $HEALTHCHECK_ENDPOINT || exit 1
|
||||||
|
2
Makefile
2
Makefile
@ -396,6 +396,7 @@ protogen-python:
|
|||||||
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vall-e-x/ --grpc_python_out=backend/python/vall-e-x/ backend/backend.proto
|
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vall-e-x/ --grpc_python_out=backend/python/vall-e-x/ backend/backend.proto
|
||||||
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vllm/ --grpc_python_out=backend/python/vllm/ backend/backend.proto
|
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vllm/ --grpc_python_out=backend/python/vllm/ backend/backend.proto
|
||||||
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/petals/ --grpc_python_out=backend/python/petals/ backend/backend.proto
|
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/petals/ --grpc_python_out=backend/python/petals/ backend/backend.proto
|
||||||
|
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/exllama2/ --grpc_python_out=backend/python/exllama2/ backend/backend.proto
|
||||||
|
|
||||||
## GRPC
|
## GRPC
|
||||||
# Note: it is duplicated in the Dockerfile
|
# Note: it is duplicated in the Dockerfile
|
||||||
@ -409,6 +410,7 @@ prepare-extra-conda-environments:
|
|||||||
$(MAKE) -C backend/python/vall-e-x
|
$(MAKE) -C backend/python/vall-e-x
|
||||||
$(MAKE) -C backend/python/exllama
|
$(MAKE) -C backend/python/exllama
|
||||||
$(MAKE) -C backend/python/petals
|
$(MAKE) -C backend/python/petals
|
||||||
|
$(MAKE) -C backend/python/exllama2
|
||||||
|
|
||||||
|
|
||||||
backend-assets/grpc:
|
backend-assets/grpc:
|
||||||
|
@ -3,6 +3,7 @@ exllama:
|
|||||||
@echo "Creating virtual environment..."
|
@echo "Creating virtual environment..."
|
||||||
@conda env create --name exllama --file exllama.yml
|
@conda env create --name exllama --file exllama.yml
|
||||||
@echo "Virtual environment created."
|
@echo "Virtual environment created."
|
||||||
|
bash install.sh
|
||||||
|
|
||||||
.PHONY: run
|
.PHONY: run
|
||||||
run:
|
run:
|
||||||
|
@ -13,9 +13,10 @@ from pathlib import Path
|
|||||||
import torch
|
import torch
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
from torch import version as torch_version
|
from torch import version as torch_version
|
||||||
from exllama.generator import ExLlamaGenerator
|
|
||||||
from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig
|
from tokenizer import ExLlamaTokenizer
|
||||||
from exllama.tokenizer import ExLlamaTokenizer
|
from generator import ExLlamaGenerator
|
||||||
|
from model import ExLlama, ExLlamaCache, ExLlamaConfig
|
||||||
|
|
||||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||||
|
|
||||||
|
@ -33,6 +33,7 @@ dependencies:
|
|||||||
- mpmath==1.3.0
|
- mpmath==1.3.0
|
||||||
- networkx==3.1
|
- networkx==3.1
|
||||||
- ninja==1.11.1
|
- ninja==1.11.1
|
||||||
|
- protobuf==4.24.4
|
||||||
- nvidia-cublas-cu12==12.1.3.1
|
- nvidia-cublas-cu12==12.1.3.1
|
||||||
- nvidia-cuda-cupti-cu12==12.1.105
|
- nvidia-cuda-cupti-cu12==12.1.105
|
||||||
- nvidia-cuda-nvrtc-cu12==12.1.105
|
- nvidia-cuda-nvrtc-cu12==12.1.105
|
||||||
@ -45,11 +46,11 @@ dependencies:
|
|||||||
- nvidia-nccl-cu12==2.18.1
|
- nvidia-nccl-cu12==2.18.1
|
||||||
- nvidia-nvjitlink-cu12==12.2.140
|
- nvidia-nvjitlink-cu12==12.2.140
|
||||||
- nvidia-nvtx-cu12==12.1.105
|
- nvidia-nvtx-cu12==12.1.105
|
||||||
- protobuf==4.24.4
|
|
||||||
- safetensors==0.3.2
|
- safetensors==0.3.2
|
||||||
- sentencepiece==0.1.99
|
- sentencepiece==0.1.99
|
||||||
- sympy==1.12
|
- sympy==1.12
|
||||||
- torch==2.1.0
|
- torch==2.1.0
|
||||||
- triton==2.1.0
|
- triton==2.1.0
|
||||||
- typing-extensions==4.8.0
|
- typing-extensions==4.8.0
|
||||||
|
- numpy
|
||||||
prefix: /opt/conda/envs/exllama
|
prefix: /opt/conda/envs/exllama
|
||||||
|
15
backend/python/exllama/install.sh
Executable file
15
backend/python/exllama/install.sh
Executable file
@ -0,0 +1,15 @@
|
|||||||
|
#!/bin/bash

##
## A bash script that installs the required dependencies of exllama and prepares the environment
export PATH=$PATH:/opt/conda/bin

# Activate conda environment
source activate exllama

# Refuse to continue without an active environment: an empty CONDA_PREFIX
# would make the clone below target /exllama.
: "${CONDA_PREFIX:?conda environment 'exllama' is not active}"

# From here on, stop at the first failing step so that a failed clone or
# pip install is reported to the caller instead of being masked by the
# exit status of the final cp.
set -e

echo "$CONDA_PREFIX"

# Fetch the exllama sources into the environment and install their Python
# requirements. The steps are separate commands (not an && chain) so that
# `set -e` applies to each of them.
git clone https://github.com/turboderp/exllama "$CONDA_PREFIX/exllama"
pushd "$CONDA_PREFIX/exllama"
pip install -r requirements.txt
popd

# Copy the exllama sources next to the backend script (the current directory).
cp -rfv "$CONDA_PREFIX"/exllama/* ./
|
12
backend/python/exllama2/Makefile
Normal file
12
backend/python/exllama2/Makefile
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
# Helper targets for the exllama2 Python backend.
# Neither target produces a file of its own name, so both are phony.
.PHONY: exllama2 run

# Create the conda environment described by exllama2.yml, then run install.sh.
exllama2:
	@echo "Creating virtual environment..."
	@conda env create --name exllama2 --file exllama2.yml
	@echo "Virtual environment created."
	bash install.sh

# Start the backend through its run.sh wrapper.
run:
	@echo "Running exllama2..."
	bash run.sh
	@echo "exllama2 run."
|
61
backend/python/exllama2/backend_pb2.py
Normal file
61
backend/python/exllama2/backend_pb2.py
Normal file
File diff suppressed because one or more lines are too long
363
backend/python/exllama2/backend_pb2_grpc.py
Normal file
363
backend/python/exllama2/backend_pb2_grpc.py
Normal file
@ -0,0 +1,363 @@
|
|||||||
|
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
|
||||||
|
"""Client and server classes corresponding to protobuf-defined services."""
|
||||||
|
import grpc
|
||||||
|
|
||||||
|
import backend_pb2 as backend__pb2
|
||||||
|
|
||||||
|
|
||||||
|
class BackendStub(object):
|
||||||
|
"""Missing associated documentation comment in .proto file."""
|
||||||
|
|
||||||
|
def __init__(self, channel):
|
||||||
|
"""Constructor.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
channel: A grpc.Channel.
|
||||||
|
"""
|
||||||
|
self.Health = channel.unary_unary(
|
||||||
|
'/backend.Backend/Health',
|
||||||
|
request_serializer=backend__pb2.HealthMessage.SerializeToString,
|
||||||
|
response_deserializer=backend__pb2.Reply.FromString,
|
||||||
|
)
|
||||||
|
self.Predict = channel.unary_unary(
|
||||||
|
'/backend.Backend/Predict',
|
||||||
|
request_serializer=backend__pb2.PredictOptions.SerializeToString,
|
||||||
|
response_deserializer=backend__pb2.Reply.FromString,
|
||||||
|
)
|
||||||
|
self.LoadModel = channel.unary_unary(
|
||||||
|
'/backend.Backend/LoadModel',
|
||||||
|
request_serializer=backend__pb2.ModelOptions.SerializeToString,
|
||||||
|
response_deserializer=backend__pb2.Result.FromString,
|
||||||
|
)
|
||||||
|
self.PredictStream = channel.unary_stream(
|
||||||
|
'/backend.Backend/PredictStream',
|
||||||
|
request_serializer=backend__pb2.PredictOptions.SerializeToString,
|
||||||
|
response_deserializer=backend__pb2.Reply.FromString,
|
||||||
|
)
|
||||||
|
self.Embedding = channel.unary_unary(
|
||||||
|
'/backend.Backend/Embedding',
|
||||||
|
request_serializer=backend__pb2.PredictOptions.SerializeToString,
|
||||||
|
response_deserializer=backend__pb2.EmbeddingResult.FromString,
|
||||||
|
)
|
||||||
|
self.GenerateImage = channel.unary_unary(
|
||||||
|
'/backend.Backend/GenerateImage',
|
||||||
|
request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
|
||||||
|
response_deserializer=backend__pb2.Result.FromString,
|
||||||
|
)
|
||||||
|
self.AudioTranscription = channel.unary_unary(
|
||||||
|
'/backend.Backend/AudioTranscription',
|
||||||
|
request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
|
||||||
|
response_deserializer=backend__pb2.TranscriptResult.FromString,
|
||||||
|
)
|
||||||
|
self.TTS = channel.unary_unary(
|
||||||
|
'/backend.Backend/TTS',
|
||||||
|
request_serializer=backend__pb2.TTSRequest.SerializeToString,
|
||||||
|
response_deserializer=backend__pb2.Result.FromString,
|
||||||
|
)
|
||||||
|
self.TokenizeString = channel.unary_unary(
|
||||||
|
'/backend.Backend/TokenizeString',
|
||||||
|
request_serializer=backend__pb2.PredictOptions.SerializeToString,
|
||||||
|
response_deserializer=backend__pb2.TokenizationResponse.FromString,
|
||||||
|
)
|
||||||
|
self.Status = channel.unary_unary(
|
||||||
|
'/backend.Backend/Status',
|
||||||
|
request_serializer=backend__pb2.HealthMessage.SerializeToString,
|
||||||
|
response_deserializer=backend__pb2.StatusResponse.FromString,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class BackendServicer(object):
|
||||||
|
"""Missing associated documentation comment in .proto file."""
|
||||||
|
|
||||||
|
def Health(self, request, context):
|
||||||
|
"""Missing associated documentation comment in .proto file."""
|
||||||
|
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||||
|
context.set_details('Method not implemented!')
|
||||||
|
raise NotImplementedError('Method not implemented!')
|
||||||
|
|
||||||
|
def Predict(self, request, context):
|
||||||
|
"""Missing associated documentation comment in .proto file."""
|
||||||
|
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||||
|
context.set_details('Method not implemented!')
|
||||||
|
raise NotImplementedError('Method not implemented!')
|
||||||
|
|
||||||
|
def LoadModel(self, request, context):
|
||||||
|
"""Missing associated documentation comment in .proto file."""
|
||||||
|
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||||
|
context.set_details('Method not implemented!')
|
||||||
|
raise NotImplementedError('Method not implemented!')
|
||||||
|
|
||||||
|
def PredictStream(self, request, context):
|
||||||
|
"""Missing associated documentation comment in .proto file."""
|
||||||
|
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||||
|
context.set_details('Method not implemented!')
|
||||||
|
raise NotImplementedError('Method not implemented!')
|
||||||
|
|
||||||
|
def Embedding(self, request, context):
|
||||||
|
"""Missing associated documentation comment in .proto file."""
|
||||||
|
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||||
|
context.set_details('Method not implemented!')
|
||||||
|
raise NotImplementedError('Method not implemented!')
|
||||||
|
|
||||||
|
def GenerateImage(self, request, context):
|
||||||
|
"""Missing associated documentation comment in .proto file."""
|
||||||
|
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||||
|
context.set_details('Method not implemented!')
|
||||||
|
raise NotImplementedError('Method not implemented!')
|
||||||
|
|
||||||
|
def AudioTranscription(self, request, context):
|
||||||
|
"""Missing associated documentation comment in .proto file."""
|
||||||
|
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||||
|
context.set_details('Method not implemented!')
|
||||||
|
raise NotImplementedError('Method not implemented!')
|
||||||
|
|
||||||
|
def TTS(self, request, context):
|
||||||
|
"""Missing associated documentation comment in .proto file."""
|
||||||
|
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||||
|
context.set_details('Method not implemented!')
|
||||||
|
raise NotImplementedError('Method not implemented!')
|
||||||
|
|
||||||
|
def TokenizeString(self, request, context):
|
||||||
|
"""Missing associated documentation comment in .proto file."""
|
||||||
|
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||||
|
context.set_details('Method not implemented!')
|
||||||
|
raise NotImplementedError('Method not implemented!')
|
||||||
|
|
||||||
|
def Status(self, request, context):
|
||||||
|
"""Missing associated documentation comment in .proto file."""
|
||||||
|
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||||
|
context.set_details('Method not implemented!')
|
||||||
|
raise NotImplementedError('Method not implemented!')
|
||||||
|
|
||||||
|
|
||||||
|
def add_BackendServicer_to_server(servicer, server):
|
||||||
|
rpc_method_handlers = {
|
||||||
|
'Health': grpc.unary_unary_rpc_method_handler(
|
||||||
|
servicer.Health,
|
||||||
|
request_deserializer=backend__pb2.HealthMessage.FromString,
|
||||||
|
response_serializer=backend__pb2.Reply.SerializeToString,
|
||||||
|
),
|
||||||
|
'Predict': grpc.unary_unary_rpc_method_handler(
|
||||||
|
servicer.Predict,
|
||||||
|
request_deserializer=backend__pb2.PredictOptions.FromString,
|
||||||
|
response_serializer=backend__pb2.Reply.SerializeToString,
|
||||||
|
),
|
||||||
|
'LoadModel': grpc.unary_unary_rpc_method_handler(
|
||||||
|
servicer.LoadModel,
|
||||||
|
request_deserializer=backend__pb2.ModelOptions.FromString,
|
||||||
|
response_serializer=backend__pb2.Result.SerializeToString,
|
||||||
|
),
|
||||||
|
'PredictStream': grpc.unary_stream_rpc_method_handler(
|
||||||
|
servicer.PredictStream,
|
||||||
|
request_deserializer=backend__pb2.PredictOptions.FromString,
|
||||||
|
response_serializer=backend__pb2.Reply.SerializeToString,
|
||||||
|
),
|
||||||
|
'Embedding': grpc.unary_unary_rpc_method_handler(
|
||||||
|
servicer.Embedding,
|
||||||
|
request_deserializer=backend__pb2.PredictOptions.FromString,
|
||||||
|
response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
|
||||||
|
),
|
||||||
|
'GenerateImage': grpc.unary_unary_rpc_method_handler(
|
||||||
|
servicer.GenerateImage,
|
||||||
|
request_deserializer=backend__pb2.GenerateImageRequest.FromString,
|
||||||
|
response_serializer=backend__pb2.Result.SerializeToString,
|
||||||
|
),
|
||||||
|
'AudioTranscription': grpc.unary_unary_rpc_method_handler(
|
||||||
|
servicer.AudioTranscription,
|
||||||
|
request_deserializer=backend__pb2.TranscriptRequest.FromString,
|
||||||
|
response_serializer=backend__pb2.TranscriptResult.SerializeToString,
|
||||||
|
),
|
||||||
|
'TTS': grpc.unary_unary_rpc_method_handler(
|
||||||
|
servicer.TTS,
|
||||||
|
request_deserializer=backend__pb2.TTSRequest.FromString,
|
||||||
|
response_serializer=backend__pb2.Result.SerializeToString,
|
||||||
|
),
|
||||||
|
'TokenizeString': grpc.unary_unary_rpc_method_handler(
|
||||||
|
servicer.TokenizeString,
|
||||||
|
request_deserializer=backend__pb2.PredictOptions.FromString,
|
||||||
|
response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
|
||||||
|
),
|
||||||
|
'Status': grpc.unary_unary_rpc_method_handler(
|
||||||
|
servicer.Status,
|
||||||
|
request_deserializer=backend__pb2.HealthMessage.FromString,
|
||||||
|
response_serializer=backend__pb2.StatusResponse.SerializeToString,
|
||||||
|
),
|
||||||
|
}
|
||||||
|
generic_handler = grpc.method_handlers_generic_handler(
|
||||||
|
'backend.Backend', rpc_method_handlers)
|
||||||
|
server.add_generic_rpc_handlers((generic_handler,))
|
||||||
|
|
||||||
|
|
||||||
|
# This class is part of an EXPERIMENTAL API.
|
||||||
|
class Backend(object):
|
||||||
|
"""Missing associated documentation comment in .proto file."""
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def Health(request,
|
||||||
|
target,
|
||||||
|
options=(),
|
||||||
|
channel_credentials=None,
|
||||||
|
call_credentials=None,
|
||||||
|
insecure=False,
|
||||||
|
compression=None,
|
||||||
|
wait_for_ready=None,
|
||||||
|
timeout=None,
|
||||||
|
metadata=None):
|
||||||
|
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
|
||||||
|
backend__pb2.HealthMessage.SerializeToString,
|
||||||
|
backend__pb2.Reply.FromString,
|
||||||
|
options, channel_credentials,
|
||||||
|
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def Predict(request,
|
||||||
|
target,
|
||||||
|
options=(),
|
||||||
|
channel_credentials=None,
|
||||||
|
call_credentials=None,
|
||||||
|
insecure=False,
|
||||||
|
compression=None,
|
||||||
|
wait_for_ready=None,
|
||||||
|
timeout=None,
|
||||||
|
metadata=None):
|
||||||
|
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
|
||||||
|
backend__pb2.PredictOptions.SerializeToString,
|
||||||
|
backend__pb2.Reply.FromString,
|
||||||
|
options, channel_credentials,
|
||||||
|
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def LoadModel(request,
|
||||||
|
target,
|
||||||
|
options=(),
|
||||||
|
channel_credentials=None,
|
||||||
|
call_credentials=None,
|
||||||
|
insecure=False,
|
||||||
|
compression=None,
|
||||||
|
wait_for_ready=None,
|
||||||
|
timeout=None,
|
||||||
|
metadata=None):
|
||||||
|
return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
|
||||||
|
backend__pb2.ModelOptions.SerializeToString,
|
||||||
|
backend__pb2.Result.FromString,
|
||||||
|
options, channel_credentials,
|
||||||
|
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def PredictStream(request,
|
||||||
|
target,
|
||||||
|
options=(),
|
||||||
|
channel_credentials=None,
|
||||||
|
call_credentials=None,
|
||||||
|
insecure=False,
|
||||||
|
compression=None,
|
||||||
|
wait_for_ready=None,
|
||||||
|
timeout=None,
|
||||||
|
metadata=None):
|
||||||
|
return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
|
||||||
|
backend__pb2.PredictOptions.SerializeToString,
|
||||||
|
backend__pb2.Reply.FromString,
|
||||||
|
options, channel_credentials,
|
||||||
|
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def Embedding(request,
|
||||||
|
target,
|
||||||
|
options=(),
|
||||||
|
channel_credentials=None,
|
||||||
|
call_credentials=None,
|
||||||
|
insecure=False,
|
||||||
|
compression=None,
|
||||||
|
wait_for_ready=None,
|
||||||
|
timeout=None,
|
||||||
|
metadata=None):
|
||||||
|
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
|
||||||
|
backend__pb2.PredictOptions.SerializeToString,
|
||||||
|
backend__pb2.EmbeddingResult.FromString,
|
||||||
|
options, channel_credentials,
|
||||||
|
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def GenerateImage(request,
|
||||||
|
target,
|
||||||
|
options=(),
|
||||||
|
channel_credentials=None,
|
||||||
|
call_credentials=None,
|
||||||
|
insecure=False,
|
||||||
|
compression=None,
|
||||||
|
wait_for_ready=None,
|
||||||
|
timeout=None,
|
||||||
|
metadata=None):
|
||||||
|
return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
|
||||||
|
backend__pb2.GenerateImageRequest.SerializeToString,
|
||||||
|
backend__pb2.Result.FromString,
|
||||||
|
options, channel_credentials,
|
||||||
|
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def AudioTranscription(request,
|
||||||
|
target,
|
||||||
|
options=(),
|
||||||
|
channel_credentials=None,
|
||||||
|
call_credentials=None,
|
||||||
|
insecure=False,
|
||||||
|
compression=None,
|
||||||
|
wait_for_ready=None,
|
||||||
|
timeout=None,
|
||||||
|
metadata=None):
|
||||||
|
return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
|
||||||
|
backend__pb2.TranscriptRequest.SerializeToString,
|
||||||
|
backend__pb2.TranscriptResult.FromString,
|
||||||
|
options, channel_credentials,
|
||||||
|
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def TTS(request,
|
||||||
|
target,
|
||||||
|
options=(),
|
||||||
|
channel_credentials=None,
|
||||||
|
call_credentials=None,
|
||||||
|
insecure=False,
|
||||||
|
compression=None,
|
||||||
|
wait_for_ready=None,
|
||||||
|
timeout=None,
|
||||||
|
metadata=None):
|
||||||
|
return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
|
||||||
|
backend__pb2.TTSRequest.SerializeToString,
|
||||||
|
backend__pb2.Result.FromString,
|
||||||
|
options, channel_credentials,
|
||||||
|
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def TokenizeString(request,
|
||||||
|
target,
|
||||||
|
options=(),
|
||||||
|
channel_credentials=None,
|
||||||
|
call_credentials=None,
|
||||||
|
insecure=False,
|
||||||
|
compression=None,
|
||||||
|
wait_for_ready=None,
|
||||||
|
timeout=None,
|
||||||
|
metadata=None):
|
||||||
|
return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
|
||||||
|
backend__pb2.PredictOptions.SerializeToString,
|
||||||
|
backend__pb2.TokenizationResponse.FromString,
|
||||||
|
options, channel_credentials,
|
||||||
|
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def Status(request,
|
||||||
|
target,
|
||||||
|
options=(),
|
||||||
|
channel_credentials=None,
|
||||||
|
call_credentials=None,
|
||||||
|
insecure=False,
|
||||||
|
compression=None,
|
||||||
|
wait_for_ready=None,
|
||||||
|
timeout=None,
|
||||||
|
metadata=None):
|
||||||
|
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
|
||||||
|
backend__pb2.HealthMessage.SerializeToString,
|
||||||
|
backend__pb2.StatusResponse.FromString,
|
||||||
|
options, channel_credentials,
|
||||||
|
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
57
backend/python/exllama2/exllama2.yml
Normal file
57
backend/python/exllama2/exllama2.yml
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
name: exllama2
|
||||||
|
channels:
|
||||||
|
- defaults
|
||||||
|
dependencies:
|
||||||
|
- _libgcc_mutex=0.1=main
|
||||||
|
- _openmp_mutex=5.1=1_gnu
|
||||||
|
- bzip2=1.0.8=h7b6447c_0
|
||||||
|
- ca-certificates=2023.08.22=h06a4308_0
|
||||||
|
- ld_impl_linux-64=2.38=h1181459_1
|
||||||
|
- libffi=3.4.4=h6a678d5_0
|
||||||
|
- libgcc-ng=11.2.0=h1234567_1
|
||||||
|
- libgomp=11.2.0=h1234567_1
|
||||||
|
- libstdcxx-ng=11.2.0=h1234567_1
|
||||||
|
- libuuid=1.41.5=h5eee18b_0
|
||||||
|
- ncurses=6.4=h6a678d5_0
|
||||||
|
- openssl=3.0.11=h7f8727e_2
|
||||||
|
- pip=23.2.1=py311h06a4308_0
|
||||||
|
- python=3.11.5=h955ad1f_0
|
||||||
|
- readline=8.2=h5eee18b_0
|
||||||
|
- setuptools=68.0.0=py311h06a4308_0
|
||||||
|
- sqlite=3.41.2=h5eee18b_0
|
||||||
|
- tk=8.6.12=h1ccaba5_0
|
||||||
|
- tzdata=2023c=h04d1e81_0
|
||||||
|
- wheel=0.41.2=py311h06a4308_0
|
||||||
|
- xz=5.4.2=h5eee18b_0
|
||||||
|
- zlib=1.2.13=h5eee18b_0
|
||||||
|
- pip:
|
||||||
|
- filelock==3.12.4
|
||||||
|
- fsspec==2023.9.2
|
||||||
|
- grpcio==1.59.0
|
||||||
|
- markupsafe==2.1.3
|
||||||
|
- mpmath==1.3.0
|
||||||
|
- networkx==3.1
|
||||||
|
- protobuf==4.24.4
|
||||||
|
- nvidia-cublas-cu12==12.1.3.1
|
||||||
|
- nvidia-cuda-cupti-cu12==12.1.105
|
||||||
|
- nvidia-cuda-nvrtc-cu12==12.1.105
|
||||||
|
- nvidia-cuda-runtime-cu12==12.1.105
|
||||||
|
- nvidia-cudnn-cu12==8.9.2.26
|
||||||
|
- nvidia-cufft-cu12==11.0.2.54
|
||||||
|
- nvidia-curand-cu12==10.3.2.106
|
||||||
|
- nvidia-cusolver-cu12==11.4.5.107
|
||||||
|
- nvidia-cusparse-cu12==12.1.0.106
|
||||||
|
- nvidia-nccl-cu12==2.18.1
|
||||||
|
- nvidia-nvjitlink-cu12==12.2.140
|
||||||
|
- nvidia-nvtx-cu12==12.1.105
|
||||||
|
- pandas
|
||||||
|
- numpy
|
||||||
|
- ninja
|
||||||
|
- fastparquet
|
||||||
|
- torch>=2.1.0
|
||||||
|
- safetensors>=0.3.2
|
||||||
|
- sentencepiece>=0.1.97
|
||||||
|
- pygments
|
||||||
|
- websockets
|
||||||
|
- regex
|
||||||
|
prefix: /opt/conda/envs/exllama2
|
134
backend/python/exllama2/exllama2_backend.py
Executable file
134
backend/python/exllama2/exllama2_backend.py
Executable file
@ -0,0 +1,134 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import grpc
|
||||||
|
from concurrent import futures
|
||||||
|
import time
|
||||||
|
import backend_pb2
|
||||||
|
import backend_pb2_grpc
|
||||||
|
import argparse
|
||||||
|
import signal
|
||||||
|
import sys
|
||||||
|
import os, glob
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from torch import version as torch_version
|
||||||
|
|
||||||
|
|
||||||
|
from exllamav2.generator import (
|
||||||
|
ExLlamaV2BaseGenerator,
|
||||||
|
ExLlamaV2Sampler
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
from exllamav2 import(
|
||||||
|
ExLlamaV2,
|
||||||
|
ExLlamaV2Config,
|
||||||
|
ExLlamaV2Cache,
|
||||||
|
ExLlamaV2Cache_8bit,
|
||||||
|
ExLlamaV2Tokenizer,
|
||||||
|
model_init,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||||
|
|
||||||
|
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
|
||||||
|
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
|
||||||
|
|
||||||
|
# Implement the BackendServicer class with the service methods
|
||||||
|
class BackendServicer(backend_pb2_grpc.BackendServicer):
    """gRPC servicer exposing exllamav2 text generation.

    ``LoadModel`` must succeed before ``Predict``/``PredictStream`` are called:
    it is the only place where ``self.generator``, ``self.model``,
    ``self.tokenizer`` and ``self.cache`` are assigned.
    """

    def Health(self, request, context):
        """Liveness probe; always replies with the bytes ``OK``."""
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))

    def LoadModel(self, request, context):
        """Load the model found in ``request.ModelFile`` and build a generator.

        Returns a ``Result`` with ``success=True`` on success. Any exception
        raised while loading is caught and reported as ``success=False`` with
        the error rendered into ``message``.
        """
        try:
            model_directory = request.ModelFile

            config = ExLlamaV2Config()
            config.model_dir = model_directory
            config.prepare()

            model = ExLlamaV2(config)

            # The cache is created lazily and handed to load_autosplit.
            cache = ExLlamaV2Cache(model, lazy = True)
            model.load_autosplit(cache)

            tokenizer = ExLlamaV2Tokenizer(config)

            # Initialize generator
            generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)

            self.generator = generator

            generator.warmup()
            self.model = model
            self.tokenizer = tokenizer
            self.cache = cache
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)

    def Predict(self, request, context):
        """Generate a completion for ``request.Prompt``.

        Sampling uses ``request.Temperature``, ``request.TopK`` and
        ``request.TopP``. ``request.Penalty`` and ``request.Tokens`` fall back
        to 1.15 and 512 respectively when they are zero.

        Returns a ``Reply`` whose ``message`` is the UTF-8 encoded generated
        text with every occurrence of the prompt removed.
        """
        penalty = 1.15
        if request.Penalty != 0.0:
            penalty = request.Penalty

        settings = ExLlamaV2Sampler.Settings()
        settings.temperature = request.Temperature
        settings.top_k = request.TopK
        settings.top_p = request.TopP
        settings.token_repetition_penalty = penalty
        settings.disallow_tokens(self.tokenizer, [self.tokenizer.eos_token_id])

        tokens = 512
        if request.Tokens != 0:
            tokens = request.Tokens

        # Nothing in this class assigns ``self.seed``, so reading it directly
        # raised AttributeError. Fall back to None when it is absent.
        # NOTE(review): assumes generate_simple accepts seed=None as "do not
        # seed" -- confirm against the installed exllamav2 version.
        seed = getattr(self, "seed", None)
        output = self.generator.generate_simple(request.Prompt, settings, tokens, seed = seed)

        # Remove prompt from response if present
        if request.Prompt in output:
            output = output.replace(request.Prompt, "")

        # The Predict RPC is registered with a Reply serializer, and the text
        # to send is ``output`` (the previous code referenced an undefined
        # name ``t`` and built a Result).
        return backend_pb2.Reply(message=bytes(output, encoding='utf-8'))

    def PredictStream(self, request, context):
        """Streaming variant of ``Predict``.

        Token-by-token streaming is not implemented yet: the whole completion
        is produced by ``Predict`` and emitted as a single stream element.
        """
        # PredictStream is registered as a unary-stream handler, so it has to
        # produce an iterable of replies rather than return one message.
        yield self.Predict(request, context)
|
||||||
|
|
||||||
|
|
||||||
|
def serve(address):
    """Start the gRPC backend on ``address`` and block until interrupted.

    The server runs on a thread pool of ``MAX_WORKERS`` workers and listens on
    an insecure port. SIGINT and SIGTERM stop the server and exit the process
    with status 0.
    """
    grpc_server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), grpc_server)
    grpc_server.add_insecure_port(address)
    grpc_server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)

    def _shutdown(signum, stack_frame):
        # Stop immediately (grace period 0) and terminate the process.
        print("Received termination signal. Shutting down...")
        grpc_server.stop(0)
        sys.exit(0)

    # Same handler for both termination signals.
    for handled_signal in (signal.SIGINT, signal.SIGTERM):
        signal.signal(handled_signal, _shutdown)

    # Keep the main thread alive; the server itself runs on worker threads.
    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        grpc_server.stop(0)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: parse the bind address and start serving.
    cli = argparse.ArgumentParser(description="Run the gRPC server.")
    cli.add_argument(
        "--addr", default="localhost:50051", help="The address to bind the server to."
    )
    serve(cli.parse_args().addr)
|
14
backend/python/exllama2/install.sh
Executable file
14
backend/python/exllama2/install.sh
Executable file
@ -0,0 +1,14 @@
|
|||||||
|
#!/bin/bash

##
## A bash script that installs the required dependencies of exllamav2 and prepares the environment
export PATH=$PATH:/opt/conda/bin

# Activate conda environment
source activate exllama2

# Refuse to continue without an active environment: an empty CONDA_PREFIX
# would make the clone below target /exllamav2.
: "${CONDA_PREFIX:?conda environment 'exllama2' is not active}"

# From here on, stop at the first failing step so that a failed clone or
# pip install is reported to the caller instead of being masked by the
# exit status of the final cp.
set -e

echo "$CONDA_PREFIX"

# Fetch the exllamav2 sources into the environment and install their Python
# requirements. The steps are separate commands (not an && chain) so that
# `set -e` applies to each of them.
git clone https://github.com/turboderp/exllamav2 "$CONDA_PREFIX/exllamav2"
pushd "$CONDA_PREFIX/exllamav2"
pip install -r requirements.txt
popd

# Copy the exllamav2 sources next to the backend script (the current directory).
cp -rfv "$CONDA_PREFIX"/exllamav2/* ./
|
14
backend/python/exllama2/run.sh
Executable file
14
backend/python/exllama2/run.sh
Executable file
@ -0,0 +1,14 @@
|
|||||||
|
#!/bin/bash

##
## A bash script wrapper that runs the exllama2 server with conda

export PATH=$PATH:/opt/conda/bin

# Activate conda environment
source activate exllama2

# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

# Quote the script path and forward the arguments as "$@" so that paths and
# argument values containing whitespace reach the backend unsplit.
python "$DIR/exllama2_backend.py" "$@"
|
Loading…
Reference in New Issue
Block a user