feat(vllm): Initial vllm backend implementation

Related to: https://github.com/go-skynet/LocalAI/issues/1015

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto 2023-09-09 16:46:13 +02:00
parent cc74fc93b4
commit c0bb5c4bf6
5 changed files with 527 additions and 2 deletions

View File

@ -11,7 +11,7 @@ ARG TARGETARCH
ARG TARGETVARIANT
ENV BUILD_TYPE=${BUILD_TYPE}
ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/extra/grpc/huggingface/huggingface.py,autogptq:/build/extra/grpc/autogptq/autogptq.py,bark:/build/extra/grpc/bark/ttsbark.py,diffusers:/build/extra/grpc/diffusers/backend_diffusers.py,exllama:/build/extra/grpc/exllama/exllama.py,vall-e-x:/build/extra/grpc/vall-e-x/ttsvalle.py"
ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/extra/grpc/huggingface/huggingface.py,autogptq:/build/extra/grpc/autogptq/autogptq.py,bark:/build/extra/grpc/bark/ttsbark.py,diffusers:/build/extra/grpc/diffusers/backend_diffusers.py,exllama:/build/extra/grpc/exllama/exllama.py,vall-e-x:/build/extra/grpc/vall-e-x/ttsvalle.py,vllm:/build/extra/grpc/vllm/backend_vllm.py"
ENV GALLERIES='[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]'
ARG GO_TAGS="stablediffusion tts"
@ -43,7 +43,7 @@ RUN if [ "${TARGETARCH}" = "amd64" ]; then \
pip install git+https://github.com/suno-ai/bark.git diffusers invisible_watermark transformers accelerate safetensors;\
fi
RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "amd64" ]; then \
pip install torch && pip install auto-gptq https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}-cp39-cp39-linux_x86_64.whl;\
pip install torch vllm && pip install auto-gptq https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}-cp39-cp39-linux_x86_64.whl;\
fi
RUN pip install -r /build/extra/requirements.txt && rm -rf /build/extra/requirements.txt

View File

@ -362,6 +362,7 @@ protogen-python:
python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/bark/ --grpc_python_out=extra/grpc/bark/ pkg/grpc/proto/backend.proto
python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/diffusers/ --grpc_python_out=extra/grpc/diffusers/ pkg/grpc/proto/backend.proto
python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/vall-e-x/ --grpc_python_out=extra/grpc/vall-e-x/ pkg/grpc/proto/backend.proto
python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/vllm/ --grpc_python_out=extra/grpc/vllm/ pkg/grpc/proto/backend.proto
## GRPC

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,363 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc
import backend_pb2 as backend__pb2
class BackendStub(object):
"""Missing associated documentation comment in .proto file."""
def __init__(self, channel):
"""Constructor.
Args:
channel: A grpc.Channel.
"""
self.Health = channel.unary_unary(
'/backend.Backend/Health',
request_serializer=backend__pb2.HealthMessage.SerializeToString,
response_deserializer=backend__pb2.Reply.FromString,
)
self.Predict = channel.unary_unary(
'/backend.Backend/Predict',
request_serializer=backend__pb2.PredictOptions.SerializeToString,
response_deserializer=backend__pb2.Reply.FromString,
)
self.LoadModel = channel.unary_unary(
'/backend.Backend/LoadModel',
request_serializer=backend__pb2.ModelOptions.SerializeToString,
response_deserializer=backend__pb2.Result.FromString,
)
self.PredictStream = channel.unary_stream(
'/backend.Backend/PredictStream',
request_serializer=backend__pb2.PredictOptions.SerializeToString,
response_deserializer=backend__pb2.Reply.FromString,
)
self.Embedding = channel.unary_unary(
'/backend.Backend/Embedding',
request_serializer=backend__pb2.PredictOptions.SerializeToString,
response_deserializer=backend__pb2.EmbeddingResult.FromString,
)
self.GenerateImage = channel.unary_unary(
'/backend.Backend/GenerateImage',
request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
response_deserializer=backend__pb2.Result.FromString,
)
self.AudioTranscription = channel.unary_unary(
'/backend.Backend/AudioTranscription',
request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
response_deserializer=backend__pb2.TranscriptResult.FromString,
)
self.TTS = channel.unary_unary(
'/backend.Backend/TTS',
request_serializer=backend__pb2.TTSRequest.SerializeToString,
response_deserializer=backend__pb2.Result.FromString,
)
self.TokenizeString = channel.unary_unary(
'/backend.Backend/TokenizeString',
request_serializer=backend__pb2.PredictOptions.SerializeToString,
response_deserializer=backend__pb2.TokenizationResponse.FromString,
)
self.Status = channel.unary_unary(
'/backend.Backend/Status',
request_serializer=backend__pb2.HealthMessage.SerializeToString,
response_deserializer=backend__pb2.StatusResponse.FromString,
)
class BackendServicer(object):
"""Missing associated documentation comment in .proto file."""
def Health(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def Predict(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def LoadModel(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def PredictStream(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def Embedding(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def GenerateImage(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def AudioTranscription(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def TTS(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def TokenizeString(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def Status(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def add_BackendServicer_to_server(servicer, server):
rpc_method_handlers = {
'Health': grpc.unary_unary_rpc_method_handler(
servicer.Health,
request_deserializer=backend__pb2.HealthMessage.FromString,
response_serializer=backend__pb2.Reply.SerializeToString,
),
'Predict': grpc.unary_unary_rpc_method_handler(
servicer.Predict,
request_deserializer=backend__pb2.PredictOptions.FromString,
response_serializer=backend__pb2.Reply.SerializeToString,
),
'LoadModel': grpc.unary_unary_rpc_method_handler(
servicer.LoadModel,
request_deserializer=backend__pb2.ModelOptions.FromString,
response_serializer=backend__pb2.Result.SerializeToString,
),
'PredictStream': grpc.unary_stream_rpc_method_handler(
servicer.PredictStream,
request_deserializer=backend__pb2.PredictOptions.FromString,
response_serializer=backend__pb2.Reply.SerializeToString,
),
'Embedding': grpc.unary_unary_rpc_method_handler(
servicer.Embedding,
request_deserializer=backend__pb2.PredictOptions.FromString,
response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
),
'GenerateImage': grpc.unary_unary_rpc_method_handler(
servicer.GenerateImage,
request_deserializer=backend__pb2.GenerateImageRequest.FromString,
response_serializer=backend__pb2.Result.SerializeToString,
),
'AudioTranscription': grpc.unary_unary_rpc_method_handler(
servicer.AudioTranscription,
request_deserializer=backend__pb2.TranscriptRequest.FromString,
response_serializer=backend__pb2.TranscriptResult.SerializeToString,
),
'TTS': grpc.unary_unary_rpc_method_handler(
servicer.TTS,
request_deserializer=backend__pb2.TTSRequest.FromString,
response_serializer=backend__pb2.Result.SerializeToString,
),
'TokenizeString': grpc.unary_unary_rpc_method_handler(
servicer.TokenizeString,
request_deserializer=backend__pb2.PredictOptions.FromString,
response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
),
'Status': grpc.unary_unary_rpc_method_handler(
servicer.Status,
request_deserializer=backend__pb2.HealthMessage.FromString,
response_serializer=backend__pb2.StatusResponse.SerializeToString,
),
}
generic_handler = grpc.method_handlers_generic_handler(
'backend.Backend', rpc_method_handlers)
server.add_generic_rpc_handlers((generic_handler,))
# This class is part of an EXPERIMENTAL API.
class Backend(object):
"""Missing associated documentation comment in .proto file."""
@staticmethod
def Health(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
backend__pb2.HealthMessage.SerializeToString,
backend__pb2.Reply.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def Predict(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
backend__pb2.PredictOptions.SerializeToString,
backend__pb2.Reply.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def LoadModel(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
backend__pb2.ModelOptions.SerializeToString,
backend__pb2.Result.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def PredictStream(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
backend__pb2.PredictOptions.SerializeToString,
backend__pb2.Reply.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def Embedding(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
backend__pb2.PredictOptions.SerializeToString,
backend__pb2.EmbeddingResult.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def GenerateImage(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
backend__pb2.GenerateImageRequest.SerializeToString,
backend__pb2.Result.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def AudioTranscription(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
backend__pb2.TranscriptRequest.SerializeToString,
backend__pb2.TranscriptResult.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def TTS(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
backend__pb2.TTSRequest.SerializeToString,
backend__pb2.Result.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def TokenizeString(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
backend__pb2.PredictOptions.SerializeToString,
backend__pb2.TokenizationResponse.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def Status(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
backend__pb2.HealthMessage.SerializeToString,
backend__pb2.StatusResponse.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

View File

@ -0,0 +1,100 @@
#!/usr/bin/env python3
import grpc
from concurrent import futures
import time
import backend_pb2
import backend_pb2_grpc
import argparse
import signal
import sys
import os, glob
from pathlib import Path
from vllm import LLM, SamplingParams
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
def generate(self,prompt, max_new_tokens):
self.generator.end_beam_search()
# Tokenizing the input
ids = self.generator.tokenizer.encode(prompt)
self.generator.gen_begin_reuse(ids)
initial_len = self.generator.sequence[0].shape[0]
has_leading_space = False
decoded_text = ''
for i in range(max_new_tokens):
token = self.generator.gen_single_token()
if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith(''):
has_leading_space = True
decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:])
if has_leading_space:
decoded_text = ' ' + decoded_text
if token.item() == self.generator.tokenizer.eos_token_id:
break
return decoded_text
def Health(self, request, context):
return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
def LoadModel(self, request, context):
try:
# https://github.com/vllm-project/vllm/blob/main/examples/offline_inference.py
self.llm = LLM(model=request.Model)
except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
return backend_pb2.Result(message="Model loaded successfully", success=True)
def Predict(self, request, context):
sampling_params = SamplingParams(temperature=request.Temperature, top_p=request.TopP)
outputs = self.llm.generate([request.Prompt], sampling_params)
generated_text = outputs[0].outputs[0].text
# Remove prompt from response if present
if request.Prompt in generated_text:
generated_text = generated_text.replace(request.Prompt, "")
return backend_pb2.Result(message=bytes(generated_text, encoding='utf-8'))
def PredictStream(self, request, context):
# Implement PredictStream RPC
#for reply in some_data_generator():
# yield reply
# Not implemented yet
return self.Predict(request, context)
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address)
server.start()
print("Server started. Listening on: " + address, file=sys.stderr)
# Define the signal handler function
def signal_handler(sig, frame):
print("Received termination signal. Shutting down...")
server.stop(0)
sys.exit(0)
# Set the signal handlers for SIGINT and SIGTERM
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
try:
while True:
time.sleep(_ONE_DAY_IN_SECONDS)
except KeyboardInterrupt:
server.stop(0)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run the gRPC server.")
parser.add_argument(
"--addr", default="localhost:50051", help="The address to bind the server to."
)
args = parser.parse_args()
serve(args.addr)