Mirror of https://github.com/mudler/LocalAI.git (synced 2025-05-09 12:03:15 +00:00)
feat(petals): add backend (#1350)
* feat(petals): add backend

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fixups

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
parent 63e1f8fffd
commit b7821361c3
Dockerfile

@@ -12,7 +12,7 @@ ARG TARGETARCH
 ARG TARGETVARIANT

 ENV BUILD_TYPE=${BUILD_TYPE}
-ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh"
+ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh"
 ENV GALLERIES='[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]'
 ARG GO_TAGS="stablediffusion tts"

@@ -181,13 +181,18 @@ RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
     PATH=$PATH:/opt/conda/bin make -C backend/python/exllama \
     ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+    PATH=$PATH:/opt/conda/bin make -C backend/python/petals \
+    ; fi

 # Copy VALLE-X as it's not a real "lib"
+# TODO: this is wrong - we should copy the lib into the conda env path
 RUN if [ -d /usr/lib/vall-e-x ]; then \
     cp -rfv /usr/lib/vall-e-x/* ./ ; \
     fi

 # we also copy exllama libs over to resolve exllama import error
+# TODO: check if this is still needed
 RUN if [ -d /usr/local/lib/python3.9/dist-packages/exllama ]; then \
     cp -rfv /usr/local/lib/python3.9/dist-packages/exllama backend/python/exllama/;\
     fi
Makefile (2 lines changed)
@@ -388,6 +388,7 @@ protogen-python:
 	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/diffusers/ --grpc_python_out=backend/python/diffusers/ backend/backend.proto
 	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vall-e-x/ --grpc_python_out=backend/python/vall-e-x/ backend/backend.proto
 	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vllm/ --grpc_python_out=backend/python/vllm/ backend/backend.proto
+	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/petals/ --grpc_python_out=backend/python/petals/ backend/backend.proto

 ## GRPC
 # Note: it is duplicated in the Dockerfile
@@ -400,6 +401,7 @@ prepare-extra-conda-environments:
 	$(MAKE) -C backend/python/transformers
 	$(MAKE) -C backend/python/vall-e-x
 	$(MAKE) -C backend/python/exllama
+	$(MAKE) -C backend/python/petals


 backend-assets/grpc:
backend/python/petals/Makefile (new file, 11 lines)
@@ -0,0 +1,11 @@
.PHONY: petals
petals:
	@echo "Creating virtual environment..."
	@conda env create --name petals --file petals.yml
	@echo "Virtual environment created."

.PHONY: run
run:
	@echo "Running petals..."
	bash run.sh
	@echo "petals run."
backend/python/petals/backend_pb2.py (new file, 61 lines)
File diff suppressed because one or more lines are too long
backend/python/petals/backend_pb2_grpc.py (new file, 363 lines)
@@ -0,0 +1,363 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc

import backend_pb2 as backend__pb2


class BackendStub(object):
    """Missing associated documentation comment in .proto file."""

    def __init__(self, channel):
        """Constructor.

        Args:
            channel: A grpc.Channel.
        """
        self.Health = channel.unary_unary(
                '/backend.Backend/Health',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Predict = channel.unary_unary(
                '/backend.Backend/Predict',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.LoadModel = channel.unary_unary(
                '/backend.Backend/LoadModel',
                request_serializer=backend__pb2.ModelOptions.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.PredictStream = channel.unary_stream(
                '/backend.Backend/PredictStream',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Embedding = channel.unary_unary(
                '/backend.Backend/Embedding',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.EmbeddingResult.FromString,
                )
        self.GenerateImage = channel.unary_unary(
                '/backend.Backend/GenerateImage',
                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.AudioTranscription = channel.unary_unary(
                '/backend.Backend/AudioTranscription',
                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
                response_deserializer=backend__pb2.TranscriptResult.FromString,
                )
        self.TTS = channel.unary_unary(
                '/backend.Backend/TTS',
                request_serializer=backend__pb2.TTSRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.TokenizeString = channel.unary_unary(
                '/backend.Backend/TokenizeString',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.TokenizationResponse.FromString,
                )
        self.Status = channel.unary_unary(
                '/backend.Backend/Status',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.StatusResponse.FromString,
                )


class BackendServicer(object):
    """Missing associated documentation comment in .proto file."""

    def Health(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Predict(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def LoadModel(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def PredictStream(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Embedding(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GenerateImage(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def AudioTranscription(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def TTS(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def TokenizeString(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Status(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')


def add_BackendServicer_to_server(servicer, server):
    rpc_method_handlers = {
            'Health': grpc.unary_unary_rpc_method_handler(
                    servicer.Health,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Predict': grpc.unary_unary_rpc_method_handler(
                    servicer.Predict,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'LoadModel': grpc.unary_unary_rpc_method_handler(
                    servicer.LoadModel,
                    request_deserializer=backend__pb2.ModelOptions.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'PredictStream': grpc.unary_stream_rpc_method_handler(
                    servicer.PredictStream,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Embedding': grpc.unary_unary_rpc_method_handler(
                    servicer.Embedding,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
            ),
            'GenerateImage': grpc.unary_unary_rpc_method_handler(
                    servicer.GenerateImage,
                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
                    servicer.AudioTranscription,
                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
            ),
            'TTS': grpc.unary_unary_rpc_method_handler(
                    servicer.TTS,
                    request_deserializer=backend__pb2.TTSRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'TokenizeString': grpc.unary_unary_rpc_method_handler(
                    servicer.TokenizeString,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
            ),
            'Status': grpc.unary_unary_rpc_method_handler(
                    servicer.Status,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'backend.Backend', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))


# This class is part of an EXPERIMENTAL API.
class Backend(object):
    """Missing associated documentation comment in .proto file."""

    @staticmethod
    def Health(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def Predict(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def LoadModel(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
            backend__pb2.ModelOptions.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def PredictStream(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def Embedding(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.EmbeddingResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GenerateImage(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
            backend__pb2.GenerateImageRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def AudioTranscription(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
            backend__pb2.TranscriptRequest.SerializeToString,
            backend__pb2.TranscriptResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def TTS(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
            backend__pb2.TTSRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def TokenizeString(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.TokenizationResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def Status(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.StatusResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
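Usage note (not part of this commit): the generated BackendStub above is the client-side entry point for every backend in this series. A minimal health-check sketch, assuming a backend server is already listening on localhost:50051 (the address is an assumption, not defined by this file):

# Minimal client sketch (illustrative only, not part of this commit).
# Assumes a backend server is listening on localhost:50051.
import grpc

import backend_pb2
import backend_pb2_grpc

with grpc.insecure_channel("localhost:50051") as channel:
    stub = backend_pb2_grpc.BackendStub(channel)
    reply = stub.Health(backend_pb2.HealthMessage())
    print(reply.message)  # the petals servicer below answers with b"OK"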
backend/python/petals/backend_petals.py (new executable file, 140 lines)
@@ -0,0 +1,140 @@
#!/usr/bin/env python3
from concurrent import futures
import time
import argparse
import signal
import sys
import os

import backend_pb2
import backend_pb2_grpc

import grpc
import torch
from transformers import AutoTokenizer
from petals import AutoDistributedModelForCausalLM

_ONE_DAY_IN_SECONDS = 60 * 60 * 24

# If PYTHON_GRPC_MAX_WORKERS is specified in the environment, use it; otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))

# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
    """
    A gRPC servicer that implements the Backend service defined in backend.proto.
    """
    def Health(self, request, context):
        """
        Returns a health check message.

        Args:
            request: The health check request.
            context: The gRPC context.

        Returns:
            backend_pb2.Reply: The health check reply.
        """
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))

    def LoadModel(self, request, context):
        """
        Loads a language model.

        Args:
            request: The load model request.
            context: The gRPC context.

        Returns:
            backend_pb2.Result: The load model result.
        """
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(request.Model, use_fast=False, add_bos_token=False)
            self.model = AutoDistributedModelForCausalLM.from_pretrained(request.Model)
            self.cuda = False
            if request.CUDA:
                self.model = self.model.cuda()
                self.cuda = True

        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)

    def Predict(self, request, context):
        """
        Generates text based on the given prompt and sampling parameters.

        Args:
            request: The predict request.
            context: The gRPC context.

        Returns:
            backend_pb2.Result: The predict result.
        """
        inputs = self.tokenizer(request.Prompt, return_tensors="pt")["input_ids"]
        if self.cuda:
            inputs = inputs.cuda()

        if request.Tokens == 0:
            # Default to the maximum value if tokens are not specified
            request.Tokens = 8192

        # TODO: kwargs and map all parameters
        outputs = self.model.generate(inputs, max_new_tokens=request.Tokens)

        generated_text = self.tokenizer.decode(outputs[0])
        # Remove prompt from response if present
        if request.Prompt in generated_text:
            generated_text = generated_text.replace(request.Prompt, "")

        return backend_pb2.Result(message=bytes(generated_text, encoding='utf-8'))

    def PredictStream(self, request, context):
        """
        Generates text based on the given prompt and sampling parameters, and streams the results.

        Args:
            request: The predict stream request.
            context: The gRPC context.

        Returns:
            backend_pb2.Result: The predict stream result.
        """
        # Implement PredictStream RPC
        # for reply in some_data_generator():
        #     yield reply
        # Not implemented yet
        return self.Predict(request, context)

def serve(address):
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)

    # Define the signal handler function
    def signal_handler(sig, frame):
        print("Received termination signal. Shutting down...")
        server.stop(0)
        sys.exit(0)

    # Set the signal handlers for SIGINT and SIGTERM
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the gRPC server.")
    parser.add_argument(
        "--addr", default="localhost:50051", help="The address to bind the server to."
    )
    args = parser.parse_args()

    serve(args.addr)
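Usage note (not part of the commit): once this server is running, it can be driven end to end through the generated stub. A hedged sketch; the model name below is only an illustrative placeholder for a Petals-served checkpoint, and the address matches the server's default from above:

# Illustrative end-to-end client sketch (not part of this commit).
# "petals-team/StableBeluga2" is a placeholder model name; any Petals-served checkpoint would do.
import grpc

import backend_pb2
import backend_pb2_grpc

with grpc.insecure_channel("localhost:50051") as channel:
    stub = backend_pb2_grpc.BackendStub(channel)
    # Load the model once; CUDA is left off in this sketch.
    res = stub.LoadModel(backend_pb2.ModelOptions(Model="petals-team/StableBeluga2", CUDA=False))
    print(res.success, res.message)
    # Ask for up to 64 new tokens for a short prompt.
    out = stub.Predict(backend_pb2.PredictOptions(Prompt="Once upon a time", Tokens=64))
    print(out.message)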
backend/python/petals/petals.yml (new file, 29 lines)

@@ -0,0 +1,29 @@
name: petals
channels:
  - defaults
dependencies:
#  - _libgcc_mutex=0.1=main
#  - _openmp_mutex=5.1=1_gnu
#  - bzip2=1.0.8=h7b6447c_0
#  - ca-certificates=2023.08.22=h06a4308_0
#  - ld_impl_linux-64=2.38=h1181459_1
#  - libffi=3.4.4=h6a678d5_0
#  - libgcc-ng=11.2.0=h1234567_1
#  - libgomp=11.2.0=h1234567_1
#  - libstdcxx-ng=11.2.0=h1234567_1
#  - libuuid=1.41.5=h5eee18b_0
#  - ncurses=6.4=h6a678d5_0
#  - openssl=3.0.11=h7f8727e_2
#  - pip=23.2.1=py311h06a4308_0
#  - python=3.11.5=h955ad1f_0
#  - readline=8.2=h5eee18b_0
#  - setuptools=68.0.0=py311h06a4308_0
#  - sqlite=3.41.2=h5eee18b_0
#  - tk=8.6.12=h1ccaba5_0
#  - tzdata=2023c=h04d1e81_0
#  - wheel=0.41.2=py311h06a4308_0
#  - xz=5.4.2=h5eee18b_0
#  - zlib=1.2.13=h5eee18b_0
  - pip:
    - git+https://github.com/bigscience-workshop/petals
prefix: /opt/conda/envs/petals
backend/python/petals/run.sh (new executable file, 21 lines)

@@ -0,0 +1,21 @@
#!/bin/bash

##
## A bash script wrapper that runs the petals server with conda

export PATH=$PATH:/opt/conda/bin

# Activate conda environment
# if source is available use it, or use conda
#
if [ -f /opt/conda/bin/activate ]; then
    source activate petals
else
    eval "$(conda shell.bash hook)"
    conda activate petals
fi

# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

python $DIR/backend_petals.py $@