diff --git a/.env b/.env index ec53a8f4..c067f7b8 100644 --- a/.env +++ b/.env @@ -61,4 +61,9 @@ MODELS_PATH=/models # LD_PRELOAD= ### Huggingface cache for models -# HUGGINGFACE_HUB_CACHE=/usr/local/huggingface \ No newline at end of file +# HUGGINGFACE_HUB_CACHE=/usr/local/huggingface + +### Python backends GRPC max workers +### Default number of workers for GRPC Python backends. +### This actually controls whether a backend can process multiple requests or not. +# PYTHON_GRPC_MAX_WORKERS=1 \ No newline at end of file diff --git a/extra/grpc/autogptq/autogptq.py b/extra/grpc/autogptq/autogptq.py index 6a5f9c7c..7f0f609f 100755 --- a/extra/grpc/autogptq/autogptq.py +++ b/extra/grpc/autogptq/autogptq.py @@ -15,6 +15,9 @@ from transformers import TextGenerationPipeline _ONE_DAY_IN_SECONDS = 60 * 60 * 24 +# If MAX_WORKERS is specified in the environment use it, otherwise default to 1 +MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) + # Implement the BackendServicer class with the service methods class BackendServicer(backend_pb2_grpc.BackendServicer): def Health(self, request, context): @@ -77,7 +80,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): def serve(address): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) + server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) server.add_insecure_port(address) server.start() diff --git a/extra/grpc/bark/ttsbark.py b/extra/grpc/bark/ttsbark.py index a14c632d..313dc3a4 100644 --- a/extra/grpc/bark/ttsbark.py +++ b/extra/grpc/bark/ttsbark.py @@ -15,6 +15,9 @@ from scipy.io.wavfile import write as write_wav _ONE_DAY_IN_SECONDS = 60 * 60 * 24 +# If MAX_WORKERS is specified in the environment use it, otherwise default to 1 +MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) + # Implement the BackendServicer class with the service methods class 
BackendServicer(backend_pb2_grpc.BackendServicer): def Health(self, request, context): @@ -51,7 +54,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): return backend_pb2.Result(success=True) def serve(address): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) + server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) server.add_insecure_port(address) server.start() diff --git a/extra/grpc/diffusers/backend_diffusers.py b/extra/grpc/diffusers/backend_diffusers.py index c23fc22a..693db1fa 100755 --- a/extra/grpc/diffusers/backend_diffusers.py +++ b/extra/grpc/diffusers/backend_diffusers.py @@ -26,6 +26,9 @@ _ONE_DAY_IN_SECONDS = 60 * 60 * 24 COMPEL=os.environ.get("COMPEL", "1") == "1" CLIPSKIP=os.environ.get("CLIPSKIP", "1") == "1" +# If MAX_WORKERS is specified in the environment use it, otherwise default to 1 +MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) + # https://github.com/CompVis/stable-diffusion/issues/239#issuecomment-1627615287 def sc(self, clip_input, images) : return images, [False for i in images] # edit the StableDiffusionSafetyChecker class so that, when called, it just returns the images and an array of True values @@ -346,7 +349,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): return backend_pb2.Result(message="Model loaded successfully", success=True) def serve(address): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) + server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) server.add_insecure_port(address) server.start() diff --git a/extra/grpc/exllama/exllama.py b/extra/grpc/exllama/exllama.py index c8eddf4e..25785aae 100755 --- a/extra/grpc/exllama/exllama.py +++ b/extra/grpc/exllama/exllama.py @@ -19,6 +19,9 @@ from exllama.tokenizer import ExLlamaTokenizer _ONE_DAY_IN_SECONDS = 60 * 
60 * 24 +# If MAX_WORKERS is specified in the environment use it, otherwise default to 1 +MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) + # Implement the BackendServicer class with the service methods class BackendServicer(backend_pb2_grpc.BackendServicer): def generate(self,prompt, max_new_tokens): @@ -110,7 +113,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): def serve(address): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) + server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) server.add_insecure_port(address) server.start() diff --git a/extra/grpc/huggingface/huggingface.py b/extra/grpc/huggingface/huggingface.py index 680c2739..8a61b3fa 100755 --- a/extra/grpc/huggingface/huggingface.py +++ b/extra/grpc/huggingface/huggingface.py @@ -12,6 +12,9 @@ from sentence_transformers import SentenceTransformer _ONE_DAY_IN_SECONDS = 60 * 60 * 24 +# If MAX_WORKERS is specified in the environment use it, otherwise default to 1 +MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) + # Implement the BackendServicer class with the service methods class BackendServicer(backend_pb2_grpc.BackendServicer): def Health(self, request, context): @@ -34,7 +37,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): def serve(address): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) + server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) server.add_insecure_port(address) server.start() diff --git a/extra/grpc/vall-e-x/ttsvalle.py b/extra/grpc/vall-e-x/ttsvalle.py index 714a3415..be7f3cab 100644 --- a/extra/grpc/vall-e-x/ttsvalle.py +++ b/extra/grpc/vall-e-x/ttsvalle.py @@ -16,6 +16,9 @@ from utils.prompt_making import make_prompt _ONE_DAY_IN_SECONDS = 60 * 60 * 24 +# If MAX_WORKERS is specified in the 
environment use it, otherwise default to 1 +MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) + # Implement the BackendServicer class with the service methods class BackendServicer(backend_pb2_grpc.BackendServicer): def Health(self, request, context): @@ -65,7 +68,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): return backend_pb2.Result(success=True) def serve(address): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) + server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) server.add_insecure_port(address) server.start() diff --git a/extra/grpc/vllm/backend_vllm.py b/extra/grpc/vllm/backend_vllm.py index a35cbc74..43231713 100644 --- a/extra/grpc/vllm/backend_vllm.py +++ b/extra/grpc/vllm/backend_vllm.py @@ -14,6 +14,9 @@ from vllm import LLM, SamplingParams _ONE_DAY_IN_SECONDS = 60 * 60 * 24 +# If MAX_WORKERS is specified in the environment use it, otherwise default to 1 +MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) + # Implement the BackendServicer class with the service methods class BackendServicer(backend_pb2_grpc.BackendServicer): def generate(self,prompt, max_new_tokens): @@ -70,7 +73,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): return self.Predict(request, context) def serve(address): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) + server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) server.add_insecure_port(address) server.start()