Mirror of https://github.com/mudler/LocalAI.git — synced 2025-05-09 12:03:15 +00:00
feat(vllm): add support for image-to-text and video-to-text (#3729)
* feat(vllm): add support for image-to-text
  Related to https://github.com/mudler/LocalAI/issues/3670
* feat(vllm): add support for video-to-text
  Closes: https://github.com/mudler/LocalAI/issues/2318
* feat(vllm): support CPU installations
* feat(vllm): add bnb
* chore: add docs reference
* Apply suggestions from code review

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
This commit is contained in:
parent 408dfe62ee
commit 2553de0187
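As background for the changes below, this is roughly how a client could exercise the new image-to-text path once the vLLM backend is running behind LocalAI's OpenAI-compatible API. The host, port, and model name are placeholders and are not part of this commit; only the message shape follows the documented chat-completions vision format.

import base64
import requests

# Encode a local image and embed it as a data URL in an OpenAI-style vision message.
with open("cat.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "model": "llava",  # placeholder: any vision model served through the vLLM backend
    "messages": [{
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in this picture?"},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
        ],
    }],
}

resp = requests.post("http://localhost:8080/v1/chat/completions", json=payload, timeout=300)
print(resp.json()["choices"][0]["message"]["content"])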
@@ -5,6 +5,8 @@ import argparse
 import signal
 import sys
 import os
+from typing import List
+from PIL import Image
 
 import backend_pb2
 import backend_pb2_grpc
@@ -15,6 +17,8 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid
 from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.multimodal.utils import fetch_image
+from vllm.assets.video import VideoAsset
 
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 
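The hunk above also imports fetch_image, which is not exercised in the hunks shown here; as a hedged aside, it resolves an image URL or data URI into a PIL image, roughly like this (the URL is a placeholder):

# Illustrative only: fetch_image (imported above) turns a URL/data URI into a PIL image.
from vllm.multimodal.utils import fetch_image

image = fetch_image("https://example.com/cat.png")  # placeholder URL
print(image.size)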
@@ -105,6 +109,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         try:
             self.llm = AsyncLLMEngine.from_engine_args(engine_args)
         except Exception as err:
+            print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr)
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
 
         try:
@@ -117,7 +122,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             )
         except Exception as err:
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
-
+        print("Model loaded successfully", file=sys.stderr)
         return backend_pb2.Result(message="Model loaded successfully", success=True)
 
     async def Predict(self, request, context):
@@ -196,15 +201,33 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         if request.Seed != 0:
             sampling_params.seed = request.Seed
 
+        # Extract image paths and process images
         prompt = request.Prompt
 
-        # If tokenizer template is enabled and messages are provided instead of prompt apply the tokenizer template
+        image_paths = request.Images
+        image_data = [self.load_image(img_path) for img_path in image_paths]
+
+        videos_path = request.Videos
+        video_data = [self.load_video(video_path) for video_path in videos_path]
+
+        # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template
         if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
             prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
 
-        # Generate text
+        # Generate text using the LLM engine
         request_id = random_uuid()
-        outputs = self.llm.generate(prompt, sampling_params, request_id)
+        print(f"Generating text with request_id: {request_id}", file=sys.stderr)
+        outputs = self.llm.generate(
+            {
+                "prompt": prompt,
+                "multi_modal_data": {
+                    "image": image_data if image_data else None,
+                    "video": video_data if video_data else None,
+                } if image_data or video_data else None,
+            },
+            sampling_params=sampling_params,
+            request_id=request_id,
+        )
 
         # Stream the results
         generated_text = ""
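Outside the gRPC server, the same {"prompt": ..., "multi_modal_data": ...} request shape can be tried against vLLM's offline LLM API. A minimal sketch, assuming a LLaVA-style vision model and its prompt template (model name and template are assumptions for illustration, not taken from this commit):

from PIL import Image
from vllm import LLM, SamplingParams

llm = LLM(model="llava-hf/llava-1.5-7b-hf")   # assumed vision-capable checkpoint
image = Image.open("cat.png")                 # same PIL-based loading as load_image() below

outputs = llm.generate(
    {
        "prompt": "USER: <image>\nWhat is shown in this picture?\nASSISTANT:",  # model-specific template
        "multi_modal_data": {"image": image},
    },
    SamplingParams(temperature=0.2, max_tokens=128),
)
print(outputs[0].outputs[0].text)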
@@ -227,9 +250,49 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         if streaming:
             return
 
+        # Remove the image files from /tmp folder
+        for img_path in image_paths:
+            try:
+                os.remove(img_path)
+            except Exception as e:
+                print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
+
         # Sending the final generated text
         yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
 
+    def load_image(self, image_path: str):
+        """
+        Load an image from the given file path.
+
+        Args:
+            image_path (str): The path to the image file.
+
+        Returns:
+            Image: The loaded image.
+        """
+        try:
+            return Image.open(image_path)
+        except Exception as e:
+            print(f"Error loading image {image_path}: {e}", file=sys.stderr)
+            return self.load_video(image_path)
+
+    def load_video(self, video_path: str):
+        """
+        Load a video from the given file path.
+
+        Args:
+            video_path (str): The path to the video file.
+
+        Returns:
+            Video: The loaded video.
+        """
+        try:
+            video = VideoAsset(name=video_path).np_ndarrays
+            return video
+        except Exception as e:
+            print(f"Error loading video {video_path}: {e}", file=sys.stderr)
+            return None
+
 async def serve(address):
     # Start asyncio gRPC server
     server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
@@ -13,4 +13,18 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
     EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi
 
-installRequirements
+if [ "x${BUILD_TYPE}" == "x" ]; then
+    ensureVenv
+    # https://docs.vllm.ai/en/v0.6.1/getting_started/cpu-installation.html
+    if [ ! -d vllm ]; then
+        git clone https://github.com/vllm-project/vllm
+    fi
+    pushd vllm
+    uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.66.2 protobuf bitsandbytes
+    uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+    VLLM_TARGET_DEVICE=cpu python setup.py install
+    popd
+    rm -rf vllm
+else
+    installRequirements
+fi
@@ -1,4 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 accelerate
 torch
 transformers
+bitsandbytes
@@ -1,3 +1,4 @@
 accelerate
 torch
 transformers
+bitsandbytes
@@ -1,4 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 accelerate
 torch
 transformers
+bitsandbytes
@@ -4,4 +4,5 @@ accelerate
 torch
 transformers
 optimum[openvino]
 setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
+bitsandbytes
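Each of the requirement sets above gains bitsandbytes. For reference, a hedged sketch of how in-flight bnb quantization can be requested from vLLM itself; the argument names follow vLLM's documented bitsandbytes support and the model name is a placeholder, neither is taken from this commit:

# Illustrative only: asking vLLM to load and quantize weights with bitsandbytes.
from vllm import LLM

llm = LLM(
    model="unsloth/llama-3-8b-bnb-4bit",  # placeholder bnb-quantized checkpoint
    quantization="bitsandbytes",
    load_format="bitsandbytes",
)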