From 6a6e1a0ea9f8e656f2587786a2f41334fb04c8a3 Mon Sep 17 00:00:00 2001
From: Brandon Beiler
Date: Tue, 18 Feb 2025 13:27:58 -0500
Subject: [PATCH] feat(vllm): Additional vLLM config options (Disable logging, dtype, and Per-Prompt media limits) (#4855)

* Adding the following vLLM config options: disable_log_status, dtype, limit_mm_per_prompt

Signed-off-by: TheDropZone

* using " marks in the config.yaml file

Signed-off-by: TheDropZone

* adding in missing colon

Signed-off-by: TheDropZone

---------

Signed-off-by: TheDropZone
---
 backend/backend.proto          |  7 ++++-
 backend/python/vllm/backend.py | 17 +++++++++---
 core/backend/options.go        |  6 +++++
 core/config/backend_config.go  | 48 ++++++++++++++++++++--------------
 gallery/vllm.yaml              |  9 +++++++
 5 files changed, 64 insertions(+), 23 deletions(-)

diff --git a/backend/backend.proto b/backend/backend.proto
index bd75adc5..aa34687a 100644
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -165,7 +165,7 @@ message Reply {
 
 message GrammarTrigger {
     string word = 1;
-    bool at_start = 2; 
+    bool at_start = 2;
 }
 
 message ModelOptions {
@@ -229,6 +229,11 @@ message ModelOptions {
   int32 MaxModelLen = 54;
   int32 TensorParallelSize = 55;
   string LoadFormat = 58;
+  bool DisableLogStatus = 66;
+  string DType = 67;
+  int32 LimitImagePerPrompt = 68;
+  int32 LimitVideoPerPrompt = 69;
+  int32 LimitAudioPerPrompt = 70;
 
   string MMProj = 41;
 
diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py
index 98ac5081..238ba0e3 100644
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -109,6 +109,17 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             engine_args.swap_space = request.SwapSpace
         if request.MaxModelLen != 0:
             engine_args.max_model_len = request.MaxModelLen
+        if request.DisableLogStatus:
+            engine_args.disable_log_stats = request.DisableLogStatus
+        if request.DType != "":
+            engine_args.dtype = request.DType
+        if request.LimitImagePerPrompt != 0 or request.LimitVideoPerPrompt != 0 or request.LimitAudioPerPrompt != 0:
+            # limit-mm-per-prompt defaults to 1 per modality, based on vLLM docs
+            engine_args.limit_mm_per_prompt = {
+                "image": max(request.LimitImagePerPrompt, 1),
+                "video": max(request.LimitVideoPerPrompt, 1),
+                "audio": max(request.LimitAudioPerPrompt, 1)
+            }
 
         try:
             self.llm = AsyncLLMEngine.from_engine_args(engine_args)
@@ -269,7 +280,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
     def load_image(self, image_path: str):
         """
         Load an image from the given file path or base64 encoded data.
-        
+
         Args:
             image_path (str): The path to the image file or base64 encoded data.
 
@@ -288,7 +299,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
     def load_video(self, video_path: str):
         """
         Load a video from the given file path.
-        
+
         Args:
             video_path (str): The path to the image file.
 
@@ -335,4 +346,4 @@ if __name__ == "__main__":
     )
     args = parser.parse_args()
 
-    asyncio.run(serve(args.addr))
\ No newline at end of file
+    asyncio.run(serve(args.addr))
diff --git a/core/backend/options.go b/core/backend/options.go
index 3201142d..c807e4e8 100644
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -159,6 +159,12 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		SwapSpace:            int32(c.SwapSpace),
 		MaxModelLen:          int32(c.MaxModelLen),
 		TensorParallelSize:   int32(c.TensorParallelSize),
+		DisableLogStatus:     c.DisableLogStatus,
+		DType:                c.DType,
+		// LimitMMPerPrompt vLLM
+		LimitImagePerPrompt:  int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
+		LimitVideoPerPrompt:  int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
+		LimitAudioPerPrompt:  int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
 		MMProj:               c.MMProj,
 		FlashAttention:       c.FlashAttention,
 		CacheTypeKey:         c.CacheTypeK,
diff --git a/core/config/backend_config.go b/core/config/backend_config.go
index cba85640..ee87af30 100644
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -130,25 +130,28 @@ type LLMConfig struct {
 	TrimSpace  []string `yaml:"trimspace"`
 	TrimSuffix []string `yaml:"trimsuffix"`
 
-	ContextSize          *int      `yaml:"context_size"`
-	NUMA                 bool      `yaml:"numa"`
-	LoraAdapter          string    `yaml:"lora_adapter"`
-	LoraBase             string    `yaml:"lora_base"`
-	LoraAdapters         []string  `yaml:"lora_adapters"`
-	LoraScales           []float32 `yaml:"lora_scales"`
-	LoraScale            float32   `yaml:"lora_scale"`
-	NoMulMatQ            bool      `yaml:"no_mulmatq"`
-	DraftModel           string    `yaml:"draft_model"`
-	NDraft               int32     `yaml:"n_draft"`
-	Quantization         string    `yaml:"quantization"`
-	LoadFormat           string    `yaml:"load_format"`
-	GPUMemoryUtilization float32   `yaml:"gpu_memory_utilization"` // vLLM
-	TrustRemoteCode      bool      `yaml:"trust_remote_code"`      // vLLM
-	EnforceEager         bool      `yaml:"enforce_eager"`          // vLLM
-	SwapSpace            int       `yaml:"swap_space"`             // vLLM
-	MaxModelLen          int       `yaml:"max_model_len"`          // vLLM
-	TensorParallelSize   int       `yaml:"tensor_parallel_size"`   // vLLM
-	MMProj               string    `yaml:"mmproj"`
+	ContextSize          *int             `yaml:"context_size"`
+	NUMA                 bool             `yaml:"numa"`
+	LoraAdapter          string           `yaml:"lora_adapter"`
+	LoraBase             string           `yaml:"lora_base"`
+	LoraAdapters         []string         `yaml:"lora_adapters"`
+	LoraScales           []float32        `yaml:"lora_scales"`
+	LoraScale            float32          `yaml:"lora_scale"`
+	NoMulMatQ            bool             `yaml:"no_mulmatq"`
+	DraftModel           string           `yaml:"draft_model"`
+	NDraft               int32            `yaml:"n_draft"`
+	Quantization         string           `yaml:"quantization"`
+	LoadFormat           string           `yaml:"load_format"`
+	GPUMemoryUtilization float32          `yaml:"gpu_memory_utilization"` // vLLM
+	TrustRemoteCode      bool             `yaml:"trust_remote_code"`      // vLLM
+	EnforceEager         bool             `yaml:"enforce_eager"`          // vLLM
+	SwapSpace            int              `yaml:"swap_space"`             // vLLM
+	MaxModelLen          int              `yaml:"max_model_len"`          // vLLM
+	TensorParallelSize   int              `yaml:"tensor_parallel_size"`   // vLLM
+	DisableLogStatus     bool             `yaml:"disable_log_stats"`      // vLLM
+	DType                string           `yaml:"dtype"`                  // vLLM
+	LimitMMPerPrompt     LimitMMPerPrompt `yaml:"limit_mm_per_prompt"`    // vLLM
+	MMProj               string           `yaml:"mmproj"`
 
 	FlashAttention bool `yaml:"flash_attention"`
 	NoKVOffloading bool `yaml:"no_kv_offloading"`
@@ -166,6 +169,13 @@ type LLMConfig struct {
 	CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale
 }
 
+// LimitMMPerPrompt is a struct that holds the configuration for the limit-mm-per-prompt config in vLLM
+type LimitMMPerPrompt struct {
+	LimitImagePerPrompt int `yaml:"image"`
+	LimitVideoPerPrompt int `yaml:"video"`
+	LimitAudioPerPrompt int `yaml:"audio"`
+}
+
 // AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
 type AutoGPTQ struct {
 	ModelBaseName string `yaml:"model_base_name"`
diff --git a/gallery/vllm.yaml b/gallery/vllm.yaml
index 5a2f16ce..f0b797cc 100644
--- a/gallery/vllm.yaml
+++ b/gallery/vllm.yaml
@@ -16,6 +16,8 @@ config_file: |
   use_tokenizer_template: true
   # Uncomment to specify a quantization method (optional)
   # quantization: "awq"
+  # Uncomment to set dtype, choices are: "auto", "half", "float16", "bfloat16", "float", "float32". awq on vLLM does not support bfloat16
+  # dtype: "float16"
   # Uncomment to limit the GPU memory utilization (vLLM default is 0.9 for 90%)
   # gpu_memory_utilization: 0.5
   # Uncomment to trust remote code from huggingface
@@ -30,3 +32,10 @@ config_file: |
   # Allows you to partition and run large models. Performance gains are limited.
   # https://github.com/vllm-project/vllm/issues/1435
   # tensor_parallel_size: 2
+  # Uncomment to disable log stats
+  # disable_log_stats: true
+  # Uncomment to specify Multi-Modal limits per prompt, defaults to 1 per modality if not specified
+  # limit_mm_per_prompt:
+  #   image: 2
+  #   video: 2
+  #   audio: 2
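
A model configuration using these options could look like the following sketch (keys taken from the yaml tags added in core/config/backend_config.go and the commented examples in gallery/vllm.yaml; the values are illustrative, and any modality left unset effectively falls back to vLLM's default of 1 item per prompt, as handled in backend.py):

  # vLLM options introduced by this patch (illustrative values)
  dtype: "float16"
  disable_log_stats: true
  limit_mm_per_prompt:
    image: 2
    video: 2
    audio: 2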