mirror of
https://github.com/mudler/LocalAI.git
synced 2025-04-15 06:56:47 +00:00
feat(vllm): Additional vLLM config options (Disable logging, dtype, and Per-Prompt media limits) (#4855)
* Adding the following vLLM config options: disable_log_status, dtype, limit_mm_per_prompt Signed-off-by: TheDropZone <brandonbeiler@gmail.com> * using " marks in the config.yaml file Signed-off-by: TheDropZone <brandonbeiler@gmail.com> * adding in missing colon Signed-off-by: TheDropZone <brandonbeiler@gmail.com> --------- Signed-off-by: TheDropZone <brandonbeiler@gmail.com>
This commit is contained in:
parent
5b19af99ff
commit
6a6e1a0ea9
backend
core
gallery
@ -165,7 +165,7 @@ message Reply {
|
||||
|
||||
message GrammarTrigger {
|
||||
string word = 1;
|
||||
bool at_start = 2;
|
||||
bool at_start = 2;
|
||||
}
|
||||
|
||||
message ModelOptions {
|
||||
@ -229,6 +229,11 @@ message ModelOptions {
|
||||
int32 MaxModelLen = 54;
|
||||
int32 TensorParallelSize = 55;
|
||||
string LoadFormat = 58;
|
||||
bool DisableLogStatus = 66;
|
||||
string DType = 67;
|
||||
int32 LimitImagePerPrompt = 68;
|
||||
int32 LimitVideoPerPrompt = 69;
|
||||
int32 LimitAudioPerPrompt = 70;
|
||||
|
||||
string MMProj = 41;
|
||||
|
||||
|
@ -109,6 +109,17 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
engine_args.swap_space = request.SwapSpace
|
||||
if request.MaxModelLen != 0:
|
||||
engine_args.max_model_len = request.MaxModelLen
|
||||
if request.DisableLogStatus:
|
||||
engine_args.disable_log_status = request.DisableLogStatus
|
||||
if request.DType != "":
|
||||
engine_args.dtype = request.DType
|
||||
if request.LimitImagePerPrompt != 0 or request.LimitVideoPerPrompt != 0 or request.LimitAudioPerPrompt != 0:
|
||||
# limit-mm-per-prompt defaults to 1 per modality, based on vLLM docs
|
||||
engine_args.limit_mm_per_prompt = {
|
||||
"image": max(request.LimitImagePerPrompt, 1),
|
||||
"video": max(request.LimitVideoPerPrompt, 1),
|
||||
"audio": max(request.LimitAudioPerPrompt, 1)
|
||||
}
|
||||
|
||||
try:
|
||||
self.llm = AsyncLLMEngine.from_engine_args(engine_args)
|
||||
@ -269,7 +280,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
def load_image(self, image_path: str):
|
||||
"""
|
||||
Load an image from the given file path or base64 encoded data.
|
||||
|
||||
|
||||
Args:
|
||||
image_path (str): The path to the image file or base64 encoded data.
|
||||
|
||||
@ -288,7 +299,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
def load_video(self, video_path: str):
|
||||
"""
|
||||
Load a video from the given file path.
|
||||
|
||||
|
||||
Args:
|
||||
video_path (str): The path to the image file.
|
||||
|
||||
@ -335,4 +346,4 @@ if __name__ == "__main__":
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
asyncio.run(serve(args.addr))
|
||||
asyncio.run(serve(args.addr))
|
||||
|
@ -159,6 +159,12 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
|
||||
SwapSpace: int32(c.SwapSpace),
|
||||
MaxModelLen: int32(c.MaxModelLen),
|
||||
TensorParallelSize: int32(c.TensorParallelSize),
|
||||
DisableLogStatus: c.DisableLogStatus,
|
||||
DType: c.DType,
|
||||
// LimitMMPerPrompt vLLM
|
||||
LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
|
||||
LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
|
||||
LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
|
||||
MMProj: c.MMProj,
|
||||
FlashAttention: c.FlashAttention,
|
||||
CacheTypeKey: c.CacheTypeK,
|
||||
|
@ -130,25 +130,28 @@ type LLMConfig struct {
|
||||
TrimSpace []string `yaml:"trimspace"`
|
||||
TrimSuffix []string `yaml:"trimsuffix"`
|
||||
|
||||
ContextSize *int `yaml:"context_size"`
|
||||
NUMA bool `yaml:"numa"`
|
||||
LoraAdapter string `yaml:"lora_adapter"`
|
||||
LoraBase string `yaml:"lora_base"`
|
||||
LoraAdapters []string `yaml:"lora_adapters"`
|
||||
LoraScales []float32 `yaml:"lora_scales"`
|
||||
LoraScale float32 `yaml:"lora_scale"`
|
||||
NoMulMatQ bool `yaml:"no_mulmatq"`
|
||||
DraftModel string `yaml:"draft_model"`
|
||||
NDraft int32 `yaml:"n_draft"`
|
||||
Quantization string `yaml:"quantization"`
|
||||
LoadFormat string `yaml:"load_format"`
|
||||
GPUMemoryUtilization float32 `yaml:"gpu_memory_utilization"` // vLLM
|
||||
TrustRemoteCode bool `yaml:"trust_remote_code"` // vLLM
|
||||
EnforceEager bool `yaml:"enforce_eager"` // vLLM
|
||||
SwapSpace int `yaml:"swap_space"` // vLLM
|
||||
MaxModelLen int `yaml:"max_model_len"` // vLLM
|
||||
TensorParallelSize int `yaml:"tensor_parallel_size"` // vLLM
|
||||
MMProj string `yaml:"mmproj"`
|
||||
ContextSize *int `yaml:"context_size"`
|
||||
NUMA bool `yaml:"numa"`
|
||||
LoraAdapter string `yaml:"lora_adapter"`
|
||||
LoraBase string `yaml:"lora_base"`
|
||||
LoraAdapters []string `yaml:"lora_adapters"`
|
||||
LoraScales []float32 `yaml:"lora_scales"`
|
||||
LoraScale float32 `yaml:"lora_scale"`
|
||||
NoMulMatQ bool `yaml:"no_mulmatq"`
|
||||
DraftModel string `yaml:"draft_model"`
|
||||
NDraft int32 `yaml:"n_draft"`
|
||||
Quantization string `yaml:"quantization"`
|
||||
LoadFormat string `yaml:"load_format"`
|
||||
GPUMemoryUtilization float32 `yaml:"gpu_memory_utilization"` // vLLM
|
||||
TrustRemoteCode bool `yaml:"trust_remote_code"` // vLLM
|
||||
EnforceEager bool `yaml:"enforce_eager"` // vLLM
|
||||
SwapSpace int `yaml:"swap_space"` // vLLM
|
||||
MaxModelLen int `yaml:"max_model_len"` // vLLM
|
||||
TensorParallelSize int `yaml:"tensor_parallel_size"` // vLLM
|
||||
DisableLogStatus bool `yaml:"disable_log_stats"` // vLLM
|
||||
DType string `yaml:"dtype"` // vLLM
|
||||
LimitMMPerPrompt LimitMMPerPrompt `yaml:"limit_mm_per_prompt"` // vLLM
|
||||
MMProj string `yaml:"mmproj"`
|
||||
|
||||
FlashAttention bool `yaml:"flash_attention"`
|
||||
NoKVOffloading bool `yaml:"no_kv_offloading"`
|
||||
@ -166,6 +169,13 @@ type LLMConfig struct {
|
||||
CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale
|
||||
}
|
||||
|
||||
// LimitMMPerPrompt is a struct that holds the configuration for the limit-mm-per-prompt config in vLLM
|
||||
type LimitMMPerPrompt struct {
|
||||
LimitImagePerPrompt int `yaml:"image"`
|
||||
LimitVideoPerPrompt int `yaml:"video"`
|
||||
LimitAudioPerPrompt int `yaml:"audio"`
|
||||
}
|
||||
|
||||
// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
|
||||
type AutoGPTQ struct {
|
||||
ModelBaseName string `yaml:"model_base_name"`
|
||||
|
@ -16,6 +16,8 @@ config_file: |
|
||||
use_tokenizer_template: true
|
||||
# Uncomment to specify a quantization method (optional)
|
||||
# quantization: "awq"
|
||||
# Uncomment to set dtype, choices are: "auto", "half", "float16", "bfloat16", "float", "float32". awq on vLLM does not support bfloat16
|
||||
# dtype: "float16"
|
||||
# Uncomment to limit the GPU memory utilization (vLLM default is 0.9 for 90%)
|
||||
# gpu_memory_utilization: 0.5
|
||||
# Uncomment to trust remote code from huggingface
|
||||
@ -30,3 +32,10 @@ config_file: |
|
||||
# Allows you to partition and run large models. Performance gains are limited.
|
||||
# https://github.com/vllm-project/vllm/issues/1435
|
||||
# tensor_parallel_size: 2
|
||||
# Uncomment to disable log stats
|
||||
# disable_log_stats: true
|
||||
# Uncomment to specify Multi-Model limits per prompt, defaults to 1 per modality if not specified
|
||||
# limit_mm_per_prompt:
|
||||
# image: 2
|
||||
# video: 2
|
||||
# audio: 2
|
||||
|
Loading…
x
Reference in New Issue
Block a user