feat(vllm): expose 'load_format' (#3943)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-03 07:50:52 +00:00 · 2024-10-23 15:34:57 +02:00 · 2024-10-23 15:34:57 +02:00 · ae1ec4e096
commit ae1ec4e096
parent c75ecfa009
3 changed files with 4 additions and 0 deletions
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -95,6 +95,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

        if request.Quantization != "":
            engine_args.quantization = request.Quantization
+        if request.LoadFormat != "":
+            engine_args.load_format = request.LoadFormat
        if request.GPUMemoryUtilization != 0:
            engine_args.gpu_memory_utilization = request.GPUMemoryUtilization
        if request.TrustRemoteCode:
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -139,6 +139,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		DraftModel:           c.DraftModel,
 		AudioPath:            c.VallE.AudioPath,
 		Quantization:         c.Quantization,
+		LoadFormat:           c.LoadFormat,
 		GPUMemoryUtilization: c.GPUMemoryUtilization,
 		TrustRemoteCode:      c.TrustRemoteCode,
 		EnforceEager:         c.EnforceEager,
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -143,6 +143,7 @@ type LLMConfig struct {
 	DraftModel           string  `yaml:"draft_model"`
 	NDraft               int32   `yaml:"n_draft"`
 	Quantization         string  `yaml:"quantization"`
+	LoadFormat           string  `yaml:"load_format"`
 	GPUMemoryUtilization float32 `yaml:"gpu_memory_utilization"` // vLLM
 	TrustRemoteCode      bool    `yaml:"trust_remote_code"`      // vLLM
 	EnforceEager         bool    `yaml:"enforce_eager"`          // vLLM