feat(vllm): expose 'load_format' (#3943)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto 2024-10-23 15:34:57 +02:00 committed by GitHub
parent c75ecfa009
commit ae1ec4e096
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 4 additions and 0 deletions

View File

@ -95,6 +95,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if request.Quantization != "":
engine_args.quantization = request.Quantization
if request.LoadFormat != "":
engine_args.load_format = request.LoadFormat
if request.GPUMemoryUtilization != 0:
engine_args.gpu_memory_utilization = request.GPUMemoryUtilization
if request.TrustRemoteCode:

View File

@ -139,6 +139,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
DraftModel: c.DraftModel,
AudioPath: c.VallE.AudioPath,
Quantization: c.Quantization,
LoadFormat: c.LoadFormat,
GPUMemoryUtilization: c.GPUMemoryUtilization,
TrustRemoteCode: c.TrustRemoteCode,
EnforceEager: c.EnforceEager,

View File

@ -143,6 +143,7 @@ type LLMConfig struct {
DraftModel string `yaml:"draft_model"`
NDraft int32 `yaml:"n_draft"`
Quantization string `yaml:"quantization"`
LoadFormat string `yaml:"load_format"`
GPUMemoryUtilization float32 `yaml:"gpu_memory_utilization"` // vLLM
TrustRemoteCode bool `yaml:"trust_remote_code"` // vLLM
EnforceEager bool `yaml:"enforce_eager"` // vLLM