diff --git a/backend/backend.proto b/backend/backend.proto
index 62e1a1a6..ec01e4a7 100644
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -177,6 +177,7 @@ message ModelOptions {
   bool EnforceEager = 52;
   int32 SwapSpace = 53;
   int32 MaxModelLen = 54;
+  int32 TensorParallelSize = 55;
 
   string MMProj = 41;
 
diff --git a/backend/python/vllm/backend_vllm.py b/backend/python/vllm/backend_vllm.py
index ff0f0b26..2d8b55db 100644
--- a/backend/python/vllm/backend_vllm.py
+++ b/backend/python/vllm/backend_vllm.py
@@ -95,6 +95,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             engine_args.trust_remote_code = request.TrustRemoteCode
         if request.EnforceEager:
             engine_args.enforce_eager = request.EnforceEager
+        if request.TensorParallelSize:
+            engine_args.tensor_parallel_size = request.TensorParallelSize
         if request.SwapSpace != 0:
             engine_args.swap_space = request.SwapSpace
         if request.MaxModelLen != 0:
diff --git a/core/backend/options.go b/core/backend/options.go
index 5b303b05..60cb01ff 100644
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -74,6 +74,7 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		EnforceEager: c.EnforceEager,
 		SwapSpace:    int32(c.SwapSpace),
 		MaxModelLen:  int32(c.MaxModelLen),
+		TensorParallelSize: int32(c.TensorParallelSize),
 		MMProj:        c.MMProj,
 		YarnExtFactor: c.YarnExtFactor,
 		YarnAttnFactor: c.YarnAttnFactor,
diff --git a/core/config/backend_config.go b/core/config/backend_config.go
index 1161cf9f..a439ee63 100644
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -140,6 +140,7 @@ type LLMConfig struct {
 	EnforceEager bool `yaml:"enforce_eager"` // vLLM
 	SwapSpace    int  `yaml:"swap_space"`    // vLLM
 	MaxModelLen  int  `yaml:"max_model_len"` // vLLM
+	TensorParallelSize int `yaml:"tensor_parallel_size"` // vLLM
 	MMProj string `yaml:"mmproj"`
 
 	RopeScaling string `yaml:"rope_scaling"`
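
For context, the change plumbs a single integer from a model's YAML config (the new `tensor_parallel_size` key) through `gRPCModelOpts` and the `ModelOptions.TensorParallelSize` proto field into the Python backend, where it ends up on vLLM's engine args. Below is a minimal standalone sketch of what that final step amounts to on the vLLM side; the imports mirror what `backend_vllm.py` already uses, but the model name and parallel size are placeholders, not part of this change:

```python
# Hypothetical standalone example mirroring the guarded assignment added in
# backend_vllm.py. Requires vllm and at least `tensor_parallel_size` visible
# GPUs; "facebook/opt-125m" is an illustrative placeholder model.
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

engine_args = AsyncEngineArgs(model="facebook/opt-125m")

# As in the diff, the option is only applied when the request carries a
# non-zero value, so existing configs without the key keep vLLM's default.
tensor_parallel_size = 2  # would come from request.TensorParallelSize
if tensor_parallel_size:
    engine_args.tensor_parallel_size = tensor_parallel_size

# vLLM shards the model's weights across that many GPUs.
engine = AsyncLLMEngine.from_engine_args(engine_args)
```

The `if request.TensorParallelSize:` guard matches the existing `SwapSpace`/`MaxModelLen` pattern: proto3 int32 fields default to 0, so an unset config leaves the engine arg untouched.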