diff --git a/configs/config.yaml b/configs/config.yaml
index 482d756..41a31a2 100644
--- a/configs/config.yaml
+++ b/configs/config.yaml
@@ -1,5 +1,5 @@
 # =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
-version: 68
+version: 70
 binding_name: null
 model_name: null
 
@@ -91,7 +91,7 @@ ollama_base_url: http://localhost:11434
 enable_petals_service: false
 petals_base_url: http://localhost:8064
 petals_model_path: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-petals_device: cuda0
+petals_device: cuda
 
 # lollms service
 enable_lollms_service: false
@@ -105,6 +105,8 @@ elastic_search_url: http://localhost:9200
 enable_vllm_service: false
 vllm_url: http://localhost:8000
 vllm_model_path: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+vllm_gpu_memory_utilization: 0.9
+vllm_max_model_len: 4096
 
 
 # Audio
diff --git a/lollms/configs/config.yaml b/lollms/configs/config.yaml
index 482d756..41a31a2 100644
--- a/lollms/configs/config.yaml
+++ b/lollms/configs/config.yaml
@@ -1,5 +1,5 @@
 # =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
-version: 68
+version: 70
 binding_name: null
 model_name: null
 
@@ -91,7 +91,7 @@ ollama_base_url: http://localhost:11434
 enable_petals_service: false
 petals_base_url: http://localhost:8064
 petals_model_path: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-petals_device: cuda0
+petals_device: cuda
 
 # lollms service
 enable_lollms_service: false
@@ -105,6 +105,8 @@ elastic_search_url: http://localhost:9200
 enable_vllm_service: false
 vllm_url: http://localhost:8000
 vllm_model_path: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+vllm_gpu_memory_utilization: 0.9
+vllm_max_model_len: 4096
 
 
 # Audio
diff --git a/lollms/server/configs/config.yaml b/lollms/server/configs/config.yaml
index 482d756..41a31a2 100644
--- a/lollms/server/configs/config.yaml
+++ b/lollms/server/configs/config.yaml
@@ -1,5 +1,5 @@
 # =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
-version: 68
+version: 70
 binding_name: null
 model_name: null
 
@@ -91,7 +91,7 @@ ollama_base_url: http://localhost:11434
 enable_petals_service: false
 petals_base_url: http://localhost:8064
 petals_model_path: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-petals_device: cuda0
+petals_device: cuda
 
 # lollms service
 enable_lollms_service: false
@@ -105,6 +105,8 @@ elastic_search_url: http://localhost:9200
 enable_vllm_service: false
 vllm_url: http://localhost:8000
 vllm_model_path: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+vllm_gpu_memory_utilization: 0.9
+vllm_max_model_len: 4096
 
 
 # Audio
diff --git a/lollms/services/vllm/lollms_vllm.py b/lollms/services/vllm/lollms_vllm.py
index fc96188..c4189ab 100644
--- a/lollms/services/vllm/lollms_vllm.py
+++ b/lollms/services/vllm/lollms_vllm.py
@@ -11,6 +11,7 @@ import sys
 from lollms.app import LollmsApplication
 from lollms.paths import LollmsPaths
 from lollms.config import TypedConfig, ConfigTemplate, BaseConfig
+from lollms.utilities import url2host_port
 import time
 import io
 import sys
@@ -103,12 +104,13 @@ class Service:
 
         if not self.wait_for_service(1,False) and base_url is None:
             ASCIIColors.info("Loading vllm service")
+            host, port = url2host_port(base_url)
             # run vllm
             if platform.system() == 'Windows':
                 #subprocess.Popen(['wsl', 'ls', '$HOME'])
-                subprocess.Popen(['wsl', 'bash', '$HOME/run_vllm.sh', self.app.config.vllm_model_path])
+                subprocess.Popen(['wsl', 'bash', '$HOME/run_vllm.sh', self.app.config.vllm_model_path, host, str(port), str(self.app.config.vllm_max_model_len), str(self.app.config.vllm_gpu_memory_utilization)])
             else:
-                subprocess.Popen(['bash', f'{Path.home()}/run_vllm.sh', self.app.config.vllm_model_path])
+                subprocess.Popen(['bash', f'{Path.home()}/run_vllm.sh', self.app.config.vllm_model_path, host, str(port), str(self.app.config.vllm_max_model_len), str(self.app.config.vllm_gpu_memory_utilization)])
 
         # Wait until the service is available at http://127.0.0.1:7860/
         self.wait_for_service(max_retries=wait_max_retries)
diff --git a/lollms/services/vllm/run_vllm.sh b/lollms/services/vllm/run_vllm.sh
index 8c2a42c..0293cb4 100644
--- a/lollms/services/vllm/run_vllm.sh
+++ b/lollms/services/vllm/run_vllm.sh
@@ -4,7 +4,13 @@ PATH="$HOME/miniconda3/bin:$PATH"
 export PATH
 echo "Initializing conda"
 $HOME/miniconda3/bin/conda init --all
-source activate vllm && python -m vllm.entrypoints.openai.api_server --model "$1"
+echo "Initializing vllm with:"
+echo "model :$1"
+echo "host :$2"
+echo "port :$3"
+echo "max_model_len :$4"
+echo "gpu_memory_utilization :$5"
+source activate vllm && python -m vllm.entrypoints.openai.api_server --model "$1" --host "$2" --port "$3" --max-model-len "$4" --gpu-memory-utilization "$5"
 
 # Wait for all background processes to finish
 wait
\ No newline at end of file
diff --git a/lollms/utilities.py b/lollms/utilities.py
index e52e8a7..ab70c8d 100644
--- a/lollms/utilities.py
+++ b/lollms/utilities.py
@@ -128,6 +128,18 @@ def discussion_path_to_url(file_path:str|Path)->str:
     return "/".join([urllib.parse.quote(p, safe="") for p in url.split("/")])
 
 
+def url2host_port(url, default_port =8000):
+    if "http" in url:
+        parts = url.split(":")
+        host = ":".join(parts[:2])
+        port = url.split(":")[2] if len(parts)==3 else default_port
+        return host, port
+    else:
+        parts = url.split(":")
+        host = parts[0]
+        port = url.split(":")[1] if len(parts)==2 else default_port
+        return host, port
+
 def is_asyncio_loop_running():
     """
     # This function checks if an AsyncIO event loop is currently running. If an event loop is running, it returns True. If not, it returns False.
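For reference, a minimal usage sketch of the new url2host_port helper, assuming it is imported from lollms.utilities exactly as the lollms_vllm.py hunk does; the example URLs mirror the default vllm_url from config.yaml and are illustrative only. As written, the helper keeps the scheme attached to the host and returns the port as a string when it appears in the URL, falling back to default_port (an int) otherwise, which is why the call sites wrap it in str(port).

from lollms.utilities import url2host_port

# URL with an explicit port: the scheme stays attached to the host part
host, port = url2host_port("http://localhost:8000")
# -> host == "http://localhost", port == "8000" (str)

# URL without a port: default_port is returned as-is (an int)
host, port = url2host_port("http://localhost", default_port=8000)
# -> host == "http://localhost", port == 8000 (int)

# Bare host:port without a scheme is also handled
host, port = url2host_port("localhost:8000")
# -> host == "localhost", port == "8000" (str)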