mirror of https://github.com/ParisNeo/lollms.git (synced 2025-04-01 16:20:41 +00:00)

commit c29acbb6a6 (parent 97842c708c): fast ai
@@ -1,5 +1,5 @@
 # =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
-version: 68
+version: 70
 binding_name: null
 model_name: null
@@ -91,7 +91,7 @@ ollama_base_url: http://localhost:11434
 enable_petals_service: false
 petals_base_url: http://localhost:8064
 petals_model_path: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-petals_device: cuda0
+petals_device: cuda
 
 # lollms service
 enable_lollms_service: false
@@ -105,6 +105,8 @@ elastic_search_url: http://localhost:9200
 enable_vllm_service: false
 vllm_url: http://localhost:8000
 vllm_model_path: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+vllm_gpu_memory_utilization: 0.9
+vllm_max_model_len: 4096
 
 
 # Audio
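The last hunk above adds two vLLM tuning keys, vllm_gpu_memory_utilization and vllm_max_model_len, which the rest of this commit forwards to the vLLM OpenAI server as --gpu-memory-utilization and --max-model-len. A minimal sketch (not part of the commit), assuming PyYAML is installed and a configuration file containing these keys, of how they map onto the server flags:

import yaml  # PyYAML

# "config.yaml" is a placeholder path for the configuration file shown above.
with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

# Build the extra server flags the commit wires through run_vllm.sh.
vllm_flags = [
    "--model", cfg["vllm_model_path"],
    "--max-model-len", str(cfg["vllm_max_model_len"]),                    # 4096
    "--gpu-memory-utilization", str(cfg["vllm_gpu_memory_utilization"]),  # 0.9
]
print(vllm_flags)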
@@ -11,6 +11,7 @@ import sys
 from lollms.app import LollmsApplication
 from lollms.paths import LollmsPaths
 from lollms.config import TypedConfig, ConfigTemplate, BaseConfig
+from lollms.utilities import url2host_port
 import time
 import io
 import sys
@@ -103,12 +104,13 @@ class Service:
         if not self.wait_for_service(1,False) and base_url is None:
             ASCIIColors.info("Loading vllm service")
 
+            host, port = url2host_port(base_url)
             # run vllm
             if platform.system() == 'Windows':
                 #subprocess.Popen(['wsl', 'ls', '$HOME'])
-                subprocess.Popen(['wsl', 'bash', '$HOME/run_vllm.sh', self.app.config.vllm_model_path])
+                subprocess.Popen(['wsl', 'bash', '$HOME/run_vllm.sh', self.app.config.vllm_model_path, host, str(port), str(self.app.config.vllm_max_model_len), str(self.app.config.vllm_gpu_memory_utilization)])
             else:
-                subprocess.Popen(['bash', f'{Path.home()}/run_vllm.sh', self.app.config.vllm_model_path])
+                subprocess.Popen(['bash', f'{Path.home()}/run_vllm.sh', self.app.config.vllm_model_path, host, str(port), str(self.app.config.vllm_max_model_len), str(self.app.config.vllm_gpu_memory_utilization)])
 
             # Wait until the service is available at http://127.0.0.1:7860/
             self.wait_for_service(max_retries=wait_max_retries)
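The service splits base_url into host and port with the new url2host_port helper, forwards them to run_vllm.sh, and then blocks in wait_for_service until the server responds. A rough illustration of such a readiness poll (not the project's wait_for_service implementation; it assumes the /v1/models route served by vllm.entrypoints.openai.api_server):

import time
import requests

def poll_vllm(host: str = "http://localhost", port: int = 8000,
              max_retries: int = 30, delay: float = 2.0) -> bool:
    """Return True once the vLLM OpenAI-compatible API answers, False after max_retries."""
    url = f"{host}:{port}/v1/models"
    for _ in range(max_retries):
        try:
            if requests.get(url, timeout=2).status_code == 200:
                return True
        except requests.RequestException:
            pass
        time.sleep(delay)
    return False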
@@ -4,7 +4,13 @@ PATH="$HOME/miniconda3/bin:$PATH"
 export PATH
 echo "Initializing conda"
 $HOME/miniconda3/bin/conda init --all
-source activate vllm && python -m vllm.entrypoints.openai.api_server --model "$1"
+echo "Initializing vllm with:"
+echo "model :$1"
+echo "host :$2"
+echo "port :$3"
+echo "max_model_len :$4"
+echo "gpu_memory_utilization :$5"
+source activate vllm && python -m vllm.entrypoints.openai.api_server --model "$1" --host "$2" --port "$3" --max-model-len "$4" --gpu-memory-utilization "$5"
 
 # Wait for all background processes to finish
 wait
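run_vllm.sh now expects five positional parameters: model, host, port, max_model_len and gpu_memory_utilization. A quick sketch (example values only, not from the commit) for exercising the script outside lollms, mirroring the non-Windows call in the Service class:

import subprocess
from pathlib import Path

# Example values; the model path matches the default in the config diff above.
subprocess.Popen([
    "bash", f"{Path.home()}/run_vllm.sh",
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # $1 model
    "localhost",                           # $2 host
    "8000",                                # $3 port
    "4096",                                # $4 max_model_len
    "0.9",                                 # $5 gpu_memory_utilization
])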
@@ -128,6 +128,18 @@ def discussion_path_to_url(file_path:str|Path)->str:
     return "/".join([urllib.parse.quote(p, safe="") for p in url.split("/")])
 
 
+def url2host_port(url, default_port =8000):
+    if "http" in url:
+        parts = url.split(":")
+        host = ":".join(parts[:2])
+        port = url.split(":")[2] if len(parts)==3 else default_port
+        return host, port
+    else:
+        parts = url.split(":")
+        host = parts[0]
+        port = url.split(":")[1] if len(parts)==2 else default_port
+        return host, port
+
 def is_asyncio_loop_running():
     """
     # This function checks if an AsyncIO event loop is currently running. If an event loop is running, it returns True. If not, it returns False.
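For reference, the new helper keeps the scheme in the returned host when the input has one (url2host_port("http://localhost:8000") gives ("http://localhost", "8000")), and the port comes back as a string when parsed but as an int when the default is used. A hypothetical equivalent built on urllib.parse (not part of the commit) that always normalizes the port to an int:

from urllib.parse import urlparse

def url2host_port_alt(url: str, default_port: int = 8000):
    """Sketch of an alternative parser; mirrors url2host_port but returns an int port."""
    parsed = urlparse(url if "://" in url else f"http://{url}")
    host = f"{parsed.scheme}://{parsed.hostname}" if "://" in url else parsed.hostname
    return host, parsed.port or default_port

# url2host_port_alt("http://localhost:8000") -> ("http://localhost", 8000)
# url2host_port_alt("localhost")             -> ("localhost", 8000)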