diff --git a/configs/config.yaml b/configs/config.yaml
index 482d756..41a31a2 100644
--- a/configs/config.yaml
+++ b/configs/config.yaml
@@ -1,5 +1,5 @@
 # =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
-version: 68
+version: 70
 binding_name: null
 model_name: null
 
@@ -91,7 +91,7 @@ ollama_base_url: http://localhost:11434
 enable_petals_service: false
 petals_base_url: http://localhost:8064
 petals_model_path: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-petals_device: cuda0
+petals_device: cuda
 
 # lollms service
 enable_lollms_service: false
@@ -105,6 +105,8 @@ elastic_search_url: http://localhost:9200
 enable_vllm_service: false
 vllm_url: http://localhost:8000
 vllm_model_path: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+vllm_gpu_memory_utilization: 0.9
+vllm_max_model_len: 4096
 
 
 # Audio
diff --git a/lollms/configs/config.yaml b/lollms/configs/config.yaml
index 482d756..41a31a2 100644
--- a/lollms/configs/config.yaml
+++ b/lollms/configs/config.yaml
@@ -1,5 +1,5 @@
 # =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
-version: 68
+version: 70
 binding_name: null
 model_name: null
 
@@ -91,7 +91,7 @@ ollama_base_url: http://localhost:11434
 enable_petals_service: false
 petals_base_url: http://localhost:8064
 petals_model_path: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-petals_device: cuda0
+petals_device: cuda
 
 # lollms service
 enable_lollms_service: false
@@ -105,6 +105,8 @@ elastic_search_url: http://localhost:9200
 enable_vllm_service: false
 vllm_url: http://localhost:8000
 vllm_model_path: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+vllm_gpu_memory_utilization: 0.9
+vllm_max_model_len: 4096
 
 
 # Audio
diff --git a/lollms/server/configs/config.yaml b/lollms/server/configs/config.yaml
index 482d756..41a31a2 100644
--- a/lollms/server/configs/config.yaml
+++ b/lollms/server/configs/config.yaml
@@ -1,5 +1,5 @@
 # =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
-version: 68
+version: 70
 binding_name: null
 model_name: null
 
@@ -91,7 +91,7 @@ ollama_base_url: http://localhost:11434
 enable_petals_service: false
 petals_base_url: http://localhost:8064
 petals_model_path: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-petals_device: cuda0
+petals_device: cuda
 
 # lollms service
 enable_lollms_service: false
@@ -105,6 +105,8 @@ elastic_search_url: http://localhost:9200
 enable_vllm_service: false
 vllm_url: http://localhost:8000
 vllm_model_path: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+vllm_gpu_memory_utilization: 0.9
+vllm_max_model_len: 4096
 
 
 # Audio
diff --git a/lollms/services/vllm/lollms_vllm.py b/lollms/services/vllm/lollms_vllm.py
index fc96188..c4189ab 100644
--- a/lollms/services/vllm/lollms_vllm.py
+++ b/lollms/services/vllm/lollms_vllm.py
@@ -11,6 +11,7 @@ import sys
 from lollms.app import LollmsApplication
 from lollms.paths import LollmsPaths
 from lollms.config import TypedConfig, ConfigTemplate, BaseConfig
+from lollms.utilities import url2host_port
 import time
 import io
 import sys
@@ -103,12 +104,13 @@ class Service:
 
         if not self.wait_for_service(1,False) and base_url is None:
             ASCIIColors.info("Loading vllm service")
+            host, port = url2host_port(base_url)
             # run vllm
             if platform.system() == 'Windows':
                 #subprocess.Popen(['wsl', 'ls', '$HOME'])
-                subprocess.Popen(['wsl', 'bash', '$HOME/run_vllm.sh', self.app.config.vllm_model_path])
+                subprocess.Popen(['wsl', 'bash', '$HOME/run_vllm.sh', self.app.config.vllm_model_path, host, str(port), str(self.app.config.vllm_max_model_len), str(self.app.config.vllm_gpu_memory_utilization)])
             else:
-                subprocess.Popen(['bash', f'{Path.home()}/run_vllm.sh', self.app.config.vllm_model_path])
+                subprocess.Popen(['bash', f'{Path.home()}/run_vllm.sh', self.app.config.vllm_model_path, host, str(port), str(self.app.config.vllm_max_model_len), str(self.app.config.vllm_gpu_memory_utilization)])
 
         # Wait until the service is available at http://127.0.0.1:7860/
         self.wait_for_service(max_retries=wait_max_retries)
diff --git a/lollms/services/vllm/run_vllm.sh b/lollms/services/vllm/run_vllm.sh
index 8c2a42c..0293cb4 100644
--- a/lollms/services/vllm/run_vllm.sh
+++ b/lollms/services/vllm/run_vllm.sh
@@ -4,7 +4,13 @@ PATH="$HOME/miniconda3/bin:$PATH"
 export PATH
 echo "Initializing conda"
 $HOME/miniconda3/bin/conda init --all
-source activate vllm && python -m vllm.entrypoints.openai.api_server --model "$1"
+echo "Initializing vllm with:"
+echo "model :$1"
+echo "host :$2"
+echo "port :$3"
+echo "max_model_len :$4"
+echo "gpu_memory_utilization :$5"
+source activate vllm && python -m vllm.entrypoints.openai.api_server --model "$1" --host "$2" --port "$3" --max-model-len "$4" --gpu-memory-utilization "$5"
 
 # Wait for all background processes to finish
 wait
\ No newline at end of file
diff --git a/lollms/utilities.py b/lollms/utilities.py
index e52e8a7..ab70c8d 100644
--- a/lollms/utilities.py
+++ b/lollms/utilities.py
@@ -128,6 +128,18 @@ def discussion_path_to_url(file_path:str|Path)->str:
     return "/".join([urllib.parse.quote(p, safe="") for p in url.split("/")])
 
 
+def url2host_port(url, default_port =8000):
+    if "http" in url:
+        parts = url.split(":")
+        host = ":".join(parts[:2])
+        port = url.split(":")[2] if len(parts)==3 else default_port
+        return host, port
+    else:
+        parts = url.split(":")
+        host = parts[0]
+        port = url.split(":")[1] if len(parts)==2 else default_port
+        return host, port
+
 def is_asyncio_loop_running():
     """
     # This function checks if an AsyncIO event loop is currently running. If an event loop is running, it returns True. If not, it returns False.
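For reference, a minimal usage sketch of the new url2host_port helper, assuming it is imported from lollms.utilities exactly as the lollms_vllm.py hunk does; the example URLs mirror the default vllm_url from config.yaml and are illustrative only. As written, the helper keeps the scheme attached to the host and returns the port as a string when it appears in the URL, falling back to default_port (an int) otherwise, which is why the call sites wrap it in str(port).

from lollms.utilities import url2host_port

# URL with an explicit port: the scheme stays attached to the host part
host, port = url2host_port("http://localhost:8000")
# -> host == "http://localhost", port == "8000" (str)

# URL without a port: default_port is returned as-is (an int)
host, port = url2host_port("http://localhost", default_port=8000)
# -> host == "http://localhost", port == 8000 (int)

# Bare host:port without a scheme is also handled
host, port = url2host_port("localhost:8000")
# -> host == "localhost", port == "8000" (str)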