mirror of https://github.com/ParisNeo/lollms.git (synced 2025-04-01 16:20:41 +00:00)

commit c29acbb6a6 (parent 97842c708c): fast ai
@@ -1,5 +1,5 @@
 # =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
-version: 68
+version: 70
 binding_name: null
 model_name: null
@@ -91,7 +91,7 @@ ollama_base_url: http://localhost:11434
 enable_petals_service: false
 petals_base_url: http://localhost:8064
 petals_model_path: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-petals_device: cuda0
+petals_device: cuda
 
 # lollms service
 enable_lollms_service: false
@@ -105,6 +105,8 @@ elastic_search_url: http://localhost:9200
 enable_vllm_service: false
 vllm_url: http://localhost:8000
 vllm_model_path: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+vllm_gpu_memory_utilization: 0.9
+vllm_max_model_len: 4096
 
 
 # Audio
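The last hunk above adds two vLLM tuning keys, vllm_gpu_memory_utilization and vllm_max_model_len, which the rest of this commit forwards to the vLLM OpenAI server as --gpu-memory-utilization and --max-model-len. A minimal sketch (not part of the commit), assuming PyYAML is installed and a configuration file containing these keys, of how they map onto the server flags:

import yaml  # PyYAML

# "config.yaml" is a placeholder path for the configuration file shown above.
with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

# Build the extra server flags the commit wires through run_vllm.sh.
vllm_flags = [
    "--model", cfg["vllm_model_path"],
    "--max-model-len", str(cfg["vllm_max_model_len"]),                    # 4096
    "--gpu-memory-utilization", str(cfg["vllm_gpu_memory_utilization"]),  # 0.9
]
print(vllm_flags)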
@@ -11,6 +11,7 @@ import sys
 from lollms.app import LollmsApplication
 from lollms.paths import LollmsPaths
 from lollms.config import TypedConfig, ConfigTemplate, BaseConfig
+from lollms.utilities import url2host_port
 import time
 import io
 import sys
@@ -103,12 +104,13 @@ class Service:
         if not self.wait_for_service(1,False) and base_url is None:
             ASCIIColors.info("Loading vllm service")
 
+            host, port = url2host_port(base_url)
             # run vllm
             if platform.system() == 'Windows':
                 #subprocess.Popen(['wsl', 'ls', '$HOME'])
-                subprocess.Popen(['wsl', 'bash', '$HOME/run_vllm.sh', self.app.config.vllm_model_path])
+                subprocess.Popen(['wsl', 'bash', '$HOME/run_vllm.sh', self.app.config.vllm_model_path, host, str(port), str(self.app.config.vllm_max_model_len), str(self.app.config.vllm_gpu_memory_utilization)])
             else:
-                subprocess.Popen(['bash', f'{Path.home()}/run_vllm.sh', self.app.config.vllm_model_path])
+                subprocess.Popen(['bash', f'{Path.home()}/run_vllm.sh', self.app.config.vllm_model_path, host, str(port), str(self.app.config.vllm_max_model_len), str(self.app.config.vllm_gpu_memory_utilization)])
 
             # Wait until the service is available at http://127.0.0.1:7860/
             self.wait_for_service(max_retries=wait_max_retries)
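The service splits base_url into host and port with the new url2host_port helper, forwards them to run_vllm.sh, and then blocks in wait_for_service until the server responds. A rough illustration of such a readiness poll (not the project's wait_for_service implementation; it assumes the /v1/models route served by vllm.entrypoints.openai.api_server):

import time
import requests

def poll_vllm(host: str = "http://localhost", port: int = 8000,
              max_retries: int = 30, delay: float = 2.0) -> bool:
    """Return True once the vLLM OpenAI-compatible API answers, False after max_retries."""
    url = f"{host}:{port}/v1/models"
    for _ in range(max_retries):
        try:
            if requests.get(url, timeout=2).status_code == 200:
                return True
        except requests.RequestException:
            pass
        time.sleep(delay)
    return False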
@@ -4,7 +4,13 @@ PATH="$HOME/miniconda3/bin:$PATH"
 export PATH
 echo "Initializing conda"
 $HOME/miniconda3/bin/conda init --all
-source activate vllm && python -m vllm.entrypoints.openai.api_server --model "$1"
+echo "Initializing vllm with:"
+echo "model :$1"
+echo "host :$2"
+echo "port :$3"
+echo "max_model_len :$4"
+echo "gpu_memory_utilization :$5"
+source activate vllm && python -m vllm.entrypoints.openai.api_server --model "$1" --host "$2" --port "$3" --max-model-len "$4" --gpu-memory-utilization "$5"
 
 # Wait for all background processes to finish
 wait
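run_vllm.sh now expects five positional parameters: model, host, port, max_model_len and gpu_memory_utilization. A quick sketch (example values only, not from the commit) for exercising the script outside lollms, mirroring the non-Windows call in the Service class:

import subprocess
from pathlib import Path

# Example values; the model path matches the default in the config diff above.
subprocess.Popen([
    "bash", f"{Path.home()}/run_vllm.sh",
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # $1 model
    "localhost",                           # $2 host
    "8000",                                # $3 port
    "4096",                                # $4 max_model_len
    "0.9",                                 # $5 gpu_memory_utilization
])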
@@ -128,6 +128,18 @@ def discussion_path_to_url(file_path:str|Path)->str:
     return "/".join([urllib.parse.quote(p, safe="") for p in url.split("/")])
 
 
+def url2host_port(url, default_port =8000):
+    if "http" in url:
+        parts = url.split(":")
+        host = ":".join(parts[:2])
+        port = url.split(":")[2] if len(parts)==3 else default_port
+        return host, port
+    else:
+        parts = url.split(":")
+        host = parts[0]
+        port = url.split(":")[1] if len(parts)==2 else default_port
+        return host, port
+
 def is_asyncio_loop_running():
     """
     # This function checks if an AsyncIO event loop is currently running. If an event loop is running, it returns True. If not, it returns False.
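For reference, the new helper keeps the scheme in the returned host when the input has one (url2host_port("http://localhost:8000") gives ("http://localhost", "8000")), and the port comes back as a string when parsed but as an int when the default is used. A hypothetical equivalent built on urllib.parse (not part of the commit) that always normalizes the port to an int:

from urllib.parse import urlparse

def url2host_port_alt(url: str, default_port: int = 8000):
    """Sketch of an alternative parser; mirrors url2host_port but returns an int port."""
    parsed = urlparse(url if "://" in url else f"http://{url}")
    host = f"{parsed.scheme}://{parsed.hostname}" if "://" in url else parsed.hostname
    return host, parsed.port or default_port

# url2host_port_alt("http://localhost:8000") -> ("http://localhost", 8000)
# url2host_port_alt("localhost")             -> ("localhost", 8000)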