From 0739f1c2794803a1464fec43354910333ec6a829 Mon Sep 17 00:00:00 2001
From: Saifeddine ALOUI
Date: Sat, 1 Mar 2025 22:54:42 +0100
Subject: [PATCH] enhanced core

---
 configs/config.yaml                                |   9 +-
 lollms/app.py                                      |  12 +-
 lollms/configs/config.yaml                         |   9 +-
 lollms/server/configs/config.yaml                  |  41 +++++-
 lollms/server/endpoints/lollms_ttm.py              |  40 ++++++
 lollms/server/endpoints/lollms_ttv.py              |  40 ++++++
 .../ttv/novita_ai/lollms_novita_ai.py              | 127 ++++++++++++++++++
 lollms/ttv.py                                      |  47 ++++++-
 8 files changed, 312 insertions(+), 13 deletions(-)
 create mode 100644 lollms/server/endpoints/lollms_ttm.py
 create mode 100644 lollms/server/endpoints/lollms_ttv.py
 create mode 100644 lollms/services/ttv/novita_ai/lollms_novita_ai.py

diff --git a/configs/config.yaml b/configs/config.yaml
index 09ccbaf..fccafde 100644
--- a/configs/config.yaml
+++ b/configs/config.yaml
@@ -1,5 +1,5 @@
 # =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
-version: 156
+version: 157
 
 # video viewing and news recovering
 last_viewed_video: null
@@ -111,7 +111,7 @@ active_tts_service: "None" # xtts (offline), openai_tts (API key required), elev
 active_tti_service: "None" # autosd (offline), diffusers (offline), diffusers_client (online), dall-e (online), midjourney (online)
 active_stt_service: "None" # whisper (offline), asr (offline or online), openai_whiosper (API key required)
 active_ttm_service: "None" # musicgen (offline)
-active_ttv_service: "None" # cog_video_x, diffusers, lumalab (offline)
+active_ttv_service: "None" # novita_ai, cog_video_x, diffusers, lumalab (offline)
 
 # -------------------- Services --------------------------
 # ***************** STT *****************
@@ -221,6 +221,9 @@ motion_ctrl_base_url: http://localhost:7861
 
 # ***************** TTV *****************
 
+# Novita_ai configuration
+novita_ai_key: ""
+
 cog_video_x_model: "THUDM/CogVideoX-5b"
 
 # lumalabs configuration
@@ -360,6 +363,8 @@ thinking_prompt: "Use a think first process to answer the user:
 
 mounted_function_calls: []
 # { name: the function name,
+# author: the author of the function
+# category: the category of the function
 # value: the function name without spaces,
 # selected: selected or not,
 # icon: the icon in form feather:icon name or img:url or b64:base64,
diff --git a/lollms/app.py b/lollms/app.py
index 83debf8..7ccd4de 100644
--- a/lollms/app.py
+++ b/lollms/app.py
@@ -624,12 +624,18 @@ class LollmsApplication(LoLLMsCom):
         ASCIIColors.execute_with_animation("Loading loacal TTI services", start_tti, ASCIIColors.color_blue)
 
         def start_ttv(*args, **kwargs):
-            if self.config.active_ttv_service == "lumalabs" and (self.ttv is None or self.tti.name!="lumalabs"):
+            if self.config.active_ttv_service == "lumalabs" and (self.ttv is None or self.ttv.name!="lumalabs"):
                 try:
                     from lollms.services.ttv.lumalabs.lollms_lumalabs import LollmsLumaLabs
-                    self.sd = LollmsLumaLabs(self.config.lumalabs_key)
+                    self.ttv = LollmsLumaLabs(self.config.lumalabs_key)
                 except:
-                    self.warning(f"Couldn't load SD")
+                    self.warning(f"Couldn't create lumalabs binding")
+            if self.config.active_ttv_service == "novita_ai" and (self.ttv is None or self.ttv.name!="novita_ai"):
+                try:
+                    from lollms.services.ttv.novita_ai.lollms_novita_ai import LollmsNovitaAITextToVideo
+                    self.ttv = LollmsNovitaAITextToVideo(self.config.novita_ai_key)
+                except:
+                    self.warning(f"Couldn't create novita ai binding")
 
         ASCIIColors.execute_with_animation("Loading loacal TTV services", start_ttv, ASCIIColors.color_blue)
diff --git a/lollms/configs/config.yaml b/lollms/configs/config.yaml
index 09ccbaf..fccafde 100644
--- a/lollms/configs/config.yaml
+++ b/lollms/configs/config.yaml
@@ -1,5 +1,5 @@
 # =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
-version: 156
+version: 157
 
 # video viewing and news recovering
 last_viewed_video: null
@@ -111,7 +111,7 @@ active_tts_service: "None" # xtts (offline), openai_tts (API key required), elev
 active_tti_service: "None" # autosd (offline), diffusers (offline), diffusers_client (online), dall-e (online), midjourney (online)
 active_stt_service: "None" # whisper (offline), asr (offline or online), openai_whiosper (API key required)
 active_ttm_service: "None" # musicgen (offline)
-active_ttv_service: "None" # cog_video_x, diffusers, lumalab (offline)
+active_ttv_service: "None" # novita_ai, cog_video_x, diffusers, lumalab (offline)
 
 # -------------------- Services --------------------------
 # ***************** STT *****************
@@ -221,6 +221,9 @@ motion_ctrl_base_url: http://localhost:7861
 
 # ***************** TTV *****************
 
+# Novita_ai configuration
+novita_ai_key: ""
+
 cog_video_x_model: "THUDM/CogVideoX-5b"
 
 # lumalabs configuration
@@ -360,6 +363,8 @@ thinking_prompt: "Use a think first process to answer the user:
 
 mounted_function_calls: []
 # { name: the function name,
+# author: the author of the function
+# category: the category of the function
 # value: the function name without spaces,
 # selected: selected or not,
 # icon: the icon in form feather:icon name or img:url or b64:base64,
diff --git a/lollms/server/configs/config.yaml b/lollms/server/configs/config.yaml
index 9929272..fccafde 100644
--- a/lollms/server/configs/config.yaml
+++ b/lollms/server/configs/config.yaml
@@ -1,15 +1,16 @@
 # =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
-version: 149
+version: 157
 
 # video viewing and news recovering
 last_viewed_video: null
+last_viewed_changelog_version: null
 
 binding_name: null
 model_name: null
 model_variant: null
 model_type: null
 
-show_news_panel: true
+show_news_panel: false
 
 # Security measures
 turn_on_setting_update_validation: true
@@ -110,7 +111,7 @@ active_tts_service: "None" # xtts (offline), openai_tts (API key required), elev
 active_tti_service: "None" # autosd (offline), diffusers (offline), diffusers_client (online), dall-e (online), midjourney (online)
 active_stt_service: "None" # whisper (offline), asr (offline or online), openai_whiosper (API key required)
 active_ttm_service: "None" # musicgen (offline)
-active_ttv_service: "None" # cog_video_x, diffusers, lumalab (offline)
+active_ttv_service: "None" # novita_ai, cog_video_x, diffusers, lumalab (offline)
 
 # -------------------- Services --------------------------
 # ***************** STT *****************
@@ -220,6 +221,9 @@ motion_ctrl_base_url: http://localhost:7861
 
 # ***************** TTV *****************
 
+# Novita_ai configuration
+novita_ai_key: ""
+
 cog_video_x_model: "THUDM/CogVideoX-5b"
 
 # lumalabs configuration
@@ -336,10 +340,39 @@ positive_boost: null
 negative_boost: null
 current_language: english
 fun_mode: false
+think_first_mode: false
+thinking_prompt: "Use a think first process to answer the user:
+
+    Ask yourself about the user's request and answer it with logical details.
+    If the user is requesting general information that does not require internet search and you are confident about it, then prepare to answer directly.
+    If the user is requesting general information that does require internet search and you have in the context enough information to answer, then use that data to answer.
+    If the user is requesting general information that does require internet search but you do not have any information, then ask them to activate internet search.
+    If the user is posing a riddle or asking a math question, make sure you use rigorous mathematical hypotheses, testing, and analysis.
+    If the user is requesting to perform a task, then plan it through steps and prepare to answer.
+    If the user is just discussing casually, do not perform the think first process.
+
+    Make sure you continue thinking until you find a satisfactory answer.
+    Assess any potential errors you may make.
+
+
+    After thinking you can answer the user."
+
+
+
+
+mounted_function_calls: []
+# { name: the function name,
+# author: the author of the function
+# category: the category of the function
+# value: the function name without spaces,
+# selected: selected or not,
+# icon: the icon in form feather:icon name or img:url or b64:base64,
+# help: the help
+# }
 
 # webui configurations
 show_code_of_conduct: true
 activate_audio_infos: true
-
+keep_thoughts: false
diff --git a/lollms/server/endpoints/lollms_ttm.py b/lollms/server/endpoints/lollms_ttm.py
new file mode 100644
index 0000000..0f710a5
--- /dev/null
+++ b/lollms/server/endpoints/lollms_ttm.py
@@ -0,0 +1,40 @@
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel
+from typing import Optional
+from base64 import b64encode
+import io
+from PIL import Image
+from fastapi import APIRouter
+from lollms_webui import LOLLMSWebUI
+from pydantic import BaseModel
+from typing import List
+from ascii_colors import trace_exception
+from lollms.security import check_access
+
+router = APIRouter()
+lollmsElfServer = LOLLMSWebUI.get_instance()
+
+
+# Define a Pydantic model for the request body
+class TTMServiceRequest(BaseModel):
+    client_id: str
+
+@router.post("/list_ttm_services", response_model=List[str])
+async def list_ttm_services(request: TTMServiceRequest):
+    """
+    Returns a static list of TTM services.
+
+    Args:
+        request (TTMServiceRequest): The request body containing the client_id.
+
+    Returns:
+        List[str]: A list of TTM service names.
+    """
+    # Validate the client_id
+    check_access(lollmsElfServer, request.client_id)
+
+
+    # Static list of TTM services
+    ttm_services = ["suno"]
+
+    return ttm_services
\ No newline at end of file
diff --git a/lollms/server/endpoints/lollms_ttv.py b/lollms/server/endpoints/lollms_ttv.py
new file mode 100644
index 0000000..09c7cd5
--- /dev/null
+++ b/lollms/server/endpoints/lollms_ttv.py
@@ -0,0 +1,40 @@
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel
+from typing import Optional
+from base64 import b64encode
+import io
+from PIL import Image
+from fastapi import APIRouter
+from lollms_webui import LOLLMSWebUI
+from pydantic import BaseModel
+from typing import List
+from ascii_colors import trace_exception
+from lollms.security import check_access
+
+router = APIRouter()
+lollmsElfServer = LOLLMSWebUI.get_instance()
+
+
+# Define a Pydantic model for the request body
+class TTVServiceRequest(BaseModel):
+    client_id: str
+
+@router.post("/list_ttv_services", response_model=List[str])
+async def list_ttv_services(request: TTVServiceRequest):
+    """
+    Returns a static list of TTV services.
+
+    Args:
+        request (TTVServiceRequest): The request body containing the client_id.
+
+    Returns:
+        List[str]: A list of TTV service names.
+    """
+    # Validate the client_id
+    check_access(lollmsElfServer, request.client_id)
+
+
+    # Static list of TTV services
+    ttv_services = ["novita_ai", "cog_video_x", "diffusers", "lumalab"]
+
+    return ttv_services
\ No newline at end of file
diff --git a/lollms/services/ttv/novita_ai/lollms_novita_ai.py b/lollms/services/ttv/novita_ai/lollms_novita_ai.py
new file mode 100644
index 0000000..c2f4973
--- /dev/null
+++ b/lollms/services/ttv/novita_ai/lollms_novita_ai.py
@@ -0,0 +1,127 @@
+from pathlib import Path
+from typing import List, Optional, Dict, Any
+from lollms.ttv import LollmsTTV
+import requests
+import json
+import os
+
+class LollmsNovitaAITextToVideo(LollmsTTV):
+    """
+    A binding for the Novita.ai Text-to-Video API.
+    This class allows generating videos from text prompts using the Novita.ai service.
+    """
+    def __init__(self, api_key: str, base_url: str = "https://api.novita.ai/v3/async"):
+        """
+        Initializes the LollmsNovitaAITextToVideo binding.
+
+        Args:
+            api_key (str): The API key for authentication.
+            base_url (str): The base URL for the Novita.ai API. Defaults to "https://api.novita.ai/v3/async".
+        """
+        super().__init__("novita_ai")
+        if api_key is None:
+            # Fall back to the NOVITA_AI_KEY environment variable if no API key is provided
+            api_key = os.getenv("NOVITA_AI_KEY")
+        if not api_key:
+            raise ValueError("No API key provided and NOVITA_AI_KEY environment variable is not set.")
+        self.api_key = api_key
+        self.base_url = base_url
+
+    def generate_video(
+        self,
+        prompt: str,
+        negative_prompt: Optional[str] = None,
+        model_name: str = "darkSushiMixMix_225D_64380.safetensors",
+        height: int = 512,
+        width: int = 512,
+        steps: int = 20,
+        seed: int = -1,
+        guidance_scale: Optional[float] = None,
+        loras: Optional[List[Dict[str, Any]]] = None,
+        embeddings: Optional[List[Dict[str, Any]]] = None,
+        closed_loop: Optional[bool] = None,
+        clip_skip: Optional[int] = None,
+    ) -> str:
+        """
+        Generates a video from a text prompt using the Novita.ai API.
+
+        Args:
+            prompt (str): The text prompt describing the video.
+            negative_prompt (Optional[str]): Text input to avoid in the video. Defaults to None.
+            model_name (str): Name of the model checkpoint.
+            height (int): Height of the video, range [256, 1024].
+            width (int): Width of the video, range [256, 1024].
+            steps (int): Number of denoising steps, range [1, 50].
+            seed (int): Random seed for reproducibility. Defaults to -1.
+            guidance_scale (Optional[float]): Controls adherence to the prompt. Defaults to None.
+            loras (Optional[List[Dict[str, Any]]]): List of LoRA parameters. Defaults to None.
+            embeddings (Optional[List[Dict[str, Any]]]): List of embeddings. Defaults to None.
+            closed_loop (Optional[bool]): Controls animation loop behavior. Defaults to None.
+            clip_skip (Optional[int]): Number of CLIP layers to skip. Defaults to None.
+
+        Returns:
+            str: The task_id for retrieving the generated video.
+ """ + url = f"{self.base_url}/txt2video" + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + } + payload = { + "model_name": model_name, + "height": height, + "width": width, + "steps": steps, + "prompts": [prompt], + "negative_prompt": negative_prompt, + "seed": seed, + "guidance_scale": guidance_scale, + "loras": loras, + "embeddings": embeddings, + "closed_loop": closed_loop, + "clip_skip": clip_skip, + } + # Remove None values from the payload to avoid sending null fields + payload = {k: v for k, v in payload.items() if v is not None} + + response = requests.post(url, headers=headers, data=json.dumps(payload)) + response.raise_for_status() # Raise an exception for HTTP errors + + return response.json().get("task_id") + + def get_task_result(self, task_id: str) -> Dict[str, Any]: + """ + Retrieves the result of a video generation task using the task_id. + + Args: + task_id (str): The task_id returned by the generate_video method. + + Returns: + Dict[str, Any]: The task result containing the video URL and other details. + """ + url = f"{self.base_url}/task-result" + headers = { + "Authorization": f"Bearer {self.api_key}", + } + params = { + "task_id": task_id, + } + + response = requests.get(url, headers=headers, params=params) + response.raise_for_status() # Raise an exception for HTTP errors + + return response.json() + + def download_video(self, video_url: str, save_path: Path) -> None: + """ + Downloads the generated video from the provided URL and saves it to the specified path. + + Args: + video_url (str): The URL of the video to download. + save_path (Path): The path where the video will be saved. + """ + response = requests.get(video_url) + response.raise_for_status() # Raise an exception for HTTP errors + + with open(save_path, "wb") as file: + file.write(response.content) diff --git a/lollms/ttv.py b/lollms/ttv.py index 315298b..4186b6b 100644 --- a/lollms/ttv.py +++ b/lollms/ttv.py @@ -2,8 +2,51 @@ from abc import ABC, abstractmethod from typing import List, Optional class LollmsTTV(ABC): + """ + Abstract base class for text-to-video generation services. + Subclasses must implement the methods to generate videos from text prompts. + """ + def __init__(self, service_name): + self.name = service_name + @abstractmethod - def generate_video(self, prompt: str, num_frames: int = 49, fps: int = 8, + def generate_video(self, prompt: str, negative_prompt: str, num_frames: int = 49, fps: int = 8, num_inference_steps: int = 50, guidance_scale: float = 6.0, seed: Optional[int] = None) -> str: - pass \ No newline at end of file + """ + Generates a video from a single text prompt. + + Args: + prompt (str): The text prompt describing the video. + negative_prompt (str): Text describing elements to avoid in the video. + num_frames (int): Number of frames in the video. Default is 49. + fps (int): Frames per second. Default is 8. + num_inference_steps (int): Number of steps for the model to infer. Default is 50. + guidance_scale (float): Controls how closely the model adheres to the prompt. Default is 6.0. + seed (Optional[int]): Random seed for reproducibility. Default is None. + + Returns: + str: The path to the generated video. + """ + pass + + @abstractmethod + def generate_video_by_frames(self, prompts: List[str], frames: List[int], negative_prompt: str, fps: int = 8, + num_inference_steps: int = 50, guidance_scale: float = 6.0, + seed: Optional[int] = None) -> str: + """ + Generates a video from a list of prompts and corresponding frames. 
+
+        Args:
+            prompts (List[str]): List of text prompts for each frame.
+            frames (List[int]): List of frame indices corresponding to each prompt.
+            negative_prompt (str): Text describing elements to avoid in the video.
+            fps (int): Frames per second. Default is 8.
+            num_inference_steps (int): Number of steps for the model to infer. Default is 50.
+            guidance_scale (float): Controls how closely the model adheres to the prompt. Default is 6.0.
+            seed (Optional[int]): Random seed for reproducibility. Default is None.
+
+        Returns:
+            str: The path to the generated video.
+        """
+        pass
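
Usage sketch (not part of the patch): the flow below drives the new Novita.ai binding end to end. It assumes the task-result payload reports the status under task.status and the output URL under videos[0].video_url; those field names are inferred from Novita.ai's async API and are not defined anywhere in this patch. It also has to stub generate_video_by_frames, which the patch declares abstract on LollmsTTV but never overrides in LollmsNovitaAITextToVideo, so the binding cannot be instantiated as-is.

import time
from pathlib import Path
from typing import Any, Dict, List, Optional

from lollms.services.ttv.novita_ai.lollms_novita_ai import LollmsNovitaAITextToVideo

class NovitaTTV(LollmsNovitaAITextToVideo):
    # LollmsTTV.generate_video_by_frames is abstract and the binding does not
    # override it, so this stub is required before the class can be instantiated.
    def generate_video_by_frames(self, prompts: List[str], frames: List[int],
                                 negative_prompt: str, fps: int = 8,
                                 num_inference_steps: int = 50,
                                 guidance_scale: float = 6.0,
                                 seed: Optional[int] = None) -> str:
        raise NotImplementedError("multi-prompt generation is not wired up in this sketch")

ttv = NovitaTTV(api_key=None)  # falls back to the NOVITA_AI_KEY environment variable

# Submit the asynchronous generation job and keep the task_id.
task_id = ttv.generate_video(
    prompt="a red fox running through a snowy forest",
    negative_prompt="blurry, low quality",
    height=512,
    width=512,
    steps=20,
)

# Poll until the task settles; "task", "status", and "videos" are assumed field names.
while True:
    result: Dict[str, Any] = ttv.get_task_result(task_id)
    status = result.get("task", {}).get("status")
    if status == "TASK_STATUS_SUCCEED":
        ttv.download_video(result["videos"][0]["video_url"], Path("novita_video.mp4"))
        break
    if status == "TASK_STATUS_FAILED":
        raise RuntimeError(f"Video generation failed: {result}")
    time.sleep(5)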
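
A matching client-side sketch for the new /list_ttv_services endpoint. The host, port (9600 is the usual lollms default), router mount prefix, and client_id value are all deployment assumptions; the client_id must pass the server's check_access validation.

import requests

response = requests.post(
    "http://localhost:9600/list_ttv_services",
    json={"client_id": "my-client-id"},  # hypothetical client_id
)
response.raise_for_status()
print(response.json())  # e.g. ["novita_ai", "cog_video_x", "diffusers", "lumalab"]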