Mirror of https://github.com/ParisNeo/lollms.git (synced 2025-02-20 17:33:03 +00:00)

Commit: 94425bce30
Parent: 22e1ef85ce
Commit message: upgraded text
@@ -44,6 +44,9 @@ class LollmsApplication(LoLLMsCom):
self.config = config
self.lollms_paths = lollms_paths

# TODO : implement
self.embedding_models = []

self.menu = MainMenu(self, callback)
self.mounted_personalities = []
self.personality:AIPersonality = None

@@ -8,13 +8,25 @@ description:

"""

from fastapi import APIRouter, Request
from fastapi import APIRouter, Request, Body
from lollms.server.elf_server import LOLLMSElfServer
from pydantic import BaseModel
from starlette.responses import StreamingResponse
from lollms.types import MSG_TYPE
from lollms.utilities import detect_antiprompt, remove_text_from_string
from lollms.utilities import detect_antiprompt, remove_text_from_string, trace_exception
from ascii_colors import ASCIIColors
import time
import threading
from typing import List, Optional, Union
import random
import string
import json

def _generate_id(length=10):
letters_and_digits = string.ascii_letters + string.digits
random_id = ''.join(random.choice(letters_and_digits) for _ in range(length))
return random_id

class GenerateRequest(BaseModel):
text: str

@@ -77,16 +89,30 @@ def get_generation_status():

# ----------------------------------- Generation -----------------------------------------
@router.post("/generate")
def lollms_generate(request_data: Request):
"""
Endpoint for generating text from prompts using the lollms fastapi server.
class LollmsGenerateRequest(BaseModel):
text: str
model_name: Optional[str] = None
personality: Optional[int] = None
n_predict: Optional[int] = 1024
stream: bool = False
temperature: float = None
top_k: Optional[int] = None
top_p: Optional[float] = None
repeat_penalty: Optional[float] = None
repeat_last_n: Optional[int] = None
seed: Optional[int] = None
n_threads: Optional[int] = None

@router.post("/lollms_generate")
async def lollms_generate(request: LollmsGenerateRequest):
""" Endpoint for generating text from prompts using the LoLLMs fastAPI server.

Args:
Data model for the Generate Request.

Attributes:
- text: str representing the input text prompt for text generation.
- text: str : representing the input text prompt for text generation.
- model_name: Optional[str] = None : The name of the model to be used (it should be one of the current models)
- personality_id: Optional[int] = None : The name of the mounted personality to be used (if a personality is None, the endpoint will just return a completion text). To get the list of mounted personalities, just use /list_mounted_personalities
- n_predict: int representing the number of predictions to generate.
- stream: bool indicating whether to stream the generated text or not.
- temperature: float representing the temperature parameter for text generation.
@@ -97,21 +123,79 @@ def lollms_generate(request_data: Request):
- seed: int representing the seed for text generation.
- n_threads: int representing the number of threads for text generation.

Returns:
- If the elf_server binding is not None:
- If stream is True, returns a StreamingResponse of generated text chunks.
- If stream is False, returns the generated text as a string.
- If stream is True, returns a StreamingResponse of generated text chunks.
- If stream is False, returns the generated text as a string.
- If the elf_server binding is None, returns None.
"""
text = request_data["text"]
n_predict = request_data.get("n_predict", 1024)
stream = request_data.get("stream", False)

if elf_server.binding is not None:
if stream:
output = {"text":""}
def generate_chunks():
"""

try:
text = request.text
n_predict = request.n_predict
stream = request.stream

if elf_server.binding is not None:
if stream:

output = {"text":"","waiting":True,"new":[]}
def generate_chunks():
lk = threading.Lock()

def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK):
if elf_server.cancel_gen:
return False
if chunk is None:
return
output["text"] += chunk
# Yield each chunk of data
lk.acquire()
try:
antiprompt = detect_antiprompt(output["text"])
if antiprompt:
ASCIIColors.warning(f"\nDetected hallucination with antiprompt: {antiprompt}")
output["text"] = remove_text_from_string(output["text"],antiprompt)
lk.release()
return False
else:
output["new"].append(chunk)
lk.release()
return True
except Exception as ex:
trace_exception(ex)
lk.release()
return True
def chunks_builder():
elf_server.binding.generate(
text,
n_predict,
callback=callback,
temperature=request.temperature if request.temperature is not None else elf_server.config.temperature,
top_k=request.top_k if request.top_k is not None else elf_server.config.top_k,
top_p=request.top_p if request.top_p is not None else elf_server.config.top_p,
repeat_penalty=request.repeat_penalty if request.repeat_penalty is not None else elf_server.config.repeat_penalty,
repeat_last_n=request.repeat_last_n if request.repeat_last_n is not None else elf_server.config.repeat_last_n,
seed=request.seed if request.seed is not None else elf_server.config.seed,
n_threads=request.n_threads if request.n_threads is not None else elf_server.config.n_threads
)
output["waiting"] = False
thread = threading.Thread(target=chunks_builder)
thread.start()
current_index = 0
while (output["waiting"] and elf_server.cancel_gen == False):
while (output["waiting"] and len(output["new"])==0):
time.sleep(0.001)
lk.acquire()
for i in range(len(output["new"])):
current_index += 1
yield output["new"][i]
output["new"]=[]
lk.release()
elf_server.cancel_gen = False

return StreamingResponse(iter(generate_chunks()))
else:
output = {"text":""}
def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK):
# Yield each chunk of data
output["text"] += chunk
@@ -121,79 +205,199 @@ def lollms_generate(request_data: Request):
output["text"] = remove_text_from_string(output["text"],antiprompt)
return False
else:
yield chunk
return True
return iter(elf_server.binding.generate(
text,
n_predict,
callback=callback,
temperature=request_data.temperature,
top_k=request_data.top_k,
top_p=request_data.top_p,
repeat_penalty=request_data.repeat_penalty,
repeat_last_n=request_data.repeat_last_n,
seed=request_data.seed,
n_threads=request_data.n_threads
))

return StreamingResponse(generate_chunks())
elf_server.binding.generate(
text,
n_predict,
callback=callback,
temperature=request.temperature if request.temperature is not None else elf_server.config.temperature,
top_k=request.top_k if request.top_k is not None else elf_server.config.top_k,
top_p=request.top_p if request.top_p is not None else elf_server.config.top_p,
repeat_penalty=request.repeat_penalty if request.repeat_penalty is not None else elf_server.config.repeat_penalty,
repeat_last_n=request.repeat_last_n if request.repeat_last_n is not None else elf_server.config.repeat_last_n,
seed=request.seed if request.seed is not None else elf_server.config.seed,
n_threads=request.n_threads if request.n_threads is not None else elf_server.config.n_threads
)
return output["text"]
else:
output = {"text":""}
def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK):
# Yield each chunk of data
output["text"] += chunk
antiprompt = detect_antiprompt(output["text"])
if antiprompt:
ASCIIColors.warning(f"\nDetected hallucination with antiprompt: {antiprompt}")
output["text"] = remove_text_from_string(output["text"],antiprompt)
return False
else:
return True
elf_server.binding.generate(
text,
n_predict,
callback=callback,
temperature=request_data.temperature,
top_k=request_data.top_k,
top_p=request_data.top_p,
repeat_penalty=request_data.repeat_penalty,
repeat_last_n=request_data.repeat_last_n,
seed=request_data.seed,
n_threads=request_data.n_threads
)
return output["text"]
else:
return None

return None
except Exception as ex:
trace_exception(ex)
elf_server.error(ex)
return {"status":False,"error":str(ex)}

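# --- Illustrative client sketch (not part of the commit) ---------------------
# A minimal example of how a client could call the /lollms_generate endpoint
# defined above. The payload fields come from the LollmsGenerateRequest model;
# the server address is an assumption for illustration and should be adjusted
# to your own configuration.
import requests

payload = {
    "text": "Once upon a time",
    "n_predict": 128,
    "stream": False,        # set True to consume the StreamingResponse chunk by chunk
    "temperature": 0.7,
}
response = requests.post("http://localhost:9600/lollms_generate", json=payload)
# In the non-streaming branch the handler returns the generated text.
print(response.json())
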
# ----------------------- Open AI ----------------------------------------
class Message(BaseModel):
role: str
content: str

class Delta(BaseModel):
content : str = ""
role : str = "assistant"

class Choices(BaseModel):
finish_reason: Optional[str] = None,
index: Optional[int] = 0,
message: Optional[str] = "",
logprobs: Optional[float] = None

class Usage(BaseModel):
prompt_tokens: Optional[int]=0,
completion_tokens : Optional[int]=0,
completion_tokens : Optional[int]=0,

class StreamingChoices(BaseModel):
finish_reason : Optional[str] = "stop"
index : Optional[int] = 0
delta : Optional[Delta] = None
logprobs : Optional[List[float]|None] = None

class StreamingModelResponse(BaseModel):
id: str
"""A unique identifier for the completion."""

choices: List[StreamingChoices]
"""The list of completion choices the model generated for the input prompt."""

created: int
"""The Unix timestamp (in seconds) of when the completion was created."""

model: Optional[str] = None
"""The model used for completion."""

object: Optional[str] = "text_completion"
"""The object type, which is always "text_completion" """

system_fingerprint: Optional[str] = None
"""This fingerprint represents the backend configuration that the model runs with.

Can be used in conjunction with the `seed` request parameter to understand when
backend changes have been made that might impact determinism.
"""

usage: Optional[Usage] = None
"""Usage statistics for the completion request."""

_hidden_params: dict = {}
def encode(self, charset):
encoded = json.dumps(self.dict()).encode(charset)
return encoded

class ModelResponse(BaseModel):
id: str
"""A unique identifier for the completion."""

choices: List[Choices]
"""The list of completion choices the model generated for the input prompt."""

created: int
"""The Unix timestamp (in seconds) of when the completion was created."""

model: Optional[str] = None
"""The model used for completion."""

object: Optional[str] = "text_completion"
"""The object type, which is always "text_completion" """

system_fingerprint: Optional[str] = None
"""This fingerprint represents the backend configuration that the model runs with.

Can be used in conjunction with the `seed` request parameter to understand when
backend changes have been made that might impact determinism.
"""

usage: Optional[Usage] = None
"""Usage statistics for the completion request."""

_hidden_params: dict = {}

class GenerationRequest(BaseModel):
messages: List[Message]
max_tokens: Optional[int] = 1024
stream: Optional[bool] = False
temperature: Optional[float] = 0.1

# openai compatible generation
@router.post("/v1/chat/completions")
def v1_chat_generate(request_data: V1ChatGenerateRequest):
"""
Endpoint for generating text from prompts using the lollms fastapi server in chat completion mode.
This endpoint is compatible with open ai API and mistralAI API
Args:
- request_data: GenerateRequest object containing the input text, number of predictions, and stream flag.
async def v1_chat_completions(request: GenerationRequest):
try:
messages = request.messages
text = ""
for message in messages:
text += f"{message.role}: {message.content}\n"
n_predict = request.max_tokens if request.max_tokens>0 else 1024
stream = request.stream

Returns:
- If the elf_server binding is not None:
- If stream is True, returns a StreamingResponse of generated text chunks.
- If stream is False, returns the generated text as a string.
- If the elf_server binding is None, returns None.
"""
messages = request_data.messages
text = ""
for message in messages:
text += f"{message['role']}: {message['content']}\n"
n_predict = request_data.max_tokens
stream = request_data.stream

if elf_server.binding is not None:
if stream:
output = {"text":""}
def generate_chunks():
if elf_server.binding is not None:
if stream:
output = {"text":"","waiting":True,"new":[]}
def generate_chunks():
lk = threading.Lock()

def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK):
if elf_server.cancel_gen:
return False
if chunk is None:
return
output["text"] += chunk
# Yield each chunk of data
lk.acquire()
try:
antiprompt = detect_antiprompt(output["text"])
if antiprompt:
ASCIIColors.warning(f"\nDetected hallucination with antiprompt: {antiprompt}")
output["text"] = remove_text_from_string(output["text"],antiprompt)
lk.release()
return False
else:
output["new"].append(chunk)
lk.release()
return True
except Exception as ex:
trace_exception(ex)
lk.release()
return True
def chunks_builder():
elf_server.binding.generate(
text,
n_predict,
callback=callback,
temperature=request.temperature or elf_server.config.temperature
)
output["waiting"] = False
thread = threading.Thread(target=chunks_builder)
thread.start()
current_index = 0
while (output["waiting"] and elf_server.cancel_gen == False):
while (output["waiting"] and len(output["new"])==0):
time.sleep(0.001)
lk.acquire()
for i in range(len(output["new"])):
output_val = StreamingModelResponse(
id = _generate_id(),
choices = [StreamingChoices(index= current_index, delta=Delta(content=output["new"][i]))],
created=int(time.time()),
model=elf_server.config.model_name,
usage=Usage(prompt_tokens= 0, completion_tokens= 10)
)
current_index += 1
yield output_val
output["new"]=[]
lk.release()
elf_server.cancel_gen = False

return StreamingResponse(iter(generate_chunks()))
else:
output = {"text":""}
def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK):
# Yield each chunk of data
if chunk is None:
return
output["text"] += chunk
antiprompt = detect_antiprompt(output["text"])
if antiprompt:
@@ -201,65 +405,62 @@ def v1_chat_generate(request_data: V1ChatGenerateRequest):
output["text"] = remove_text_from_string(output["text"],antiprompt)
return False
else:
yield chunk
return True
return iter(elf_server.binding.generate(
text,
n_predict,
callback=callback,
temperature=request_data.temperature
))

return StreamingResponse(generate_chunks())
elf_server.binding.generate(
text,
n_predict,
callback=callback,
temperature=request.temperature or elf_server.config.temperature
)
return ModelResponse(id = _generate_id(), choices = [Choices(message=output["text"])], created=int(time.time()))
else:
output = {"text":""}
def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK):
# Yield each chunk of data
output["text"] += chunk
antiprompt = detect_antiprompt(output["text"])
if antiprompt:
ASCIIColors.warning(f"\nDetected hallucination with antiprompt: {antiprompt}")
output["text"] = remove_text_from_string(output["text"],antiprompt)
return False
else:
return True
elf_server.binding.generate(
text,
n_predict,
callback=callback,
temperature=request_data.temperature
)
return output["text"]
else:
return None
return None
except Exception as ex:
trace_exception(ex)
elf_server.error(ex)
return {"status":False,"error":str(ex)}

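# --- Illustrative client sketch (not part of the commit) ---------------------
# The /v1/chat/completions route above mimics the OpenAI chat API, so a plain
# HTTP client can talk to it with an OpenAI-style payload. The base URL is an
# assumption for illustration; the field names follow the GenerationRequest,
# ModelResponse and Choices models defined above.
import requests

payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Say hello."},
    ],
    "max_tokens": 64,
    "stream": False,
    "temperature": 0.1,
}
resp = requests.post("http://localhost:9600/v1/chat/completions", json=payload)
data = resp.json()
# In the non-streaming branch the handler builds a ModelResponse whose first
# choice's "message" field should hold the generated text.
print(data["choices"][0]["message"])
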
# openai compatible generation
@router.post("/v1/completions")
def v1_instruct_generate(request_data: V1InstructGenerateRequest):
async def v1_completion(request: Request):
"""
Endpoint for generating text from prompts using the lollms fastapi server in instruct completion mode.
This endpoint is compatible with open ai API and mistralAI API
Args:
- request_data: GenerateRequest object containing the input text, number of predictions, and stream flag.
Executes Python code and returns the output.

Returns:
- If the elf_server binding is not None:
- If stream is True, returns a StreamingResponse of generated text chunks.
- If stream is False, returns the generated text as a string.
- If the elf_server binding is None, returns None.
"""

text = request_data.prompt
n_predict = request_data.max_tokens
stream = request_data.stream

if elf_server.binding is not None:
if stream:
output = {"text":""}
def generate_chunks():
:param request: The HTTP request object.
:return: A JSON response with the status of the operation.
"""

try:
data = (await request.json())
text = data.get("prompt")
n_predict = data.get("max_tokens")
stream = data.get("stream")

if elf_server.binding is not None:
if stream:
output = {"text":""}
def generate_chunks():
def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK):
# Yield each chunk of data
output["text"] += chunk
antiprompt = detect_antiprompt(output["text"])
if antiprompt:
ASCIIColors.warning(f"\nDetected hallucination with antiprompt: {antiprompt}")
output["text"] = remove_text_from_string(output["text"],antiprompt)
return False
else:
yield chunk
return True
return iter(elf_server.binding.generate(
text,
n_predict,
callback=callback,
temperature=data.get("temperature", elf_server.config.temperature)
))

return StreamingResponse(generate_chunks())
else:
output = {"text":""}
def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK):
# Yield each chunk of data
output["text"] += chunk
@@ -269,38 +470,20 @@ def v1_instruct_generate(request_data: V1InstructGenerateRequest):
output["text"] = remove_text_from_string(output["text"],antiprompt)
return False
else:
yield chunk
return True
return iter(elf_server.binding.generate(
text,
n_predict,
callback=callback,
temperature=request_data.temperature
))

return StreamingResponse(generate_chunks())
elf_server.binding.generate(
text,
n_predict,
callback=callback,
temperature=data.get("temperature", elf_server.config.temperature)
)
return output["text"]
else:
output = {"text":""}
def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK):
# Yield each chunk of data
output["text"] += chunk
antiprompt = detect_antiprompt(output["text"])
if antiprompt:
ASCIIColors.warning(f"\nDetected hallucination with antiprompt: {antiprompt}")
output["text"] = remove_text_from_string(output["text"],antiprompt)
return False
else:
return True
elf_server.binding.generate(
text,
n_predict,
callback=callback,
temperature=request_data.temperature
)
return output["text"]
else:
return None

return None
except Exception as ex:
trace_exception(ex)
elf_server.error(ex)
return {"status":False,"error":str(ex)}

@router.post("/stop_gen")

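# --- Illustrative client sketch (not part of the commit) ---------------------
# The /v1/completions handler above reads the raw JSON body itself
# (prompt, max_tokens, stream, temperature), so a request can look like this.
# The server address is an assumption for illustration.
import requests

payload = {"prompt": "def fibonacci(n):", "max_tokens": 128, "stream": False}
resp = requests.post("http://localhost:9600/v1/completions", json=payload)
# The non-streaming branch returns the generated text directly.
print(resp.json())
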
@@ -61,6 +61,9 @@ def list_personalities(category:str):
ASCIIColors.error(f"No personalities found. Using default one {ex}")
return personalities

@router.get("/list_mounted_personalities")
def list_mounted_personalities():
return lollmsElfServer.config.personalities

@router.get("/get_all_personalities")
def get_all_personalities():

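# --- Illustrative client sketch (not part of the commit) ---------------------
# The /lollms_generate docstring points to /list_mounted_personalities to
# discover which personalities are available. A simple GET is enough; the
# server address is an assumption for illustration.
import requests

mounted = requests.get("http://localhost:9600/list_mounted_personalities").json()
print(mounted)   # the personalities list from the server configuration
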
@@ -501,10 +504,17 @@ async def set_active_personality_settings(request: Request):

# ------------------------------------------- Interaction with personas ------------------------------------------------
@router.post("/post_to_personality")
def post_to_personality(data):
async def post_to_personality(request: Request):
"""Post data to a personality"""
if hasattr(lollmsElfServer.personality.processor,'handle_request'):
return lollmsElfServer.personality.processor.handle_request(data)
else:
return {}

try:
config_data = (await request.json())
if hasattr(lollmsElfServer.personality.processor,'handle_request'):
return lollmsElfServer.personality.processor.handle_request(config_data)
else:
return {}
except Exception as ex:
trace_exception(ex)
lollmsElfServer.error(ex)
return {"status":False,"error":str(ex)}

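# --- Illustrative client sketch (not part of the commit) ---------------------
# After this change, /post_to_personality reads its payload from the request
# JSON body instead of a raw `data` argument, so a client would call it like
# this. The payload keys depend entirely on the mounted personality's
# handle_request implementation; the one used here is a hypothetical example.
import requests

resp = requests.post(
    "http://localhost:9600/post_to_personality",   # assumed server address
    json={"command": "status"},                    # hypothetical payload for the personality processor
)
print(resp.json())
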
@@ -58,152 +58,149 @@ def add_events(sio:socketio):
run_async(partial(lollmsElfServer.socketio.emit,"busy", {"message":"I am busy. Come back later."}, to=client_id))
ASCIIColors.warning(f"OOps request {client_id} refused!! Server busy")
return
def generate_text():
lollmsElfServer.busy = True
try:
model = lollmsElfServer.model
lollmsElfServer.connections[client_id]["is_generating"]=True
lollmsElfServer.connections[client_id]["requested_stop"]=False
prompt = data['prompt']
tokenized = model.tokenize(prompt)
personality_id = data.get('personality', -1)
lollmsElfServer.busy = True
try:
model = lollmsElfServer.model
lollmsElfServer.connections[client_id]["is_generating"]=True
lollmsElfServer.connections[client_id]["requested_stop"]=False
prompt = data['prompt']
tokenized = model.tokenize(prompt)
personality_id = data.get('personality', -1)

n_crop = data.get('n_crop', len(tokenized))
if n_crop!=-1:
prompt = model.detokenize(tokenized[-n_crop:])
n_crop = data.get('n_crop', len(tokenized))
if n_crop!=-1:
prompt = model.detokenize(tokenized[-n_crop:])

n_predicts = data["n_predicts"]
parameters = data.get("parameters",{
"temperature":lollmsElfServer.config["temperature"],
"top_k":lollmsElfServer.config["top_k"],
"top_p":lollmsElfServer.config["top_p"],
"repeat_penalty":lollmsElfServer.config["repeat_penalty"],
"repeat_last_n":lollmsElfServer.config["repeat_last_n"],
"seed":lollmsElfServer.config["seed"]
})
n_predicts = data["n_predicts"]
parameters = data.get("parameters",{
"temperature":lollmsElfServer.config["temperature"],
"top_k":lollmsElfServer.config["top_k"],
"top_p":lollmsElfServer.config["top_p"],
"repeat_penalty":lollmsElfServer.config["repeat_penalty"],
"repeat_last_n":lollmsElfServer.config["repeat_last_n"],
"seed":lollmsElfServer.config["seed"]
})

if personality_id==-1:
# Raw text generation
lollmsElfServer.answer = {"full_text":""}
def callback(text, message_type: MSG_TYPE, metadata:dict={}):
if message_type == MSG_TYPE.MSG_TYPE_CHUNK:
ASCIIColors.success(f"generated:{len(lollmsElfServer.answer['full_text'].split())} words", end='\r')
if text is not None:
lollmsElfServer.answer["full_text"] = lollmsElfServer.answer["full_text"] + text
run_async(partial(lollmsElfServer.socketio.emit,'text_chunk', {'chunk': text, 'type':MSG_TYPE.MSG_TYPE_CHUNK.value}, to=client_id))
if client_id in lollmsElfServer.connections:# Client disconnected
if lollmsElfServer.connections[client_id]["requested_stop"]:
return False
else:
return True
else:
return False

tk = model.tokenize(prompt)
n_tokens = len(tk)
fd = model.detokenize(tk[-min(lollmsElfServer.config.ctx_size-n_predicts,n_tokens):])

try:
ASCIIColors.print("warming up", ASCIIColors.color_bright_cyan)

generated_text = model.generate(fd,
n_predict=n_predicts,
callback=callback,
temperature = parameters["temperature"],
top_k = parameters["top_k"],
top_p = parameters["top_p"],
repeat_penalty = parameters["repeat_penalty"],
repeat_last_n = parameters["repeat_last_n"],
seed = parameters["seed"],
)
ASCIIColors.success(f"\ndone")

if client_id in lollmsElfServer.connections:
if not lollmsElfServer.connections[client_id]["requested_stop"]:
# Emit the generated text to the client
run_async(partial(lollmsElfServer.socketio.emit,'text_generated', {'text': generated_text}, to=client_id))
except Exception as ex:
run_async(partial(lollmsElfServer.socketio.emit,'generation_error', {'error': str(ex)}, to=client_id))
ASCIIColors.error(f"\ndone")
lollmsElfServer.busy = False
else:
try:
personality: AIPersonality = lollmsElfServer.personalities[personality_id]
ump = lollmsElfServer.config.discussion_prompt_separator +lollmsElfServer.config.user_name.strip() if lollmsElfServer.config.use_user_name_in_discussions else lollmsElfServer.personality.user_message_prefix
personality.model = model
cond_tk = personality.model.tokenize(personality.personality_conditioning)
n_cond_tk = len(cond_tk)
# Placeholder code for text generation
# Replace this with your actual text generation logic
print(f"Text generation requested by client: {client_id}")

lollmsElfServer.answer["full_text"] = ''
full_discussion_blocks = lollmsElfServer.connections[client_id]["full_discussion_blocks"]

if prompt != '':
if personality.processor is not None and personality.processor_cfg["process_model_input"]:
preprocessed_prompt = personality.processor.process_model_input(prompt)
else:
preprocessed_prompt = prompt

if personality.processor is not None and personality.processor_cfg["custom_workflow"]:
full_discussion_blocks.append(ump)
full_discussion_blocks.append(preprocessed_prompt)

else:

full_discussion_blocks.append(ump)
full_discussion_blocks.append(preprocessed_prompt)
full_discussion_blocks.append(personality.link_text)
full_discussion_blocks.append(personality.ai_message_prefix)

full_discussion = personality.personality_conditioning + ''.join(full_discussion_blocks)

if personality_id==-1:
# Raw text generation
lollmsElfServer.answer = {"full_text":""}
def callback(text, message_type: MSG_TYPE, metadata:dict={}):
if message_type == MSG_TYPE.MSG_TYPE_CHUNK:
ASCIIColors.success(f"generated:{len(lollmsElfServer.answer['full_text'].split())} words", end='\r')
if text is not None:
lollmsElfServer.answer["full_text"] = lollmsElfServer.answer["full_text"] + text
run_async(partial(lollmsElfServer.socketio.emit,'text_chunk', {'chunk': text, 'type':MSG_TYPE.MSG_TYPE_CHUNK.value}, to=client_id))
if client_id in lollmsElfServer.connections:# Client disconnected
lollmsElfServer.answer["full_text"] = lollmsElfServer.answer["full_text"] + text
run_async(partial(lollmsElfServer.socketio.emit,'text_chunk', {'chunk': text}, to=client_id))
try:
if lollmsElfServer.connections[client_id]["requested_stop"]:
return False
else:
return True
else:
return False
except: # If the client is disconnected then we stop talking to it
return False

tk = model.tokenize(prompt)
tk = personality.model.tokenize(full_discussion)
n_tokens = len(tk)
fd = model.detokenize(tk[-min(lollmsElfServer.config.ctx_size-n_predicts,n_tokens):])
fd = personality.model.detokenize(tk[-min(lollmsElfServer.config.ctx_size-n_cond_tk-personality.model_n_predicts,n_tokens):])

if personality.processor is not None and personality.processor_cfg["custom_workflow"]:
ASCIIColors.info("processing...")
generated_text = personality.processor.run_workflow(prompt, previous_discussion_text=personality.personality_conditioning+fd, callback=callback)
else:
ASCIIColors.info("generating...")
generated_text = personality.model.generate(
personality.personality_conditioning+fd,
n_predict=personality.model_n_predicts,
callback=callback)

try:
ASCIIColors.print("warming up", ASCIIColors.color_bright_cyan)

generated_text = model.generate(fd,
n_predict=n_predicts,
callback=callback,
temperature = parameters["temperature"],
top_k = parameters["top_k"],
top_p = parameters["top_p"],
repeat_penalty = parameters["repeat_penalty"],
repeat_last_n = parameters["repeat_last_n"],
seed = parameters["seed"],
)
ASCIIColors.success(f"\ndone")
if personality.processor is not None and personality.processor_cfg["process_model_output"]:
generated_text = personality.processor.process_model_output(generated_text)

if client_id in lollmsElfServer.connections:
if not lollmsElfServer.connections[client_id]["requested_stop"]:
# Emit the generated text to the client
run_async(partial(lollmsElfServer.socketio.emit,'text_generated', {'text': generated_text}, to=client_id))
except Exception as ex:
run_async(partial(lollmsElfServer.socketio.emit,'generation_error', {'error': str(ex)}, to=client_id))
ASCIIColors.error(f"\ndone")
lollmsElfServer.busy = False
else:
try:
personality: AIPersonality = lollmsElfServer.personalities[personality_id]
ump = lollmsElfServer.config.discussion_prompt_separator +lollmsElfServer.config.user_name.strip() if lollmsElfServer.config.use_user_name_in_discussions else lollmsElfServer.personality.user_message_prefix
personality.model = model
cond_tk = personality.model.tokenize(personality.personality_conditioning)
n_cond_tk = len(cond_tk)
# Placeholder code for text generation
# Replace this with your actual text generation logic
print(f"Text generation requested by client: {client_id}")
full_discussion_blocks.append(generated_text.strip())
ASCIIColors.success("\ndone")

lollmsElfServer.answer["full_text"] = ''
full_discussion_blocks = lollmsElfServer.connections[client_id]["full_discussion_blocks"]

if prompt != '':
if personality.processor is not None and personality.processor_cfg["process_model_input"]:
preprocessed_prompt = personality.processor.process_model_input(prompt)
else:
preprocessed_prompt = prompt

if personality.processor is not None and personality.processor_cfg["custom_workflow"]:
full_discussion_blocks.append(ump)
full_discussion_blocks.append(preprocessed_prompt)

else:

full_discussion_blocks.append(ump)
full_discussion_blocks.append(preprocessed_prompt)
full_discussion_blocks.append(personality.link_text)
full_discussion_blocks.append(personality.ai_message_prefix)

full_discussion = personality.personality_conditioning + ''.join(full_discussion_blocks)

def callback(text, message_type: MSG_TYPE, metadata:dict={}):
if message_type == MSG_TYPE.MSG_TYPE_CHUNK:
lollmsElfServer.answer["full_text"] = lollmsElfServer.answer["full_text"] + text
run_async(partial(lollmsElfServer.socketio.emit,'text_chunk', {'chunk': text}, to=client_id))
try:
if lollmsElfServer.connections[client_id]["requested_stop"]:
return False
else:
return True
except: # If the client is disconnected then we stop talking to it
return False

tk = personality.model.tokenize(full_discussion)
n_tokens = len(tk)
fd = personality.model.detokenize(tk[-min(lollmsElfServer.config.ctx_size-n_cond_tk-personality.model_n_predicts,n_tokens):])

if personality.processor is not None and personality.processor_cfg["custom_workflow"]:
ASCIIColors.info("processing...")
generated_text = personality.processor.run_workflow(prompt, previous_discussion_text=personality.personality_conditioning+fd, callback=callback)
else:
ASCIIColors.info("generating...")
generated_text = personality.model.generate(
personality.personality_conditioning+fd,
n_predict=personality.model_n_predicts,
callback=callback)

if personality.processor is not None and personality.processor_cfg["process_model_output"]:
generated_text = personality.processor.process_model_output(generated_text)

full_discussion_blocks.append(generated_text.strip())
ASCIIColors.success("\ndone")

# Emit the generated text to the client
run_async(partial(lollmsElfServer.socketio.emit,'text_generated', {'text': generated_text}, to=client_id))
except Exception as ex:
run_async(partial(lollmsElfServer.socketio.emit,'generation_error', {'error': str(ex)}, to=client_id))
ASCIIColors.error(f"\ndone")
lollmsElfServer.busy = False
except Exception as ex:
trace_exception(ex)
# Emit the generated text to the client
run_async(partial(lollmsElfServer.socketio.emit,'text_generated', {'text': generated_text}, to=client_id))
except Exception as ex:
run_async(partial(lollmsElfServer.socketio.emit,'generation_error', {'error': str(ex)}, to=client_id))
lollmsElfServer.busy = False
ASCIIColors.error(f"\ndone")
lollmsElfServer.busy = False
except Exception as ex:
trace_exception(ex)
run_async(partial(lollmsElfServer.socketio.emit,'generation_error', {'error': str(ex)}, to=client_id))
lollmsElfServer.busy = False

# Start the text generation task in a separate thread
task = lollmsElfServer.socketio.start_background_task(target=generate_text)

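# --- Illustrative client sketch (not part of the commit) ---------------------
# A minimal python-socketio client for the events used in the handler above.
# The diff shows the server emitting 'busy', 'text_chunk', 'text_generated'
# and 'generation_error', and reading 'prompt', 'n_predicts' and 'personality'
# from the incoming data. The event name that triggers generate_text and the
# server address are not visible in this hunk and are assumptions here.
import socketio

sio = socketio.Client()

@sio.on("text_chunk")
def on_chunk(data):
    # Streamed partial text from the server
    print(data["chunk"], end="", flush=True)

@sio.on("text_generated")
def on_done(data):
    print("\n--- full text ---\n", data["text"])

@sio.on("generation_error")
def on_error(data):
    print("error:", data["error"])

sio.connect("http://localhost:9600")   # assumed server address
# 'generate_text' is an assumed event name; check the server's event registration.
sio.emit("generate_text", {"prompt": "Hello", "n_predicts": 128, "personality": -1})
sio.wait()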