diff --git a/lollms/app.py b/lollms/app.py index 9131b9b..74ba258 100644 --- a/lollms/app.py +++ b/lollms/app.py @@ -44,6 +44,9 @@ class LollmsApplication(LoLLMsCom): self.config = config self.lollms_paths = lollms_paths + # TODO : implement + self.embedding_models = [] + self.menu = MainMenu(self, callback) self.mounted_personalities = [] self.personality:AIPersonality = None diff --git a/lollms/server/endpoints/lollms_generator.py b/lollms/server/endpoints/lollms_generator.py index e684ef2..11191b9 100644 --- a/lollms/server/endpoints/lollms_generator.py +++ b/lollms/server/endpoints/lollms_generator.py @@ -8,13 +8,25 @@ description: """ -from fastapi import APIRouter, Request +from fastapi import APIRouter, Request, Body from lollms.server.elf_server import LOLLMSElfServer from pydantic import BaseModel from starlette.responses import StreamingResponse from lollms.types import MSG_TYPE -from lollms.utilities import detect_antiprompt, remove_text_from_string +from lollms.utilities import detect_antiprompt, remove_text_from_string, trace_exception from ascii_colors import ASCIIColors +import time +import threading +from typing import List, Optional, Union +import random +import string +import json + +def _generate_id(length=10): + letters_and_digits = string.ascii_letters + string.digits + random_id = ''.join(random.choice(letters_and_digits) for _ in range(length)) + return random_id + class GenerateRequest(BaseModel): text: str @@ -77,16 +89,30 @@ def get_generation_status(): # ----------------------------------- Generation ----------------------------------------- -@router.post("/generate") -def lollms_generate(request_data: Request): - """ - Endpoint for generating text from prompts using the lollms fastapi server. +class LollmsGenerateRequest(BaseModel): + text: str + model_name: Optional[str] = None + personality: Optional[int] = None + n_predict: Optional[int] = 1024 + stream: bool = False + temperature: Optional[float] = None + top_k: Optional[int] = None + top_p: Optional[float] = None + repeat_penalty: Optional[float] = None + repeat_last_n: Optional[int] = None + seed: Optional[int] = None + n_threads: Optional[int] = None + +@router.post("/lollms_generate") +async def lollms_generate(request: LollmsGenerateRequest): + """ Endpoint for generating text from prompts using the LoLLMs FastAPI server. Args: Data model for the Generate Request. - Attributes: - - text: str representing the input text prompt for text generation. + - text: str : the input text prompt for text generation. + - model_name: Optional[str] = None : The name of the model to be used (it must be one of the currently installed models) + - personality: Optional[int] = None : The index of the mounted personality to be used (if None, the endpoint just returns a plain completion). To get the list of mounted personalities, use /list_mounted_personalities - n_predict: int representing the number of predictions to generate. - stream: bool indicating whether to stream the generated text or not. - temperature: float representing the temperature parameter for text generation. @@ -97,21 +123,79 @@ def lollms_generate(request_data: Request): - seed: int representing the seed for text generation. - n_threads: int representing the number of threads for text generation. - Returns: - If the elf_server binding is not None: - - If stream is True, returns a StreamingResponse of generated text chunks. - - If stream is False, returns the generated text as a string. 
+ - If stream is True, returns a StreamingResponse of generated text chunks. + - If stream is False, returns the generated text as a string. - If the elf_server binding is None, returns None. - """ - text = request_data["text"] - n_predict = request_data.get("n_predict", 1024) - stream = request_data.get("stream", False) - - if elf_server.binding is not None: - if stream: - output = {"text":""} - def generate_chunks(): + """ + + try: + text = request.text + n_predict = request.n_predict + stream = request.stream + + if elf_server.binding is not None: + if stream: + + output = {"text":"","waiting":True,"new":[]} + def generate_chunks(): + lk = threading.Lock() + + def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK): + if elf_server.cancel_gen: + return False + if chunk is None: + return + output["text"] += chunk + # Yield each chunk of data + lk.acquire() + try: + antiprompt = detect_antiprompt(output["text"]) + if antiprompt: + ASCIIColors.warning(f"\nDetected hallucination with antiprompt: {antiprompt}") + output["text"] = remove_text_from_string(output["text"],antiprompt) + lk.release() + return False + else: + output["new"].append(chunk) + lk.release() + return True + except Exception as ex: + trace_exception(ex) + lk.release() + return True + def chunks_builder(): + elf_server.binding.generate( + text, + n_predict, + callback=callback, + temperature=request.temperature if request.temperature is not None else elf_server.config.temperature, + top_k=request.top_k if request.top_k is not None else elf_server.config.top_k, + top_p=request.top_p if request.top_p is not None else elf_server.config.top_p, + repeat_penalty=request.repeat_penalty if request.repeat_penalty is not None else elf_server.config.repeat_penalty, + repeat_last_n=request.repeat_last_n if request.repeat_last_n is not None else elf_server.config.repeat_last_n, + seed=request.seed if request.seed is not None else elf_server.config.seed, + n_threads=request.n_threads if request.n_threads is not None else elf_server.config.n_threads + ) + output["waiting"] = False + thread = threading.Thread(target=chunks_builder) + thread.start() + current_index = 0 + while (output["waiting"] and elf_server.cancel_gen == False): + while (output["waiting"] and len(output["new"])==0): + time.sleep(0.001) + lk.acquire() + for i in range(len(output["new"])): + current_index += 1 + yield output["new"][i] + output["new"]=[] + lk.release() + elf_server.cancel_gen = False + + return StreamingResponse(iter(generate_chunks())) + else: + output = {"text":""} def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK): # Yield each chunk of data output["text"] += chunk @@ -121,79 +205,199 @@ def lollms_generate(request_data: Request): output["text"] = remove_text_from_string(output["text"],antiprompt) return False else: - yield chunk return True - return iter(elf_server.binding.generate( - text, - n_predict, - callback=callback, - temperature=request_data.temperature, - top_k=request_data.top_k, - top_p=request_data.top_p, - repeat_penalty=request_data.repeat_penalty, - repeat_last_n=request_data.repeat_last_n, - seed=request_data.seed, - n_threads=request_data.n_threads - )) - - return StreamingResponse(generate_chunks()) + elf_server.binding.generate( + text, + n_predict, + callback=callback, + temperature=request.temperature if request.temperature is not None else elf_server.config.temperature, + top_k=request.top_k if request.top_k is not None else elf_server.config.top_k, + top_p=request.top_p if request.top_p is not None else 
elf_server.config.top_p, + repeat_penalty=request.repeat_penalty if request.repeat_penalty is not None else elf_server.config.repeat_penalty, + repeat_last_n=request.repeat_last_n if request.repeat_last_n is not None else elf_server.config.repeat_last_n, + seed=request.seed if request.seed is not None else elf_server.config.seed, + n_threads=request.n_threads if request.n_threads is not None else elf_server.config.n_threads + ) + return output["text"] else: - output = {"text":""} - def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK): - # Yield each chunk of data - output["text"] += chunk - antiprompt = detect_antiprompt(output["text"]) - if antiprompt: - ASCIIColors.warning(f"\nDetected hallucination with antiprompt: {antiprompt}") - output["text"] = remove_text_from_string(output["text"],antiprompt) - return False - else: - return True - elf_server.binding.generate( - text, - n_predict, - callback=callback, - temperature=request_data.temperature, - top_k=request_data.top_k, - top_p=request_data.top_p, - repeat_penalty=request_data.repeat_penalty, - repeat_last_n=request_data.repeat_last_n, - seed=request_data.seed, - n_threads=request_data.n_threads - ) - return output["text"] - else: - return None - + return None + except Exception as ex: + trace_exception(ex) + elf_server.error(ex) + return {"status":False,"error":str(ex)} + + +# ----------------------- Open AI ---------------------------------------- +class Message(BaseModel): + role: str + content: str + +class Delta(BaseModel): + content : str = "" + role : str = "assistant" + + +class Choices(BaseModel): + finish_reason: Optional[str] = None + index: Optional[int] = 0 + message: Optional[str] = "" + logprobs: Optional[float] = None + + + +class Usage(BaseModel): + prompt_tokens: Optional[int] = 0 + completion_tokens : Optional[int] = 0 + total_tokens : Optional[int] = 0 + + +class StreamingChoices(BaseModel): + finish_reason : Optional[str] = "stop" + index : Optional[int] = 0 + delta : Optional[Delta] = None + logprobs : Optional[List[float]] = None + +class StreamingModelResponse(BaseModel): + id: str + """A unique identifier for the completion.""" + + choices: List[StreamingChoices] + """The list of completion choices the model generated for the input prompt.""" + + created: int + """The Unix timestamp (in seconds) of when the completion was created.""" + + model: Optional[str] = None + """The model used for completion.""" + + object: Optional[str] = "text_completion" + """The object type, which is always "text_completion" """ + + system_fingerprint: Optional[str] = None + """This fingerprint represents the backend configuration that the model runs with. + + Can be used in conjunction with the `seed` request parameter to understand when + backend changes have been made that might impact determinism. 
+ """ + + usage: Optional[Usage] = None + """Usage statistics for the completion request.""" + + _hidden_params: dict = {} + def encode(self, charset): + encoded = json.dumps(self.dict()).encode(charset) + return encoded + +class ModelResponse(BaseModel): + id: str + """A unique identifier for the completion.""" + + choices: List[Choices] + """The list of completion choices the model generated for the input prompt.""" + + created: int + """The Unix timestamp (in seconds) of when the completion was created.""" + + model: Optional[str] = None + """The model used for completion.""" + + object: Optional[str] = "text_completion" + """The object type, which is always "text_completion" """ + + system_fingerprint: Optional[str] = None + """This fingerprint represents the backend configuration that the model runs with. + + Can be used in conjunction with the `seed` request parameter to understand when + backend changes have been made that might impact determinism. + """ + + usage: Optional[Usage] = None + """Usage statistics for the completion request.""" + + _hidden_params: dict = {} + +class GenerationRequest(BaseModel): + messages: List[Message] + max_tokens: Optional[int] = 1024 + stream: Optional[bool] = False + temperature: Optional[float] = 0.1 + -# openai compatible generation @router.post("/v1/chat/completions") -def v1_chat_generate(request_data: V1ChatGenerateRequest): - """ - Endpoint for generating text from prompts using the lollms fastapi server in chat completion mode. - This endpoint is compatible with open ai API and mistralAI API - Args: - - request_data: GenerateRequest object containing the input text, number of predictions, and stream flag. +async def v1_chat_completions(request: GenerationRequest): + try: + messages = request.messages + text = "" + for message in messages: + text += f"{message.role}: {message.content}\n" + n_predict = request.max_tokens if request.max_tokens>0 else 1024 + stream = request.stream - Returns: - - If the elf_server binding is not None: - - If stream is True, returns a StreamingResponse of generated text chunks. - - If stream is False, returns the generated text as a string. - - If the elf_server binding is None, returns None. 
- """ - messages = request_data.messages - text = "" - for message in messages: - text += f"{message['role']}: {message['content']}\n" - n_predict = request_data.max_tokens - stream = request_data.stream - - if elf_server.binding is not None: - if stream: - output = {"text":""} - def generate_chunks(): + if elf_server.binding is not None: + if stream: + output = {"text":"","waiting":True,"new":[]} + def generate_chunks(): + lk = threading.Lock() + + def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK): + if elf_server.cancel_gen: + return False + if chunk is None: + return + output["text"] += chunk + # Yield each chunk of data + lk.acquire() + try: + antiprompt = detect_antiprompt(output["text"]) + if antiprompt: + ASCIIColors.warning(f"\nDetected hallucination with antiprompt: {antiprompt}") + output["text"] = remove_text_from_string(output["text"],antiprompt) + lk.release() + return False + else: + output["new"].append(chunk) + lk.release() + return True + except Exception as ex: + trace_exception(ex) + lk.release() + return True + def chunks_builder(): + elf_server.binding.generate( + text, + n_predict, + callback=callback, + temperature=request.temperature or elf_server.config.temperature + ) + output["waiting"] = False + thread = threading.Thread(target=chunks_builder) + thread.start() + current_index = 0 + while (output["waiting"] and elf_server.cancel_gen == False): + while (output["waiting"] and len(output["new"])==0): + time.sleep(0.001) + lk.acquire() + for i in range(len(output["new"])): + output_val = StreamingModelResponse( + id = _generate_id(), + choices = [StreamingChoices(index= current_index, delta=Delta(content=output["new"][i]))], + created=int(time.time()), + model=elf_server.config.model_name, + usage=Usage(prompt_tokens= 0, completion_tokens= 10) + ) + current_index += 1 + yield output_val + output["new"]=[] + lk.release() + elf_server.cancel_gen = False + + return StreamingResponse(iter(generate_chunks())) + else: + output = {"text":""} def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK): # Yield each chunk of data + if chunk is None: + return output["text"] += chunk antiprompt = detect_antiprompt(output["text"]) if antiprompt: @@ -201,65 +405,62 @@ def v1_chat_generate(request_data: V1ChatGenerateRequest): output["text"] = remove_text_from_string(output["text"],antiprompt) return False else: - yield chunk return True - return iter(elf_server.binding.generate( - text, - n_predict, - callback=callback, - temperature=request_data.temperature - )) - - return StreamingResponse(generate_chunks()) + elf_server.binding.generate( + text, + n_predict, + callback=callback, + temperature=request.temperature or elf_server.config.temperature + ) + return ModelResponse(id = _generate_id(), choices = [Choices(message=output["text"])], created=int(time.time())) else: - output = {"text":""} - def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK): - # Yield each chunk of data - output["text"] += chunk - antiprompt = detect_antiprompt(output["text"]) - if antiprompt: - ASCIIColors.warning(f"\nDetected hallucination with antiprompt: {antiprompt}") - output["text"] = remove_text_from_string(output["text"],antiprompt) - return False - else: - return True - elf_server.binding.generate( - text, - n_predict, - callback=callback, - temperature=request_data.temperature - ) - return output["text"] - else: - return None + return None + except Exception as ex: + trace_exception(ex) + elf_server.error(ex) + return {"status":False,"error":str(ex)} - - -# openai 
compatible generation @router.post("/v1/completions") -def v1_instruct_generate(request_data: V1InstructGenerateRequest): +async def v1_completion(request: Request): """ - Endpoint for generating text from prompts using the lollms fastapi server in instruct completion mode. - This endpoint is compatible with open ai API and mistralAI API - Args: - - request_data: GenerateRequest object containing the input text, number of predictions, and stream flag. + OpenAI-compatible text completion endpoint. - Returns: - - If the elf_server binding is not None: - - If stream is True, returns a StreamingResponse of generated text chunks. - - If stream is False, returns the generated text as a string. - - If the elf_server binding is None, returns None. - """ - - text = request_data.prompt - n_predict = request_data.max_tokens - stream = request_data.stream - - if elf_server.binding is not None: - if stream: - output = {"text":""} - def generate_chunks(): + :param request: The HTTP request carrying the prompt, max_tokens, stream and temperature fields. + :return: A StreamingResponse of generated chunks if stream is True, otherwise the generated text, or an error status dictionary on failure. + """ + + try: + data = (await request.json()) + text = data.get("prompt") + n_predict = data.get("max_tokens", 1024) + stream = data.get("stream", False) + + if elf_server.binding is not None: + if stream: + output = {"text":""} + def generate_chunks(): + def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK): + # Yield each chunk of data + output["text"] += chunk + antiprompt = detect_antiprompt(output["text"]) + if antiprompt: + ASCIIColors.warning(f"\nDetected hallucination with antiprompt: {antiprompt}") + output["text"] = remove_text_from_string(output["text"],antiprompt) + return False + else: + yield chunk + return True + return iter(elf_server.binding.generate( + text, + n_predict, + callback=callback, + temperature=data.get("temperature", elf_server.config.temperature) + )) + + return StreamingResponse(generate_chunks()) + else: + output = {"text":""} def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK): # Yield each chunk of data output["text"] += chunk @@ -269,38 +470,20 @@ def v1_instruct_generate(request_data: V1InstructGenerateRequest): output["text"] = remove_text_from_string(output["text"],antiprompt) return False else: - yield chunk return True - return iter(elf_server.binding.generate( - text, - n_predict, - callback=callback, - temperature=request_data.temperature - )) - - return StreamingResponse(generate_chunks()) + elf_server.binding.generate( + text, + n_predict, + callback=callback, + temperature=data.get("temperature", elf_server.config.temperature) + ) + return output["text"] else: - output = {"text":""} - def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK): - # Yield each chunk of data - output["text"] += chunk - antiprompt = detect_antiprompt(output["text"]) - if antiprompt: - ASCIIColors.warning(f"\nDetected hallucination with antiprompt: {antiprompt}") - output["text"] = remove_text_from_string(output["text"],antiprompt) - return False - else: - return True - elf_server.binding.generate( - text, - n_predict, - callback=callback, - temperature=request_data.temperature - ) - return output["text"] - else: - return None - + return None + except Exception as ex: + trace_exception(ex) + elf_server.error(ex) + return {"status":False,"error":str(ex)} @router.post("/stop_gen") diff --git a/lollms/server/endpoints/lollms_personalities_infos.py b/lollms/server/endpoints/lollms_personalities_infos.py index 07fbaba..e3adaeb 100644 --- a/lollms/server/endpoints/lollms_personalities_infos.py +++ 
b/lollms/server/endpoints/lollms_personalities_infos.py @@ -61,6 +61,9 @@ def list_personalities(category:str): ASCIIColors.error(f"No personalities found. Using default one {ex}") return personalities +@router.get("/list_mounted_personalities") +def list_mounted_personalities(): + return lollmsElfServer.config.personalities @router.get("/get_all_personalities") def get_all_personalities(): @@ -501,10 +504,17 @@ async def set_active_personality_settings(request: Request): # ------------------------------------------- Interaction with personas ------------------------------------------------ @router.post("/post_to_personality") -def post_to_personality(data): +async def post_to_personality(request: Request): """Post data to a personality""" - if hasattr(lollmsElfServer.personality.processor,'handle_request'): - return lollmsElfServer.personality.processor.handle_request(data) - else: - return {} + + try: + config_data = (await request.json()) + if hasattr(lollmsElfServer.personality.processor,'handle_request'): + return lollmsElfServer.personality.processor.handle_request(config_data) + else: + return {} + except Exception as ex: + trace_exception(ex) + lollmsElfServer.error(ex) + return {"status":False,"error":str(ex)} diff --git a/lollms/server/events/lollms_generation_events.py b/lollms/server/events/lollms_generation_events.py index 4f7aabf..f4981ce 100644 --- a/lollms/server/events/lollms_generation_events.py +++ b/lollms/server/events/lollms_generation_events.py @@ -58,152 +58,149 @@ def add_events(sio:socketio): run_async(partial(lollmsElfServer.socketio.emit,"busy", {"message":"I am busy. Come back later."}, to=client_id)) ASCIIColors.warning(f"OOps request {client_id} refused!! Server busy") return - def generate_text(): - lollmsElfServer.busy = True - try: - model = lollmsElfServer.model - lollmsElfServer.connections[client_id]["is_generating"]=True - lollmsElfServer.connections[client_id]["requested_stop"]=False - prompt = data['prompt'] - tokenized = model.tokenize(prompt) - personality_id = data.get('personality', -1) + lollmsElfServer.busy = True + try: + model = lollmsElfServer.model + lollmsElfServer.connections[client_id]["is_generating"]=True + lollmsElfServer.connections[client_id]["requested_stop"]=False + prompt = data['prompt'] + tokenized = model.tokenize(prompt) + personality_id = data.get('personality', -1) - n_crop = data.get('n_crop', len(tokenized)) - if n_crop!=-1: - prompt = model.detokenize(tokenized[-n_crop:]) + n_crop = data.get('n_crop', len(tokenized)) + if n_crop!=-1: + prompt = model.detokenize(tokenized[-n_crop:]) - n_predicts = data["n_predicts"] - parameters = data.get("parameters",{ - "temperature":lollmsElfServer.config["temperature"], - "top_k":lollmsElfServer.config["top_k"], - "top_p":lollmsElfServer.config["top_p"], - "repeat_penalty":lollmsElfServer.config["repeat_penalty"], - "repeat_last_n":lollmsElfServer.config["repeat_last_n"], - "seed":lollmsElfServer.config["seed"] - }) + n_predicts = data["n_predicts"] + parameters = data.get("parameters",{ + "temperature":lollmsElfServer.config["temperature"], + "top_k":lollmsElfServer.config["top_k"], + "top_p":lollmsElfServer.config["top_p"], + "repeat_penalty":lollmsElfServer.config["repeat_penalty"], + "repeat_last_n":lollmsElfServer.config["repeat_last_n"], + "seed":lollmsElfServer.config["seed"] + }) + + if personality_id==-1: + # Raw text generation + lollmsElfServer.answer = {"full_text":""} + def callback(text, message_type: MSG_TYPE, metadata:dict={}): + if message_type == 
MSG_TYPE.MSG_TYPE_CHUNK: + ASCIIColors.success(f"generated:{len(lollmsElfServer.answer['full_text'].split())} words", end='\r') + if text is not None: + lollmsElfServer.answer["full_text"] = lollmsElfServer.answer["full_text"] + text + run_async(partial(lollmsElfServer.socketio.emit,'text_chunk', {'chunk': text, 'type':MSG_TYPE.MSG_TYPE_CHUNK.value}, to=client_id)) + if client_id in lollmsElfServer.connections:# Client disconnected + if lollmsElfServer.connections[client_id]["requested_stop"]: + return False + else: + return True + else: + return False + + tk = model.tokenize(prompt) + n_tokens = len(tk) + fd = model.detokenize(tk[-min(lollmsElfServer.config.ctx_size-n_predicts,n_tokens):]) + + try: + ASCIIColors.print("warming up", ASCIIColors.color_bright_cyan) + + generated_text = model.generate(fd, + n_predict=n_predicts, + callback=callback, + temperature = parameters["temperature"], + top_k = parameters["top_k"], + top_p = parameters["top_p"], + repeat_penalty = parameters["repeat_penalty"], + repeat_last_n = parameters["repeat_last_n"], + seed = parameters["seed"], + ) + ASCIIColors.success(f"\ndone") + + if client_id in lollmsElfServer.connections: + if not lollmsElfServer.connections[client_id]["requested_stop"]: + # Emit the generated text to the client + run_async(partial(lollmsElfServer.socketio.emit,'text_generated', {'text': generated_text}, to=client_id)) + except Exception as ex: + run_async(partial(lollmsElfServer.socketio.emit,'generation_error', {'error': str(ex)}, to=client_id)) + ASCIIColors.error(f"\ndone") + lollmsElfServer.busy = False + else: + try: + personality: AIPersonality = lollmsElfServer.personalities[personality_id] + ump = lollmsElfServer.config.discussion_prompt_separator +lollmsElfServer.config.user_name.strip() if lollmsElfServer.config.use_user_name_in_discussions else lollmsElfServer.personality.user_message_prefix + personality.model = model + cond_tk = personality.model.tokenize(personality.personality_conditioning) + n_cond_tk = len(cond_tk) + # Placeholder code for text generation + # Replace this with your actual text generation logic + print(f"Text generation requested by client: {client_id}") + + lollmsElfServer.answer["full_text"] = '' + full_discussion_blocks = lollmsElfServer.connections[client_id]["full_discussion_blocks"] + + if prompt != '': + if personality.processor is not None and personality.processor_cfg["process_model_input"]: + preprocessed_prompt = personality.processor.process_model_input(prompt) + else: + preprocessed_prompt = prompt + + if personality.processor is not None and personality.processor_cfg["custom_workflow"]: + full_discussion_blocks.append(ump) + full_discussion_blocks.append(preprocessed_prompt) + + else: + + full_discussion_blocks.append(ump) + full_discussion_blocks.append(preprocessed_prompt) + full_discussion_blocks.append(personality.link_text) + full_discussion_blocks.append(personality.ai_message_prefix) + + full_discussion = personality.personality_conditioning + ''.join(full_discussion_blocks) - if personality_id==-1: - # Raw text generation - lollmsElfServer.answer = {"full_text":""} def callback(text, message_type: MSG_TYPE, metadata:dict={}): if message_type == MSG_TYPE.MSG_TYPE_CHUNK: - ASCIIColors.success(f"generated:{len(lollmsElfServer.answer['full_text'].split())} words", end='\r') - if text is not None: - lollmsElfServer.answer["full_text"] = lollmsElfServer.answer["full_text"] + text - run_async(partial(lollmsElfServer.socketio.emit,'text_chunk', {'chunk': text, 
'type':MSG_TYPE.MSG_TYPE_CHUNK.value}, to=client_id)) - if client_id in lollmsElfServer.connections:# Client disconnected + lollmsElfServer.answer["full_text"] = lollmsElfServer.answer["full_text"] + text + run_async(partial(lollmsElfServer.socketio.emit,'text_chunk', {'chunk': text}, to=client_id)) + try: if lollmsElfServer.connections[client_id]["requested_stop"]: return False else: return True - else: - return False + except: # If the client is disconnected then we stop talking to it + return False - tk = model.tokenize(prompt) + tk = personality.model.tokenize(full_discussion) n_tokens = len(tk) - fd = model.detokenize(tk[-min(lollmsElfServer.config.ctx_size-n_predicts,n_tokens):]) + fd = personality.model.detokenize(tk[-min(lollmsElfServer.config.ctx_size-n_cond_tk-personality.model_n_predicts,n_tokens):]) + + if personality.processor is not None and personality.processor_cfg["custom_workflow"]: + ASCIIColors.info("processing...") + generated_text = personality.processor.run_workflow(prompt, previous_discussion_text=personality.personality_conditioning+fd, callback=callback) + else: + ASCIIColors.info("generating...") + generated_text = personality.model.generate( + personality.personality_conditioning+fd, + n_predict=personality.model_n_predicts, + callback=callback) - try: - ASCIIColors.print("warming up", ASCIIColors.color_bright_cyan) - - generated_text = model.generate(fd, - n_predict=n_predicts, - callback=callback, - temperature = parameters["temperature"], - top_k = parameters["top_k"], - top_p = parameters["top_p"], - repeat_penalty = parameters["repeat_penalty"], - repeat_last_n = parameters["repeat_last_n"], - seed = parameters["seed"], - ) - ASCIIColors.success(f"\ndone") + if personality.processor is not None and personality.processor_cfg["process_model_output"]: + generated_text = personality.processor.process_model_output(generated_text) - if client_id in lollmsElfServer.connections: - if not lollmsElfServer.connections[client_id]["requested_stop"]: - # Emit the generated text to the client - run_async(partial(lollmsElfServer.socketio.emit,'text_generated', {'text': generated_text}, to=client_id)) - except Exception as ex: - run_async(partial(lollmsElfServer.socketio.emit,'generation_error', {'error': str(ex)}, to=client_id)) - ASCIIColors.error(f"\ndone") - lollmsElfServer.busy = False - else: - try: - personality: AIPersonality = lollmsElfServer.personalities[personality_id] - ump = lollmsElfServer.config.discussion_prompt_separator +lollmsElfServer.config.user_name.strip() if lollmsElfServer.config.use_user_name_in_discussions else lollmsElfServer.personality.user_message_prefix - personality.model = model - cond_tk = personality.model.tokenize(personality.personality_conditioning) - n_cond_tk = len(cond_tk) - # Placeholder code for text generation - # Replace this with your actual text generation logic - print(f"Text generation requested by client: {client_id}") + full_discussion_blocks.append(generated_text.strip()) + ASCIIColors.success("\ndone") - lollmsElfServer.answer["full_text"] = '' - full_discussion_blocks = lollmsElfServer.connections[client_id]["full_discussion_blocks"] - - if prompt != '': - if personality.processor is not None and personality.processor_cfg["process_model_input"]: - preprocessed_prompt = personality.processor.process_model_input(prompt) - else: - preprocessed_prompt = prompt - - if personality.processor is not None and personality.processor_cfg["custom_workflow"]: - full_discussion_blocks.append(ump) - 
full_discussion_blocks.append(preprocessed_prompt) - - else: - - full_discussion_blocks.append(ump) - full_discussion_blocks.append(preprocessed_prompt) - full_discussion_blocks.append(personality.link_text) - full_discussion_blocks.append(personality.ai_message_prefix) - - full_discussion = personality.personality_conditioning + ''.join(full_discussion_blocks) - - def callback(text, message_type: MSG_TYPE, metadata:dict={}): - if message_type == MSG_TYPE.MSG_TYPE_CHUNK: - lollmsElfServer.answer["full_text"] = lollmsElfServer.answer["full_text"] + text - run_async(partial(lollmsElfServer.socketio.emit,'text_chunk', {'chunk': text}, to=client_id)) - try: - if lollmsElfServer.connections[client_id]["requested_stop"]: - return False - else: - return True - except: # If the client is disconnected then we stop talking to it - return False - - tk = personality.model.tokenize(full_discussion) - n_tokens = len(tk) - fd = personality.model.detokenize(tk[-min(lollmsElfServer.config.ctx_size-n_cond_tk-personality.model_n_predicts,n_tokens):]) - - if personality.processor is not None and personality.processor_cfg["custom_workflow"]: - ASCIIColors.info("processing...") - generated_text = personality.processor.run_workflow(prompt, previous_discussion_text=personality.personality_conditioning+fd, callback=callback) - else: - ASCIIColors.info("generating...") - generated_text = personality.model.generate( - personality.personality_conditioning+fd, - n_predict=personality.model_n_predicts, - callback=callback) - - if personality.processor is not None and personality.processor_cfg["process_model_output"]: - generated_text = personality.processor.process_model_output(generated_text) - - full_discussion_blocks.append(generated_text.strip()) - ASCIIColors.success("\ndone") - - # Emit the generated text to the client - run_async(partial(lollmsElfServer.socketio.emit,'text_generated', {'text': generated_text}, to=client_id)) - except Exception as ex: - run_async(partial(lollmsElfServer.socketio.emit,'generation_error', {'error': str(ex)}, to=client_id)) - ASCIIColors.error(f"\ndone") - lollmsElfServer.busy = False - except Exception as ex: - trace_exception(ex) + # Emit the generated text to the client + run_async(partial(lollmsElfServer.socketio.emit,'text_generated', {'text': generated_text}, to=client_id)) + except Exception as ex: run_async(partial(lollmsElfServer.socketio.emit,'generation_error', {'error': str(ex)}, to=client_id)) - lollmsElfServer.busy = False + ASCIIColors.error(f"\ndone") + lollmsElfServer.busy = False + except Exception as ex: + trace_exception(ex) + run_async(partial(lollmsElfServer.socketio.emit,'generation_error', {'error': str(ex)}, to=client_id)) + lollmsElfServer.busy = False - # Start the text generation task in a separate thread - task = lollmsElfServer.socketio.start_background_task(target=generate_text)
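A minimal, hypothetical smoke test for the two new HTTP routes introduced by this patch (/lollms_generate and /v1/chat/completions). The base URL and port (9600) and all payload values are assumptions for illustration only, not part of the patch; the requests library is just a convenient client.

import json
import requests

BASE = "http://localhost:9600"  # assumed server address; adjust to your deployment

# Non-streaming completion through the new /lollms_generate endpoint.
payload = {
    "text": "Once upon a time",
    "n_predict": 64,      # fields mirror LollmsGenerateRequest above
    "stream": False,      # non-streaming: the endpoint returns the generated text
    "temperature": 0.7,
}
print(requests.post(f"{BASE}/lollms_generate", json=payload).text)

# OpenAI-style chat request against the new /v1/chat/completions route.
chat_payload = {
    "messages": [{"role": "user", "content": "Say hello"}],
    "max_tokens": 32,
    "stream": False,
}
print(json.dumps(requests.post(f"{BASE}/v1/chat/completions", json=chat_payload).json(), indent=2))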