Upgraded generation endpoints: RECEPTION_MANAGER-based streaming, new ollama-style /api/generate and /api/pull routes, and real model metadata in /api/tags

Saifeddine ALOUI 2024-04-27 03:54:00 +02:00
parent cbe2a5363a
commit 82df5893d1
2 changed files with 189 additions and 36 deletions

View File

@@ -103,6 +103,7 @@ async def lollms_generate(request: LollmsGenerateRequest):
    """
    try:
+        headers = { 'Content-Type': 'text/event-stream', 'Cache-Control': 'no-cache', 'Connection': 'keep-alive',}
        reception_manager=RECEPTION_MANAGER()
        prompt = request.prompt
        n_predict = request.n_predict if request.n_predict>0 else 1024
@@ -161,11 +162,11 @@ async def lollms_generate(request: LollmsGenerateRequest):
                        lk.acquire()
                        for i in range(len(new_output["new_values"])):
                            current_index += 1
-                            yield (new_output["new_values"][i] + '\n')
+                            yield (new_output["new_values"][i])
                        new_output["new_values"]=[]
                        lk.release()
                    elf_server.cancel_gen = False
-                return StreamingResponse(generate_chunks(), media_type="text/plain")
+                return StreamingResponse(generate_chunks(), media_type="text/plain", headers=headers)
            else:
                def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK):
                    # Yield each chunk of data
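With this change the streaming branch of lollms_generate no longer appends a newline to each chunk and sends event-stream headers, so a client can simply read the body as it arrives. A minimal consumer sketch, assuming the route is served at /lollms_generate on a local server at port 9600 and that LollmsGenerateRequest accepts prompt, n_predict and stream (route path and field names are not shown in this hunk):

# Hypothetical client for the streaming branch above; URL, port and field
# names are assumptions, not taken from this diff.
import requests

payload = {"prompt": "Once upon a time", "n_predict": 128, "stream": True}
with requests.post("http://localhost:9600/lollms_generate", json=payload, stream=True) as resp:
    resp.raise_for_status()
    # Chunks arrive as raw model text; the server no longer adds '\n' between them.
    for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
        if chunk:
            print(chunk, end="", flush=True)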
@@ -423,7 +424,8 @@ async def ollama_completion(request: CompletionGenerationRequest):
    :return: A JSON response with the status of the operation.
    """
    try:
-        text = request.prompt
+        reception_manager=RECEPTION_MANAGER()
+        prompt = request.prompt
        n_predict = request.max_tokens if request.max_tokens>=0 else elf_server.config.max_n_predict
        temperature = request.temperature if request.temperature>=0 else elf_server.config.temperature
        # top_k = request.top_k if request.top_k>=0 else elf_server.config.top_k
@@ -434,53 +436,200 @@ async def ollama_completion(request: CompletionGenerationRequest):
        if elf_server.binding is not None:
            if stream:
-                output = {"response":""}
-                def generate_chunks():
+                new_output={"new_values":[]}
+                async def generate_chunks():
+                    lk = threading.Lock()
                    def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK):
-                        # Yield each chunk of data
-                        output["response"] += chunk
-                        antiprompt = detect_antiprompt(output["text"])
-                        if antiprompt:
-                            ASCIIColors.warning(f"\n{antiprompt} detected. Stopping generation")
-                            output["response"] = remove_text_from_string(output["response"],antiprompt)
+                        if elf_server.cancel_gen:
+                            return False
+                        if chunk is None:
+                            return
+                        rx = reception_manager.new_chunk(chunk)
+                        if rx.status!=ROLE_CHANGE_DECISION.MOVE_ON:
+                            if rx.status==ROLE_CHANGE_DECISION.PROGRESSING:
+                                return True
+                            elif rx.status==ROLE_CHANGE_DECISION.ROLE_CHANGED:
                                return False
                            else:
-                                yield {"response":chunk}
+                                chunk = chunk + rx.value
+                        # Yield each chunk of data
+                        lk.acquire()
+                        try:
+                            new_output["new_values"].append(reception_manager.chunk)
+                            lk.release()
                            return True
-                    return iter(elf_server.binding.generate(
-                                                text,
+                        except Exception as ex:
+                            trace_exception(ex)
+                            lk.release()
+                            return False
+                    def chunks_builder():
+                        if request.model in elf_server.binding.list_models() and elf_server.binding.model_name!=request.model:
+                            elf_server.binding.build_model(request.model)
+                        elf_server.binding.generate(
+                                                prompt,
                                                n_predict,
                                                callback=callback,
-                                                temperature=temperature,
-                                                ))
-                return StreamingResponse(generate_chunks())
+                                                temperature=temperature or elf_server.config.temperature
+                                                )
+                        reception_manager.done = True
+                    thread = threading.Thread(target=chunks_builder)
+                    thread.start()
+                    current_index = 0
+                    while (not reception_manager.done and elf_server.cancel_gen == False):
+                        while (not reception_manager.done and len(new_output["new_values"])==0):
+                            time.sleep(0.001)
+                        lk.acquire()
+                        for i in range(len(new_output["new_values"])):
+                            current_index += 1
+                            yield {"response":new_output["new_values"][i]}
+                        new_output["new_values"]=[]
+                        lk.release()
+                    elf_server.cancel_gen = False
+                return StreamingResponse(generate_chunks(), media_type="text/plain")
            else:
-                output = {"response":""}
                def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK):
                    # Yield each chunk of data
-                    output["response"] += chunk
-                    antiprompt = detect_antiprompt(output["response"])
-                    if antiprompt:
-                        ASCIIColors.warning(f"\n{antiprompt} detected. Stopping generation")
-                        output["response"] = remove_text_from_string(output["response"],antiprompt)
+                    if chunk is None:
+                        return True
+                    rx = reception_manager.new_chunk(chunk)
+                    if rx.status!=ROLE_CHANGE_DECISION.MOVE_ON:
+                        if rx.status==ROLE_CHANGE_DECISION.PROGRESSING:
+                            return True
+                        elif rx.status==ROLE_CHANGE_DECISION.ROLE_CHANGED:
                            return False
                        else:
+                            chunk = chunk + rx.value
                    return True
                elf_server.binding.generate(
-                                                text,
+                                                prompt,
                                                n_predict,
                                                callback=callback,
-                                                temperature=request.temperature if request.temperature>=0 else elf_server.config.temperature
+                                                temperature=request.temperature or elf_server.config.temperature
                                                )
-                return output
-        else:
-            return None
+                return {"response":reception_manager.reception_buffer}
    except Exception as ex:
        trace_exception(ex)
        elf_server.error(ex)
        return {"status":False,"error":str(ex)}
+
+@router.post("/api/generate")
+async def ollama_chat(request: CompletionGenerationRequest):
+    """
+    Executes Python code and returns the output.
+    :param request: The HTTP request object.
+    :return: A JSON response with the status of the operation.
+    """
+    try:
+        reception_manager=RECEPTION_MANAGER()
+        prompt = request.prompt
+        n_predict = request.max_tokens if request.max_tokens>=0 else elf_server.config.max_n_predict
+        temperature = request.temperature if request.temperature>=0 else elf_server.config.temperature
+        # top_k = request.top_k if request.top_k>=0 else elf_server.config.top_k
+        # top_p = request.top_p if request.top_p>=0 else elf_server.config.top_p
+        # repeat_last_n = request.repeat_last_n if request.repeat_last_n>=0 else elf_server.config.repeat_last_n
+        # repeat_penalty = request.repeat_penalty if request.repeat_penalty>=0 else elf_server.config.repeat_penalty
+        stream = request.stream
+        headers = { 'Content-Type': 'text/event-stream', 'Cache-Control': 'no-cache', 'Connection': 'keep-alive',}
+        if elf_server.binding is not None:
+            if stream:
+                new_output={"new_values":[]}
+                async def generate_chunks():
+                    lk = threading.Lock()
+                    def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK):
+                        if elf_server.cancel_gen:
+                            return False
+                        if chunk is None:
+                            return
+                        rx = reception_manager.new_chunk(chunk)
+                        if rx.status!=ROLE_CHANGE_DECISION.MOVE_ON:
+                            if rx.status==ROLE_CHANGE_DECISION.PROGRESSING:
+                                return True
+                            elif rx.status==ROLE_CHANGE_DECISION.ROLE_CHANGED:
+                                return False
+                            else:
+                                chunk = chunk + rx.value
+                        # Yield each chunk of data
+                        lk.acquire()
+                        try:
+                            new_output["new_values"].append(reception_manager.chunk)
+                            lk.release()
+                            return True
+                        except Exception as ex:
+                            trace_exception(ex)
+                            lk.release()
+                            return False
+                    def chunks_builder():
+                        if request.model in elf_server.binding.list_models() and elf_server.binding.model_name!=request.model:
+                            elf_server.binding.build_model(request.model)
+                        elf_server.binding.generate(
+                                                prompt,
+                                                n_predict,
+                                                callback=callback,
+                                                temperature=temperature or elf_server.config.temperature
+                                                )
+                        reception_manager.done = True
+                    thread = threading.Thread(target=chunks_builder)
+                    thread.start()
+                    current_index = 0
+                    while (not reception_manager.done and elf_server.cancel_gen == False):
+                        while (not reception_manager.done and len(new_output["new_values"])==0):
+                            time.sleep(0.001)
+                        lk.acquire()
+                        for i in range(len(new_output["new_values"])):
+                            current_index += 1
+                            yield (json.dumps({"response":new_output["new_values"][i]})+"\n").encode("utf-8")
+                        new_output["new_values"]=[]
+                        lk.release()
+                    elf_server.cancel_gen = False
+                return StreamingResponse(generate_chunks(), media_type="application/json", headers=headers)
+            else:
+                def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK):
+                    # Yield each chunk of data
+                    if chunk is None:
+                        return True
+                    rx = reception_manager.new_chunk(chunk)
+                    if rx.status!=ROLE_CHANGE_DECISION.MOVE_ON:
+                        if rx.status==ROLE_CHANGE_DECISION.PROGRESSING:
+                            return True
+                        elif rx.status==ROLE_CHANGE_DECISION.ROLE_CHANGED:
+                            return False
+                        else:
+                            chunk = chunk + rx.value
+                    return True
+                elf_server.binding.generate(
+                                                prompt,
+                                                n_predict,
+                                                callback=callback,
+                                                temperature=request.temperature or elf_server.config.temperature
+                                                )
+                return json.dumps(reception_manager.reception_buffer).encode("utf-8")
+    except Exception as ex:
+        trace_exception(ex)
+        elf_server.error(ex)
+        return {"status":False,"error":str(ex)}

@router.post("/v1/completions")
async def v1_completion(request: CompletionGenerationRequest):
    """

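The new /api/generate route mirrors Ollama's line-delimited JSON streaming: each streamed line is a UTF-8 encoded object of the form {"response": "<chunk>"}. A client sketch, assuming a local server at http://localhost:9600 and the CompletionGenerationRequest fields used in the diff (prompt, max_tokens, stream):

# Sketch only: the base URL is an assumption; the payload fields follow the
# request attributes referenced in the endpoint code above.
import json
import requests

payload = {"prompt": "Why is the sky blue?", "max_tokens": 256, "stream": True}
with requests.post("http://localhost:9600/api/generate", json=payload, stream=True) as resp:
    resp.raise_for_status()
    # One JSON object per line, e.g. {"response": " blue"}
    for line in resp.iter_lines(decode_unicode=True):
        if line:
            print(json.loads(line)["response"], end="", flush=True)
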
View File

@@ -7,7 +7,7 @@ description:
    application. These routes are specific to handling models related operations.
"""
-from fastapi import APIRouter, Request
+from fastapi import APIRouter, Request, HTTPException
from pydantic import BaseModel
import pkg_resources
from lollms.server.elf_server import LOLLMSElfServer
@@ -103,6 +103,10 @@ def add_reference_to_local_model(data:ModelReferenceParams):
+@router.get("/api/pull")
+async def ollama_pull_model():
+    raise HTTPException(400, "Not implemented")
+
@router.get("/api/tags")
async def ollama_list_models():
    """
@@ -119,9 +123,9 @@ async def ollama_list_models():
        md = {
            "models": [
                {
-                    "name": model,
-                    "modified_at": "2023-11-04T14:56:49.277302595-07:00",
-                    "size": 7365960935,
+                    "name": model["name"],
+                    "modified_at": model["last_commit_time"],
+                    "size": model["variants"][0]["size"],
                    "digest": "9f438cb9cd581fc025612d27f7c1a6669ff83a8bb0ed86c94fcf4c5440555697",
                    "details": {
                        "format": "gguf",