Mirror of https://github.com/ParisNeo/lollms.git

commit 8ca192ea64
parent d76a96cbb8

    STEP
@@ -24,6 +24,7 @@ import string
 import json
 from enum import Enum
 import asyncio
+from datetime import datetime


 def _generate_id(length=10):
@@ -549,13 +550,32 @@ async def ollama_chat_completion(request: ChatGenerationRequest):
         return {"status":False,"error":str(ex)}

 class CompletionGenerationRequest(BaseModel):
-    model: Optional[str] = ""
-    prompt: str = ""
+    model: Optional[str] = None
+    prompt: str
     max_tokens: Optional[int] = -1
     stream: Optional[bool] = False
     temperature: Optional[float] = -1
+    mirostat: Optional[int] = None
+    mirostat_eta: Optional[float] = None
+    mirostat_tau: Optional[float] = None
+    num_ctx: Optional[int] = None
+    repeat_last_n: Optional[int] = -1
+    repeat_penalty: Optional[float] = -1
+    seed: Optional[int] = None
+    stop: Optional[str] = None
+    tfs_z: Optional[float] = None
+    num_predict: Optional[int] = None
+    top_k: Optional[int] = -1
+    top_p: Optional[float] = -1
+    format: Optional[str] = None  # Added as per new request
+    system: Optional[str] = None  # Added as per new request
+    template: Optional[str] = None  # Added as per new request
+    context: Optional[str] = None  # Added as per new request
+    raw: Optional[bool] = None  # Added as per new request
+    keep_alive: Optional[str] = None  # Added as per new request


+@router.options("/api/generate")
 @router.post("/api/generate")
 async def ollama_generate(request: CompletionGenerationRequest):
     """
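For orientation, a hedged sketch of a client payload that the extended CompletionGenerationRequest above would accept. Only the field names come from the model in this diff; the base URL, port 9600, and the use of the requests library are illustrative assumptions.

    # Illustrative call against the emulated Ollama endpoint declared above.
    # The URL/port and the `requests` dependency are assumptions, not part of this commit.
    import requests

    payload = {
        "model": "llama3",            # optional; the handler falls back to a default
        "prompt": "Why is the sky blue?",
        "max_tokens": 128,
        "temperature": 0.7,
        "top_k": 40,
        "top_p": 0.9,
        "repeat_penalty": 1.1,
        "stream": False,              # ask for one JSON object instead of a stream
    }

    r = requests.post("http://localhost:9600/api/generate", json=payload)
    print(r.json().get("response", ""))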
@@ -565,15 +585,27 @@ async def ollama_generate(request: CompletionGenerationRequest):
     :return: A JSON response with the status of the operation.
     """
     try:
+        start_time = time.perf_counter_ns()
+        ASCIIColors.cyan("> Ollama Server emulator: Received request")
         text = request.prompt
-        n_predict = request.max_tokens if request.max_tokens>=0 else elf_server.config.max_n_predict
-        temperature = request.temperature if request.temperature>=0 else elf_server.config.temperature
-        # top_k = request.top_k if request.top_k>=0 else elf_server.config.top_k
-        # top_p = request.top_p if request.top_p>=0 else elf_server.config.top_p
-        # repeat_last_n = request.repeat_last_n if request.repeat_last_n>=0 else elf_server.config.repeat_last_n
-        # repeat_penalty = request.repeat_penalty if request.repeat_penalty>=0 else elf_server.config.repeat_penalty
+        request.max_tokens = request.max_tokens if request.max_tokens>0 else elf_server.config.ctx_size
+        n_predict = request.max_tokens if request.max_tokens>0 else elf_server.config.max_n_predict
+        temperature = request.temperature if request.temperature>0 else elf_server.config.temperature
+        top_k = request.top_k if request.top_k>0 else elf_server.config.top_k
+        top_p = request.top_p if request.top_p>0 else elf_server.config.top_p
+        repeat_last_n = request.repeat_last_n if request.repeat_last_n>0 else elf_server.config.repeat_last_n
+        repeat_penalty = request.repeat_penalty if request.repeat_penalty>0 else elf_server.config.repeat_penalty
         stream = request.stream
+        created_at = datetime.now().isoformat()
+        response_data = {
+            "model": request.model if request.model is not None else "llama3",
+            "created_at": created_at,
+            "response": "",
+            "done": False,
+            "context": [1, 2, 3],  # Placeholder for actual context
+        }

+        ASCIIColors.cyan("> Processing ...")
         if elf_server.binding is not None:
             if stream:
                 output = {"text":""}
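The new parameter handling follows one rule: a sampling parameter sent with its -1 default (or any non-positive value) is replaced by the server-side configuration value. Below is a minimal, self-contained sketch of that fallback; ServerConfig and the numbers are invented stand-ins for elf_server.config.

    # Standalone sketch of the "request value if > 0, else config default" rule
    # applied above to n_predict, temperature, top_k, top_p, repeat_last_n, ...
    from dataclasses import dataclass

    @dataclass
    class ServerConfig:               # stand-in for elf_server.config
        max_n_predict: int = 1024
        temperature: float = 0.7
        top_k: int = 40
        top_p: float = 0.95

    def resolve(requested, default):
        """Return the requested value when positive, otherwise the server default."""
        return requested if requested is not None and requested > 0 else default

    cfg = ServerConfig()
    print(resolve(-1, cfg.temperature))    # 0.7 -> falls back to config
    print(resolve(0.2, cfg.temperature))   # 0.2 -> request wins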
@@ -595,17 +627,20 @@ async def ollama_generate(request: CompletionGenerationRequest):
                         callback=callback,
                         temperature=temperature,
                     ))
+                ASCIIColors.success("> Streaming ...")
                 return StreamingResponse(generate_chunks())
             else:
                 output = {"text":""}
                 def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK):
+                    if chunk is None:
+                        return
                     # Yield each chunk of data
                     output["text"] += chunk
                     antiprompt = detect_antiprompt(output["text"])
                     if antiprompt:
                         ASCIIColors.warning(f"\n{antiprompt} detected. Stopping generation")
                         output["text"] = remove_text_from_string(output["text"],antiprompt)
+                        ASCIIColors.success("Done")
                         return False
                     else:
                         return True
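In the non-streaming branch the binding drives generation through this callback: each chunk is appended to output["text"], and the boolean return value tells the binding whether to continue (True) or stop (False), which is how antiprompt detection halts generation early. A hedged sketch of that contract with a dummy generator; detect_antiprompt is replaced by a simplified stand-in.

    # Sketch of the callback contract used above: generation continues while the
    # callback returns True and stops as soon as it returns False.
    def detect_antiprompt(text, antiprompts=("!@>user:",)):   # simplified stand-in
        return next((a for a in antiprompts if a in text), None)

    def fake_generate(chunks, callback):
        for chunk in chunks:                       # stands in for elf_server.binding.generate
            if callback(chunk) is False:
                break                              # the binding honors the stop signal

    output = {"text": ""}

    def callback(chunk):
        if chunk is None:
            return
        output["text"] += chunk
        if detect_antiprompt(output["text"]):
            return False                           # antiprompt seen: stop generation
        return True

    fake_generate(["Hello", " world", " !@>user:", " ignored"], callback)
    print(output["text"])   # the real handler also strips the antiprompt with remove_text_from_string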
@@ -615,8 +650,18 @@ async def ollama_generate(request: CompletionGenerationRequest):
                     callback=callback,
                     temperature=request.temperature if request.temperature>=0 else elf_server.config.temperature
                 )
-                return output["text"]
+                ASCIIColors.success("> Done")
+                response_data["total_duration"] = time.perf_counter_ns() - start_time
+                response_data["load_duration"] = 0
+                response_data["prompt_eval_count"] = len(request.prompt.split())
+                response_data["prompt_eval_duration"] = time.perf_counter_ns() - start_time
+                response_data["eval_count"] = len(elf_server.binding.tokenize(output["text"]))  # Simulated number of tokens in the response
+                response_data["eval_duration"] = time.perf_counter_ns() - start_time
+                response_data["response"] = output["text"]
+                response_data["done"] = True
+                return response_data
         else:
+            ASCIIColors.error("> Failed")
             return None
     except Exception as ex:
         trace_exception(ex)
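With these additions the non-streaming path returns an Ollama-style completion object instead of a bare string. A hedged example of the JSON shape a client could now expect from /api/generate; all concrete values below are illustrative.

    # Illustrative response_data returned by the non-streaming branch after this change.
    example_response = {
        "model": "llama3",
        "created_at": "2024-05-01T12:00:00.000000",
        "response": "The sky appears blue because ...",
        "done": True,
        "context": [1, 2, 3],            # placeholder context, as in the handler
        "total_duration": 1534280000,    # nanoseconds, from time.perf_counter_ns()
        "load_duration": 0,
        "prompt_eval_count": 5,          # naive whitespace count of the prompt
        "prompt_eval_duration": 1534280000,
        "eval_count": 42,                # tokens in the generated text
        "eval_duration": 1534280000,
    }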
@@ -733,114 +778,6 @@ async def ollama_completion(request: CompletionGenerationRequest):
         return {"status":False,"error":str(ex)}


-@router.post("/api/generate")
-async def ollama_chat(request: CompletionGenerationRequest):
-    """
-    Executes Python code and returns the output.
-
-    :param request: The HTTP request object.
-    :return: A JSON response with the status of the operation.
-    """
-    try:
-        reception_manager=RECEPTION_MANAGER()
-        prompt = request.prompt
-        n_predict = request.max_tokens if request.max_tokens>=0 else elf_server.config.max_n_predict
-        temperature = request.temperature if request.temperature>=0 else elf_server.config.temperature
-        # top_k = request.top_k if request.top_k>=0 else elf_server.config.top_k
-        # top_p = request.top_p if request.top_p>=0 else elf_server.config.top_p
-        # repeat_last_n = request.repeat_last_n if request.repeat_last_n>=0 else elf_server.config.repeat_last_n
-        # repeat_penalty = request.repeat_penalty if request.repeat_penalty>=0 else elf_server.config.repeat_penalty
-        stream = request.stream
-        headers = { 'Content-Type': 'text/event-stream', 'Cache-Control': 'no-cache', 'Connection': 'keep-alive',}
-        if elf_server.binding is not None:
-            if stream:
-                new_output={"new_values":[]}
-                async def generate_chunks():
-                    lk = threading.Lock()
-
-                    def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK):
-                        if elf_server.cancel_gen:
-                            return False
-
-                        if chunk is None:
-                            return
-
-                        rx = reception_manager.new_chunk(chunk)
-                        if rx.status!=ROLE_CHANGE_DECISION.MOVE_ON:
-                            if rx.status==ROLE_CHANGE_DECISION.PROGRESSING:
-                                return True
-                            elif rx.status==ROLE_CHANGE_DECISION.ROLE_CHANGED:
-                                return False
-                            else:
-                                chunk = chunk + rx.value
-
-                        # Yield each chunk of data
-                        lk.acquire()
-                        try:
-                            new_output["new_values"].append(reception_manager.chunk)
-                            lk.release()
-                            return True
-                        except Exception as ex:
-                            trace_exception(ex)
-                            lk.release()
-                            return False
-
-                    def chunks_builder():
-                        if request.model in elf_server.binding.list_models() and elf_server.binding.model_name!=request.model:
-                            elf_server.binding.build_model(request.model)
-
-                        elf_server.binding.generate(
-                            prompt,
-                            n_predict,
-                            callback=callback,
-                            temperature=temperature or elf_server.config.temperature
-                        )
-                        reception_manager.done = True
-                    thread = threading.Thread(target=chunks_builder)
-                    thread.start()
-                    current_index = 0
-                    while (not reception_manager.done and elf_server.cancel_gen == False):
-                        while (not reception_manager.done and len(new_output["new_values"])==0):
-                            time.sleep(0.001)
-                        lk.acquire()
-                        for i in range(len(new_output["new_values"])):
-                            current_index += 1
-                            yield (json.dumps({"response":new_output["new_values"][i]})+"\n").encode("utf-8")
-                        new_output["new_values"]=[]
-                        lk.release()
-                    elf_server.cancel_gen = False
-                return StreamingResponse(generate_chunks(), media_type="application/json", headers=headers)
-            else:
-                def callback(chunk, chunk_type:MSG_TYPE=MSG_TYPE.MSG_TYPE_CHUNK):
-                    # Yield each chunk of data
-                    if chunk is None:
-                        return True
-
-                    rx = reception_manager.new_chunk(chunk)
-                    if rx.status!=ROLE_CHANGE_DECISION.MOVE_ON:
-                        if rx.status==ROLE_CHANGE_DECISION.PROGRESSING:
-                            return True
-                        elif rx.status==ROLE_CHANGE_DECISION.ROLE_CHANGED:
-                            return False
-                        else:
-                            chunk = chunk + rx.value
-
-
-                    return True
-                elf_server.binding.generate(
-                    prompt,
-                    n_predict,
-                    callback=callback,
-                    temperature=request.temperature or elf_server.config.temperature
-                )
-                return json.dumps(reception_manager.reception_buffer).encode("utf-8")
-    except Exception as ex:
-        trace_exception(ex)
-        elf_server.error(ex)
-        return {"status":False,"error":str(ex)}
-
-
-
 @router.post("/v1/completions")
 async def v1_completion(request: CompletionGenerationRequest):
     """
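The handler deleted above streamed by running the binding in a worker thread that pushed chunks into a shared list guarded by a lock, while the request coroutine polled the list and yielded newline-delimited JSON. A compact, self-contained sketch of that producer/consumer pattern, with the binding replaced by a dummy producer; everything here is illustrative, not the lollms API.

    # Sketch of the thread + lock + polling pattern from the removed handler:
    # a producer thread appends chunks to a shared list, the consumer drains the
    # list and yields one JSON line per chunk.
    import json, threading, time

    def stream_chunks(produce):
        new_values, lk, done = [], threading.Lock(), threading.Event()

        def worker():
            for chunk in produce():          # stands in for elf_server.binding.generate
                with lk:
                    new_values.append(chunk)
            done.set()

        threading.Thread(target=worker, daemon=True).start()
        while not done.is_set() or new_values:
            time.sleep(0.001)                # same polling interval as the old code
            with lk:
                pending = new_values[:]
                new_values.clear()
            for chunk in pending:
                yield (json.dumps({"response": chunk}) + "\n").encode("utf-8")

    for line in stream_chunks(lambda: iter(["Hel", "lo", "!"])):
        print(line)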