diff --git a/configs/config.yaml b/configs/config.yaml
index 0d192c9..d168b31 100644
--- a/configs/config.yaml
+++ b/configs/config.yaml
@@ -47,7 +47,7 @@ use_continue_message: true
 
 seed: -1
 ctx_size: 4084
-max_n_predict: 4096
+max_n_predict: None
 min_n_predict: 1024
 temperature: 0.9
 top_k: 50
diff --git a/elf_docker_cfg/personal/configs/lollms_elf_config.yaml b/elf_docker_cfg/personal/configs/lollms_elf_config.yaml
index 8dfe638..725f0af 100644
--- a/elf_docker_cfg/personal/configs/lollms_elf_config.yaml
+++ b/elf_docker_cfg/personal/configs/lollms_elf_config.yaml
@@ -47,7 +47,7 @@ use_continue_message: true
 
 seed: -1
 ctx_size: 4084
-max_n_predict: 4096
+max_n_predict: None
 min_n_predict: 1024
 temperature: 0.9
 top_k: 50
diff --git a/elf_test_cfg/personal/configs/lollms_elf_config.yaml b/elf_test_cfg/personal/configs/lollms_elf_config.yaml
index 8dfe638..725f0af 100644
--- a/elf_test_cfg/personal/configs/lollms_elf_config.yaml
+++ b/elf_test_cfg/personal/configs/lollms_elf_config.yaml
@@ -47,7 +47,7 @@ use_continue_message: true
 
 seed: -1
 ctx_size: 4084
-max_n_predict: 4096
+max_n_predict: None
 min_n_predict: 1024
 temperature: 0.9
 top_k: 50
diff --git a/elf_test_cfg/personal/configs/lollms_elf_local_config.yaml b/elf_test_cfg/personal/configs/lollms_elf_local_config.yaml
index 8dfe638..725f0af 100644
--- a/elf_test_cfg/personal/configs/lollms_elf_local_config.yaml
+++ b/elf_test_cfg/personal/configs/lollms_elf_local_config.yaml
@@ -47,7 +47,7 @@ use_continue_message: true
 
 seed: -1
 ctx_size: 4084
-max_n_predict: 4096
+max_n_predict: None
 min_n_predict: 1024
 temperature: 0.9
 top_k: 50
diff --git a/lollms/app.py b/lollms/app.py
index 5ca7442..0cc40fd 100644
--- a/lollms/app.py
+++ b/lollms/app.py
@@ -315,7 +315,7 @@ class LollmsApplication(LoLLMsCom):
 
 
     def _generate_text(self, prompt):
-        max_tokens = min(self.config.ctx_size - self.model.get_nb_tokens(prompt),self.config.max_n_predict)
+        max_tokens = min(self.config.ctx_size - self.model.get_nb_tokens(prompt),self.config.max_n_predict if self.config.max_n_predict else self.config.ctx_size- self.model.get_nb_tokens(prompt))
         generated_text = self.model.generate(prompt, max_tokens)
         return generated_text.strip()
 
diff --git a/lollms/personality.py b/lollms/personality.py
index 6262c35..622a5cd 100644
--- a/lollms/personality.py
+++ b/lollms/personality.py
@@ -1046,7 +1046,7 @@ Use this structure:
 
         self.print_prompt("gen",prompt)
         if max_size is None:
-            max_size = min(self.config.max_n_predict, self.config.ctx_size-len(self.model.tokenize(prompt)))
+            max_size = min(self.config.max_n_predict if self.config.max_n_predict else self.config.ctx_size-len(self.model.tokenize(prompt)), self.config.ctx_size-len(self.model.tokenize(prompt)))
 
         self.model.generate_with_images(
             prompt,
@@ -1071,7 +1071,7 @@ Use this structure:
 
         self.model.generate(
             prompt,
-            max_size if max_size else min(self.config.ctx_size-ntokens,self.config.max_n_predict),
+            max_size if max_size else min(self.config.ctx_size-ntokens,self.config.max_n_predict if self.config.max_n_predict else self.config.ctx_size-ntokens),
             partial(self.process, callback=callback, show_progress=show_progress),
             temperature=self.model_temperature if temperature is None else temperature,
             top_k=self.model_top_k if top_k is None else top_k,
@@ -3575,7 +3575,7 @@ Use this structure:
 
         if self.config.debug:
             nb_prompt_tokens = len(self.personality.model.tokenize(prompt))
-            nb_tokens = min(self.config.ctx_size - nb_prompt_tokens, self.config.max_n_predict)
+            nb_tokens = min(self.config.ctx_size - nb_prompt_tokens, self.config.max_n_predict if self.config.max_n_predict else self.config.ctx_size-nb_prompt_tokens)
             ASCIIColors.info(f"Prompt size : {nb_prompt_tokens}")
             ASCIIColors.info(f"Requested generation max size : {nb_tokens}")
 
@@ -4740,7 +4740,7 @@ transition-all duration-300 ease-in-out">
 
         out = self.fast_gen(full_prompt)
         nb_tokens = len(self.personality.model.tokenize(out))
-        if nb_tokens >= self.config.max_n_predict-1:
+        if nb_tokens >= (self.config.max_n_predict if self.config.max_n_predict else self.config.ctx_size)-1:
             out = out+self.fast_gen(full_prompt+out, callback=callback)
         if context_details["is_continue"]:
             out = context_details["previous_chunk"] + out
diff --git a/lollms/server/endpoints/lollms_generator.py b/lollms/server/endpoints/lollms_generator.py
index c9eaadf..5bdbf95 100644
--- a/lollms/server/endpoints/lollms_generator.py
+++ b/lollms/server/endpoints/lollms_generator.py
@@ -85,7 +85,7 @@ class LollmsGenerateRequest(BaseModel):
     prompt: str
     model_name: Optional[str] = None
    personality: Optional[int] = -1
-    n_predict: Optional[int] = 1024
+    n_predict: Optional[int] = None
     stream: bool = False
     temperature: float = 0.1
     top_k: Optional[int] = 50
@@ -131,7 +131,10 @@ async def lollms_generate(request: LollmsGenerateRequest):
         tokens = elf_server.model.tokenize(prompt)
         n_tokens = len(tokens)
         ASCIIColors.info(f"Prompt input size {n_tokens}")
-        n_predict = min(min(elf_server.config.ctx_size-n_tokens-1,elf_server.config.max_n_predict), request.n_predict) if request.n_predict>0 else min(elf_server.config.ctx_size-n_tokens-1,elf_server.config.max_n_predict)
+        if request.n_predict is None:
+            n_predict = min(elf_server.config.ctx_size-n_tokens-1,elf_server.config.max_n_predict if elf_server.config.max_n_predict else elf_server.config.ctx_size)
+        else:
+            n_predict = min(min(elf_server.config.ctx_size-n_tokens-1,elf_server.config.max_n_predict if elf_server.config.max_n_predict else elf_server.config.ctx_size), request.n_predict) if request.n_predict>0 else min(elf_server.config.ctx_size-n_tokens-1,elf_server.config.max_n_predict if elf_server.config.max_n_predict else elf_server.config.ctx_size)
         stream = request.stream
         if elf_server.binding is not None:
             if stream:
@@ -491,7 +494,7 @@ async def v1_chat_completions(request: ChatGenerationRequest):
     try:
         reception_manager=RECEPTION_MANAGER()
         messages = request.messages
-        max_tokens = request.max_tokens if request.max_tokens>0 else elf_server.config.max_n_predict
+        max_tokens = request.max_tokens if request.max_tokens>0 else elf_server.config.max_n_predict if elf_server.config.max_n_predict else elf_server.config.ctx_size
         temperature = request.temperature if elf_server.config.temperature else elf_server.config.temperature
         prompt = ""
         roles= False
@@ -633,7 +636,7 @@ async def ollama_chat_completion(request: ChatGenerationRequest):
     try:
         reception_manager=RECEPTION_MANAGER()
         messages = request.messages
-        max_tokens = request.max_tokens if request.max_tokens>0 else elf_server.config.max_n_predict
+        max_tokens = request.max_tokens if request.max_tokens>0 else elf_server.config.max_n_predict if elf_server.config.max_n_predict else elf_server.config.ctx_size
         temperature = request.temperature if elf_server.config.temperature else elf_server.config.temperature
         prompt = ""
         roles= False
@@ -986,7 +989,7 @@ async def v1_completion(request: CompletionGenerationRequest):
     """
     try:
         text = request.prompt
-        n_predict = request.max_tokens if request.max_tokens>=0 else elf_server.config.max_n_predict
+        n_predict = request.max_tokens if request.max_tokens>=0 else elf_server.config.max_n_predict if elf_server.config.max_n_predict else elf_server.config.ctx_size
         temperature = request.temperature if request.temperature>=0 else elf_server.config.temperature
         # top_k = request.top_k if request.top_k>=0 else elf_server.config.top_k
         # top_p = request.top_p if request.top_p>=0 else elf_server.config.top_p
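
Note on the recurring pattern in this patch: every call site now treats an unset max_n_predict (None or 0) as "no explicit cap" and falls back to whatever fits in the remaining context window, while an explicit cap still wins when it is smaller. A minimal standalone sketch of that rule, for illustration only; the helper name resolve_max_tokens is hypothetical and not part of the patch:

    # Sketch of the fallback logic: an unset max_n_predict means
    # "use all the room left in the context window after the prompt".
    def resolve_max_tokens(ctx_size: int, prompt_tokens: int, max_n_predict=None) -> int:
        remaining = ctx_size - prompt_tokens               # tokens left for generation
        cap = max_n_predict if max_n_predict else remaining
        return min(remaining, cap)

    # resolve_max_tokens(4084, 1000)        -> 3084  (no cap configured)
    # resolve_max_tokens(4084, 1000, 2048)  -> 2048  (explicit cap still applies)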