Transformers Backend: max_tokens adherence to OpenAI API (#2108)
Improve adherence to the OpenAI API when max_tokens is omitted or equal to 0 in the request: instead of a hardcoded default, fall back to the space remaining in the model's context window.
This commit is contained in:
parent 0d8bf91699
commit 8e36fe9b6f
@@ -159,6 +159,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                 quantization_config=quantization,
                 device_map=device_map,
                 torch_dtype=compute)
+        if request.ContextSize > 0:
+            self.max_tokens = request.ContextSize
+        else:
+            self.max_tokens = self.model.config.max_position_embeddings
+
         self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_safetensors=True)
         self.XPU = False
 
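In isolation, the load-time fallback added above behaves as follows. This is a minimal sketch, not the backend's actual code: resolve_max_tokens is a hypothetical helper, and it assumes the model's Hugging Face config exposes max_position_embeddings (true for most transformer architectures).

    from transformers import AutoConfig

    def resolve_max_tokens(context_size: int, model_name: str) -> int:
        """Prefer an explicit ContextSize; otherwise use the model's trained window."""
        if context_size > 0:
            return context_size
        # max_position_embeddings is the context length the model was trained with
        return AutoConfig.from_pretrained(model_name).max_position_embeddings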
@@ -217,10 +222,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         if request.TopK == 0:
             request.TopK = 40
 
-        max_tokens = 200
-        if request.Tokens > 0:
-            max_tokens = request.Tokens
-
         prompt = request.Prompt
         if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
            prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
@@ -232,6 +233,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                 eos_token_id.append(self.tokenizer.convert_tokens_to_ids(word))
 
         inputs = self.tokenizer(prompt, return_tensors="pt")
+
+        if request.Tokens > 0:
+            max_tokens = request.Tokens
+        else:
+            max_tokens = self.max_tokens - inputs["input_ids"].size()[inputs["input_ids"].dim()-1]
+
         if self.CUDA:
             inputs = inputs.to("cuda")
         if XPU and self.OV == False:
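Taken together, the request-time rule is: an explicit Tokens value wins; when it is omitted or 0, generate up to whatever room is left in the context window after the prompt is tokenized. A minimal standalone sketch of that rule (effective_max_new_tokens is a hypothetical helper; it assumes a PyTorch tensor of input ids, as in the diff):

    import torch

    def effective_max_new_tokens(requested: int, context_window: int,
                                 input_ids: torch.Tensor) -> int:
        if requested > 0:
            return requested                # explicit max_tokens from the request
        prompt_len = input_ids.size(-1)     # equivalent to size()[dim()-1] above
        return context_window - prompt_len  # generate as many tokens as fit

For example, with a 4096-token window and a 100-token prompt, an omitted max_tokens yields up to 3996 new tokens, matching the OpenAI API's behavior of defaulting to the remaining context.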