mirror of
https://github.com/mudler/LocalAI.git
synced 2025-01-29 15:44:17 +00:00
Transformer Backend: Implementing use_tokenizer_template and stop_prompts options (#2090)
* Fix regression #1971: fixes the regression introduced by intel_extension_for_transformers==1.4.
* UseTokenizerTemplate and StopPrompts: implementation of the use_tokenizer_template and stop_prompts (stopwords) options.
This commit is contained in:
parent
39814cab32
commit
66b002458d
@ -148,7 +148,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
else:
|
||||
device_map="CPU"
|
||||
self.model = OVModelForCausalLM.from_pretrained(model_name,
|
||||
compile=True,
|
||||
compile=True,
|
||||
ov_config={"PERFORMANCE_HINT": "LATENCY"},
|
||||
device=device_map)
|
||||
self.OV = True
|
||||
else:
|
||||
@ -212,12 +213,25 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
set_seed(request.Seed)
|
||||
if request.TopP == 0:
|
||||
request.TopP = 0.9
|
||||
|
||||
if request.TopK == 0:
|
||||
request.TopK = 40
|
||||
|
||||
max_tokens = 200
|
||||
if request.Tokens > 0:
|
||||
max_tokens = request.Tokens
|
||||
|
||||
inputs = self.tokenizer(request.Prompt, return_tensors="pt")
|
||||
prompt = request.Prompt
|
||||
if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
|
||||
prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
|
||||
|
||||
eos_token_id = self.tokenizer.eos_token_id
|
||||
if request.StopPrompts:
|
||||
eos_token_id = []
|
||||
for word in request.StopPrompts:
|
||||
eos_token_id.append(self.tokenizer.convert_tokens_to_ids(word))
|
||||
|
||||
inputs = self.tokenizer(prompt, return_tensors="pt")
|
||||
if self.CUDA:
|
||||
inputs = inputs.to("cuda")
|
||||
if XPU and self.OV == False:
|
||||
@ -235,7 +249,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
top_k=request.TopK,
|
||||
do_sample=True,
|
||||
attention_mask=inputs["attention_mask"],
|
||||
eos_token_id=self.tokenizer.eos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
pad_token_id=self.tokenizer.eos_token_id,
|
||||
streamer=streamer)
|
||||
thread=Thread(target=self.model.generate, kwargs=config)
|
||||
@ -264,7 +278,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
top_k=request.TopK,
|
||||
do_sample=True,
|
||||
attention_mask=inputs["attention_mask"],
|
||||
eos_token_id=self.tokenizer.eos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
pad_token_id=self.tokenizer.eos_token_id)
|
||||
generated_text = self.tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user