Mirror of https://github.com/mudler/LocalAI.git, synced 2024-12-18 20:27:57 +00:00
bugfix: CUDA acceleration not working (#2475)
* bugfix: CUDA acceleration not working

  CUDA stopped working after #2286. Refactored the code to be more polished.

* Update requirements.txt

  Add missing imports.

  Signed-off-by: fakezeta <fakezeta@gmail.com>

* Update requirements.txt

  Signed-off-by: fakezeta <fakezeta@gmail.com>

---------

Signed-off-by: fakezeta <fakezeta@gmail.com>
This commit is contained in:
parent daa7544d9c
commit 6ef78ef7f6
@@ -21,10 +21,7 @@ import torch.cuda
 
 XPU=os.environ.get("XPU", "0") == "1"
 if XPU:
     from transformers import AutoTokenizer, AutoModel, set_seed, TextIteratorStreamer, StoppingCriteriaList, StopStringCriteria
 else:
     from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed, BitsAndBytesConfig, TextIteratorStreamer, StoppingCriteriaList, StopStringCriteria
 
 
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
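For context, the import gate this hunk builds on: the backend picks its transformers import set once, at module load, from the XPU flag that run.sh exports (see the last hunk below). A minimal, self-contained sketch of the pattern — the class list mirrors the diff; the Intel-specific extras are omitted here:

    import os

    # Environment-gated imports: decide the accelerator path once, at module
    # load, from a flag the launcher script sets before starting the backend.
    XPU = os.environ.get("XPU", "0") == "1"

    if XPU:
        # Intel/XPU path: plain Auto* classes; the real backend pulls in its
        # Intel packages here as well.
        from transformers import AutoTokenizer, AutoModel
    else:
        # Default path also needs the causal-LM class and the bitsandbytes
        # config used for CUDA quantization.
        from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, BitsAndBytesConfig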
@@ -77,11 +74,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         """
         model_name = request.Model
 
-        compute = "auto"
+        compute = torch.float16
         if request.F16Memory == True:
             compute=torch.bfloat16
 
-        self.CUDA = request.CUDA
+        self.CUDA = torch.cuda.is_available()
         self.OV=False
 
         device_map="cpu"
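This is the heart of the fix: instead of trusting the client-supplied request.CUDA flag, the backend now asks torch itself whether CUDA is usable, and the compute dtype defaults to float16 rather than "auto". A runnable sketch of the same decision, assuming only torch (the request object is reduced to a boolean):

    import torch

    def pick_device_and_dtype(f16_memory: bool):
        # Detect CUDA at runtime; a client flag can claim GPU support on a
        # host that has none, which appears to be what broke after #2286.
        use_cuda = torch.cuda.is_available()
        # bfloat16 when the request asks for F16Memory, float16 otherwise.
        compute = torch.bfloat16 if f16_memory else torch.float16
        device_map = "cuda:0" if use_cuda else "cpu"
        return device_map, compute

    device_map, compute = pick_device_and_dtype(f16_memory=False)
    print(device_map, compute)  # e.g. "cpu torch.float16" on a CUDA-less host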
@@ -89,6 +86,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         quantization = None
 
         if self.CUDA:
+            from transformers import BitsAndBytesConfig, AutoModelForCausalLM
             if request.MainGPU:
                 device_map=request.MainGPU
             else:
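The added import is deferred deliberately, presumably so the CUDA-oriented bitsandbytes stack is only touched once a GPU has actually been confirmed. A hedged sketch of the lazy-import pattern (the function name is illustrative, not LocalAI's API):

    import torch

    def make_quantization_config(load_in_8bit: bool = True):
        if not torch.cuda.is_available():
            return None  # no quantization config on CPU/XPU hosts
        # Import only once a CUDA device is known to be present, so
        # CPU-only installs never exercise this dependency.
        from transformers import BitsAndBytesConfig
        return BitsAndBytesConfig(load_in_8bit=load_in_8bit)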
@@ -107,7 +105,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                 bnb_4bit_compute_dtype = None,
                 load_in_8bit=True,
             )
 
         try:
             if request.Type == "AutoModelForCausalLM":
                 if XPU:
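For reference, this is the shape of the bitsandbytes quantization config the hunk is editing. A hedged, self-contained example — the values are illustrative, not the server's exact settings:

    import torch
    from transformers import BitsAndBytesConfig

    # 4-bit NF4 quantization with half-precision compute for the matmuls.
    quantization_4bit = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
    )

    # 8-bit variant, as in the diff context above.
    quantization_8bit = BitsAndBytesConfig(load_in_8bit=True)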
@@ -189,6 +187,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                                                       device=device_map)
                 self.OV = True
             else:
+                print("Automodel", file=sys.stderr)
                 self.model = AutoModel.from_pretrained(model_name,
                                                        trust_remote_code=request.TrustRemoteCode,
                                                        use_safetensors=True,
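The added print is a small stderr trace marking which loader branch ran. A minimal sketch of that fallback branch in isolation (the model name is a placeholder assumption; the server takes it from request.Model):

    import sys
    from transformers import AutoModel

    model_name = "bert-base-uncased"  # placeholder; not LocalAI's default
    print("Automodel", file=sys.stderr)  # trace which loader branch ran
    model = AutoModel.from_pretrained(
        model_name,
        trust_remote_code=False,  # enable only for models whose code you trust
        use_safetensors=True,     # prefer safetensors over pickle checkpoints
    )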
requirements.txt:

@@ -3,4 +3,7 @@ transformers
 grpcio==1.64.0
 protobuf
 torch
 certifi
+intel-extension-for-transformers
+bitsandbytes
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
run.sh:

@@ -1,4 +1,10 @@
 #!/bin/bash
 source $(dirname $0)/../common/libbackend.sh
 
+if [ -d "/opt/intel" ]; then
+    # Assumes we are using the Intel oneAPI container image
+    # https://github.com/intel/intel-extension-for-pytorch/issues/538
+    export XPU=1
+fi
+
 startBackend $@
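The launcher, not the Python code, decides when the XPU path applies: if the oneAPI install tree exists, it exports XPU=1 before starting the backend, and the module-level check in the first hunk picks it up. The same detection rendered in Python, as a sketch only (the real script does this in bash):

    import os

    # Presence of the oneAPI tree signals the Intel container image.
    if os.path.isdir("/opt/intel"):
        os.environ["XPU"] = "1"  # consumed by the backend's module-level check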