From 5d1018495f919732c37977fa57cf2d9914e3bc6b Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Thu, 7 Mar 2024 14:37:45 +0100
Subject: [PATCH] feat(intel): add diffusers/transformers support (#1746)

* feat(intel): add diffusers support
* try to consume upstream container image
* Debug
* Manually install deps
* Map transformers/hf cache dir to modelpath if not specified
* fix(compel): update initialization, pass by all gRPC options
* fix: add dependencies, implement transformers for xpu
* base it from the oneapi image
* Add pillow
* set threads if specified when launching the API
* Skip conda install if intel
* defaults to non-intel
* ci: add to pipelines
* prepare compel only if enabled
* Skip conda install if intel
* fix cleanup
* Disable compel by default
* Install torch 2.1.0 with Intel
* Skip conda on some setups
* Detect python
* Quiet output
* Do not override system python with conda
* Prefer python3
* Fixups
* exllama2: do not install without conda (overrides pytorch version)
* exllama/exllama2: do not install if not using cuda
* Add missing dataset dependency
* Small fixups, symlink to python, add requirements
* Add neural_speed to the deps
* correctly handle model offloading
* fix: device_map == xpu
* go back at calling python, fixed at dockerfile level
* Exllama2 restricted to only nvidia gpus
* Tokenizer to xpu
---
 .github/workflows/image-pr.yml                 | 10 +++-
 .github/workflows/image.yml                    | 16 +++++++
 Dockerfile                                     | 34 ++++++++------
 Makefile                                       |  7 +++
 .../python/common-env/transformers/Makefile    |  7 +++
 .../python/common-env/transformers/install.sh  | 34 ++++++++----
 backend/python/diffusers/Makefile              |  7 +++
 backend/python/diffusers/backend_diffusers.py  | 25 +++++++---
 backend/python/diffusers/install.sh            | 46 +++++++++++++++----
 backend/python/diffusers/run.sh                | 13 ++++--
 backend/python/exllama/install.sh              |  5 ++
 backend/python/exllama2/install.sh             |  8 +++-
 backend/python/mamba/install.sh                |  3 +-
 backend/python/petals/Makefile                 |  2 +-
 backend/python/petals/install.sh               |  5 ++
 backend/python/transformers/run.sh             | 12 +++--
 .../transformers/transformers_server.py        | 27 ++++++++++-
 backend/python/vall-e-x/Makefile               |  4 ++
 backend/python/vall-e-x/install.sh             | 11 +++--
 core/backend/image.go                          | 23 +++------
 core/backend/llm.go                            |  7 ++-
 core/backend/options.go                        | 18 ++++++--
 pkg/model/initializers.go                      |  7 +++
 23 files changed, 250 insertions(+), 81 deletions(-)
 create mode 100644 backend/python/petals/install.sh

diff --git a/.github/workflows/image-pr.yml b/.github/workflows/image-pr.yml
index 527a8479..2e9a0afe 100644
--- a/.github/workflows/image-pr.yml
+++ b/.github/workflows/image-pr.yml
@@ -59,6 +59,14 @@ jobs:
           image-type: 'extras'
           base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
           runs-on: 'arc-runner-set'
+        - build-type: 'sycl_f16'
+          platforms: 'linux/amd64'
+          tag-latest: 'false'
+          base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+          tag-suffix: 'sycl-f16-ffmpeg'
+          ffmpeg: 'true'
+          image-type: 'extras'
+          runs-on: 'arc-runner-set'
   core-image-build:
     uses: ./.github/workflows/image_build.yml
     with:
@@ -105,4 +113,4 @@ jobs:
           ffmpeg: 'true'
           image-type: 'core'
           runs-on: 'ubuntu-latest'
-          base-image: "ubuntu:22.04"
+          base-image: "ubuntu:22.04"
\ No newline at end of file
diff --git a/.github/workflows/image.yml b/.github/workflows/image.yml
index a9620baa..2a7fac27 100644
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -120,6 +120,22 @@ jobs:
           image-type: 'extras'
           base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
           runs-on: 'arc-runner-set'
+        - build-type: 'sycl_f16'
+          platforms: 'linux/amd64'
+          tag-latest: 'false'
+          base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+          tag-suffix: '-sycl-f16-ffmpeg'
+          ffmpeg: 'true'
+          image-type: 'extras'
+          runs-on: 'arc-runner-set'
+        - build-type: 'sycl_f32'
+          platforms: 'linux/amd64'
+          tag-latest: 'false'
+          base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+          tag-suffix: '-sycl-f32-ffmpeg'
+          ffmpeg: 'true'
+          image-type: 'extras'
+          runs-on: 'arc-runner-set'
         # Core images
         - build-type: 'sycl_f16'
           platforms: 'linux/amd64'
diff --git a/Dockerfile b/Dockerfile
index a04a866e..fd365962 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,6 +4,8 @@ ARG BASE_IMAGE=ubuntu:22.04
 # extras or core
 FROM ${BASE_IMAGE} as requirements-core
 
+USER root
+
 ARG GO_VERSION=1.21.7
 ARG BUILD_TYPE
 ARG CUDA_MAJOR_VERSION=11
@@ -21,7 +23,7 @@ RUN apt-get update && \
     apt-get install -y ca-certificates curl patch pip cmake git && apt-get clean
 
 # Install Go
-RUN curl -L -s https://go.dev/dl/go$GO_VERSION.linux-$TARGETARCH.tar.gz | tar -v -C /usr/local -xz
+RUN curl -L -s https://go.dev/dl/go$GO_VERSION.linux-$TARGETARCH.tar.gz | tar -C /usr/local -xz
 ENV PATH $PATH:/usr/local/go/bin
 
 COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
@@ -79,6 +81,10 @@ RUN pip install --upgrade pip
 RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
 RUN apt-get install -y espeak-ng espeak && apt-get clean
 
+RUN if [ ! -e /usr/bin/python ]; then \
+	ln -s /usr/bin/python3 /usr/bin/python \
+    ; fi
+
 ###################################
 ###################################
 
@@ -166,43 +172,43 @@ COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/
 
 ## Duplicated from Makefile to avoid having a big layer that's hard to push
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/autogptq \
+	make -C backend/python/autogptq \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/bark \
+	make -C backend/python/bark \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/diffusers \
+	make -C backend/python/diffusers \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/vllm \
+	make -C backend/python/vllm \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/mamba \
+	make -C backend/python/mamba \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers \
+	make -C backend/python/sentencetransformers \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/transformers \
+	make -C backend/python/transformers \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/vall-e-x \
+	make -C backend/python/vall-e-x \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/exllama \
+	make -C backend/python/exllama \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/exllama2 \
+	make -C backend/python/exllama2 \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/petals \
+	make -C backend/python/petals \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/transformers-musicgen \
+	make -C backend/python/transformers-musicgen \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/coqui \
+	make -C backend/python/coqui \
    ; fi
 
 # Make sure the models directory exists
diff --git a/Makefile b/Makefile
index 5159431a..f91fb47e 100644
--- a/Makefile
+++ b/Makefile
@@ -557,3 +557,10 @@ docker-image-intel:
 		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 		--build-arg GO_TAGS="none" \
 		--build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) .
+
+docker-image-intel-xpu:
+	docker build \
+		--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04 \
+		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
+		--build-arg GO_TAGS="none" \
+		--build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) .
\ No newline at end of file
diff --git a/backend/python/common-env/transformers/Makefile b/backend/python/common-env/transformers/Makefile
index 1cd71ab1..797af083 100644
--- a/backend/python/common-env/transformers/Makefile
+++ b/backend/python/common-env/transformers/Makefile
@@ -8,6 +8,13 @@ ifeq ($(BUILD_TYPE), hipblas)
 	CONDA_ENV_PATH = "transformers-rocm.yml"
 endif
 
+# Intel GPU are supposed to have dependencies installed in the main python
+# environment, so we skip conda installation for SYCL builds.
+# https://github.com/intel/intel-extension-for-pytorch/issues/538
+ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+export SKIP_CONDA=1
+endif
+
 .PHONY: transformers
 transformers:
 	@echo "Installing $(CONDA_ENV_PATH)..."
diff --git a/backend/python/common-env/transformers/install.sh b/backend/python/common-env/transformers/install.sh
index 42965bdb..e268fcc8 100644
--- a/backend/python/common-env/transformers/install.sh
+++ b/backend/python/common-env/transformers/install.sh
@@ -1,24 +1,38 @@
 #!/bin/bash
 set -ex
 
+SKIP_CONDA=${SKIP_CONDA:-0}
+
 # Check if environment exist
 conda_env_exists(){
     ! conda list --name "${@}" >/dev/null 2>/dev/null
 }
 
-if conda_env_exists "transformers" ; then
-    echo "Creating virtual environment..."
-    conda env create --name transformers --file $1
-    echo "Virtual environment created."
-else
-    echo "Virtual environment already exists."
+if [ $SKIP_CONDA -eq 1 ]; then
+    echo "Skipping conda environment installation"
+else
+    export PATH=$PATH:/opt/conda/bin
+    if conda_env_exists "transformers" ; then
+        echo "Creating virtual environment..."
+        conda env create --name transformers --file $1
+        echo "Virtual environment created."
+    else
+        echo "Virtual environment already exists."
+    fi
+fi
+
+if [ -d "/opt/intel" ]; then
+    # Intel GPU: If the directory exists, we assume we are using the intel image
+    # (no conda env)
+    # https://github.com/intel/intel-extension-for-pytorch/issues/538
+    pip install intel-extension-for-transformers datasets sentencepiece tiktoken neural_speed
 fi
 
 if [ "$PIP_CACHE_PURGE" = true ] ; then
-    export PATH=$PATH:/opt/conda/bin
-
-    # Activate conda environment
-    source activate transformers
+    if [ $SKIP_CONDA -eq 0 ]; then
+        # Activate conda environment
+        source activate transformers
+    fi
 
     pip cache purge
 fi
\ No newline at end of file
diff --git a/backend/python/diffusers/Makefile b/backend/python/diffusers/Makefile
index 70a62b60..40e1d1a7 100644
--- a/backend/python/diffusers/Makefile
+++ b/backend/python/diffusers/Makefile
@@ -4,6 +4,13 @@ ifeq ($(BUILD_TYPE), hipblas)
 	export CONDA_ENV_PATH = "diffusers-rocm.yml"
 endif
 
+# Intel GPU are supposed to have dependencies installed in the main python
+# environment, so we skip conda installation for SYCL builds.
+# https://github.com/intel/intel-extension-for-pytorch/issues/538
+ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+export SKIP_CONDA=1
+endif
+
 .PHONY: diffusers
 diffusers:
 	@echo "Installing $(CONDA_ENV_PATH)..."
diff --git a/backend/python/diffusers/backend_diffusers.py b/backend/python/diffusers/backend_diffusers.py
index 6780cae6..ec2dea60 100755
--- a/backend/python/diffusers/backend_diffusers.py
+++ b/backend/python/diffusers/backend_diffusers.py
@@ -21,14 +21,15 @@ from diffusers import StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipelin
 from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
 from diffusers.pipelines.stable_diffusion import safety_checker
 from diffusers.utils import load_image,export_to_video
-from compel import Compel
+from compel import Compel, ReturnedEmbeddingsType
 
 from transformers import CLIPTextModel
 from safetensors.torch import load_file
 
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
-COMPEL=os.environ.get("COMPEL", "1") == "1"
+COMPEL=os.environ.get("COMPEL", "0") == "1"
+XPU=os.environ.get("XPU", "0") == "1"
 CLIPSKIP=os.environ.get("CLIPSKIP", "1") == "1"
 SAFETENSORS=os.environ.get("SAFETENSORS", "1") == "1"
 CHUNK_SIZE=os.environ.get("CHUNK_SIZE", "8")
@@ -36,6 +37,10 @@ FPS=os.environ.get("FPS", "7")
 DISABLE_CPU_OFFLOAD=os.environ.get("DISABLE_CPU_OFFLOAD", "0") == "1"
 FRAMES=os.environ.get("FRAMES", "64")
 
+if XPU:
+    import intel_extension_for_pytorch as ipex
+    print(ipex.xpu.get_device_name(0))
+
 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
 
@@ -231,8 +236,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 
             if request.SchedulerType != "":
                 self.pipe.scheduler = get_scheduler(request.SchedulerType, self.pipe.scheduler.config)
-            if not self.img2vid:
-                self.compel = Compel(tokenizer=self.pipe.tokenizer, text_encoder=self.pipe.text_encoder)
+
+            if COMPEL:
+                self.compel = Compel(
+                    tokenizer=[self.pipe.tokenizer, self.pipe.tokenizer_2 ],
+                    text_encoder=[self.pipe.text_encoder, self.pipe.text_encoder_2],
+                    returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
+                    requires_pooled=[False, True]
+                )
 
             if request.ControlNet:
@@ -247,6 +257,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                 self.pipe.to('cuda')
                 if self.controlnet:
                     self.controlnet.to('cuda')
+            if XPU:
+                self.pipe = self.pipe.to("xpu")
             # Assume directory from request.ModelFile.
             # Only if request.LoraAdapter it's not an absolute path
             if request.LoraAdapter and request.ModelFile != "" and not os.path.isabs(request.LoraAdapter) and request.LoraAdapter:
@@ -386,8 +398,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 
         image = {}
         if COMPEL:
-            conditioning = self.compel.build_conditioning_tensor(prompt)
-            kwargs["prompt_embeds"]= conditioning
+            conditioning, pooled = self.compel.build_conditioning_tensor(prompt)
+            kwargs["prompt_embeds"] = conditioning
+            kwargs["pooled_prompt_embeds"] = pooled
             # pass the kwargs dictionary to the self.pipe method
             image = self.pipe(
                 guidance_scale=self.cfg_scale,
diff --git a/backend/python/diffusers/install.sh b/backend/python/diffusers/install.sh
index 0429826e..d83ec0be 100755
--- a/backend/python/diffusers/install.sh
+++ b/backend/python/diffusers/install.sh
@@ -1,24 +1,50 @@
 #!/bin/bash
 set -ex
 
+SKIP_CONDA=${SKIP_CONDA:-0}
+
 # Check if environment exist
 conda_env_exists(){
     ! conda list --name "${@}" >/dev/null 2>/dev/null
 }
 
-if conda_env_exists "diffusers" ; then
-    echo "Creating virtual environment..."
-    conda env create --name diffusers --file $1
-    echo "Virtual environment created."
-else
-    echo "Virtual environment already exists."
+if [ $SKIP_CONDA -eq 1 ]; then
+    echo "Skipping conda environment installation"
+else
+    export PATH=$PATH:/opt/conda/bin
+    if conda_env_exists "diffusers" ; then
+        echo "Creating virtual environment..."
+        conda env create --name diffusers --file $1
+        echo "Virtual environment created."
+    else
+        echo "Virtual environment already exists."
+    fi
+fi
+
+if [ -d "/opt/intel" ]; then
+    # Intel GPU: If the directory exists, we assume we are using the Intel image
+    # https://github.com/intel/intel-extension-for-pytorch/issues/538
+    pip install torch==2.1.0a0 \
+                torchvision==0.16.0a0 \
+                torchaudio==2.1.0a0 \
+                intel-extension-for-pytorch==2.1.10+xpu \
+                --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+
+    pip install google-api-python-client \
+                grpcio \
+                grpcio-tools \
+                diffusers==0.24.0 \
+                transformers>=4.25.1 \
+                accelerate \
+                compel==2.0.2 \
+                Pillow
 fi
 
 if [ "$PIP_CACHE_PURGE" = true ] ; then
-    export PATH=$PATH:/opt/conda/bin
-
-    # Activate conda environment
-    source activate diffusers
+    if [ $SKIP_CONDA -ne 1 ]; then
+        # Activate conda environment
+        source activate diffusers
+    fi
 
     pip cache purge
 fi
\ No newline at end of file
diff --git a/backend/python/diffusers/run.sh b/backend/python/diffusers/run.sh
index 8e3e1bbf..69b25d50 100755
--- a/backend/python/diffusers/run.sh
+++ b/backend/python/diffusers/run.sh
@@ -3,10 +3,15 @@
 ##
 ## A bash script wrapper that runs the diffusers server with conda
 
-export PATH=$PATH:/opt/conda/bin
-
-# Activate conda environment
-source activate diffusers
+if [ -d "/opt/intel" ]; then
+    # Assumes we are using the Intel oneAPI container image
+    # https://github.com/intel/intel-extension-for-pytorch/issues/538
+    export XPU=1
+else
+    export PATH=$PATH:/opt/conda/bin
+    # Activate conda environment
+    source activate diffusers
+fi
 
 # get the directory where the bash script is located
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
diff --git a/backend/python/exllama/install.sh b/backend/python/exllama/install.sh
index 702bb1fb..320e7f4d 100755
--- a/backend/python/exllama/install.sh
+++ b/backend/python/exllama/install.sh
@@ -3,6 +3,11 @@ set -ex
 
 export PATH=$PATH:/opt/conda/bin
 
+if [ "$BUILD_TYPE" != "cublas" ]; then
+    echo "[exllama] Attention!!! Nvidia GPU is required - skipping installation"
+    exit 0
+fi
+
 # Check if environment exist
 conda_env_exists(){
     ! conda list --name "${@}" >/dev/null 2>/dev/null
diff --git a/backend/python/exllama2/install.sh b/backend/python/exllama2/install.sh
index a6df3d37..858685b0 100755
--- a/backend/python/exllama2/install.sh
+++ b/backend/python/exllama2/install.sh
@@ -2,10 +2,14 @@ set -e
 ##
 ## A bash script installs the required dependencies of VALL-E-X and prepares the environment
 
-export PATH=$PATH:/opt/conda/bin
 export SHA=c0ddebaaaf8ffd1b3529c2bb654e650bce2f790f
 
-# Activate conda environment
+if [ "$BUILD_TYPE" != "cublas" ]; then
+    echo "[exllamav2] Attention!!! Nvidia GPU is required - skipping installation"
+    exit 0
+fi
+
+export PATH=$PATH:/opt/conda/bin
 source activate transformers
 
 echo $CONDA_PREFIX
diff --git a/backend/python/mamba/install.sh b/backend/python/mamba/install.sh
index e56b83c2..4ef26ece 100755
--- a/backend/python/mamba/install.sh
+++ b/backend/python/mamba/install.sh
@@ -2,13 +2,14 @@ set -e
 ##
 ## A bash script installs the required dependencies of VALL-E-X and prepares the environment
 
-export PATH=$PATH:/opt/conda/bin
 
 if [ "$BUILD_TYPE" != "cublas" ]; then
     echo "[mamba] Attention!!! nvcc is required - skipping installation"
     exit 0
 fi
 
+export PATH=$PATH:/opt/conda/bin
+
 # Activate conda environment
 source activate transformers
diff --git a/backend/python/petals/Makefile b/backend/python/petals/Makefile
index 4bd07b11..aa7778e1 100644
--- a/backend/python/petals/Makefile
+++ b/backend/python/petals/Makefile
@@ -1,7 +1,7 @@
 .PHONY: petals
 petals:
 	@echo "Creating virtual environment..."
-	@conda env create --name petals --file petals.yml
+	bash install.sh "petals.yml"
 	@echo "Virtual environment created."
 
 .PHONY: run
diff --git a/backend/python/petals/install.sh b/backend/python/petals/install.sh
new file mode 100644
index 00000000..97bcbb8a
--- /dev/null
+++ b/backend/python/petals/install.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+export PATH=$PATH:/opt/conda/bin
+
+conda env create --name petals --file $1
\ No newline at end of file
diff --git a/backend/python/transformers/run.sh b/backend/python/transformers/run.sh
index e6a42b7e..d09c1f5c 100755
--- a/backend/python/transformers/run.sh
+++ b/backend/python/transformers/run.sh
@@ -3,10 +3,16 @@
 ##
 ## A bash script wrapper that runs the transformers server with conda
 
-export PATH=$PATH:/opt/conda/bin
 
-# Activate conda environment
-source activate transformers
+if [ -d "/opt/intel" ]; then
+    # Assumes we are using the Intel oneAPI container image
+    # https://github.com/intel/intel-extension-for-pytorch/issues/538
+    export XPU=1
+else
+    export PATH=$PATH:/opt/conda/bin
+    # Activate conda environment
+    source activate transformers
+fi
 
 # get the directory where the bash script is located
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
diff --git a/backend/python/transformers/transformers_server.py b/backend/python/transformers/transformers_server.py
index fe0b815a..41112c44 100755
--- a/backend/python/transformers/transformers_server.py
+++ b/backend/python/transformers/transformers_server.py
@@ -16,7 +16,15 @@ import backend_pb2_grpc
 
 import grpc
 import torch
 import torch.cuda
-from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed
+
+XPU=os.environ.get("XPU", "0") == "1"
+if XPU:
+    import intel_extension_for_pytorch as ipex
+    from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM
+    from transformers import AutoTokenizer, AutoModel, set_seed
+else:
+    from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed
+
 
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 
@@ -69,12 +77,25 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         model_name = request.Model
         try:
             if request.Type == "AutoModelForCausalLM":
-                self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode)
+                if XPU:
+                    self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode,
+                                                                      device_map="xpu", load_in_4bit=True)
+                else:
+                    self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode)
             else:
                 self.model = AutoModel.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode)
             self.tokenizer = AutoTokenizer.from_pretrained(model_name)
             self.CUDA = False
+            self.XPU = False
+
+            if XPU:
+                self.XPU = True
+                try:
+                    print("Optimizing model", model_name, "to XPU.", file=sys.stderr)
+                    self.model = ipex.optimize_transformers(self.model, inplace=True, dtype=torch.float16, device="xpu")
+                except Exception as err:
+                    print("Not using XPU:", err, file=sys.stderr)
 
             if request.CUDA or torch.cuda.is_available():
                 try:
@@ -139,6 +160,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         inputs = self.tokenizer(request.Prompt, return_tensors="pt").input_ids
         if self.CUDA:
             inputs = inputs.to("cuda")
+        if XPU:
+            inputs = inputs.to("xpu")
 
         outputs = self.model.generate(inputs,max_new_tokens=max_tokens, temperature=request.Temperature, top_p=request.TopP)
diff --git a/backend/python/vall-e-x/Makefile b/backend/python/vall-e-x/Makefile
index 4804f12f..8f34f559 100644
--- a/backend/python/vall-e-x/Makefile
+++ b/backend/python/vall-e-x/Makefile
@@ -1,3 +1,7 @@
+ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+export SKIP_CONDA=1
+endif
+
 .PHONY: ttsvalle
 ttsvalle:
 	$(MAKE) -C ../common-env/transformers
diff --git a/backend/python/vall-e-x/install.sh b/backend/python/vall-e-x/install.sh
index 26ccdccd..a9c4117e 100644
--- a/backend/python/vall-e-x/install.sh
+++ b/backend/python/vall-e-x/install.sh
@@ -2,13 +2,16 @@
 ##
 ## A bash script installs the required dependencies of VALL-E-X and prepares the environment
 
-export PATH=$PATH:/opt/conda/bin
 export SHA=3faaf8ccadb154d63b38070caf518ce9309ea0f4
 
-# Activate conda environment
-source activate transformers
+SKIP_CONDA=${SKIP_CONDA:-0}
 
-echo $CONDA_PREFIX
+if [ $SKIP_CONDA -ne 1 ]; then
+    source activate transformers
+else
+    export PATH=$PATH:/opt/conda/bin
+    CONDA_PREFIX=$PWD
+fi
 
 git clone https://github.com/Plachtaa/VALL-E-X.git $CONDA_PREFIX/vall-e-x && pushd $CONDA_PREFIX/vall-e-x && git checkout -b build $SHA && popd
diff --git a/core/backend/image.go b/core/backend/image.go
index 60db48f9..79b8d4ba 100644
--- a/core/backend/image.go
+++ b/core/backend/image.go
@@ -8,27 +8,18 @@ import (
 )
 
 func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {
-
+	threads := backendConfig.Threads
+	if threads == 0 && appConfig.Threads != 0 {
+		threads = appConfig.Threads
+	}
+	gRPCOpts := gRPCModelOpts(backendConfig)
 	opts := modelOpts(backendConfig, appConfig, []model.Option{
 		model.WithBackendString(backendConfig.Backend),
 		model.WithAssetDir(appConfig.AssetsDestination),
-		model.WithThreads(uint32(backendConfig.Threads)),
+		model.WithThreads(uint32(threads)),
 		model.WithContext(appConfig.Context),
 		model.WithModel(backendConfig.Model),
-		model.WithLoadGRPCLoadModelOpts(&proto.ModelOptions{
-			CUDA:          backendConfig.CUDA || backendConfig.Diffusers.CUDA,
-			SchedulerType: backendConfig.Diffusers.SchedulerType,
-			PipelineType:  backendConfig.Diffusers.PipelineType,
-			CFGScale:      backendConfig.Diffusers.CFGScale,
-			LoraAdapter:   backendConfig.LoraAdapter,
-			LoraScale:     backendConfig.LoraScale,
-			LoraBase:      backendConfig.LoraBase,
-			IMG2IMG:       backendConfig.Diffusers.IMG2IMG,
-			CLIPModel:     backendConfig.Diffusers.ClipModel,
-			CLIPSubfolder: backendConfig.Diffusers.ClipSubFolder,
-			CLIPSkip:      int32(backendConfig.Diffusers.ClipSkip),
-			ControlNet:    backendConfig.Diffusers.ControlNet,
-		}),
+		model.WithLoadGRPCLoadModelOpts(gRPCOpts),
 	})
 
 	inferenceModel, err := loader.BackendLoader(
diff --git a/core/backend/llm.go b/core/backend/llm.go
index f16878c0..54e26188 100644
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -28,7 +28,10 @@ type TokenUsage struct {
 
 func ModelInference(ctx context.Context, s string, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
 	modelFile := c.Model
-
+	threads := c.Threads
+	if threads == 0 && o.Threads != 0 {
+		threads = o.Threads
+	}
 	grpcOpts := gRPCModelOpts(c)
 
 	var inferenceModel grpc.Backend
@@ -36,7 +39,7 @@ func ModelInference(ctx context.Context, s string, images []string, loader *mode
 
 	opts := modelOpts(c, o, []model.Option{
 		model.WithLoadGRPCLoadModelOpts(grpcOpts),
-		model.WithThreads(uint32(c.Threads)), // some models uses this to allocate threads during startup
+		model.WithThreads(uint32(threads)), // some models uses this to allocate threads during startup
 		model.WithAssetDir(o.AssetsDestination),
 		model.WithModel(modelFile),
 		model.WithContext(o.Context),
diff --git a/core/backend/options.go b/core/backend/options.go
index d2bbb2b8..3af6f679 100644
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -40,11 +40,23 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
 	}
 
 	return &pb.ModelOptions{
+		CUDA:           c.CUDA || c.Diffusers.CUDA,
+		SchedulerType:  c.Diffusers.SchedulerType,
+		PipelineType:   c.Diffusers.PipelineType,
+		CFGScale:       c.Diffusers.CFGScale,
+		LoraAdapter:    c.LoraAdapter,
+		LoraScale:      c.LoraScale,
+		F16Memory:      c.F16,
+		LoraBase:       c.LoraBase,
+		IMG2IMG:        c.Diffusers.IMG2IMG,
+		CLIPModel:      c.Diffusers.ClipModel,
+		CLIPSubfolder:  c.Diffusers.ClipSubFolder,
+		CLIPSkip:       int32(c.Diffusers.ClipSkip),
+		ControlNet:     c.Diffusers.ControlNet,
 		ContextSize:    int32(c.ContextSize),
 		Seed:           int32(c.Seed),
 		NBatch:         int32(b),
 		NoMulMatQ:      c.NoMulMatQ,
-		CUDA:           c.CUDA, // diffusers, transformers
 		DraftModel:     c.DraftModel,
 		AudioPath:      c.VallE.AudioPath,
 		Quantization:   c.Quantization,
@@ -58,12 +70,8 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		YarnAttnFactor: c.YarnAttnFactor,
 		YarnBetaFast:   c.YarnBetaFast,
 		YarnBetaSlow:   c.YarnBetaSlow,
-		LoraAdapter:    c.LoraAdapter,
-		LoraBase:       c.LoraBase,
-		LoraScale:      c.LoraScale,
 		NGQA:           c.NGQA,
 		RMSNormEps:     c.RMSNormEps,
-		F16Memory:      c.F16,
 		MLock:          c.MMlock,
 		RopeFreqBase:   c.RopeFreqBase,
 		RopeScaling:    c.RopeScaling,
diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go
index fce44fe1..1e2af8f9 100644
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -69,6 +69,13 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
 		return fmt.Sprintf("127.0.0.1:%d", port), nil
 	}
 
+	// If no specific model path is set for transformers/HF, set it to the model path
+	for _, env := range []string{"HF_HOME", "TRANSFORMERS_CACHE", "HUGGINGFACE_HUB_CACHE"} {
+		if os.Getenv(env) == "" {
+			os.Setenv(env, ml.ModelPath)
+		}
+	}
+
 	// Check if the backend is provided as external
 	if uri, ok := o.externalBackends[backend]; ok {
 		log.Debug().Msgf("Loading external backend: %s", uri)
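
For reference, a minimal build-and-run sketch using the docker-image-intel-xpu target introduced above. The image tag, models directory, port, and GPU device flag below are assumptions about a typical LocalAI setup, not values defined by this patch:

    # Build the extras image on top of the Intel oneAPI base (BUILD_TYPE=sycl_f32);
    # DOCKER_IMAGE and IMAGE_TYPE are Makefile variables used by the new target.
    make docker-image-intel-xpu DOCKER_IMAGE=localai-intel-xpu IMAGE_TYPE=extras

    # Run with the Intel GPU exposed. Because /opt/intel exists in the oneAPI image,
    # the diffusers/transformers run.sh scripts export XPU=1 and skip conda activation.
    # Compel is now disabled by default in the diffusers backend (COMPEL=0); pass
    # -e COMPEL=1 to re-enable it.
    docker run -it --rm --device /dev/dri -p 8080:8080 -v $PWD/models:/models localai-intel-xpu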