feat(intel): add diffusers/transformers support (#1746)

* feat(intel): add diffusers support
* try to consume upstream container image
* Debug
* Manually install deps
* Map transformers/hf cache dir to modelpath if not specified
* fix(compel): update initialization, pass by all gRPC options
* fix: add dependencies, implement transformers for xpu
* base it from the oneapi image
* Add pillow
* set threads if specified when launching the API
* Skip conda install if intel
* defaults to non-intel
* ci: add to pipelines
* prepare compel only if enabled
* Skip conda install if intel
* fix cleanup
* Disable compel by default
* Install torch 2.1.0 with Intel
* Skip conda on some setups
* Detect python
* Quiet output
* Do not override system python with conda
* Prefer python3
* Fixups
* exllama2: do not install without conda (overrides pytorch version)
* exllama/exllama2: do not install if not using cuda
* Add missing dataset dependency
* Small fixups, symlink to python, add requirements
* Add neural_speed to the deps
* correctly handle model offloading
* fix: device_map == xpu
* go back at calling python, fixed at dockerfile level
* Exllama2 restricted to only nvidia gpus
* Tokenizer to xpu
2025-06-01 06:50:44 +00:00 · 2024-03-07 14:37:45 +01:00 · 2024-03-07 14:37:45 +01:00 · 5d1018495f
commit 5d1018495f
parent ad6fd7a991
23 changed files with 250 additions and 81 deletions
--- a/.github/workflows/image-pr.yml
+++ b/.github/workflows/image-pr.yml
@ -59,6 +59,14 @@ jobs:
            image-type: 'extras'
            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
            runs-on: 'arc-runner-set'
+          - build-type: 'sycl_f16'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            tag-suffix: '-sycl-f16-ffmpeg' # leading '-' matches the sycl_f16 extras suffix used in image.yml
+            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
  core-image-build:
    uses: ./.github/workflows/image_build.yml
    with:
@ -105,4 +113,4 @@ jobs:
            ffmpeg: 'true'
            image-type: 'core'
            runs-on: 'ubuntu-latest'
-            base-image: "ubuntu:22.04"
+            base-image: "ubuntu:22.04"
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@ -120,6 +120,22 @@ jobs:
            image-type: 'extras'
            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
            runs-on: 'arc-runner-set'
+          - build-type: 'sycl_f16'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            tag-suffix: '-sycl-f16-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+          - build-type: 'sycl_f32'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            tag-suffix: '-sycl-f32-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
          # Core images
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
--- a/Dockerfile
+++ b/Dockerfile
@ -4,6 +4,8 @@ ARG BASE_IMAGE=ubuntu:22.04
 # extras or core
 FROM ${BASE_IMAGE} as requirements-core

+USER root
+
 ARG GO_VERSION=1.21.7
 ARG BUILD_TYPE
 ARG CUDA_MAJOR_VERSION=11
@ -21,7 +23,7 @@ RUN apt-get update && \
    apt-get install -y ca-certificates curl patch pip cmake git && apt-get clean

 # Install Go
-RUN curl -L -s https://go.dev/dl/go$GO_VERSION.linux-$TARGETARCH.tar.gz | tar -v -C /usr/local -xz
+RUN curl -L -s https://go.dev/dl/go$GO_VERSION.linux-$TARGETARCH.tar.gz | tar -C /usr/local -xz
 ENV PATH $PATH:/usr/local/go/bin

 COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
@ -79,6 +81,10 @@ RUN pip install --upgrade pip
 RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
 RUN apt-get install -y espeak-ng espeak && apt-get clean

+RUN if [ ! -e /usr/bin/python ]; then \
+	  ln -s /usr/bin/python3 /usr/bin/python \
+    ; fi
+
 ###################################
 ###################################

@ -166,43 +172,43 @@ COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/

 ## Duplicated from Makefile to avoid having a big layer that's hard to push
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/autogptq \
+	 make -C backend/python/autogptq \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/bark \
+	 make -C backend/python/bark \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/diffusers \
+	 make -C backend/python/diffusers \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/vllm \
+	 make -C backend/python/vllm \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/mamba \
+	 make -C backend/python/mamba \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers \
+	 make -C backend/python/sentencetransformers \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/transformers \
+	 make -C backend/python/transformers \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/vall-e-x \
+	 make -C backend/python/vall-e-x \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/exllama \
+	 make -C backend/python/exllama \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-    PATH=$PATH:/opt/conda/bin make -C backend/python/exllama2 \
+     make -C backend/python/exllama2 \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/petals \
+	 make -C backend/python/petals \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/transformers-musicgen \
+	 make -C backend/python/transformers-musicgen \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/coqui \
+	 make -C backend/python/coqui \
    ; fi

 # Make sure the models directory exists
--- a/Makefile
+++ b/Makefile
@ -557,3 +557,10 @@ docker-image-intel:
 		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 		--build-arg GO_TAGS="none" \
 		--build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) .
+
+docker-image-intel-xpu:
+	docker build \
+		--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04 \
+		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
+		--build-arg GO_TAGS="none" \
+		--build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) .
--- a/backend/python/common-env/transformers/Makefile
+++ b/backend/python/common-env/transformers/Makefile
@ -8,6 +8,13 @@ ifeq ($(BUILD_TYPE), hipblas)
 	CONDA_ENV_PATH = "transformers-rocm.yml"
 endif

+# Intel GPU are supposed to have dependencies installed in the main python
+# environment, so we skip conda installation for SYCL builds.
+# https://github.com/intel/intel-extension-for-pytorch/issues/538
+ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+export SKIP_CONDA=1
+endif
+
 .PHONY: transformers
 transformers:
 	@echo "Installing $(CONDA_ENV_PATH)..."
--- a/backend/python/common-env/transformers/install.sh
+++ b/backend/python/common-env/transformers/install.sh
@ -1,24 +1,38 @@
 #!/bin/bash
 set -ex

+SKIP_CONDA=${SKIP_CONDA:-0}
+
 # Check if environment exist
 conda_env_exists(){
    ! conda list --name "${@}" >/dev/null 2>/dev/null
 }

-if conda_env_exists "transformers" ; then
-    echo "Creating virtual environment..."
-    conda env create --name transformers --file $1
-    echo "Virtual environment created."
-else 
-    echo "Virtual environment already exists."
+if [ $SKIP_CONDA -eq 1 ]; then
+    echo "Skipping conda environment installation"
+else
+    export PATH=$PATH:/opt/conda/bin
+    if conda_env_exists "transformers" ; then
+        echo "Creating virtual environment..."
+        conda env create --name transformers --file $1
+        echo "Virtual environment created."
+    else 
+        echo "Virtual environment already exists."
+    fi
+fi
+
+if [ -d "/opt/intel" ]; then
+    # Intel GPU: If the directory exists, we assume we are using the intel image
+    # (no conda env)
+    # https://github.com/intel/intel-extension-for-pytorch/issues/538
+    pip install intel-extension-for-transformers datasets sentencepiece tiktoken neural_speed
 fi

 if [ "$PIP_CACHE_PURGE" = true ] ; then
-    export PATH=$PATH:/opt/conda/bin
-
-    # Activate conda environment
-    source activate transformers
+    if [ $SKIP_CONDA -eq 0 ]; then
+        # Activate conda environment
+        source activate transformers
+    fi

    pip cache purge
 fi
--- a/backend/python/diffusers/Makefile
+++ b/backend/python/diffusers/Makefile
@ -4,6 +4,13 @@ ifeq ($(BUILD_TYPE), hipblas)
 export CONDA_ENV_PATH = "diffusers-rocm.yml"
 endif

+# Intel GPU are supposed to have dependencies installed in the main python
+# environment, so we skip conda installation for SYCL builds.
+# https://github.com/intel/intel-extension-for-pytorch/issues/538
+ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+export SKIP_CONDA=1
+endif
+
 .PHONY: diffusers
 diffusers:
 	@echo "Installing $(CONDA_ENV_PATH)..."
--- a/backend/python/diffusers/backend_diffusers.py
+++ b/backend/python/diffusers/backend_diffusers.py
@ -21,14 +21,15 @@ from diffusers import StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipelin
 from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
 from diffusers.pipelines.stable_diffusion import safety_checker
 from diffusers.utils import load_image,export_to_video
-from compel import Compel
+from compel import Compel, ReturnedEmbeddingsType

 from transformers import CLIPTextModel
 from safetensors.torch import load_file


 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
-COMPEL=os.environ.get("COMPEL", "1") == "1"
+COMPEL=os.environ.get("COMPEL", "0") == "1"
+XPU=os.environ.get("XPU", "0") == "1"
 CLIPSKIP=os.environ.get("CLIPSKIP", "1") == "1"
 SAFETENSORS=os.environ.get("SAFETENSORS", "1") == "1"
 CHUNK_SIZE=os.environ.get("CHUNK_SIZE", "8")
@ -36,6 +37,10 @@ FPS=os.environ.get("FPS", "7")
 DISABLE_CPU_OFFLOAD=os.environ.get("DISABLE_CPU_OFFLOAD", "0") == "1"
 FRAMES=os.environ.get("FRAMES", "64")

+if XPU:
+    import intel_extension_for_pytorch as ipex
+    print(ipex.xpu.get_device_name(0))
+
 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))

@ -231,8 +236,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            if request.SchedulerType != "":
                self.pipe.scheduler = get_scheduler(request.SchedulerType, self.pipe.scheduler.config)
                
-            if not self.img2vid:
-                self.compel = Compel(tokenizer=self.pipe.tokenizer, text_encoder=self.pipe.text_encoder)
+            if COMPEL:
+                self.compel = Compel(
+                    tokenizer=[self.pipe.tokenizer, self.pipe.tokenizer_2 ], 
+                    text_encoder=[self.pipe.text_encoder, self.pipe.text_encoder_2],
+                    returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
+                    requires_pooled=[False, True]
+                    )


            if request.ControlNet:
@ -247,6 +257,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                self.pipe.to('cuda')
                if self.controlnet:
                    self.controlnet.to('cuda')
+            if XPU:
+                self.pipe = self.pipe.to("xpu")
            # Assume directory from request.ModelFile.
            # Only if request.LoraAdapter it's not an absolute path
            if request.LoraAdapter and request.ModelFile != "" and not os.path.isabs(request.LoraAdapter) and request.LoraAdapter:
@ -386,8 +398,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

        image = {}
        if COMPEL:
-            conditioning = self.compel.build_conditioning_tensor(prompt)
-            kwargs["prompt_embeds"]= conditioning
+            conditioning, pooled = self.compel.build_conditioning_tensor(prompt)
+            kwargs["prompt_embeds"] = conditioning
+            kwargs["pooled_prompt_embeds"] = pooled
            # pass the kwargs dictionary to the self.pipe method
            image = self.pipe(
                guidance_scale=self.cfg_scale,
--- a/backend/python/diffusers/install.sh
+++ b/backend/python/diffusers/install.sh
@ -1,24 +1,50 @@
 #!/bin/bash
 set -ex

+SKIP_CONDA=${SKIP_CONDA:-0}
+
 # Check if environment exist
 conda_env_exists(){
    ! conda list --name "${@}" >/dev/null 2>/dev/null
 }

-if conda_env_exists "diffusers" ; then
-    echo "Creating virtual environment..."
-    conda env create --name diffusers --file $1
-    echo "Virtual environment created."
-else 
-    echo "Virtual environment already exists."
+if [ $SKIP_CONDA -eq 1 ]; then
+    echo "Skipping conda environment installation"
+else
+    export PATH=$PATH:/opt/conda/bin
+    if conda_env_exists "diffusers" ; then
+        echo "Creating virtual environment..."
+        conda env create --name diffusers --file $1
+        echo "Virtual environment created."
+    else 
+        echo "Virtual environment already exists."
+    fi
+fi
+
+if [ -d "/opt/intel" ]; then
+    # Intel GPU: If the directory exists, we assume we are using the Intel image
+    # https://github.com/intel/intel-extension-for-pytorch/issues/538
+    pip install torch==2.1.0a0 \
+                torchvision==0.16.0a0 \
+                torchaudio==2.1.0a0 \
+                intel-extension-for-pytorch==2.1.10+xpu \
+                --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+    # Specifiers containing '>' must be quoted, otherwise the shell parses '>' as an output redirection.
+    pip install google-api-python-client \
+                grpcio \
+                grpcio-tools \
+                diffusers==0.24.0 \
+                "transformers>=4.25.1" \
+                accelerate \
+                compel==2.0.2 \
+                Pillow
+fi

 if [ "$PIP_CACHE_PURGE" = true ] ; then
-    export PATH=$PATH:/opt/conda/bin
-
-    # Activate conda environment
-    source activate diffusers
+    if [ $SKIP_CONDA -ne 1 ]; then
+        # Activate conda environment
+        source activate diffusers
+    fi

    pip cache purge
 fi
--- a/backend/python/diffusers/run.sh
+++ b/backend/python/diffusers/run.sh
@ -3,10 +3,15 @@
 ##
 ## A bash script wrapper that runs the diffusers server with conda

-export PATH=$PATH:/opt/conda/bin
-
-# Activate conda environment
-source activate diffusers
+if [ -d "/opt/intel" ]; then
+    # Assumes we are using the Intel oneAPI container image
+    # https://github.com/intel/intel-extension-for-pytorch/issues/538
+    export XPU=1
+else
+    export PATH=$PATH:/opt/conda/bin
+    # Activate conda environment
+    source activate diffusers
+fi

 # get the directory where the bash script is located
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
--- a/backend/python/exllama/install.sh
+++ b/backend/python/exllama/install.sh
@ -3,6 +3,11 @@ set -ex

 export PATH=$PATH:/opt/conda/bin

+if [ "$BUILD_TYPE" != "cublas" ]; then
+    echo "[exllama] Attention!!! Nvidia GPU is required - skipping installation"
+    exit 0
+fi
+
 # Check if environment exist
 conda_env_exists(){
    ! conda list --name "${@}" >/dev/null 2>/dev/null
--- a/backend/python/exllama2/install.sh
+++ b/backend/python/exllama2/install.sh
@ -2,10 +2,14 @@
 set -e
 ##
 ## A bash script installs the required dependencies of VALL-E-X and prepares the environment
-export PATH=$PATH:/opt/conda/bin
 export SHA=c0ddebaaaf8ffd1b3529c2bb654e650bce2f790f

-# Activate conda environment
+if [ "$BUILD_TYPE" != "cublas" ]; then
+    echo "[exllamav2] Attention!!! Nvidia GPU is required - skipping installation"
+    exit 0
+fi
+
+export PATH=$PATH:/opt/conda/bin
 source activate transformers

 echo $CONDA_PREFIX
--- a/backend/python/mamba/install.sh
+++ b/backend/python/mamba/install.sh
@ -2,13 +2,14 @@
 set -e
 ##
 ## A bash script installs the required dependencies of VALL-E-X and prepares the environment
-export PATH=$PATH:/opt/conda/bin

 if [ "$BUILD_TYPE" != "cublas" ]; then
    echo "[mamba] Attention!!! nvcc is required - skipping installation"
    exit 0
 fi

+export PATH=$PATH:/opt/conda/bin
+
 # Activate conda environment
 source activate transformers

--- a/backend/python/petals/Makefile
+++ b/backend/python/petals/Makefile
@ -1,7 +1,7 @@
 .PHONY: petals
 petals:
 	@echo "Creating virtual environment..."
-	@conda env create --name petals --file petals.yml
+	bash install.sh "petals.yml"
 	@echo "Virtual environment created."

 .PHONY: run
--- a/backend/python/petals/install.sh
+++ b/backend/python/petals/install.sh
@ -0,0 +1,5 @@
+#!/bin/bash
+
+export PATH=$PATH:/opt/conda/bin
+
+conda env create --name petals --file $1
--- a/backend/python/transformers/run.sh
+++ b/backend/python/transformers/run.sh
@ -3,10 +3,16 @@
 ##
 ## A bash script wrapper that runs the transformers server with conda

-export PATH=$PATH:/opt/conda/bin

-# Activate conda environment
-source activate transformers
+if [ -d "/opt/intel" ]; then
+    # Assumes we are using the Intel oneAPI container image
+    # https://github.com/intel/intel-extension-for-pytorch/issues/538
+    export XPU=1
+else
+    export PATH=$PATH:/opt/conda/bin
+    # Activate conda environment
+    source activate transformers
+fi

 # get the directory where the bash script is located
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
--- a/backend/python/transformers/transformers_server.py
+++ b/backend/python/transformers/transformers_server.py
@ -16,7 +16,15 @@ import backend_pb2_grpc
 import grpc
 import torch
 import torch.cuda
-from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed
+
+XPU=os.environ.get("XPU", "0") == "1"
+if XPU:
+    import intel_extension_for_pytorch as ipex
+    from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM
+    from transformers import AutoTokenizer, AutoModel, set_seed
+else:
+    from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed
+

 _ONE_DAY_IN_SECONDS = 60 * 60 * 24

@ -69,12 +77,25 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        model_name = request.Model
        try:
            if request.Type == "AutoModelForCausalLM":
-                self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode)
+                if XPU:
+                    self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode,
+                                              device_map="xpu", load_in_4bit=True)
+                else:
+                    self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode)
            else:
                self.model = AutoModel.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode)

            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.CUDA = False
+            self.XPU = False
+
+            if XPU:
+                self.XPU = True
+                try:
+                    print("Optimizing model", model_name, "to XPU.", file=sys.stderr)
+                    self.model = ipex.optimize_transformers(self.model, inplace=True, dtype=torch.float16, device="xpu")
+                except Exception as err:
+                    print("Not using XPU:", err, file=sys.stderr)

            if request.CUDA or torch.cuda.is_available():
                try:
@ -139,6 +160,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        inputs = self.tokenizer(request.Prompt, return_tensors="pt").input_ids
        if self.CUDA:
            inputs = inputs.to("cuda")
+        if XPU:
+            inputs = inputs.to("xpu")

        outputs = self.model.generate(inputs,max_new_tokens=max_tokens, temperature=request.Temperature, top_p=request.TopP)

--- a/backend/python/vall-e-x/Makefile
+++ b/backend/python/vall-e-x/Makefile
@ -1,3 +1,7 @@
+ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+export SKIP_CONDA=1
+endif
+
 .PHONY: ttsvalle
 ttsvalle:
 	$(MAKE) -C ../common-env/transformers
--- a/backend/python/vall-e-x/install.sh
+++ b/backend/python/vall-e-x/install.sh
@ -2,13 +2,16 @@

 ##
 ## A bash script installs the required dependencies of VALL-E-X and prepares the environment
-export PATH=$PATH:/opt/conda/bin
 export SHA=3faaf8ccadb154d63b38070caf518ce9309ea0f4

-# Activate conda environment
-source activate transformers
+SKIP_CONDA=${SKIP_CONDA:-0}

-echo $CONDA_PREFIX
+export PATH=$PATH:/opt/conda/bin  # must be set before `source activate`, which looks the script up on PATH
+if [ $SKIP_CONDA -ne 1 ]; then
+    source activate transformers
+else
+    CONDA_PREFIX=$PWD  # no conda env: the VALL-E-X checkout below lands under the current directory
+fi

 git clone https://github.com/Plachtaa/VALL-E-X.git $CONDA_PREFIX/vall-e-x && pushd $CONDA_PREFIX/vall-e-x && git checkout -b build $SHA && popd

--- a/core/backend/image.go
+++ b/core/backend/image.go
@ -8,27 +8,18 @@ import (
 )

 func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {
-
+	threads := backendConfig.Threads
+	if threads == 0 && appConfig.Threads != 0 {
+		threads = appConfig.Threads
+	}
+	gRPCOpts := gRPCModelOpts(backendConfig)
 	opts := modelOpts(backendConfig, appConfig, []model.Option{
 		model.WithBackendString(backendConfig.Backend),
 		model.WithAssetDir(appConfig.AssetsDestination),
-		model.WithThreads(uint32(backendConfig.Threads)),
+		model.WithThreads(uint32(threads)),
 		model.WithContext(appConfig.Context),
 		model.WithModel(backendConfig.Model),
-		model.WithLoadGRPCLoadModelOpts(&proto.ModelOptions{
-			CUDA:          backendConfig.CUDA || backendConfig.Diffusers.CUDA,
-			SchedulerType: backendConfig.Diffusers.SchedulerType,
-			PipelineType:  backendConfig.Diffusers.PipelineType,
-			CFGScale:      backendConfig.Diffusers.CFGScale,
-			LoraAdapter:   backendConfig.LoraAdapter,
-			LoraScale:     backendConfig.LoraScale,
-			LoraBase:      backendConfig.LoraBase,
-			IMG2IMG:       backendConfig.Diffusers.IMG2IMG,
-			CLIPModel:     backendConfig.Diffusers.ClipModel,
-			CLIPSubfolder: backendConfig.Diffusers.ClipSubFolder,
-			CLIPSkip:      int32(backendConfig.Diffusers.ClipSkip),
-			ControlNet:    backendConfig.Diffusers.ControlNet,
-		}),
+		model.WithLoadGRPCLoadModelOpts(gRPCOpts),
 	})

 	inferenceModel, err := loader.BackendLoader(
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@ -28,7 +28,10 @@ type TokenUsage struct {

 func ModelInference(ctx context.Context, s string, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
 	modelFile := c.Model
-
+	threads := c.Threads
+	if threads == 0 && o.Threads != 0 {
+		threads = o.Threads
+	}
 	grpcOpts := gRPCModelOpts(c)

 	var inferenceModel grpc.Backend
@ -36,7 +39,7 @@ func ModelInference(ctx context.Context, s string, images []string, loader *mode

 	opts := modelOpts(c, o, []model.Option{
 		model.WithLoadGRPCLoadModelOpts(grpcOpts),
-		model.WithThreads(uint32(c.Threads)), // some models uses this to allocate threads during startup
+		model.WithThreads(uint32(threads)), // some models uses this to allocate threads during startup
 		model.WithAssetDir(o.AssetsDestination),
 		model.WithModel(modelFile),
 		model.WithContext(o.Context),
--- a/core/backend/options.go
+++ b/core/backend/options.go
@ -40,11 +40,23 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
 	}

 	return &pb.ModelOptions{
+		CUDA:                 c.CUDA || c.Diffusers.CUDA,
+		SchedulerType:        c.Diffusers.SchedulerType,
+		PipelineType:         c.Diffusers.PipelineType,
+		CFGScale:             c.Diffusers.CFGScale,
+		LoraAdapter:          c.LoraAdapter,
+		LoraScale:            c.LoraScale,
+		F16Memory:            c.F16,
+		LoraBase:             c.LoraBase,
+		IMG2IMG:              c.Diffusers.IMG2IMG,
+		CLIPModel:            c.Diffusers.ClipModel,
+		CLIPSubfolder:        c.Diffusers.ClipSubFolder,
+		CLIPSkip:             int32(c.Diffusers.ClipSkip),
+		ControlNet:           c.Diffusers.ControlNet,
 		ContextSize:          int32(c.ContextSize),
 		Seed:                 int32(c.Seed),
 		NBatch:               int32(b),
 		NoMulMatQ:            c.NoMulMatQ,
-		CUDA:                 c.CUDA, // diffusers, transformers
 		DraftModel:           c.DraftModel,
 		AudioPath:            c.VallE.AudioPath,
 		Quantization:         c.Quantization,
@ -58,12 +70,8 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		YarnAttnFactor:       c.YarnAttnFactor,
 		YarnBetaFast:         c.YarnBetaFast,
 		YarnBetaSlow:         c.YarnBetaSlow,
-		LoraAdapter:          c.LoraAdapter,
-		LoraBase:             c.LoraBase,
-		LoraScale:            c.LoraScale,
 		NGQA:                 c.NGQA,
 		RMSNormEps:           c.RMSNormEps,
-		F16Memory:            c.F16,
 		MLock:                c.MMlock,
 		RopeFreqBase:         c.RopeFreqBase,
 		RopeScaling:          c.RopeScaling,
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@ -69,6 +69,13 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
 			return fmt.Sprintf("127.0.0.1:%d", port), nil
 		}

+		// If no specific model path is set for transformers/HF, set it to the model path
+		for _, env := range []string{"HF_HOME", "TRANSFORMERS_CACHE", "HUGGINGFACE_HUB_CACHE"} {
+			if os.Getenv(env) == "" {
+				os.Setenv(env, ml.ModelPath)
+			}
+		}
+
 		// Check if the backend is provided as external
 		if uri, ok := o.externalBackends[backend]; ok {
 			log.Debug().Msgf("Loading external backend: %s", uri)