feat: Use ubuntu as base for container images, drop deprecated ggml-transformers backends (#1689)
* cleanup backends
* switch image to ubuntu 22.04
* adapt commands for ubuntu
* transformers cleanup
* no contrib on ubuntu
* Change test model to gguf
* ci: disable bark tests (too cpu-intensive)
* cleanup
* refinements
* use intel base image
* Makefile: Add docker targets
* Change test model

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in: parent d0a6a35b55, commit ddd21f1644
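Taken together, the changes let the base image be chosen at build time. A minimal sketch of a local build against the new default base, assuming the repository root as build context (the tag name is illustrative; BASE_IMAGE and IMAGE_TYPE are the Dockerfile arguments this commit wires through):

    docker build \
        --build-arg BASE_IMAGE=ubuntu:22.04 \
        --build-arg IMAGE_TYPE=core \
        -t local-ai:ubuntu-core .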
.github/workflows/image-pr.yml (vendored): 7 changes

@@ -21,6 +21,7 @@ jobs:
       cuda-minor-version: ${{ matrix.cuda-minor-version }}
       platforms: ${{ matrix.platforms }}
       runs-on: ${{ matrix.runs-on }}
+      base-image: ${{ matrix.base-image }}
     secrets:
       dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
       dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -39,6 +40,7 @@ jobs:
           ffmpeg: 'true'
           image-type: 'extras'
           runs-on: 'arc-runner-set'
+          base-image: "ubuntu:22.04"
         - build-type: 'cublas'
           cuda-major-version: "12"
           cuda-minor-version: "1"
@@ -48,6 +50,7 @@ jobs:
           ffmpeg: 'true'
           image-type: 'extras'
           runs-on: 'arc-runner-set'
+          base-image: "ubuntu:22.04"
   core-image-build:
     uses: ./.github/workflows/image_build.yml
     with:
@@ -60,6 +63,7 @@ jobs:
       cuda-minor-version: ${{ matrix.cuda-minor-version }}
       platforms: ${{ matrix.platforms }}
       runs-on: ${{ matrix.runs-on }}
+      base-image: ${{ matrix.base-image }}
     secrets:
       dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
       dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -75,9 +79,11 @@ jobs:
           ffmpeg: 'true'
           image-type: 'core'
           runs-on: 'ubuntu-latest'
+          base-image: "ubuntu:22.04"
         - build-type: 'sycl_f16'
           platforms: 'linux/amd64'
           tag-latest: 'false'
+          base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
           tag-suffix: 'sycl-f16-ffmpeg-core'
           ffmpeg: 'true'
           image-type: 'core'
@@ -91,3 +97,4 @@ jobs:
           ffmpeg: 'true'
           image-type: 'core'
           runs-on: 'ubuntu-latest'
+          base-image: "ubuntu:22.04"
.github/workflows/image.yml (vendored): 18 changes

@@ -25,6 +25,7 @@ jobs:
       cuda-minor-version: ${{ matrix.cuda-minor-version }}
       platforms: ${{ matrix.platforms }}
       runs-on: ${{ matrix.runs-on }}
+      base-image: ${{ matrix.base-image }}
     secrets:
       dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
       dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -44,6 +45,7 @@ jobs:
           ffmpeg: ''
           image-type: 'extras'
           runs-on: 'arc-runner-set'
+          base-image: "ubuntu:22.04"
         - build-type: ''
           platforms: 'linux/amd64'
           tag-latest: 'false'
@@ -51,6 +53,7 @@ jobs:
           ffmpeg: 'true'
           image-type: 'extras'
           runs-on: 'arc-runner-set'
+          base-image: "ubuntu:22.04"
         - build-type: 'cublas'
           cuda-major-version: "11"
           cuda-minor-version: "7"
@@ -60,6 +63,7 @@ jobs:
           ffmpeg: ''
           image-type: 'extras'
           runs-on: 'arc-runner-set'
+          base-image: "ubuntu:22.04"
         - build-type: 'cublas'
           cuda-major-version: "12"
           cuda-minor-version: "1"
@@ -69,6 +73,7 @@ jobs:
           ffmpeg: ''
           image-type: 'extras'
           runs-on: 'arc-runner-set'
+          base-image: "ubuntu:22.04"
         - build-type: 'cublas'
           cuda-major-version: "11"
           cuda-minor-version: "7"
@@ -78,6 +83,7 @@ jobs:
           ffmpeg: 'true'
           image-type: 'extras'
           runs-on: 'arc-runner-set'
+          base-image: "ubuntu:22.04"
         - build-type: 'cublas'
           cuda-major-version: "12"
           cuda-minor-version: "1"
@@ -87,6 +93,7 @@ jobs:
           ffmpeg: 'true'
           image-type: 'extras'
           runs-on: 'arc-runner-set'
+          base-image: "ubuntu:22.04"
         - build-type: ''
           #platforms: 'linux/amd64,linux/arm64'
           platforms: 'linux/amd64'
@@ -94,6 +101,7 @@ jobs:
           tag-suffix: ''
           ffmpeg: ''
           image-type: 'extras'
+          base-image: "ubuntu:22.04"
           runs-on: 'arc-runner-set'
   core-image-build:
     uses: ./.github/workflows/image_build.yml
@@ -107,6 +115,7 @@ jobs:
       cuda-minor-version: ${{ matrix.cuda-minor-version }}
       platforms: ${{ matrix.platforms }}
       runs-on: ${{ matrix.runs-on }}
+      base-image: ${{ matrix.base-image }}
     secrets:
       dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
       dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -121,10 +130,12 @@ jobs:
           tag-suffix: '-ffmpeg-core'
           ffmpeg: 'true'
           image-type: 'core'
+          base-image: "ubuntu:22.04"
           runs-on: 'ubuntu-latest'
         - build-type: 'sycl_f16'
           platforms: 'linux/amd64'
           tag-latest: 'false'
+          base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
           tag-suffix: '-sycl-f16-core'
           ffmpeg: 'false'
           image-type: 'core'
@@ -132,6 +143,7 @@ jobs:
         - build-type: 'sycl_f32'
           platforms: 'linux/amd64'
           tag-latest: 'false'
+          base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
           tag-suffix: '-sycl-f32-core'
           ffmpeg: 'false'
           image-type: 'core'
@@ -139,6 +151,7 @@ jobs:
         - build-type: 'sycl_f16'
           platforms: 'linux/amd64'
           tag-latest: 'false'
+          base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
           tag-suffix: '-sycl-f16-ffmpeg-core'
           ffmpeg: 'true'
           image-type: 'core'
@@ -146,6 +159,7 @@ jobs:
         - build-type: 'sycl_f32'
           platforms: 'linux/amd64'
           tag-latest: 'false'
+          base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
           tag-suffix: '-sycl-f32-ffmpeg-core'
           ffmpeg: 'true'
           image-type: 'core'
@@ -158,6 +172,7 @@ jobs:
           tag-suffix: '-cublas-cuda11-core'
           ffmpeg: ''
           image-type: 'core'
+          base-image: "ubuntu:22.04"
           runs-on: 'ubuntu-latest'
         - build-type: 'cublas'
           cuda-major-version: "12"
@@ -167,6 +182,7 @@ jobs:
           tag-suffix: '-cublas-cuda12-core'
           ffmpeg: ''
           image-type: 'core'
+          base-image: "ubuntu:22.04"
           runs-on: 'ubuntu-latest'
         - build-type: 'cublas'
           cuda-major-version: "11"
@@ -177,6 +193,7 @@ jobs:
           ffmpeg: 'true'
           image-type: 'core'
           runs-on: 'ubuntu-latest'
+          base-image: "ubuntu:22.04"
         - build-type: 'cublas'
           cuda-major-version: "12"
           cuda-minor-version: "1"
@@ -186,3 +203,4 @@ jobs:
           ffmpeg: 'true'
           image-type: 'core'
           runs-on: 'ubuntu-latest'
+          base-image: "ubuntu:22.04"
.github/workflows/image_build.yml (vendored): 6 changes

@@ -4,6 +4,11 @@ name: 'build container images (reusable)'
 on:
   workflow_call:
     inputs:
+      base-image:
+        description: 'Base image'
+        required: false
+        default: ''
+        type: string
       build-type:
         description: 'Build type'
         default: ''
@@ -154,6 +159,7 @@ jobs:
             CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
             FFMPEG=${{ inputs.ffmpeg }}
             IMAGE_TYPE=${{ inputs.image-type }}
+            BASE_IMAGE=${{ inputs.base-image }}
           context: .
           file: ./Dockerfile
           platforms: ${{ inputs.platforms }}
.github/workflows/test-extra.yml (vendored): 132 changes

@@ -164,74 +164,74 @@ jobs:
(The entire tests-bark job is commented out: every line below previously appeared without the leading "#".)
  # tests-bark:
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Release space from worker
  #       run: |
  #         echo "Listing top largest packages"
  #         pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
  #         head -n 30 <<< "${pkgs}"
  #         echo
  #         df -h
  #         echo
  #         sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
  #         sudo apt-get remove --auto-remove android-sdk-platform-tools || true
  #         sudo apt-get purge --auto-remove android-sdk-platform-tools || true
  #         sudo rm -rf /usr/local/lib/android
  #         sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
  #         sudo rm -rf /usr/share/dotnet
  #         sudo apt-get remove -y '^mono-.*' || true
  #         sudo apt-get remove -y '^ghc-.*' || true
  #         sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
  #         sudo apt-get remove -y 'php.*' || true
  #         sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
  #         sudo apt-get remove -y '^google-.*' || true
  #         sudo apt-get remove -y azure-cli || true
  #         sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
  #         sudo apt-get remove -y '^gfortran-.*' || true
  #         sudo apt-get remove -y microsoft-edge-stable || true
  #         sudo apt-get remove -y firefox || true
  #         sudo apt-get remove -y powershell || true
  #         sudo apt-get remove -y r-base-core || true
  #         sudo apt-get autoremove -y
  #         sudo apt-get clean
  #         echo
  #         echo "Listing top largest packages"
  #         pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
  #         head -n 30 <<< "${pkgs}"
  #         echo
  #         sudo rm -rfv build || true
  #         sudo rm -rf /usr/share/dotnet || true
  #         sudo rm -rf /opt/ghc || true
  #         sudo rm -rf "/usr/local/share/boost" || true
  #         sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
  #         df -h
  #     - name: Clone
  #       uses: actions/checkout@v4
  #       with:
  #         submodules: true
  #     - name: Dependencies
  #       run: |
  #         sudo apt-get update
  #         sudo apt-get install build-essential ffmpeg
  #         curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
  #         sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
  #         gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
  #         sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
  #         sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
  #         sudo apt-get update && \
  #         sudo apt-get install -y conda
  #         sudo apt-get install -y ca-certificates cmake curl patch
  #         sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
  #
  #         sudo rm -rfv /usr/bin/conda || true
  #
  #     - name: Test bark
  #       run: |
  #         export PATH=$PATH:/opt/conda/bin
  #         make -C backend/python/bark
  #         make -C backend/python/bark test

 # Below tests needs GPU. Commented out for now
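The bark test can still be exercised by hand; a sketch following the commented-out recipe above (assuming conda is installed under /opt/conda):

    export PATH=$PATH:/opt/conda/bin
    make -C backend/python/bark
    make -C backend/python/bark test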
Dockerfile: 29 changes

@@ -1,9 +1,11 @@
-ARG GO_VERSION=1.21-bullseye
+ARG GO_VERSION=1.21
 ARG IMAGE_TYPE=extras
+ARG BASE_IMAGE=ubuntu:22.04
+
 # extras or core

-FROM golang:$GO_VERSION as requirements-core
+FROM ${BASE_IMAGE} as requirements-core
+
+ARG GO_VERSION=1.21.7

 ARG BUILD_TYPE
 ARG CUDA_MAJOR_VERSION=11
 ARG CUDA_MINOR_VERSION=7
@@ -11,14 +13,17 @@ ARG TARGETARCH
 ARG TARGETVARIANT

 ENV BUILD_TYPE=${BUILD_TYPE}
+ENV DEBIAN_FRONTEND=noninteractive
 ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh"

 ARG GO_TAGS="stablediffusion tinydream tts"

 RUN apt-get update && \
-    apt-get install -y ca-certificates curl patch pip cmake && apt-get clean
+    apt-get install -y ca-certificates curl patch pip cmake git && apt-get clean
+
+# Install Go
+RUN curl -L -s https://go.dev/dl/go$GO_VERSION.linux-$TARGETARCH.tar.gz | tar -v -C /usr/local -xz
+ENV PATH $PATH:/usr/local/go/bin

 COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
 RUN update-ca-certificates
@@ -30,21 +35,13 @@ RUN echo "Target Variant: $TARGETVARIANT"
 # CuBLAS requirements
 RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
     apt-get install -y software-properties-common && \
-    apt-add-repository contrib && \
-    curl -O https://developer.download.nvidia.com/compute/cuda/repos/debian11/x86_64/cuda-keyring_1.0-1_all.deb && \
-    dpkg -i cuda-keyring_1.0-1_all.deb && \
-    rm -f cuda-keyring_1.0-1_all.deb && \
+    curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
+    dpkg -i cuda-keyring_1.1-1_all.deb && \
+    rm -f cuda-keyring_1.1-1_all.deb && \
    apt-get update && \
    apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && apt-get clean \
    ; fi

-# oneapi requirements
-RUN if [ "${BUILD_TYPE}" = "sycl_f16" ] || [ "${BUILD_TYPE}" = "sycl_f32" ]; then \
-    wget -q https://registrationcenter-download.intel.com/akdlm/IRC_NAS/163da6e4-56eb-4948-aba3-debcec61c064/l_BaseKit_p_2024.0.1.46_offline.sh && \
-    sh ./l_BaseKit_p_2024.0.1.46_offline.sh -a -s --eula accept && \
-    rm -rf l_BaseKit_p_2024.0.1.46_offline.sh \
-    ; fi
-
 ENV PATH /usr/local/cuda/bin:${PATH}

 # OpenBLAS requirements and stable diffusion
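Since the build stage no longer starts from a golang image, Go is fetched explicitly from the official tarball. With the defaults above (GO_VERSION=1.21.7) on an amd64 builder (TARGETARCH=amd64), the install step expands to:

    curl -L -s https://go.dev/dl/go1.21.7.linux-amd64.tar.gz | tar -v -C /usr/local -xz
    export PATH=$PATH:/usr/local/go/bin   # the ENV line makes this permanent in the image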
Makefile: 67 changes

@@ -14,9 +14,6 @@ CPPLLAMA_VERSION?=1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
 GPT4ALL_VERSION?=27a8b020c36b0df8f8b82a252d261cda47cf44b8

-# go-ggml-transformers version
-GOGGMLTRANSFORMERS_VERSION?=ffb09d7dd71e2cbc6c5d7d05357d230eea6f369a
-
 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
 RWKV_VERSION?=633c5a3485c403cb2520693dc0991a25dace9f0f
@@ -145,7 +142,16 @@ ifeq ($(findstring tts,$(GO_TAGS)),tts)
 	OPTIONAL_GRPC+=backend-assets/grpc/piper
 endif

-ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface backend-assets/grpc/bert-embeddings backend-assets/grpc/llama backend-assets/grpc/llama-cpp backend-assets/grpc/llama-ggml backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface
+ALL_GRPC_BACKENDS+=backend-assets/grpc/bert-embeddings
+ALL_GRPC_BACKENDS+=backend-assets/grpc/llama
+ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp
+ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
+ALL_GRPC_BACKENDS+=backend-assets/grpc/gpt4all
+ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv
+ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
+ALL_GRPC_BACKENDS+=$(OPTIONAL_GRPC)

 GRPC_BACKENDS?=$(ALL_GRPC_BACKENDS) $(OPTIONAL_GRPC)

 # If empty, then we build all
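Because GRPC_BACKENDS is a `?=` assignment, the split `+=` list also makes it easy to compile only a subset of backends; a sketch, assuming the usual `make build` entry point:

    GRPC_BACKENDS="backend-assets/grpc/llama-cpp backend-assets/grpc/whisper" make build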
@@ -217,14 +223,6 @@ backend-assets/espeak-ng-data: sources/go-piper
 sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a: sources/gpt4all
 	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ libgpt4all.a

-## CEREBRAS GPT
-sources/go-ggml-transformers:
-	git clone --recurse-submodules https://github.com/go-skynet/go-ggml-transformers.cpp sources/go-ggml-transformers
-	cd sources/go-ggml-transformers && git checkout -b build $(GOGPT2_VERSION) && git submodule update --init --recursive --depth 1
-
-sources/go-ggml-transformers/libtransformers.a: sources/go-ggml-transformers
-	$(MAKE) -C sources/go-ggml-transformers BUILD_TYPE=$(BUILD_TYPE) libtransformers.a
-
 sources/whisper.cpp:
 	git clone https://github.com/ggerganov/whisper.cpp.git sources/whisper.cpp
 	cd sources/whisper.cpp && git checkout -b build $(WHISPER_CPP_VERSION) && git submodule update --init --recursive --depth 1
@@ -252,12 +250,11 @@ sources/go-piper/libpiper_binding.a: sources/go-piper
 backend/cpp/llama/llama.cpp:
 	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama llama.cpp

-get-sources: backend/cpp/llama/llama.cpp sources/go-llama sources/go-llama-ggml sources/go-ggml-transformers sources/gpt4all sources/go-piper sources/go-rwkv sources/whisper.cpp sources/go-bert sources/go-stable-diffusion sources/go-tiny-dream
+get-sources: backend/cpp/llama/llama.cpp sources/go-llama sources/go-llama-ggml sources/gpt4all sources/go-piper sources/go-rwkv sources/whisper.cpp sources/go-bert sources/go-stable-diffusion sources/go-tiny-dream
 	touch $@

 replace:
 	$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-ggml-transformers.cpp=$(CURDIR)/sources/go-ggml-transformers
 	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
@@ -276,7 +273,6 @@ rebuild: ## Rebuilds the project
 	$(MAKE) -C sources/go-llama clean
 	$(MAKE) -C sources/go-llama-ggml clean
 	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ clean
-	$(MAKE) -C sources/go-ggml-transformers clean
 	$(MAKE) -C sources/go-rwkv clean
 	$(MAKE) -C sources/whisper.cpp clean
 	$(MAKE) -C sources/go-stable-diffusion clean
@@ -321,7 +317,7 @@ run: prepare ## run local-ai
 test-models/testmodel:
 	mkdir test-models
 	mkdir test-dir
-	wget -q https://huggingface.co/nnakasato/ggml-model-test/resolve/main/ggml-model-q4.bin -O test-models/testmodel
+	wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel
 	wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
 	wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
 	wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
@@ -505,26 +501,6 @@ backend-assets/grpc/gpt4all: backend-assets/grpc backend-assets/gpt4all sources/
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./backend/go/llm/gpt4all/

-backend-assets/grpc/dolly: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-ggml-transformers LIBRARY_PATH=$(CURDIR)/sources/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/dolly ./backend/go/llm/dolly/
-
-backend-assets/grpc/gptj: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-ggml-transformers LIBRARY_PATH=$(CURDIR)/sources/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gptj ./backend/go/llm/gptj/
-
-backend-assets/grpc/gptneox: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-ggml-transformers LIBRARY_PATH=$(CURDIR)/sources/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gptneox ./backend/go/llm/gptneox/
-
-backend-assets/grpc/mpt: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-ggml-transformers LIBRARY_PATH=$(CURDIR)/sources/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/mpt ./backend/go/llm/mpt/
-
-backend-assets/grpc/replit: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-ggml-transformers LIBRARY_PATH=$(CURDIR)/sources/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/replit ./backend/go/llm/replit/
-
 backend-assets/grpc/rwkv: backend-assets/grpc sources/go-rwkv/librwkv.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv LIBRARY_PATH=$(CURDIR)/sources/go-rwkv \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv
@@ -556,3 +532,22 @@ backend-assets/grpc/whisper: backend-assets/grpc sources/whisper.cpp/libwhisper.
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/

 grpcs: prepare $(GRPC_BACKENDS)
+
+DOCKER_IMAGE?=local-ai
+IMAGE_TYPE?=core
+BASE_IMAGE?=ubuntu:22.04
+
+docker:
+	docker build \
+		--build-arg BASE_IMAGE=$(BASE_IMAGE) \
+		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
+		--build-arg GO_TAGS=$(GO_TAGS) \
+		--build-arg BUILD_TYPE=$(BUILD_TYPE) \
+		-t $(DOCKER_IMAGE) .
+
+docker-image-intel:
+	docker build \
+		--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04 \
+		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
+		--build-arg GO_TAGS="none" \
+		--build-arg BUILD_TYPE=sycl_f16 -t $(DOCKER_IMAGE) .
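The new targets wrap a plain `docker build`; intended usage is along these lines (the variable values are illustrative overrides of the `?=` defaults above):

    make docker IMAGE_TYPE=core DOCKER_IMAGE=local-ai:core
    make docker-image-intel DOCKER_IMAGE=local-ai:sycl-f16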
API test suite:

@@ -29,6 +29,15 @@ import (
 	"github.com/sashabaranov/go-openai/jsonschema"
 )

+const testPrompt = `### System:
+You are an AI assistant that follows instruction extremely well. Help as much as you can.
+
+### User:
+
+Can you help rephrasing sentences?
+
+### Response:`
+
 type modelApplyRequest struct {
 	ID  string `json:"id"`
 	URL string `json:"url"`
@@ -629,28 +638,28 @@ var _ = Describe("API test", func() {
 			Expect(len(models.Models)).To(Equal(6)) // If "config.yaml" should be included, this should be 8?
 		})
 		It("can generate completions", func() {
-			resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "testmodel", Prompt: "abcdedfghikl"})
+			resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "testmodel", Prompt: testPrompt})
 			Expect(err).ToNot(HaveOccurred())
 			Expect(len(resp.Choices)).To(Equal(1))
 			Expect(resp.Choices[0].Text).ToNot(BeEmpty())
 		})

 		It("can generate chat completions ", func() {
-			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "testmodel", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "abcdedfghikl"}}})
+			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "testmodel", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: testPrompt}}})
 			Expect(err).ToNot(HaveOccurred())
 			Expect(len(resp.Choices)).To(Equal(1))
 			Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
 		})

 		It("can generate completions from model configs", func() {
-			resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "gpt4all", Prompt: "abcdedfghikl"})
+			resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "gpt4all", Prompt: testPrompt})
 			Expect(err).ToNot(HaveOccurred())
 			Expect(len(resp.Choices)).To(Equal(1))
 			Expect(resp.Choices[0].Text).ToNot(BeEmpty())
 		})

 		It("can generate chat completions from model configs", func() {
-			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "gpt4all-2", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "abcdedfghikl"}}})
+			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "gpt4all-2", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: testPrompt}}})
 			Expect(err).ToNot(HaveOccurred())
 			Expect(len(resp.Choices)).To(Equal(1))
 			Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
@@ -658,7 +667,7 @@ var _ = Describe("API test", func() {

 		It("returns errors", func() {
 			backends := len(model.AutoLoadBackends) + 1 // +1 for huggingface
-			_, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "foomodel", Prompt: "abcdedfghikl"})
+			_, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "foomodel", Prompt: testPrompt})
 			Expect(err).To(HaveOccurred())
 			Expect(err.Error()).To(ContainSubstring(fmt.Sprintf("error, status code: 500, message: could not load model - all backends returned error: %d errors occurred:", backends)))
 		})
@@ -834,13 +843,13 @@ var _ = Describe("API test", func() {
 			app.Shutdown()
 		})
 		It("can generate chat completions from config file (list1)", func() {
-			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list1", Messages: []openai.ChatCompletionMessage{{Role: "user", Content: "abcdedfghikl"}}})
+			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list1", Messages: []openai.ChatCompletionMessage{{Role: "user", Content: testPrompt}}})
 			Expect(err).ToNot(HaveOccurred())
 			Expect(len(resp.Choices)).To(Equal(1))
 			Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
 		})
 		It("can generate chat completions from config file (list2)", func() {
-			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list2", Messages: []openai.ChatCompletionMessage{{Role: "user", Content: "abcdedfghikl"}}})
+			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list2", Messages: []openai.ChatCompletionMessage{{Role: "user", Content: testPrompt}}})
 			Expect(err).ToNot(HaveOccurred())
 			Expect(len(resp.Choices)).To(Equal(1))
 			Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
Deleted ggml-transformers backend wrappers:

@@ -1,44 +0,0 @@
-package transformers
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"fmt"
-
-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-
-	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
-)
-
-type Dolly struct {
-	base.SingleThread
-
-	dolly *transformers.Dolly
-}
-
-func (llm *Dolly) Load(opts *pb.ModelOptions) error {
-	model, err := transformers.NewDolly(opts.ModelFile)
-	llm.dolly = model
-	return err
-}
-
-func (llm *Dolly) Predict(opts *pb.PredictOptions) (string, error) {
-	return llm.dolly.Predict(opts.Prompt, buildPredictOptions(opts)...)
-}
-
-// fallback to Predict
-func (llm *Dolly) PredictStream(opts *pb.PredictOptions, results chan string) error {
-
-	go func() {
-		res, err := llm.dolly.Predict(opts.Prompt, buildPredictOptions(opts)...)
-
-		if err != nil {
-			fmt.Println("err: ", err)
-		}
-		results <- res
-		close(results)
-	}()
-
-	return nil
-}
@@ -1,42 +0,0 @@
-package transformers
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"fmt"
-
-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-
-	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
-)
-
-type GPT2 struct {
-	base.SingleThread
-
-	gpt2 *transformers.GPT2
-}
-
-func (llm *GPT2) Load(opts *pb.ModelOptions) error {
-	model, err := transformers.New(opts.ModelFile)
-	llm.gpt2 = model
-	return err
-}
-
-func (llm *GPT2) Predict(opts *pb.PredictOptions) (string, error) {
-	return llm.gpt2.Predict(opts.Prompt, buildPredictOptions(opts)...)
-}
-
-// fallback to Predict
-func (llm *GPT2) PredictStream(opts *pb.PredictOptions, results chan string) error {
-	go func() {
-		res, err := llm.gpt2.Predict(opts.Prompt, buildPredictOptions(opts)...)
-
-		if err != nil {
-			fmt.Println("err: ", err)
-		}
-		results <- res
-		close(results)
-	}()
-	return nil
-}
@@ -1,42 +0,0 @@
-package transformers
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"fmt"
-
-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-
-	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
-)
-
-type GPTJ struct {
-	base.SingleThread
-
-	gptj *transformers.GPTJ
-}
-
-func (llm *GPTJ) Load(opts *pb.ModelOptions) error {
-	model, err := transformers.NewGPTJ(opts.ModelFile)
-	llm.gptj = model
-	return err
-}
-
-func (llm *GPTJ) Predict(opts *pb.PredictOptions) (string, error) {
-	return llm.gptj.Predict(opts.Prompt, buildPredictOptions(opts)...)
-}
-
-// fallback to Predict
-func (llm *GPTJ) PredictStream(opts *pb.PredictOptions, results chan string) error {
-	go func() {
-		res, err := llm.gptj.Predict(opts.Prompt, buildPredictOptions(opts)...)
-
-		if err != nil {
-			fmt.Println("err: ", err)
-		}
-		results <- res
-		close(results)
-	}()
-	return nil
-}
@@ -1,42 +0,0 @@
-package transformers
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"fmt"
-
-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-
-	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
-)
-
-type GPTNeoX struct {
-	base.SingleThread
-
-	gptneox *transformers.GPTNeoX
-}
-
-func (llm *GPTNeoX) Load(opts *pb.ModelOptions) error {
-	model, err := transformers.NewGPTNeoX(opts.ModelFile)
-	llm.gptneox = model
-	return err
-}
-
-func (llm *GPTNeoX) Predict(opts *pb.PredictOptions) (string, error) {
-	return llm.gptneox.Predict(opts.Prompt, buildPredictOptions(opts)...)
-}
-
-// fallback to Predict
-func (llm *GPTNeoX) PredictStream(opts *pb.PredictOptions, results chan string) error {
-	go func() {
-		res, err := llm.gptneox.Predict(opts.Prompt, buildPredictOptions(opts)...)
-
-		if err != nil {
-			fmt.Println("err: ", err)
-		}
-		results <- res
-		close(results)
-	}()
-	return nil
-}
@@ -1,42 +0,0 @@
-package transformers
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"fmt"
-
-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-
-	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
-)
-
-type MPT struct {
-	base.SingleThread
-
-	mpt *transformers.MPT
-}
-
-func (llm *MPT) Load(opts *pb.ModelOptions) error {
-	model, err := transformers.NewMPT(opts.ModelFile)
-	llm.mpt = model
-	return err
-}
-
-func (llm *MPT) Predict(opts *pb.PredictOptions) (string, error) {
-	return llm.mpt.Predict(opts.Prompt, buildPredictOptions(opts)...)
-}
-
-// fallback to Predict
-func (llm *MPT) PredictStream(opts *pb.PredictOptions, results chan string) error {
-	go func() {
-		res, err := llm.mpt.Predict(opts.Prompt, buildPredictOptions(opts)...)
-
-		if err != nil {
-			fmt.Println("err: ", err)
-		}
-		results <- res
-		close(results)
-	}()
-	return nil
-}
@@ -1,26 +0,0 @@
-package transformers
-
-import (
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
-)
-
-func buildPredictOptions(opts *pb.PredictOptions) []transformers.PredictOption {
-	predictOptions := []transformers.PredictOption{
-		transformers.SetTemperature(float64(opts.Temperature)),
-		transformers.SetTopP(float64(opts.TopP)),
-		transformers.SetTopK(int(opts.TopK)),
-		transformers.SetTokens(int(opts.Tokens)),
-		transformers.SetThreads(int(opts.Threads)),
-	}
-
-	if opts.Batch != 0 {
-		predictOptions = append(predictOptions, transformers.SetBatch(int(opts.Batch)))
-	}
-
-	if opts.Seed != 0 {
-		predictOptions = append(predictOptions, transformers.SetSeed(int(opts.Seed)))
-	}
-
-	return predictOptions
-}
@@ -1,42 +0,0 @@
-package transformers
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"fmt"
-
-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-
-	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
-)
-
-type Replit struct {
-	base.SingleThread
-
-	replit *transformers.Replit
-}
-
-func (llm *Replit) Load(opts *pb.ModelOptions) error {
-	model, err := transformers.NewReplit(opts.ModelFile)
-	llm.replit = model
-	return err
-}
-
-func (llm *Replit) Predict(opts *pb.PredictOptions) (string, error) {
-	return llm.replit.Predict(opts.Prompt, buildPredictOptions(opts)...)
-}
-
-// fallback to Predict
-func (llm *Replit) PredictStream(opts *pb.PredictOptions, results chan string) error {
-	go func() {
-		res, err := llm.replit.Predict(opts.Prompt, buildPredictOptions(opts)...)
-
-		if err != nil {
-			fmt.Println("err: ", err)
-		}
-		results <- res
-		close(results)
-	}()
-	return nil
-}
@@ -1,43 +0,0 @@
-package transformers
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"fmt"
-
-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-
-	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
-)
-
-type Starcoder struct {
-	base.SingleThread
-
-	starcoder *transformers.Starcoder
-}
-
-func (llm *Starcoder) Load(opts *pb.ModelOptions) error {
-	model, err := transformers.NewStarcoder(opts.ModelFile)
-	llm.starcoder = model
-	return err
-}
-
-func (llm *Starcoder) Predict(opts *pb.PredictOptions) (string, error) {
-	return llm.starcoder.Predict(opts.Prompt, buildPredictOptions(opts)...)
-}
-
-// fallback to Predict
-func (llm *Starcoder) PredictStream(opts *pb.PredictOptions, results chan string) error {
-	go func() {
-		res, err := llm.starcoder.Predict(opts.Prompt, buildPredictOptions(opts)...)
-
-		if err != nil {
-			fmt.Println("err: ", err)
-		}
-		results <- res
-		close(results)
-	}()
-
-	return nil
-}
Entrypoint script:

@@ -13,10 +13,6 @@ if [ -n "$EXTRA_BACKENDS" ]; then
     done
 fi

-if [ -e "/opt/intel/oneapi/setvars.sh" ]; then
-    source /opt/intel/oneapi/setvars.sh
-fi
-
 if [ "$REBUILD" != "false" ]; then
     rm -rf ./local-ai
     make build -j${BUILD_PARALLELISM:-1}
Backend constants and autoload list:

@@ -23,11 +23,6 @@ const (
 	GoLlamaBackend = "llama"
 	LlamaGGML      = "llama-ggml"
 	LLamaCPP       = "llama-cpp"
-	GPTJBackend    = "gptj"
-	DollyBackend   = "dolly"
-	MPTBackend     = "mpt"
-	GPTNeoXBackend = "gptneox"
-	ReplitBackend  = "replit"
 	Gpt4AllLlamaBackend = "gpt4all-llama"
 	Gpt4AllMptBackend   = "gpt4all-mpt"
 	Gpt4AllJBackend     = "gpt4all-j"
@@ -50,12 +45,7 @@ var AutoLoadBackends []string = []string{
 	LlamaGGML,
 	GoLlamaBackend,
 	Gpt4All,
-	GPTNeoXBackend,
 	BertEmbeddingsBackend,
-	GPTJBackend,
-	DollyBackend,
-	MPTBackend,
-	ReplitBackend,
 	RwkvBackend,
 	WhisperBackend,
 	StableDiffusionBackend,
Test model config fixtures (context_size raised from 10 to 200):

@@ -4,7 +4,7 @@
   top_p: 80
   top_k: 0.9
   temperature: 0.1
-context_size: 10
+context_size: 200
 stopwords:
 - "HUMAN:"
 - "### Response:"
@@ -20,7 +20,7 @@
   top_k: 0.9
   temperature: 0.1
 model: testmodel
-context_size: 10
+context_size: 200
 stopwords:
 - "HUMAN:"
 - "### Response:"
@@ -4,7 +4,7 @@ parameters:
   top_p: 80
   top_k: 0.9
   temperature: 0.1
-context_size: 10
+context_size: 200
 stopwords:
 - "HUMAN:"
 - "### Response:"
@@ -4,7 +4,7 @@ parameters:
   top_p: 80
   top_k: 0.9
   temperature: 0.1
-context_size: 10
+context_size: 200
 stopwords:
 - "HUMAN:"
 - "### Response:"