From 5d892f86eaca87510b7e27fd496fed904778df31 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Mon, 26 Aug 2024 14:47:36 +0200
Subject: [PATCH] chore(cuda): reduce binary size (#3379)

fix(cuda): reduce binary size

Signed-off-by: Ettore Di Giacinto
---
 Dockerfile | 9 ++++++++-
 Makefile   | 2 +-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 9d651760..14e037e6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -286,7 +286,14 @@ COPY --from=grpc /opt/grpc /usr/local
 WORKDIR /build
 
 ## Build the binary
-RUN make build
+## If it's CUDA, we want to skip some of the llama-compat backends to save space
+## We only leave the most CPU-optimized variant and the fallback for the cublas build
+## (both will use CUDA for the actual computation)
+RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
+        SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
+    else \
+        make build; \
+    fi
 
 RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
         mkdir -p /build/sources/go-piper/piper-phonemize/pi/lib/ \
diff --git a/Makefile b/Makefile
index 2ecbaea8..ca8077af 100644
--- a/Makefile
+++ b/Makefile
@@ -8,7 +8,7 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=3ba780e2a8f0ffe13f571b27f0bbf2ca5a199efc
+CPPLLAMA_VERSION?=e11bd856d538e44d24d8cad4b0381fba0984d162
 
 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp