Merge branch 'master' into ci/public-runner

Ettore Di Giacinto, 2025-02-08 11:00:45 +01:00, committed by GitHub
commit d6ea1a67cf
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
89 changed files with 1245 additions and 1548 deletions

View File

@@ -14,7 +14,7 @@ jobs:
 steps:
 - name: Dependabot metadata
 id: metadata
-uses: dependabot/fetch-metadata@v2.2.0
+uses: dependabot/fetch-metadata@v2.3.0
 with:
 github-token: "${{ secrets.GITHUB_TOKEN }}"
 skip-commit-verification: true

View File

@@ -18,7 +18,7 @@ jobs:
 with:
 model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
 # Check the PR diff using the current branch and the base branch of the PR
-- uses: GrantBirki/git-diff-action@v2.7.0
+- uses: GrantBirki/git-diff-action@v2.8.0
 id: git-diff-action
 with:
 json_diff_file_output: diff.json
@@ -99,7 +99,7 @@ jobs:
 docker run -e -ti -d --name local-ai -p 8080:8080 localai/localai:master-ffmpeg-core run --debug $MODEL_NAME
 until [ "`docker inspect -f {{.State.Health.Status}} local-ai`" == "healthy" ]; do echo "Waiting for container to be ready"; docker logs --tail 10 local-ai; sleep 2; done
 # Check the PR diff using the current branch and the base branch of the PR
-- uses: GrantBirki/git-diff-action@v2.7.0
+- uses: GrantBirki/git-diff-action@v2.8.0
 id: git-diff-action
 with:
 json_diff_file_output: diff.json

View File

@@ -303,7 +303,7 @@ RUN make prepare
 ## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
 ## (both will use CUDA or hipblas for the actual computation)
 RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
-SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
+SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
 else \
 make build; \
 fi

View File

@@ -6,9 +6,7 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true
 # llama.cpp versions
-GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
-GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=6152129d05870cb38162c422c6ba80434e021e9f
+CPPLLAMA_VERSION?=d2fe216fb2fb7ca8627618c9ea3a2e7886325780
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
@@ -24,7 +22,7 @@ BARKCPP_VERSION?=v1.0.0
 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=5eb15ef4d022bef4a391de4f5f6556e81fbb5024
+STABLEDIFFUSION_GGML_VERSION?=d46ed5e184b97c2018dc2e8105925bdb8775e02c
 ONNX_VERSION?=1.20.0
 ONNX_ARCH?=x64
@@ -151,7 +149,6 @@ ifeq ($(BUILD_TYPE),hipblas)
 LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
 export CXX=$(ROCM_HOME)/llvm/bin/clang++
 export CC=$(ROCM_HOME)/llvm/bin/clang
-# llama-ggml has no hipblas support, so override it here.
 export STABLE_BUILD_TYPE=
 export GGML_HIP=1
 GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
@@ -186,8 +183,8 @@ endif
 ALL_GRPC_BACKENDS=backend-assets/grpc/huggingface
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
+ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
-ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
 ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
 ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
@@ -221,19 +218,6 @@ endif
 all: help
-## go-llama.cpp
-sources/go-llama.cpp:
-mkdir -p sources/go-llama.cpp
-cd sources/go-llama.cpp && \
-git init && \
-git remote add origin $(GOLLAMA_REPO) && \
-git fetch origin && \
-git checkout $(GOLLAMA_VERSION) && \
-git submodule update --init --recursive --depth 1 --single-branch
-sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
-$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
 ## bark.cpp
 sources/bark.cpp:
 git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
@@ -309,19 +293,17 @@ sources/whisper.cpp:
 sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
 cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
-get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
+get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
 replace:
 $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
 $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
 $(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
-$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
 dropreplace:
 $(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
 $(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
 $(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
-$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
 prepare-sources: get-sources replace
 $(GOCMD) mod download
@@ -329,7 +311,6 @@ prepare-sources: get-sources replace
 ## GENERIC
 rebuild: ## Rebuilds the project
 $(GOCMD) clean -cache
-$(MAKE) -C sources/go-llama.cpp clean
 $(MAKE) -C sources/whisper.cpp clean
 $(MAKE) -C sources/go-piper clean
 $(MAKE) build
@@ -433,7 +414,7 @@ run: prepare ## run local-ai
 test-models/testmodel.ggml:
 mkdir test-models
 mkdir test-dir
-wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
+wget -q https://huggingface.co/RichardErkhov/Qwen_-_Qwen2-1.5B-Instruct-gguf/resolve/main/Qwen2-1.5B-Instruct.Q2_K.gguf -O test-models/testmodel.ggml
 wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
 wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
 wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
@@ -448,8 +429,7 @@ test: prepare test-models/testmodel.ggml grpcs
 export GO_TAGS="tts debug"
 $(MAKE) prepare-test
 HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
+$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
-$(MAKE) test-llama
 $(MAKE) test-llama-gguf
 $(MAKE) test-tts
 $(MAKE) test-stablediffusion
@@ -478,10 +458,6 @@ teardown-e2e:
 rm -rf $(TEST_DIR) || true
 docker stop $$(docker ps -q --filter ancestor=localai-tests)
-test-llama: prepare-test
-TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
 test-llama-gguf: prepare-test
 TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
@@ -699,6 +675,13 @@ backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc backend/cpp/llama/llama.cpp
 CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
 cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2
+backend-assets/grpc/llama-cpp-avx512: backend-assets/grpc backend/cpp/llama/llama.cpp
+cp -rf backend/cpp/llama backend/cpp/llama-avx512
+$(MAKE) -C backend/cpp/llama-avx512 purge
+$(info ${GREEN}I llama-cpp build info:avx512${RESET})
+CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx512" build-llama-cpp-grpc-server
+cp -rfv backend/cpp/llama-avx512/grpc-server backend-assets/grpc/llama-cpp-avx512
 backend-assets/grpc/llama-cpp-avx: backend-assets/grpc backend/cpp/llama/llama.cpp
 cp -rf backend/cpp/llama backend/cpp/llama-avx
 $(MAKE) -C backend/cpp/llama-avx purge
@@ -752,13 +735,6 @@ backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
 mkdir -p backend-assets/util/
 cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
-backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
-CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
-$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
-ifneq ($(UPX),)
-$(UPX) backend-assets/grpc/llama-ggml
-endif
 backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc
 CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \
 $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/
@@ -853,7 +829,7 @@ swagger:
 .PHONY: gen-assets
 gen-assets:
-$(GOCMD) run core/dependencies_manager/manager.go embedded/webui_static.yaml core/http/static/assets
+$(GOCMD) run core/dependencies_manager/manager.go webui_static.yaml core/http/static/assets
 ## Documentation
 docs/layouts/_default:

View File

@@ -163,6 +163,11 @@ message Reply {
 double timing_token_generation = 5;
 }
+message GrammarTrigger {
+string word = 1;
+bool at_start = 2;
+}
 message ModelOptions {
 string Model = 1;
 int32 ContextSize = 2;
@@ -247,6 +252,8 @@ message ModelOptions {
 string CacheTypeKey = 63;
 string CacheTypeValue = 64;
+repeated GrammarTrigger GrammarTriggers = 65;
 }
 message Result {
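
The new `GrammarTrigger` message and the `GrammarTriggers` field on `ModelOptions` carry lazy-grammar trigger words to the backend. A minimal Go sketch (not part of the diff) of how the generated bindings might be populated, mirroring the mapping used later in this commit in `grpcModelOpts`; the trigger word is a made-up example:

```go
package main

import (
	"fmt"

	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)

func main() {
	// "<tool_call>" is a hypothetical trigger word, used only for illustration.
	opts := &pb.ModelOptions{
		GrammarTriggers: []*pb.GrammarTrigger{
			{Word: "<tool_call>", AtStart: true},
		},
	}
	fmt.Println(len(opts.GrammarTriggers)) // 1
}
```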

View File

@@ -468,6 +468,9 @@ struct llama_server_context
 bool add_bos_token = true;
 bool has_eos_token = true;
+bool grammar_lazy = false;
+std::vector<common_grammar_trigger> grammar_trigger_words;
 int32_t n_ctx; // total context for all clients / slots
 // system prompt
@@ -706,6 +709,8 @@ struct llama_server_context
 slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
 slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
 slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
+slot->sparams.grammar_trigger_words = grammar_trigger_words;
+slot->sparams.grammar_lazy = grammar_lazy;
 if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
 // Might be better to reject the request with a 400 ?
@@ -2374,6 +2379,21 @@ static void params_parse(const backend::ModelOptions* request,
 if ( request->ropefreqscale() != 0.0f ) {
 params.rope_freq_scale = request->ropefreqscale();
 }
+if (request->grammartriggers_size() > 0) {
+LOG_INFO("configuring grammar triggers", {});
+llama.grammar_lazy = true;
+for (int i = 0; i < request->grammartriggers_size(); i++) {
+common_grammar_trigger trigger;
+trigger.word = request->grammartriggers(i).word();
+trigger.at_start = request->grammartriggers(i).at_start();
+llama.grammar_trigger_words.push_back(trigger);
+LOG_INFO("grammar trigger", {
+{ "word", trigger.word },
+{ "at_start", trigger.at_start }
+});
+}
+}
 }
@@ -2522,6 +2542,18 @@ public:
 return grpc::Status::OK;
 }
+grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response){
+json data = parse_options(false, request, llama);
+std::vector<llama_token> tokens = llama.tokenize(data["prompt"],false);
+for (int i=0 ; i< tokens.size(); i++){
+response->add_tokens(tokens[i]);
+}
+return grpc::Status::OK;
+}
 grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
 llama_client_slot* active_slot = llama.get_active_slot();

View File

@@ -1,204 +0,0 @@
package main
// This is a wrapper to statisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"fmt"
"github.com/go-skynet/go-llama.cpp"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
type LLM struct {
base.SingleThread
llama *llama.LLama
}
func (llm *LLM) Load(opts *pb.ModelOptions) error {
ropeFreqBase := float32(10000)
ropeFreqScale := float32(1)
if opts.RopeFreqBase != 0 {
ropeFreqBase = opts.RopeFreqBase
}
if opts.RopeFreqScale != 0 {
ropeFreqScale = opts.RopeFreqScale
}
llamaOpts := []llama.ModelOption{
llama.WithRopeFreqBase(ropeFreqBase),
llama.WithRopeFreqScale(ropeFreqScale),
}
if opts.NGQA != 0 {
llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
}
if opts.RMSNormEps != 0 {
llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
}
if opts.ContextSize != 0 {
llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
}
if opts.F16Memory {
llamaOpts = append(llamaOpts, llama.EnableF16Memory)
}
if opts.Embeddings {
llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
}
if opts.NGPULayers != 0 {
llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
}
llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
if opts.NBatch != 0 {
llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
} else {
llamaOpts = append(llamaOpts, llama.SetNBatch(512))
}
if opts.NUMA {
llamaOpts = append(llamaOpts, llama.EnableNUMA)
}
if opts.LowVRAM {
llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
}
model, err := llama.New(opts.ModelFile, llamaOpts...)
llm.llama = model
return err
}
func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
ropeFreqBase := float32(10000)
ropeFreqScale := float32(1)
if opts.RopeFreqBase != 0 {
ropeFreqBase = opts.RopeFreqBase
}
if opts.RopeFreqScale != 0 {
ropeFreqScale = opts.RopeFreqScale
}
predictOptions := []llama.PredictOption{
llama.SetTemperature(opts.Temperature),
llama.SetTopP(opts.TopP),
llama.SetTopK(int(opts.TopK)),
llama.SetTokens(int(opts.Tokens)),
llama.SetThreads(int(opts.Threads)),
llama.WithGrammar(opts.Grammar),
llama.SetRopeFreqBase(ropeFreqBase),
llama.SetRopeFreqScale(ropeFreqScale),
llama.SetNegativePromptScale(opts.NegativePromptScale),
llama.SetNegativePrompt(opts.NegativePrompt),
}
if opts.PromptCacheAll {
predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
}
if opts.PromptCacheRO {
predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
}
// Expected absolute path
if opts.PromptCachePath != "" {
predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
}
if opts.Mirostat != 0 {
predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
}
if opts.MirostatETA != 0 {
predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
}
if opts.MirostatTAU != 0 {
predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
}
if opts.Debug {
predictOptions = append(predictOptions, llama.Debug)
}
predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
if opts.PresencePenalty != 0 {
predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
}
if opts.NKeep != 0 {
predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
}
if opts.Batch != 0 {
predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
}
if opts.F16KV {
predictOptions = append(predictOptions, llama.EnableF16KV)
}
if opts.IgnoreEOS {
predictOptions = append(predictOptions, llama.IgnoreEOS)
}
if opts.Seed != 0 {
predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
}
//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
return predictOptions
}
func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
}
func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
predictOptions := buildPredictOptions(opts)
predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
results <- token
return true
}))
go func() {
_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
if err != nil {
fmt.Println("err: ", err)
}
close(results)
}()
return nil
}
func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
predictOptions := buildPredictOptions(opts)
if len(opts.EmbeddingTokens) > 0 {
tokens := []int{}
for _, t := range opts.EmbeddingTokens {
tokens = append(tokens, int(t))
}
return llm.llama.TokenEmbeddings(tokens, predictOptions...)
}
return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
}

View File

@@ -1,19 +0,0 @@
package main
import (
"flag"
grpc "github.com/mudler/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &LLM{}); err != nil {
panic(err)
}
}

View File

@@ -1,6 +1,6 @@
 accelerate
 auto-gptq==0.7.1
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 certifi
 transformers

View File

@@ -1,4 +1,4 @@
 bark==0.1.5
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 certifi

View File

@@ -1,3 +1,3 @@
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 grpcio-tools

View File

@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 certifi
 packaging==24.1

View File

@@ -1,5 +1,5 @@
 setuptools
-grpcio==1.69.0
+grpcio==1.70.0
 pillow
 protobuf
 certifi

View File

@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 certifi
 wheel

View File

@@ -1,3 +1,3 @@
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 grpcio-tools

View File

@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 phonemizer
 scipy

View File

@@ -1,3 +1,3 @@
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 certifi

View File

@@ -5,4 +5,4 @@ accelerate
 transformers
 bitsandbytes
 outetts
-sentence-transformers==3.3.1
+sentence-transformers==3.4.1

View File

@@ -6,4 +6,4 @@ accelerate
 transformers
 bitsandbytes
 outetts
-sentence-transformers==3.3.1
+sentence-transformers==3.4.1

View File

@@ -5,4 +5,4 @@ numba==0.60.0
 transformers
 bitsandbytes
 outetts
-sentence-transformers==3.3.1
+sentence-transformers==3.4.1

View File

@@ -7,4 +7,4 @@ numba==0.60.0
 bitsandbytes
 outetts
 bitsandbytes
-sentence-transformers==3.3.1
+sentence-transformers==3.4.1

View File

@@ -8,4 +8,4 @@ numba==0.60.0
 intel-extension-for-transformers
 bitsandbytes
 outetts
-sentence-transformers==3.3.1
+sentence-transformers==3.4.1

View File

@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 certifi
 setuptools

View File

@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 certifi
 setuptools

View File

@@ -62,7 +62,7 @@ func New(opts ...config.AppOption) (*Application, error) {
 }
 }
-if err := pkgStartup.InstallModels(options.Galleries, options.ModelLibraryURL, options.ModelPath, options.EnforcePredownloadScans, nil, options.ModelsURL...); err != nil {
+if err := pkgStartup.InstallModels(options.Galleries, options.ModelPath, options.EnforcePredownloadScans, nil, options.ModelsURL...); err != nil {
 log.Error().Err(err).Msg("error installing models")
 }

View File

@@ -118,9 +118,19 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 nGPULayers = *c.NGPULayers
 }
+triggers := make([]*pb.GrammarTrigger, 0)
+for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers {
+triggers = append(triggers, &pb.GrammarTrigger{
+Word: t.Word,
+AtStart: t.AtStart,
+})
+}
 return &pb.ModelOptions{
 CUDA: c.CUDA || c.Diffusers.CUDA,
 SchedulerType: c.Diffusers.SchedulerType,
+GrammarTriggers: triggers,
 PipelineType: c.Diffusers.PipelineType,
 CFGScale: c.CFGScale,
 LoraAdapter: c.LoraAdapter,

View File

@@ -16,12 +16,7 @@ func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.Bac
 opts := ModelOptions(backendConfig, appConfig, model.WithModel(modelFile))
-if backendConfig.Backend == "" {
-inferenceModel, err = loader.Load(opts...)
-} else {
-opts = append(opts, model.WithBackendString(backendConfig.Backend))
-inferenceModel, err = loader.Load(opts...)
-}
+inferenceModel, err = loader.Load(opts...)
 if err != nil {
 return schema.TokenizeResponse{}, err
 }
@@ -35,6 +30,10 @@ func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.Bac
 return schema.TokenizeResponse{}, err
 }
+if resp.Tokens == nil {
+resp.Tokens = make([]int32, 0)
+}
 return schema.TokenizeResponse{
 Tokens: resp.Tokens,
 }, nil
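
One note on the nil guard added above: in Go, encoding/json marshals a nil slice as `null` but an empty slice as `[]`, so pre-allocating `resp.Tokens` keeps the tokenize response a stable JSON array. A small illustrative sketch (the struct and its `tokens` tag are stand-ins, not the actual schema types):

```go
package main

import (
	"encoding/json"
	"fmt"
)

type tokenizeResponse struct {
	Tokens []int32 `json:"tokens"` // stand-in for schema.TokenizeResponse
}

func main() {
	var nilTokens []int32
	a, _ := json.Marshal(tokenizeResponse{Tokens: nilTokens})
	b, _ := json.Marshal(tokenizeResponse{Tokens: []int32{}})
	fmt.Println(string(a)) // {"tokens":null}
	fmt.Println(string(b)) // {"tokens":[]}
}
```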

View File

@@ -100,7 +100,7 @@ func (mi *ModelsInstall) Run(ctx *cliContext.Context) error {
 log.Info().Str("model", modelName).Str("license", model.License).Msg("installing model")
 }
-err = startup.InstallModels(galleries, "", mi.ModelsPath, !mi.DisablePredownloadScan, progressCallback, modelName)
+err = startup.InstallModels(galleries, mi.ModelsPath, !mi.DisablePredownloadScan, progressCallback, modelName)
 if err != nil {
 return err
 }

View File

@@ -32,7 +32,6 @@ type RunCMD struct {
 Galleries string `env:"LOCALAI_GALLERIES,GALLERIES" help:"JSON list of galleries" group:"models" default:"${galleries}"`
 AutoloadGalleries bool `env:"LOCALAI_AUTOLOAD_GALLERIES,AUTOLOAD_GALLERIES" group:"models"`
-RemoteLibrary string `env:"LOCALAI_REMOTE_LIBRARY,REMOTE_LIBRARY" default:"${remoteLibraryURL}" help:"A LocalAI remote library URL" group:"models"`
 PreloadModels string `env:"LOCALAI_PRELOAD_MODELS,PRELOAD_MODELS" help:"A List of models to apply in JSON at start" group:"models"`
 Models []string `env:"LOCALAI_MODELS,MODELS" help:"A List of model configuration URLs to load" group:"models"`
 PreloadModelsConfig string `env:"LOCALAI_PRELOAD_MODELS_CONFIG,PRELOAD_MODELS_CONFIG" help:"A List of models to apply at startup. Path to a YAML config file" group:"models"`
@@ -90,7 +89,6 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 config.WithDynamicConfigDirPollInterval(r.LocalaiConfigDirPollInterval),
 config.WithF16(r.F16),
 config.WithStringGalleries(r.Galleries),
-config.WithModelLibraryURL(r.RemoteLibrary),
 config.WithCors(r.CORS),
 config.WithCorsAllowOrigins(r.CORSAllowOrigins),
 config.WithCsrf(r.CSRF),

View File

@@ -44,8 +44,6 @@ type ApplicationConfig struct {
 DisableGalleryEndpoint bool
 LoadToMemory []string
-ModelLibraryURL string
 Galleries []Gallery
 BackendAssets embed.FS
@@ -126,12 +124,6 @@ func WithP2PToken(s string) AppOption {
 }
 }
-func WithModelLibraryURL(url string) AppOption {
-return func(o *ApplicationConfig) {
-o.ModelLibraryURL = url
-}
-}
 func WithLibPath(path string) AppOption {
 return func(o *ApplicationConfig) {
 o.LibPath = path

View File

@@ -287,7 +287,8 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
 defaultTopP := 0.95
 defaultTopK := 40
 defaultTemp := 0.9
-defaultMirostat := 2
+// https://github.com/mudler/LocalAI/issues/2780
+defaultMirostat := 0
 defaultMirostatTAU := 5.0
 defaultMirostatETA := 0.1
 defaultTypicalP := 1.0

View File

@@ -48,9 +48,9 @@ parameters:
 Expect(config.Name).To(Equal("bar-baz"))
 Expect(config.Validate()).To(BeTrue())
-// download https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml
+// download https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml
 httpClient := http.Client{}
-resp, err := httpClient.Get("https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml")
+resp, err := httpClient.Get("https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml")
 Expect(err).To(BeNil())
 defer resp.Body.Close()
 tmp, err = os.CreateTemp("", "config.yaml")

View File

@@ -48,8 +48,10 @@ var _ = Describe("Model test", func() {
 defer os.RemoveAll(tempdir)
 gallery := []GalleryModel{{
-Name: "bert",
-URL: bertEmbeddingsURL,
+Metadata: Metadata{
+Name: "bert",
+URL: bertEmbeddingsURL,
+},
 }}
 out, err := yaml.Marshal(gallery)
 Expect(err).ToNot(HaveOccurred())

View File

@@ -11,6 +11,14 @@ import (
 // It is used to install the model by resolving the URL and downloading the files.
 // The other fields are used to override the configuration of the model.
 type GalleryModel struct {
+Metadata `json:",inline" yaml:",inline"`
+// config_file is read in the situation where URL is blank - and therefore this is a base config.
+ConfigFile map[string]interface{} `json:"config_file,omitempty" yaml:"config_file,omitempty"`
+// Overrides are used to override the configuration of the model located at URL
+Overrides map[string]interface{} `json:"overrides,omitempty" yaml:"overrides,omitempty"`
+}
+
+type Metadata struct {
 URL string `json:"url,omitempty" yaml:"url,omitempty"`
 Name string `json:"name,omitempty" yaml:"name,omitempty"`
 Description string `json:"description,omitempty" yaml:"description,omitempty"`
@@ -18,10 +26,6 @@ type GalleryModel struct {
 URLs []string `json:"urls,omitempty" yaml:"urls,omitempty"`
 Icon string `json:"icon,omitempty" yaml:"icon,omitempty"`
 Tags []string `json:"tags,omitempty" yaml:"tags,omitempty"`
-// config_file is read in the situation where URL is blank - and therefore this is a base config.
-ConfigFile map[string]interface{} `json:"config_file,omitempty" yaml:"config_file,omitempty"`
-// Overrides are used to override the configuration of the model located at URL
-Overrides map[string]interface{} `json:"overrides,omitempty" yaml:"overrides,omitempty"`
 // AdditionalFiles are used to add additional files to the model
 AdditionalFiles []File `json:"files,omitempty" yaml:"files,omitempty"`
 // Gallery is a reference to the gallery which contains the model

View File

@@ -9,7 +9,11 @@ import (
 var _ = Describe("Gallery API tests", func() {
 Context("requests", func() {
 It("parses github with a branch", func() {
-req := GalleryModel{URL: "github:go-skynet/model-gallery/gpt4all-j.yaml@main"}
+req := GalleryModel{
+Metadata: Metadata{
+URL: "github:go-skynet/model-gallery/gpt4all-j.yaml@main",
+},
+}
 e, err := GetGalleryConfigFromURL(req.URL, "")
 Expect(err).ToNot(HaveOccurred())
 Expect(e.Name).To(Equal("gpt4all-j"))

View File

@@ -299,14 +299,18 @@ var _ = Describe("API test", func() {
 g := []gallery.GalleryModel{
 {
-Name: "bert",
-URL: bertEmbeddingsURL,
+Metadata: gallery.Metadata{
+Name: "bert",
+URL: bertEmbeddingsURL,
+},
 },
 {
-Name: "bert2",
-URL: bertEmbeddingsURL,
-Overrides: map[string]interface{}{"foo": "bar"},
-AdditionalFiles: []gallery.File{{Filename: "foo.yaml", URI: bertEmbeddingsURL}},
+Metadata: gallery.Metadata{
+Name: "bert2",
+URL: bertEmbeddingsURL,
+AdditionalFiles: []gallery.File{{Filename: "foo.yaml", URI: bertEmbeddingsURL}},
+},
+Overrides: map[string]interface{}{"foo": "bar"},
 },
 }
 out, err := yaml.Marshal(g)
@@ -476,7 +480,7 @@ var _ = Describe("API test", func() {
 })
 It("apply models from config", func() {
 response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
-ConfigURL: "https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml",
+ConfigURL: "https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml",
 })
 Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
@@ -522,77 +526,6 @@ var _ = Describe("API test", func() {
 Expect(content["usage"]).To(ContainSubstring("You can test this model with curl like this"))
 })
It("runs openllama(llama-ggml backend)", Label("llama"), func() {
if runtime.GOOS != "linux" {
Skip("test supported only on linux")
}
response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
URL: "github:go-skynet/model-gallery/openllama_3b.yaml",
Name: "openllama_3b",
Overrides: map[string]interface{}{"backend": "llama-ggml", "mmap": true, "f16": true, "context_size": 128},
})
Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
uuid := response["uuid"].(string)
Eventually(func() bool {
response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
return response["processed"].(bool)
}, "360s", "10s").Should(Equal(true))
By("testing completion")
resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "openllama_3b", Prompt: "Count up to five: one, two, three, four, "})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp.Choices)).To(Equal(1))
Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
By("testing functions")
resp2, err := client.CreateChatCompletion(
context.TODO(),
openai.ChatCompletionRequest{
Model: "openllama_3b",
Messages: []openai.ChatCompletionMessage{
{
Role: "user",
Content: "What is the weather like in San Francisco (celsius)?",
},
},
Functions: []openai.FunctionDefinition{
openai.FunctionDefinition{
Name: "get_current_weather",
Description: "Get the current weather",
Parameters: jsonschema.Definition{
Type: jsonschema.Object,
Properties: map[string]jsonschema.Definition{
"location": {
Type: jsonschema.String,
Description: "The city and state, e.g. San Francisco, CA",
},
"unit": {
Type: jsonschema.String,
Enum: []string{"celcius", "fahrenheit"},
},
},
Required: []string{"location"},
},
},
},
})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp2.Choices)).To(Equal(1))
Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
var res map[string]string
err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
Expect(err).ToNot(HaveOccurred())
Expect(res["location"]).To(ContainSubstring("San Francisco"), fmt.Sprint(res))
Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
})
It("runs openllama gguf(llama-cpp)", Label("llama-gguf"), func() { It("runs openllama gguf(llama-cpp)", Label("llama-gguf"), func() {
if runtime.GOOS != "linux" { if runtime.GOOS != "linux" {
Skip("test supported only on linux") Skip("test supported only on linux")
@ -600,7 +533,7 @@ var _ = Describe("API test", func() {
modelName := "hermes-2-pro-mistral" modelName := "hermes-2-pro-mistral"
response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{ response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
ConfigURL: "https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml", ConfigURL: "https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml",
}) })
Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response)) Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))

View File

@@ -117,19 +117,25 @@ func (mgs *ModelGalleryEndpointService) DeleteModelGalleryEndpoint() func(c *fib
 // @Router /models/available [get]
 func (mgs *ModelGalleryEndpointService) ListModelFromGalleryEndpoint() func(c *fiber.Ctx) error {
 return func(c *fiber.Ctx) error {
-log.Debug().Msgf("Listing models from galleries: %+v", mgs.galleries)
 models, err := gallery.AvailableGalleryModels(mgs.galleries, mgs.modelPath)
 if err != nil {
 return err
 }
-log.Debug().Msgf("Models found from galleries: %+v", models)
-for _, m := range models {
-log.Debug().Msgf("Model found from galleries: %+v", m)
-}
-dat, err := json.Marshal(models)
+log.Debug().Msgf("Available %d models from %d galleries\n", len(models), len(mgs.galleries))
+m := []gallery.Metadata{}
+for _, mm := range models {
+m = append(m, mm.Metadata)
+}
+log.Debug().Msgf("Models %#v", m)
+dat, err := json.Marshal(m)
 if err != nil {
-return err
+return fmt.Errorf("could not marshal models: %w", err)
 }
 return c.Send(dat)
 }

View File

@@ -12,6 +12,7 @@ import (
 // TokenizeEndpoint exposes a REST API to tokenize the content
 // @Summary Tokenize the input.
+// @Param request body schema.TokenizeRequest true "Request"
 // @Success 200 {object} schema.TokenizeResponse "Response"
 // @Router /v1/tokenize [post]
 func TokenizeEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
@@ -51,8 +52,6 @@ func TokenizeEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, app
 return err
 }
-c.JSON(tokenResponse)
-return nil
+return c.JSON(tokenResponse)
 }
 }
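
For reference, a hedged sketch of exercising the documented `/v1/tokenize` route from Go; the JSON field names (`model`, `content`, `tokens`) are assumptions about `schema.TokenizeRequest`/`schema.TokenizeResponse` rather than values taken from this diff:

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// Assumed request shape: {"model": "...", "content": "..."}.
	body, _ := json.Marshal(map[string]string{
		"model":   "hermes-2-pro-mistral",
		"content": "Hello, LocalAI!",
	})
	resp, err := http.Post("http://localhost:8080/v1/tokenize", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Assumed response shape: {"tokens": [ ... ]}.
	var out struct {
		Tokens []int32 `json:"tokens"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	fmt.Println(out.Tokens)
}
```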

View File

@@ -401,6 +401,11 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 log.Debug().Msgf("Text content to return: %s", textContentToReturn)
 noActionsToRun := len(results) > 0 && results[0].Name == noActionName || len(results) == 0
+finishReason := "stop"
+if len(input.Tools) > 0 {
+finishReason = "tool_calls"
+}
 switch {
 case noActionsToRun:
 result, err := handleQuestion(config, input, ml, startupOptions, results, s, predInput)
@@ -408,19 +413,18 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 log.Error().Err(err).Msg("error handling question")
 return
 }
 *c = append(*c, schema.Choice{
-Message: &schema.Message{Role: "assistant", Content: &result}})
+FinishReason: finishReason,
+Message: &schema.Message{Role: "assistant", Content: &result}})
 default:
 toolChoice := schema.Choice{
+FinishReason: finishReason,
 Message: &schema.Message{
 Role: "assistant",
 },
 }
-if len(input.Tools) > 0 {
-toolChoice.FinishReason = "tool_calls"
-}
 for _, ss := range results {
 name, args := ss.Name, ss.Arguments
 if len(input.Tools) > 0 {
@@ -438,7 +442,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 },
 )
 } else {
-// otherwise we return more choices directly
+// otherwise we return more choices directly (deprecated)
 *c = append(*c, schema.Choice{
 FinishReason: "function_call",
 Message: &schema.Message{

View File

@@ -129,7 +129,7 @@ func (g *GalleryService) Start(c context.Context, cl *config.BackendConfigLoader
 if op.GalleryModelName != "" {
 err = gallery.InstallModelFromGallery(op.Galleries, op.GalleryModelName, g.appConfig.ModelPath, op.Req, progressCallback, g.appConfig.EnforcePredownloadScans)
 } else if op.ConfigURL != "" {
-err = startup.InstallModels(op.Galleries, op.ConfigURL, g.appConfig.ModelPath, g.appConfig.EnforcePredownloadScans, progressCallback, op.ConfigURL)
+err = startup.InstallModels(op.Galleries, g.appConfig.ModelPath, g.appConfig.EnforcePredownloadScans, progressCallback, op.ConfigURL)
 if err != nil {
 updateError(err)
 continue

View File

@@ -148,6 +148,9 @@ function:
 no_action_function_name: "" # Function name to call when no action is determined.
 no_action_description_name: "" # Description name for no-action functions.
 response_regex: [] # Regular expressions to match response from
+argument_regex: [] # Named regular to extract function arguments from the response.
+argument_regex_key_name: "key" # Name of the named regex capture to capture the key of the function arguments
+argument_regex_value_name: "value" # Name of the named regex capture to capture the value of the function arguments
 json_regex_match: [] # Regular expressions to match JSON data when in tool mode
 replace_function_results: [] # Placeholder to replace function call results with arbitrary strings or patterns.
 replace_llm_results: [] # Replace language model results with arbitrary strings or patterns.
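
The `argument_regex*` options added above describe a named-capture regular expression whose `key`/`value` groups are collected into function-call arguments. A standalone Go sketch of that extraction pattern (illustrative only; the regex and input are made up, and LocalAI's actual parsing code is not shown in this diff):

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Named captures matching the default capture names "key" and "value"
	// described by argument_regex_key_name / argument_regex_value_name.
	re := regexp.MustCompile(`(?P<key>\w+)="(?P<value>[^"]*)"`)

	args := map[string]string{}
	for _, m := range re.FindAllStringSubmatch(`location="San Francisco" unit="celsius"`, -1) {
		args[m[re.SubexpIndex("key")]] = m[re.SubexpIndex("value")]
	}
	fmt.Println(args) // map[location:San Francisco unit:celsius]
}
```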

View File

@@ -1,126 +0,0 @@
+++
disableToc = false
title = "Run other Models"
weight = 23
icon = "rocket_launch"
+++
## Running other models
> _Do you have already a model file? Skip to [Run models manually]({{%relref "docs/getting-started/models" %}})_.
To load models into LocalAI, you can either [use models manually]({{%relref "docs/getting-started/models" %}}) or configure LocalAI to pull the models from external sources, like Huggingface and configure the model.
To do that, you can point LocalAI to an URL to a YAML configuration file - however - LocalAI does also have some popular model configuration embedded in the binary as well. Below you can find a list of the models configuration that LocalAI has pre-built, see [Model customization]({{%relref "docs/getting-started/customize-model" %}}) on how to configure models from URLs.
There are different categories of models: [LLMs]({{%relref "docs/features/text-generation" %}}), [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) , [Embeddings]({{%relref "docs/features/embeddings" %}}), [Audio to Text]({{%relref "docs/features/audio-to-text" %}}), and [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) depending on the backend being used and the model architecture.
{{% alert icon="💡" %}}
To customize the models, see [Model customization]({{%relref "docs/getting-started/customize-model" %}}). For more model configurations, visit the [Examples Section](https://github.com/mudler/LocalAI-examples/tree/main/configurations) and the configurations for the models below is available [here](https://github.com/mudler/LocalAI/tree/master/embedded/models).
{{% /alert %}}
{{< tabs tabTotal="3" >}}
{{% tab tabName="CPU-only" %}}
> 💡Don't need GPU acceleration? use the CPU images which are lighter and do not have Nvidia dependencies
| Model | Category | Docker command |
| --- | --- | --- |
| [phi-2](https://huggingface.co/microsoft/phi-2) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core phi-2``` |
| 🌋 [bakllava](https://github.com/SkunkworksAI/BakLLaVA) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core bakllava``` |
| 🌋 [llava-1.5](https://llava-vl.github.io/) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava-1.5``` |
| 🌋 [llava-1.6-mistral](https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava-1.6-mistral``` |
| 🌋 [llava-1.6-vicuna](https://huggingface.co/cmp-nct/llava-1.6-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava-1.6-vicuna``` |
| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core mistral-openorca``` |
| [bert-cpp](https://github.com/skeskinen/bert.cpp) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core bert-cpp``` |
| [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg all-minilm-l6-v2``` |
| whisper-base | [Audio to Text]({{%relref "docs/features/audio-to-text" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core whisper-base``` |
| rhasspy-voice-en-us-amy | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core rhasspy-voice-en-us-amy``` |
| 🐸 [coqui](https://github.com/coqui-ai/TTS) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg coqui``` |
| 🐶 [bark](https://github.com/suno-ai/bark) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg bark``` |
| 🔊 [vall-e-x](https://github.com/Plachtaa/VALL-E-X) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg vall-e-x``` |
| mixtral-instruct Mixtral-8x7B-Instruct-v0.1 | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core mixtral-instruct``` |
| [tinyllama-chat](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF) [original model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core tinyllama-chat``` |
| [dolphin-2.5-mixtral-8x7b](https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core dolphin-2.5-mixtral-8x7b``` |
| 🐍 [mamba](https://github.com/state-spaces/mamba) | [LLM]({{%relref "docs/features/text-generation" %}}) | GPU-only |
| animagine-xl | [Text to Image]({{%relref "docs/features/image-generation" %}}) | GPU-only |
| transformers-tinyllama | [LLM]({{%relref "docs/features/text-generation" %}}) | GPU-only |
| [codellama-7b](https://huggingface.co/codellama/CodeLlama-7b-hf) (with transformers) | [LLM]({{%relref "docs/features/text-generation" %}}) | GPU-only |
| [codellama-7b-gguf](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF) (with llama.cpp) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core codellama-7b-gguf``` |
| [hermes-2-pro-mistral](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core hermes-2-pro-mistral``` |
{{% /tab %}}
{{% tab tabName="GPU (CUDA 11)" %}}
> To know which version of CUDA do you have available, you can check with `nvidia-smi` or `nvcc --version` see also [GPU acceleration]({{%relref "docs/features/gpu-acceleration" %}}).
| Model | Category | Docker command |
| --- | --- | --- |
| [phi-2](https://huggingface.co/microsoft/phi-2) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core phi-2``` |
| 🌋 [bakllava](https://github.com/SkunkworksAI/BakLLaVA) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core bakllava``` |
| 🌋 [llava-1.5](https://llava-vl.github.io/) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-cublas-cuda11-core llava-1.5``` |
| 🌋 [llava-1.6-mistral](https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-cublas-cuda11-core llava-1.6-mistral``` |
| 🌋 [llava-1.6-vicuna](https://huggingface.co/cmp-nct/llava-1.6-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-cublas-cuda11-core llava-1.6-vicuna``` |
| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core mistral-openorca``` |
| [bert-cpp](https://github.com/skeskinen/bert.cpp) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core bert-cpp``` |
| [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 all-minilm-l6-v2``` |
| whisper-base | [Audio to Text]({{%relref "docs/features/audio-to-text" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core whisper-base``` |
| rhasspy-voice-en-us-amy | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core rhasspy-voice-en-us-amy``` |
| 🐸 [coqui](https://github.com/coqui-ai/TTS) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 coqui``` |
| 🐶 [bark](https://github.com/suno-ai/bark) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 bark``` |
| 🔊 [vall-e-x](https://github.com/Plachtaa/VALL-E-X) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 vall-e-x``` |
| mixtral-instruct Mixtral-8x7B-Instruct-v0.1 | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core mixtral-instruct``` |
| [tinyllama-chat](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF) [original model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core tinyllama-chat``` |
| [dolphin-2.5-mixtral-8x7b](https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core dolphin-2.5-mixtral-8x7b``` |
| 🐍 [mamba](https://github.com/state-spaces/mamba) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 mamba-chat``` |
| animagine-xl | [Text to Image]({{%relref "docs/features/image-generation" %}}) | ```docker run -ti -p 8080:8080 -e COMPEL=0 --gpus all localai/localai:{{< version >}}-cublas-cuda11 animagine-xl``` |
| transformers-tinyllama | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 transformers-tinyllama``` |
| [codellama-7b](https://huggingface.co/codellama/CodeLlama-7b-hf) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 codellama-7b``` |
| [codellama-7b-gguf](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core codellama-7b-gguf``` |
| [hermes-2-pro-mistral](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core hermes-2-pro-mistral``` |
{{% /tab %}}
{{% tab tabName="GPU (CUDA 12)" %}}
> To check which version of CUDA you have available, run `nvidia-smi` or `nvcc --version`; see also [GPU acceleration]({{%relref "docs/features/gpu-acceleration" %}}).
| Model | Category | Docker command |
| --- | --- | --- |
| [phi-2](https://huggingface.co/microsoft/phi-2) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core phi-2``` |
| 🌋 [bakllava](https://github.com/SkunkworksAI/BakLLaVA) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core bakllava``` |
| 🌋 [llava-1.5](https://llava-vl.github.io/) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core llava-1.5``` |
| 🌋 [llava-1.6-mistral](https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core llava-1.6-mistral``` |
| 🌋 [llava-1.6-vicuna](https://huggingface.co/cmp-nct/llava-1.6-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core llava-1.6-vicuna``` |
| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core mistral-openorca``` |
| [bert-cpp](https://github.com/skeskinen/bert.cpp) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core bert-cpp``` |
| [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 all-minilm-l6-v2``` |
| whisper-base | [Audio to Text]({{%relref "docs/features/audio-to-text" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core whisper-base``` |
| rhasspy-voice-en-us-amy | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core rhasspy-voice-en-us-amy``` |
| 🐸 [coqui](https://github.com/coqui-ai/TTS) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 coqui``` |
| 🐶 [bark](https://github.com/suno-ai/bark) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 bark``` |
| 🔊 [vall-e-x](https://github.com/Plachtaa/VALL-E-X) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 vall-e-x``` |
| mixtral-instruct Mixtral-8x7B-Instruct-v0.1 | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core mixtral-instruct``` |
| [tinyllama-chat](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF) [original model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core tinyllama-chat``` |
| [dolphin-2.5-mixtral-8x7b](https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core dolphin-2.5-mixtral-8x7b``` |
| 🐍 [mamba](https://github.com/state-spaces/mamba) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 mamba-chat``` |
| animagine-xl | [Text to Image]({{%relref "docs/features/image-generation" %}}) | ```docker run -ti -p 8080:8080 -e COMPEL=0 --gpus all localai/localai:{{< version >}}-cublas-cuda12 animagine-xl``` |
| transformers-tinyllama | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 transformers-tinyllama``` |
| [codellama-7b](https://huggingface.co/codellama/CodeLlama-7b-hf) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 codellama-7b``` |
| [codellama-7b-gguf](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core codellama-7b-gguf``` |
| [hermes-2-pro-mistral](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core hermes-2-pro-mistral``` |
{{% /tab %}}
{{< /tabs >}}
{{% alert icon="💡" %}}
**Tip** You can specify multiple models when starting the instance so that all of them are loaded, for example to have both llava and phi-2 configured (a quick way to verify the loaded models is shown right after this tip):
```bash
docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava phi-2
```
{{% /alert %}}
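To confirm that both models from the tip above were loaded, you can query the OpenAI-compatible models endpoint of the running instance (a quick check, assuming the default port mapping used in the examples on this page):
```bash
# List the models known to the running LocalAI instance
curl http://localhost:8080/v1/models
```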

View File

@ -134,12 +134,12 @@ curl $LOCALAI/models/apply -H "Content-Type: application/json" -d '{
}' }'
``` ```
An example that installs openllama can be: An example that installs hermes-2-pro-mistral can be:
```bash ```bash
LOCALAI=http://localhost:8080 LOCALAI=http://localhost:8080
curl $LOCALAI/models/apply -H "Content-Type: application/json" -d '{ curl $LOCALAI/models/apply -H "Content-Type: application/json" -d '{
"config_url": "https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml" "config_url": "https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml"
}' }'
``` ```

View File

@ -124,7 +124,7 @@ Note: rwkv models needs to specify the backend `rwkv` in the YAML config files a
{{% alert note %}} {{% alert note %}}
The `ggml` file format has been deprecated. If you are using `ggml` models and you are configuring your model with a YAML file, use the `llama-ggml` backend. If you are relying on automatic detection of the model, you should be fine. For `gguf` models, use the `llama` backend. The go backend is deprecated as well but still available as `go-llama`. The go backend still supports features not available in the mainline: speculative sampling and embeddings. The `ggml` file format has been deprecated. If you are using `ggml` models and you are configuring your model with a YAML file, use a LocalAI version older than v2.25.0. For `gguf` models, use the `llama` backend. The go backend is deprecated as well but still available as `go-llama`.
{{% /alert %}} {{% /alert %}}
@ -175,25 +175,12 @@ name: llama
backend: llama backend: llama
parameters: parameters:
# Relative to the models path # Relative to the models path
model: file.gguf.bin model: file.gguf
```
In the example above we specify `llama` as the backend to restrict loading `gguf` models only.
For instance, to use the `llama-ggml` backend for `ggml` models:
```yaml
name: llama
backend: llama-ggml
parameters:
# Relative to the models path
model: file.ggml.bin
``` ```
#### Reference #### Reference
- [llama](https://github.com/ggerganov/llama.cpp) - [llama](https://github.com/ggerganov/llama.cpp)
- [binding](https://github.com/go-skynet/go-llama.cpp)
### exllama/2 ### exllama/2

View File

@ -143,7 +143,7 @@ The AIO Images are inheriting the same environment variables as the base images
| Variable | Default | Description | | Variable | Default | Description |
| ---------------------| ------- | ----------- | | ---------------------| ------- | ----------- |
| `PROFILE` | Auto-detected | The size of the model to use. Available: `cpu`, `gpu-8g` | | `PROFILE` | Auto-detected | The size of the model to use. Available: `cpu`, `gpu-8g` |
| `MODELS` | Auto-detected | A list of model YAML configuration file URIs/URLs (see also [running models]({{%relref "docs/advanced/run-other-models" %}})) | | `MODELS` | Auto-detected | A list of model YAML configuration file URIs/URLs (see also [running models]({{%relref "docs/getting-started/models" %}})) |
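As a minimal sketch of how the variables in the table above can be combined, the following command forces the CPU profile and preloads a single model; both the AIO image tag and the model configuration URL are illustrative assumptions, not values taken from this page:
```bash
# Hypothetical example: run an AIO image with an explicit profile and model list
docker run -ti -p 8080:8080 \
  -e PROFILE=cpu \
  -e MODELS="https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml" \
  localai/localai:latest-aio-cpu
```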
## Standard container images ## Standard container images
@ -154,7 +154,7 @@ Images are available with and without python dependencies. Note that images with
Images with `core` in the tag are smaller and do not contain any python dependencies. Images with `core` in the tag are smaller and do not contain any python dependencies.
{{< tabs tabTotal="7" >}} {{< tabs tabTotal="8" >}}
{{% tab tabName="Vanilla / CPU Images" %}} {{% tab tabName="Vanilla / CPU Images" %}}
| Description | Quay | Docker Hub | | Description | Quay | Docker Hub |
@ -236,6 +236,18 @@ Images with `core` in the tag are smaller and do not contain any python dependen
| Versioned image including FFMpeg, no python | `quay.io/go-skynet/local-ai:{{< version >}}-vulkan-fmpeg-core` | `localai/localai:{{< version >}}-vulkan-fmpeg-core` | | Versioned image including FFMpeg, no python | `quay.io/go-skynet/local-ai:{{< version >}}-vulkan-fmpeg-core` | `localai/localai:{{< version >}}-vulkan-fmpeg-core` |
{{% /tab %}} {{% /tab %}}
{{% tab tabName="Nvidia Linux for tegra" %}}
These images are compatible with Nvidia ARM64 devices, such as the Jetson Nano, Jetson Xavier NX, and Jetson AGX Xavier. For more information, see the [Nvidia L4T guide]({{%relref "docs/reference/nvidia-l4t" %}}).
| Description | Quay | Docker Hub |
| --- | --- |-------------------------------------------------------------|
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-core` | `localai/localai:master-nvidia-l4t-arm64-core` |
| Latest tag | `quay.io/go-skynet/local-ai:latest-nvidia-l4t-arm64-core` | `localai/localai:latest-nvidia-l4t-arm64-core` |
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-nvidia-l4t-arm64-core` | `localai/localai:{{< version >}}-nvidia-l4t-arm64-core` |
{{% /tab %}}
{{< /tabs >}} {{< /tabs >}}
## See Also ## See Also

View File

@ -40,6 +40,10 @@ icon = "info"
</a> </a>
</p> </p>
<p align="center">
<a href="https://trendshift.io/repositories/5539" target="_blank"><img src="https://trendshift.io/api/badge/repositories/5539" alt="mudler%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</p>
<p align="center"> <p align="center">
<a href="https://twitter.com/LocalAI_API" target="blank"> <a href="https://twitter.com/LocalAI_API" target="blank">
<img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/> <img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
@ -118,7 +122,24 @@ To help the project you can:
## 🌟 Star history ## 🌟 Star history
[![LocalAI Star history Chart](https://api.star-history.com/svg?repos=go-skynet/LocalAI&type=Date)](https://star-history.com/#go-skynet/LocalAI&Date) [![LocalAI Star history Chart](https://api.star-history.com/svg?repos=mudler/LocalAI&type=Date)](https://star-history.com/#mudler/LocalAI&Date)
## ❤️ Sponsors
> Do you find LocalAI useful?
Support the project by becoming [a backer or sponsor](https://github.com/sponsors/mudler). Your logo will show up here with a link to your website.
A huge thank you to our generous sponsors who support this project by covering CI expenses, and to everyone on our [Sponsor list](https://github.com/sponsors/mudler):
<p align="center">
<a href="https://www.spectrocloud.com/" target="blank">
<img width=200 src="https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512">
</a>
<a href="https://www.premai.io/" target="blank">
<img width=200 src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>
</a>
</p>
## 📖 License ## 📖 License

View File

@ -21,7 +21,13 @@ git clone https://github.com/mudler/LocalAI
cd LocalAI cd LocalAI
docker build --build-arg SKIP_DRIVERS=true --build-arg BUILD_TYPE=cublas --build-arg BASE_IMAGE=nvcr.io/nvidia/l4t-jetpack:r36.4.0 --build-arg IMAGE_TYPE=core -t localai-orin . docker build --build-arg SKIP_DRIVERS=true --build-arg BUILD_TYPE=cublas --build-arg BASE_IMAGE=nvcr.io/nvidia/l4t-jetpack:r36.4.0 --build-arg IMAGE_TYPE=core -t quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-core .
```
Otherwise, prebuilt images are available on quay.io and Docker Hub:
```bash
docker pull quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-core
``` ```
## Usage ## Usage
@ -29,7 +35,7 @@ docker build --build-arg SKIP_DRIVERS=true --build-arg BUILD_TYPE=cublas --build
Run the LocalAI container on Nvidia ARM64 devices using the following command, where `/data/models` is the directory containing the models: Run the LocalAI container on Nvidia ARM64 devices using the following command, where `/data/models` is the directory containing the models:
```bash ```bash
docker run -e DEBUG=true -p 8080:8080 -v /data/models:/build/models -ti --restart=always --name local-ai --runtime nvidia --gpus all localai-orin docker run -e DEBUG=true -p 8080:8080 -v /data/models:/build/models -ti --restart=always --name local-ai --runtime nvidia --gpus all quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-core
``` ```
Note: `/data/models` is the directory containing the models. You can replace it with the directory containing your models. Note: `/data/models` is the directory containing the models. You can replace it with the directory containing your models.

@ -1 +1 @@
Subproject commit 8dad5ee419e5bb2a0b380aa72d7a7389af4945f6 Subproject commit 66bc366c4727a958f3873f409550daa36932c03f

View File

@ -1,72 +0,0 @@
package embedded
import (
"embed"
"fmt"
"slices"
"strings"
"github.com/mudler/LocalAI/pkg/downloader"
"github.com/rs/zerolog/log"
"github.com/mudler/LocalAI/pkg/assets"
"gopkg.in/yaml.v3"
)
var modelShorteners map[string]string
//go:embed model_library.yaml
var modelLibrary []byte
//go:embed models/*
var embeddedModels embed.FS
func ModelShortURL(s string) string {
if _, ok := modelShorteners[s]; ok {
s = modelShorteners[s]
}
return s
}
func init() {
err := yaml.Unmarshal(modelLibrary, &modelShorteners)
if err != nil {
log.Error().Err(err).Msg("error while unmarshalling embedded modelLibrary")
}
}
func GetRemoteLibraryShorteners(url string, basePath string) (map[string]string, error) {
remoteLibrary := map[string]string{}
uri := downloader.URI(url)
err := uri.DownloadWithCallback(basePath, func(_ string, i []byte) error {
return yaml.Unmarshal(i, &remoteLibrary)
})
if err != nil {
return nil, fmt.Errorf("error downloading remote library: %s", err.Error())
}
return remoteLibrary, err
}
// ExistsInModelsLibrary checks if a model exists in the embedded models library
func ExistsInModelsLibrary(s string) bool {
f := fmt.Sprintf("%s.yaml", s)
a := []string{}
for _, j := range assets.ListFiles(embeddedModels) {
a = append(a, strings.TrimPrefix(j, "models/"))
}
return slices.Contains(a, f)
}
// ResolveContent returns the content in the embedded model library
func ResolveContent(s string) ([]byte, error) {
if ExistsInModelsLibrary(s) {
return embeddedModels.ReadFile(fmt.Sprintf("models/%s.yaml", s))
}
return nil, fmt.Errorf("cannot find model %s", s)
}

View File

@ -1,9 +0,0 @@
###
###
### This file contains the list of models that are available in the library
### The URLs are automatically expanded when local-ai is being called with the key as argument
###
### For models with an entire YAML file to be embededd, put the file inside the `models`
### directory, it will be automatically available with the file name as key (without the .yaml extension)
phi-2: "github://mudler/LocalAI-examples/configurations/phi-2.yaml@main"

View File

@ -1,13 +0,0 @@
name: all-minilm-l6-v2
backend: sentencetransformers
embeddings: true
parameters:
model: all-MiniLM-L6-v2
usage: |
You can test this model with curl like this:
curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
"input": "Your text string goes here",
"model": "all-minilm-l6-v2"
}'

View File

@ -1,17 +0,0 @@
name: animagine-xl
parameters:
model: Linaqruf/animagine-xl
backend: diffusers
f16: true
diffusers:
scheduler_type: euler_a
usage: |
curl http://localhost:8080/v1/images/generations \
-H "Content-Type: application/json" \
-d '{
"prompt": "<positive prompt>|<negative prompt>",
"model": "animagine-xl",
"step": 51,
"size": "1024x1024"
}'

View File

@ -1,40 +0,0 @@
backend: llama-cpp
context_size: 4096
f16: true
gpu_layers: 90
mmap: true
name: bakllava
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: bakllava-mmproj.gguf
parameters:
model: bakllava.gguf
temperature: 0.2
top_k: 40
top_p: 0.95
seed: -1
mirostat: 2
mirostat_eta: 1.0
mirostat_tau: 1.0
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: bakllava.gguf
uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
- filename: bakllava-mmproj.gguf
uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "bakllava",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View File

@ -1,8 +0,0 @@
usage: |
bark works without any configuration, to test it, you can run the following curl command:
curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
"backend": "bark",
"input":"Hello, this is a test!"
}' | aplay
# TODO: This is a placeholder until we manage to pre-load HF/Transformers models

View File

@ -1,24 +0,0 @@
backend: llama
context_size: 8192
f16: false
gpu_layers: 90
name: cerbero
mmap: false
parameters:
model: huggingface://galatolo/cerbero-7b-gguf/ggml-model-Q8_0.gguf
top_k: 80
temperature: 0.2
top_p: 0.7
template:
completion: "{{.Input}}"
chat: "Questa è una conversazione tra un umano ed un assistente AI.\n{{.Input}}\n[|Assistente|] "
roles:
user: "[|Umano|] "
system: "[|Umano|] "
assistant: "[|Assistente|] "
stopwords:
- "[|Umano|]"
trimsuffix:
- "\n"

View File

@ -1,20 +0,0 @@
name: codellama-7b-gguf
backend: transformers
parameters:
model: huggingface://TheBloke/CodeLlama-7B-GGUF/codellama-7b.Q4_K_M.gguf
temperature: 0.5
top_k: 40
seed: -1
top_p: 0.95
mirostat: 2
mirostat_eta: 1.0
mirostat_tau: 1.0
context_size: 4096
f16: true
gpu_layers: 90
usage: |
curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
"model": "codellama-7b-gguf",
"prompt": "import socket\n\ndef ping_exponential_backoff(host: str):"
}'

View File

@ -1,14 +0,0 @@
name: codellama-7b
backend: transformers
type: AutoModelForCausalLM
parameters:
model: codellama/CodeLlama-7b-hf
temperature: 0.2
top_k: 40
top_p: 0.95
usage: |
curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
"model": "codellama-7b",
"prompt": "import socket\n\ndef ping_exponential_backoff(host: str):"
}'

View File

@ -1,9 +0,0 @@
usage: |
coqui works without any configuration, to test it, you can run the following curl command:
curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
"backend": "coqui",
"model": "tts_models/en/ljspeech/glow-tts",
"input":"Hello, this is a test!"
}'
# TODO: This is a placeholder until we manage to pre-load HF/Transformers models

View File

@ -1,31 +0,0 @@
name: dolphin-mixtral-8x7b
mmap: true
parameters:
model: huggingface://TheBloke/dolphin-2.5-mixtral-8x7b-GGUF/dolphin-2.5-mixtral-8x7b.Q2_K.gguf
temperature: 0.5
top_k: 40
top_p: 0.95
seed: -1
mirostat: 2
mirostat_eta: 1.0
mirostat_tau: 1.0
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
{{if .Content}}{{.Content}}{{end}}<|im_end|>
chat: |
{{.Input}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
gpu_layers: 90
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "dolphin-mixtral-8x7b",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

View File

@ -1,59 +0,0 @@
name: hermes-2-pro-mistral
mmap: true
parameters:
model: huggingface://NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/Hermes-2-Pro-Mistral-7B.Q6_K.gguf
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{- if .FunctionCall }}
<tool_call>
{{- else if eq .RoleName "tool" }}
<tool_response>
{{- end }}
{{- if .Content}}
{{.Content }}
{{- end }}
{{- if .FunctionCall}}
{{toJson .FunctionCall}}
{{- end }}
{{- if .FunctionCall }}
</tool_call>
{{- else if eq .RoleName "tool" }}
</tool_response>
{{- end }}<|im_end|>
# https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
function: |
<|im_start|>system
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
<tools>
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
</tools>
Use the following pydantic model json schema for each tool call you will make:
{'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}
For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
<tool_call>
{'arguments': <args-dict>, 'name': <function-name>}
</tool_call><|im_end|>
{{.Input -}}
<|im_start|>assistant
<tool_call>
chat: |
{{.Input -}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
- "\n</tool_call>"
- "\n\n\n"
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "hermes-2-pro-mistral",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

View File

@ -1,48 +0,0 @@
name: llama3-8b-instruct
mmap: true
parameters:
model: huggingface://second-state/Llama-3-8B-Instruct-GGUF/Meta-Llama-3-8B-Instruct-Q5_K_M.gguf
template:
chat_message: |
<|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content -}}
{{ else if .FunctionCall -}}
{{ toJson .FunctionCall -}}
{{ end -}}
<|eot_id|>
function: |
<|start_header_id|>system<|end_header_id|>
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
<tools>
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
</tools>
Use the following pydantic model json schema for each tool call you will make:
{'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
Function call:
chat: |
<|begin_of_text|>{{.Input }}
<|start_header_id|>assistant<|end_header_id|>
completion: |
{{.Input}}
context_size: 8192
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
- "<|eot_id|>"
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llama3-8b-instruct",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

View File

@ -1,33 +0,0 @@
backend: llama-cpp
context_size: 4096
f16: true
gpu_layers: 90
mmap: true
name: llava-1.5
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: llava-v1.5-7b-mmproj-Q8_0.gguf
parameters:
model: llava-v1.5-7b-Q4_K.gguf
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: llava-v1.5-7b-Q4_K.gguf
uri: huggingface://jartine/llava-v1.5-7B-GGUF/llava-v1.5-7b-Q4_K.gguf
- filename: llava-v1.5-7b-mmproj-Q8_0.gguf
uri: huggingface://jartine/llava-v1.5-7B-GGUF/llava-v1.5-7b-mmproj-Q8_0.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llava-1.5",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View File

@ -1,33 +0,0 @@
backend: llama-cpp
context_size: 4096
f16: true
gpu_layers: 90
mmap: true
name: llava-1.6-mistral
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: llava-v1.6-7b-mmproj-f16.gguf
parameters:
model: llava-v1.6-mistral-7b.gguf
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: llava-v1.6-mistral-7b.gguf
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q6_K.gguf
- filename: llava-v1.6-7b-mmproj-f16.gguf
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llava-1.6-mistral",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View File

@ -1,37 +0,0 @@
backend: llama-cpp
context_size: 4096
f16: true
gpu_layers: 90
mmap: true
name: llava-1.6-vicuna
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: mmproj-vicuna7b-f16.gguf
parameters:
model: vicuna-7b-q5_k.gguf
temperature: 0.2
top_k: 40
top_p: 0.95
seed: -1
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: vicuna-7b-q5_k.gguf
uri: https://huggingface.co/cmp-nct/llava-1.6-gguf/resolve/main/vicuna-7b-q5_k.gguf
- filename: mmproj-vicuna7b-f16.gguf
uri: https://huggingface.co/cmp-nct/llava-1.6-gguf/resolve/main/mmproj-vicuna7b-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llava-1.6-vicuna",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View File

@ -1,40 +0,0 @@
backend: llama-cpp
context_size: 4096
f16: true
gpu_layers: 90
mmap: true
name: llava
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: bakllava-mmproj.gguf
parameters:
model: bakllava.gguf
temperature: 0.2
top_k: 40
top_p: 0.95
seed: -1
mirostat: 2
mirostat_eta: 1.0
mirostat_tau: 1.0
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: bakllava.gguf
uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
- filename: bakllava-mmproj.gguf
uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llava",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View File

@ -1,21 +0,0 @@
name: bagel
backend: mamba
parameters:
model: "jondurbin/bagel-dpo-2.8b-v0.2"
systemPrompt: "You are a helpful, unbiased, uncensored assistant."
template:
chat_message: |
{{if eq .RoleName "assistant"}}{{.Content}}{{else}}
[INST]
{{if .SystemPrompt}}{{.SystemPrompt}}{{else if eq .RoleName "system"}}<<SYS>>{{.Content}}<</SYS>>
{{else if .Content}}{{.Content}}{{end}}
[/INST]
{{end}}
completion: |
{{.Input}}
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "bagel",
"messages": [{"role": "user", "content": "how are you doing"}],
}'

View File

@ -1,28 +0,0 @@
name: mamba-chat
backend: mamba
parameters:
model: "havenhq/mamba-chat"
trimsuffix:
- <|endoftext|>
# https://huggingface.co/HuggingFaceH4/zephyr-7b-beta/blob/main/tokenizer_config.json
# "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
template:
chat_message: |
{{if eq .RoleName "assistant"}}<|assistant|>{{else if eq .RoleName "system"}}<|system|>{{else if eq .RoleName "user"}}<|user|>{{end}}
{{if .Content}}{{.Content}}{{end}}
</s>
chat: |
{{.Input}}
<|assistant|>
completion: |
{{.Input}}
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "mamba-chat",
"messages": [{"role": "user", "content": "how are you doing"}],
"temperature": 0.7
}'

View File

@ -1,32 +0,0 @@
name: mistral-openorca
mmap: true
parameters:
model: huggingface://TheBloke/Mistral-7B-OpenOrca-GGUF/mistral-7b-openorca.Q6_K.gguf
temperature: 0.2
top_k: 40
top_p: 0.95
seed: -1
mirostat: 2
mirostat_eta: 1.0
mirostat_tau: 1.0
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
{{if .Content}}{{.Content}}{{end}}
<|im_end|>
chat: |
{{.Input}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "mistral-openorca",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

View File

@ -1,25 +0,0 @@
name: mixtral-instruct
mmap: true
parameters:
model: huggingface://TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/mixtral-8x7b-instruct-v0.1.Q2_K.gguf
temperature: 0.2
top_k: 40
seed: -1
top_p: 0.95
mirostat: 2
mirostat_eta: 1.0
mirostat_tau: 1.0
template:
chat: &chat |
[INST] {{.Input}} [/INST]
completion: *chat
context_size: 4096
f16: true
gpu_layers: 90
usage: |
curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
"model": "mixtral-instruct",
"prompt": "How are you doing?"
}'

View File

@ -1,25 +0,0 @@
name: phi-2-chat
mmap: true
parameters:
model: huggingface://l3utterfly/phi-2-layla-v1-chatml-gguf/phi-2-layla-v1-chatml-Q8_0.gguf
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
{{if .Content}}{{.Content}}{{end}}
<|im_end|>
chat: |
{{.Input}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "phi-2-chat",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

View File

@ -1,30 +0,0 @@
name: phi-2-orange
mmap: true
parameters:
model: huggingface://l3utterfly/phi-2-orange-GGUF/phi-2-orange.Q6_K.gguf
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
{{if .Content}}{{.Content}}{{end}}
<|im_end|>
chat: |
{{.Input}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
description: |
This model is a chatbot that can be used for general conversation.
[Model card](https://huggingface.co/TheBloke/phi-2-orange-GGUF)
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "phi-2-orange",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

View File

@ -1,13 +0,0 @@
name: voice-en-us-amy-low
download_files:
- filename: voice-en-us-amy-low.tar.gz
uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
usage: |
To test if this model works as expected, you can use the following curl command:
curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
"model":"en-us-amy-low.onnx",
"input": "Hi, this is a test."
}'

View File

@ -1,29 +0,0 @@
name: tinyllama-chat
mmap: true
parameters:
model: huggingface://TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF/tinyllama-1.1b-chat-v0.3.Q8_0.gguf
temperature: 0.2
top_k: 40
seed: -1
top_p: 0.95
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
{{if .Content}}{{.Content}}{{end}}<|im_end|>
chat: |
{{.Input}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
gpu_layers: 90
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "tinyllama-chat",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

View File

@ -1,31 +0,0 @@
name: tinyllama-chat
backend: transformers
type: AutoModelForCausalLM
parameters:
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
temperature: 0.2
top_k: 40
top_p: 0.95
max_tokens: 4096
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
{{if .Content}}{{.Content}}{{end}}<|im_end|>
chat: |
{{.Input}}
<|im_start|>assistant
completion: |
{{.Input}}
stopwords:
- <|im_end|>
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "tinyllama-chat",
"messages": [{"role": "user", "content": "Say this is a test!"}],
"temperature": 0.7
}'

View File

@ -1,8 +0,0 @@
usage: |
Vall-e-x works without any configuration, to test it, you can run the following curl command:
curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
"backend": "vall-e-x",
"input":"Hello, this is a test!"
}' | aplay
# TODO: This is a placeholder until we manage to pre-load HF/Transformers models

View File

@ -1,18 +0,0 @@
name: whisper
backend: whisper
parameters:
model: ggml-whisper-base.bin
usage: |
## example audio file
wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
## Send the example audio file to the transcriptions endpoint
curl http://localhost:8080/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F file="@$PWD/gb1.ogg" -F model="whisper"
download_files:
- filename: "ggml-whisper-base.bin"
sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"
uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"

23
gallery/deepseek-r1.yaml Normal file
View File

@ -0,0 +1,23 @@
---
name: "deepseek-r1"
config_file: |
context_size: 131072
mmap: true
f16: true
stopwords:
- <begin▁of▁sentence>
- <end▁of▁sentence>
- <User>
- <Assistant>
template:
chat_message: |
{{if eq .RoleName "system" -}}{{.Content }}
{{ end -}}
{{if eq .RoleName "user" -}}<User>{{.Content}}
{{end -}}
{{if eq .RoleName "assistant" -}}<Assistant>{{.Content}}<end▁of▁sentence>{{end}}
completion: |
{{.Input}}
chat: |
{{.Input -}}<Assistant>

File diff suppressed because it is too large

View File

@ -0,0 +1,49 @@
---
name: "llama3.2-fcall"
config_file: |
mmap: true
function:
json_regex_match:
- "(?s)<Output>(.*?)</Output>"
capture_llm_results:
- (?s)<Thought>(.*?)</Thought>
replace_llm_results:
- key: (?s)<Thought>(.*?)</Thought>
value: ""
grammar:
properties_order: "name,arguments"
function_arguments_key: "arguments"
template:
chat: |
<|start_header_id|>system<|end_header_id|>
You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
{{.Input }}
<|start_header_id|>assistant<|end_header_id|>
chat_message: |
<|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
{{ if .FunctionCall -}}
{{ else if eq .RoleName "tool" -}}
{{ end -}}
{{ if .Content -}}
{{.Content -}}
{{ else if .FunctionCall -}}
{{ toJson .FunctionCall -}}
{{ end -}}
<|eot_id|>
completion: |
{{.Input}}
function: |
<|start_header_id|>system<|end_header_id|>
You are an AI assistant that executes function calls, and these are the tools at your disposal:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
<|eot_id|>{{.Input}}<|start_header_id|>assistant<|end_header_id|>
context_size: 8192
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
- "<|eot_id|>"
- <|end_of_text|>

View File

@ -0,0 +1,55 @@
---
name: "llama3.2-quantized"
config_file: |
mmap: true
function:
disable_no_action: true
grammar:
disable: true
response_regex:
- \[(?P<name>\w+)\((?P<arguments>.*)\)\]
argument_regex:
- (?P<key>[^ '\(=,]+)[='"]+(?P<value>[^=,"']+)['"]?
template:
chat: |
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
{{.Input }}
<|start_header_id|>assistant<|end_header_id|>
chat_message: |
<|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
{{ if .FunctionCall -}}
{{ else if eq .RoleName "tool" -}}
The Function was executed and the response was:
{{ end -}}
{{ if .Content -}}
{{.Content -}}
{{ else if .FunctionCall -}}
{{ range .FunctionCall }}
[{{.FunctionCall.Name}}({{.FunctionCall.Arguments}})]
{{ end }}
{{ end -}}
<|eot_id|>
completion: |
{{.Input}}
function: |
<|start_header_id|>system<|end_header_id|>
You are an expert in composing functions. You are given a question and a set of possible functions.
Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
If none of the functions can be used, point it out. If the given question lacks the parameters required by the function, also point it out. You should only return the function call in tools call sections.
If you decide to invoke any of the function(s), you MUST put it in the format as follows:
[func_name1(params_name1=params_value1,params_name2=params_value2,...),func_name2(params_name1=params_value1,params_name2=params_value2,...)]
You SHOULD NOT include any other text in the response.
Here is a list of functions in JSON format that you can invoke.
{{toJson .Functions}}
<|eot_id|><|start_header_id|>user<|end_header_id|>
{{.Input}}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
context_size: 8192
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
- "<|eot_id|>"
- <|end_of_text|>

View File

@ -21,14 +21,16 @@ import (
) )
const ( const (
HuggingFacePrefix = "huggingface://" HuggingFacePrefix = "huggingface://"
OCIPrefix = "oci://" HuggingFacePrefix1 = "hf://"
OllamaPrefix = "ollama://" HuggingFacePrefix2 = "hf.co/"
HTTPPrefix = "http://" OCIPrefix = "oci://"
HTTPSPrefix = "https://" OllamaPrefix = "ollama://"
GithubURI = "github:" HTTPPrefix = "http://"
GithubURI2 = "github://" HTTPSPrefix = "https://"
LocalPrefix = "file://" GithubURI = "github:"
GithubURI2 = "github://"
LocalPrefix = "file://"
) )
type URI string type URI string
@ -127,6 +129,8 @@ func (u URI) LooksLikeURL() bool {
return strings.HasPrefix(string(u), HTTPPrefix) || return strings.HasPrefix(string(u), HTTPPrefix) ||
strings.HasPrefix(string(u), HTTPSPrefix) || strings.HasPrefix(string(u), HTTPSPrefix) ||
strings.HasPrefix(string(u), HuggingFacePrefix) || strings.HasPrefix(string(u), HuggingFacePrefix) ||
strings.HasPrefix(string(u), HuggingFacePrefix1) ||
strings.HasPrefix(string(u), HuggingFacePrefix2) ||
strings.HasPrefix(string(u), GithubURI) || strings.HasPrefix(string(u), GithubURI) ||
strings.HasPrefix(string(u), OllamaPrefix) || strings.HasPrefix(string(u), OllamaPrefix) ||
strings.HasPrefix(string(u), OCIPrefix) || strings.HasPrefix(string(u), OCIPrefix) ||
@ -170,8 +174,10 @@ func (s URI) ResolveURL() string {
projectPath := strings.Join(repoPath[2:], "/") projectPath := strings.Join(repoPath[2:], "/")
return fmt.Sprintf("https://raw.githubusercontent.com/%s/%s/%s/%s", org, project, branch, projectPath) return fmt.Sprintf("https://raw.githubusercontent.com/%s/%s/%s/%s", org, project, branch, projectPath)
case strings.HasPrefix(string(s), HuggingFacePrefix): case strings.HasPrefix(string(s), HuggingFacePrefix) || strings.HasPrefix(string(s), HuggingFacePrefix1) || strings.HasPrefix(string(s), HuggingFacePrefix2):
repository := strings.Replace(string(s), HuggingFacePrefix, "", 1) repository := strings.Replace(string(s), HuggingFacePrefix, "", 1)
repository = strings.Replace(repository, HuggingFacePrefix1, "", 1)
repository = strings.Replace(repository, HuggingFacePrefix2, "", 1)
// convert repository to a full URL. // convert repository to a full URL.
// e.g. TheBloke/Mixtral-8x7B-v0.1-GGUF/mixtral-8x7b-v0.1.Q2_K.gguf@main -> https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q2_K.gguf // e.g. TheBloke/Mixtral-8x7B-v0.1-GGUF/mixtral-8x7b-v0.1.Q2_K.gguf@main -> https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q2_K.gguf
owner := strings.Split(repository, "/")[0] owner := strings.Split(repository, "/")[0]
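As a rough usage sketch of the new aliases (an assumption: the `run` command accepts the same URI forms documented for `huggingface://`), the following invocations should all resolve to the same download URL:
```bash
# Sketch only: hf:// and hf.co/ are handled as aliases of huggingface://
local-ai run huggingface://TheBloke/Mixtral-8x7B-v0.1-GGUF/mixtral-8x7b-v0.1.Q2_K.gguf@main
local-ai run hf://TheBloke/Mixtral-8x7B-v0.1-GGUF/mixtral-8x7b-v0.1.Q2_K.gguf@main
local-ai run hf.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/mixtral-8x7b-v0.1.Q2_K.gguf@main
# each resolves to:
# https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q2_K.gguf
```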

View File

@ -5,6 +5,7 @@ import (
"errors" "errors"
"io" "io"
"regexp" "regexp"
"slices"
"strings" "strings"
"github.com/mudler/LocalAI/pkg/functions/grammars" "github.com/mudler/LocalAI/pkg/functions/grammars"
@ -46,6 +47,14 @@ type GrammarConfig struct {
// SchemaType can be configured to use a specific schema type to force the grammar // SchemaType can be configured to use a specific schema type to force the grammar
// available : json, llama3.1 // available : json, llama3.1
SchemaType string `yaml:"schema_type"` SchemaType string `yaml:"schema_type"`
GrammarTriggers []GrammarTrigger `yaml:"triggers"`
}
type GrammarTrigger struct {
// Trigger is the string that triggers the grammar
Word string `yaml:"word"`
AtStart bool `yaml:"at_start"`
} }
// FunctionsConfig is the configuration for the tool/function call. // FunctionsConfig is the configuration for the tool/function call.
@ -71,6 +80,12 @@ type FunctionsConfig struct {
// JSONRegexMatch is a regex to extract the JSON object from the response // JSONRegexMatch is a regex to extract the JSON object from the response
JSONRegexMatch []string `yaml:"json_regex_match"` JSONRegexMatch []string `yaml:"json_regex_match"`
// ArgumentRegex is a named regex to extract the arguments from the response. Use ArgumentRegexKey and ArgumentRegexValue to set the names of the named regex for key and value of the arguments.
ArgumentRegex []string `yaml:"argument_regex"`
// ArgumentRegex named regex names for key and value extractions. default: key and value
ArgumentRegexKey string `yaml:"argument_regex_key_name"` // default: key
ArgumentRegexValue string `yaml:"argument_regex_value_name"` // default: value
// ReplaceFunctionResults allow to replace strings in the results before parsing them // ReplaceFunctionResults allow to replace strings in the results before parsing them
ReplaceFunctionResults []ReplaceResult `yaml:"replace_function_results"` ReplaceFunctionResults []ReplaceResult `yaml:"replace_function_results"`
@ -310,7 +325,7 @@ func ParseFunctionCall(llmresult string, functionConfig FunctionsConfig) []FuncC
if functionName == "" { if functionName == "" {
return results return results
} }
results = append(results, FuncCallResults{Name: result[functionNameKey], Arguments: result[functionArgumentsKey]}) results = append(results, FuncCallResults{Name: result[functionNameKey], Arguments: ParseFunctionCallArgs(result[functionArgumentsKey], functionConfig)})
} }
} }
} else { } else {
@ -322,3 +337,38 @@ func ParseFunctionCall(llmresult string, functionConfig FunctionsConfig) []FuncC
return results return results
} }
func ParseFunctionCallArgs(functionArguments string, functionConfig FunctionsConfig) string {
if len(functionConfig.ArgumentRegex) == 0 {
return functionArguments
}
// We use named regexes here to extract the function argument key value pairs and convert this to valid json.
// TODO: there might be responses where an object as a value is expected/required. This is currently not handled.
args := make(map[string]string)
agrsRegexKeyName := "key"
agrsRegexValueName := "value"
if functionConfig.ArgumentRegexKey != "" {
agrsRegexKeyName = functionConfig.ArgumentRegexKey
}
if functionConfig.ArgumentRegexValue != "" {
agrsRegexValueName = functionConfig.ArgumentRegexValue
}
for _, r := range functionConfig.ArgumentRegex {
var respRegex = regexp.MustCompile(r)
var nameRange []string = respRegex.SubexpNames()
var keyIndex = slices.Index(nameRange, agrsRegexKeyName)
var valueIndex = slices.Index(nameRange, agrsRegexValueName)
matches := respRegex.FindAllStringSubmatch(functionArguments, -1)
for _, match := range matches {
args[match[keyIndex]] = match[valueIndex]
}
}
jsonBytes, _ := json.Marshal(args)
return string(jsonBytes)
}
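For context, a hedged end-to-end sketch of how a model configured with `argument_regex` (such as the `llama3.2-quantized` gallery entry above) could be exercised through the OpenAI-compatible tools API; the model name and the tool definition are illustrative assumptions:
```bash
# Hypothetical request: if the model emits [get_weather(city='Rome')],
# ParseFunctionCallArgs converts the arguments to {"city": "Rome"} for the tool call
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
  "model": "llama3.2-quantized",
  "messages": [{"role": "user", "content": "What is the weather like in Rome?"}],
  "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get the current weather for a city", "parameters": {"type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"]}}}]
}'
```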

View File

@ -43,11 +43,10 @@ var TypeAlias map[string]string = map[string]string{
var AutoDetect = os.Getenv("DISABLE_AUTODETECT") != "true" var AutoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"
const ( const (
LlamaGGML = "llama-ggml"
LLamaCPP = "llama-cpp" LLamaCPP = "llama-cpp"
LLamaCPPAVX2 = "llama-cpp-avx2" LLamaCPPAVX2 = "llama-cpp-avx2"
LLamaCPPAVX512 = "llama-cpp-avx512"
LLamaCPPAVX = "llama-cpp-avx" LLamaCPPAVX = "llama-cpp-avx"
LLamaCPPFallback = "llama-cpp-fallback" LLamaCPPFallback = "llama-cpp-fallback"
LLamaCPPCUDA = "llama-cpp-cuda" LLamaCPPCUDA = "llama-cpp-cuda"
@ -66,6 +65,18 @@ const (
LocalStoreBackend = "local-store" LocalStoreBackend = "local-store"
) )
var llamaCPPVariants = []string{
LLamaCPPAVX2,
LLamaCPPAVX512,
LLamaCPPAVX,
LLamaCPPFallback,
LLamaCPPCUDA,
LLamaCPPHipblas,
LLamaCPPSycl16,
LLamaCPPSycl32,
LLamaCPPGRPC,
}
func backendPath(assetDir, backend string) string { func backendPath(assetDir, backend string) string {
return filepath.Join(assetDir, "backend-assets", "grpc", backend) return filepath.Join(assetDir, "backend-assets", "grpc", backend)
} }
@ -107,40 +118,14 @@ ENTRY:
if AutoDetect { if AutoDetect {
// if we find the llama.cpp variants, show them of as a single backend (llama-cpp) as later we are going to pick that up // if we find the llama.cpp variants, show them of as a single backend (llama-cpp) as later we are going to pick that up
// when starting the service // when starting the service
foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas, foundSycl16, foundSycl32 := false, false, false, false, false, false, false, false foundVariants := map[string]bool{}
if _, ok := backends[LLamaCPP]; !ok { if _, ok := backends[LLamaCPP]; !ok {
for _, e := range entry { for _, e := range entry {
if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 { for _, v := range llamaCPPVariants {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPAVX2) if strings.Contains(e.Name(), v) && !foundVariants[v] {
foundLCPPAVX2 = true backends[LLamaCPP] = append(backends[LLamaCPP], v)
} foundVariants[v] = true
if strings.Contains(e.Name(), LLamaCPPAVX) && !foundLCPPAVX { }
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPAVX)
foundLCPPAVX = true
}
if strings.Contains(e.Name(), LLamaCPPFallback) && !foundLCPPFallback {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPFallback)
foundLCPPFallback = true
}
if strings.Contains(e.Name(), LLamaCPPGRPC) && !foundLCPPGRPC {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPGRPC)
foundLCPPGRPC = true
}
if strings.Contains(e.Name(), LLamaCPPCUDA) && !foundLCPPCuda {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPCUDA)
foundLCPPCuda = true
}
if strings.Contains(e.Name(), LLamaCPPHipblas) && !foundLCPPHipblas {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPHipblas)
foundLCPPHipblas = true
}
if strings.Contains(e.Name(), LLamaCPPSycl16) && !foundSycl16 {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPSycl16)
foundSycl16 = true
}
if strings.Contains(e.Name(), LLamaCPPSycl32) && !foundSycl32 {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPSycl32)
foundSycl32 = true
} }
} }
} }
@ -156,10 +141,10 @@ func orderBackends(backends map[string][]string) ([]string, error) {
// sets a priority list - first has more priority // sets a priority list - first has more priority
priorityList := []string{ priorityList := []string{
// First llama.cpp(variants) and llama-ggml to follow. // First llama.cpp(variants)
// We keep the fallback to prevent that if the llama.cpp variants // We keep the fallback to prevent that if the llama.cpp variants
// that depends on shared libs if breaks have still a safety net. // that depends on shared libs if breaks have still a safety net.
LLamaCPP, LlamaGGML, LLamaCPPFallback, LLamaCPP, LLamaCPPFallback,
} }
toTheEnd := []string{ toTheEnd := []string{
@ -283,6 +268,12 @@ func selectGRPCProcessByHostCapabilities(backend, assetDir string, f16 bool) str
log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend) log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
selectedProcess = p selectedProcess = p
} }
} else if xsysinfo.HasCPUCaps(cpuid.AVX512F) {
p := backendPath(assetDir, LLamaCPPAVX512)
if _, err := os.Stat(p); err == nil {
log.Info().Msgf("[%s] attempting to load with AVX512 variant", backend)
selectedProcess = p
}
} else if xsysinfo.HasCPUCaps(cpuid.AVX) { } else if xsysinfo.HasCPUCaps(cpuid.AVX) {
p := backendPath(assetDir, LLamaCPPAVX) p := backendPath(assetDir, LLamaCPPAVX)
if _, err := os.Stat(p); err == nil { if _, err := os.Stat(p); err == nil {

View File

@ -9,7 +9,6 @@ import (
"github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/gallery" "github.com/mudler/LocalAI/core/gallery"
"github.com/mudler/LocalAI/embedded"
"github.com/mudler/LocalAI/pkg/downloader" "github.com/mudler/LocalAI/pkg/downloader"
"github.com/mudler/LocalAI/pkg/utils" "github.com/mudler/LocalAI/pkg/utils"
"github.com/rs/zerolog/log" "github.com/rs/zerolog/log"
@ -18,42 +17,17 @@ import (
// InstallModels will preload models from the given list of URLs and galleries // InstallModels will preload models from the given list of URLs and galleries
// It will download the model if it is not already present in the model path // It will download the model if it is not already present in the model path
// It will also try to resolve if the model is an embedded model YAML configuration // It will also try to resolve if the model is an embedded model YAML configuration
func InstallModels(galleries []config.Gallery, modelLibraryURL string, modelPath string, enforceScan bool, downloadStatus func(string, string, string, float64), models ...string) error { func InstallModels(galleries []config.Gallery, modelPath string, enforceScan bool, downloadStatus func(string, string, string, float64), models ...string) error {
// create an error that groups all errors // create an error that groups all errors
var err error var err error
lib, _ := embedded.GetRemoteLibraryShorteners(modelLibraryURL, modelPath)
for _, url := range models { for _, url := range models {
// As a best effort, try to resolve the model from the remote library // As a best effort, try to resolve the model from the remote library
// if it's not resolved we try with the other method below // if it's not resolved we try with the other method below
if modelLibraryURL != "" {
if lib[url] != "" {
log.Debug().Msgf("[startup] model configuration is defined remotely: %s (%s)", url, lib[url])
url = lib[url]
}
}
url = embedded.ModelShortURL(url)
uri := downloader.URI(url) uri := downloader.URI(url)
switch { switch {
case embedded.ExistsInModelsLibrary(url):
modelYAML, e := embedded.ResolveContent(url)
// If we resolve something, just save it to disk and continue
if e != nil {
log.Error().Err(e).Msg("error resolving model content")
err = errors.Join(err, e)
continue
}
log.Debug().Msgf("[startup] resolved embedded model: %s", url)
md5Name := utils.MD5(url)
modelDefinitionFilePath := filepath.Join(modelPath, md5Name) + ".yaml"
if e := os.WriteFile(modelDefinitionFilePath, modelYAML, 0600); err != nil {
log.Error().Err(e).Str("filepath", modelDefinitionFilePath).Msg("error writing model definition")
err = errors.Join(err, e)
}
case uri.LooksLikeOCI(): case uri.LooksLikeOCI():
log.Debug().Msgf("[startup] resolved OCI model to download: %s", url) log.Debug().Msgf("[startup] resolved OCI model to download: %s", url)
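With the modelLibraryURL parameter and the embedded-library resolution removed, callers now pass only the galleries, the model path, the scan flag, the progress callback and the model references. A small usage sketch against the new signature follows; the model path and the meaning of the callback's string arguments are assumptions.

package main

import (
	"log"

	"github.com/mudler/LocalAI/core/config"
	"github.com/mudler/LocalAI/pkg/startup"
)

func main() {
	// Progress callback; the string arguments are assumed to be the file
	// name and the downloaded/total sizes, followed by a percentage.
	progress := func(fileName, current, total string, percent float64) {
		log.Printf("%s: %s/%s (%.1f%%)", fileName, current, total, percent)
	}

	// No modelLibraryURL argument anymore: models are resolved from
	// galleries, OCI references or plain URLs.
	err := startup.InstallModels(
		[]config.Gallery{}, // no galleries configured in this sketch
		"/tmp/models",      // model path (assumption)
		false,              // enforceScan disabled
		progress,
		"https://raw.githubusercontent.com/mudler/LocalAI-examples/main/configurations/phi-2.yaml",
	)
	if err != nil {
		log.Fatal(err)
	}
}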
View File
@ -7,7 +7,6 @@ import (
"github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/config"
. "github.com/mudler/LocalAI/pkg/startup" . "github.com/mudler/LocalAI/pkg/startup"
"github.com/mudler/LocalAI/pkg/utils"
. "github.com/onsi/ginkgo/v2" . "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega" . "github.com/onsi/gomega"
@ -16,29 +15,13 @@ import (
var _ = Describe("Preload test", func() { var _ = Describe("Preload test", func() {
Context("Preloading from strings", func() { Context("Preloading from strings", func() {
It("loads from remote url", func() {
tmpdir, err := os.MkdirTemp("", "")
Expect(err).ToNot(HaveOccurred())
libraryURL := "https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/model_library.yaml"
fileName := fmt.Sprintf("%s.yaml", "phi-2")
InstallModels([]config.Gallery{}, libraryURL, tmpdir, true, nil, "phi-2")
resultFile := filepath.Join(tmpdir, fileName)
content, err := os.ReadFile(resultFile)
Expect(err).ToNot(HaveOccurred())
Expect(string(content)).To(ContainSubstring("name: phi-2"))
})
It("loads from embedded full-urls", func() { It("loads from embedded full-urls", func() {
tmpdir, err := os.MkdirTemp("", "") tmpdir, err := os.MkdirTemp("", "")
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
url := "https://raw.githubusercontent.com/mudler/LocalAI-examples/main/configurations/phi-2.yaml" url := "https://raw.githubusercontent.com/mudler/LocalAI-examples/main/configurations/phi-2.yaml"
fileName := fmt.Sprintf("%s.yaml", "phi-2") fileName := fmt.Sprintf("%s.yaml", "phi-2")
InstallModels([]config.Gallery{}, "", tmpdir, true, nil, url) InstallModels([]config.Gallery{}, tmpdir, true, nil, url)
resultFile := filepath.Join(tmpdir, fileName) resultFile := filepath.Join(tmpdir, fileName)
@ -47,45 +30,13 @@ var _ = Describe("Preload test", func() {
Expect(string(content)).To(ContainSubstring("name: phi-2")) Expect(string(content)).To(ContainSubstring("name: phi-2"))
}) })
It("loads from embedded short-urls", func() {
tmpdir, err := os.MkdirTemp("", "")
Expect(err).ToNot(HaveOccurred())
url := "phi-2"
InstallModels([]config.Gallery{}, "", tmpdir, true, nil, url)
entry, err := os.ReadDir(tmpdir)
Expect(err).ToNot(HaveOccurred())
Expect(entry).To(HaveLen(1))
resultFile := entry[0].Name()
content, err := os.ReadFile(filepath.Join(tmpdir, resultFile))
Expect(err).ToNot(HaveOccurred())
Expect(string(content)).To(ContainSubstring("name: phi-2"))
})
It("loads from embedded models", func() {
tmpdir, err := os.MkdirTemp("", "")
Expect(err).ToNot(HaveOccurred())
url := "mistral-openorca"
fileName := fmt.Sprintf("%s.yaml", utils.MD5(url))
InstallModels([]config.Gallery{}, "", tmpdir, true, nil, url)
resultFile := filepath.Join(tmpdir, fileName)
content, err := os.ReadFile(resultFile)
Expect(err).ToNot(HaveOccurred())
Expect(string(content)).To(ContainSubstring("name: mistral-openorca"))
})
It("downloads from urls", func() { It("downloads from urls", func() {
tmpdir, err := os.MkdirTemp("", "") tmpdir, err := os.MkdirTemp("", "")
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
url := "huggingface://TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF/tinyllama-1.1b-chat-v0.3.Q2_K.gguf" url := "huggingface://TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF/tinyllama-1.1b-chat-v0.3.Q2_K.gguf"
fileName := fmt.Sprintf("%s.gguf", "tinyllama-1.1b-chat-v0.3.Q2_K") fileName := fmt.Sprintf("%s.gguf", "tinyllama-1.1b-chat-v0.3.Q2_K")
err = InstallModels([]config.Gallery{}, "", tmpdir, false, nil, url) err = InstallModels([]config.Gallery{}, tmpdir, false, nil, url)
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
resultFile := filepath.Join(tmpdir, fileName) resultFile := filepath.Join(tmpdir, fileName)
View File
@ -765,6 +765,17 @@ const docTemplate = `{
"/v1/tokenize": { "/v1/tokenize": {
"post": { "post": {
"summary": "Tokenize the input.", "summary": "Tokenize the input.",
"parameters": [
{
"description": "Request",
"name": "request",
"in": "body",
"required": true,
"schema": {
"$ref": "#/definitions/schema.TokenizeRequest"
}
}
],
"responses": { "responses": {
"200": { "200": {
"description": "Response", "description": "Response",
@ -1838,6 +1849,17 @@ const docTemplate = `{
} }
} }
}, },
"schema.TokenizeRequest": {
"type": "object",
"properties": {
"content": {
"type": "string"
},
"model": {
"type": "string"
}
}
},
"schema.TokenizeResponse": { "schema.TokenizeResponse": {
"type": "object", "type": "object",
"properties": { "properties": {
View File
@ -758,6 +758,17 @@
"/v1/tokenize": { "/v1/tokenize": {
"post": { "post": {
"summary": "Tokenize the input.", "summary": "Tokenize the input.",
"parameters": [
{
"description": "Request",
"name": "request",
"in": "body",
"required": true,
"schema": {
"$ref": "#/definitions/schema.TokenizeRequest"
}
}
],
"responses": { "responses": {
"200": { "200": {
"description": "Response", "description": "Response",
@ -1831,6 +1842,17 @@
} }
} }
}, },
"schema.TokenizeRequest": {
"type": "object",
"properties": {
"content": {
"type": "string"
},
"model": {
"type": "string"
}
}
},
"schema.TokenizeResponse": { "schema.TokenizeResponse": {
"type": "object", "type": "object",
"properties": { "properties": {
View File
@ -705,6 +705,13 @@ definitions:
description: voice audio file or speaker id description: voice audio file or speaker id
type: string type: string
type: object type: object
schema.TokenizeRequest:
properties:
content:
type: string
model:
type: string
type: object
schema.TokenizeResponse: schema.TokenizeResponse:
properties: properties:
tokens: tokens:
@ -1216,6 +1223,13 @@ paths:
summary: Get TokenMetrics for Active Slot. summary: Get TokenMetrics for Active Slot.
/v1/tokenize: /v1/tokenize:
post: post:
parameters:
- description: Request
in: body
name: request
required: true
schema:
$ref: '#/definitions/schema.TokenizeRequest'
responses: responses:
"200": "200":
description: Response description: Response
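The three generated files above (docs.go, swagger.json and swagger.yaml) all document the same addition: /v1/tokenize now declares a request body with content and model fields. A minimal client sketch follows; the base URL, the model name and the element type of the tokens array in the response are assumptions.

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

// Request and response shapes copied from the schema definitions above;
// the element type of "tokens" is an assumption (integers are typical).
type tokenizeRequest struct {
	Content string `json:"content"`
	Model   string `json:"model"`
}

type tokenizeResponse struct {
	Tokens []int `json:"tokens"`
}

func main() {
	body, _ := json.Marshal(tokenizeRequest{
		Content: "Hello, LocalAI!",
		Model:   "phi-2", // any installed model name (assumption)
	})

	// Base URL is an assumption for a locally running instance.
	resp, err := http.Post("http://localhost:8080/v1/tokenize", "application/json", bytes.NewReader(body))
	if err != nil {
		fmt.Println("request failed:", err)
		return
	}
	defer resp.Body.Close()

	var out tokenizeResponse
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		fmt.Println("decode failed:", err)
		return
	}
	fmt.Println("token count:", len(out.Tokens))
}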