feat: bump llama.cpp, add gguf support (#943)

**Description**

This PR syncs up the `llama` backend to use `gguf`
(https://github.com/go-skynet/go-llama.cpp/pull/180). It also adds
`llama-stable` to the build targets so we can still load ggml models. It
adapts the current tests to use the `llama-stable` backend for ggml and
uses a `gguf` model to run tests on the new backend.

In order to consume the new version of go-llama.cpp, it also bumps Go to
1.21 (images, pipelines, etc.).
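
For context on why two backends are workable at all: gguf files are self-identifying by their leading magic bytes. The sketch below is illustrative only — in this PR the backend is chosen via the model config's `backend` field, and `backendFor` is a hypothetical helper — and it assumes anything without the ASCII `GGUF` magic is a legacy ggml-family file:

```go
package main

import (
	"fmt"
	"io"
	"os"
)

// backendFor is a hypothetical helper: gguf files begin with the ASCII
// magic "GGUF"; anything else is assumed here to be a legacy ggml-family
// file that the llama-stable backend should load.
func backendFor(path string) (string, error) {
	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer f.Close()

	magic := make([]byte, 4)
	if _, err := io.ReadFull(f, magic); err != nil {
		return "", err
	}
	if string(magic) == "GGUF" {
		return "llama", nil // new gguf-capable backend
	}
	return "llama-stable", nil // previous ggml loader
}

func main() {
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "usage: backendfor <model-file>")
		os.Exit(1)
	}
	backend, err := backendFor(os.Args[1])
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Println("suggested backend:", backend)
}
```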

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
7 changed files with 89 additions and 16 deletions


@@ -22,6 +22,9 @@ jobs:
         uses: actions/checkout@v3
         with:
           submodules: true
+      - uses: actions/setup-go@v4
+        with:
+          go-version: '>=1.21.0'
       - name: Dependencies
         run: |
           sudo apt-get update
@@ -60,6 +63,9 @@ jobs:
         uses: actions/checkout@v3
         with:
           submodules: true
+      - uses: actions/setup-go@v4
+        with:
+          go-version: '>=1.21.0'
       - name: Build
         id: build
         env:


@@ -18,7 +18,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        go-version: ['1.20.x', 'stable']
+        go-version: ['1.21.x']
     steps:
       - name: Clone
         uses: actions/checkout@v3
@@ -63,7 +63,7 @@ jobs:
     runs-on: macOS-latest
     strategy:
       matrix:
-        go-version: ['1.20.x', 'stable']
+        go-version: ['1.21.x']
     steps:
       - name: Clone
         uses: actions/checkout@v3


@@ -1,4 +1,4 @@
-ARG GO_VERSION=1.20-bullseye
+ARG GO_VERSION=1.21-bullseye
 
 FROM golang:$GO_VERSION as requirements


@@ -4,7 +4,7 @@ GOVET=$(GOCMD) vet
 BINARY_NAME=local-ai
 
 # llama.cpp versions
-GOLLAMA_VERSION?=f03869d188b72c8a617bea3a36cf8eb43f73445c
+GOLLAMA_VERSION?=0ef04cde78e5da41de234832d73bb768ced709e7
 GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
@@ -103,7 +103,7 @@ ifeq ($(findstring tts,$(GO_TAGS)),tts)
 	OPTIONAL_GRPC+=backend-assets/grpc/piper
 endif
 
-GRPC_BACKENDS?=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+GRPC_BACKENDS?=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
 
 .PHONY: all test build vendor
@@ -302,9 +302,10 @@ test: prepare test-models/testmodel grpcs
 	export GO_TAGS="tts stablediffusion"
 	$(MAKE) prepare-test
 	HUGGINGFACE_GRPC=$(abspath ./)/extra/grpc/huggingface/huggingface.py TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-		$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama" --flake-attempts 5 -v -r ./api ./pkg
+		$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf" --flake-attempts 5 -v -r ./api ./pkg
 	$(MAKE) test-gpt4all
 	$(MAKE) test-llama
+	$(MAKE) test-llama-gguf
 	$(MAKE) test-tts
 	$(MAKE) test-stablediffusion
@@ -316,6 +317,10 @@ test-llama: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 		$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r ./api ./pkg
 
+test-llama-gguf: prepare-test
+	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+		$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r ./api ./pkg
+
 test-tts: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 		$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r ./api ./pkg
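
The target split above works because Ginkgo v2 selects specs by label at run time. A minimal, self-contained sketch (suite and spec names are made up for illustration) of how a `Label("llama-gguf")` spec is included or excluded by `--label-filter`:

```go
package sample_test

import (
	"testing"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
)

func TestSample(t *testing.T) {
	RegisterFailHandler(Fail)
	RunSpecs(t, "Sample Suite")
}

var _ = Describe("backend smoke tests", func() {
	// Runs only under --label-filter="llama-gguf"; the negated filter
	// in the main `test` target above skips it.
	It("loads a gguf model", Label("llama-gguf"), func() {
		Expect(true).To(BeTrue())
	})

	// Unlabeled: matched by --label-filter="!gpt4all && !llama && !llama-gguf".
	It("always runs", func() {
		Expect(1 + 1).To(Equal(2))
	})
})
```

`make test-llama-gguf` then simply invokes ginkgo with `--label-filter="llama-gguf"`, so only the gguf specs are exercised.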


@@ -296,7 +296,7 @@ var _ = Describe("API test", func() {
 			response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
 				URL:       "github:go-skynet/model-gallery/openllama_3b.yaml",
 				Name:      "openllama_3b",
-				Overrides: map[string]interface{}{"backend": "llama", "mmap": true, "f16": true, "context_size": 128},
+				Overrides: map[string]interface{}{"backend": "llama-stable", "mmap": true, "f16": true, "context_size": 128},
 			})
 			Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
@@ -359,6 +359,76 @@ var _ = Describe("API test", func() {
 			Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
 		})
 
+		It("runs openllama gguf", Label("llama-gguf"), func() {
+			if runtime.GOOS != "linux" {
+				Skip("test supported only on linux")
+			}
+			response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
+				URL:       "github:go-skynet/model-gallery/openllama-3b-gguf.yaml",
+				Name:      "openllama_3b_gguf",
+				Overrides: map[string]interface{}{"backend": "llama", "mmap": true, "f16": true, "context_size": 128},
+			})
+			Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
+
+			uuid := response["uuid"].(string)
+
+			Eventually(func() bool {
+				response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
+				return response["processed"].(bool)
+			}, "360s", "10s").Should(Equal(true))
+
+			By("testing completion")
+			resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "openllama_3b_gguf", Prompt: "Count up to five: one, two, three, four, "})
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(resp.Choices)).To(Equal(1))
+			Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
+
+			By("testing functions")
+			resp2, err := client.CreateChatCompletion(
+				context.TODO(),
+				openai.ChatCompletionRequest{
+					Model: "openllama_3b_gguf",
+					Messages: []openai.ChatCompletionMessage{
+						{
+							Role:    "user",
+							Content: "What is the weather like in San Francisco (celsius)?",
+						},
+					},
+					Functions: []openai.FunctionDefinition{
+						openai.FunctionDefinition{
+							Name:        "get_current_weather",
+							Description: "Get the current weather",
+							Parameters: jsonschema.Definition{
+								Type: jsonschema.Object,
+								Properties: map[string]jsonschema.Definition{
+									"location": {
+										Type:        jsonschema.String,
+										Description: "The city and state, e.g. San Francisco, CA",
+									},
+									"unit": {
+										Type: jsonschema.String,
+										Enum: []string{"celcius", "fahrenheit"},
+									},
+								},
+								Required: []string{"location"},
+							},
+						},
+					},
+				})
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(resp2.Choices)).To(Equal(1))
+			Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
+			Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
+
+			var res map[string]string
+			err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(res["location"]).To(Equal("San Francisco, California"), fmt.Sprint(res))
+			Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
+			Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
+		})
+
 		It("runs gpt4all", Label("gpt4all"), func() {
 			if runtime.GOOS != "linux" {
 				Skip("test supported only on linux")

go.mod

@@ -1,6 +1,6 @@
 module github.com/go-skynet/LocalAI
 
-go 1.20
+go 1.21
 
 require (
 	github.com/donomii/go-rwkv.cpp v0.0.0-20230715075832-c898cd0f62df


@@ -32,14 +32,6 @@ func (llm *LLM) Load(opts *pb.ModelOptions) error {
 		llama.WithRopeFreqScale(ropeFreqScale),
 	}
 
-	if opts.NGQA != 0 {
-		llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
-	}
-
-	if opts.RMSNormEps != 0 {
-		llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
-	}
-
 	if opts.ContextSize != 0 {
 		llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
 	}
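
The removed options are not lost functionality: gguf carries hyperparameters such as the grouped-query-attention head counts and the RMS-norm epsilon in the model file's metadata, so callers no longer need to pass them at load time. For readers unfamiliar with the pattern used above, here is a self-contained sketch of the conditional functional-options style (the types are stand-ins, not go-llama.cpp's):

```go
package main

import "fmt"

// config is a stand-in for the backend's model options.
type config struct{ contextSize int }

type option func(*config)

func setContext(n int) option {
	return func(c *config) { c.contextSize = n }
}

func load(path string, opts ...option) config {
	c := config{contextSize: 512} // stand-in default; real gguf metadata would seed values like this
	for _, o := range opts {
		o(&c)
	}
	fmt.Printf("loading %s with context %d\n", path, c.contextSize)
	return c
}

func main() {
	requested := 128
	var opts []option
	if requested != 0 { // mirror the backend's "only override when set" guard
		opts = append(opts, setContext(requested))
	}
	load("model.gguf", opts...)
}
```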