feat: bump llama.cpp, add gguf support (#943)

**Description** This PR syncs up the `llama` backend to use `gguf` (https://github.com/go-skynet/go-llama.cpp/pull/180). It also adds `llama-stable` to the targets so we can still load ggml models. It adapts the current tests to use the `llama-stable` backend for ggml and uses a `gguf` model to run tests on the new backend. In order to consume the new version of go-llama.cpp, it also bumps Go to 1.21 (images, pipelines, etc.). --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-31 22:40:45 +00:00 · 2023-08-24 01:18:58 +02:00 · 2023-08-24 01:18:58 +02:00 · 1120847f72
commit 1120847f72
parent 704323b805
7 changed files with 89 additions and 16 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@ -22,6 +22,9 @@ jobs:
        uses: actions/checkout@v3
        with:
          submodules: true
+      - uses: actions/setup-go@v4
+        with:
+          go-version: '>=1.21.0'
      - name: Dependencies
        run: |
          sudo apt-get update
@ -60,6 +63,9 @@ jobs:
        uses: actions/checkout@v3
        with:
          submodules: true
+      - uses: actions/setup-go@v4
+        with:
+          go-version: '>=1.21.0'
      - name: Build
        id: build
        env:
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -18,7 +18,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        go-version: ['1.20.x', 'stable']
+        go-version: ['1.21.x']
    steps:
      - name: Clone
        uses: actions/checkout@v3
@ -63,7 +63,7 @@ jobs:
    runs-on: macOS-latest
    strategy:
      matrix:
-        go-version: ['1.20.x', 'stable']
+        go-version: ['1.21.x']
    steps:
      - name: Clone
        uses: actions/checkout@v3
--- a/Dockerfile
+++ b/Dockerfile
@ -1,4 +1,4 @@
-ARG GO_VERSION=1.20-bullseye
+ARG GO_VERSION=1.21-bullseye

 FROM golang:$GO_VERSION as requirements

--- a/Makefile
+++ b/Makefile
@ -4,7 +4,7 @@ GOVET=$(GOCMD) vet
 BINARY_NAME=local-ai

 # llama.cpp versions
-GOLLAMA_VERSION?=f03869d188b72c8a617bea3a36cf8eb43f73445c
+GOLLAMA_VERSION?=0ef04cde78e5da41de234832d73bb768ced709e7

 GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7

@ -103,7 +103,7 @@ ifeq ($(findstring tts,$(GO_TAGS)),tts)
 	OPTIONAL_GRPC+=backend-assets/grpc/piper
 endif

-GRPC_BACKENDS?=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+GRPC_BACKENDS?=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)

 .PHONY: all test build vendor

@ -302,9 +302,10 @@ test: prepare test-models/testmodel grpcs
 	export GO_TAGS="tts stablediffusion"
 	$(MAKE) prepare-test
 	HUGGINGFACE_GRPC=$(abspath ./)/extra/grpc/huggingface/huggingface.py TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama" --flake-attempts 5 -v -r ./api ./pkg
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf" --flake-attempts 5 -v -r ./api ./pkg
 	$(MAKE) test-gpt4all
 	$(MAKE) test-llama
+	$(MAKE) test-llama-gguf
 	$(MAKE) test-tts
 	$(MAKE) test-stablediffusion

@ -316,6 +317,10 @@ test-llama: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r ./api ./pkg

+test-llama-gguf: prepare-test
+	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r ./api ./pkg
+
 test-tts: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r ./api ./pkg
--- a/api/api_test.go
+++ b/api/api_test.go
@ -296,7 +296,7 @@ var _ = Describe("API test", func() {
 				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
 					URL:       "github:go-skynet/model-gallery/openllama_3b.yaml",
 					Name:      "openllama_3b",
-					Overrides: map[string]interface{}{"backend": "llama", "mmap": true, "f16": true, "context_size": 128},
+					Overrides: map[string]interface{}{"backend": "llama-stable", "mmap": true, "f16": true, "context_size": 128},
 				})

 				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
@ -359,6 +359,76 @@ var _ = Describe("API test", func() {
 				Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
 			})

+			It("runs openllama gguf", Label("llama-gguf"), func() {
+				if runtime.GOOS != "linux" {
+					Skip("test supported only on linux")
+				}
+				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
+					URL:       "github:go-skynet/model-gallery/openllama-3b-gguf.yaml",
+					Name:      "openllama_3b_gguf",
+					Overrides: map[string]interface{}{"backend": "llama", "mmap": true, "f16": true, "context_size": 128},
+				})
+
+				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
+
+				uuid := response["uuid"].(string)
+
+				Eventually(func() bool {
+					response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
+					return response["processed"].(bool)
+				}, "360s", "10s").Should(Equal(true))
+
+				By("testing completion")
+				resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "openllama_3b_gguf", Prompt: "Count up to five: one, two, three, four, "})
+				Expect(err).ToNot(HaveOccurred())
+				Expect(len(resp.Choices)).To(Equal(1))
+				Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
+
+				By("testing functions")
+				resp2, err := client.CreateChatCompletion(
+					context.TODO(),
+					openai.ChatCompletionRequest{
+						Model: "openllama_3b_gguf",
+						Messages: []openai.ChatCompletionMessage{
+							{
+								Role:    "user",
+								Content: "What is the weather like in San Francisco (celsius)?",
+							},
+						},
+						Functions: []openai.FunctionDefinition{
+							openai.FunctionDefinition{
+								Name:        "get_current_weather",
+								Description: "Get the current weather",
+								Parameters: jsonschema.Definition{
+									Type: jsonschema.Object,
+									Properties: map[string]jsonschema.Definition{
+										"location": {
+											Type:        jsonschema.String,
+											Description: "The city and state, e.g. San Francisco, CA",
+										},
+										"unit": {
+											Type: jsonschema.String,
+											Enum: []string{"celcius", "fahrenheit"},
+										},
+									},
+									Required: []string{"location"},
+								},
+							},
+						},
+					})
+				Expect(err).ToNot(HaveOccurred())
+				Expect(len(resp2.Choices)).To(Equal(1))
+				Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
+				Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
+
+				var res map[string]string
+				err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
+				Expect(err).ToNot(HaveOccurred())
+				Expect(res["location"]).To(Equal("San Francisco, California"), fmt.Sprint(res))
+				Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
+				Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
+			})
+
 			It("runs gpt4all", Label("gpt4all"), func() {
 				if runtime.GOOS != "linux" {
 					Skip("test supported only on linux")
--- a/go.mod
+++ b/go.mod
@ -1,6 +1,6 @@
 module github.com/go-skynet/LocalAI

-go 1.20
+go 1.21

 require (
 	github.com/donomii/go-rwkv.cpp v0.0.0-20230715075832-c898cd0f62df
--- a/pkg/backend/llm/llama/llama.go
+++ b/pkg/backend/llm/llama/llama.go
@ -32,14 +32,6 @@ func (llm *LLM) Load(opts *pb.ModelOptions) error {
 		llama.WithRopeFreqScale(ropeFreqScale),
 	}

-	if opts.NGQA != 0 {
-		llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
-	}
-
-	if opts.RMSNormEps != 0 {
-		llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
-	}
-
 	if opts.ContextSize != 0 {
 		llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
 	}