feat: bump llama.cpp, add gguf support (#943)

**Description**

This PR syncs up the `llama` backend to use `gguf`
(https://github.com/go-skynet/go-llama.cpp/pull/180). It also adds
`llama-stable` to the build targets so we can still load ggml models. It
adapts the current tests to use the `llama-stable` backend for ggml and
uses a `gguf` model to run tests on the new backend.

In order to consume the new version of go-llama.cpp, it also bumps Go to
1.21 (images, pipelines, etc.).
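
For context on why two backends are workable at all: gguf files are self-identifying by their leading magic bytes. The sketch below is illustrative only — in this PR the backend is chosen via the model config's `backend` field, and `backendFor` is a hypothetical helper — and it assumes anything without the ASCII `GGUF` magic is a legacy ggml-family file:

```go
package main

import (
	"fmt"
	"io"
	"os"
)

// backendFor is a hypothetical helper: gguf files begin with the ASCII
// magic "GGUF"; anything else is assumed here to be a legacy ggml-family
// file that the llama-stable backend should load.
func backendFor(path string) (string, error) {
	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer f.Close()

	magic := make([]byte, 4)
	if _, err := io.ReadFull(f, magic); err != nil {
		return "", err
	}
	if string(magic) == "GGUF" {
		return "llama", nil // new gguf-capable backend
	}
	return "llama-stable", nil // previous ggml loader
}

func main() {
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "usage: backendfor <model-file>")
		os.Exit(1)
	}
	backend, err := backendFor(os.Args[1])
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Println("suggested backend:", backend)
}
```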

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
7 changed files with 89 additions and 16 deletions


@@ -22,6 +22,9 @@ jobs:
         uses: actions/checkout@v3
         with:
           submodules: true
+      - uses: actions/setup-go@v4
+        with:
+          go-version: '>=1.21.0'
       - name: Dependencies
         run: |
           sudo apt-get update
@@ -60,6 +63,9 @@ jobs:
         uses: actions/checkout@v3
         with:
           submodules: true
+      - uses: actions/setup-go@v4
+        with:
+          go-version: '>=1.21.0'
       - name: Build
         id: build
         env:


@@ -18,7 +18,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        go-version: ['1.20.x', 'stable']
+        go-version: ['1.21.x']
     steps:
       - name: Clone
         uses: actions/checkout@v3
@@ -63,7 +63,7 @@ jobs:
     runs-on: macOS-latest
     strategy:
       matrix:
-        go-version: ['1.20.x', 'stable']
+        go-version: ['1.21.x']
     steps:
       - name: Clone
         uses: actions/checkout@v3


@@ -1,4 +1,4 @@
-ARG GO_VERSION=1.20-bullseye
+ARG GO_VERSION=1.21-bullseye
 
 FROM golang:$GO_VERSION as requirements


@@ -4,7 +4,7 @@ GOVET=$(GOCMD) vet
 BINARY_NAME=local-ai
 
 # llama.cpp versions
-GOLLAMA_VERSION?=f03869d188b72c8a617bea3a36cf8eb43f73445c
+GOLLAMA_VERSION?=0ef04cde78e5da41de234832d73bb768ced709e7
 GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
@@ -103,7 +103,7 @@ ifeq ($(findstring tts,$(GO_TAGS)),tts)
 	OPTIONAL_GRPC+=backend-assets/grpc/piper
 endif
 
-GRPC_BACKENDS?=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+GRPC_BACKENDS?=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
 
 .PHONY: all test build vendor
@@ -302,9 +302,10 @@ test: prepare test-models/testmodel grpcs
 	export GO_TAGS="tts stablediffusion"
 	$(MAKE) prepare-test
 	HUGGINGFACE_GRPC=$(abspath ./)/extra/grpc/huggingface/huggingface.py TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-		$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama" --flake-attempts 5 -v -r ./api ./pkg
+		$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf" --flake-attempts 5 -v -r ./api ./pkg
 	$(MAKE) test-gpt4all
 	$(MAKE) test-llama
+	$(MAKE) test-llama-gguf
 	$(MAKE) test-tts
 	$(MAKE) test-stablediffusion
@@ -316,6 +317,10 @@ test-llama: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 		$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r ./api ./pkg
 
+test-llama-gguf: prepare-test
+	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+		$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r ./api ./pkg
+
 test-tts: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 		$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r ./api ./pkg
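
The target split above works because Ginkgo v2 selects specs by label at run time. A minimal, self-contained sketch (suite and spec names are made up for illustration) of how a `Label("llama-gguf")` spec is included or excluded by `--label-filter`:

```go
package sample_test

import (
	"testing"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
)

func TestSample(t *testing.T) {
	RegisterFailHandler(Fail)
	RunSpecs(t, "Sample Suite")
}

var _ = Describe("backend smoke tests", func() {
	// Runs only under --label-filter="llama-gguf"; the negated filter
	// in the main `test` target above skips it.
	It("loads a gguf model", Label("llama-gguf"), func() {
		Expect(true).To(BeTrue())
	})

	// Unlabeled: matched by --label-filter="!gpt4all && !llama && !llama-gguf".
	It("always runs", func() {
		Expect(1 + 1).To(Equal(2))
	})
})
```

`make test-llama-gguf` then simply invokes ginkgo with `--label-filter="llama-gguf"`, so only the gguf specs are exercised.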


@@ -296,7 +296,7 @@ var _ = Describe("API test", func() {
 			response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
 				URL:       "github:go-skynet/model-gallery/openllama_3b.yaml",
 				Name:      "openllama_3b",
-				Overrides: map[string]interface{}{"backend": "llama", "mmap": true, "f16": true, "context_size": 128},
+				Overrides: map[string]interface{}{"backend": "llama-stable", "mmap": true, "f16": true, "context_size": 128},
 			})
 			Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
@@ -359,6 +359,76 @@ var _ = Describe("API test", func() {
 			Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
 		})
 
+		It("runs openllama gguf", Label("llama-gguf"), func() {
+			if runtime.GOOS != "linux" {
+				Skip("test supported only on linux")
+			}
+			response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
+				URL:       "github:go-skynet/model-gallery/openllama-3b-gguf.yaml",
+				Name:      "openllama_3b_gguf",
+				Overrides: map[string]interface{}{"backend": "llama", "mmap": true, "f16": true, "context_size": 128},
+			})
+			Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
+
+			uuid := response["uuid"].(string)
+
+			Eventually(func() bool {
+				response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
+				return response["processed"].(bool)
+			}, "360s", "10s").Should(Equal(true))
+
+			By("testing completion")
+			resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "openllama_3b_gguf", Prompt: "Count up to five: one, two, three, four, "})
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(resp.Choices)).To(Equal(1))
+			Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
+
+			By("testing functions")
+			resp2, err := client.CreateChatCompletion(
+				context.TODO(),
+				openai.ChatCompletionRequest{
+					Model: "openllama_3b_gguf",
+					Messages: []openai.ChatCompletionMessage{
+						{
+							Role:    "user",
+							Content: "What is the weather like in San Francisco (celsius)?",
+						},
+					},
+					Functions: []openai.FunctionDefinition{
+						openai.FunctionDefinition{
+							Name:        "get_current_weather",
+							Description: "Get the current weather",
+							Parameters: jsonschema.Definition{
+								Type: jsonschema.Object,
+								Properties: map[string]jsonschema.Definition{
+									"location": {
+										Type:        jsonschema.String,
+										Description: "The city and state, e.g. San Francisco, CA",
+									},
+									"unit": {
+										Type: jsonschema.String,
+										Enum: []string{"celcius", "fahrenheit"},
+									},
+								},
+								Required: []string{"location"},
+							},
+						},
+					},
+				})
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(resp2.Choices)).To(Equal(1))
+			Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
+			Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
+
+			var res map[string]string
+			err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(res["location"]).To(Equal("San Francisco, California"), fmt.Sprint(res))
+			Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
+			Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
+		})
+
 		It("runs gpt4all", Label("gpt4all"), func() {
 			if runtime.GOOS != "linux" {
 				Skip("test supported only on linux")

go.mod

@@ -1,6 +1,6 @@
 module github.com/go-skynet/LocalAI
 
-go 1.20
+go 1.21
 
 require (
 	github.com/donomii/go-rwkv.cpp v0.0.0-20230715075832-c898cd0f62df


@@ -32,14 +32,6 @@ func (llm *LLM) Load(opts *pb.ModelOptions) error {
 		llama.WithRopeFreqScale(ropeFreqScale),
 	}
 
-	if opts.NGQA != 0 {
-		llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
-	}
-
-	if opts.RMSNormEps != 0 {
-		llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
-	}
-
 	if opts.ContextSize != 0 {
 		llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
 	}
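
The removed options are not lost functionality: gguf carries hyperparameters such as the grouped-query-attention head counts and the RMS-norm epsilon in the model file's metadata, so callers no longer need to pass them at load time. For readers unfamiliar with the pattern used above, here is a self-contained sketch of the conditional functional-options style (the types are stand-ins, not go-llama.cpp's):

```go
package main

import "fmt"

// config is a stand-in for the backend's model options.
type config struct{ contextSize int }

type option func(*config)

func setContext(n int) option {
	return func(c *config) { c.contextSize = n }
}

func load(path string, opts ...option) config {
	c := config{contextSize: 512} // stand-in default; real gguf metadata would seed values like this
	for _, o := range opts {
		o(&c)
	}
	fmt.Printf("loading %s with context %d\n", path, c.contextSize)
	return c
}

func main() {
	requested := 128
	var opts []option
	if requested != 0 { // mirror the backend's "only override when set" guard
		opts = append(opts, setContext(requested))
	}
	load("model.gguf", opts...)
}
```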