diff --git a/Makefile b/Makefile
index 7edb6f6a..790c6e6d 100644
--- a/Makefile
+++ b/Makefile
@@ -6,8 +6,6 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true
 
 # llama.cpp versions
-GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
-GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
 CPPLLAMA_VERSION?=d774ab3acc4fee41fbed6dbfc192b57d5f79f34b
 
 # whisper.cpp version
@@ -151,7 +149,6 @@ ifeq ($(BUILD_TYPE),hipblas)
 	LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
 	export CXX=$(ROCM_HOME)/llvm/bin/clang++
 	export CC=$(ROCM_HOME)/llvm/bin/clang
-	# llama-ggml has no hipblas support, so override it here.
 	export STABLE_BUILD_TYPE=
 	export GGML_HIP=1
 	GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
@@ -188,7 +185,6 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
-ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
 ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
 ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
@@ -222,19 +218,6 @@ endif
 
 all: help
 
-## go-llama.cpp
-sources/go-llama.cpp:
-	mkdir -p sources/go-llama.cpp
-	cd sources/go-llama.cpp && \
-	git init && \
-	git remote add origin $(GOLLAMA_REPO) && \
-	git fetch origin && \
-	git checkout $(GOLLAMA_VERSION) && \
-	git submodule update --init --recursive --depth 1 --single-branch
-
-sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
-	$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
-
 ## bark.cpp
 sources/bark.cpp:
 	git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
@@ -310,19 +293,17 @@ sources/whisper.cpp:
 sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
 	cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
 
-get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
+get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
 
 replace:
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
 	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
 
 dropreplace:
 	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
 	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
 	$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
-	$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
 
 prepare-sources: get-sources replace
 	$(GOCMD) mod download
@@ -330,7 +311,6 @@ prepare-sources: get-sources replace
 ## GENERIC
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
-	$(MAKE) -C sources/go-llama.cpp clean
 	$(MAKE) -C sources/whisper.cpp clean
 	$(MAKE) -C sources/go-piper clean
 	$(MAKE) build
@@ -434,7 +414,7 @@ run: prepare ## run local-ai
 test-models/testmodel.ggml:
 	mkdir test-models
 	mkdir test-dir
-	wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
+	wget -q https://huggingface.co/RichardErkhov/Qwen_-_Qwen2-1.5B-Instruct-gguf/resolve/main/Qwen2-1.5B-Instruct.Q2_K.gguf -O test-models/testmodel.ggml
 	wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
 	wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
 	wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
@@ -449,8 +429,7 @@ test: prepare test-models/testmodel.ggml grpcs
 	export GO_TAGS="tts debug"
 	$(MAKE) prepare-test
 	HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
-	$(MAKE) test-llama
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
 	$(MAKE) test-llama-gguf
 	$(MAKE) test-tts
 	$(MAKE) test-stablediffusion
@@ -479,10 +458,6 @@ teardown-e2e:
 	rm -rf $(TEST_DIR) || true
 	docker stop $$(docker ps -q --filter ancestor=localai-tests)
 
-test-llama: prepare-test
-	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
-
 test-llama-gguf: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
@@ -760,13 +735,6 @@ backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
 	mkdir -p backend-assets/util/
 	cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
 
-backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
-ifneq ($(UPX),)
-	$(UPX) backend-assets/grpc/llama-ggml
-endif
-
 backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/
diff --git a/backend/go/llm/llama-ggml/llama.go b/backend/go/llm/llama-ggml/llama.go
deleted file mode 100644
index 1a7add69..00000000
--- a/backend/go/llm/llama-ggml/llama.go
+++ /dev/null
@@ -1,204 +0,0 @@
-package main
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"fmt"
-
-	"github.com/go-skynet/go-llama.cpp"
-	"github.com/mudler/LocalAI/pkg/grpc/base"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-)
-
-type LLM struct {
-	base.SingleThread
-
-	llama *llama.LLama
-}
-
-func (llm *LLM) Load(opts *pb.ModelOptions) error {
-	ropeFreqBase := float32(10000)
-	ropeFreqScale := float32(1)
-
-	if opts.RopeFreqBase != 0 {
-		ropeFreqBase = opts.RopeFreqBase
-	}
-	if opts.RopeFreqScale != 0 {
-		ropeFreqScale = opts.RopeFreqScale
-	}
-
-	llamaOpts := []llama.ModelOption{
-		llama.WithRopeFreqBase(ropeFreqBase),
-		llama.WithRopeFreqScale(ropeFreqScale),
-	}
-
-	if opts.NGQA != 0 {
-		llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
-	}
-
-	if opts.RMSNormEps != 0 {
-		llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
-	}
-
-	if opts.ContextSize != 0 {
-		llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
-	}
-	if opts.F16Memory {
-		llamaOpts = append(llamaOpts, llama.EnableF16Memory)
-	}
-	if opts.Embeddings {
-		llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
-	}
-	if opts.NGPULayers != 0 {
-		llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
-	}
-
-	llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
-	llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
-	llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
-	if opts.NBatch != 0 {
-		llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
-	} else {
-		llamaOpts = append(llamaOpts, llama.SetNBatch(512))
-	}
-
-	if opts.NUMA {
-		llamaOpts = append(llamaOpts, llama.EnableNUMA)
-	}
-
-	if opts.LowVRAM {
-		llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
-	}
-
-	model, err := llama.New(opts.ModelFile, llamaOpts...)
-	llm.llama = model
-
-	return err
-}
-
-func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
-	ropeFreqBase := float32(10000)
-	ropeFreqScale := float32(1)
-
-	if opts.RopeFreqBase != 0 {
-		ropeFreqBase = opts.RopeFreqBase
-	}
-	if opts.RopeFreqScale != 0 {
-		ropeFreqScale = opts.RopeFreqScale
-	}
-	predictOptions := []llama.PredictOption{
-		llama.SetTemperature(opts.Temperature),
-		llama.SetTopP(opts.TopP),
-		llama.SetTopK(int(opts.TopK)),
-		llama.SetTokens(int(opts.Tokens)),
-		llama.SetThreads(int(opts.Threads)),
-		llama.WithGrammar(opts.Grammar),
-		llama.SetRopeFreqBase(ropeFreqBase),
-		llama.SetRopeFreqScale(ropeFreqScale),
-		llama.SetNegativePromptScale(opts.NegativePromptScale),
-		llama.SetNegativePrompt(opts.NegativePrompt),
-	}
-
-	if opts.PromptCacheAll {
-		predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
-	}
-
-	if opts.PromptCacheRO {
-		predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
-	}
-
-	// Expected absolute path
-	if opts.PromptCachePath != "" {
-		predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
-	}
-
-	if opts.Mirostat != 0 {
-		predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
-	}
-
-	if opts.MirostatETA != 0 {
-		predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
-	}
-
-	if opts.MirostatTAU != 0 {
-		predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
-	}
-
-	if opts.Debug {
-		predictOptions = append(predictOptions, llama.Debug)
-	}
-
-	predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
-
-	if opts.PresencePenalty != 0 {
-		predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
-	}
-
-	if opts.NKeep != 0 {
-		predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
-	}
-
-	if opts.Batch != 0 {
-		predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
-	}
-
-	if opts.F16KV {
-		predictOptions = append(predictOptions, llama.EnableF16KV)
-	}
-
-	if opts.IgnoreEOS {
-		predictOptions = append(predictOptions, llama.IgnoreEOS)
-	}
-
-	if opts.Seed != 0 {
-		predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
-	}
-
-	//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
-
-	predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
-	predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
-	predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
-	predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
-	predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
-	predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
-	predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
-	return predictOptions
-}
-
-func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
-	return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
-}
-
-func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
-	predictOptions := buildPredictOptions(opts)
-
-	predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
-		results <- token
-		return true
-	}))
-
-	go func() {
-		_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
-		if err != nil {
-			fmt.Println("err: ", err)
-		}
-		close(results)
-	}()
-
-	return nil
-}
-
-func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
-	predictOptions := buildPredictOptions(opts)
-
-	if len(opts.EmbeddingTokens) > 0 {
-		tokens := []int{}
-		for _, t := range opts.EmbeddingTokens {
-			tokens = append(tokens, int(t))
-		}
-		return llm.llama.TokenEmbeddings(tokens, predictOptions...)
-	}
-
-	return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
-}
diff --git a/backend/go/llm/llama-ggml/main.go b/backend/go/llm/llama-ggml/main.go
deleted file mode 100644
index 544771db..00000000
--- a/backend/go/llm/llama-ggml/main.go
+++ /dev/null
@@ -1,19 +0,0 @@
-package main
-
-import (
-	"flag"
-
-	grpc "github.com/mudler/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-func main() {
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &LLM{}); err != nil {
-		panic(err)
-	}
-}
diff --git a/core/http/app_test.go b/core/http/app_test.go
index ca7a2eaa..ecaf6da3 100644
--- a/core/http/app_test.go
+++ b/core/http/app_test.go
@@ -526,77 +526,6 @@ var _ = Describe("API test", func() {
 			Expect(content["usage"]).To(ContainSubstring("You can test this model with curl like this"))
 		})
 
-		It("runs openllama(llama-ggml backend)", Label("llama"), func() {
-			if runtime.GOOS != "linux" {
-				Skip("test supported only on linux")
-			}
-			response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
-				URL:       "github:go-skynet/model-gallery/openllama_3b.yaml",
-				Name:      "openllama_3b",
-				Overrides: map[string]interface{}{"backend": "llama-ggml", "mmap": true, "f16": true, "context_size": 128},
-			})
-
-			Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
-
-			uuid := response["uuid"].(string)
-
-			Eventually(func() bool {
-				response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
-				return response["processed"].(bool)
-			}, "360s", "10s").Should(Equal(true))
-
-			By("testing completion")
-			resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "openllama_3b", Prompt: "Count up to five: one, two, three, four, "})
-			Expect(err).ToNot(HaveOccurred())
-			Expect(len(resp.Choices)).To(Equal(1))
-			Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
-
-			By("testing functions")
-			resp2, err := client.CreateChatCompletion(
-				context.TODO(),
-				openai.ChatCompletionRequest{
-					Model: "openllama_3b",
-					Messages: []openai.ChatCompletionMessage{
-						{
-							Role:    "user",
-							Content: "What is the weather like in San Francisco (celsius)?",
-						},
-					},
-					Functions: []openai.FunctionDefinition{
-						openai.FunctionDefinition{
-							Name:        "get_current_weather",
-							Description: "Get the current weather",
-							Parameters: jsonschema.Definition{
-								Type: jsonschema.Object,
-								Properties: map[string]jsonschema.Definition{
-									"location": {
-										Type:        jsonschema.String,
-										Description: "The city and state, e.g. San Francisco, CA",
-									},
-									"unit": {
-										Type: jsonschema.String,
-										Enum: []string{"celcius", "fahrenheit"},
-									},
-								},
-								Required: []string{"location"},
-							},
-						},
-					},
-				})
-			Expect(err).ToNot(HaveOccurred())
-			Expect(len(resp2.Choices)).To(Equal(1))
-			Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
-			Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
-
-			var res map[string]string
-			err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
-			Expect(err).ToNot(HaveOccurred())
-			Expect(res["location"]).To(ContainSubstring("San Francisco"), fmt.Sprint(res))
-			Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
-			Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
-
-		})
-
 		It("runs openllama gguf(llama-cpp)", Label("llama-gguf"), func() {
 			if runtime.GOOS != "linux" {
 				Skip("test supported only on linux")
diff --git a/docs/content/docs/features/text-generation.md b/docs/content/docs/features/text-generation.md
index 11ab3999..342b8e76 100644
--- a/docs/content/docs/features/text-generation.md
+++ b/docs/content/docs/features/text-generation.md
@@ -124,7 +124,7 @@ Note: rwkv models needs to specify the backend `rwkv` in the YAML config files a
 
 {{% alert note %}}
 
-The `ggml` file format has been deprecated. If you are using `ggml` models and you are configuring your model with a YAML file, specify, use the `llama-ggml` backend instead. If you are relying in automatic detection of the model, you should be fine. For `gguf` models, use the `llama` backend. The go backend is deprecated as well but still available as `go-llama`. The go backend supports still features not available in the mainline: speculative sampling and embeddings.
+The `ggml` file format has been deprecated. If you are using `ggml` models and you are configuring your model with a YAML file, specify, use a LocalAI version older than v2.25.0. For `gguf` models, use the `llama` backend. The go backend is deprecated as well but still available as `go-llama`.
 
 {{% /alert %}}
 
@@ -175,25 +175,12 @@ name: llama
 backend: llama
 parameters:
   # Relative to the models path
-  model: file.gguf.bin
-```
-
-In the example above we specify `llama` as the backend to restrict loading `gguf` models only.
-
-For instance, to use the `llama-ggml` backend for `ggml` models:
-
-```yaml
-name: llama
-backend: llama-ggml
-parameters:
-  # Relative to the models path
-  model: file.ggml.bin
+  model: file.gguf
 ```
 
 #### Reference
 
 - [llama](https://github.com/ggerganov/llama.cpp)
-- [binding](https://github.com/go-skynet/go-llama.cpp)
 
 ### exllama/2
 
diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go
index ace72fa3..5e465cf0 100644
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -43,8 +43,6 @@ var TypeAlias map[string]string = map[string]string{
 var AutoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"
 
 const (
-	LlamaGGML = "llama-ggml"
-
 	LLamaCPP = "llama-cpp"
 
 	LLamaCPPAVX2 = "llama-cpp-avx2"
@@ -143,10 +141,10 @@ func orderBackends(backends map[string][]string) ([]string, error) {
 
 	// sets a priority list - first has more priority
 	priorityList := []string{
-		// First llama.cpp(variants) and llama-ggml to follow.
+		// First llama.cpp(variants)
 		// We keep the fallback to prevent that if the llama.cpp variants
 		// that depends on shared libs if breaks have still a safety net.
-		LLamaCPP, LlamaGGML, LLamaCPPFallback,
+		LLamaCPP, LLamaCPPFallback,
 	}
 
 	toTheEnd := []string{
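
The documentation hunk above swaps the removed `llama-ggml` example for a plain `gguf` configuration. As a rough illustration of what a migrated model definition could look like after this change — the model name and the `.gguf` filename below are hypothetical placeholders, not part of this patch — a YAML entry along these lines is the expected shape:

```yaml
# Hypothetical migration sketch (not part of this patch): a model that was
# previously declared with "backend: llama-ggml" and a .ggml file, re-pointed
# at a gguf conversion served by the llama.cpp backend.
name: my-model            # placeholder name
backend: llama            # may be omitted; gguf files are picked up by auto-detection
parameters:
  # Relative to the models path; placeholder filename
  model: my-model.Q4_K_M.gguf
context_size: 2048
```

Setups that rely on automatic backend detection should simply pick up the llama.cpp backend for `gguf` files; explicit `ggml` configurations have to either move to a `gguf` model or, as the updated alert notes, stay on a LocalAI release older than v2.25.0.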