From ab5b75eb01e9391c3041940afe0af0ac838a969d Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Sun, 20 Aug 2023 16:35:42 +0200
Subject: [PATCH] feat: add llama-stable backend (#932)

Signed-off-by: Ettore Di Giacinto
---
 Makefile                              |  22 ++-
 cmd/grpc/llama-stable/main.go         |  21 +++
 pkg/backend/llm/llama-stable/llama.go | 204 ++++++++++++++++++++++++++
 pkg/model/initializers.go             |   4 +-
 4 files changed, 247 insertions(+), 4 deletions(-)
 create mode 100644 cmd/grpc/llama-stable/main.go
 create mode 100644 pkg/backend/llm/llama-stable/llama.go

diff --git a/Makefile b/Makefile
index 433ef451..97fcb3c4 100644
--- a/Makefile
+++ b/Makefile
@@ -6,6 +6,8 @@ BINARY_NAME=local-ai
 
 # llama.cpp versions
 GOLLAMA_VERSION?=f03869d188b72c8a617bea3a36cf8eb43f73445c
+GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
+
 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
 GPT4ALL_VERSION?=36f7fb584824961dc692c9f2354ee8f60c50587b
@@ -194,17 +196,23 @@ go-llama:
 	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama
 	cd go-llama && git checkout -b build $(GOLLAMA_VERSION) && git submodule update --init --recursive --depth 1
 
+go-llama-stable:
+	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama-stable
+	cd go-llama-stable && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1
+
 go-llama/libbinding.a: go-llama
 	$(MAKE) -C go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a
 
+go-llama-stable/libbinding.a: go-llama-stable
+	$(MAKE) -C go-llama-stable BUILD_TYPE=$(BUILD_TYPE) libbinding.a
+
 go-piper/libpiper_binding.a:
 	$(MAKE) -C go-piper libpiper_binding.a example/main
 
-get-sources: go-llama go-ggllm go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion
+get-sources: go-llama go-llama-stable go-ggllm go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion
 	touch $@
 
 replace:
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
 	$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(shell pwd)/gpt4all/gpt4all-bindings/golang
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-ggml-transformers.cpp=$(shell pwd)/go-ggml-transformers
 	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/go-rwkv
@@ -222,6 +230,7 @@ prepare-sources: get-sources replace
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
 	$(MAKE) -C go-llama clean
+	$(MAKE) -C go-llama-stable clean
 	$(MAKE) -C gpt4all/gpt4all-bindings/golang/ clean
 	$(MAKE) -C go-ggml-transformers clean
 	$(MAKE) -C go-rwkv clean
@@ -240,7 +249,8 @@ clean: ## Remove build related file
 	$(GOCMD) clean -cache
 	rm -f prepare
 	rm -rf ./go-llama
-	rm -rf ./gpt4all
+	rm -rf ./gpt4all
+	rm -rf ./go-llama-stable
 	rm -rf ./go-gpt2
 	rm -rf ./go-stable-diffusion
 	rm -rf ./go-ggml-transformers
@@ -353,6 +363,7 @@ backend-assets/grpc/falcon: backend-assets/grpc go-ggllm/libggllm.a
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/falcon ./cmd/grpc/falcon/
 
 backend-assets/grpc/llama: backend-assets/grpc go-llama/libbinding.a
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama LIBRARY_PATH=$(shell pwd)/go-llama \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./cmd/grpc/llama/
 # TODO: every binary should have its own folder instead, so can have different metal implementations
@@ -360,6 +371,11 @@ ifeq ($(BUILD_TYPE),metal)
 	cp go-llama/build/bin/ggml-metal.metal backend-assets/grpc/
 endif
 
+backend-assets/grpc/llama-stable: backend-assets/grpc go-llama-stable/libbinding.a
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama-stable
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama-stable LIBRARY_PATH=$(shell pwd)/go-llama-stable \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-stable ./cmd/grpc/llama-stable/
+
 backend-assets/grpc/gpt4all: backend-assets/grpc backend-assets/gpt4all gpt4all/gpt4all-bindings/golang/libgpt4all.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./cmd/grpc/gpt4all/
diff --git a/cmd/grpc/llama-stable/main.go b/cmd/grpc/llama-stable/main.go
new file mode 100644
index 00000000..afe98f7d
--- /dev/null
+++ b/cmd/grpc/llama-stable/main.go
@@ -0,0 +1,21 @@
+package main
+
+import (
+	"flag"
+
+	llama "github.com/go-skynet/LocalAI/pkg/backend/llm/llama-stable"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &llama.LLM{}); err != nil {
+		panic(err)
+	}
+}
diff --git a/pkg/backend/llm/llama-stable/llama.go b/pkg/backend/llm/llama-stable/llama.go
new file mode 100644
index 00000000..b9e25754
--- /dev/null
+++ b/pkg/backend/llm/llama-stable/llama.go
@@ -0,0 +1,204 @@
+package llama
+
+// This is a wrapper to satisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"fmt"
+
+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	"github.com/go-skynet/go-llama.cpp"
+)
+
+type LLM struct {
+	base.SingleThread
+
+	llama *llama.LLama
+}
+
+func (llm *LLM) Load(opts *pb.ModelOptions) error {
+	ropeFreqBase := float32(10000)
+	ropeFreqScale := float32(1)
+
+	if opts.RopeFreqBase != 0 {
+		ropeFreqBase = opts.RopeFreqBase
+	}
+	if opts.RopeFreqScale != 0 {
+		ropeFreqScale = opts.RopeFreqScale
+	}
+
+	llamaOpts := []llama.ModelOption{
+		llama.WithRopeFreqBase(ropeFreqBase),
+		llama.WithRopeFreqScale(ropeFreqScale),
+	}
+
+	if opts.NGQA != 0 {
+		llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
+	}
+
+	if opts.RMSNormEps != 0 {
+		llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
+	}
+
+	if opts.ContextSize != 0 {
+		llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
+	}
+	if opts.F16Memory {
+		llamaOpts = append(llamaOpts, llama.EnableF16Memory)
+	}
+	if opts.Embeddings {
+		llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
+	}
+	if opts.NGPULayers != 0 {
+		llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
+	}
+
+	llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
+	llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
+	llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
+	if opts.NBatch != 0 {
+		llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
+	} else {
+		llamaOpts = append(llamaOpts, llama.SetNBatch(512))
+	}
+
+	if opts.NUMA {
+		llamaOpts = append(llamaOpts, llama.EnableNUMA)
+	}
+
+	if opts.LowVRAM {
+		llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
+	}
+
+	model, err := llama.New(opts.ModelFile, llamaOpts...)
+	llm.llama = model
+
+	return err
+}
+
+func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
+	ropeFreqBase := float32(10000)
+	ropeFreqScale := float32(1)
+
+	if opts.RopeFreqBase != 0 {
+		ropeFreqBase = opts.RopeFreqBase
+	}
+	if opts.RopeFreqScale != 0 {
+		ropeFreqScale = opts.RopeFreqScale
+	}
+	predictOptions := []llama.PredictOption{
+		llama.SetTemperature(opts.Temperature),
+		llama.SetTopP(opts.TopP),
+		llama.SetTopK(int(opts.TopK)),
+		llama.SetTokens(int(opts.Tokens)),
+		llama.SetThreads(int(opts.Threads)),
+		llama.WithGrammar(opts.Grammar),
+		llama.SetRopeFreqBase(ropeFreqBase),
+		llama.SetRopeFreqScale(ropeFreqScale),
+		llama.SetNegativePromptScale(opts.NegativePromptScale),
+		llama.SetNegativePrompt(opts.NegativePrompt),
+	}
+
+	if opts.PromptCacheAll {
+		predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
+	}
+
+	if opts.PromptCacheRO {
+		predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
+	}
+
+	// Expected absolute path
+	if opts.PromptCachePath != "" {
+		predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
+	}
+
+	if opts.Mirostat != 0 {
+		predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
+	}
+
+	if opts.MirostatETA != 0 {
+		predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
+	}
+
+	if opts.MirostatTAU != 0 {
+		predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
+	}
+
+	if opts.Debug {
+		predictOptions = append(predictOptions, llama.Debug)
+	}
+
+	predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
+
+	if opts.PresencePenalty != 0 {
+		predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
+	}
+
+	if opts.NKeep != 0 {
+		predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
+	}
+
+	if opts.Batch != 0 {
+		predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
+	}
+
+	if opts.F16KV {
+		predictOptions = append(predictOptions, llama.EnableF16KV)
+	}
+
+	if opts.IgnoreEOS {
+		predictOptions = append(predictOptions, llama.IgnoreEOS)
+	}
+
+	if opts.Seed != 0 {
+		predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
+	}
+
+	//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
+
+	predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
+	predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
+	predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
+	predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
+	predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
+	predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
+	predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
+	return predictOptions
+}
+
+func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
+	return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
+}
+
+func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
+	predictOptions := buildPredictOptions(opts)
+
+	predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
+		results <- token
+		return true
+	}))
+
+	go func() {
+		_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
+		if err != nil {
+			fmt.Println("err: ", err)
+		}
+		close(results)
+	}()
+
+	return nil
+}
+
+func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
+	predictOptions := buildPredictOptions(opts)
+
+	if len(opts.EmbeddingTokens) > 0 {
+		tokens := []int{}
+		for _, t := range opts.EmbeddingTokens {
+			tokens = append(tokens, int(t))
+		}
+		return llm.llama.TokenEmbeddings(tokens, predictOptions...)
+	}
+
+	return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
+}
diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go
index 2c9e0de9..7773eb1e 100644
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -16,6 +16,7 @@ import (
 
 const (
 	LlamaBackend           = "llama"
+	LlamaStableBackend     = "llama-stable"
 	BloomzBackend          = "bloomz"
 	StarcoderBackend       = "starcoder"
 	GPTJBackend            = "gptj"
@@ -41,6 +42,7 @@ const (
 
 var AutoLoadBackends []string = []string{
 	LlamaBackend,
+	LlamaStableBackend,
 	Gpt4All,
 	FalconBackend,
 	GPTNeoXBackend,
@@ -173,7 +175,7 @@ func (ml *ModelLoader) BackendLoader(opts ...Option) (model *grpc.Client, err er
 	}
 
 	switch backend {
-	case LlamaBackend, GPTJBackend, DollyBackend,
+	case LlamaBackend, LlamaStableBackend, GPTJBackend, DollyBackend,
 		MPTBackend, Gpt2Backend, FalconBackend,
 		GPTNeoXBackend, ReplitBackend, StarcoderBackend, BloomzBackend,
 		RwkvBackend, LCHuggingFaceBackend, BertEmbeddingsBackend, FalconGGMLBackend, StableDiffusionBackend, WhisperBackend:
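
Note for reviewers: the new wrapper follows the same shape as the existing llama backend — Load() maps the protobuf ModelOptions onto go-llama.cpp model options, Predict() is synchronous, and PredictStream() pushes generated tokens into a caller-supplied channel that is closed once generation ends. The sketch below is a minimal in-process consumer of that surface, useful for smoke-testing the package without going through the gRPC server binary. It assumes importing pkg/backend/llm/llama-stable directly (which is not how LocalAI normally drives backends), and the model path is a placeholder, not something shipped with this patch.

    package main

    import (
    	"fmt"

    	llama "github.com/go-skynet/LocalAI/pkg/backend/llm/llama-stable"
    	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
    )

    func main() {
    	llm := &llama.LLM{}

    	// Load mirrors what the gRPC server does on a LoadModel request.
    	// The model path below is a placeholder for illustration only.
    	if err := llm.Load(&pb.ModelOptions{
    		ModelFile:   "/models/ggml-model-q4_0.bin",
    		ContextSize: 1024,
    	}); err != nil {
    		panic(err)
    	}

    	// PredictStream returns immediately; tokens arrive on the channel
    	// until the background goroutine closes it.
    	results := make(chan string)
    	if err := llm.PredictStream(&pb.PredictOptions{
    		Prompt:  "The capital of France is",
    		Tokens:  64,
    		Threads: 4,
    	}, results); err != nil {
    		panic(err)
    	}

    	for token := range results {
    		fmt.Print(token)
    	}
    	fmt.Println()
    }

In normal operation none of this is called directly: the backend-assets/grpc/llama-stable binary built by the new Makefile target starts grpc.StartServer with &llama.LLM{}, and pkg/model/initializers.go resolves the backend either explicitly via the "llama-stable" name or through AutoLoadBackends.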