Repository: https://github.com/mudler/LocalAI.git
chore(llama-ggml): drop deprecated backend (#4775)

The GGML (pre-GGUF) format is now dead. Since the next version of LocalAI already brings many breaking compatibility changes, we take the occasion to also drop ggml support.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

Commit: 7f90ff7aec
Parent: 8d45670e41
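For users, the practical impact is that models previously served through the dropped `llama-ggml` backend must be converted to GGUF and pointed at the `llama` (llama.cpp) backend, as the documentation change further below reflects. A minimal sketch of such a migration follows; the model name and file names are placeholders, not part of this commit.

```yaml
# Before (hypothetical config for pre-GGUF weights served by the removed backend):
#   name: my-model
#   backend: llama-ggml
#   parameters:
#     model: my-model.ggml.bin

# After: convert the weights to GGUF and switch to the llama backend.
name: my-model
backend: llama
parameters:
  # Relative to the models path
  model: my-model.gguf
```

Users who cannot convert their `ggml` models can instead pin a LocalAI release older than v2.25.0, as the updated documentation notes.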
Makefile (38 changed lines)

@@ -6,8 +6,6 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true

 # llama.cpp versions
-GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
-GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
 CPPLLAMA_VERSION?=d774ab3acc4fee41fbed6dbfc192b57d5f79f34b

 # whisper.cpp version
@@ -151,7 +149,6 @@ ifeq ($(BUILD_TYPE),hipblas)
 LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
 export CXX=$(ROCM_HOME)/llvm/bin/clang++
 export CC=$(ROCM_HOME)/llvm/bin/clang
-# llama-ggml has no hipblas support, so override it here.
 export STABLE_BUILD_TYPE=
 export GGML_HIP=1
 GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
@@ -188,7 +185,6 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
-ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
 ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
 ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
@@ -222,19 +218,6 @@ endif

 all: help

-## go-llama.cpp
-sources/go-llama.cpp:
-	mkdir -p sources/go-llama.cpp
-	cd sources/go-llama.cpp && \
-	git init && \
-	git remote add origin $(GOLLAMA_REPO) && \
-	git fetch origin && \
-	git checkout $(GOLLAMA_VERSION) && \
-	git submodule update --init --recursive --depth 1 --single-branch
-
-sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
-	$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
-
 ## bark.cpp
 sources/bark.cpp:
 	git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
@@ -310,19 +293,17 @@ sources/whisper.cpp:
 sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
 	cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a

-get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
+get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp

 replace:
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
 	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp

 dropreplace:
 	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
 	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
 	$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
-	$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp

 prepare-sources: get-sources replace
 	$(GOCMD) mod download
@@ -330,7 +311,6 @@ prepare-sources: get-sources replace
 ## GENERIC
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
-	$(MAKE) -C sources/go-llama.cpp clean
 	$(MAKE) -C sources/whisper.cpp clean
 	$(MAKE) -C sources/go-piper clean
 	$(MAKE) build
@@ -434,7 +414,7 @@ run: prepare ## run local-ai
 test-models/testmodel.ggml:
 	mkdir test-models
 	mkdir test-dir
-	wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
+	wget -q https://huggingface.co/RichardErkhov/Qwen_-_Qwen2-1.5B-Instruct-gguf/resolve/main/Qwen2-1.5B-Instruct.Q2_K.gguf -O test-models/testmodel.ggml
 	wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
 	wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
 	wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
@@ -449,8 +429,7 @@ test: prepare test-models/testmodel.ggml grpcs
 	export GO_TAGS="tts debug"
 	$(MAKE) prepare-test
 	HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
-	$(MAKE) test-llama
 	$(MAKE) test-llama-gguf
 	$(MAKE) test-tts
 	$(MAKE) test-stablediffusion
@@ -479,10 +458,6 @@ teardown-e2e:
 	rm -rf $(TEST_DIR) || true
 	docker stop $$(docker ps -q --filter ancestor=localai-tests)

-test-llama: prepare-test
-	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
-
 test-llama-gguf: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
@@ -760,13 +735,6 @@ backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
 	mkdir -p backend-assets/util/
 	cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server

-backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
-ifneq ($(UPX),)
-	$(UPX) backend-assets/grpc/llama-ggml
-endif
-
 backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/
Deleted Go llama-ggml backend wrapper (under backend/go/llm/llama-ggml/)

@@ -1,204 +0,0 @@
-package main
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"fmt"
-
-	"github.com/go-skynet/go-llama.cpp"
-	"github.com/mudler/LocalAI/pkg/grpc/base"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-)
-
-type LLM struct {
-	base.SingleThread
-
-	llama *llama.LLama
-}
-
-func (llm *LLM) Load(opts *pb.ModelOptions) error {
-	ropeFreqBase := float32(10000)
-	ropeFreqScale := float32(1)
-
-	if opts.RopeFreqBase != 0 {
-		ropeFreqBase = opts.RopeFreqBase
-	}
-	if opts.RopeFreqScale != 0 {
-		ropeFreqScale = opts.RopeFreqScale
-	}
-
-	llamaOpts := []llama.ModelOption{
-		llama.WithRopeFreqBase(ropeFreqBase),
-		llama.WithRopeFreqScale(ropeFreqScale),
-	}
-
-	if opts.NGQA != 0 {
-		llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
-	}
-
-	if opts.RMSNormEps != 0 {
-		llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
-	}
-
-	if opts.ContextSize != 0 {
-		llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
-	}
-	if opts.F16Memory {
-		llamaOpts = append(llamaOpts, llama.EnableF16Memory)
-	}
-	if opts.Embeddings {
-		llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
-	}
-	if opts.NGPULayers != 0 {
-		llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
-	}
-
-	llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
-	llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
-	llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
-	if opts.NBatch != 0 {
-		llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
-	} else {
-		llamaOpts = append(llamaOpts, llama.SetNBatch(512))
-	}
-
-	if opts.NUMA {
-		llamaOpts = append(llamaOpts, llama.EnableNUMA)
-	}
-
-	if opts.LowVRAM {
-		llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
-	}
-
-	model, err := llama.New(opts.ModelFile, llamaOpts...)
-	llm.llama = model
-
-	return err
-}
-
-func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
-	ropeFreqBase := float32(10000)
-	ropeFreqScale := float32(1)
-
-	if opts.RopeFreqBase != 0 {
-		ropeFreqBase = opts.RopeFreqBase
-	}
-	if opts.RopeFreqScale != 0 {
-		ropeFreqScale = opts.RopeFreqScale
-	}
-	predictOptions := []llama.PredictOption{
-		llama.SetTemperature(opts.Temperature),
-		llama.SetTopP(opts.TopP),
-		llama.SetTopK(int(opts.TopK)),
-		llama.SetTokens(int(opts.Tokens)),
-		llama.SetThreads(int(opts.Threads)),
-		llama.WithGrammar(opts.Grammar),
-		llama.SetRopeFreqBase(ropeFreqBase),
-		llama.SetRopeFreqScale(ropeFreqScale),
-		llama.SetNegativePromptScale(opts.NegativePromptScale),
-		llama.SetNegativePrompt(opts.NegativePrompt),
-	}
-
-	if opts.PromptCacheAll {
-		predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
-	}
-
-	if opts.PromptCacheRO {
-		predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
-	}
-
-	// Expected absolute path
-	if opts.PromptCachePath != "" {
-		predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
-	}
-
-	if opts.Mirostat != 0 {
-		predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
-	}
-
-	if opts.MirostatETA != 0 {
-		predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
-	}
-
-	if opts.MirostatTAU != 0 {
-		predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
-	}
-
-	if opts.Debug {
-		predictOptions = append(predictOptions, llama.Debug)
-	}
-
-	predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
-
-	if opts.PresencePenalty != 0 {
-		predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
-	}
-
-	if opts.NKeep != 0 {
-		predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
-	}
-
-	if opts.Batch != 0 {
-		predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
-	}
-
-	if opts.F16KV {
-		predictOptions = append(predictOptions, llama.EnableF16KV)
-	}
-
-	if opts.IgnoreEOS {
-		predictOptions = append(predictOptions, llama.IgnoreEOS)
-	}
-
-	if opts.Seed != 0 {
-		predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
-	}
-
-	//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
-
-	predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
-	predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
-	predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
-	predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
-	predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
-	predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
-	predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
-	return predictOptions
-}
-
-func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
-	return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
-}
-
-func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
-	predictOptions := buildPredictOptions(opts)
-
-	predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
-		results <- token
-		return true
-	}))
-
-	go func() {
-		_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
-		if err != nil {
-			fmt.Println("err: ", err)
-		}
-		close(results)
-	}()

-	return nil
-}
-
-func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
-	predictOptions := buildPredictOptions(opts)
-
-	if len(opts.EmbeddingTokens) > 0 {
-		tokens := []int{}
-		for _, t := range opts.EmbeddingTokens {
-			tokens = append(tokens, int(t))
-		}
-		return llm.llama.TokenEmbeddings(tokens, predictOptions...)
-	}
-
-	return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
-}
Deleted Go llama-ggml backend gRPC entrypoint (under backend/go/llm/llama-ggml/)

@@ -1,19 +0,0 @@
-package main
-
-import (
-	"flag"
-
-	grpc "github.com/mudler/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-func main() {
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &LLM{}); err != nil {
-		panic(err)
-	}
-}
Go API test suite (Ginkgo)

@@ -526,77 +526,6 @@ var _ = Describe("API test", func() {
 			Expect(content["usage"]).To(ContainSubstring("You can test this model with curl like this"))
 		})

-		It("runs openllama(llama-ggml backend)", Label("llama"), func() {
-			if runtime.GOOS != "linux" {
-				Skip("test supported only on linux")
-			}
-			response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
-				URL: "github:go-skynet/model-gallery/openllama_3b.yaml",
-				Name: "openllama_3b",
-				Overrides: map[string]interface{}{"backend": "llama-ggml", "mmap": true, "f16": true, "context_size": 128},
-			})
-
-			Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
-
-			uuid := response["uuid"].(string)
-
-			Eventually(func() bool {
-				response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
-				return response["processed"].(bool)
-			}, "360s", "10s").Should(Equal(true))
-
-			By("testing completion")
-			resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "openllama_3b", Prompt: "Count up to five: one, two, three, four, "})
-			Expect(err).ToNot(HaveOccurred())
-			Expect(len(resp.Choices)).To(Equal(1))
-			Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
-
-			By("testing functions")
-			resp2, err := client.CreateChatCompletion(
-				context.TODO(),
-				openai.ChatCompletionRequest{
-					Model: "openllama_3b",
-					Messages: []openai.ChatCompletionMessage{
-						{
-							Role: "user",
-							Content: "What is the weather like in San Francisco (celsius)?",
-						},
-					},
-					Functions: []openai.FunctionDefinition{
-						openai.FunctionDefinition{
-							Name: "get_current_weather",
-							Description: "Get the current weather",
-							Parameters: jsonschema.Definition{
-								Type: jsonschema.Object,
-								Properties: map[string]jsonschema.Definition{
-									"location": {
-										Type: jsonschema.String,
-										Description: "The city and state, e.g. San Francisco, CA",
-									},
-									"unit": {
-										Type: jsonschema.String,
-										Enum: []string{"celcius", "fahrenheit"},
-									},
-								},
-								Required: []string{"location"},
-							},
-						},
-					},
-				})
-			Expect(err).ToNot(HaveOccurred())
-			Expect(len(resp2.Choices)).To(Equal(1))
-			Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
-			Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
-
-			var res map[string]string
-			err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
-			Expect(err).ToNot(HaveOccurred())
-			Expect(res["location"]).To(ContainSubstring("San Francisco"), fmt.Sprint(res))
-			Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
-			Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
-
-		})
-
 		It("runs openllama gguf(llama-cpp)", Label("llama-gguf"), func() {
 			if runtime.GOOS != "linux" {
 				Skip("test supported only on linux")
Documentation (backends page)

@@ -124,7 +124,7 @@ Note: rwkv models needs to specify the backend `rwkv` in the YAML config files a

 {{% alert note %}}

-The `ggml` file format has been deprecated. If you are using `ggml` models and you are configuring your model with a YAML file, specify, use the `llama-ggml` backend instead. If you are relying in automatic detection of the model, you should be fine. For `gguf` models, use the `llama` backend. The go backend is deprecated as well but still available as `go-llama`. The go backend supports still features not available in the mainline: speculative sampling and embeddings.
+The `ggml` file format has been deprecated. If you are using `ggml` models and you are configuring your model with a YAML file, specify, use a LocalAI version older than v2.25.0. For `gguf` models, use the `llama` backend. The go backend is deprecated as well but still available as `go-llama`.

 {{% /alert %}}

@@ -175,25 +175,12 @@ name: llama
 backend: llama
 parameters:
   # Relative to the models path
-  model: file.gguf.bin
+  model: file.gguf
-```
-
-In the example above we specify `llama` as the backend to restrict loading `gguf` models only.
-
-For instance, to use the `llama-ggml` backend for `ggml` models:
-
-```yaml
-name: llama
-backend: llama-ggml
-parameters:
-  # Relative to the models path
-  model: file.ggml.bin
 ```

 #### Reference

 - [llama](https://github.com/ggerganov/llama.cpp)
-- [binding](https://github.com/go-skynet/go-llama.cpp)


 ### exllama/2
Go backend detection and ordering code

@@ -43,8 +43,6 @@ var TypeAlias map[string]string = map[string]string{
 var AutoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"

 const (
-	LlamaGGML = "llama-ggml"
-
 	LLamaCPP = "llama-cpp"

 	LLamaCPPAVX2 = "llama-cpp-avx2"

@@ -143,10 +141,10 @@ func orderBackends(backends map[string][]string) ([]string, error) {

 	// sets a priority list - first has more priority
 	priorityList := []string{
-		// First llama.cpp(variants) and llama-ggml to follow.
+		// First llama.cpp(variants)
 		// We keep the fallback to prevent that if the llama.cpp variants
 		// that depends on shared libs if breaks have still a safety net.
-		LLamaCPP, LlamaGGML, LLamaCPPFallback,
+		LLamaCPP, LLamaCPPFallback,
 	}

 	toTheEnd := []string{