Mirror of https://github.com/mudler/LocalAI.git, synced 2024-12-24 06:46:39 +00:00
feat(llama.cpp): add distributed llama.cpp inferencing (#2324)
* feat(llama.cpp): support distributed llama.cpp
* feat: allow tweaking how chat messages are merged together
* refactor
* Makefile: register to ALL_GRPC_BACKENDS
* refactoring: allow disabling auto-detection of backends
* minor fixups
* feat: add cmd to start rpc-server from llama.cpp
* ci: add ccache

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Signed-off-by: mudler <mudler@localai.io>
parent 29909666c3
commit c89271b2e4
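The changes below wire three pieces together: a new LLAMACPP_GRPC_SERVERS environment variable, an RPC-enabled llama.cpp backend variant, and a CLI subcommand that execs the bundled rpc-server. A hedged sketch of how this is presumably meant to be used (the subcommand spelling is whatever kong derives from the new LLAMACPPWorker field, worker arguments are forwarded verbatim to rpc-server, and the addresses are placeholders, none of which are spelled out in this commit):

    # on each worker host: start the bundled llama.cpp rpc-server via the new subcommand
    local-ai llamacpp-worker [arguments forwarded to rpc-server]

    # on the API node: list the workers before starting LocalAI
    # (comma-separated host:port pairs, following the llama.cpp RPC example linked in .env)
    export LLAMACPP_GRPC_SERVERS="192.168.1.10:50052,192.168.1.11:50052"
    ./local-ai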
.env | 5
@@ -71,6 +71,11 @@
 ### Define the number of parallel LLAMA.cpp workers (Defaults to 1)
 # LLAMACPP_PARALLEL=1
 
+### Define a list of GRPC Servers for llama-cpp workers to distribute the load
+# https://github.com/ggerganov/llama.cpp/pull/6829
+# https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md
+# LLAMACPP_GRPC_SERVERS=""
+
 ### Enable to run parallel requests
 # LOCALAI_PARALLEL_REQUESTS=true
 
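A hedged example of a filled-in value, following the comma-separated host:port convention of the llama.cpp RPC example linked above (the addresses are placeholders, not part of this change):

    # two llama.cpp rpc-server workers reachable on the local network
    LLAMACPP_GRPC_SERVERS="192.168.1.10:50052,192.168.1.11:50052"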
.github/workflows/release.yaml (vendored) | 4
@@ -29,7 +29,7 @@ jobs:
       - name: Dependencies
         run: |
           sudo apt-get update
-          sudo apt-get install build-essential ffmpeg protobuf-compiler
+          sudo apt-get install build-essential ffmpeg protobuf-compiler ccache
       - name: Install CUDA Dependencies
         run: |
           curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
@@ -86,7 +86,7 @@ jobs:
           cache: false
       - name: Dependencies
         run: |
-          sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler
+          sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache
           go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
           go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
       - name: Build stablediffusion
@@ -19,6 +19,7 @@ ARG GO_TAGS="stablediffusion tinydream tts"
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
         build-essential \
+        ccache \
         ca-certificates \
         cmake \
         curl \
Makefile | 17
@@ -5,7 +5,7 @@ BINARY_NAME=local-ai
 
 # llama.cpp versions
 GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=dc685be46622a8fabfd57cfa804237c8f15679b8
+CPPLLAMA_VERSION?=4f0263633b40e94e8b69fd6e7e4395cfedfd5c12
 
 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -158,6 +158,8 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
+ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
+ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
 ALL_GRPC_BACKENDS+=backend-assets/grpc/gpt4all
 ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv
 ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
@@ -314,7 +316,7 @@ build: prepare backend-assets grpcs ## Build the project
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./
 
 build-minimal:
-	BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp" GO_TAGS=none $(MAKE) build
+	BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=none $(MAKE) build
 
 build-api:
 	BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=none $(MAKE) build
@@ -691,6 +693,17 @@ backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc
 	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-cuda/grpc-server backend-assets/grpc/llama-cpp-cuda
 
+backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc
+	cp -rf backend/cpp/llama backend/cpp/llama-grpc
+	$(MAKE) -C backend/cpp/llama-grpc purge
+	$(info ${GREEN}I llama-cpp build info:grpc${RESET})
+	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_RPC=ON -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server
+	cp -rfv backend/cpp/llama-grpc/grpc-server backend-assets/grpc/llama-cpp-grpc
+
+backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
+	mkdir -p backend-assets/util/
+	cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
+
 backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
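Since both new artifacts are plain Make targets registered in ALL_GRPC_BACKENDS, they can presumably also be built on their own once the usual LocalAI build prerequisites are in place; a sketch, not a documented workflow:

    # build the RPC-enabled llama.cpp gRPC backend and the bundled rpc-server helper
    make backend-assets/grpc/llama-cpp-grpc backend-assets/util/llama-cpp-rpc-server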
@@ -2217,6 +2217,12 @@ static void params_parse(const backend::ModelOptions* request,
     } else {
       params.n_parallel = 1;
     }
+
+    const char *llama_grpc_servers = std::getenv("LLAMACPP_GRPC_SERVERS");
+    if (llama_grpc_servers != NULL) {
+      params.rpc_servers = std::string(llama_grpc_servers);
+    }
+
     // TODO: Add yarn
 
     if (!request->tensorsplit().empty()) {
@@ -17,4 +17,5 @@ var CLI struct {
 	Models     ModelsCMD     `cmd:"" help:"Manage LocalAI models and definitions"`
 	TTS        TTSCMD        `cmd:"" help:"Convert text to speech"`
 	Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"`
+	LLAMACPPWorker LLAMACPPWorkerCMD `cmd:"" help:"Run workers to distribute workload (llama.cpp-only)"`
 }
core/cli/llamacppworker.go (new file) | 37
@@ -0,0 +1,37 @@
+package cli
+
+import (
+	"os"
+	"syscall"
+
+	"github.com/go-skynet/LocalAI/pkg/assets"
+	"github.com/rs/zerolog/log"
+)
+
+type LLAMACPPWorkerCMD struct {
+	Args              []string `arg:"" optional:"" name:"models" help:"Worker arguments: host port"`
+	BackendAssetsPath string   `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
+}
+
+func (r *LLAMACPPWorkerCMD) Run(ctx *Context) error {
+	// Extract files from the embedded FS
+	err := assets.ExtractFiles(ctx.BackendAssets, r.BackendAssetsPath)
+	log.Debug().Msgf("Extracting backend assets files to %s", r.BackendAssetsPath)
+	if err != nil {
+		log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly, like gpt4all)", err)
+	}
+
+	return syscall.Exec(
+		assets.ResolvePath(
+			r.BackendAssetsPath,
+			"util",
+			"llama-cpp-rpc-server",
+		),
+		append([]string{
+			assets.ResolvePath(
+				r.BackendAssetsPath,
+				"util",
+				"llama-cpp-rpc-server",
+			)}, r.Args...),
+		os.Environ())
+}
@@ -93,6 +93,8 @@ type Diffusers struct {
 	ControlNet string `yaml:"control_net"`
 }
 
+// LLMConfig is a struct that holds the configuration that are
+// generic for most of the LLM backends.
 type LLMConfig struct {
 	SystemPrompt string `yaml:"system_prompt"`
 	TensorSplit  string `yaml:"tensor_split"`
@@ -144,6 +146,7 @@ type LLMConfig struct {
 	YarnBetaSlow float32 `yaml:"yarn_beta_slow"`
 }
 
+// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
 type AutoGPTQ struct {
 	ModelBaseName string `yaml:"model_base_name"`
 	Device        string `yaml:"device"`
@@ -151,13 +154,31 @@ type AutoGPTQ struct {
 	UseFastTokenizer bool `yaml:"use_fast_tokenizer"`
 }
 
+// TemplateConfig is a struct that holds the configuration of the templating system
 type TemplateConfig struct {
+	// Chat is the template used in the chat completion endpoint
 	Chat string `yaml:"chat"`
+
+	// ChatMessage is the template used for chat messages
 	ChatMessage string `yaml:"chat_message"`
+
+	// Completion is the template used for completion requests
 	Completion string `yaml:"completion"`
+
+	// Edit is the template used for edit completion requests
 	Edit string `yaml:"edit"`
+
+	// Functions is the template used when tools are present in the client requests
 	Functions string `yaml:"function"`
+
+	// UseTokenizerTemplate is a flag that indicates if the tokenizer template should be used.
+	// Note: this is mostly consumed for backends such as vllm and transformers
+	// that can use the tokenizers specified in the JSON config files of the models
 	UseTokenizerTemplate bool `yaml:"use_tokenizer_template"`
+
+	// JoinChatMessagesByCharacter is a string that will be used to join chat messages together.
+	// It defaults to \n
+	JoinChatMessagesByCharacter *string `yaml:"join_chat_messages_by_character"`
 }
 
 func (c *BackendConfig) SetFunctionCallString(s string) {
@@ -349,7 +349,12 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 			mess = append(mess, content)
 		}
 
-		predInput = strings.Join(mess, "\n")
+		joinCharacter := "\n"
+		if config.TemplateConfig.JoinChatMessagesByCharacter != nil {
+			joinCharacter = *config.TemplateConfig.JoinChatMessagesByCharacter
+		}
+
+		predInput = strings.Join(mess, joinCharacter)
 		log.Debug().Msgf("Prompt (before templating): %s", predInput)
 
 		templateFile := ""
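The new JoinChatMessagesByCharacter option pairs with the chat endpoint change above: messages are joined with "\n" unless the model config overrides it. A hedged YAML snippet, assuming the struct sits under the usual template: section of a model config (only the field name is taken from this diff; the surrounding layout is an assumption):

    name: my-model
    template:
      chat: my-chat-template
      # join chat messages with a blank line instead of the default "\n"
      join_chat_messages_by_character: "\n\n"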
@@ -8,6 +8,10 @@ import (
 	"path/filepath"
 )
 
+func ResolvePath(dir string, paths ...string) string {
+	return filepath.Join(append([]string{dir, "backend-assets"}, paths...)...)
+}
+
 func ExtractFiles(content embed.FS, extractDir string) error {
 	// Create the target directory if it doesn't exist
 	err := os.MkdirAll(extractDir, 0750)
@@ -39,7 +43,7 @@ func ExtractFiles(content embed.FS, extractDir string) error {
 		}
 
 		// Create the file in the target directory
-		err = os.WriteFile(targetFile, fileData, 0600)
+		err = os.WriteFile(targetFile, fileData, 0700)
 		if err != nil {
 			return fmt.Errorf("failed to write file: %v", err)
 		}
@@ -12,9 +12,9 @@ import (
 
 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 	"github.com/go-skynet/LocalAI/pkg/xsysinfo"
+	"github.com/klauspost/cpuid/v2"
 	"github.com/phayes/freeport"
 	"github.com/rs/zerolog/log"
-	"golang.org/x/sys/cpu"
 
 	"github.com/elliotchance/orderedmap/v2"
 )
@@ -26,16 +26,18 @@ var Aliases map[string]string = map[string]string{
 	"langchain-huggingface": LCHuggingFaceBackend,
 }
 
+var autoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"
+
 const (
 	LlamaGGML = "llama-ggml"
 
 	LLamaCPP = "llama-cpp"
 
-	LLamaCPPCUDA12 = "llama-cpp-cuda12"
 	LLamaCPPAVX2 = "llama-cpp-avx2"
 	LLamaCPPAVX = "llama-cpp-avx"
 	LLamaCPPFallback = "llama-cpp-fallback"
 	LLamaCPPCUDA = "llama-cpp-cuda"
+	LLamaCPPGRPC = "llama-cpp-grpc"
 
 	Gpt4AllLlamaBackend = "gpt4all-llama"
 	Gpt4AllMptBackend = "gpt4all-mpt"
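The new autoDetect flag above is driven purely by an environment variable, so the backend variant auto-detection introduced in this commit can presumably be switched off like this (variable name taken from the line above; the invocation shape is an assumption):

    # force LocalAI to use exactly the backend named in the model config,
    # skipping the CPU/GPU variant auto-detection
    DISABLE_AUTODETECT=true ./local-ai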
@@ -59,7 +61,7 @@ func backendPath(assetDir, backend string) string {
 
 // backendsInAssetDir returns the list of backends in the asset directory
 // that should be loaded
-func backendsInAssetDir(assetDir string) (*orderedmap.OrderedMap[string, any], error) {
+func backendsInAssetDir(assetDir string) ([]string, error) {
 	// Exclude backends from automatic loading
 	excludeBackends := []string{LocalStoreBackend}
 	entry, err := os.ReadDir(backendPath(assetDir, ""))
@@ -74,14 +76,24 @@ ENTRY:
 				continue ENTRY
 			}
 		}
-		if !e.IsDir() {
-			if !strings.Contains(e.Name(), LLamaCPP) || strings.Contains(e.Name(), LLamaCPPFallback) {
-				backends[e.Name()] = []string{}
-			}
-		}
-	}
-
-	foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback := false, false, false
+		if e.IsDir() {
+			continue
+		}
+
+		// Skip the llama.cpp variants if we are autoDetecting
+		// But we always load the fallback variant if it exists
+		if strings.Contains(e.Name(), LLamaCPP) && !strings.Contains(e.Name(), LLamaCPPFallback) && autoDetect {
+			continue
+		}
+
+		backends[e.Name()] = []string{}
+	}
+
+	// if we are autoDetecting, we want to show the llama.cpp variants as a single backend
+	if autoDetect {
+		// if we find the llama.cpp variants, show them of as a single backend (llama-cpp) as later we are going to pick that up
+		// when starting the service
+		foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda := false, false, false, false, false
 	if _, ok := backends[LLamaCPP]; !ok {
 		for _, e := range entry {
 			if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 {
@@ -96,16 +108,28 @@ ENTRY:
 				backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPFallback)
 				foundLCPPFallback = true
 			}
+			if strings.Contains(e.Name(), LLamaCPPGRPC) && !foundLCPPGRPC {
+				backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPGRPC)
+				foundLCPPGRPC = true
+			}
+			if strings.Contains(e.Name(), LLamaCPPCUDA) && !foundLCPPCuda {
+				backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPCUDA)
+				foundLCPPCuda = true
+			}
+		}
 		}
 	}
 
 	// order backends from the asset directory.
 	// as we scan for backends, we want to keep some order which backends are tried of.
 	// for example, llama.cpp should be tried first, and we want to keep the huggingface backend at the last.
-	// sets a priority list
-	// First has more priority
+
+	// sets a priority list - first has more priority
 	priorityList := []string{
-		// First llama.cpp and llama-ggml
+
+		// First llama.cpp(variants) and llama-ggml to follow.
+		// We keep the fallback to prevent that if the llama.cpp variants
+		// that depends on shared libs if breaks have still a safety net.
 		LLamaCPP, LlamaGGML, Gpt4All, LLamaCPPFallback,
 	}
 
@@ -139,7 +163,57 @@ ENTRY:
 		}
 	}
 
-	return orderedBackends, nil
+	return orderedBackends.Keys(), nil
 }
 
+// selectGRPCProcess selects the GRPC process to start based on system capabilities
+func selectGRPCProcess(backend, assetDir string) string {
+	foundCUDA := false
+	var grpcProcess string
+
+	// Select backend now just for llama.cpp
+	if backend != LLamaCPP {
+		return ""
+	}
+
+	// Note: This environment variable is read by the LocalAI's llama.cpp grpc-server
+	if os.Getenv("LLAMACPP_GRPC_SERVERS") != "" {
+		log.Info().Msgf("[%s] attempting to load with GRPC variant", LLamaCPPGRPC)
+		return backendPath(assetDir, LLamaCPPGRPC)
+	}
+
+	gpus, err := xsysinfo.GPUs()
+	if err == nil {
+		for _, gpu := range gpus {
+			if strings.Contains(gpu.String(), "nvidia") {
+				p := backendPath(assetDir, LLamaCPPCUDA)
+				if _, err := os.Stat(p); err == nil {
+					log.Info().Msgf("[%s] attempting to load with CUDA variant", backend)
+					grpcProcess = p
+					foundCUDA = true
+				} else {
+					log.Info().Msgf("GPU device found but no CUDA backend present")
+				}
+			}
+		}
+	}
+
+	if foundCUDA {
+		return grpcProcess
+	}
+
+	if xsysinfo.HasCPUCaps(cpuid.AVX2) {
+		log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
+		grpcProcess = backendPath(assetDir, LLamaCPPAVX2)
+	} else if xsysinfo.HasCPUCaps(cpuid.AVX) {
+		log.Info().Msgf("[%s] attempting to load with AVX variant", backend)
+		grpcProcess = backendPath(assetDir, LLamaCPPAVX)
+	} else {
+		log.Info().Msgf("[%s] attempting to load with fallback variant", backend)
+		grpcProcess = backendPath(assetDir, LLamaCPPFallback)
+	}
+
+	return grpcProcess
+}
+
 // starts the grpcModelProcess for the backend, and returns a grpc client
@@ -192,33 +266,10 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
 		} else {
 			grpcProcess := backendPath(o.assetDir, backend)
 
-			foundCUDA := false
-			// for llama-cpp, check CPU capabilities and load the appropriate variant
-			if backend == LLamaCPP {
-				gpus, err := xsysinfo.GPUs()
-				if err == nil {
-					for _, gpu := range gpus {
-						if strings.Contains(gpu.String(), "nvidia") {
-							log.Info().Msgf("[%s] attempting to load with CUDA variant", backend)
-							grpcProcess = backendPath(o.assetDir, LLamaCPPCUDA)
-							if _, err := os.Stat(grpcProcess); err == nil {
-								foundCUDA = true
-							}
-						}
-					}
-				}
-
-				if !foundCUDA {
-					if cpu.X86.HasAVX2 {
-						log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
-						grpcProcess = backendPath(o.assetDir, LLamaCPPAVX2)
-					} else if cpu.X86.HasAVX {
-						log.Info().Msgf("[%s] attempting to load with AVX variant", backend)
-						grpcProcess = backendPath(o.assetDir, LLamaCPPAVX)
-					} else {
-						log.Info().Msgf("[%s] attempting to load with fallback variant", backend)
-						grpcProcess = backendPath(o.assetDir, LLamaCPPFallback)
-					}
-				}
+			if autoDetect {
+				// autoDetect GRPC process to start based on system capabilities
+				if selectedProcess := selectGRPCProcess(backend, o.assetDir); selectedProcess != "" {
+					grpcProcess = selectedProcess
+				}
 			}
 		}
@@ -363,28 +414,24 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
 
 	var err error
 
-	// autoload also external backends
-	allBackendsToAutoLoad := orderedmap.NewOrderedMap[string, any]()
+	// get backends embedded in the binary
 	autoLoadBackends, err := backendsInAssetDir(o.assetDir)
 	if err != nil {
 		return nil, err
 	}
+
+	// append externalBackends supplied by the user via the CLI
+	for _, b := range o.externalBackends {
+		autoLoadBackends = append(autoLoadBackends, b)
+	}
+
 	log.Debug().Msgf("Loading from the following backends (in order): %+v", autoLoadBackends)
 
-	for _, k := range autoLoadBackends.Keys() {
-		v, _ := autoLoadBackends.Get(k)
-		allBackendsToAutoLoad.Set(k, v)
-	}
-
-	for _, b := range o.externalBackends {
-		allBackendsToAutoLoad.Set(b, []string{})
-	}
-
 	if o.model != "" {
-		log.Info().Msgf("Trying to load the model '%s' with the backend '%s'", o.model, allBackendsToAutoLoad.Keys())
+		log.Info().Msgf("Trying to load the model '%s' with the backend '%s'", o.model, autoLoadBackends)
 	}
 
-	for _, key := range allBackendsToAutoLoad.Keys() {
+	for _, key := range autoLoadBackends {
 		log.Info().Msgf("[%s] Attempting to load", key)
 		options := []Option{
 			WithBackendString(key),