mirror of
https://github.com/mudler/LocalAI.git
synced 2024-12-23 14:32:25 +00:00
a670318a9f
* auto select cpu variant Signed-off-by: Sertac Ozercan <sozercan@gmail.com> * remove cuda target for now Signed-off-by: Sertac Ozercan <sozercan@gmail.com> * fix metal Signed-off-by: Sertac Ozercan <sozercan@gmail.com> * fix path Signed-off-by: Sertac Ozercan <sozercan@gmail.com> * cuda Signed-off-by: Sertac Ozercan <sozercan@gmail.com> * auto select cuda Signed-off-by: Sertac Ozercan <sozercan@gmail.com> * update test Signed-off-by: Sertac Ozercan <sozercan@gmail.com> * select CUDA backend only if present Signed-off-by: mudler <mudler@localai.io> * ci: keep cuda bin in path Signed-off-by: mudler <mudler@localai.io> * Makefile: make dist now builds also cuda Signed-off-by: mudler <mudler@localai.io> * Keep pushing fallback in case auto-flagset/nvidia fails There could be other reasons for which the default binary may fail. For example we might have detected an Nvidia GPU, however the user might not have the drivers/cuda libraries installed in the system, and so it would fail to start. We keep the fallback of llama.cpp at the end of the llama.cpp backends to try to fallback loading in case things go wrong Signed-off-by: mudler <mudler@localai.io> * Do not build cuda on MacOS Signed-off-by: mudler <mudler@localai.io> * cleanup Signed-off-by: Sertac Ozercan <sozercan@gmail.com> * Apply suggestions from code review Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com> --------- Signed-off-by: Sertac Ozercan <sozercan@gmail.com> Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com> Signed-off-by: mudler <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com> Co-authored-by: mudler <mudler@localai.io>
416 lines
12 KiB
Go
416 lines
12 KiB
Go
package model
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"slices"
|
|
"strings"
|
|
"time"
|
|
|
|
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
|
"github.com/go-skynet/LocalAI/pkg/xsysinfo"
|
|
"github.com/phayes/freeport"
|
|
"github.com/rs/zerolog/log"
|
|
"golang.org/x/sys/cpu"
|
|
|
|
"github.com/elliotchance/orderedmap/v2"
|
|
)
|
|
|
|
var Aliases map[string]string = map[string]string{
|
|
"go-llama": LLamaCPP,
|
|
"llama": LLamaCPP,
|
|
"embedded-store": LocalStoreBackend,
|
|
"langchain-huggingface": LCHuggingFaceBackend,
|
|
}
|
|
|
|
const (
|
|
LlamaGGML = "llama-ggml"
|
|
|
|
LLamaCPP = "llama-cpp"
|
|
|
|
LLamaCPPCUDA12 = "llama-cpp-cuda12"
|
|
LLamaCPPAVX2 = "llama-cpp-avx2"
|
|
LLamaCPPAVX = "llama-cpp-avx"
|
|
LLamaCPPFallback = "llama-cpp-fallback"
|
|
LLamaCPPCUDA = "llama-cpp-cuda"
|
|
|
|
Gpt4AllLlamaBackend = "gpt4all-llama"
|
|
Gpt4AllMptBackend = "gpt4all-mpt"
|
|
Gpt4AllJBackend = "gpt4all-j"
|
|
Gpt4All = "gpt4all"
|
|
|
|
BertEmbeddingsBackend = "bert-embeddings"
|
|
RwkvBackend = "rwkv"
|
|
WhisperBackend = "whisper"
|
|
StableDiffusionBackend = "stablediffusion"
|
|
TinyDreamBackend = "tinydream"
|
|
PiperBackend = "piper"
|
|
LCHuggingFaceBackend = "huggingface"
|
|
|
|
LocalStoreBackend = "local-store"
|
|
)
|
|
|
|
func backendPath(assetDir, backend string) string {
|
|
return filepath.Join(assetDir, "backend-assets", "grpc", backend)
|
|
}
|
|
|
|
// backendsInAssetDir returns the list of backends in the asset directory
|
|
// that should be loaded
|
|
func backendsInAssetDir(assetDir string) (*orderedmap.OrderedMap[string, any], error) {
|
|
// Exclude backends from automatic loading
|
|
excludeBackends := []string{LocalStoreBackend}
|
|
entry, err := os.ReadDir(backendPath(assetDir, ""))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
backends := make(map[string][]string)
|
|
ENTRY:
|
|
for _, e := range entry {
|
|
for _, exclude := range excludeBackends {
|
|
if e.Name() == exclude {
|
|
continue ENTRY
|
|
}
|
|
}
|
|
if !e.IsDir() {
|
|
if !strings.Contains(e.Name(), LLamaCPP) || strings.Contains(e.Name(), LLamaCPPFallback) {
|
|
backends[e.Name()] = []string{}
|
|
}
|
|
}
|
|
}
|
|
|
|
foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback := false, false, false
|
|
if _, ok := backends[LLamaCPP]; !ok {
|
|
for _, e := range entry {
|
|
if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 {
|
|
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPAVX2)
|
|
foundLCPPAVX2 = true
|
|
}
|
|
if strings.Contains(e.Name(), LLamaCPPAVX) && !foundLCPPAVX {
|
|
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPAVX)
|
|
foundLCPPAVX = true
|
|
}
|
|
if strings.Contains(e.Name(), LLamaCPPFallback) && !foundLCPPFallback {
|
|
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPFallback)
|
|
foundLCPPFallback = true
|
|
}
|
|
}
|
|
}
|
|
|
|
// order backends from the asset directory.
|
|
// as we scan for backends, we want to keep some order which backends are tried of.
|
|
// for example, llama.cpp should be tried first, and we want to keep the huggingface backend at the last.
|
|
// sets a priority list
|
|
// First has more priority
|
|
priorityList := []string{
|
|
// First llama.cpp and llama-ggml
|
|
LLamaCPP, LlamaGGML, Gpt4All, LLamaCPPFallback,
|
|
}
|
|
|
|
toTheEnd := []string{
|
|
// last has to be huggingface
|
|
LCHuggingFaceBackend,
|
|
// then bert embeddings
|
|
BertEmbeddingsBackend,
|
|
}
|
|
|
|
// create an ordered map
|
|
orderedBackends := orderedmap.NewOrderedMap[string, any]()
|
|
// add priorityList first
|
|
for _, p := range priorityList {
|
|
if _, ok := backends[p]; ok {
|
|
orderedBackends.Set(p, backends[p])
|
|
}
|
|
}
|
|
|
|
for k, v := range backends {
|
|
if !slices.Contains(toTheEnd, k) {
|
|
if _, ok := orderedBackends.Get(k); !ok {
|
|
orderedBackends.Set(k, v)
|
|
}
|
|
}
|
|
}
|
|
|
|
for _, t := range toTheEnd {
|
|
if _, ok := backends[t]; ok {
|
|
orderedBackends.Set(t, backends[t])
|
|
}
|
|
}
|
|
|
|
return orderedBackends, nil
|
|
}
|
|
|
|
// starts the grpcModelProcess for the backend, and returns a grpc client
|
|
// It also loads the model
|
|
func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string) (ModelAddress, error) {
|
|
return func(modelName, modelFile string) (ModelAddress, error) {
|
|
log.Debug().Msgf("Loading Model %s with gRPC (file: %s) (backend: %s): %+v", modelName, modelFile, backend, *o)
|
|
|
|
var client ModelAddress
|
|
|
|
getFreeAddress := func() (string, error) {
|
|
port, err := freeport.GetFreePort()
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed allocating free ports: %s", err.Error())
|
|
}
|
|
return fmt.Sprintf("127.0.0.1:%d", port), nil
|
|
}
|
|
|
|
// If no specific model path is set for transformers/HF, set it to the model path
|
|
for _, env := range []string{"HF_HOME", "TRANSFORMERS_CACHE", "HUGGINGFACE_HUB_CACHE"} {
|
|
if os.Getenv(env) == "" {
|
|
err := os.Setenv(env, ml.ModelPath)
|
|
if err != nil {
|
|
log.Error().Err(err).Str("name", env).Str("modelPath", ml.ModelPath).Msg("unable to set environment variable to modelPath")
|
|
}
|
|
}
|
|
}
|
|
|
|
// Check if the backend is provided as external
|
|
if uri, ok := o.externalBackends[backend]; ok {
|
|
log.Debug().Msgf("Loading external backend: %s", uri)
|
|
// check if uri is a file or a address
|
|
if _, err := os.Stat(uri); err == nil {
|
|
serverAddress, err := getFreeAddress()
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed allocating free ports: %s", err.Error())
|
|
}
|
|
// Make sure the process is executable
|
|
if err := ml.startProcess(uri, o.model, serverAddress); err != nil {
|
|
return "", err
|
|
}
|
|
|
|
log.Debug().Msgf("GRPC Service Started")
|
|
|
|
client = ModelAddress(serverAddress)
|
|
} else {
|
|
// address
|
|
client = ModelAddress(uri)
|
|
}
|
|
} else {
|
|
grpcProcess := backendPath(o.assetDir, backend)
|
|
|
|
foundCUDA := false
|
|
// for llama-cpp, check CPU capabilities and load the appropriate variant
|
|
if backend == LLamaCPP {
|
|
gpus, err := xsysinfo.GPUs()
|
|
if err == nil {
|
|
for _, gpu := range gpus {
|
|
if strings.Contains(gpu.String(), "nvidia") {
|
|
log.Info().Msgf("[%s] attempting to load with CUDA variant", backend)
|
|
grpcProcess = backendPath(o.assetDir, LLamaCPPCUDA)
|
|
if _, err := os.Stat(grpcProcess); err == nil {
|
|
foundCUDA = true
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if !foundCUDA {
|
|
if cpu.X86.HasAVX2 {
|
|
log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
|
|
grpcProcess = backendPath(o.assetDir, LLamaCPPAVX2)
|
|
} else if cpu.X86.HasAVX {
|
|
log.Info().Msgf("[%s] attempting to load with AVX variant", backend)
|
|
grpcProcess = backendPath(o.assetDir, LLamaCPPAVX)
|
|
} else {
|
|
log.Info().Msgf("[%s] attempting to load with fallback variant", backend)
|
|
grpcProcess = backendPath(o.assetDir, LLamaCPPFallback)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Check if the file exists
|
|
if _, err := os.Stat(grpcProcess); os.IsNotExist(err) {
|
|
return "", fmt.Errorf("grpc process not found: %s. some backends(stablediffusion, tts) require LocalAI compiled with GO_TAGS", grpcProcess)
|
|
}
|
|
|
|
serverAddress, err := getFreeAddress()
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed allocating free ports: %s", err.Error())
|
|
}
|
|
|
|
// Make sure the process is executable
|
|
if err := ml.startProcess(grpcProcess, o.model, serverAddress); err != nil {
|
|
return "", err
|
|
}
|
|
|
|
log.Debug().Msgf("GRPC Service Started")
|
|
|
|
client = ModelAddress(serverAddress)
|
|
}
|
|
|
|
// Wait for the service to start up
|
|
ready := false
|
|
for i := 0; i < o.grpcAttempts; i++ {
|
|
alive, err := client.GRPC(o.parallelRequests, ml.wd).HealthCheck(context.Background())
|
|
if alive {
|
|
log.Debug().Msgf("GRPC Service Ready")
|
|
ready = true
|
|
break
|
|
}
|
|
if err != nil && i == o.grpcAttempts-1 {
|
|
log.Error().Err(err).Msg("failed starting/connecting to the gRPC service")
|
|
}
|
|
time.Sleep(time.Duration(o.grpcAttemptsDelay) * time.Second)
|
|
}
|
|
|
|
if !ready {
|
|
log.Debug().Msgf("GRPC Service NOT ready")
|
|
return "", fmt.Errorf("grpc service not ready")
|
|
}
|
|
|
|
options := *o.gRPCOptions
|
|
options.Model = modelName
|
|
options.ModelFile = modelFile
|
|
|
|
log.Debug().Msgf("GRPC: Loading model with options: %+v", options)
|
|
|
|
res, err := client.GRPC(o.parallelRequests, ml.wd).LoadModel(o.context, &options)
|
|
if err != nil {
|
|
return "", fmt.Errorf("could not load model: %w", err)
|
|
}
|
|
if !res.Success {
|
|
return "", fmt.Errorf("could not load model (no success): %s", res.Message)
|
|
}
|
|
|
|
return client, nil
|
|
}
|
|
}
|
|
|
|
func (ml *ModelLoader) resolveAddress(addr ModelAddress, parallel bool) (grpc.Backend, error) {
|
|
if parallel {
|
|
return addr.GRPC(parallel, ml.wd), nil
|
|
}
|
|
|
|
if _, ok := ml.grpcClients[string(addr)]; !ok {
|
|
ml.grpcClients[string(addr)] = addr.GRPC(parallel, ml.wd)
|
|
}
|
|
return ml.grpcClients[string(addr)], nil
|
|
}
|
|
|
|
func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err error) {
|
|
o := NewOptions(opts...)
|
|
|
|
if o.model != "" {
|
|
log.Info().Msgf("Loading model '%s' with backend %s", o.model, o.backendString)
|
|
} else {
|
|
log.Info().Msgf("Loading model with backend %s", o.backendString)
|
|
}
|
|
|
|
backend := strings.ToLower(o.backendString)
|
|
if realBackend, exists := Aliases[backend]; exists {
|
|
backend = realBackend
|
|
log.Debug().Msgf("%s is an alias of %s", backend, realBackend)
|
|
}
|
|
|
|
if o.singleActiveBackend {
|
|
ml.mu.Lock()
|
|
log.Debug().Msgf("Stopping all backends except '%s'", o.model)
|
|
err := ml.StopAllExcept(o.model)
|
|
ml.mu.Unlock()
|
|
if err != nil {
|
|
log.Error().Err(err).Str("keptModel", o.model).Msg("error while shutting down all backends except for the keptModel")
|
|
return nil, err
|
|
}
|
|
|
|
}
|
|
|
|
var backendToConsume string
|
|
|
|
switch backend {
|
|
case Gpt4AllLlamaBackend, Gpt4AllMptBackend, Gpt4AllJBackend, Gpt4All:
|
|
o.gRPCOptions.LibrarySearchPath = filepath.Join(o.assetDir, "backend-assets", "gpt4all")
|
|
backendToConsume = Gpt4All
|
|
case PiperBackend:
|
|
o.gRPCOptions.LibrarySearchPath = filepath.Join(o.assetDir, "backend-assets", "espeak-ng-data")
|
|
backendToConsume = PiperBackend
|
|
default:
|
|
backendToConsume = backend
|
|
}
|
|
|
|
addr, err := ml.LoadModel(o.model, ml.grpcModel(backendToConsume, o))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return ml.resolveAddress(addr, o.parallelRequests)
|
|
}
|
|
|
|
func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
|
|
o := NewOptions(opts...)
|
|
|
|
ml.mu.Lock()
|
|
// Return earlier if we have a model already loaded
|
|
// (avoid looping through all the backends)
|
|
if m := ml.CheckIsLoaded(o.model); m != "" {
|
|
log.Debug().Msgf("Model '%s' already loaded", o.model)
|
|
ml.mu.Unlock()
|
|
|
|
return ml.resolveAddress(m, o.parallelRequests)
|
|
}
|
|
// If we can have only one backend active, kill all the others (except external backends)
|
|
if o.singleActiveBackend {
|
|
log.Debug().Msgf("Stopping all backends except '%s'", o.model)
|
|
err := ml.StopAllExcept(o.model)
|
|
if err != nil {
|
|
log.Error().Err(err).Str("keptModel", o.model).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing")
|
|
}
|
|
}
|
|
ml.mu.Unlock()
|
|
|
|
var err error
|
|
|
|
// autoload also external backends
|
|
allBackendsToAutoLoad := orderedmap.NewOrderedMap[string, any]()
|
|
autoLoadBackends, err := backendsInAssetDir(o.assetDir)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
log.Debug().Msgf("Loading from the following backends (in order): %+v", autoLoadBackends)
|
|
|
|
for _, k := range autoLoadBackends.Keys() {
|
|
v, _ := autoLoadBackends.Get(k)
|
|
allBackendsToAutoLoad.Set(k, v)
|
|
}
|
|
|
|
for _, b := range o.externalBackends {
|
|
allBackendsToAutoLoad.Set(b, []string{})
|
|
}
|
|
|
|
if o.model != "" {
|
|
log.Info().Msgf("Trying to load the model '%s' with the backend '%s'", o.model, allBackendsToAutoLoad.Keys())
|
|
}
|
|
|
|
for _, key := range allBackendsToAutoLoad.Keys() {
|
|
log.Info().Msgf("[%s] Attempting to load", key)
|
|
options := []Option{
|
|
WithBackendString(key),
|
|
WithModel(o.model),
|
|
WithLoadGRPCLoadModelOpts(o.gRPCOptions),
|
|
WithThreads(o.threads),
|
|
WithAssetDir(o.assetDir),
|
|
}
|
|
|
|
for k, v := range o.externalBackends {
|
|
options = append(options, WithExternalBackend(k, v))
|
|
}
|
|
|
|
model, modelerr := ml.BackendLoader(options...)
|
|
if modelerr == nil && model != nil {
|
|
log.Info().Msgf("[%s] Loads OK", key)
|
|
return model, nil
|
|
} else if modelerr != nil {
|
|
err = errors.Join(err, modelerr)
|
|
log.Info().Msgf("[%s] Fails: %s", key, modelerr.Error())
|
|
} else if model == nil {
|
|
err = errors.Join(err, fmt.Errorf("backend returned no usable model"))
|
|
log.Info().Msgf("[%s] Fails: %s", key, "backend returned no usable model")
|
|
}
|
|
}
|
|
|
|
return nil, fmt.Errorf("could not load model - all backends returned error: %s", err.Error())
|
|
}
|