Mirror of https://github.com/mudler/LocalAI.git
fix(llama-cpp): consistently select fallback (#3789)
* fix(llama-cpp): consistently select fallback

  We didn't take into consideration the case where the host has the CPU
  flagset, but the binaries were not actually present in the asset dir.
  This made it possible, for instance, for models that specified the
  llama-cpp backend directly in the config to never pick up the fallback
  binary when the optimized binaries were not present.

  Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore: adjust and simplify selection

  Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: move failure recovery to BackendLoader()

  Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* comments

  Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* minor fixups

  Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Parent: 1996e6f4c9
Commit: be6c4e6061
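The heart of the change is the CPU-variant selection: the fallback binary is now the default choice, an optimized binary only overrides it when it actually exists on disk, and the final choice is stat-checked before being returned. Below is a minimal standalone sketch of that pattern, simplified from the diff that follows; `selectCPUVariant`, `exists` and the plain file names here are illustrative stand-ins for the real `selectGRPCProcessByHostCapabilities`, `backendPath`, `LLamaCPP*` constants and the `xsysinfo`/`cpuid` capability probes.

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// selectCPUVariant mirrors the idea of the fix: default to the fallback
// build, upgrade to an optimized build only if that binary is present,
// and verify the final choice before returning it.
func selectCPUVariant(assetDir string, hasAVX2, hasAVX bool) string {
	// Select the fallback by default.
	selected := filepath.Join(assetDir, "llama-cpp-fallback")

	// If an optimized binary is actually shipped, prefer it.
	if hasAVX2 {
		if p := filepath.Join(assetDir, "llama-cpp-avx2"); exists(p) {
			selected = p
		}
	} else if hasAVX {
		if p := filepath.Join(assetDir, "llama-cpp-avx"); exists(p) {
			selected = p
		}
	}

	// Check that the chosen binary exists; otherwise report no usable variant.
	if exists(selected) {
		return selected
	}
	return ""
}

func exists(p string) bool {
	_, err := os.Stat(p)
	return err == nil
}

func main() {
	// Prints an empty string unless the binaries exist under the asset dir.
	fmt.Println(selectCPUVariant("/tmp/assets", true, true))
}
```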
@@ -28,7 +28,7 @@ var Aliases map[string]string = map[string]string{
 	"langchain-huggingface": LCHuggingFaceBackend,
 }
 
-var autoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"
+var AutoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"
 
 const (
 	LlamaGGML = "llama-ggml"
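The autodetection switch is renamed from the unexported autoDetect to the exported AutoDetect (passed explicitly into grpcModel later in this diff) and is still driven by the DISABLE_AUTODETECT environment variable. A tiny sketch of the same toggle:

```go
package main

import (
	"fmt"
	"os"
)

// AutoDetect mirrors the exported flag: autodetection stays on unless the
// DISABLE_AUTODETECT environment variable is set to exactly "true".
var AutoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"

func main() {
	fmt.Println("autodetect enabled:", AutoDetect)
}
```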
@@ -62,7 +62,7 @@ func backendPath(assetDir, backend string) string {
 
 // backendsInAssetDir returns the list of backends in the asset directory
 // that should be loaded
-func backendsInAssetDir(assetDir string) ([]string, error) {
+func backendsInAssetDir(assetDir string) (map[string][]string, error) {
 	// Exclude backends from automatic loading
 	excludeBackends := []string{LocalStoreBackend}
 	entry, err := os.ReadDir(backendPath(assetDir, ""))
@@ -86,7 +86,7 @@ ENTRY:
 
 		// Skip the llama.cpp variants if we are autoDetecting
 		// But we always load the fallback variant if it exists
-		if strings.Contains(e.Name(), LLamaCPP) && !strings.Contains(e.Name(), LLamaCPPFallback) && autoDetect {
+		if strings.Contains(e.Name(), LLamaCPP) && !strings.Contains(e.Name(), LLamaCPPFallback) && AutoDetect {
			continue
		}
 
@@ -94,7 +94,7 @@ ENTRY:
 	}
 
 	// if we are autoDetecting, we want to show the llama.cpp variants as a single backend
-	if autoDetect {
+	if AutoDetect {
		// if we find the llama.cpp variants, show them of as a single backend (llama-cpp) as later we are going to pick that up
		// when starting the service
		foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas, foundSycl16, foundSycl32 := false, false, false, false, false, false, false, false
@@ -136,6 +136,10 @@ ENTRY:
 		}
 	}
 
+	return backends, nil
+}
+
+func orderBackends(backends map[string][]string) ([]string, error) {
 	// order backends from the asset directory.
 	// as we scan for backends, we want to keep some order which backends are tried of.
 	// for example, llama.cpp should be tried first, and we want to keep the huggingface backend at the last.
@@ -181,8 +185,9 @@ ENTRY:
 	return orderedBackends.Keys(), nil
 }
 
-// selectGRPCProcess selects the GRPC process to start based on system capabilities
-func selectGRPCProcess(backend, assetDir string, f16 bool) string {
+// selectGRPCProcessByHostCapabilities selects the GRPC process to start based on system capabilities
+// Note: this is now relevant only for llama.cpp
+func selectGRPCProcessByHostCapabilities(backend, assetDir string, f16 bool) string {
 	foundCUDA := false
 	foundAMDGPU := false
 	foundIntelGPU := false
@@ -199,6 +204,7 @@ func selectGRPCProcess(backend, assetDir string, f16 bool) string {
 		return backendPath(assetDir, LLamaCPPGRPC)
 	}
 
+	// Check for GPU-binaries that are shipped with single binary releases
 	gpus, err := xsysinfo.GPUs()
 	if err == nil {
 		for _, gpu := range gpus {
@@ -243,32 +249,37 @@ func selectGRPCProcess(backend, assetDir string, f16 bool) string {
 		return grpcProcess
 	}
 
+	// No GPU found or no specific binaries found, try to load the CPU variant(s)
+
+	// Select the Fallback by default
+	selectedProcess := backendPath(assetDir, LLamaCPPFallback)
+
+	// IF we find any optimized binary, we use that
 	if xsysinfo.HasCPUCaps(cpuid.AVX2) {
 		p := backendPath(assetDir, LLamaCPPAVX2)
 		if _, err := os.Stat(p); err == nil {
 			log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
-			grpcProcess = p
+			selectedProcess = p
 		}
 	} else if xsysinfo.HasCPUCaps(cpuid.AVX) {
 		p := backendPath(assetDir, LLamaCPPAVX)
 		if _, err := os.Stat(p); err == nil {
 			log.Info().Msgf("[%s] attempting to load with AVX variant", backend)
-			grpcProcess = p
-		}
-	} else {
-		p := backendPath(assetDir, LLamaCPPFallback)
-		if _, err := os.Stat(p); err == nil {
-			log.Info().Msgf("[%s] attempting to load with fallback variant", backend)
-			grpcProcess = p
+			selectedProcess = p
 		}
 	}
 
-	return grpcProcess
+	// Check if the binary exists!
+	if _, err := os.Stat(selectedProcess); err == nil {
+		return selectedProcess
+	}
+
+	return ""
 }
 
 // starts the grpcModelProcess for the backend, and returns a grpc client
 // It also loads the model
-func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string, string) (*Model, error) {
+func (ml *ModelLoader) grpcModel(backend string, autodetect bool, o *Options) func(string, string, string) (*Model, error) {
 	return func(modelID, modelName, modelFile string) (*Model, error) {
 
 		log.Debug().Msgf("Loading Model %s with gRPC (file: %s) (backend: %s): %+v", modelID, modelFile, backend, *o)
@@ -324,9 +335,9 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
 			return nil, fmt.Errorf("refering to a backend not in asset dir: %s", err.Error())
 		}
 
-		if autoDetect {
+		if autodetect {
 			// autoDetect GRPC process to start based on system capabilities
-			if selectedProcess := selectGRPCProcess(backend, o.assetDir, o.gRPCOptions.F16Memory); selectedProcess != "" {
+			if selectedProcess := selectGRPCProcessByHostCapabilities(backend, o.assetDir, o.gRPCOptions.F16Memory); selectedProcess != "" {
 				grpcProcess = selectedProcess
 			}
 		}
@@ -407,7 +418,11 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
 }
 
 func (ml *ModelLoader) ListAvailableBackends(assetdir string) ([]string, error) {
-	return backendsInAssetDir(assetdir)
+	backends, err := backendsInAssetDir(assetdir)
+	if err != nil {
+		return nil, err
+	}
+	return orderBackends(backends)
 }
 
 func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err error) {
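ListAvailableBackends is no longer a thin alias for backendsInAssetDir: discovery now returns a map of backend name to the variant binaries found for it, and orderBackends flattens that map into a prioritized list. A rough standalone sketch of that ordering idea follows; the priority lists and helper names are hypothetical, since the diff's comments only state that llama.cpp is tried first and huggingface last.

```go
package main

import (
	"fmt"
	"sort"
)

// orderKeys flattens a map of backend -> variants into a deterministic,
// priority-ordered list of backend names: preferred entries first,
// everything else in stable order, and the "last" entries at the end.
func orderKeys(backends map[string][]string, first, last []string) []string {
	seen := map[string]bool{}
	out := []string{}
	add := func(name string) {
		if _, ok := backends[name]; ok && !seen[name] {
			seen[name] = true
			out = append(out, name)
		}
	}
	for _, name := range first {
		add(name)
	}
	rest := []string{}
	for name := range backends {
		if !seen[name] && !contains(last, name) {
			rest = append(rest, name)
		}
	}
	sort.Strings(rest) // keep the middle group deterministic
	for _, name := range rest {
		add(name)
	}
	for _, name := range last {
		add(name)
	}
	return out
}

func contains(xs []string, s string) bool {
	for _, x := range xs {
		if x == s {
			return true
		}
	}
	return false
}

func main() {
	backends := map[string][]string{
		"llama-cpp":   {"llama-cpp-avx2", "llama-cpp-fallback"},
		"whisper":     {"whisper"},
		"huggingface": {"huggingface"},
	}
	// Prints: [llama-cpp whisper huggingface]
	fmt.Println(orderKeys(backends, []string{"llama-cpp"}, []string{"huggingface"}))
}
```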
@@ -421,13 +436,7 @@ func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err e
 		log.Debug().Msgf("%s is an alias of %s", backend, realBackend)
 	}
 
-	if o.singleActiveBackend {
-		log.Debug().Msgf("Stopping all backends except '%s'", o.modelID)
-		err := ml.StopGRPC(allExcept(o.modelID))
-		if err != nil {
-			log.Error().Err(err).Str("keptModel", o.modelID).Msg("error while shutting down all backends except for the keptModel")
-		}
-	}
+	ml.stopActiveBackends(o.modelID, o.singleActiveBackend)
 
 	var backendToConsume string
 
@@ -439,14 +448,40 @@ func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err e
 		backendToConsume = backend
 	}
 
-	model, err := ml.LoadModel(o.modelID, o.model, ml.grpcModel(backendToConsume, o))
+	model, err := ml.LoadModel(o.modelID, o.model, ml.grpcModel(backendToConsume, AutoDetect, o))
 	if err != nil {
-		return nil, err
+		// XXX: This is too backend specific(llama-cpp), remove this bit or generalize further
+		// We failed somehow starting the binary. For instance, could be that we are missing
+		// some libraries if running in binary-only mode.
+		// In this case, we attempt to load the model with the fallback variant.
+
+		// If not llama-cpp backend, return error immediately
+		if backend != LLamaCPP {
+			return nil, err
+		}
+
+		// Otherwise attempt with fallback
+		log.Error().Msgf("[%s] Failed loading model, trying with fallback '%s'", backend, LLamaCPPFallback)
+		model, err = ml.LoadModel(o.modelID, o.model, ml.grpcModel(LLamaCPPFallback, false, o))
+		if err != nil {
+			return nil, err
+		}
 	}
 
 	return model.GRPC(o.parallelRequests, ml.wd), nil
 }
 
+func (ml *ModelLoader) stopActiveBackends(modelID string, singleActiveBackend bool) {
+	// If we can have only one backend active, kill all the others (except external backends)
+	if singleActiveBackend {
+		log.Debug().Msgf("Stopping all backends except '%s'", modelID)
+		err := ml.StopGRPC(allExcept(modelID))
+		if err != nil {
+			log.Error().Err(err).Str("keptModel", modelID).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing")
+		}
+	}
+}
+
 func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
 	o := NewOptions(opts...)
 
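Failure recovery now lives in BackendLoader itself: if the first load fails and the requested backend is llama-cpp, it retries once with the fallback variant, with autodetection disabled so the retry cannot be redirected back to an optimized binary. A minimal sketch of that retry shape, with a hypothetical load callback standing in for the real ml.LoadModel/grpcModel chain:

```go
package main

import (
	"errors"
	"fmt"
)

const (
	llamaCPP         = "llama-cpp"
	llamaCPPFallback = "llama-cpp-fallback"
)

// loadWithFallback tries the requested backend first; only for llama-cpp does
// it retry once with the fallback variant (and autodetection turned off).
func loadWithFallback(backend string, load func(backend string, autodetect bool) error) error {
	err := load(backend, true)
	if err == nil {
		return nil
	}
	// If not the llama-cpp backend, return the error immediately.
	if backend != llamaCPP {
		return err
	}
	// Otherwise attempt the fallback, without re-running autodetection.
	return load(llamaCPPFallback, false)
}

func main() {
	calls := []string{}
	err := loadWithFallback(llamaCPP, func(b string, auto bool) error {
		calls = append(calls, fmt.Sprintf("%s(autodetect=%v)", b, auto))
		if b != llamaCPPFallback {
			return errors.New("optimized binary missing")
		}
		return nil
	})
	// Shows the optimized attempt followed by the fallback attempt.
	fmt.Println(calls, err)
}
```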
@@ -458,19 +493,12 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
 		return m.GRPC(o.parallelRequests, ml.wd), nil
 	}
 
-	// If we can have only one backend active, kill all the others (except external backends)
-	if o.singleActiveBackend {
-		log.Debug().Msgf("Stopping all backends except '%s'", o.modelID)
-		err := ml.StopGRPC(allExcept(o.modelID))
-		if err != nil {
-			log.Error().Err(err).Str("keptModel", o.modelID).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing")
-		}
-	}
+	ml.stopActiveBackends(o.modelID, o.singleActiveBackend)
 
 	var err error
 
 	// get backends embedded in the binary
-	autoLoadBackends, err := backendsInAssetDir(o.assetDir)
+	autoLoadBackends, err := ml.ListAvailableBackends(o.assetDir)
 	if err != nil {
 		return nil, err
 	}
@@ -501,39 +529,6 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
 			err = errors.Join(err, fmt.Errorf("backend %s returned no usable model", key))
 			log.Info().Msgf("[%s] Fails: %s", key, "backend returned no usable model")
 		}
-
-		if autoDetect && key == LLamaCPP && err != nil {
-			// try as hard as possible to run the llama.cpp variants
-			backendToUse := ""
-			if xsysinfo.HasCPUCaps(cpuid.AVX2) {
-				if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPAVX2)); err == nil {
-					backendToUse = LLamaCPPAVX2
-				}
-			} else if xsysinfo.HasCPUCaps(cpuid.AVX) {
-				if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPAVX2)); err == nil {
-					backendToUse = LLamaCPPAVX
-				}
-			} else {
-				if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPFallback)); err == nil {
-					backendToUse = LLamaCPPFallback
-				} else {
-					// If we don't have a fallback, just skip fallback
-					continue
-				}
-			}
-
-			// Autodetection failed, try the fallback
-			log.Info().Msgf("[%s] Autodetection failed, trying the fallback", key)
-			options = append(options, WithBackendString(backendToUse))
-			model, modelerr = ml.BackendLoader(options...)
-			if modelerr == nil && model != nil {
-				log.Info().Msgf("[%s] Loads OK", key)
-				return model, nil
-			} else {
-				err = errors.Join(err, fmt.Errorf("[%s]: %w", key, modelerr))
-				log.Info().Msgf("[%s] Fails: %s", key, modelerr.Error())
-			}
-		}
 	}
 
 	return nil, fmt.Errorf("could not load model - all backends returned error: %s", err.Error())