package model

import (
	"context"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"slices"
	"strings"
	"time"

	"github.com/elliotchance/orderedmap/v2"
	"github.com/klauspost/cpuid/v2"
	grpc "github.com/mudler/LocalAI/pkg/grpc"
	"github.com/mudler/LocalAI/pkg/library"
	"github.com/mudler/LocalAI/pkg/utils"
	"github.com/mudler/LocalAI/pkg/xsysinfo"
	"github.com/phayes/freeport"
	"github.com/rs/zerolog/log"
)
// Aliases maps legacy or alternative backend names to the canonical backend.
var Aliases map[string]string = map[string]string{
	"go-llama":              LLamaCPP,
	"llama":                 LLamaCPP,
	"embedded-store":        LocalStoreBackend,
	"langchain-huggingface": LCHuggingFaceBackend,
}
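// For example, a request for the legacy "go-llama" backend transparently
// resolves to the llama-cpp backend:
//
//	backend := "go-llama"
//	if real, ok := Aliases[backend]; ok {
//		backend = real // "llama-cpp"
//	}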
// autoDetect controls whether the best llama.cpp variant for the host is
// picked at runtime; it can be disabled with DISABLE_AUTODETECT=true.
var autoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"
const (
	LlamaGGML = "llama-ggml"

	LLamaCPP = "llama-cpp"

	LLamaCPPAVX2     = "llama-cpp-avx2"
	LLamaCPPAVX      = "llama-cpp-avx"
	LLamaCPPFallback = "llama-cpp-fallback"
	LLamaCPPCUDA     = "llama-cpp-cuda"
	LLamaCPPHipblas  = "llama-cpp-hipblas"
	LLamaCPPSycl16   = "llama-cpp-sycl_16"
	LLamaCPPSycl32   = "llama-cpp-sycl_32"
	LLamaCPPGRPC     = "llama-cpp-grpc"

	BertEmbeddingsBackend  = "bert-embeddings"
	RwkvBackend            = "rwkv"
	WhisperBackend         = "whisper"
	StableDiffusionBackend = "stablediffusion"
	TinyDreamBackend       = "tinydream"
	PiperBackend           = "piper"
	LCHuggingFaceBackend   = "huggingface"
	LocalStoreBackend      = "local-store"
)
// backendPath returns the path to a gRPC backend binary inside the asset directory.
func backendPath(assetDir, backend string) string {
	return filepath.Join(assetDir, "backend-assets", "grpc", backend)
}
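// A quick sketch of the resulting layout (the asset directory path is an
// assumption for illustration):
//
//	backendPath("/opt/local-ai", LLamaCPPAVX2)
//	// => /opt/local-ai/backend-assets/grpc/llama-cpp-avx2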
// backendsInAssetDir returns the list of backends in the asset directory
// that should be loaded.
func backendsInAssetDir(assetDir string) ([]string, error) {
	// Exclude backends from automatic loading
	excludeBackends := []string{LocalStoreBackend}
	entry, err := os.ReadDir(backendPath(assetDir, ""))
	if err != nil {
		return nil, err
	}
	backends := make(map[string][]string)
ENTRY:
	for _, e := range entry {
		for _, exclude := range excludeBackends {
			if e.Name() == exclude {
				continue ENTRY
			}
		}
		if e.IsDir() {
			continue
		}

		// Skip the llama.cpp variants if we are autodetecting,
		// but always load the fallback variant if it exists.
		if strings.Contains(e.Name(), LLamaCPP) && !strings.Contains(e.Name(), LLamaCPPFallback) && autoDetect {
			continue
		}
		backends[e.Name()] = []string{}
	}
	// If we are autodetecting, collapse the llama.cpp variants into a single
	// backend (llama-cpp): the concrete variant is picked up later, when the
	// service is started.
	if autoDetect {
		foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas, foundSycl16, foundSycl32 := false, false, false, false, false, false, false, false
		if _, ok := backends[LLamaCPP]; !ok {
			for _, e := range entry {
				if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 {
					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPAVX2)
					foundLCPPAVX2 = true
				}
				if strings.Contains(e.Name(), LLamaCPPAVX) && !foundLCPPAVX {
					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPAVX)
					foundLCPPAVX = true
				}
				if strings.Contains(e.Name(), LLamaCPPFallback) && !foundLCPPFallback {
					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPFallback)
					foundLCPPFallback = true
				}
				if strings.Contains(e.Name(), LLamaCPPGRPC) && !foundLCPPGRPC {
					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPGRPC)
					foundLCPPGRPC = true
				}
				if strings.Contains(e.Name(), LLamaCPPCUDA) && !foundLCPPCuda {
					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPCUDA)
					foundLCPPCuda = true
				}
				if strings.Contains(e.Name(), LLamaCPPHipblas) && !foundLCPPHipblas {
					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPHipblas)
					foundLCPPHipblas = true
				}
				if strings.Contains(e.Name(), LLamaCPPSycl16) && !foundSycl16 {
					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPSycl16)
					foundSycl16 = true
				}
				if strings.Contains(e.Name(), LLamaCPPSycl32) && !foundSycl32 {
					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPSycl32)
					foundSycl32 = true
				}
			}
		}
	}
	// Order the backends from the asset directory.
	// As we scan for backends, we want to keep some order in which they are
	// tried: for example, llama.cpp should be tried first, and the
	// huggingface backend should come last.

	// Priority list: entries that come first are tried first.
	priorityList := []string{
		// First the llama.cpp variants, with llama-ggml to follow.
		// We keep the fallback variant so that, if the llama.cpp variants
		// (which depend on shared libraries) break, we still have a safety net.
		LLamaCPP, LlamaGGML, LLamaCPPFallback,
	}

	toTheEnd := []string{
		// last has to be huggingface
		LCHuggingFaceBackend,
		// then bert embeddings
		BertEmbeddingsBackend,
	}
	// Create an ordered map and add the priority list first.
	orderedBackends := orderedmap.NewOrderedMap[string, any]()
	for _, p := range priorityList {
		if _, ok := backends[p]; ok {
			orderedBackends.Set(p, backends[p])
		}
	}

	// Then everything else, except the backends that must come last.
	for k, v := range backends {
		if !slices.Contains(toTheEnd, k) {
			if _, ok := orderedBackends.Get(k); !ok {
				orderedBackends.Set(k, v)
			}
		}
	}

	// Finally, the backends that must come last.
	for _, t := range toTheEnd {
		if _, ok := backends[t]; ok {
			orderedBackends.Set(t, backends[t])
		}
	}

	return orderedBackends.Keys(), nil
}
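// Usage sketch (the asset directory is hypothetical; the exact listing
// depends on which backends were compiled into the binary):
//
//	backends, err := backendsInAssetDir("/opt/local-ai")
//	// With autodetection on, the llama.cpp variants are collapsed, so the
//	// result might look like:
//	//   [llama-cpp llama-ggml llama-cpp-fallback whisper piper ...]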
// selectGRPCProcess selects the GRPC process to start based on system capabilities.
func selectGRPCProcess(backend, assetDir string, f16 bool) string {
	foundCUDA := false
	foundAMDGPU := false
	foundIntelGPU := false
	var grpcProcess string

	// Selection currently applies only to llama.cpp
	if backend != LLamaCPP {
		return ""
	}

	// Note: This environment variable is read by LocalAI's llama.cpp grpc-server
	if os.Getenv("LLAMACPP_GRPC_SERVERS") != "" {
		log.Info().Msgf("[%s] attempting to load with GRPC variant", LLamaCPPGRPC)
		return backendPath(assetDir, LLamaCPPGRPC)
	}
	gpus, err := xsysinfo.GPUs()
	if err == nil {
		for _, gpu := range gpus {
			if strings.Contains(gpu.String(), "nvidia") {
				p := backendPath(assetDir, LLamaCPPCUDA)
				if _, err := os.Stat(p); err == nil {
					log.Info().Msgf("[%s] attempting to load with CUDA variant", backend)
					grpcProcess = p
					foundCUDA = true
				} else {
					log.Debug().Msgf("Nvidia GPU device found, but no embedded CUDA variant. You can ignore this message if you are using a container with CUDA support")
				}
			}
			if strings.Contains(gpu.String(), "amd") {
				p := backendPath(assetDir, LLamaCPPHipblas)
				if _, err := os.Stat(p); err == nil {
					log.Info().Msgf("[%s] attempting to load with HIPBLAS variant", backend)
					grpcProcess = p
					foundAMDGPU = true
				} else {
					log.Debug().Msgf("AMD GPU device found, but no embedded HIPBLAS variant. You can ignore this message if you are using a container with HIPBLAS support")
				}
			}
			if strings.Contains(gpu.String(), "intel") {
				backend := LLamaCPPSycl16
				if !f16 {
					backend = LLamaCPPSycl32
				}
				p := backendPath(assetDir, backend)
				if _, err := os.Stat(p); err == nil {
					log.Info().Msgf("[%s] attempting to load with Intel variant", backend)
					grpcProcess = p
					foundIntelGPU = true
				} else {
					log.Debug().Msgf("Intel GPU device found, but no embedded SYCL variant. You can ignore this message if you are using a container with SYCL support")
				}
			}
		}
	}

	if foundCUDA || foundAMDGPU || foundIntelGPU {
		return grpcProcess
	}

	// No usable GPU variant: fall back to the best CPU variant on disk.
	if xsysinfo.HasCPUCaps(cpuid.AVX2) {
		p := backendPath(assetDir, LLamaCPPAVX2)
		if _, err := os.Stat(p); err == nil {
			log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
			grpcProcess = p
		}
	} else if xsysinfo.HasCPUCaps(cpuid.AVX) {
		p := backendPath(assetDir, LLamaCPPAVX)
		if _, err := os.Stat(p); err == nil {
			log.Info().Msgf("[%s] attempting to load with AVX variant", backend)
			grpcProcess = p
		}
	} else {
		p := backendPath(assetDir, LLamaCPPFallback)
		if _, err := os.Stat(p); err == nil {
			log.Info().Msgf("[%s] attempting to load with fallback variant", backend)
			grpcProcess = p
		}
	}

	return grpcProcess
}
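// Usage sketch (the directory and f16 flag are assumptions for the example):
//
//	if p := selectGRPCProcess(LLamaCPP, "/opt/local-ai", true); p != "" {
//		// On an AVX2-only host with no GPU, p would typically be
//		// /opt/local-ai/backend-assets/grpc/llama-cpp-avx2
//	}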
// grpcModel starts the grpc process for the backend and returns a grpc
// client. It also loads the model.
func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string) (ModelAddress, error) {
	return func(modelName, modelFile string) (ModelAddress, error) {
		log.Debug().Msgf("Loading Model %s with gRPC (file: %s) (backend: %s): %+v", modelName, modelFile, backend, *o)

		var client ModelAddress

		getFreeAddress := func() (string, error) {
			port, err := freeport.GetFreePort()
			if err != nil {
				return "", fmt.Errorf("failed allocating free ports: %s", err.Error())
			}
			return fmt.Sprintf("127.0.0.1:%d", port), nil
		}
		// If no specific model path is set for transformers/HF, set it to the model path
		for _, env := range []string{"HF_HOME", "TRANSFORMERS_CACHE", "HUGGINGFACE_HUB_CACHE"} {
			if os.Getenv(env) == "" {
				err := os.Setenv(env, ml.ModelPath)
				if err != nil {
					log.Error().Err(err).Str("name", env).Str("modelPath", ml.ModelPath).Msg("unable to set environment variable to modelPath")
				}
			}
		}
2023-07-20 20:10:12 +00:00
// Check if the backend is provided as external
if uri , ok := o . externalBackends [ backend ] ; ok {
log . Debug ( ) . Msgf ( "Loading external backend: %s" , uri )
// check if uri is a file or a address
2024-08-23 22:27:14 +00:00
if fi , err := os . Stat ( uri ) ; err == nil {
log . Debug ( ) . Msgf ( "external backend is file: %+v" , fi )
2023-07-20 20:10:12 +00:00
serverAddress , err := getFreeAddress ( )
if err != nil {
2024-04-17 21:33:49 +00:00
return "" , fmt . Errorf ( "failed allocating free ports: %s" , err . Error ( ) )
2023-07-20 20:10:12 +00:00
}
// Make sure the process is executable
2023-08-07 20:39:10 +00:00
if err := ml . startProcess ( uri , o . model , serverAddress ) ; err != nil {
2024-08-23 22:27:14 +00:00
log . Error ( ) . Err ( err ) . Str ( "path" , uri ) . Msg ( "failed to launch " )
2023-11-16 07:20:05 +00:00
return "" , err
2023-07-20 20:10:12 +00:00
}
log . Debug ( ) . Msgf ( "GRPC Service Started" )
2023-11-16 07:20:05 +00:00
client = ModelAddress ( serverAddress )
2023-07-20 20:10:12 +00:00
} else {
2024-08-23 22:27:14 +00:00
log . Debug ( ) . Msg ( "external backend is uri" )
2023-07-20 20:10:12 +00:00
// address
2023-11-16 07:20:05 +00:00
client = ModelAddress ( uri )
2023-07-14 23:19:43 +00:00
}
2023-07-20 20:10:12 +00:00
} else {
			grpcProcess := backendPath(o.assetDir, backend)
			if err := utils.VerifyPath(grpcProcess, o.assetDir); err != nil {
				return "", fmt.Errorf("grpc process not found in assetdir: %s", err.Error())
			}

			if autoDetect {
				// autodetect the GRPC process to start based on system capabilities
				if selectedProcess := selectGRPCProcess(backend, o.assetDir, o.gRPCOptions.F16Memory); selectedProcess != "" {
					grpcProcess = selectedProcess
				}
			}

			// Check if the file exists
			if _, err := os.Stat(grpcProcess); os.IsNotExist(err) {
				return "", fmt.Errorf("grpc process not found: %s. some backends(stablediffusion, tts) require LocalAI compiled with GO_TAGS", grpcProcess)
			}

			serverAddress, err := getFreeAddress()
			if err != nil {
				return "", fmt.Errorf("failed allocating free ports: %s", err.Error())
			}

			args := []string{}
			// Load the ld.so if it exists
			args, grpcProcess = library.LoadLDSO(o.assetDir, args, grpcProcess)

			// Make sure the process is executable in any circumstance
			if err := ml.startProcess(grpcProcess, o.model, serverAddress, args...); err != nil {
				return "", err
			}

			log.Debug().Msgf("GRPC Service Started")

			client = ModelAddress(serverAddress)
		}
		// Wait for the service to start up
		ready := false
		for i := 0; i < o.grpcAttempts; i++ {
			alive, err := client.GRPC(o.parallelRequests, ml.wd).HealthCheck(context.Background())
			if alive {
				log.Debug().Msgf("GRPC Service Ready")
				ready = true
				break
			}
			if err != nil && i == o.grpcAttempts-1 {
				log.Error().Err(err).Msg("failed starting/connecting to the gRPC service")
			}
			time.Sleep(time.Duration(o.grpcAttemptsDelay) * time.Second)
		}

		if !ready {
			log.Debug().Msgf("GRPC Service NOT ready")
			return "", fmt.Errorf("grpc service not ready")
		}

		options := *o.gRPCOptions
		options.Model = modelName
		options.ModelFile = modelFile

		log.Debug().Msgf("GRPC: Loading model with options: %+v", options)

		res, err := client.GRPC(o.parallelRequests, ml.wd).LoadModel(o.context, &options)
		if err != nil {
			return "", fmt.Errorf("could not load model: %w", err)
		}
		if !res.Success {
			return "", fmt.Errorf("could not load model (no success): %s", res.Message)
		}

		return client, nil
	}
}
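// The returned closure is consumed by ml.LoadModel, which supplies the model
// name and resolved file path, e.g. (as done in BackendLoader below):
//
//	addr, err := ml.LoadModel(o.model, ml.grpcModel(PiperBackend, o))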
// resolveAddress returns a gRPC client for the given address. Unless parallel
// requests are enabled, clients are cached and reused per address.
func (ml *ModelLoader) resolveAddress(addr ModelAddress, parallel bool) (grpc.Backend, error) {
	if parallel {
		return addr.GRPC(parallel, ml.wd), nil
	}

	if _, ok := ml.grpcClients[string(addr)]; !ok {
		ml.grpcClients[string(addr)] = addr.GRPC(parallel, ml.wd)
	}
	return ml.grpcClients[string(addr)], nil
}
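// Sketch of the caching behaviour (the address is illustrative):
//
//	b1, _ := ml.resolveAddress(ModelAddress("127.0.0.1:50051"), false)
//	b2, _ := ml.resolveAddress(ModelAddress("127.0.0.1:50051"), false)
//	// b1 and b2 share the same cached client; with parallel=true every
//	// call would build a fresh one instead.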
func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err error) {
	o := NewOptions(opts...)

	if o.model != "" {
		log.Info().Msgf("Loading model '%s' with backend %s", o.model, o.backendString)
	} else {
		log.Info().Msgf("Loading model with backend %s", o.backendString)
	}

	backend := strings.ToLower(o.backendString)
	if realBackend, exists := Aliases[backend]; exists {
		log.Debug().Msgf("%s is an alias of %s", backend, realBackend)
		backend = realBackend
	}
	if o.singleActiveBackend {
		ml.mu.Lock()
		log.Debug().Msgf("Stopping all backends except '%s'", o.model)
		err := ml.StopAllExcept(o.model)
		ml.mu.Unlock()
		if err != nil {
			log.Error().Err(err).Str("keptModel", o.model).Msg("error while shutting down all backends except for the keptModel")
			return nil, err
		}
	}
	var backendToConsume string

	switch backend {
	case PiperBackend:
		o.gRPCOptions.LibrarySearchPath = filepath.Join(o.assetDir, "backend-assets", "espeak-ng-data")
		backendToConsume = PiperBackend
	default:
		backendToConsume = backend
	}

	addr, err := ml.LoadModel(o.model, ml.grpcModel(backendToConsume, o))
	if err != nil {
		return nil, err
	}

	return ml.resolveAddress(addr, o.parallelRequests)
}
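// Usage sketch (the model file and asset directory are assumptions for
// illustration):
//
//	backend, err := ml.BackendLoader(
//		WithBackendString(LLamaCPP),
//		WithModel("ggml-model.gguf"),
//		WithAssetDir("/opt/local-ai"),
//	)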
// GreedyLoader tries to load the model with each available backend, in order,
// until one succeeds.
func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
	o := NewOptions(opts...)

	ml.mu.Lock()

	// Return early if we have a model already loaded
	// (avoid looping through all the backends)
	if m := ml.CheckIsLoaded(o.model); m != "" {
		log.Debug().Msgf("Model '%s' already loaded", o.model)
		ml.mu.Unlock()
		return ml.resolveAddress(m, o.parallelRequests)
	}

	// If we can have only one backend active, kill all the others (except external backends)
	if o.singleActiveBackend {
		log.Debug().Msgf("Stopping all backends except '%s'", o.model)
		err := ml.StopAllExcept(o.model)
		if err != nil {
			log.Error().Err(err).Str("keptModel", o.model).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing")
		}
	}
	ml.mu.Unlock()
	var err error

	// Get the backends embedded in the binary
	autoLoadBackends, err := backendsInAssetDir(o.assetDir)
	if err != nil {
		return nil, err
	}

	// Append the external backends supplied by the user via the CLI
	for _, b := range o.externalBackends {
		autoLoadBackends = append(autoLoadBackends, b)
	}

	log.Debug().Msgf("Loading from the following backends (in order): %+v", autoLoadBackends)

	if o.model != "" {
		log.Info().Msgf("Trying to load the model '%s' with one of the following backends: %+v", o.model, autoLoadBackends)
	}
	for _, key := range autoLoadBackends {
		log.Info().Msgf("[%s] Attempting to load", key)
		options := []Option{
			WithBackendString(key),
			WithModel(o.model),
			WithLoadGRPCLoadModelOpts(o.gRPCOptions),
			WithThreads(o.threads),
			WithAssetDir(o.assetDir),
		}

		for k, v := range o.externalBackends {
			options = append(options, WithExternalBackend(k, v))
		}

		model, modelerr := ml.BackendLoader(options...)
		if modelerr == nil && model != nil {
			log.Info().Msgf("[%s] Loads OK", key)
			return model, nil
		} else if modelerr != nil {
			err = errors.Join(err, fmt.Errorf("[%s]: %w", key, modelerr))
			log.Info().Msgf("[%s] Fails: %s", key, modelerr.Error())
		} else if model == nil {
			err = errors.Join(err, fmt.Errorf("backend %s returned no usable model", key))
			log.Info().Msgf("[%s] Fails: %s", key, "backend returned no usable model")
		}
		if autoDetect && key == LLamaCPP && err != nil {
			// Autodetection failed: try as hard as possible to run a
			// llama.cpp variant anyway, picking the best one that exists
			// on disk for the host CPU.
			backendToUse := ""
			if xsysinfo.HasCPUCaps(cpuid.AVX2) {
				if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPAVX2)); err == nil {
					backendToUse = LLamaCPPAVX2
				}
			} else if xsysinfo.HasCPUCaps(cpuid.AVX) {
				if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPAVX)); err == nil {
					backendToUse = LLamaCPPAVX
				}
			} else {
				if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPFallback)); err == nil {
					backendToUse = LLamaCPPFallback
				} else {
					// If we don't have a fallback on disk, just skip it
					continue
				}
			}

			// Autodetection failed, try the fallback
			log.Info().Msgf("[%s] Autodetection failed, trying the fallback", key)
			options = append(options, WithBackendString(backendToUse))
			model, modelerr = ml.BackendLoader(options...)
			if modelerr == nil && model != nil {
				log.Info().Msgf("[%s] Loads OK", key)
				return model, nil
			} else {
				err = errors.Join(err, fmt.Errorf("[%s]: %w", key, modelerr))
				log.Info().Msgf("[%s] Fails: %s", key, modelerr.Error())
			}
		}
	}

	return nil, fmt.Errorf("could not load model - all backends returned error: %s", err.Error())
}
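// Usage sketch (hypothetical caller; the model file is an assumption). Unlike
// BackendLoader, no backend string is given: every backend returned by
// backendsInAssetDir is tried in order until one loads the model:
//
//	backend, err := ml.GreedyLoader(
//		WithModel("ggml-model.gguf"),
//		WithAssetDir("/opt/local-ai"),
//	)
//	if err != nil {
//		log.Error().Err(err).Msg("no backend could load the model")
//	}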