diff --git a/core/cli/util.go b/core/cli/util.go index 57b8ad9e..5802d996 100644 --- a/core/cli/util.go +++ b/core/cli/util.go @@ -7,11 +7,11 @@ import ( "github.com/rs/zerolog/log" + gguf "github.com/gpustack/gguf-parser-go" cliContext "github.com/mudler/LocalAI/core/cli/context" "github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/gallery" "github.com/mudler/LocalAI/pkg/downloader" - gguf "github.com/thxcode/gguf-parser-go" ) type UtilCMD struct { @@ -51,7 +51,7 @@ func (u *GGUFInfoCMD) Run(ctx *cliContext.Context) error { log.Info(). Any("eosTokenID", f.Tokenizer().EOSTokenID). Any("bosTokenID", f.Tokenizer().BOSTokenID). - Any("modelName", f.Model().Name). + Any("modelName", f.Metadata().Name). Any("architecture", f.Architecture().Architecture).Msgf("GGUF file loaded: %s", u.Args[0]) log.Info().Any("tokenizer", fmt.Sprintf("%+v", f.Tokenizer())).Msg("Tokenizer") diff --git a/core/config/gguf.go b/core/config/gguf.go index cf9eacaa..1c8db29c 100644 --- a/core/config/gguf.go +++ b/core/config/gguf.go @@ -3,9 +3,10 @@ package config import ( "strings" + "github.com/mudler/LocalAI/pkg/xsysinfo" "github.com/rs/zerolog/log" - gguf "github.com/thxcode/gguf-parser-go" + gguf "github.com/gpustack/gguf-parser-go" ) type familyType uint8 @@ -23,6 +24,7 @@ const ( const ( defaultContextSize = 1024 + defaultNGPULayers = 99999999 ) type settingsConfig struct { @@ -147,7 +149,7 @@ var knownTemplates = map[string]familyType{ func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) { if defaultCtx == 0 && cfg.ContextSize == nil { - ctxSize := f.EstimateLLaMACppUsage().ContextSize + ctxSize := f.EstimateLLaMACppRun().ContextSize if ctxSize > 0 { cSize := int(ctxSize) cfg.ContextSize = &cSize @@ -157,6 +159,46 @@ func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) { } } + // GPU options + if cfg.Options == nil { + if xsysinfo.HasGPU("nvidia") || xsysinfo.HasGPU("amd") { + cfg.Options = []string{"gpu"} + } + } + + // 
vram estimation + vram, err := xsysinfo.TotalAvailableVRAM() + if err != nil { + log.Error().Msgf("guessDefaultsFromFile(TotalAvailableVRAM): %s", err) + } else { + estimate, err := xsysinfo.EstimateGGUFVRAMUsage(f, vram) + if err != nil { + log.Error().Msgf("guessDefaultsFromFile(EstimateGGUFVRAMUsage): %s", err) + } else { + if estimate.IsFullOffload { + log.Warn().Msgf("guessDefaultsFromFile: %s", "full offload is recommended") + } + + if estimate.EstimatedVRAM > vram { + log.Warn().Msgf("guessDefaultsFromFile: %s", "estimated VRAM usage is greater than available VRAM") + } + + if cfg.NGPULayers == nil && estimate.EstimatedLayers > 0 { + log.Debug().Msgf("guessDefaultsFromFile: %d layers estimated", estimate.EstimatedLayers) + cfg.NGPULayers = &estimate.EstimatedLayers + } + } + } + + if cfg.NGPULayers == nil { + // we assume we want to offload all layers + defaultHigh := defaultNGPULayers + cfg.NGPULayers = &defaultHigh + } + + log.Debug().Any("NGPULayers", cfg.NGPULayers).Msgf("guessDefaultsFromFile: %s", "NGPULayers set") + + // template estimations if cfg.HasTemplate() { // nothing to guess here log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set") @@ -166,12 +208,12 @@ func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) { log.Debug(). Any("eosTokenID", f.Tokenizer().EOSTokenID). Any("bosTokenID", f.Tokenizer().BOSTokenID). - Any("modelName", f.Model().Name). + Any("modelName", f.Metadata().Name). 
Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName()) // guess the name if cfg.Name == "" { - cfg.Name = f.Model().Name + cfg.Name = f.Metadata().Name } family := identifyFamily(f) @@ -207,6 +249,7 @@ func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) { cfg.TemplateConfig.JinjaTemplate = true cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString() } + } func identifyFamily(f *gguf.GGUFFile) familyType { @@ -231,7 +274,7 @@ func identifyFamily(f *gguf.GGUFFile) familyType { commandR := arch == "command-r" && eosTokenID == 255001 qwen2 := arch == "qwen2" phi3 := arch == "phi-3" - gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Model().Name), "gemma") + gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Metadata().Name), "gemma") deepseek2 := arch == "deepseek2" switch { diff --git a/core/config/guesser.go b/core/config/guesser.go index e66df70d..260f5a64 100644 --- a/core/config/guesser.go +++ b/core/config/guesser.go @@ -4,9 +4,8 @@ import ( "os" "path/filepath" - "github.com/mudler/LocalAI/pkg/xsysinfo" + gguf "github.com/gpustack/gguf-parser-go" "github.com/rs/zerolog/log" - gguf "github.com/thxcode/gguf-parser-go" ) func guessDefaultsFromFile(cfg *BackendConfig, modelPath string, defaultCtx int) { @@ -36,10 +35,4 @@ func guessDefaultsFromFile(cfg *BackendConfig, modelPath string, defaultCtx int) } cfg.ContextSize = &defaultCtx } - - if cfg.Options == nil { - if xsysinfo.HasGPU("nvidia") || xsysinfo.HasGPU("amd") { - cfg.Options = []string{"gpu"} - } - } } diff --git a/go.mod b/go.mod index 856d41f5..757376ab 100644 --- a/go.mod +++ b/go.mod @@ -27,6 +27,7 @@ require ( github.com/golang/protobuf v1.5.4 github.com/google/go-containerregistry v0.19.2 github.com/google/uuid v1.6.0 + github.com/gpustack/gguf-parser-go v0.17.0 github.com/grpc-ecosystem/grpc-gateway v1.5.0 github.com/hpcloud/tail v1.0.0 github.com/ipfs/go-log v1.0.5 
@@ -110,6 +111,7 @@ require ( github.com/pion/turn/v2 v2.1.6 // indirect github.com/pion/turn/v4 v4.0.0 // indirect github.com/pion/webrtc/v4 v4.0.9 // indirect + github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 // indirect github.com/savsgio/gotils v0.0.0-20230208104028-c358bd845dee // indirect github.com/shirou/gopsutil/v4 v4.24.7 // indirect github.com/wlynxg/anet v0.0.5 // indirect @@ -188,7 +190,7 @@ require ( github.com/hashicorp/go-multierror v1.1.1 // indirect github.com/hashicorp/golang-lru v1.0.2 // indirect github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect - github.com/henvic/httpretty v0.1.3 // indirect + github.com/henvic/httpretty v0.1.4 // indirect github.com/huandu/xstrings v1.5.0 // indirect github.com/huin/goupnp v1.3.0 // indirect github.com/ipfs/boxo v0.27.4 // indirect @@ -278,7 +280,7 @@ require ( github.com/shoenig/go-m1cpu v0.1.6 // indirect github.com/shopspring/decimal v1.4.0 // indirect github.com/sirupsen/logrus v1.9.3 // indirect - github.com/smallnest/ringbuffer v0.0.0-20240423223918-bab516b2000b // indirect + github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d // indirect github.com/songgao/packets v0.0.0-20160404182456-549a10cd4091 // indirect github.com/spaolacci/murmur3 v1.1.0 // indirect github.com/spf13/cast v1.7.0 // indirect diff --git a/go.sum b/go.sum index 06e0238f..aad5d177 100644 --- a/go.sum +++ b/go.sum @@ -295,6 +295,8 @@ github.com/gorilla/css v1.0.1 h1:ntNaBIghp6JmvWnxbZKANoLyuXTPZ4cAMlo6RyhlbO8= github.com/gorilla/css v1.0.1/go.mod h1:BvnYkspnSzMmwRK+b8/xgNPLiIuNZr6vbZBTPQ2A3b0= github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg= github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/gpustack/gguf-parser-go v0.17.0 h1:DkSziWLsiQM0pqqkr/zMcaBn94KY7iQTi4zmaHixDus= +github.com/gpustack/gguf-parser-go v0.17.0/go.mod h1:GvHh1Kvvq5ojCOsJ5UpwiJJmIjFw3Qk5cW7R+CZ3IJo= github.com/gregjones/httpcache 
v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= github.com/grpc-ecosystem/grpc-gateway v1.5.0 h1:WcmKMm43DR7RdtlkEXQJyo5ws8iTp98CyhCCbOHMvNI= github.com/grpc-ecosystem/grpc-gateway v1.5.0/go.mod h1:RSKVYQBd5MCa4OVpNdGskqpgL2+G+NZTnrVHpWWfpdw= @@ -307,8 +309,8 @@ github.com/hashicorp/golang-lru v1.0.2 h1:dV3g9Z/unq5DpblPpw+Oqcv4dU/1omnb4Ok8iP github.com/hashicorp/golang-lru v1.0.2/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= -github.com/henvic/httpretty v0.1.3 h1:4A6vigjz6Q/+yAfTD4wqipCv+Px69C7Th/NhT0ApuU8= -github.com/henvic/httpretty v0.1.3/go.mod h1:UUEv7c2kHZ5SPQ51uS3wBpzPDibg2U3Y+IaXyHy5GBg= +github.com/henvic/httpretty v0.1.4 h1:Jo7uwIRWVFxkqOnErcoYfH90o3ddQyVrSANeS4cxYmU= +github.com/henvic/httpretty v0.1.4/go.mod h1:Dn60sQTZfbt2dYsdUSNsCljyF4AfdqnuJFDLJA1I4AM= github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg= github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI= @@ -660,6 +662,8 @@ github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUc github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= +github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 h1:18kd+8ZUlt/ARXhljq+14TwAoKa61q6dX8jtwOf6DH8= +github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529/go.mod h1:qe5TWALJ8/a1Lqznoc5BDHpYX/8HU60Hm2AwRmqzxqA= github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= github.com/rs/zerolog v1.33.0 
h1:1cU2KZkvPxNyfgEmhHAz/1A9Bz+llsdYzklWFzgp0r8=
 github.com/rs/zerolog v1.33.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss=
@@ -712,8 +716,8 @@ github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic
 github.com/sirupsen/logrus v1.9.0/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
 github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
 github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
-github.com/smallnest/ringbuffer v0.0.0-20240423223918-bab516b2000b h1:e9eeuSYSLmUKxy7ALzKcxo7ggTceQaVcBhjDIcewa9c=
-github.com/smallnest/ringbuffer v0.0.0-20240423223918-bab516b2000b/go.mod h1:tAG61zBM1DYRaGIPloumExGvScf08oHuo0kFoOqdbT0=
+github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d h1:3VwvTjiRPA7cqtgOWddEL+JrcijMlXUmj99c/6YyZoY=
+github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d/go.mod h1:tAG61zBM1DYRaGIPloumExGvScf08oHuo0kFoOqdbT0=
 github.com/smartystreets/assertions v1.2.0/go.mod h1:tcbTF8ujkAEcZ8TElKY+i30BzYlVhC/LOxJk7iOWnoo=
 github.com/smartystreets/assertions v1.13.0 h1:Dx1kYM01xsSqKPno3aqLnrwac2LetPvN23diwyr69Qs=
 github.com/smartystreets/assertions v1.13.0/go.mod h1:wDmR7qL282YbGsPy6H/yAsesrxfxaaSlJazyFLYVFx8=
diff --git a/pkg/xsysinfo/gguf.go b/pkg/xsysinfo/gguf.go
new file mode 100644
index 00000000..d5c8b5bd
--- /dev/null
+++ b/pkg/xsysinfo/gguf.go
@@ -0,0 +1,67 @@
+package xsysinfo
+
+import (
+	"errors"
+
+	gguf "github.com/gpustack/gguf-parser-go"
+)
+
+// VRAMEstimate holds the values computed by EstimateGGUFVRAMUsage.
+// TotalVRAM and AvailableVRAM both carry the availableVRAM argument;
+// EstimatedVRAM is EstimatedLayers times the approximated per-layer size.
+type VRAMEstimate struct {
+	TotalVRAM       uint64
+	AvailableVRAM   uint64
+	ModelSize       uint64
+	EstimatedLayers int
+	EstimatedVRAM   uint64
+	IsFullOffload   bool
+}
+
+// EstimateGGUFVRAMUsage estimates how many layers of the GGUF file f fit into
+// availableVRAM bytes. The size of one layer is approximated as the model
+// size divided by the architecture's block count. An error is returned when
+// the block count or the resulting per-layer size is zero.
+func EstimateGGUFVRAMUsage(f *gguf.GGUFFile, availableVRAM uint64) (*VRAMEstimate, error) {
+	// Get model metadata
+	m := f.Metadata()
+	a := f.Architecture()
+
+	// Calculate base model size
+	modelSize := uint64(m.Size)
+
+	if a.BlockCount == 0 {
+		return nil, errors.New("block count is 0")
+	}
+
+	// Approximate one layer as an even share of the whole model.
+	layerSize := modelSize / uint64(a.BlockCount)
+	if layerSize == 0 {
+		// A model size smaller than the block count (e.g. size metadata
+		// missing) would make the division below panic.
+		return nil, errors.New("layer size is 0")
+	}
+
+	// Estimate number of layers that can fit in VRAM
+	estimatedLayers := int(availableVRAM / layerSize)
+
+	// IsFullOffload is reported when not even one layer fits.
+	// NOTE(review): this reads inverted relative to the field name; confirm
+	// the intended meaning with the caller before relying on it.
+	isFullOffload := estimatedLayers <= 0
+	if isFullOffload {
+		estimatedLayers = 0
+	}
+
+	// Calculate estimated VRAM usage
+	estimatedVRAM := uint64(estimatedLayers) * layerSize
+
+	return &VRAMEstimate{
+		TotalVRAM:       availableVRAM,
+		AvailableVRAM:   availableVRAM,
+		ModelSize:       modelSize,
+		EstimatedLayers: estimatedLayers,
+		EstimatedVRAM:   estimatedVRAM,
+		IsFullOffload:   isFullOffload,
+	}, nil
+}
diff --git a/pkg/xsysinfo/gpu.go b/pkg/xsysinfo/gpu.go
index a692c775..9a70e17b 100644
--- a/pkg/xsysinfo/gpu.go
+++ b/pkg/xsysinfo/gpu.go
@@ -16,6 +16,29 @@ func GPUs() ([]*gpu.GraphicsCard, error) {
 	return gpu.GraphicsCards, nil
 }
 
+// TotalAvailableVRAM sums Node.Memory.TotalUsableBytes over every card
+// returned by GPUs. Cards without node or memory information are skipped.
+func TotalAvailableVRAM() (uint64, error) {
+	cards, err := GPUs()
+	if err != nil {
+		return 0, err
+	}
+
+	var totalVRAM uint64
+	for _, card := range cards {
+		// Node and its Memory may be nil when ghw cannot determine them;
+		// dereferencing them unguarded would panic.
+		if card == nil || card.Node == nil || card.Node.Memory == nil {
+			continue
+		}
+		if card.Node.Memory.TotalUsableBytes > 0 {
+			totalVRAM += uint64(card.Node.Memory.TotalUsableBytes)
+		}
+	}
+
+	return totalVRAM, nil
+}
+
 func HasGPU(vendor string) bool {
 	gpus, err := GPUs()
 	if err != nil {