mirror of
https://github.com/mudler/LocalAI.git
synced 2024-12-18 20:27:57 +00:00
feat: initial watchdog implementation (#1341)
* feat: initial watchdog implementation Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com> * fiuxups * Add more output * wip: idletime checker * wire idle watchdog checks * enlarge watchdog time window * small fixes * Use stopmodel * Always delete process Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com> Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
parent
9482acfdfc
commit
824612f1b4
16
.env
16
.env
@ -72,4 +72,18 @@ MODELS_PATH=/models
|
|||||||
# LLAMACPP_PARALLEL=1
|
# LLAMACPP_PARALLEL=1
|
||||||
|
|
||||||
### Enable to run parallel requests
|
### Enable to run parallel requests
|
||||||
# PARALLEL_REQUESTS=true
|
# PARALLEL_REQUESTS=true
|
||||||
|
|
||||||
|
### Watchdog settings
|
||||||
|
###
|
||||||
|
# Enables watchdog to kill backends that are inactive for too long
|
||||||
|
# WATCHDOG_IDLE=true
|
||||||
|
#
|
||||||
|
# Enables watchdog to kill backends that are busy for too long
|
||||||
|
# WATCHDOG_BUSY=true
|
||||||
|
#
|
||||||
|
# Time in duration format (e.g. 1h30m) after which a backend is considered idle
|
||||||
|
# WATCHDOG_IDLE_TIMEOUT=5m
|
||||||
|
#
|
||||||
|
# Time in duration format (e.g. 1h30m) after which a backend is considered busy
|
||||||
|
# WATCHDOG_BUSY_TIMEOUT=5m
|
17
api/api.go
17
api/api.go
@ -13,6 +13,7 @@ import (
|
|||||||
"github.com/go-skynet/LocalAI/internal"
|
"github.com/go-skynet/LocalAI/internal"
|
||||||
"github.com/go-skynet/LocalAI/metrics"
|
"github.com/go-skynet/LocalAI/metrics"
|
||||||
"github.com/go-skynet/LocalAI/pkg/assets"
|
"github.com/go-skynet/LocalAI/pkg/assets"
|
||||||
|
"github.com/go-skynet/LocalAI/pkg/model"
|
||||||
|
|
||||||
"github.com/gofiber/fiber/v2"
|
"github.com/gofiber/fiber/v2"
|
||||||
"github.com/gofiber/fiber/v2/middleware/cors"
|
"github.com/gofiber/fiber/v2/middleware/cors"
|
||||||
@ -79,6 +80,22 @@ func Startup(opts ...options.AppOption) (*options.Option, *config.ConfigLoader,
|
|||||||
options.Loader.StopAllGRPC()
|
options.Loader.StopAllGRPC()
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
if options.WatchDog {
|
||||||
|
wd := model.NewWatchDog(
|
||||||
|
options.Loader,
|
||||||
|
options.WatchDogBusyTimeout,
|
||||||
|
options.WatchDogIdleTimeout,
|
||||||
|
options.WatchDogBusy,
|
||||||
|
options.WatchDogIdle)
|
||||||
|
options.Loader.SetWatchDog(wd)
|
||||||
|
go wd.Run()
|
||||||
|
go func() {
|
||||||
|
<-options.Context.Done()
|
||||||
|
log.Debug().Msgf("Context canceled, shutting down")
|
||||||
|
wd.Shutdown()
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
return options, cl, nil
|
return options, cl, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -128,7 +128,7 @@ func BackendMonitorEndpoint(bm BackendMonitor) func(c *fiber.Ctx) error {
|
|||||||
return fmt.Errorf("backend %s is not currently loaded", backendId)
|
return fmt.Errorf("backend %s is not currently loaded", backendId)
|
||||||
}
|
}
|
||||||
|
|
||||||
status, rpcErr := model.GRPC(false).Status(context.TODO())
|
status, rpcErr := model.GRPC(false, nil).Status(context.TODO())
|
||||||
if rpcErr != nil {
|
if rpcErr != nil {
|
||||||
log.Warn().Msgf("backend %s experienced an error retrieving status info: %s", backendId, rpcErr.Error())
|
log.Warn().Msgf("backend %s experienced an error retrieving status info: %s", backendId, rpcErr.Error())
|
||||||
val, slbErr := bm.SampleLocalBackendProcess(backendId)
|
val, slbErr := bm.SampleLocalBackendProcess(backendId)
|
||||||
|
@ -4,6 +4,7 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"embed"
|
"embed"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/go-skynet/LocalAI/metrics"
|
"github.com/go-skynet/LocalAI/metrics"
|
||||||
"github.com/go-skynet/LocalAI/pkg/gallery"
|
"github.com/go-skynet/LocalAI/pkg/gallery"
|
||||||
@ -38,6 +39,11 @@ type Option struct {
|
|||||||
|
|
||||||
SingleBackend bool
|
SingleBackend bool
|
||||||
ParallelBackendRequests bool
|
ParallelBackendRequests bool
|
||||||
|
|
||||||
|
WatchDogIdle bool
|
||||||
|
WatchDogBusy bool
|
||||||
|
WatchDog bool
|
||||||
|
WatchDogBusyTimeout, WatchDogIdleTimeout time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
type AppOption func(*Option)
|
type AppOption func(*Option)
|
||||||
@ -63,6 +69,32 @@ func WithCors(b bool) AppOption {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// EnableWatchDog is an option that turns the watchdog on by setting
// Option.WatchDog. It does not enable the idle or the busy check by itself.
var EnableWatchDog = func(o *Option) {
	o.WatchDog = true
}
|
||||||
|
|
||||||
|
var EnableWatchDogIdleCheck = func(o *Option) {
|
||||||
|
o.WatchDog = true
|
||||||
|
o.WatchDogIdle = true
|
||||||
|
}
|
||||||
|
|
||||||
|
var EnableWatchDogBusyCheck = func(o *Option) {
|
||||||
|
o.WatchDog = true
|
||||||
|
o.WatchDogBusy = true
|
||||||
|
}
|
||||||
|
|
||||||
|
func SetWatchDogBusyTimeout(t time.Duration) AppOption {
|
||||||
|
return func(o *Option) {
|
||||||
|
o.WatchDogBusyTimeout = t
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func SetWatchDogIdleTimeout(t time.Duration) AppOption {
|
||||||
|
return func(o *Option) {
|
||||||
|
o.WatchDogIdleTimeout = t
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
var EnableSingleBackend = func(o *Option) {
|
var EnableSingleBackend = func(o *Option) {
|
||||||
o.SingleBackend = true
|
o.SingleBackend = true
|
||||||
}
|
}
|
||||||
|
47
main.go
47
main.go
@ -10,6 +10,7 @@ import (
|
|||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"syscall"
|
"syscall"
|
||||||
|
"time"
|
||||||
|
|
||||||
api "github.com/go-skynet/LocalAI/api"
|
api "github.com/go-skynet/LocalAI/api"
|
||||||
"github.com/go-skynet/LocalAI/api/backend"
|
"github.com/go-skynet/LocalAI/api/backend"
|
||||||
@ -154,6 +155,30 @@ func main() {
|
|||||||
Usage: "List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys.",
|
Usage: "List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys.",
|
||||||
EnvVars: []string{"API_KEY"},
|
EnvVars: []string{"API_KEY"},
|
||||||
},
|
},
|
||||||
|
&cli.BoolFlag{
|
||||||
|
Name: "enable-watchdog-idle",
|
||||||
|
Usage: "Enable watchdog for stopping idle backends. This will stop the backends if are in idle state for too long.",
|
||||||
|
EnvVars: []string{"WATCHDOG_IDLE"},
|
||||||
|
Value: false,
|
||||||
|
},
|
||||||
|
&cli.BoolFlag{
|
||||||
|
Name: "enable-watchdog-busy",
|
||||||
|
Usage: "Enable watchdog for stopping busy backends that exceed a defined threshold.",
|
||||||
|
EnvVars: []string{"WATCHDOG_BUSY"},
|
||||||
|
Value: false,
|
||||||
|
},
|
||||||
|
&cli.StringFlag{
|
||||||
|
Name: "watchdog-busy-timeout",
|
||||||
|
Usage: "Watchdog timeout. This will restart the backend if it crashes.",
|
||||||
|
EnvVars: []string{"WATCHDOG_BUSY_TIMEOUT"},
|
||||||
|
Value: "5m",
|
||||||
|
},
|
||||||
|
&cli.StringFlag{
|
||||||
|
Name: "watchdog-idle-timeout",
|
||||||
|
Usage: "Watchdog idle timeout. This will restart the backend if it crashes.",
|
||||||
|
EnvVars: []string{"WATCHDOG_IDLE_TIMEOUT"},
|
||||||
|
Value: "15m",
|
||||||
|
},
|
||||||
&cli.BoolFlag{
|
&cli.BoolFlag{
|
||||||
Name: "preload-backend-only",
|
Name: "preload-backend-only",
|
||||||
Usage: "If set, the api is NOT launched, and only the preloaded models / backends are started. This is intended for multi-node setups.",
|
Usage: "If set, the api is NOT launched, and only the preloaded models / backends are started. This is intended for multi-node setups.",
|
||||||
@ -198,6 +223,28 @@ For a list of compatible model, check out: https://localai.io/model-compatibilit
|
|||||||
options.WithUploadLimitMB(ctx.Int("upload-limit")),
|
options.WithUploadLimitMB(ctx.Int("upload-limit")),
|
||||||
options.WithApiKeys(ctx.StringSlice("api-keys")),
|
options.WithApiKeys(ctx.StringSlice("api-keys")),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
idleWatchDog := ctx.Bool("enable-watchdog-idle")
|
||||||
|
busyWatchDog := ctx.Bool("enable-watchdog-busy")
|
||||||
|
if idleWatchDog || busyWatchDog {
|
||||||
|
opts = append(opts, options.EnableWatchDog)
|
||||||
|
if idleWatchDog {
|
||||||
|
opts = append(opts, options.EnableWatchDogIdleCheck)
|
||||||
|
dur, err := time.ParseDuration(ctx.String("watchdog-idle-timeout"))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
opts = append(opts, options.SetWatchDogIdleTimeout(dur))
|
||||||
|
}
|
||||||
|
if busyWatchDog {
|
||||||
|
opts = append(opts, options.EnableWatchDogBusyCheck)
|
||||||
|
dur, err := time.ParseDuration(ctx.String("watchdog-busy-timeout"))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
opts = append(opts, options.SetWatchDogBusyTimeout(dur))
|
||||||
|
}
|
||||||
|
}
|
||||||
if ctx.Bool("parallel-requests") {
|
if ctx.Bool("parallel-requests") {
|
||||||
opts = append(opts, options.EnableParallelBackendRequests)
|
opts = append(opts, options.EnableParallelBackendRequests)
|
||||||
}
|
}
|
||||||
|
@ -19,12 +19,22 @@ type Client struct {
|
|||||||
parallel bool
|
parallel bool
|
||||||
sync.Mutex
|
sync.Mutex
|
||||||
opMutex sync.Mutex
|
opMutex sync.Mutex
|
||||||
|
wd WatchDog
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewClient(address string, parallel bool) *Client {
|
// WatchDog is the hook a Client notifies around backend calls.
// When a Client has a non-nil WatchDog, its call methods invoke Mark with the
// client address before dialing the backend and defer UnMark until the call
// returns.
type WatchDog interface {
	// Mark records that a call to the backend at address has started.
	Mark(address string)
	// UnMark records that the call to the backend at address has finished.
	UnMark(address string)
}
|
||||||
|
|
||||||
|
func NewClient(address string, parallel bool, wd WatchDog, enableWatchDog bool) *Client {
|
||||||
|
if !enableWatchDog {
|
||||||
|
wd = nil
|
||||||
|
}
|
||||||
return &Client{
|
return &Client{
|
||||||
address: address,
|
address: address,
|
||||||
parallel: parallel,
|
parallel: parallel,
|
||||||
|
wd: wd,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -79,6 +89,10 @@ func (c *Client) Embeddings(ctx context.Context, in *pb.PredictOptions, opts ...
|
|||||||
}
|
}
|
||||||
c.setBusy(true)
|
c.setBusy(true)
|
||||||
defer c.setBusy(false)
|
defer c.setBusy(false)
|
||||||
|
if c.wd != nil {
|
||||||
|
c.wd.Mark(c.address)
|
||||||
|
defer c.wd.UnMark(c.address)
|
||||||
|
}
|
||||||
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
|
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
@ -96,6 +110,10 @@ func (c *Client) Predict(ctx context.Context, in *pb.PredictOptions, opts ...grp
|
|||||||
}
|
}
|
||||||
c.setBusy(true)
|
c.setBusy(true)
|
||||||
defer c.setBusy(false)
|
defer c.setBusy(false)
|
||||||
|
if c.wd != nil {
|
||||||
|
c.wd.Mark(c.address)
|
||||||
|
defer c.wd.UnMark(c.address)
|
||||||
|
}
|
||||||
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
|
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
@ -113,6 +131,10 @@ func (c *Client) LoadModel(ctx context.Context, in *pb.ModelOptions, opts ...grp
|
|||||||
}
|
}
|
||||||
c.setBusy(true)
|
c.setBusy(true)
|
||||||
defer c.setBusy(false)
|
defer c.setBusy(false)
|
||||||
|
if c.wd != nil {
|
||||||
|
c.wd.Mark(c.address)
|
||||||
|
defer c.wd.UnMark(c.address)
|
||||||
|
}
|
||||||
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
|
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
@ -129,6 +151,10 @@ func (c *Client) PredictStream(ctx context.Context, in *pb.PredictOptions, f fun
|
|||||||
}
|
}
|
||||||
c.setBusy(true)
|
c.setBusy(true)
|
||||||
defer c.setBusy(false)
|
defer c.setBusy(false)
|
||||||
|
if c.wd != nil {
|
||||||
|
c.wd.Mark(c.address)
|
||||||
|
defer c.wd.UnMark(c.address)
|
||||||
|
}
|
||||||
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
|
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@ -164,6 +190,10 @@ func (c *Client) GenerateImage(ctx context.Context, in *pb.GenerateImageRequest,
|
|||||||
}
|
}
|
||||||
c.setBusy(true)
|
c.setBusy(true)
|
||||||
defer c.setBusy(false)
|
defer c.setBusy(false)
|
||||||
|
if c.wd != nil {
|
||||||
|
c.wd.Mark(c.address)
|
||||||
|
defer c.wd.UnMark(c.address)
|
||||||
|
}
|
||||||
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
|
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
@ -180,6 +210,10 @@ func (c *Client) TTS(ctx context.Context, in *pb.TTSRequest, opts ...grpc.CallOp
|
|||||||
}
|
}
|
||||||
c.setBusy(true)
|
c.setBusy(true)
|
||||||
defer c.setBusy(false)
|
defer c.setBusy(false)
|
||||||
|
if c.wd != nil {
|
||||||
|
c.wd.Mark(c.address)
|
||||||
|
defer c.wd.UnMark(c.address)
|
||||||
|
}
|
||||||
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
|
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
@ -196,6 +230,10 @@ func (c *Client) AudioTranscription(ctx context.Context, in *pb.TranscriptReques
|
|||||||
}
|
}
|
||||||
c.setBusy(true)
|
c.setBusy(true)
|
||||||
defer c.setBusy(false)
|
defer c.setBusy(false)
|
||||||
|
if c.wd != nil {
|
||||||
|
c.wd.Mark(c.address)
|
||||||
|
defer c.wd.UnMark(c.address)
|
||||||
|
}
|
||||||
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
|
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
@ -232,6 +270,10 @@ func (c *Client) TokenizeString(ctx context.Context, in *pb.PredictOptions, opts
|
|||||||
}
|
}
|
||||||
c.setBusy(true)
|
c.setBusy(true)
|
||||||
defer c.setBusy(false)
|
defer c.setBusy(false)
|
||||||
|
if c.wd != nil {
|
||||||
|
c.wd.Mark(c.address)
|
||||||
|
defer c.wd.UnMark(c.address)
|
||||||
|
}
|
||||||
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
|
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
@ -121,7 +121,7 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
|
|||||||
// Wait for the service to start up
|
// Wait for the service to start up
|
||||||
ready := false
|
ready := false
|
||||||
for i := 0; i < o.grpcAttempts; i++ {
|
for i := 0; i < o.grpcAttempts; i++ {
|
||||||
if client.GRPC(o.parallelRequests).HealthCheck(context.Background()) {
|
if client.GRPC(o.parallelRequests, ml.wd).HealthCheck(context.Background()) {
|
||||||
log.Debug().Msgf("GRPC Service Ready")
|
log.Debug().Msgf("GRPC Service Ready")
|
||||||
ready = true
|
ready = true
|
||||||
break
|
break
|
||||||
@ -140,7 +140,7 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
|
|||||||
|
|
||||||
log.Debug().Msgf("GRPC: Loading model with options: %+v", options)
|
log.Debug().Msgf("GRPC: Loading model with options: %+v", options)
|
||||||
|
|
||||||
res, err := client.GRPC(o.parallelRequests).LoadModel(o.context, &options)
|
res, err := client.GRPC(o.parallelRequests, ml.wd).LoadModel(o.context, &options)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", fmt.Errorf("could not load model: %w", err)
|
return "", fmt.Errorf("could not load model: %w", err)
|
||||||
}
|
}
|
||||||
@ -154,11 +154,11 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
|
|||||||
|
|
||||||
func (ml *ModelLoader) resolveAddress(addr ModelAddress, parallel bool) (*grpc.Client, error) {
|
func (ml *ModelLoader) resolveAddress(addr ModelAddress, parallel bool) (*grpc.Client, error) {
|
||||||
if parallel {
|
if parallel {
|
||||||
return addr.GRPC(parallel), nil
|
return addr.GRPC(parallel, ml.wd), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
if _, ok := ml.grpcClients[string(addr)]; !ok {
|
if _, ok := ml.grpcClients[string(addr)]; !ok {
|
||||||
ml.grpcClients[string(addr)] = addr.GRPC(parallel)
|
ml.grpcClients[string(addr)] = addr.GRPC(parallel, ml.wd)
|
||||||
}
|
}
|
||||||
return ml.grpcClients[string(addr)], nil
|
return ml.grpcClients[string(addr)], nil
|
||||||
}
|
}
|
||||||
|
@ -63,12 +63,17 @@ type ModelLoader struct {
|
|||||||
models map[string]ModelAddress
|
models map[string]ModelAddress
|
||||||
grpcProcesses map[string]*process.Process
|
grpcProcesses map[string]*process.Process
|
||||||
templates map[TemplateType]map[string]*template.Template
|
templates map[TemplateType]map[string]*template.Template
|
||||||
|
wd *WatchDog
|
||||||
}
|
}
|
||||||
|
|
||||||
type ModelAddress string
|
type ModelAddress string
|
||||||
|
|
||||||
func (m ModelAddress) GRPC(parallel bool) *grpc.Client {
|
func (m ModelAddress) GRPC(parallel bool, wd *WatchDog) *grpc.Client {
|
||||||
return grpc.NewClient(string(m), parallel)
|
enableWD := false
|
||||||
|
if wd != nil {
|
||||||
|
enableWD = true
|
||||||
|
}
|
||||||
|
return grpc.NewClient(string(m), parallel, wd, enableWD)
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewModelLoader(modelPath string) *ModelLoader {
|
func NewModelLoader(modelPath string) *ModelLoader {
|
||||||
@ -79,10 +84,15 @@ func NewModelLoader(modelPath string) *ModelLoader {
|
|||||||
templates: make(map[TemplateType]map[string]*template.Template),
|
templates: make(map[TemplateType]map[string]*template.Template),
|
||||||
grpcProcesses: make(map[string]*process.Process),
|
grpcProcesses: make(map[string]*process.Process),
|
||||||
}
|
}
|
||||||
|
|
||||||
nml.initializeTemplateMap()
|
nml.initializeTemplateMap()
|
||||||
return nml
|
return nml
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SetWatchDog stores wd on the loader. The loader passes this value to the
// GRPC clients it creates and registers started backend processes with it.
// NOTE(review): the field is assigned without taking ml.mu — confirm callers
// only set it before the loader is used concurrently.
func (ml *ModelLoader) SetWatchDog(wd *WatchDog) {
	ml.wd = wd
}
|
||||||
|
|
||||||
func (ml *ModelLoader) ExistsInModelPath(s string) bool {
|
func (ml *ModelLoader) ExistsInModelPath(s string) bool {
|
||||||
return existsInPath(ml.ModelPath, s)
|
return existsInPath(ml.ModelPath, s)
|
||||||
}
|
}
|
||||||
@ -139,11 +149,17 @@ func (ml *ModelLoader) LoadModel(modelName string, loader func(string, string) (
|
|||||||
func (ml *ModelLoader) ShutdownModel(modelName string) error {
|
func (ml *ModelLoader) ShutdownModel(modelName string) error {
|
||||||
ml.mu.Lock()
|
ml.mu.Lock()
|
||||||
defer ml.mu.Unlock()
|
defer ml.mu.Unlock()
|
||||||
|
|
||||||
|
return ml.StopModel(modelName)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ml *ModelLoader) StopModel(modelName string) error {
|
||||||
|
defer ml.deleteProcess(modelName)
|
||||||
if _, ok := ml.models[modelName]; !ok {
|
if _, ok := ml.models[modelName]; !ok {
|
||||||
return fmt.Errorf("model %s not found", modelName)
|
return fmt.Errorf("model %s not found", modelName)
|
||||||
}
|
}
|
||||||
|
return nil
|
||||||
return ml.deleteProcess(modelName)
|
//return ml.deleteProcess(modelName)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (ml *ModelLoader) CheckIsLoaded(s string) ModelAddress {
|
func (ml *ModelLoader) CheckIsLoaded(s string) ModelAddress {
|
||||||
@ -153,7 +169,7 @@ func (ml *ModelLoader) CheckIsLoaded(s string) ModelAddress {
|
|||||||
if c, ok := ml.grpcClients[s]; ok {
|
if c, ok := ml.grpcClients[s]; ok {
|
||||||
client = c
|
client = c
|
||||||
} else {
|
} else {
|
||||||
client = m.GRPC(false)
|
client = m.GRPC(false, ml.wd)
|
||||||
}
|
}
|
||||||
|
|
||||||
if !client.HealthCheck(context.Background()) {
|
if !client.HealthCheck(context.Background()) {
|
||||||
|
@ -17,7 +17,7 @@ import (
|
|||||||
func (ml *ModelLoader) StopAllExcept(s string) {
|
func (ml *ModelLoader) StopAllExcept(s string) {
|
||||||
ml.StopGRPC(func(id string, p *process.Process) bool {
|
ml.StopGRPC(func(id string, p *process.Process) bool {
|
||||||
if id != s {
|
if id != s {
|
||||||
for ml.models[id].GRPC(false).IsBusy() {
|
for ml.models[id].GRPC(false, ml.wd).IsBusy() {
|
||||||
log.Debug().Msgf("%s busy. Waiting.", id)
|
log.Debug().Msgf("%s busy. Waiting.", id)
|
||||||
time.Sleep(2 * time.Second)
|
time.Sleep(2 * time.Second)
|
||||||
}
|
}
|
||||||
@ -80,6 +80,11 @@ func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string
|
|||||||
process.WithEnvironment(os.Environ()...),
|
process.WithEnvironment(os.Environ()...),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if ml.wd != nil {
|
||||||
|
ml.wd.Add(serverAddress, grpcControlProcess)
|
||||||
|
ml.wd.AddAddressModelMap(serverAddress, id)
|
||||||
|
}
|
||||||
|
|
||||||
ml.grpcProcesses[id] = grpcControlProcess
|
ml.grpcProcesses[id] = grpcControlProcess
|
||||||
|
|
||||||
if err := grpcControlProcess.Run(); err != nil {
|
if err := grpcControlProcess.Run(); err != nil {
|
||||||
|
155
pkg/model/watchdog.go
Normal file
155
pkg/model/watchdog.go
Normal file
@ -0,0 +1,155 @@
|
|||||||
|
package model
|
||||||
|
|
||||||
|
import (
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
process "github.com/mudler/go-processmanager"
|
||||||
|
"github.com/rs/zerolog/log"
|
||||||
|
)
|
||||||
|
|
||||||
|
// All GRPC Clients created by ModelLoader should have an associated injected
|
||||||
|
// watchdog that will keep track of the state of each backend (busy or not)
|
||||||
|
// and for how much time it has been busy.
|
||||||
|
// If a backend is busy for too long, the watchdog will kill the process and
|
||||||
|
// force a reload of the model
|
||||||
|
// The watchdog runs as a separate go routine,
|
||||||
|
// and the GRPC client talks to it via a channel to send status updates
|
||||||
|
|
||||||
|
type WatchDog struct {
|
||||||
|
sync.Mutex
|
||||||
|
timetable map[string]time.Time
|
||||||
|
idleTime map[string]time.Time
|
||||||
|
timeout, idletimeout time.Duration
|
||||||
|
addressMap map[string]*process.Process
|
||||||
|
addressModelMap map[string]string
|
||||||
|
pm ProcessManager
|
||||||
|
stop chan bool
|
||||||
|
|
||||||
|
busyCheck, idleCheck bool
|
||||||
|
}
|
||||||
|
|
||||||
|
// ProcessManager is what the watchdog relies on to stop the model behind a
// backend that exceeded its busy or idle timeout.
type ProcessManager interface {
	// StopModel stops the model identified by modelName.
	StopModel(modelName string) error
}
|
||||||
|
|
||||||
|
func NewWatchDog(pm ProcessManager, timeoutBusy, timeoutIdle time.Duration, busy, idle bool) *WatchDog {
|
||||||
|
return &WatchDog{
|
||||||
|
timeout: timeoutBusy,
|
||||||
|
idletimeout: timeoutIdle,
|
||||||
|
pm: pm,
|
||||||
|
timetable: make(map[string]time.Time),
|
||||||
|
idleTime: make(map[string]time.Time),
|
||||||
|
addressMap: make(map[string]*process.Process),
|
||||||
|
busyCheck: busy,
|
||||||
|
idleCheck: idle,
|
||||||
|
addressModelMap: make(map[string]string),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (wd *WatchDog) Shutdown() {
|
||||||
|
wd.Lock()
|
||||||
|
defer wd.Unlock()
|
||||||
|
wd.stop <- true
|
||||||
|
}
|
||||||
|
|
||||||
|
// AddAddressModelMap records, under the lock, that the backend listening at
// address serves the given model. The checks use this mapping to find the
// model name to stop.
func (wd *WatchDog) AddAddressModelMap(address string, model string) {
	wd.Lock()
	defer wd.Unlock()
	wd.addressModelMap[address] = model

}
|
||||||
|
// Add records, under the lock, the process p that controls the backend
// listening at address.
func (wd *WatchDog) Add(address string, p *process.Process) {
	wd.Lock()
	defer wd.Unlock()
	wd.addressMap[address] = p
}
|
||||||
|
|
||||||
|
func (wd *WatchDog) Mark(address string) {
|
||||||
|
wd.Lock()
|
||||||
|
defer wd.Unlock()
|
||||||
|
wd.timetable[address] = time.Now()
|
||||||
|
delete(wd.idleTime, address)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (wd *WatchDog) UnMark(ModelAddress string) {
|
||||||
|
wd.Lock()
|
||||||
|
defer wd.Unlock()
|
||||||
|
delete(wd.timetable, ModelAddress)
|
||||||
|
wd.idleTime[ModelAddress] = time.Now()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (wd *WatchDog) Run() {
|
||||||
|
log.Info().Msg("[WatchDog] starting watchdog")
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-wd.stop:
|
||||||
|
log.Info().Msg("[WatchDog] Stopping watchdog")
|
||||||
|
return
|
||||||
|
case <-time.After(30 * time.Second):
|
||||||
|
if !wd.busyCheck && !wd.idleCheck {
|
||||||
|
log.Info().Msg("[WatchDog] No checks enabled, stopping watchdog")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if wd.busyCheck {
|
||||||
|
wd.checkBusy()
|
||||||
|
}
|
||||||
|
if wd.idleCheck {
|
||||||
|
wd.checkIdle()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (wd *WatchDog) checkIdle() {
|
||||||
|
wd.Lock()
|
||||||
|
defer wd.Unlock()
|
||||||
|
log.Debug().Msg("[WatchDog] Watchdog checks for idle connections")
|
||||||
|
for address, t := range wd.idleTime {
|
||||||
|
log.Debug().Msgf("[WatchDog] %s: idle connection", address)
|
||||||
|
if time.Since(t) > wd.idletimeout {
|
||||||
|
log.Warn().Msgf("[WatchDog] Address %s is idle for too long, killing it", address)
|
||||||
|
p, ok := wd.addressModelMap[address]
|
||||||
|
if ok {
|
||||||
|
if err := wd.pm.StopModel(p); err != nil {
|
||||||
|
log.Error().Msgf("[watchdog] Error shutting down model %s: %v", p, err)
|
||||||
|
}
|
||||||
|
delete(wd.idleTime, address)
|
||||||
|
delete(wd.addressModelMap, address)
|
||||||
|
delete(wd.addressMap, address)
|
||||||
|
} else {
|
||||||
|
log.Warn().Msgf("[WatchDog] Address %s unresolvable", address)
|
||||||
|
delete(wd.idleTime, address)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (wd *WatchDog) checkBusy() {
|
||||||
|
wd.Lock()
|
||||||
|
defer wd.Unlock()
|
||||||
|
log.Debug().Msg("[WatchDog] Watchdog checks for busy connections")
|
||||||
|
|
||||||
|
for address, t := range wd.timetable {
|
||||||
|
log.Debug().Msgf("[WatchDog] %s: active connection", address)
|
||||||
|
|
||||||
|
if time.Since(t) > wd.timeout {
|
||||||
|
|
||||||
|
model, ok := wd.addressModelMap[address]
|
||||||
|
if ok {
|
||||||
|
log.Warn().Msgf("[WatchDog] Model %s is busy for too long, killing it", model)
|
||||||
|
if err := wd.pm.StopModel(model); err != nil {
|
||||||
|
log.Error().Msgf("[watchdog] Error shutting down model %s: %v", model, err)
|
||||||
|
}
|
||||||
|
delete(wd.timetable, address)
|
||||||
|
delete(wd.addressModelMap, address)
|
||||||
|
delete(wd.addressMap, address)
|
||||||
|
} else {
|
||||||
|
log.Warn().Msgf("[WatchDog] Address %s unresolvable", address)
|
||||||
|
delete(wd.timetable, address)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user