mirror of https://github.com/mudler/LocalAI.git
fix(config): set better defaults for inferencing (#1822)
* fix(defaults): set better defaults for inferencing

  This changeset aims to provide better defaults and to properly detect when no inference settings are supplied with the model. If none are specified, we now default to mirostat sampling and offload all GPU layers (if a GPU is detected).

  Related to https://github.com/mudler/LocalAI/issues/1373 and https://github.com/mudler/LocalAI/issues/1723

* Adapt tests

* Also pre-initialize default seed
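For a sense of the effect: a model that ships with no inference settings at all now ends up with a complete, sane configuration. The sketch below is not part of the commit; the import path is an assumption to be matched to the repository's module layout, and it simply exercises the SetDefaults method introduced further down.

package main

import (
	"fmt"

	// Assumed import path; adjust to the actual module layout.
	"github.com/mudler/LocalAI/core/config"
)

func main() {
	// A config with no inference settings, as if the model shipped bare.
	cfg := &config.BackendConfig{}

	// Zero values at the top level mean "nothing passed from the CLI".
	cfg.SetDefaults(false, 0, 0, false)

	fmt.Println(*cfg.Mirostat, *cfg.MirostatTAU, *cfg.MirostatETA) // 2 5 0.1: mirostat sampling
	fmt.Println(*cfg.NGPULayers)                                   // 99999999: offload all layers if a GPU is found
	fmt.Println(*cfg.Threads, *cfg.ContextSize)                    // 4 1024
	fmt.Println(cfg.Seed != nil)                                   // true: seed is pre-initialized to a random value
}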
parent bc8f648a91
commit f895d06605
@@ -23,7 +23,7 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendCo
 	opts := modelOpts(backendConfig, appConfig, []model.Option{
 		model.WithLoadGRPCLoadModelOpts(grpcOpts),
-		model.WithThreads(uint32(backendConfig.Threads)),
+		model.WithThreads(uint32(*backendConfig.Threads)),
 		model.WithAssetDir(appConfig.AssetsDestination),
 		model.WithModel(modelFile),
 		model.WithContext(appConfig.Context),
@@ -9,14 +9,14 @@ import (
 func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {
 	threads := backendConfig.Threads
-	if threads == 0 && appConfig.Threads != 0 {
-		threads = appConfig.Threads
+	if *threads == 0 && appConfig.Threads != 0 {
+		threads = &appConfig.Threads
 	}
 	gRPCOpts := gRPCModelOpts(backendConfig)
 	opts := modelOpts(backendConfig, appConfig, []model.Option{
 		model.WithBackendString(backendConfig.Backend),
 		model.WithAssetDir(appConfig.AssetsDestination),
-		model.WithThreads(uint32(threads)),
+		model.WithThreads(uint32(*threads)),
 		model.WithContext(appConfig.Context),
 		model.WithModel(backendConfig.Model),
 		model.WithLoadGRPCLoadModelOpts(gRPCOpts),
@@ -29,8 +29,8 @@ type TokenUsage struct {
 func ModelInference(ctx context.Context, s string, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
 	modelFile := c.Model
 	threads := c.Threads
-	if threads == 0 && o.Threads != 0 {
-		threads = o.Threads
+	if *threads == 0 && o.Threads != 0 {
+		threads = &o.Threads
 	}
 	grpcOpts := gRPCModelOpts(c)
@@ -39,7 +39,7 @@ func ModelInference(ctx context.Context, s string, images []string, loader *mode
 	opts := modelOpts(c, o, []model.Option{
 		model.WithLoadGRPCLoadModelOpts(grpcOpts),
-		model.WithThreads(uint32(threads)), // some models uses this to allocate threads during startup
+		model.WithThreads(uint32(*threads)), // some models uses this to allocate threads during startup
 		model.WithAssetDir(o.AssetsDestination),
 		model.WithModel(modelFile),
 		model.WithContext(o.Context),
@@ -46,15 +46,15 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		CFGScale:      c.Diffusers.CFGScale,
 		LoraAdapter:   c.LoraAdapter,
 		LoraScale:     c.LoraScale,
-		F16Memory:     c.F16,
+		F16Memory:     *c.F16,
 		LoraBase:      c.LoraBase,
 		IMG2IMG:       c.Diffusers.IMG2IMG,
 		CLIPModel:     c.Diffusers.ClipModel,
 		CLIPSubfolder: c.Diffusers.ClipSubFolder,
 		CLIPSkip:      int32(c.Diffusers.ClipSkip),
 		ControlNet:    c.Diffusers.ControlNet,
-		ContextSize:   int32(c.ContextSize),
-		Seed:          int32(c.Seed),
+		ContextSize:   int32(*c.ContextSize),
+		Seed:          int32(*c.Seed),
 		NBatch:        int32(b),
 		NoMulMatQ:     c.NoMulMatQ,
 		DraftModel:    c.DraftModel,
@@ -72,18 +72,18 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		YarnBetaSlow:  c.YarnBetaSlow,
 		NGQA:          c.NGQA,
 		RMSNormEps:    c.RMSNormEps,
-		MLock:         c.MMlock,
+		MLock:         *c.MMlock,
 		RopeFreqBase:  c.RopeFreqBase,
 		RopeScaling:   c.RopeScaling,
 		Type:          c.ModelType,
 		RopeFreqScale: c.RopeFreqScale,
 		NUMA:          c.NUMA,
 		Embeddings:    c.Embeddings,
-		LowVRAM:       c.LowVRAM,
-		NGPULayers:    int32(c.NGPULayers),
-		MMap:          c.MMap,
+		LowVRAM:       *c.LowVRAM,
+		NGPULayers:    int32(*c.NGPULayers),
+		MMap:          *c.MMap,
 		MainGPU:       c.MainGPU,
-		Threads:       int32(c.Threads),
+		Threads:       int32(*c.Threads),
 		TensorSplit:   c.TensorSplit,
 		// AutoGPTQ
 		ModelBaseName: c.AutoGPTQ.ModelBaseName,
@@ -102,36 +102,37 @@ func gRPCPredictOpts(c config.BackendConfig, modelPath string) *pb.PredictOption
 		os.MkdirAll(filepath.Dir(p), 0755)
 		promptCachePath = p
 	}

 	return &pb.PredictOptions{
-		Temperature:         float32(c.Temperature),
-		TopP:                float32(c.TopP),
+		Temperature:         float32(*c.Temperature),
+		TopP:                float32(*c.TopP),
 		NDraft:              c.NDraft,
-		TopK:                int32(c.TopK),
-		Tokens:              int32(c.Maxtokens),
-		Threads:             int32(c.Threads),
+		TopK:                int32(*c.TopK),
+		Tokens:              int32(*c.Maxtokens),
+		Threads:             int32(*c.Threads),
 		PromptCacheAll:      c.PromptCacheAll,
 		PromptCacheRO:       c.PromptCacheRO,
 		PromptCachePath:     promptCachePath,
-		F16KV:               c.F16,
-		DebugMode:           c.Debug,
+		F16KV:               *c.F16,
+		DebugMode:           *c.Debug,
 		Grammar:             c.Grammar,
 		NegativePromptScale: c.NegativePromptScale,
 		RopeFreqBase:        c.RopeFreqBase,
 		RopeFreqScale:       c.RopeFreqScale,
 		NegativePrompt:      c.NegativePrompt,
-		Mirostat:            int32(c.LLMConfig.Mirostat),
-		MirostatETA:         float32(c.LLMConfig.MirostatETA),
-		MirostatTAU:         float32(c.LLMConfig.MirostatTAU),
-		Debug:               c.Debug,
+		Mirostat:            int32(*c.LLMConfig.Mirostat),
+		MirostatETA:         float32(*c.LLMConfig.MirostatETA),
+		MirostatTAU:         float32(*c.LLMConfig.MirostatTAU),
+		Debug:               *c.Debug,
 		StopPrompts:         c.StopWords,
 		Repeat:              int32(c.RepeatPenalty),
 		NKeep:               int32(c.Keep),
 		Batch:               int32(c.Batch),
 		IgnoreEOS:           c.IgnoreEOS,
-		Seed:                int32(c.Seed),
+		Seed:                int32(*c.Seed),
 		FrequencyPenalty:    float32(c.FrequencyPenalty),
-		MLock:               c.MMlock,
-		MMap:                c.MMap,
+		MLock:               *c.MMlock,
+		MMap:                *c.MMap,
 		MainGPU:             c.MainGPU,
 		TensorSplit:         c.TensorSplit,
 		TailFreeSamplingZ:   float32(c.TFZ),
@@ -17,7 +17,7 @@ func ModelTranscription(audio, language string, ml *model.ModelLoader, backendCo
 		model.WithBackendString(model.WhisperBackend),
 		model.WithModel(backendConfig.Model),
 		model.WithContext(appConfig.Context),
-		model.WithThreads(uint32(backendConfig.Threads)),
+		model.WithThreads(uint32(*backendConfig.Threads)),
 		model.WithAssetDir(appConfig.AssetsDestination),
 	})
@@ -33,6 +33,6 @@ func ModelTranscription(audio, language string, ml *model.ModelLoader, backendCo
 	return whisperModel.AudioTranscription(context.Background(), &proto.TranscriptRequest{
 		Dst:      audio,
 		Language: language,
-		Threads:  uint32(backendConfig.Threads),
+		Threads:  uint32(*backendConfig.Threads),
 	})
 }
@@ -4,6 +4,7 @@ import (
 	"errors"
 	"fmt"
 	"io/fs"
+	"math/rand"
 	"os"
 	"path/filepath"
 	"strings"
@@ -20,9 +21,9 @@ type BackendConfig struct {
 	schema.PredictionOptions `yaml:"parameters"`
 	Name                     string `yaml:"name"`

-	F16        bool              `yaml:"f16"`
-	Threads    int               `yaml:"threads"`
-	Debug      bool              `yaml:"debug"`
+	F16        *bool             `yaml:"f16"`
+	Threads    *int              `yaml:"threads"`
+	Debug      *bool             `yaml:"debug"`
 	Roles      map[string]string `yaml:"roles"`
 	Embeddings bool              `yaml:"embeddings"`
 	Backend    string            `yaml:"backend"`
@@ -105,20 +106,20 @@ type LLMConfig struct {
 	PromptCachePath string `yaml:"prompt_cache_path"`
 	PromptCacheAll  bool   `yaml:"prompt_cache_all"`
 	PromptCacheRO   bool   `yaml:"prompt_cache_ro"`
-	MirostatETA     float64  `yaml:"mirostat_eta"`
-	MirostatTAU     float64  `yaml:"mirostat_tau"`
-	Mirostat        int      `yaml:"mirostat"`
-	NGPULayers      int      `yaml:"gpu_layers"`
-	MMap            bool     `yaml:"mmap"`
-	MMlock          bool     `yaml:"mmlock"`
-	LowVRAM         bool     `yaml:"low_vram"`
+	MirostatETA     *float64 `yaml:"mirostat_eta"`
+	MirostatTAU     *float64 `yaml:"mirostat_tau"`
+	Mirostat        *int     `yaml:"mirostat"`
+	NGPULayers      *int     `yaml:"gpu_layers"`
+	MMap            *bool    `yaml:"mmap"`
+	MMlock          *bool    `yaml:"mmlock"`
+	LowVRAM         *bool    `yaml:"low_vram"`
 	Grammar         string   `yaml:"grammar"`
 	StopWords       []string `yaml:"stopwords"`
 	Cutstrings      []string `yaml:"cutstrings"`
 	TrimSpace       []string `yaml:"trimspace"`
 	TrimSuffix      []string `yaml:"trimsuffix"`

-	ContextSize int    `yaml:"context_size"`
+	ContextSize *int   `yaml:"context_size"`
 	NUMA        bool   `yaml:"numa"`
 	LoraAdapter string `yaml:"lora_adapter"`
 	LoraBase    string `yaml:"lora_base"`
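The move from value types to pointers in these structs is the mechanism behind "properly detect when no inference settings are provided": with a plain int, a YAML file that omits threads and one that sets threads: 0 are indistinguishable after unmarshalling, while a pointer stays nil only when the field is absent. A minimal standalone sketch of that distinction, using a stripped-down stand-in struct rather than the real BackendConfig:

package main

import (
	"fmt"

	"gopkg.in/yaml.v2"
)

// Stand-in for a single pointer-typed config field; illustration only.
type settings struct {
	Threads *int `yaml:"threads"`
}

func main() {
	var omitted, explicit settings
	_ = yaml.Unmarshal([]byte(``), &omitted)
	_ = yaml.Unmarshal([]byte(`threads: 0`), &explicit)

	fmt.Println(omitted.Threads == nil)  // true: field absent, safe to apply a default
	fmt.Println(explicit.Threads == nil) // false: the user really asked for 0
}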
@@ -185,19 +186,96 @@ func (c *BackendConfig) FunctionToCall() string {
 	return c.functionCallNameString
 }

-func defaultPredictOptions(modelFile string) schema.PredictionOptions {
-	return schema.PredictionOptions{
-		TopP:        0.7,
-		TopK:        80,
-		Maxtokens:   512,
-		Temperature: 0.9,
-		Model:       modelFile,
-	}
-}
-
-func DefaultConfig(modelFile string) *BackendConfig {
-	return &BackendConfig{
-		PredictionOptions: defaultPredictOptions(modelFile),
-	}
-}
+func (cfg *BackendConfig) SetDefaults(debug bool, threads, ctx int, f16 bool) {
+	defaultTopP := 0.7
+	defaultTopK := 80
+	defaultTemp := 0.9
+	defaultMaxTokens := 2048
+	defaultMirostat := 2
+	defaultMirostatTAU := 5.0
+	defaultMirostatETA := 0.1
+
+	// Try to offload all GPU layers (if GPU is found)
+	defaultNGPULayers := 99999999
+
+	trueV := true
+	falseV := false
+
+	if cfg.Seed == nil {
+		// random number generator seed
+		defaultSeed := int(rand.Int31())
+		cfg.Seed = &defaultSeed
+	}
+
+	if cfg.TopK == nil {
+		cfg.TopK = &defaultTopK
+	}
+
+	if cfg.MMap == nil {
+		// MMap is enabled by default
+		cfg.MMap = &trueV
+	}
+
+	if cfg.MMlock == nil {
+		// MMlock is disabled by default
+		cfg.MMlock = &falseV
+	}
+
+	if cfg.TopP == nil {
+		cfg.TopP = &defaultTopP
+	}
+	if cfg.Temperature == nil {
+		cfg.Temperature = &defaultTemp
+	}
+
+	if cfg.Maxtokens == nil {
+		cfg.Maxtokens = &defaultMaxTokens
+	}
+
+	if cfg.Mirostat == nil {
+		cfg.Mirostat = &defaultMirostat
+	}
+
+	if cfg.MirostatETA == nil {
+		cfg.MirostatETA = &defaultMirostatETA
+	}
+
+	if cfg.MirostatTAU == nil {
+		cfg.MirostatTAU = &defaultMirostatTAU
+	}
+	if cfg.NGPULayers == nil {
+		cfg.NGPULayers = &defaultNGPULayers
+	}
+
+	if cfg.LowVRAM == nil {
+		cfg.LowVRAM = &falseV
+	}
+
+	// Value passed by the top level are treated as default (no implicit defaults)
+	// defaults are set by the user
+	if ctx == 0 {
+		ctx = 1024
+	}
+
+	if cfg.ContextSize == nil {
+		cfg.ContextSize = &ctx
+	}
+
+	if threads == 0 {
+		// Threads can't be 0
+		threads = 4
+	}
+
+	if cfg.Threads == nil {
+		cfg.Threads = &threads
+	}
+
+	if cfg.F16 == nil {
+		cfg.F16 = &f16
+	}
+
+	if debug {
+		cfg.Debug = &debug
+	}
+}
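Two properties of SetDefaults worth calling out: it only fills fields that are still nil, so values coming from the model YAML (or an earlier call) always win, and the debug flag can only be forced on by the caller, never off. A short sketch of that precedence, with an illustrative TopK value and the same assumed import path as above:

package main

import (
	"fmt"

	"github.com/mudler/LocalAI/core/config" // assumed path, as above
)

func main() {
	topK := 40
	cfg := &config.BackendConfig{}
	cfg.TopK = &topK // as if `top_k: 40` were set in the model YAML
	cfg.SetDefaults(false, 0, 0, false)
	fmt.Println(*cfg.TopK) // still 40, not the default 80
}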
@@ -208,23 +286,63 @@ type BackendConfigLoader struct {
 	sync.Mutex
 }

+type LoadOptions struct {
+	debug            bool
+	threads, ctxSize int
+	f16              bool
+}
+
+func LoadOptionDebug(debug bool) ConfigLoaderOption {
+	return func(o *LoadOptions) {
+		o.debug = debug
+	}
+}
+
+func LoadOptionThreads(threads int) ConfigLoaderOption {
+	return func(o *LoadOptions) {
+		o.threads = threads
+	}
+}
+
+func LoadOptionContextSize(ctxSize int) ConfigLoaderOption {
+	return func(o *LoadOptions) {
+		o.ctxSize = ctxSize
+	}
+}
+
+func LoadOptionF16(f16 bool) ConfigLoaderOption {
+	return func(o *LoadOptions) {
+		o.f16 = f16
+	}
+}
+
+type ConfigLoaderOption func(*LoadOptions)
+
+func (lo *LoadOptions) Apply(options ...ConfigLoaderOption) {
+	for _, l := range options {
+		l(lo)
+	}
+}
+
 // Load a config file for a model
-func LoadBackendConfigFileByName(modelName, modelPath string, cl *BackendConfigLoader, debug bool, threads, ctx int, f16 bool) (*BackendConfig, error) {
-	// Load a config file if present after the model name
-	modelConfig := filepath.Join(modelPath, modelName+".yaml")
-
-	var cfg *BackendConfig
-
-	defaults := func() {
-		cfg = DefaultConfig(modelName)
-		cfg.ContextSize = ctx
-		cfg.Threads = threads
-		cfg.F16 = f16
-		cfg.Debug = debug
+func (cl *BackendConfigLoader) LoadBackendConfigFileByName(modelName, modelPath string, opts ...ConfigLoaderOption) (*BackendConfig, error) {
+
+	lo := &LoadOptions{}
+	lo.Apply(opts...)
+
+	// Load a config file if present after the model name
+	cfg := &BackendConfig{
+		PredictionOptions: schema.PredictionOptions{
+			Model: modelName,
+		},
 	}

 	cfgExisting, exists := cl.GetBackendConfig(modelName)
-	if !exists {
+	if exists {
+		cfg = &cfgExisting
+	} else {
+		// Try loading a model config file
+		modelConfig := filepath.Join(modelPath, modelName+".yaml")
 		if _, err := os.Stat(modelConfig); err == nil {
 			if err := cl.LoadBackendConfig(modelConfig); err != nil {
 				return nil, fmt.Errorf("failed loading model config (%s) %s", modelConfig, err.Error())
@@ -232,32 +350,11 @@ func LoadBackendConfigFileByName(modelName, modelPath string, cl *BackendConfigL
 			cfgExisting, exists = cl.GetBackendConfig(modelName)
 			if exists {
 				cfg = &cfgExisting
-			} else {
-				defaults()
 			}
-		} else {
-			defaults()
 		}
-	} else {
-		cfg = &cfgExisting
 	}

-	// Set the parameters for the language model prediction
-	//updateConfig(cfg, input)
-
-	// Don't allow 0 as setting
-	if cfg.Threads == 0 {
-		if threads != 0 {
-			cfg.Threads = threads
-		} else {
-			cfg.Threads = 4
-		}
-	}
-
-	// Enforce debug flag if passed from CLI
-	if debug {
-		cfg.Debug = true
-	}
+	cfg.SetDefaults(lo.debug, lo.threads, lo.ctxSize, lo.f16)

 	return cfg, nil
 }
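The rewritten loader uses Go's functional-options pattern: ConfigLoaderOption is a closure over LoadOptions, so call sites pass only the knobs they care about and everything else falls through to SetDefaults. An illustrative caller, where the model name and path are placeholders and the import path is the same assumption as above:

package main

import (
	"fmt"
	"log"

	"github.com/mudler/LocalAI/core/config" // assumed path, as above
)

func main() {
	cl := config.NewBackendConfigLoader()

	// "my-model" and "/models" are placeholder values.
	cfg, err := cl.LoadBackendConfigFileByName("my-model", "/models",
		config.LoadOptionThreads(8),        // override just the thread count
		config.LoadOptionContextSize(4096), // and the context size
	)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(*cfg.Threads, *cfg.ContextSize) // 8 4096; everything else defaulted
}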
@@ -267,7 +364,10 @@ func NewBackendConfigLoader() *BackendConfigLoader {
 		configs: make(map[string]BackendConfig),
 	}
 }
-func ReadBackendConfigFile(file string) ([]*BackendConfig, error) {
+func ReadBackendConfigFile(file string, opts ...ConfigLoaderOption) ([]*BackendConfig, error) {
+	lo := &LoadOptions{}
+	lo.Apply(opts...)
+
 	c := &[]*BackendConfig{}
 	f, err := os.ReadFile(file)
 	if err != nil {
@@ -277,10 +377,17 @@ func ReadBackendConfigFile(file string) ([]*BackendConfig, error) {
 		return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
 	}

+	for _, cc := range *c {
+		cc.SetDefaults(lo.debug, lo.threads, lo.ctxSize, lo.f16)
+	}
+
 	return *c, nil
 }

-func ReadBackendConfig(file string) (*BackendConfig, error) {
+func ReadBackendConfig(file string, opts ...ConfigLoaderOption) (*BackendConfig, error) {
+	lo := &LoadOptions{}
+	lo.Apply(opts...)
+
 	c := &BackendConfig{}
 	f, err := os.ReadFile(file)
 	if err != nil {
@@ -290,13 +397,14 @@ func ReadBackendConfig(file string) (*BackendConfig, error) {
 		return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
 	}

+	c.SetDefaults(lo.debug, lo.threads, lo.ctxSize, lo.f16)
 	return c, nil
 }

-func (cm *BackendConfigLoader) LoadBackendConfigFile(file string) error {
+func (cm *BackendConfigLoader) LoadBackendConfigFile(file string, opts ...ConfigLoaderOption) error {
 	cm.Lock()
 	defer cm.Unlock()
-	c, err := ReadBackendConfigFile(file)
+	c, err := ReadBackendConfigFile(file, opts...)
 	if err != nil {
 		return fmt.Errorf("cannot load config file: %w", err)
 	}
@@ -307,10 +415,10 @@ func (cm *BackendConfigLoader) LoadBackendConfigFile(file string) error {
 	return nil
 }

-func (cl *BackendConfigLoader) LoadBackendConfig(file string) error {
+func (cl *BackendConfigLoader) LoadBackendConfig(file string, opts ...ConfigLoaderOption) error {
 	cl.Lock()
 	defer cl.Unlock()
-	c, err := ReadBackendConfig(file)
+	c, err := ReadBackendConfig(file, opts...)
 	if err != nil {
 		return fmt.Errorf("cannot read config file: %w", err)
 	}
@@ -407,7 +515,9 @@ func (cl *BackendConfigLoader) Preload(modelPath string) error {
 	return nil
 }

-func (cm *BackendConfigLoader) LoadBackendConfigsFromPath(path string) error {
+// LoadBackendConfigsFromPath reads all the configurations of the models from a path
+// (non-recursive)
+func (cm *BackendConfigLoader) LoadBackendConfigsFromPath(path string, opts ...ConfigLoaderOption) error {
 	cm.Lock()
 	defer cm.Unlock()
 	entries, err := os.ReadDir(path)
@@ -427,7 +537,7 @@ func (cm *BackendConfigLoader) LoadBackendConfigsFromPath(path string) error {
 		if !strings.Contains(file.Name(), ".yaml") && !strings.Contains(file.Name(), ".yml") {
 			continue
 		}
-		c, err := ReadBackendConfig(filepath.Join(path, file.Name()))
+		c, err := ReadBackendConfig(filepath.Join(path, file.Name()), opts...)
 		if err == nil {
 			cm.configs[c.Name] = *c
 		}
@@ -386,7 +386,7 @@ var _ = Describe("API test", func() {
 			var res map[string]string
 			err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
 			Expect(err).ToNot(HaveOccurred())
-			Expect(res["location"]).To(Equal("San Francisco, California, United States"), fmt.Sprint(res))
+			Expect(res["location"]).To(Equal("San Francisco"), fmt.Sprint(res))
 			Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
 			Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
@@ -26,7 +26,14 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
 		modelFile = input.Model
 		log.Warn().Msgf("Model not found in context: %s", input.Model)
 	}
-	cfg, err := config.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath, cl, false, 0, 0, false)
+
+	cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
+		config.LoadOptionDebug(appConfig.Debug),
+		config.LoadOptionThreads(appConfig.Threads),
+		config.LoadOptionContextSize(appConfig.ContextSize),
+		config.LoadOptionF16(appConfig.F16),
+	)
+
 	if err != nil {
 		modelFile = input.Model
 		log.Warn().Msgf("Model not found in context: %s", input.Model)
@@ -196,7 +196,7 @@ func ImageEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appCon
 	baseURL := c.BaseURL()

-	fn, err := backend.ImageGeneration(height, width, mode, step, input.Seed, positive_prompt, negative_prompt, src, output, ml, *config, appConfig)
+	fn, err := backend.ImageGeneration(height, width, mode, step, *config.Seed, positive_prompt, negative_prompt, src, output, ml, *config, appConfig)
 	if err != nil {
 		return err
 	}
@@ -74,10 +74,10 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
 	if input.Echo {
 		config.Echo = input.Echo
 	}
-	if input.TopK != 0 {
+	if input.TopK != nil {
 		config.TopK = input.TopK
 	}
-	if input.TopP != 0 {
+	if input.TopP != nil {
 		config.TopP = input.TopP
 	}
@@ -117,11 +117,11 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
 		config.Grammar = input.Grammar
 	}

-	if input.Temperature != 0 {
+	if input.Temperature != nil {
 		config.Temperature = input.Temperature
 	}

-	if input.Maxtokens != 0 {
+	if input.Maxtokens != nil {
 		config.Maxtokens = input.Maxtokens
 	}
@@ -193,30 +193,14 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
 		config.Batch = input.Batch
 	}

 	if input.F16 {
 		config.F16 = input.F16
 	}

 	if input.IgnoreEOS {
 		config.IgnoreEOS = input.IgnoreEOS
 	}

-	if input.Seed != 0 {
+	if input.Seed != nil {
 		config.Seed = input.Seed
 	}

-	if input.Mirostat != 0 {
-		config.LLMConfig.Mirostat = input.Mirostat
-	}
-
-	if input.MirostatETA != 0 {
-		config.LLMConfig.MirostatETA = input.MirostatETA
-	}
-
-	if input.MirostatTAU != 0 {
-		config.LLMConfig.MirostatTAU = input.MirostatTAU
-	}
-
 	if input.TypicalP != 0 {
 		config.TypicalP = input.TypicalP
 	}
@@ -272,7 +256,12 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
 }

 func mergeRequestWithConfig(modelFile string, input *schema.OpenAIRequest, cm *config.BackendConfigLoader, loader *model.ModelLoader, debug bool, threads, ctx int, f16 bool) (*config.BackendConfig, *schema.OpenAIRequest, error) {
-	cfg, err := config.LoadBackendConfigFileByName(modelFile, loader.ModelPath, cm, debug, threads, ctx, f16)
+	cfg, err := cm.LoadBackendConfigFileByName(modelFile, loader.ModelPath,
+		config.LoadOptionDebug(debug),
+		config.LoadOptionThreads(threads),
+		config.LoadOptionContextSize(ctx),
+		config.LoadOptionF16(f16),
+	)

 	// Set the parameters for the language model prediction
 	updateRequestConfig(cfg, input)
@@ -12,28 +12,23 @@ type PredictionOptions struct {
 	N int `json:"n"`

 	// Common options between all the API calls, part of the OpenAI spec
-	TopP        float64  `json:"top_p" yaml:"top_p"`
-	TopK        int      `json:"top_k" yaml:"top_k"`
-	Temperature float64  `json:"temperature" yaml:"temperature"`
-	Maxtokens   int      `json:"max_tokens" yaml:"max_tokens"`
-	Echo        bool     `json:"echo"`
+	TopP        *float64 `json:"top_p" yaml:"top_p"`
+	TopK        *int     `json:"top_k" yaml:"top_k"`
+	Temperature *float64 `json:"temperature" yaml:"temperature"`
+	Maxtokens   *int     `json:"max_tokens" yaml:"max_tokens"`
+	Echo        bool     `json:"echo"`

 	// Custom parameters - not present in the OpenAI API
 	Batch         int     `json:"batch" yaml:"batch"`
 	F16           bool    `json:"f16" yaml:"f16"`
 	IgnoreEOS     bool    `json:"ignore_eos" yaml:"ignore_eos"`
 	RepeatPenalty float64 `json:"repeat_penalty" yaml:"repeat_penalty"`
 	Keep          int     `json:"n_keep" yaml:"n_keep"`

-	MirostatETA float64 `json:"mirostat_eta" yaml:"mirostat_eta"`
-	MirostatTAU float64 `json:"mirostat_tau" yaml:"mirostat_tau"`
-	Mirostat    int     `json:"mirostat" yaml:"mirostat"`
-
 	FrequencyPenalty float64 `json:"frequency_penalty" yaml:"frequency_penalty"`
 	TFZ              float64 `json:"tfz" yaml:"tfz"`

 	TypicalP float64 `json:"typical_p" yaml:"typical_p"`
-	Seed     int     `json:"seed" yaml:"seed"`
+	Seed     *int    `json:"seed" yaml:"seed"`

 	NegativePrompt string  `json:"negative_prompt" yaml:"negative_prompt"`
 	RopeFreqBase   float32 `json:"rope_freq_base" yaml:"rope_freq_base"`