Mixed enhancements (#196)

2025-06-15 13:38:07 +00:00 · 2023-05-06 00:00:58 +02:00
parent 91db3d4d5c 8c8cf38d4d
commit 7e5fe35ae4
20 changed files with 487 additions and 244 deletions
--- a/2
+++ b/2
@ -130,7 +130,7 @@ test-models/testmodel:

 test: prepare test-models/testmodel
 	cp tests/fixtures/* test-models
-	@C_INCLUDE_PATH=${C_INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models $(GOCMD) test -v -timeout 30m ./...
+	@C_INCLUDE_PATH=${C_INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo -v -r ./...

 ## Help:
 help: ## Show this help.
--- a/README.md
+++ b/README.md
@ -19,6 +19,8 @@

 LocalAI is a community-driven project, focused on making the AI accessible to anyone. Any contribution, feedback and PR is welcome! It was initially created by [mudler](https://github.com/mudler/) at the [SpectroCloud OSS Office](https://github.com/spectrocloud).

+See [examples on how to integrate LocalAI](https://github.com/go-skynet/LocalAI/tree/master/examples/).
+
 ### News

 - 02-05-2023: Support for `rwkv.cpp` models ( https://github.com/go-skynet/LocalAI/pull/158 ) and for `/edits` endpoint
--- a/api/api.go
+++ b/api/api.go
@ -6,6 +6,7 @@ import (
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/gofiber/fiber/v2"
 	"github.com/gofiber/fiber/v2/middleware/cors"
+	"github.com/gofiber/fiber/v2/middleware/logger"
 	"github.com/gofiber/fiber/v2/middleware/recover"
 	"github.com/rs/zerolog"
 	"github.com/rs/zerolog/log"
@ -40,6 +41,12 @@ func App(configFile string, loader *model.ModelLoader, threads, ctxSize int, f16
 		},
 	})

+	if debug {
+		app.Use(logger.New(logger.Config{
+			Format: "[${ip}]:${port} ${status} - ${method} ${path}\n",
+		}))
+	}
+
 	cm := make(ConfigMerger)
 	if err := cm.LoadConfigs(loader.ModelPath); err != nil {
 		log.Error().Msgf("error loading config files: %s", err.Error())
@ -73,6 +80,10 @@ func App(configFile string, loader *model.ModelLoader, threads, ctxSize int, f16
 	app.Post("/v1/embeddings", embeddingsEndpoint(cm, debug, loader, threads, ctxSize, f16))
 	app.Post("/embeddings", embeddingsEndpoint(cm, debug, loader, threads, ctxSize, f16))

+	// /v1/engines/{engine_id}/embeddings
+
+	app.Post("/v1/engines/:model/embeddings", embeddingsEndpoint(cm, debug, loader, threads, ctxSize, f16))
+
 	app.Get("/v1/models", listModels(loader, cm))
 	app.Get("/models", listModels(loader, cm))

--- a/api/config.go
+++ b/api/config.go
@ -1,12 +1,16 @@
 package api

 import (
+	"encoding/json"
 	"fmt"
 	"io/ioutil"
 	"os"
 	"path/filepath"
 	"strings"

+	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/gofiber/fiber/v2"
+	"github.com/rs/zerolog/log"
 	"gopkg.in/yaml.v3"
 )

@ -27,6 +31,8 @@ type Config struct {
 	MirostatETA    float64           `yaml:"mirostat_eta"`
 	MirostatTAU    float64           `yaml:"mirostat_tau"`
 	Mirostat       int               `yaml:"mirostat"`
+
+	PromptStrings, InputStrings []string
 }

 type TemplateConfig struct {
@ -104,3 +110,172 @@ func (cm ConfigMerger) LoadConfigs(path string) error {

 	return nil
 }
+
+// updateConfig merges the request-supplied parameters from input into config.
+//
+// A field of input only overrides config when it carries a non-zero value, so
+// boolean flags can be switched on by a request but never off. Stop words,
+// inputs and prompts are appended to whatever config already holds; each may
+// arrive either as a single string or as a list, and list entries that are
+// not strings are dropped.
+func updateConfig(config *Config, input *OpenAIRequest) {
+	// collect appends the string(s) held in raw to dst. A bare string is
+	// skipped when empty unless keepEmpty is set; string entries of a list
+	// are appended as they are.
+	collect := func(dst []string, raw interface{}, keepEmpty bool) []string {
+		switch v := raw.(type) {
+		case string:
+			if keepEmpty || v != "" {
+				dst = append(dst, v)
+			}
+		case []interface{}:
+			for _, item := range v {
+				if str, ok := item.(string); ok {
+					dst = append(dst, str)
+				}
+			}
+		}
+		return dst
+	}
+
+	// Flags: a request can only enable them.
+	config.Echo = config.Echo || input.Echo
+	config.F16 = config.F16 || input.F16
+	config.IgnoreEOS = config.IgnoreEOS || input.IgnoreEOS
+
+	// Numeric settings: zero means "not provided" and keeps the configured value.
+	if v := input.TopK; v != 0 {
+		config.TopK = v
+	}
+	if v := input.TopP; v != 0 {
+		config.TopP = v
+	}
+	if v := input.Temperature; v != 0 {
+		config.Temperature = v
+	}
+	if v := input.Maxtokens; v != 0 {
+		config.Maxtokens = v
+	}
+	if v := input.RepeatPenalty; v != 0 {
+		config.RepeatPenalty = v
+	}
+	if v := input.Keep; v != 0 {
+		config.Keep = v
+	}
+	if v := input.Batch; v != 0 {
+		config.Batch = v
+	}
+	if v := input.Seed; v != 0 {
+		config.Seed = v
+	}
+	if v := input.Mirostat; v != 0 {
+		config.Mirostat = v
+	}
+	if v := input.MirostatETA; v != 0 {
+		config.MirostatETA = v
+	}
+	if v := input.MirostatTAU; v != 0 {
+		config.MirostatTAU = v
+	}
+
+	// String-or-list fields. Only a bare prompt string is kept when empty.
+	config.StopWords = collect(config.StopWords, input.Stop, false)
+	config.InputStrings = collect(config.InputStrings, input.Input, false)
+	config.PromptStrings = collect(config.PromptStrings, input.Prompt, true)
+}
+
+// readConfig parses the request body into an OpenAIRequest and resolves the
+// Config to use for it.
+//
+// The model name is taken, in increasing order of precedence, from the request
+// body, the "model" route parameter, and the bearer token of the
+// Authorization header (only when the token names an existing model). When
+// none is given, the first model listed by the loader is used. A
+// "<model>.yaml" file in the model path, if present, is loaded into cm before
+// the lookup. A model without an entry in cm gets a default Config seeded with
+// the threads, ctx, f16 and debug arguments. The request parameters are then
+// applied on top via updateConfig.
+//
+// It returns the resolved config together with the parsed request, or an
+// error when the body cannot be parsed, no model can be determined, or the
+// model's config file fails to load.
+func readConfig(cm ConfigMerger, c *fiber.Ctx, loader *model.ModelLoader, debug bool, threads, ctx int, f16 bool) (*Config, *OpenAIRequest, error) {
+	input := new(OpenAIRequest)
+	// Get input data from the request body
+	if err := c.BodyParser(input); err != nil {
+		return nil, nil, err
+	}
+
+	modelFile := input.Model
+
+	// A model given as route parameter wins over the one in the body.
+	if m := c.Params("model"); m != "" {
+		modelFile = m
+	}
+
+	// Best effort: the marshalled request is only used for the debug log.
+	received, _ := json.Marshal(input)
+
+	log.Debug().Msgf("Request received: %s", string(received))
+
+	// Set model from bearer token, if available.
+	// TrimPrefix rather than TrimLeft: TrimLeft treats "Bearer " as a set of
+	// characters and would also strip leading 'B', 'e', 'a', 'r' and ' '
+	// characters of the token itself (e.g. "Bearer alpaca" -> "lpaca").
+	bearer := strings.TrimSpace(strings.TrimPrefix(c.Get("authorization"), "Bearer "))
+	bearerExists := bearer != "" && loader.ExistsInModelPath(bearer)
+
+	// If no model was specified, take the first available
+	if modelFile == "" && !bearerExists {
+		// The second result of ListModels is ignored; an empty list yields
+		// the "no model specified" error below.
+		models, _ := loader.ListModels()
+		if len(models) == 0 {
+			log.Debug().Msgf("No model specified, returning error")
+			return nil, nil, fmt.Errorf("no model specified")
+		}
+		modelFile = models[0]
+		log.Debug().Msgf("No model specified, using: %s", modelFile)
+	}
+
+	// If a model is found in bearer token takes precedence
+	if bearerExists {
+		log.Debug().Msgf("Using model from bearer token: %s", bearer)
+		modelFile = bearer
+	}
+
+	// Load a config file if present after the model name
+	// NOTE(review): modelFile is client-controlled and is joined into a
+	// filesystem path unchecked - confirm names containing path separators
+	// are rejected elsewhere.
+	modelConfig := filepath.Join(loader.ModelPath, modelFile+".yaml")
+	if _, err := os.Stat(modelConfig); err == nil {
+		if err := cm.LoadConfig(modelConfig); err != nil {
+			return nil, nil, fmt.Errorf("failed loading model config (%s) %w", modelConfig, err)
+		}
+	}
+
+	var config *Config
+	if cfg, exists := cm[modelFile]; exists {
+		// cfg is a copy of the map entry; config points at that copy.
+		config = &cfg
+	} else {
+		config = &Config{
+			OpenAIRequest: defaultRequest(modelFile),
+			ContextSize:   ctx,
+			Threads:       threads,
+			F16:           f16,
+			Debug:         debug,
+		}
+	}
+
+	// Set the parameters for the language model prediction
+	updateConfig(config, input)
+
+	// Don't allow 0 as setting
+	if config.Threads == 0 {
+		if threads != 0 {
+			config.Threads = threads
+		} else {
+			config.Threads = 4
+		}
+	}
+
+	return config, input, nil
+}
--- a/api/openai.go
+++ b/api/openai.go
@ -5,8 +5,6 @@ import (
 	"bytes"
 	"encoding/json"
 	"fmt"
-	"os"
-	"path/filepath"
 	"strings"

 	model "github.com/go-skynet/LocalAI/pkg/model"
@ -75,8 +73,8 @@ type OpenAIRequest struct {
 	Prompt interface{} `json:"prompt" yaml:"prompt"`

 	// Edit endpoint
-	Instruction string `json:"instruction" yaml:"instruction"`
-	Input       string `json:"input" yaml:"input"`
+	Instruction string      `json:"instruction" yaml:"instruction"`
+	Input       interface{} `json:"input" yaml:"input"`

 	Stop interface{} `json:"stop" yaml:"stop"`

@ -117,147 +115,6 @@ func defaultRequest(modelFile string) OpenAIRequest {
 	}
 }

-func updateConfig(config *Config, input *OpenAIRequest) {
-	if input.Echo {
-		config.Echo = input.Echo
-	}
-	if input.TopK != 0 {
-		config.TopK = input.TopK
-	}
-	if input.TopP != 0 {
-		config.TopP = input.TopP
-	}
-
-	if input.Temperature != 0 {
-		config.Temperature = input.Temperature
-	}
-
-	if input.Maxtokens != 0 {
-		config.Maxtokens = input.Maxtokens
-	}
-
-	switch stop := input.Stop.(type) {
-	case string:
-		if stop != "" {
-			config.StopWords = append(config.StopWords, stop)
-		}
-	case []interface{}:
-		for _, pp := range stop {
-			if s, ok := pp.(string); ok {
-				config.StopWords = append(config.StopWords, s)
-			}
-		}
-	}
-
-	if input.RepeatPenalty != 0 {
-		config.RepeatPenalty = input.RepeatPenalty
-	}
-
-	if input.Keep != 0 {
-		config.Keep = input.Keep
-	}
-
-	if input.Batch != 0 {
-		config.Batch = input.Batch
-	}
-
-	if input.F16 {
-		config.F16 = input.F16
-	}
-
-	if input.IgnoreEOS {
-		config.IgnoreEOS = input.IgnoreEOS
-	}
-
-	if input.Seed != 0 {
-		config.Seed = input.Seed
-	}
-
-	if input.Mirostat != 0 {
-		config.Mirostat = input.Mirostat
-	}
-
-	if input.MirostatETA != 0 {
-		config.MirostatETA = input.MirostatETA
-	}
-
-	if input.MirostatTAU != 0 {
-		config.MirostatTAU = input.MirostatTAU
-	}
-}
-
-func readConfig(cm ConfigMerger, c *fiber.Ctx, loader *model.ModelLoader, debug bool, threads, ctx int, f16 bool) (*Config, *OpenAIRequest, error) {
-	input := new(OpenAIRequest)
-	// Get input data from the request body
-	if err := c.BodyParser(input); err != nil {
-		return nil, nil, err
-	}
-
-	modelFile := input.Model
-	received, _ := json.Marshal(input)
-
-	log.Debug().Msgf("Request received: %s", string(received))
-
-	// Set model from bearer token, if available
-	bearer := strings.TrimLeft(c.Get("authorization"), "Bearer ")
-	bearerExists := bearer != "" && loader.ExistsInModelPath(bearer)
-
-	// If no model was specified, take the first available
-	if modelFile == "" && !bearerExists {
-		models, _ := loader.ListModels()
-		if len(models) > 0 {
-			modelFile = models[0]
-			log.Debug().Msgf("No model specified, using: %s", modelFile)
-		} else {
-			log.Debug().Msgf("No model specified, returning error")
-			return nil, nil, fmt.Errorf("no model specified")
-		}
-	}
-
-	// If a model is found in bearer token takes precedence
-	if bearerExists {
-		log.Debug().Msgf("Using model from bearer token: %s", bearer)
-		modelFile = bearer
-	}
-
-	// Load a config file if present after the model name
-	modelConfig := filepath.Join(loader.ModelPath, modelFile+".yaml")
-	if _, err := os.Stat(modelConfig); err == nil {
-		if err := cm.LoadConfig(modelConfig); err != nil {
-			return nil, nil, fmt.Errorf("failed loading model config (%s) %s", modelConfig, err.Error())
-		}
-	}
-
-	var config *Config
-	cfg, exists := cm[modelFile]
-	if !exists {
-		config = &Config{
-			OpenAIRequest: defaultRequest(modelFile),
-		}
-	} else {
-		config = &cfg
-	}
-
-	// Set the parameters for the language model prediction
-	updateConfig(config, input)
-
-	if threads != 0 {
-		config.Threads = threads
-	}
-	if ctx != 0 {
-		config.ContextSize = ctx
-	}
-	if f16 {
-		config.F16 = true
-	}
-
-	if debug {
-		config.Debug = true
-	}
-
-	return config, input, nil
-}
-
 // https://platform.openai.com/docs/api-reference/completions
 func completionEndpoint(cm ConfigMerger, debug bool, loader *model.ModelLoader, threads, ctx int, f16 bool) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
@ -268,19 +125,6 @@ func completionEndpoint(cm ConfigMerger, debug bool, loader *model.ModelLoader,

 		log.Debug().Msgf("Parameter Config: %+v", config)

-		predInput := []string{}
-
-		switch p := input.Prompt.(type) {
-		case string:
-			predInput = append(predInput, p)
-		case []interface{}:
-			for _, pp := range p {
-				if s, ok := pp.(string); ok {
-					predInput = append(predInput, s)
-				}
-			}
-		}
-
 		templateFile := config.Model

 		if config.TemplateConfig.Completion != "" {
@ -288,7 +132,7 @@ func completionEndpoint(cm ConfigMerger, debug bool, loader *model.ModelLoader,
 		}

 		var result []Choice
-		for _, i := range predInput {
+		for _, i := range config.PromptStrings {
 			// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
 			templatedInput, err := loader.TemplatePrefix(templateFile, struct {
 				Input string
@ -331,20 +175,26 @@ func embeddingsEndpoint(cm ConfigMerger, debug bool, loader *model.ModelLoader,
 		}

 		log.Debug().Msgf("Parameter Config: %+v", config)
+		items := []Item{}

-		// get the model function to call for the result
-		embedFn, err := ModelEmbedding(input.Input, loader, *config)
-		if err != nil {
-			return err
+		for i, s := range config.InputStrings {
+
+			// get the model function to call for the result
+			embedFn, err := ModelEmbedding(s, loader, *config)
+			if err != nil {
+				return err
+			}
+
+			embeddings, err := embedFn()
+			if err != nil {
+				return err
+			}
+			items = append(items, Item{Embedding: embeddings, Index: i, Object: "embedding"})
 		}

-		embeddings, err := embedFn()
-		if err != nil {
-			return err
-		}
 		resp := &OpenAIResponse{
 			Model:  input.Model, // we have to return what the user sent here, due to OpenAI spec.
-			Data:   []Item{{Embedding: embeddings, Index: 0, Object: "embedding"}},
+			Data:   items,
 			Object: "list",
 		}

@ -480,28 +330,32 @@ func editEndpoint(cm ConfigMerger, debug bool, loader *model.ModelLoader, thread

 		log.Debug().Msgf("Parameter Config: %+v", config)

-		predInput := input.Input
 		templateFile := config.Model

 		if config.TemplateConfig.Edit != "" {
 			templateFile = config.TemplateConfig.Edit
 		}

-		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
-		templatedInput, err := loader.TemplatePrefix(templateFile, struct {
-			Input       string
-			Instruction string
-		}{Input: predInput, Instruction: input.Instruction})
-		if err == nil {
-			predInput = templatedInput
-			log.Debug().Msgf("Template found, input modified to: %s", predInput)
-		}
+		var result []Choice
+		for _, i := range config.InputStrings {
+			// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
+			templatedInput, err := loader.TemplatePrefix(templateFile, struct {
+				Input       string
+				Instruction string
+			}{Input: i})
+			if err == nil {
+				i = templatedInput
+				log.Debug().Msgf("Template found, input modified to: %s", i)
+			}

-		result, err := ComputeChoices(predInput, input, config, loader, func(s string, c *[]Choice) {
-			*c = append(*c, Choice{Text: s})
-		}, nil)
-		if err != nil {
-			return err
+			r, err := ComputeChoices(i, input, config, loader, func(s string, c *[]Choice) {
+				*c = append(*c, Choice{Text: s})
+			}, nil)
+			if err != nil {
+				return err
+			}
+
+			result = append(result, r...)
 		}

 		resp := &OpenAIResponse{
--- a/api/prediction.go
+++ b/api/prediction.go
@ -28,6 +28,7 @@ func defaultLLamaOpts(c Config) []llama.ModelOption {
 	if c.Embeddings {
 		llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
 	}
+
 	return llamaOpts
 }

@ -55,7 +56,8 @@ func ModelEmbedding(s string, loader *model.ModelLoader, c Config) (func() ([]fl
 	switch model := inferenceModel.(type) {
 	case *llama.LLama:
 		fn = func() ([]float32, error) {
-			return model.Embeddings(s)
+			predictOptions := buildLLamaPredictOptions(c)
+			return model.Embeddings(s, predictOptions...)
 		}
 	default:
 		fn = func() ([]float32, error) {
@ -76,10 +78,77 @@ func ModelEmbedding(s string, loader *model.ModelLoader, c Config) (func() ([]fl
 		l.Lock()
 		defer l.Unlock()

-		return fn()
+		embeds, err := fn()
+		if err != nil {
+			return embeds, err
+		}
+		// Remove trailing 0s
+		for i := len(embeds) - 1; i >= 0; i-- {
+			if embeds[i] == 0.0 {
+				embeds = embeds[:i]
+			} else {
+				break
+			}
+		}
+		return embeds, nil
 	}, nil
 }

+// buildLLamaPredictOptions translates the settings of c into the list of
+// llama.PredictOption values used for a llama model call.
+//
+// Temperature, top-p, top-k, the token limit, the thread count and the stop
+// words are always set. The remaining numeric settings are forwarded only
+// when non-zero, and the boolean ones only when enabled.
+func buildLLamaPredictOptions(c Config) []llama.PredictOption {
+	// 16 is the number of options produced when every setting is active.
+	opts := make([]llama.PredictOption, 0, 16)
+
+	// Options that are always present.
+	opts = append(opts,
+		llama.SetTemperature(c.Temperature),
+		llama.SetTopP(c.TopP),
+		llama.SetTopK(c.TopK),
+		llama.SetTokens(c.Maxtokens),
+		llama.SetThreads(c.Threads),
+	)
+
+	if v := c.Mirostat; v != 0 {
+		opts = append(opts, llama.SetMirostat(v))
+	}
+	if v := c.MirostatETA; v != 0 {
+		opts = append(opts, llama.SetMirostatETA(v))
+	}
+	if v := c.MirostatTAU; v != 0 {
+		opts = append(opts, llama.SetMirostatTAU(v))
+	}
+	if c.Debug {
+		opts = append(opts, llama.Debug)
+	}
+
+	// Stop words are passed even when the list is empty.
+	opts = append(opts, llama.SetStopWords(c.StopWords...))
+
+	if v := c.RepeatPenalty; v != 0 {
+		opts = append(opts, llama.SetPenalty(v))
+	}
+	if v := c.Keep; v != 0 {
+		opts = append(opts, llama.SetNKeep(v))
+	}
+	if v := c.Batch; v != 0 {
+		opts = append(opts, llama.SetBatch(v))
+	}
+	if c.F16 {
+		opts = append(opts, llama.EnableF16KV)
+	}
+	if c.IgnoreEOS {
+		opts = append(opts, llama.IgnoreEOS)
+	}
+	if v := c.Seed; v != 0 {
+		opts = append(opts, llama.SetSeed(v))
+	}
+
+	return opts
+}
+
 func ModelInference(s string, loader *model.ModelLoader, c Config, tokenCallback func(string) bool) (func() (string, error), error) {
 	supportStreams := false
 	modelFile := c.Model
@ -197,56 +266,7 @@ func ModelInference(s string, loader *model.ModelLoader, c Config, tokenCallback
 				model.SetTokenCallback(tokenCallback)
 			}

-			// Generate the prediction using the language model
-			predictOptions := []llama.PredictOption{
-				llama.SetTemperature(c.Temperature),
-				llama.SetTopP(c.TopP),
-				llama.SetTopK(c.TopK),
-				llama.SetTokens(c.Maxtokens),
-				llama.SetThreads(c.Threads),
-			}
-
-			if c.Mirostat != 0 {
-				predictOptions = append(predictOptions, llama.SetMirostat(c.Mirostat))
-			}
-
-			if c.MirostatETA != 0 {
-				predictOptions = append(predictOptions, llama.SetMirostatETA(c.MirostatETA))
-			}
-
-			if c.MirostatTAU != 0 {
-				predictOptions = append(predictOptions, llama.SetMirostatTAU(c.MirostatTAU))
-			}
-
-			if c.Debug {
-				predictOptions = append(predictOptions, llama.Debug)
-			}
-
-			predictOptions = append(predictOptions, llama.SetStopWords(c.StopWords...))
-
-			if c.RepeatPenalty != 0 {
-				predictOptions = append(predictOptions, llama.SetPenalty(c.RepeatPenalty))
-			}
-
-			if c.Keep != 0 {
-				predictOptions = append(predictOptions, llama.SetNKeep(c.Keep))
-			}
-
-			if c.Batch != 0 {
-				predictOptions = append(predictOptions, llama.SetBatch(c.Batch))
-			}
-
-			if c.F16 {
-				predictOptions = append(predictOptions, llama.EnableF16KV)
-			}
-
-			if c.IgnoreEOS {
-				predictOptions = append(predictOptions, llama.IgnoreEOS)
-			}
-
-			if c.Seed != 0 {
-				predictOptions = append(predictOptions, llama.SetSeed(c.Seed))
-			}
+			predictOptions := buildLLamaPredictOptions(c)

 			str, er := model.Predict(
 				s,
--- a/examples/query_data/.gitignore
+++ b/examples/query_data/.gitignore
@ -0,0 +1 @@
+storage/
--- a/examples/query_data/README.md
+++ b/examples/query_data/README.md
@ -0,0 +1,49 @@
+# Data query example
+
+This example makes use of [Llama-Index](https://gpt-index.readthedocs.io/en/stable/getting_started/installation.html) to enable question answering on a set of documents.
+
+It loosely follows [the quickstart](https://gpt-index.readthedocs.io/en/stable/guides/primer/usage_pattern.html).
+
+## Requirements
+
+For this to work, you will need a model compatible with the `llama.cpp` backend. This will not work with gpt4all.
+
+The example uses `WizardLM`. Edit the config files in `models/` accordingly to specify the model you use (change `HERE`).
+
+You will also need a training data set. Copy it into the `data` directory.
+
+## Setup
+
+Start the API:
+
+```bash
+# Clone LocalAI
+git clone https://github.com/go-skynet/LocalAI
+
+cd LocalAI/examples/query_data
+
+# Copy your models, edit config files accordingly
+
+# start with docker-compose
+docker-compose up -d --build
+```
+
+### Create a storage:
+
+```bash
+export OPENAI_API_BASE=http://localhost:8080/v1
+export OPENAI_API_KEY=sk-
+
+python store.py
+```
+
+After it finishes, a directory "storage" will be created with the vector index database.
+
+## Query
+
+```bash
+export OPENAI_API_BASE=http://localhost:8080/v1
+export OPENAI_API_KEY=sk-
+
+python query.py
+```
--- a/examples/query_data/data/.keep
+++ b/examples/query_data/data/.keep
--- a/examples/query_data/docker-compose.yml
+++ b/examples/query_data/docker-compose.yml
@ -0,0 +1,15 @@
+version: '3.6'
+
+services:
+  api:
+    image: quay.io/go-skynet/local-ai:latest
+    build:
+      # No Dockerfile is added next to this compose file, so build from the
+      # repository root, two levels up (assumes the Dockerfile lives there - confirm).
+      context: ../../
+      dockerfile: Dockerfile
+    ports:
+      - 8080:8080
+    # NOTE: this example does not add a .env file; create one next to this
+    # compose file before starting the service.
+    env_file:
+      - .env
+    volumes:
+      # Model files and their YAML configs/templates from ./models are mounted at /models.
+      - ./models:/models:cached
+    command: ["/usr/bin/local-ai"]
--- a/examples/query_data/models/completion.tmpl
+++ b/examples/query_data/models/completion.tmpl
@ -0,0 +1 @@
+{{.Input}}
--- a/examples/query_data/models/embeddings.yaml
+++ b/examples/query_data/models/embeddings.yaml
@ -0,0 +1,18 @@
+# Model definition served under the name "text-embedding-ada-002".
+name: text-embedding-ada-002
+parameters:
+  # Placeholder: replace HERE with the model you use (see the example README).
+  model: HERE
+  top_k: 80
+  temperature: 0.2
+  top_p: 0.7
+context_size: 1024
+threads: 14
+stopwords:
+- "HUMAN:"
+- "GPT:"
+roles:
+  user: " "
+  system: " "
+embeddings: true
+template:
+  completion: completion
+  # NOTE(review): no "gpt4all" template is added to this models directory - confirm it exists or switch to "wizardlm".
+  chat: gpt4all
--- a/examples/query_data/models/gpt-3.5-turbo.yaml
+++ b/examples/query_data/models/gpt-3.5-turbo.yaml
@ -0,0 +1,18 @@
+# Model definition served under the name "gpt-3.5-turbo", the model name the example scripts request.
+name: gpt-3.5-turbo
+parameters:
+  # Placeholder: replace HERE with the model you use (see the example README).
+  model: HERE
+  top_k: 80
+  temperature: 0.2
+  top_p: 0.7
+context_size: 1024
+threads: 14
+embeddings: true
+stopwords:
+- "HUMAN:"
+- "GPT:"
+roles:
+  user: " "
+  system: " "
+template:
+  # Template names; "completion" and "wizardlm" match the .tmpl files added in this directory.
+  completion: completion
+  chat: wizardlm
--- a/examples/query_data/models/wizardlm.tmpl
+++ b/examples/query_data/models/wizardlm.tmpl
@ -0,0 +1,3 @@
+{{.Input}}
+
+### Response:
--- a/examples/query_data/query.py
+++ b/examples/query_data/query.py
@ -0,0 +1,32 @@
+import os
+
+# Uncomment to specify your OpenAI API key here (local testing only, not in production!), or add corresponding environment variable (recommended)
+# os.environ['OPENAI_API_KEY']= ""
+
+from llama_index import   LLMPredictor, PromptHelper, ServiceContext
+from langchain.llms.openai import OpenAI
+from llama_index import StorageContext, load_index_from_storage
+
+
+# This example uses text-davinci-003 by default; feel free to change if desired
+llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name="gpt-3.5-turbo",openai_api_base="http://localhost:8080/v1"))
+
+# Configure prompt parameters and initialise helper
+max_input_size = 1024
+num_output = 256
+max_chunk_overlap = 20
+
+prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)
+
+# Load documents from the 'data' directory
+service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)
+
+# rebuild storage context
+storage_context = StorageContext.from_defaults(persist_dir='./storage')
+
+# load index
+index = load_index_from_storage(storage_context,     service_context=service_context,    )
+
+query_engine = index.as_query_engine()
+response = query_engine.query("XXXXXX your question here XXXXX")
+print(response)
--- a/examples/query_data/store.py
+++ b/examples/query_data/store.py
@ -0,0 +1,25 @@
+import os
+
+# Uncomment to specify your OpenAI API key here (local testing only, not in production!), or add corresponding environment variable (recommended)
+# os.environ['OPENAI_API_KEY']= ""
+
+from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader, LLMPredictor, PromptHelper, ServiceContext
+from langchain.llms.openai import OpenAI
+from llama_index import StorageContext, load_index_from_storage
+
+# This example uses text-davinci-003 by default; feel free to change if desired
+llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name="gpt-3.5-turbo",openai_api_base="http://localhost:8080/v1"))
+
+# Configure prompt parameters and initialise helper
+max_input_size = 256
+num_output = 256
+max_chunk_overlap = 10
+
+prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)
+
+# Load documents from the 'data' directory
+documents = SimpleDirectoryReader('data').load_data()
+service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper, chunk_size_limit = 257)
+index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context)
+index.storage_context.persist(persist_dir="./storage")
+
--- a/examples/rwkv/scripts/build.sh
+++ b/examples/rwkv/scripts/build.sh
@ -0,0 +1,11 @@
+#!/bin/bash
+set -ex
+
+URL=$1
+OUT=$2
+FILENAME=$(basename $URL)
+
+wget -nc $URL -O /build/$FILENAME
+
+python3 /build/rwkv.cpp/rwkv/convert_pytorch_to_ggml.py /build/$FILENAME /build/float-model float16
+python3 /build/rwkv.cpp/rwkv/quantize.py /build/float-model $OUT Q4_2
--- a/tests/fixtures/config.yaml
+++ b/tests/fixtures/config.yaml
@ -1,8 +1,10 @@
 - name: list1
  parameters:
    model: testmodel
-  context_size: 512
-  threads: 10
+    top_p: 0.9
+    top_k: 80
+    temperature: 0.1
+  context_size: 10
  stopwords:
  - "HUMAN:"
  - "### Response:"
@ -14,9 +16,11 @@
    chat: ggml-gpt4all-j
 - name: list2
  parameters:
+    top_p: 0.9
+    top_k: 80
+    temperature: 0.1
    model: testmodel
-  context_size: 512
-  threads: 10
+  context_size: 10
  stopwords:
  - "HUMAN:"
  - "### Response:"
--- a/tests/fixtures/gpt4.yaml
+++ b/tests/fixtures/gpt4.yaml
@ -1,8 +1,10 @@
 name: gpt4all
 parameters:
  model: testmodel
-context_size: 512
-threads: 10
+  top_p: 0.9
+  top_k: 80
+  temperature: 0.1
+context_size: 10
 stopwords:
 - "HUMAN:"
 - "### Response:"
--- a/tests/fixtures/gpt4_2.yaml
+++ b/tests/fixtures/gpt4_2.yaml
@ -1,8 +1,10 @@
 name: gpt4all-2
 parameters:
  model: testmodel
-context_size: 1024
-threads: 5
+  top_p: 0.9
+  top_k: 80
+  temperature: 0.1
+context_size: 10
 stopwords:
 - "HUMAN:"
 - "### Response:"