feat: drop default model and llama-specific API (#26)

Signed-off-by: mudler <mudler@c3os.io>
2025-06-02 15:30:52 +00:00 · 2023-04-16 10:40:50 +02:00 · 2023-04-16 10:40:50 +02:00 · 63601fabd1
commit 63601fabd1
parent 1370b4482f
5 changed files with 9 additions and 243 deletions
--- a/README.md
+++ b/README.md
@ -27,6 +27,7 @@ docker compose up -d --build

 # Now API is accessible at localhost:8080
 curl http://localhost:8080/v1/models
+
 # {"object":"list","data":[{"id":"your-model.bin","object":"model"}]}
 curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
     "model": "your-model.bin",            
@ -88,7 +89,7 @@ llama-cli --model <model_path> --instruction <instruction> [--input <input>] [--
 | template     | TEMPLATE             |               | A file containing a template for output formatting (optional).  |
 | instruction  | INSTRUCTION          |               | Input prompt text or instruction. "-" for STDIN.   |
 | input        | INPUT                | -             | Path to text or "-" for STDIN.                    |
-| model        | MODEL_PATH           |               | The path to the pre-trained GPT-based model.      |
+| model        | MODEL           |               | The path to the pre-trained GPT-based model.      |
 | tokens       | TOKENS               | 128           | The maximum number of tokens to generate. |
 | threads      | THREADS              | NumCPU()      | The number of threads to use for text generation. |
 | temperature  | TEMPERATURE          | 0.95          | Sampling temperature for model output. ( values between `0.1` and `1.0` )  |
@ -216,32 +217,6 @@ python 828bddec6162a023114ce19146cb2b82/gistfile1.txt models tokenizer.model
 # There will be a new model with the ".tmp" extension, you have to use that one!
 ```

-### Golang client API
-
-The `llama-cli` codebase has also a small client in go that can be used alongside with the api:
-
-```golang
-package main
-
-import (
-	"fmt"
-
-	client "github.com/go-skynet/llama-cli/client"
-)
-
-func main() {
-
-	cli := client.NewClient("http://ip:port")
-
-	out, err := cli.Predict("What's an alpaca?")
-	if err != nil {
-		panic(err)
-	}
-
-	fmt.Println(out)
-}
-```
-
 ### Windows compatibility

 It should work, however you need to make sure you give enough resources to the container. See https://github.com/go-skynet/llama-cli/issues/2
--- a/api/api.go
+++ b/api/api.go
@ -4,7 +4,6 @@ import (
 	"embed"
 	"fmt"
 	"net/http"
-	"strconv"
 	"strings"
 	"sync"

@ -70,7 +69,7 @@ type OpenAIRequest struct {
 var indexHTML embed.FS

 // https://platform.openai.com/docs/api-reference/completions
-func openAIEndpoint(chat bool, defaultModel *llama.LLama, loader *model.ModelLoader, threads int, defaultMutex *sync.Mutex, mutexMap *sync.Mutex, mutexes map[string]*sync.Mutex) func(c *fiber.Ctx) error {
+func openAIEndpoint(chat bool, loader *model.ModelLoader, threads int, defaultMutex *sync.Mutex, mutexMap *sync.Mutex, mutexes map[string]*sync.Mutex) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		var err error
 		var model *llama.LLama
@ -82,10 +81,7 @@ func openAIEndpoint(chat bool, defaultModel *llama.LLama, loader *model.ModelLoa
 		}

 		if input.Model == "" {
-			if defaultModel == nil {
-				return fmt.Errorf("no default model loaded, and no model specified")
-			}
-			model = defaultModel
+			return fmt.Errorf("no model specified")
 		} else {
 			model, err = loader.LoadModel(input.Model)
 			if err != nil {
@ -204,7 +200,7 @@ func openAIEndpoint(chat bool, defaultModel *llama.LLama, loader *model.ModelLoa
 	}
 }

-func Start(defaultModel *llama.LLama, loader *model.ModelLoader, listenAddr string, threads int) error {
+func Start(loader *model.ModelLoader, listenAddr string, threads int) error {
 	app := fiber.New()

 	// Default middleware config
@ -217,8 +213,8 @@ func Start(defaultModel *llama.LLama, loader *model.ModelLoader, listenAddr stri
 	var mumutex = &sync.Mutex{}

 	// openAI compatible API endpoint
-	app.Post("/v1/chat/completions", openAIEndpoint(true, defaultModel, loader, threads, mutex, mumutex, mu))
-	app.Post("/v1/completions", openAIEndpoint(false, defaultModel, loader, threads, mutex, mumutex, mu))
+	app.Post("/v1/chat/completions", openAIEndpoint(true, loader, threads, mutex, mumutex, mu))
+	app.Post("/v1/completions", openAIEndpoint(false, loader, threads, mutex, mumutex, mu))
 	app.Get("/v1/models", func(c *fiber.Ctx) error {
 		models, err := loader.ListModels()
 		if err != nil {
@ -243,69 +239,6 @@ func Start(defaultModel *llama.LLama, loader *model.ModelLoader, listenAddr stri
 		NotFoundFile: "index.html",
 	}))

-	/*
-		curl --location --request POST 'http://localhost:8080/predict' --header 'Content-Type: application/json' --data-raw '{
-		    "text": "What is an alpaca?",
-		    "topP": 0.8,
-		    "topK": 50,
-		    "temperature": 0.7,
-		    "tokens": 100
-		}'
-	*/
-	// Endpoint to generate the prediction
-	app.Post("/predict", func(c *fiber.Ctx) error {
-		mutex.Lock()
-		defer mutex.Unlock()
-		// Get input data from the request body
-		input := new(struct {
-			Text string `json:"text"`
-		})
-		if err := c.BodyParser(input); err != nil {
-			return err
-		}
-
-		// Set the parameters for the language model prediction
-		topP, err := strconv.ParseFloat(c.Query("topP", "0.9"), 64) // Default value of topP is 0.9
-		if err != nil {
-			return err
-		}
-
-		topK, err := strconv.Atoi(c.Query("topK", "40")) // Default value of topK is 40
-		if err != nil {
-			return err
-		}
-
-		temperature, err := strconv.ParseFloat(c.Query("temperature", "0.5"), 64) // Default value of temperature is 0.5
-		if err != nil {
-			return err
-		}
-
-		tokens, err := strconv.Atoi(c.Query("tokens", "128")) // Default value of tokens is 128
-		if err != nil {
-			return err
-		}
-
-		// Generate the prediction using the language model
-		prediction, err := defaultModel.Predict(
-			input.Text,
-			llama.SetTemperature(temperature),
-			llama.SetTopP(topP),
-			llama.SetTopK(topK),
-			llama.SetTokens(tokens),
-			llama.SetThreads(threads),
-		)
-		if err != nil {
-			return err
-		}
-
-		// Return the prediction in the response body
-		return c.JSON(struct {
-			Prediction string `json:"prediction"`
-		}{
-			Prediction: prediction,
-		})
-	})
-
 	// Start the server
 	app.Listen(listenAddr)
 	return nil
--- a/client/client.go
+++ b/client/client.go
@ -1,75 +0,0 @@
-package client
-
-import (
-	"bytes"
-	"encoding/json"
-	"fmt"
-	"net/http"
-)
-
-type Prediction struct {
-	Prediction string `json:"prediction"`
-}
-
-type Client struct {
-	baseURL  string
-	client   *http.Client
-	endpoint string
-}
-
-func NewClient(baseURL string) *Client {
-	return &Client{
-		baseURL:  baseURL,
-		client:   &http.Client{},
-		endpoint: "/predict",
-	}
-}
-
-type InputData struct {
-	Text        string  `json:"text"`
-	TopP        float64 `json:"topP,omitempty"`
-	TopK        int     `json:"topK,omitempty"`
-	Temperature float64 `json:"temperature,omitempty"`
-	Tokens      int     `json:"tokens,omitempty"`
-}
-
-func (c *Client) Predict(text string, opts ...InputOption) (string, error) {
-	input := NewInputData(opts...)
-	input.Text = text
-
-	// encode input data to JSON format
-	inputBytes, err := json.Marshal(input)
-	if err != nil {
-		return "", err
-	}
-
-	// create HTTP request
-	url := c.baseURL + c.endpoint
-	req, err := http.NewRequest("POST", url, bytes.NewBuffer(inputBytes))
-	if err != nil {
-		return "", err
-	}
-
-	// set request headers
-	req.Header.Set("Content-Type", "application/json")
-
-	// send request and get response
-	resp, err := c.client.Do(req)
-	if err != nil {
-		return "", err
-	}
-	defer resp.Body.Close()
-
-	if resp.StatusCode != http.StatusOK {
-		return "", fmt.Errorf("request failed with status %d", resp.StatusCode)
-	}
-
-	// decode response body to Prediction struct
-	var prediction Prediction
-	err = json.NewDecoder(resp.Body).Decode(&prediction)
-	if err != nil {
-		return "", err
-	}
-
-	return prediction.Prediction, nil
-}
--- a/client/options.go
+++ b/client/options.go
@ -1,51 +0,0 @@
-package client
-
-import "net/http"
-
-type ClientOption func(c *Client)
-
-func WithHTTPClient(httpClient *http.Client) ClientOption {
-	return func(c *Client) {
-		c.client = httpClient
-	}
-}
-
-func WithEndpoint(endpoint string) ClientOption {
-	return func(c *Client) {
-		c.endpoint = endpoint
-	}
-}
-
-type InputOption func(d *InputData)
-
-func NewInputData(opts ...InputOption) *InputData {
-	data := &InputData{}
-	for _, opt := range opts {
-		opt(data)
-	}
-	return data
-}
-
-func WithTopP(topP float64) InputOption {
-	return func(d *InputData) {
-		d.TopP = topP
-	}
-}
-
-func WithTopK(topK int) InputOption {
-	return func(d *InputData) {
-		d.TopK = topK
-	}
-}
-
-func WithTemperature(temperature float64) InputOption {
-	return func(d *InputData) {
-		d.Temperature = temperature
-	}
-}
-
-func WithTokens(tokens int) InputOption {
-	return func(d *InputData) {
-		d.Tokens = tokens
-	}
-}
--- a/main.go
+++ b/main.go
@ -57,7 +57,7 @@ func templateString(t string, in interface{}) (string, error) {
 var modelFlags = []cli.Flag{
 	&cli.StringFlag{
 		Name:    "model",
-		EnvVars: []string{"MODEL_PATH"},
+		EnvVars: []string{"MODEL"},
 	},
 	&cli.IntFlag{
 		Name:    "tokens",
@ -134,10 +134,6 @@ echo "An Alpaca (Vicugna pacos) is a domesticated species of South American came
 						Name:    "models-path",
 						EnvVars: []string{"MODELS_PATH"},
 					},
-					&cli.StringFlag{
-						Name:    "default-model",
-						EnvVars: []string{"DEFAULT_MODEL"},
-					},
 					&cli.StringFlag{
 						Name:    "address",
 						EnvVars: []string{"ADDRESS"},
@ -150,19 +146,7 @@ echo "An Alpaca (Vicugna pacos) is a domesticated species of South American came
 					},
 				},
 				Action: func(ctx *cli.Context) error {
-
-					var defaultModel *llama.LLama
-					defModel := ctx.String("default-model")
-					if defModel != "" {
-						opts := []llama.ModelOption{llama.SetContext(ctx.Int("context-size"))}
-						var err error
-						defaultModel, err = llama.New(ctx.String("default-model"), opts...)
-						if err != nil {
-							return err
-						}
-					}
-
-					return api.Start(defaultModel, model.NewModelLoader(ctx.String("models-path")), ctx.String("address"), ctx.Int("threads"))
+					return api.Start(model.NewModelLoader(ctx.String("models-path")), ctx.String("address"), ctx.Int("threads"))
 				},
 			},
 		},