mirror of https://github.com/mudler/LocalAI.git
Add both API endpoints (completion, chat)
commit f43aeeb4a1 (parent c17dcc5e9d)
api.go (194 lines changed)
@@ -16,55 +16,38 @@ import (
 )
 
 type OpenAIResponse struct {
-	Created int      `json:"created"`
-	Object  string   `json:"chat.completion"`
-	ID      string   `json:"id"`
-	Model   string   `json:"model"`
-	Choices []Choice `json:"choices"`
+	Created int      `json:"created,omitempty"`
+	Object  string   `json:"chat.completion,omitempty"`
+	ID      string   `json:"id,omitempty"`
+	Model   string   `json:"model,omitempty"`
+	Choices []Choice `json:"choices,omitempty"`
 }
 
 type Choice struct {
-	Index        int     `json:"index"`
-	FinishReason string  `json:"finish_reason"`
-	Message      Message `json:"message"`
+	Index        int     `json:"index,omitempty"`
+	FinishReason string  `json:"finish_reason,omitempty"`
+	Message      Message `json:"message,omitempty"`
+	Text         string  `json:"text,omitempty"`
 }
 
 type Message struct {
-	Role    string `json:"role"`
-	Content string `json:"content"`
+	Role    string `json:"role,omitempty"`
+	Content string `json:"content,omitempty"`
 }
 
 //go:embed index.html
 var indexHTML embed.FS
 
-func api(defaultModel *llama.LLama, loader *ModelLoader, listenAddr string, threads int) error {
-	app := fiber.New()
-
-	// Default middleware config
-	app.Use(recover.New())
-	app.Use(cors.New())
-
-	app.Use("/", filesystem.New(filesystem.Config{
-		Root:         http.FS(indexHTML),
-		NotFoundFile: "index.html",
-	}))
-
-	// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
-	var mutex = &sync.Mutex{}
-	mu := map[string]*sync.Mutex{}
-	var mumutex = &sync.Mutex{}
-
-	// openAI compatible API endpoint
-	app.Post("/v1/chat/completions", func(c *fiber.Ctx) error {
-
+func completionEndpoint(defaultModel *llama.LLama, loader *ModelLoader, threads int, defaultMutex *sync.Mutex, mutexMap *sync.Mutex, mutexes map[string]*sync.Mutex) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
 		var err error
 		var model *llama.LLama
 
 		// Get input data from the request body
 		input := new(struct {
-			Messages []Message `json:"messages"`
-			Model    string    `json:"model"`
-			Prompt   string    `json:"prompt"`
+			Model  string `json:"model"`
+			Prompt string `json:"prompt"`
 		})
 		if err := c.BodyParser(input); err != nil {
 			return err
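
Not part of the commit, just an illustrative aside: a minimal standalone sketch of what the omitempty tags and the new Text field on Choice change in practice. The type definitions are copied from the hunk above; the main function and model name are made up. Note that encoding/json's omitempty never drops a zero struct value, so a completion response still carries an empty "message" object alongside "text".

package main

import (
	"encoding/json"
	"fmt"
)

// Types copied from the hunk above.
type Message struct {
	Role    string `json:"role,omitempty"`
	Content string `json:"content,omitempty"`
}

type Choice struct {
	Index        int     `json:"index,omitempty"`
	FinishReason string  `json:"finish_reason,omitempty"`
	Message      Message `json:"message,omitempty"`
	Text         string  `json:"text,omitempty"`
}

type OpenAIResponse struct {
	Created int      `json:"created,omitempty"`
	Object  string   `json:"chat.completion,omitempty"`
	ID      string   `json:"id,omitempty"`
	Model   string   `json:"model,omitempty"`
	Choices []Choice `json:"choices,omitempty"`
}

func main() {
	// Shape returned by the completion endpoint: only Text is set.
	completion := OpenAIResponse{
		Model:   "some-model.bin", // placeholder name
		Choices: []Choice{{Text: "a completion"}},
	}
	// Shape returned by the chat endpoint: only Message is set.
	chat := OpenAIResponse{
		Model:   "some-model.bin",
		Choices: []Choice{{Message: Message{Role: "assistant", Content: "a reply"}}},
	}
	for _, r := range []OpenAIResponse{completion, chat} {
		b, _ := json.Marshal(r)
		// omitempty drops unset scalar fields (created, id, index, ...), but a zero
		// struct is never "empty" to encoding/json, so the completion output still
		// contains "message":{} next to "text".
		fmt.Println(string(b))
	}
}
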
@@ -84,19 +67,114 @@ func api(defaultModel *llama.LLama, loader *ModelLoader, listenAddr string, threads int) error {
 
 		// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
 		if input.Model != "" {
-			mumutex.Lock()
-			l, ok := mu[input.Model]
+			mutexMap.Lock()
+			l, ok := mutexes[input.Model]
 			if !ok {
 				m := &sync.Mutex{}
-				mu[input.Model] = m
+				mutexes[input.Model] = m
 				l = m
 			}
-			mumutex.Unlock()
+			mutexMap.Unlock()
 			l.Lock()
 			defer l.Unlock()
 		} else {
-			mutex.Lock()
-			defer mutex.Unlock()
+			defaultMutex.Lock()
+			defer defaultMutex.Unlock()
+		}
+
+		// Set the parameters for the language model prediction
+		topP, err := strconv.ParseFloat(c.Query("topP", "0.9"), 64) // Default value of topP is 0.9
+		if err != nil {
+			return err
+		}
+
+		topK, err := strconv.Atoi(c.Query("topK", "40")) // Default value of topK is 40
+		if err != nil {
+			return err
+		}
+
+		temperature, err := strconv.ParseFloat(c.Query("temperature", "0.5"), 64) // Default value of temperature is 0.5
+		if err != nil {
+			return err
+		}
+
+		tokens, err := strconv.Atoi(c.Query("tokens", "128")) // Default value of tokens is 128
+		if err != nil {
+			return err
+		}
+
+		predInput := input.Prompt
+		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
+		templatedInput, err := loader.TemplatePrefix(input.Model, struct {
+			Input string
+		}{Input: input.Prompt})
+		if err == nil {
+			predInput = templatedInput
+		}
+
+		// Generate the prediction using the language model
+		prediction, err := model.Predict(
+			predInput,
+			llama.SetTemperature(temperature),
+			llama.SetTopP(topP),
+			llama.SetTopK(topK),
+			llama.SetTokens(tokens),
+			llama.SetThreads(threads),
+		)
+		if err != nil {
+			return err
+		}
+
+		// Return the prediction in the response body
+		return c.JSON(OpenAIResponse{
+			Model:   input.Model,
+			Choices: []Choice{{Text: prediction}},
+		})
+	}
+}
+
+func chatEndpoint(defaultModel *llama.LLama, loader *ModelLoader, threads int, defaultMutex *sync.Mutex, mutexMap *sync.Mutex, mutexes map[string]*sync.Mutex) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		var err error
+		var model *llama.LLama
+
+		// Get input data from the request body
+		input := new(struct {
+			Messages []Message `json:"messages"`
+			Model    string    `json:"model"`
+		})
+		if err := c.BodyParser(input); err != nil {
+			return err
+		}
+
+		if input.Model == "" {
+			if defaultModel == nil {
+				return fmt.Errorf("no default model loaded, and no model specified")
+			}
+			model = defaultModel
+		} else {
+			model, err = loader.LoadModel(input.Model)
+			if err != nil {
+				return err
+			}
+		}
+
+		// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
+		if input.Model != "" {
+			mutexMap.Lock()
+			l, ok := mutexes[input.Model]
+			if !ok {
+				m := &sync.Mutex{}
+				mutexes[input.Model] = m
+				l = m
+			}
+			mutexMap.Unlock()
+			l.Lock()
+			defer l.Unlock()
+		} else {
+			defaultMutex.Lock()
+			defer defaultMutex.Unlock()
 		}
 
 		// Set the parameters for the language model prediction
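
The two handlers share the same locking dance around the llama.cpp thread-safety issue referenced above. The sketch below is not from the commit; it isolates that pattern (a map of per-model mutexes guarded by its own mutex, plus a default mutex for requests that name no model) so it can be read on its own. Field names mirror the handler parameters; main is illustrative only.

package main

import (
	"fmt"
	"sync"
	"time"
)

// perModelLocker reproduces the locking pattern used by both endpoints:
// mutexMap protects the mutexes map itself, each entry serializes predictions
// for one named model, and defaultMutex serializes requests with no model set.
type perModelLocker struct {
	defaultMutex sync.Mutex
	mutexMap     sync.Mutex
	mutexes      map[string]*sync.Mutex
}

func (p *perModelLocker) lock(model string) func() {
	if model == "" {
		p.defaultMutex.Lock()
		return p.defaultMutex.Unlock
	}
	p.mutexMap.Lock()
	l, ok := p.mutexes[model]
	if !ok {
		l = &sync.Mutex{}
		p.mutexes[model] = l
	}
	p.mutexMap.Unlock()
	l.Lock()
	return l.Unlock
}

func main() {
	locker := &perModelLocker{mutexes: map[string]*sync.Mutex{}}
	var wg sync.WaitGroup
	for _, m := range []string{"model-a", "model-a", "model-b", ""} {
		wg.Add(1)
		go func(model string) {
			defer wg.Done()
			unlock := locker.lock(model)
			defer unlock()
			// Only one "prediction" per model runs at a time.
			fmt.Println("predicting with", model)
			time.Sleep(10 * time.Millisecond)
		}(m)
	}
	wg.Wait()
}
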
@@ -127,16 +205,12 @@ func api(defaultModel *llama.LLama, loader *ModelLoader, listenAddr string, threads int) error {
 
 		predInput := strings.Join(mess, "\n")
 
-		if input.Prompt == "" {
-			// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
-			templatedInput, err := loader.TemplatePrefix(input.Model, struct {
-				Input string
-			}{Input: predInput})
-			if err == nil {
-				predInput = templatedInput
-			}
-		} else {
-			predInput = input.Prompt + predInput
+		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
+		templatedInput, err := loader.TemplatePrefix(input.Model, struct {
+			Input string
+		}{Input: predInput})
+		if err == nil {
+			predInput = templatedInput
 		}
 
 		// Generate the prediction using the language model
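
loader.TemplatePrefix is LocalAI's own helper and its internals are not shown in this diff; the struct{ Input string } argument suggests a Go text/template with an {{.Input}} placeholder. Under that assumption only, here is a standalone sketch of how a hypothetical "file.bin.tmpl" prefix would wrap the joined chat messages:

package main

import (
	"bytes"
	"fmt"
	"text/template"
)

func main() {
	// Hypothetical contents of a "file.bin.tmpl" sitting next to the model file
	// (assumption: it is parsed as a Go text/template).
	const prefix = `Below is a conversation with an assistant.
{{.Input}}`

	// Stand-in for the joined chat messages; how mess is built is outside this hunk.
	predInput := "user: hello"

	tmpl, err := template.New("prefix").Parse(prefix)
	if err != nil {
		panic(err)
	}
	var out bytes.Buffer
	if err := tmpl.Execute(&out, struct{ Input string }{Input: predInput}); err != nil {
		panic(err)
	}
	fmt.Println(out.String()) // the templated prompt that would be handed to model.Predict
}
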
@@ -157,7 +231,29 @@ func api(defaultModel *llama.LLama, loader *ModelLoader, listenAddr string, threads int) error {
 			Model:   input.Model,
 			Choices: []Choice{{Message: Message{Role: "assistant", Content: prediction}}},
 		})
-	})
+	}
+}
+
+func api(defaultModel *llama.LLama, loader *ModelLoader, listenAddr string, threads int) error {
+	app := fiber.New()
+
+	// Default middleware config
+	app.Use(recover.New())
+	app.Use(cors.New())
+
+	app.Use("/", filesystem.New(filesystem.Config{
+		Root:         http.FS(indexHTML),
+		NotFoundFile: "index.html",
+	}))
+
+	// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
+	var mutex = &sync.Mutex{}
+	mu := map[string]*sync.Mutex{}
+	var mumutex = &sync.Mutex{}
+
+	// openAI compatible API endpoint
+	app.Post("/v1/chat/completions", chatEndpoint(defaultModel, loader, threads, mutex, mumutex, mu))
+	app.Post("/v1/completions", completionEndpoint(defaultModel, loader, threads, mutex, mumutex, mu))
 
 	/*
 		curl --location --request POST 'http://localhost:8080/predict' --header 'Content-Type: application/json' --data-raw '{
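
Not part of the diff: a hedged sketch of how a client might call the two new routes, assuming the server listens on localhost:8080 (as in the curl comment above). The model name is a placeholder; the request bodies and query parameters (tokens, temperature, topK, topP) follow the handler code in the hunks above.

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

// Minimal view of the response struct from the diff above.
type openAIResponse struct {
	Model   string `json:"model,omitempty"`
	Choices []struct {
		Text    string `json:"text,omitempty"`
		Message struct {
			Role    string `json:"role,omitempty"`
			Content string `json:"content,omitempty"`
		} `json:"message,omitempty"`
	} `json:"choices,omitempty"`
}

func post(url string, payload any) (*openAIResponse, error) {
	body, err := json.Marshal(payload)
	if err != nil {
		return nil, err
	}
	resp, err := http.Post(url, "application/json", bytes.NewReader(body))
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	out := new(openAIResponse)
	return out, json.NewDecoder(resp.Body).Decode(out)
}

func main() {
	base := "http://localhost:8080" // assumed listen address

	// /v1/completions takes {"model": ..., "prompt": ...}; sampling knobs go in the query string.
	if r, err := post(base+"/v1/completions?tokens=128&temperature=0.5",
		map[string]any{"model": "some-model.bin", "prompt": "Once upon a time"}); err == nil && len(r.Choices) > 0 {
		fmt.Println("completion:", r.Choices[0].Text)
	}

	// /v1/chat/completions takes {"model": ..., "messages": [{"role": ..., "content": ...}]}.
	if r, err := post(base+"/v1/chat/completions",
		map[string]any{
			"model":    "some-model.bin",
			"messages": []map[string]string{{"role": "user", "content": "Hello"}},
		}); err == nil && len(r.Choices) > 0 {
		fmt.Println("chat:", r.Choices[0].Message.Content)
	}
}
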