feat(templates): extract text from multimodal requests (#3866)

When offloading template construction to the backend, we want to keep text around in case of multimodal requests.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-17 06:28:07 +00:00 · 2024-10-17 17:33:50 +02:00
parent 9db068388b
commit d5da8c3509
1 changed files with 11 additions and 0 deletions
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@ -2,6 +2,7 @@ package backend
 import (
 	"context"
 	"encoding/json"
 	"fmt"
 	"os"
 	"regexp"
@ -77,6 +78,16 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 			switch ct := message.Content.(type) {
 			case string:
 				protoMessages[i].Content = ct
 			case []interface{}:
 				// If using the tokenizer template, in case of multimodal requests we want to keep the multimodal content as is and return only the text strings here
 				data, _ := json.Marshal(ct)
 				resultData := []struct {
 					Text string `json:"text"`
 				}{}
 				json.Unmarshal(data, &resultData)
 				for _, r := range resultData {
 					protoMessages[i].Content += r.Text
 				}
 			default:
 				return nil, fmt.Errorf("unsupported type for schema.Message.Content for inference: %T", ct)
 			}