feat(templates): extract text from multimodal requests (#3866)

When offloading template construction to the backend, we still want to extract the text parts of multimodal requests so that each message keeps its textual content. Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-13 14:03:15 +00:00 · 2024-10-17 17:33:50 +02:00 · 2024-10-17 17:33:50 +02:00 · d5da8c3509
commit d5da8c3509
parent 9db068388b
1 changed files with 11 additions and 0 deletions
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@ -2,6 +2,7 @@ package backend

 import (
 	"context"
+	"encoding/json"
 	"fmt"
 	"os"
 	"regexp"
@ -77,6 +78,16 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 			switch ct := message.Content.(type) {
 			case string:
 				protoMessages[i].Content = ct
+			case []interface{}:
+				// When using the tokenizer template with a multimodal request, leave the multimodal content as is and collect only the text parts into the message content here
+				data, _ := json.Marshal(ct)
+				resultData := []struct {
+					Text string `json:"text"`
+				}{}
+				json.Unmarshal(data, &resultData)
+				for _, r := range resultData {
+					protoMessages[i].Content += r.Text
+				}
 			default:
 				return nil, fmt.Errorf("unsupported type for schema.Message.Content for inference: %T", ct)
 			}