From d5da8c3509d1e23d1ebcf82a4c9d9964eb1b549a Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Thu, 17 Oct 2024 17:33:50 +0200
Subject: [PATCH] feat(templates): extract text from multimodal requests
 (#3866)

When offloading template construction to the backend, we want to keep
the text around in case of multimodal requests.

Signed-off-by: Ettore Di Giacinto
---
 core/backend/llm.go | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/core/backend/llm.go b/core/backend/llm.go
index d946d3f8..199a6233 100644
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -2,6 +2,7 @@ package backend
 
 import (
 	"context"
+	"encoding/json"
 	"fmt"
 	"os"
 	"regexp"
@@ -77,6 +78,16 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 		switch ct := message.Content.(type) {
 		case string:
 			protoMessages[i].Content = ct
+		case []interface{}:
+			// If using the tokenizer template, in case of multimodal requests we want to keep the
+			// multimodal content as-is and return only the text strings here
+			data, _ := json.Marshal(ct)
+			resultData := []struct {
+				Text string `json:"text"`
+			}{}
+			json.Unmarshal(data, &resultData)
+			for _, r := range resultData {
+				protoMessages[i].Content += r.Text
+			}
 		default:
 			return nil, fmt.Errorf("unsupported type for schema.Message.Content for inference: %T", ct)
 		}
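
For context, a minimal standalone sketch of the extraction technique the new []interface{} case relies on: marshal the content parts back to JSON, unmarshal them into a slice of structs that only capture the "text" field, and concatenate the results. This is illustrative only and not part of the patch; the helper name extractText and the sample payload are hypothetical, assuming OpenAI-style multimodal content parts.

package main

import (
	"encoding/json"
	"fmt"
)

// extractText concatenates the "text" fields of a multimodal content array.
// Parts without a "text" field (e.g. image parts) contribute an empty string.
func extractText(content []interface{}) string {
	data, err := json.Marshal(content)
	if err != nil {
		return ""
	}
	parts := []struct {
		Text string `json:"text"`
	}{}
	if err := json.Unmarshal(data, &parts); err != nil {
		return ""
	}
	out := ""
	for _, p := range parts {
		out += p.Text
	}
	return out
}

func main() {
	// Hypothetical message content mixing a text part and an image part.
	content := []interface{}{
		map[string]interface{}{"type": "text", "text": "Describe this image."},
		map[string]interface{}{"type": "image_url", "image_url": map[string]interface{}{"url": "https://example.com/cat.png"}},
	}
	fmt.Println(extractText(content)) // prints: Describe this image.
}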