mirror of
https://github.com/mudler/LocalAI.git
synced 2025-05-05 18:18:20 +00:00
feat(api): allow to pass videos to backends (#3601)
This prepares the API to receive videos as well for video understanding. It works similarly to images, where the request should be in the form: { "type": "video_url", "video_url": { "url": "url or base64 data" } } Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
parent
c6a819e92f
commit
fbb9facda4
@ -134,6 +134,7 @@ message PredictOptions {
|
|||||||
repeated string Images = 42;
|
repeated string Images = 42;
|
||||||
bool UseTokenizerTemplate = 43;
|
bool UseTokenizerTemplate = 43;
|
||||||
repeated Message Messages = 44;
|
repeated Message Messages = 44;
|
||||||
|
repeated string Videos = 45;
|
||||||
}
|
}
|
||||||
|
|
||||||
// The response message containing the result
|
// The response message containing the result
|
||||||
|
@ -31,7 +31,7 @@ type TokenUsage struct {
|
|||||||
Completion int
|
Completion int
|
||||||
}
|
}
|
||||||
|
|
||||||
func ModelInference(ctx context.Context, s string, messages []schema.Message, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
|
func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
|
||||||
modelFile := c.Model
|
modelFile := c.Model
|
||||||
threads := c.Threads
|
threads := c.Threads
|
||||||
if *threads == 0 && o.Threads != 0 {
|
if *threads == 0 && o.Threads != 0 {
|
||||||
@ -101,6 +101,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
|
|||||||
opts.Messages = protoMessages
|
opts.Messages = protoMessages
|
||||||
opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate
|
opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate
|
||||||
opts.Images = images
|
opts.Images = images
|
||||||
|
opts.Videos = videos
|
||||||
|
|
||||||
tokenUsage := TokenUsage{}
|
tokenUsage := TokenUsage{}
|
||||||
|
|
||||||
|
@ -640,8 +640,12 @@ func handleQuestion(config *config.BackendConfig, input *schema.OpenAIRequest, m
|
|||||||
for _, m := range input.Messages {
|
for _, m := range input.Messages {
|
||||||
images = append(images, m.StringImages...)
|
images = append(images, m.StringImages...)
|
||||||
}
|
}
|
||||||
|
videos := []string{}
|
||||||
|
for _, m := range input.Messages {
|
||||||
|
videos = append(videos, m.StringVideos...)
|
||||||
|
}
|
||||||
|
|
||||||
predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, ml, *config, o, nil)
|
predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, ml, *config, o, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error().Err(err).Msg("model inference failed")
|
log.Error().Err(err).Msg("model inference failed")
|
||||||
return "", err
|
return "", err
|
||||||
|
@ -27,9 +27,13 @@ func ComputeChoices(
|
|||||||
for _, m := range req.Messages {
|
for _, m := range req.Messages {
|
||||||
images = append(images, m.StringImages...)
|
images = append(images, m.StringImages...)
|
||||||
}
|
}
|
||||||
|
videos := []string{}
|
||||||
|
for _, m := range req.Messages {
|
||||||
|
videos = append(videos, m.StringVideos...)
|
||||||
|
}
|
||||||
|
|
||||||
// get the model function to call for the result
|
// get the model function to call for the result
|
||||||
predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, loader, *config, o, tokenCallback)
|
predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, videos, loader, *config, o, tokenCallback)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return result, backend.TokenUsage{}, err
|
return result, backend.TokenUsage{}, err
|
||||||
}
|
}
|
||||||
|
@ -135,7 +135,7 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Decode each request's message content
|
// Decode each request's message content
|
||||||
index := 0
|
imgIndex, vidIndex := 0, 0
|
||||||
for i, m := range input.Messages {
|
for i, m := range input.Messages {
|
||||||
switch content := m.Content.(type) {
|
switch content := m.Content.(type) {
|
||||||
case string:
|
case string:
|
||||||
@ -144,20 +144,34 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
|
|||||||
dat, _ := json.Marshal(content)
|
dat, _ := json.Marshal(content)
|
||||||
c := []schema.Content{}
|
c := []schema.Content{}
|
||||||
json.Unmarshal(dat, &c)
|
json.Unmarshal(dat, &c)
|
||||||
|
CONTENT:
|
||||||
for _, pp := range c {
|
for _, pp := range c {
|
||||||
if pp.Type == "text" {
|
switch pp.Type {
|
||||||
|
case "text":
|
||||||
input.Messages[i].StringContent = pp.Text
|
input.Messages[i].StringContent = pp.Text
|
||||||
} else if pp.Type == "image_url" {
|
case "video", "video_url":
|
||||||
// Detect if pp.ImageURL is an URL, if it is download the image and encode it in base64:
|
// Decode content as base64 either if it's an URL or base64 text
|
||||||
base64, err := utils.GetImageURLAsBase64(pp.ImageURL.URL)
|
base64, err := utils.GetContentURIAsBase64(pp.VideoURL.URL)
|
||||||
if err == nil {
|
if err != nil {
|
||||||
input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
|
log.Error().Msgf("Failed encoding video: %s", err)
|
||||||
// set a placeholder for each image
|
continue CONTENT
|
||||||
input.Messages[i].StringContent = fmt.Sprintf("[img-%d]", index) + input.Messages[i].StringContent
|
|
||||||
index++
|
|
||||||
} else {
|
|
||||||
log.Error().Msgf("Failed encoding image: %s", err)
|
|
||||||
}
|
}
|
||||||
|
input.Messages[i].StringVideos = append(input.Messages[i].StringVideos, base64) // TODO: make sure that we only return base64 stuff
|
||||||
|
// set a placeholder for each video
|
||||||
|
input.Messages[i].StringContent = fmt.Sprintf("[vid-%d]", vidIndex) + input.Messages[i].StringContent
|
||||||
|
vidIndex++
|
||||||
|
case "image_url", "image":
|
||||||
|
// Decode content as base64 either if it's an URL or base64 text
|
||||||
|
|
||||||
|
base64, err := utils.GetContentURIAsBase64(pp.ImageURL.URL)
|
||||||
|
if err != nil {
|
||||||
|
log.Error().Msgf("Failed encoding image: %s", err)
|
||||||
|
continue CONTENT
|
||||||
|
}
|
||||||
|
input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
|
||||||
|
// set a placeholder for each image
|
||||||
|
input.Messages[i].StringContent = fmt.Sprintf("[img-%d]", imgIndex) + input.Messages[i].StringContent
|
||||||
|
imgIndex++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -58,6 +58,7 @@ type Content struct {
|
|||||||
Type string `json:"type" yaml:"type"`
|
Type string `json:"type" yaml:"type"`
|
||||||
Text string `json:"text" yaml:"text"`
|
Text string `json:"text" yaml:"text"`
|
||||||
ImageURL ContentURL `json:"image_url" yaml:"image_url"`
|
ImageURL ContentURL `json:"image_url" yaml:"image_url"`
|
||||||
|
VideoURL ContentURL `json:"video_url" yaml:"video_url"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type ContentURL struct {
|
type ContentURL struct {
|
||||||
@ -76,6 +77,7 @@ type Message struct {
|
|||||||
|
|
||||||
StringContent string `json:"string_content,omitempty" yaml:"string_content,omitempty"`
|
StringContent string `json:"string_content,omitempty" yaml:"string_content,omitempty"`
|
||||||
StringImages []string `json:"string_images,omitempty" yaml:"string_images,omitempty"`
|
StringImages []string `json:"string_images,omitempty" yaml:"string_images,omitempty"`
|
||||||
|
StringVideos []string `json:"string_videos,omitempty" yaml:"string_videos,omitempty"`
|
||||||
|
|
||||||
// A result of a function call
|
// A result of a function call
|
||||||
FunctionCall interface{} `json:"function_call,omitempty" yaml:"function_call,omitempty"`
|
FunctionCall interface{} `json:"function_call,omitempty" yaml:"function_call,omitempty"`
|
||||||
|
@ -13,14 +13,8 @@ var base64DownloadClient http.Client = http.Client{
|
|||||||
Timeout: 30 * time.Second,
|
Timeout: 30 * time.Second,
|
||||||
}
|
}
|
||||||
|
|
||||||
// this function check if the string is an URL, if it's an URL downloads the image in memory
|
// GetContentURIAsBase64 checks whether the string is a URL. If it is, it downloads the content into memory, encodes it in base64, and returns the base64 string; otherwise it returns the string with the base64 data header stripped.
|
||||||
// encodes it in base64 and returns the base64 string
|
func GetContentURIAsBase64(s string) (string, error) {
|
||||||
|
|
||||||
// This may look weird down in pkg/utils while it is currently only used in core/config
|
|
||||||
//
|
|
||||||
// but I believe it may be useful for MQTT as well in the near future, so I'm
|
|
||||||
// extracting it while I'm thinking of it.
|
|
||||||
func GetImageURLAsBase64(s string) (string, error) {
|
|
||||||
if strings.HasPrefix(s, "http") {
|
if strings.HasPrefix(s, "http") {
|
||||||
// download the image
|
// download the image
|
||||||
resp, err := base64DownloadClient.Get(s)
|
resp, err := base64DownloadClient.Get(s)
|
||||||
|
@ -10,20 +10,20 @@ var _ = Describe("utils/base64 tests", func() {
|
|||||||
It("GetImageURLAsBase64 can strip jpeg data url prefixes", func() {
|
It("GetImageURLAsBase64 can strip jpeg data url prefixes", func() {
|
||||||
// This one doesn't actually _care_ that it's base64, so feed "bad" data in this test in order to catch a change in that behavior for informational purposes.
|
// This one doesn't actually _care_ that it's base64, so feed "bad" data in this test in order to catch a change in that behavior for informational purposes.
|
||||||
input := ""
|
input := ""
|
||||||
b64, err := GetImageURLAsBase64(input)
|
b64, err := GetContentURIAsBase64(input)
|
||||||
Expect(err).To(BeNil())
|
Expect(err).To(BeNil())
|
||||||
Expect(b64).To(Equal("FOO"))
|
Expect(b64).To(Equal("FOO"))
|
||||||
})
|
})
|
||||||
It("GetImageURLAsBase64 can strip png data url prefixes", func() {
|
It("GetImageURLAsBase64 can strip png data url prefixes", func() {
|
||||||
// This one doesn't actually _care_ that it's base64, so feed "bad" data in this test in order to catch a change in that behavior for informational purposes.
|
// This one doesn't actually _care_ that it's base64, so feed "bad" data in this test in order to catch a change in that behavior for informational purposes.
|
||||||
input := ""
|
input := ""
|
||||||
b64, err := GetImageURLAsBase64(input)
|
b64, err := GetContentURIAsBase64(input)
|
||||||
Expect(err).To(BeNil())
|
Expect(err).To(BeNil())
|
||||||
Expect(b64).To(Equal("BAR"))
|
Expect(b64).To(Equal("BAR"))
|
||||||
})
|
})
|
||||||
It("GetImageURLAsBase64 returns an error for bogus data", func() {
|
It("GetImageURLAsBase64 returns an error for bogus data", func() {
|
||||||
input := "FOO"
|
input := "FOO"
|
||||||
b64, err := GetImageURLAsBase64(input)
|
b64, err := GetContentURIAsBase64(input)
|
||||||
Expect(b64).To(Equal(""))
|
Expect(b64).To(Equal(""))
|
||||||
Expect(err).ToNot(BeNil())
|
Expect(err).ToNot(BeNil())
|
||||||
Expect(err).To(MatchError("not valid string"))
|
Expect(err).To(MatchError("not valid string"))
|
||||||
@ -31,7 +31,7 @@ var _ = Describe("utils/base64 tests", func() {
|
|||||||
It("GetImageURLAsBase64 can actually download images and calculates something", func() {
|
It("GetImageURLAsBase64 can actually download images and calculates something", func() {
|
||||||
// This test doesn't actually _check_ the results at this time, which is bad, but there wasn't a test at all before...
|
// This test doesn't actually _check_ the results at this time, which is bad, but there wasn't a test at all before...
|
||||||
input := "https://upload.wikimedia.org/wikipedia/en/2/29/Wargames.jpg"
|
input := "https://upload.wikimedia.org/wikipedia/en/2/29/Wargames.jpg"
|
||||||
b64, err := GetImageURLAsBase64(input)
|
b64, err := GetContentURIAsBase64(input)
|
||||||
Expect(err).To(BeNil())
|
Expect(err).To(BeNil())
|
||||||
Expect(b64).ToNot(BeNil())
|
Expect(b64).ToNot(BeNil())
|
||||||
})
|
})
|
||||||
|
Loading…
x
Reference in New Issue
Block a user