LocalAI/core/services/openai.go
package services
import (
"encoding/json"
"errors"
"fmt"
"strings"
"sync"
"time"
"github.com/go-skynet/LocalAI/core/backend"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/go-skynet/LocalAI/pkg/concurrency"
"github.com/go-skynet/LocalAI/pkg/grammar"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/go-skynet/LocalAI/pkg/utils"
"github.com/google/uuid"
"github.com/imdario/mergo"
"github.com/rs/zerolog/log"
)
type endpointGenerationConfigurationFn func(bc *config.BackendConfig, request *schema.OpenAIRequest) endpointConfiguration
type endpointConfiguration struct {
SchemaObject string
TemplatePath string
TemplateData model.PromptTemplateData
ResultMappingFn func(resp *backend.LLMResponse, index int) schema.Choice
CompletionMappingFn func(resp concurrency.ErrorOr[*backend.LLMResponse]) concurrency.ErrorOr[*schema.OpenAIResponse]
TokenMappingFn func(resp concurrency.ErrorOr[*backend.LLMResponse]) concurrency.ErrorOr[*schema.OpenAIResponse]
}
// TODO: This is shared by completion and edit. Some parts are probably still missing; fix it later.
func simpleMapper(resp concurrency.ErrorOr[*backend.LLMResponse]) concurrency.ErrorOr[*schema.OpenAIResponse] {
if resp.Error != nil || resp.Value == nil {
return concurrency.ErrorOr[*schema.OpenAIResponse]{Error: resp.Error}
}
return concurrency.ErrorOr[*schema.OpenAIResponse]{
Value: &schema.OpenAIResponse{
Choices: []schema.Choice{
{
Text: resp.Value.Response,
},
},
Usage: schema.OpenAIUsage{
PromptTokens: resp.Value.Usage.Prompt,
CompletionTokens: resp.Value.Usage.Completion,
TotalTokens: resp.Value.Usage.Prompt + resp.Value.Usage.Completion,
},
},
}
}
// TODO: Consider alternative names for this.
// This struct holds the OpenAI request trace/context information (an ID and creation timestamp).
// It keeps things simple within core/services/openai.go and lets consumers "see" this information if they need it.
type OpenAIRequestTraceID struct {
ID string
Created int
}
// This type was split out from core/backend/llm.go - I'm still not _totally_ sure about this, but it seems to make sense to keep the generic LLM code separate from the OpenAI-specific higher-level functionality
type OpenAIService struct {
bcl *config.BackendConfigLoader
ml *model.ModelLoader
appConfig *config.ApplicationConfig
llmbs *backend.LLMBackendService
}
func NewOpenAIService(ml *model.ModelLoader, bcl *config.BackendConfigLoader, appConfig *config.ApplicationConfig, llmbs *backend.LLMBackendService) *OpenAIService {
return &OpenAIService{
bcl: bcl,
ml: ml,
appConfig: appConfig,
llmbs: llmbs,
}
}
// Keeping this wrapper in place as a reminder that more request validation could potentially be added here.
func (oais *OpenAIService) getConfig(request *schema.OpenAIRequest) (*config.BackendConfig, *schema.OpenAIRequest, error) {
return oais.bcl.LoadBackendConfigForModelAndOpenAIRequest(request.Model, request, oais.appConfig)
}
// TODO: It would be a lot less messy to return a struct holding references to each of these channels.
// INTENTIONALLY not doing that quite yet - letting references to unused channels go out of scope should allow the GC to collect them automatically, since there is no way to manually free() them.
// finalResultChannel is the primary async return path: one result for the entire request.
// promptResultsChannels is dubious: it is the raw fan-out used within the function itself, exposed mainly for testing. It carries one LLMResponseBundle per PromptString, holding all N completions for that prompt.
// completionsChannel emits one mapped response per generated completion, across prompts and across N. It seems the most useful path other than the "entire request" result; the trace ID is available if callers want to attempt tracing.
// tokenChannel emits one mapped response per generated token. Let's see what happens!
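// A hedged consumer sketch (hypothetical caller code, not part of this file). The completions
// and token channels are nil unless the corresponding notifyOn* flag is set, and the merge
// helpers are assumed to close them once all producers finish:
//
//	traceID, finalCh, _, _, tokenCh, err := oais.Completion(req, false, true)
//	if err != nil {
//		return err
//	}
//	go func() {
//		for tok := range tokenCh {
//			if tok.Error == nil {
//				// stream tok.Value (*schema.OpenAIResponse) to the client
//			}
//		}
//	}()
//	final := <-finalCh
//	if final.Error != nil {
//		return final.Error
//	}
//	_ = traceID // traceID.ID and traceID.Created identify this request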
func (oais *OpenAIService) Completion(request *schema.OpenAIRequest, notifyOnPromptResult bool, notifyOnToken bool) (
traceID *OpenAIRequestTraceID, finalResultChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], promptResultsChannels []<-chan concurrency.ErrorOr[*backend.LLMResponseBundle],
completionsChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], tokenChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], err error) {
return oais.GenerateTextFromRequest(request, func(bc *config.BackendConfig, request *schema.OpenAIRequest) endpointConfiguration {
return endpointConfiguration{
SchemaObject: "text_completion",
TemplatePath: bc.TemplateConfig.Completion,
TemplateData: model.PromptTemplateData{
SystemPrompt: bc.SystemPrompt,
},
ResultMappingFn: func(resp *backend.LLMResponse, promptIndex int) schema.Choice {
return schema.Choice{
Index: promptIndex,
FinishReason: "stop",
Text: resp.Response,
}
},
CompletionMappingFn: simpleMapper,
TokenMappingFn: simpleMapper,
}
}, notifyOnPromptResult, notifyOnToken, nil)
}
func (oais *OpenAIService) Edit(request *schema.OpenAIRequest, notifyOnPromptResult bool, notifyOnToken bool) (
traceID *OpenAIRequestTraceID, finalResultChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], promptResultsChannels []<-chan concurrency.ErrorOr[*backend.LLMResponseBundle],
completionsChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], tokenChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], err error) {
return oais.GenerateTextFromRequest(request, func(bc *config.BackendConfig, request *schema.OpenAIRequest) endpointConfiguration {
return endpointConfiguration{
SchemaObject: "edit",
TemplatePath: bc.TemplateConfig.Edit,
TemplateData: model.PromptTemplateData{
SystemPrompt: bc.SystemPrompt,
Instruction: request.Instruction,
},
ResultMappingFn: func(resp *backend.LLMResponse, promptIndex int) schema.Choice {
return schema.Choice{
Index: promptIndex,
FinishReason: "stop",
Text: resp.Response,
}
},
CompletionMappingFn: simpleMapper,
TokenMappingFn: simpleMapper,
}
}, notifyOnPromptResult, notifyOnToken, nil)
}
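// Chat answers an OpenAI chat-completions style request. A hedged streaming sketch
// (hypothetical caller code; the token channel is assumed to be closed once generation ends):
//
//	_, finalCh, _, tokenCh, err := oais.Chat(req, false, req.Stream)
//	if err != nil {
//		return err
//	}
//	if req.Stream {
//		for chunk := range tokenCh {
//			// write each chunk.Value as a "chat.completion.chunk" SSE event
//		}
//	}
//	for res := range finalCh {
//		// one result per choice; non-streaming callers aggregate these into the reply
//	}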
func (oais *OpenAIService) Chat(request *schema.OpenAIRequest, notifyOnPromptResult bool, notifyOnToken bool) (
traceID *OpenAIRequestTraceID, finalResultChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse],
completionsChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], tokenChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], err error) {
return oais.GenerateFromMultipleMessagesChatRequest(request, notifyOnPromptResult, notifyOnToken, nil)
}
func (oais *OpenAIService) GenerateTextFromRequest(request *schema.OpenAIRequest, endpointConfigFn endpointGenerationConfigurationFn, notifyOnPromptResult bool, notifyOnToken bool, initialTraceID *OpenAIRequestTraceID) (
traceID *OpenAIRequestTraceID, finalResultChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], promptResultsChannels []<-chan concurrency.ErrorOr[*backend.LLMResponseBundle],
completionsChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], tokenChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], err error) {
if initialTraceID == nil {
traceID = &OpenAIRequestTraceID{
ID: uuid.New().String(),
Created: int(time.Now().Unix()),
}
} else {
traceID = initialTraceID
}
bc, request, err := oais.getConfig(request)
if err != nil {
log.Error().Msgf("[oais::GenerateTextFromRequest] error getting configuration: %q", err)
return
}
if request.ResponseFormat.Type == "json_object" {
request.Grammar = grammar.JSONBNF
}
bc.Grammar = request.Grammar
if request.Stream && len(bc.PromptStrings) > 1 {
log.Warn().Msg("potentially cannot handle more than 1 `PromptStrings` when Streaming?")
}
rawFinalResultChannel := make(chan concurrency.ErrorOr[*schema.OpenAIResponse])
finalResultChannel = rawFinalResultChannel
promptResultsChannels = []<-chan concurrency.ErrorOr[*backend.LLMResponseBundle]{}
var rawCompletionsChannel chan concurrency.ErrorOr[*schema.OpenAIResponse]
var rawTokenChannel chan concurrency.ErrorOr[*schema.OpenAIResponse]
if notifyOnPromptResult {
rawCompletionsChannel = make(chan concurrency.ErrorOr[*schema.OpenAIResponse])
}
if notifyOnToken {
rawTokenChannel = make(chan concurrency.ErrorOr[*schema.OpenAIResponse])
}
promptResultsChannelLock := sync.Mutex{}
endpointConfig := endpointConfigFn(bc, request)
if len(endpointConfig.TemplatePath) == 0 {
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
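// e.g. a model named "luna.bin" (hypothetical) with a sidecar "luna.bin.tmpl" in the model path would use that template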
if oais.ml.ExistsInModelPath(fmt.Sprintf("%s.tmpl", bc.Model)) {
endpointConfig.TemplatePath = bc.Model
} else {
log.Warn().Msgf("failed to find any template for %+v", request)
}
}
setupWG := sync.WaitGroup{}
var prompts []string
if lPS := len(bc.PromptStrings); lPS > 0 {
setupWG.Add(lPS)
prompts = bc.PromptStrings
} else {
setupWG.Add(len(bc.InputStrings))
prompts = bc.InputStrings
}
var setupError error = nil
for pI, p := range prompts {
go func(promptIndex int, prompt string) {
if endpointConfig.TemplatePath != "" {
promptTemplateData := model.PromptTemplateData{
Input: prompt,
}
err := mergo.Merge(&promptTemplateData, endpointConfig.TemplateData, mergo.WithOverride)
if err == nil {
templatedInput, err := oais.ml.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, endpointConfig.TemplatePath, promptTemplateData)
if err == nil {
prompt = templatedInput
log.Debug().Msgf("Template found, input modified to: %s", prompt)
}
}
}
log.Debug().Msgf("[OAIS GenerateTextFromRequest] Prompt: %q", prompt)
promptResultsChannel, completionChannels, tokenChannels, err := oais.llmbs.GenerateText(prompt, request, bc,
func(r *backend.LLMResponse) schema.Choice {
return endpointConfig.ResultMappingFn(r, promptIndex)
}, notifyOnPromptResult, notifyOnToken)
if err != nil {
log.Error().Msgf("Unable to generate text prompt: %q\nerr: %q", prompt, err)
promptResultsChannelLock.Lock()
setupError = errors.Join(setupError, err)
promptResultsChannelLock.Unlock()
setupWG.Done()
return
}
if notifyOnPromptResult {
concurrency.SliceOfChannelsRawMergerWithoutMapping(concurrency.SliceOfChannelsTransformer(completionChannels, endpointConfig.CompletionMappingFn), rawCompletionsChannel, true)
}
if notifyOnToken {
concurrency.SliceOfChannelsRawMergerWithoutMapping(concurrency.SliceOfChannelsTransformer(tokenChannels, endpointConfig.TokenMappingFn), rawTokenChannel, true)
}
promptResultsChannelLock.Lock()
promptResultsChannels = append(promptResultsChannels, promptResultsChannel)
promptResultsChannelLock.Unlock()
setupWG.Done()
}(pI, p)
}
setupWG.Wait()
// If any of the setup goroutines experienced an error, quit early here.
if setupError != nil {
go func() {
log.Error().Msgf("[OAIS GenerateTextFromRequest] caught an error during setup: %q", setupError)
rawFinalResultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: setupError}
close(rawFinalResultChannel)
}()
return
}
initialResponse := &schema.OpenAIResponse{
ID: traceID.ID,
Created: traceID.Created,
Model: request.Model,
Object: endpointConfig.SchemaObject,
Usage: schema.OpenAIUsage{},
}
// utils.SliceOfChannelsRawMerger[[]schema.Choice](promptResultsChannels, rawFinalResultChannel, func(results []schema.Choice) (*schema.OpenAIResponse, error) {
concurrency.SliceOfChannelsReducer(
promptResultsChannels, rawFinalResultChannel,
func(iv concurrency.ErrorOr[*backend.LLMResponseBundle], result concurrency.ErrorOr[*schema.OpenAIResponse]) concurrency.ErrorOr[*schema.OpenAIResponse] {
if iv.Error != nil {
result.Error = iv.Error
return result
}
result.Value.Usage.PromptTokens += iv.Value.Usage.Prompt
result.Value.Usage.CompletionTokens += iv.Value.Usage.Completion
result.Value.Usage.TotalTokens = result.Value.Usage.PromptTokens + result.Value.Usage.CompletionTokens
result.Value.Choices = append(result.Value.Choices, iv.Value.Response...)
return result
}, concurrency.ErrorOr[*schema.OpenAIResponse]{Value: initialResponse}, true)
completionsChannel = rawCompletionsChannel
tokenChannel = rawTokenChannel
return
}
// TODO: For porting sanity, this is distinct from GenerateTextFromRequest and is _currently_ specific to Chat purposes
// this is not a final decision -- just a reality of moving a lot of parts at once
// This has _become_ Chat, which wasn't the goal... More cleanup in the future once it's stable?
func (oais *OpenAIService) GenerateFromMultipleMessagesChatRequest(request *schema.OpenAIRequest, notifyOnPromptResult bool, notifyOnToken bool, initialTraceID *OpenAIRequestTraceID) (
traceID *OpenAIRequestTraceID, finalResultChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse],
completionsChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], tokenChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], err error) {
if initialTraceID == nil {
traceID = &OpenAIRequestTraceID{
ID: uuid.New().String(),
Created: int(time.Now().Unix()),
}
} else {
traceID = initialTraceID
}
bc, request, err := oais.getConfig(request)
if err != nil {
return
}
// Allow the user to set custom actions via config file
// to be "embedded" in each model
noActionName := "answer"
noActionDescription := "use this action to answer without performing any action"
if bc.FunctionsConfig.NoActionFunctionName != "" {
noActionName = bc.FunctionsConfig.NoActionFunctionName
}
if bc.FunctionsConfig.NoActionDescriptionName != "" {
noActionDescription = bc.FunctionsConfig.NoActionDescriptionName
}
if request.ResponseFormat.Type == "json_object" {
request.Grammar = grammar.JSONBNF
}
bc.Grammar = request.Grammar
processFunctions := false
funcs := grammar.Functions{}
// process functions if we have any defined or if we have a function call string
if len(request.Functions) > 0 && bc.ShouldUseFunctions() {
log.Debug().Msgf("Response needs to process functions")
processFunctions = true
noActionGrammar := grammar.Function{
Name: noActionName,
Description: noActionDescription,
Parameters: map[string]interface{}{
"properties": map[string]interface{}{
"message": map[string]interface{}{
"type": "string",
"description": "The message to reply the user with",
}},
},
}
// Append the no action function
funcs = append(funcs, request.Functions...)
if !bc.FunctionsConfig.DisableNoAction {
funcs = append(funcs, noActionGrammar)
}
// Force picking one of the functions by the request
if bc.FunctionToCall() != "" {
funcs = funcs.Select(bc.FunctionToCall())
}
// Update input grammar
jsStruct := funcs.ToJSONStructure()
bc.Grammar = jsStruct.Grammar("", bc.FunctionsConfig.ParallelCalls)
} else if request.JSONFunctionGrammarObject != nil {
bc.Grammar = request.JSONFunctionGrammarObject.Grammar("", bc.FunctionsConfig.ParallelCalls)
}
if request.Stream && processFunctions {
log.Warn().Msg("Streaming + Functions is highly experimental in this version")
}
var predInput string
if !bc.TemplateConfig.UseTokenizerTemplate || processFunctions {
suppressConfigSystemPrompt := false
mess := []string{}
for messageIndex, i := range request.Messages {
var content string
role := i.Role
// if this is a function call, we might want to customize the role so we can better convey that the "assistant called a json action"
// if an "assistant_function_call" role is defined, we use it; otherwise we use the role passed in the request
if (i.FunctionCall != nil || i.ToolCalls != nil) && i.Role == "assistant" {
roleFn := "assistant_function_call"
r := bc.Roles[roleFn]
if r != "" {
role = roleFn
}
}
r := bc.Roles[role]
contentExists := i.Content != nil && i.StringContent != ""
fcall := i.FunctionCall
if len(i.ToolCalls) > 0 {
fcall = i.ToolCalls
}
// First attempt to populate content via a chat message specific template
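// A hedged illustration of such a template (presumably Go text/template over ChatMessageTemplateData;
// the template text itself is hypothetical): "<|im_start|>{{.RoleName}}\n{{.Content}}<|im_end|>"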
if bc.TemplateConfig.ChatMessage != "" {
chatMessageData := model.ChatMessageTemplateData{
SystemPrompt: bc.SystemPrompt,
Role: r,
RoleName: role,
Content: i.StringContent,
FunctionCall: fcall,
FunctionName: i.Name,
LastMessage: messageIndex == (len(request.Messages) - 1),
Function: bc.Grammar != "" && (messageIndex == (len(request.Messages) - 1)),
MessageIndex: messageIndex,
}
templatedChatMessage, err := oais.ml.EvaluateTemplateForChatMessage(bc.TemplateConfig.ChatMessage, chatMessageData)
if err != nil {
log.Error().Msgf("error processing message %+v using template \"%s\": %v. Skipping!", chatMessageData, bc.TemplateConfig.ChatMessage, err)
} else {
if templatedChatMessage == "" {
log.Warn().Msgf("template \"%s\" produced blank output for %+v. Skipping!", bc.TemplateConfig.ChatMessage, chatMessageData)
continue // TODO: This continue is here intentionally to skip over the line `mess = append(mess, content)` below, and to prevent the sprintf
}
log.Debug().Msgf("templated message for chat: %s", templatedChatMessage)
content = templatedChatMessage
}
}
marshalAnyRole := func(f any) {
j, err := json.Marshal(f)
if err == nil {
if contentExists {
content += "\n" + fmt.Sprint(r, " ", string(j))
} else {
content = fmt.Sprint(r, " ", string(j))
}
}
}
marshalAny := func(f any) {
j, err := json.Marshal(f)
if err == nil {
if contentExists {
content += "\n" + string(j)
} else {
content = string(j)
}
}
}
// If this model doesn't have such a template, or if that template fails to return a value, template at the message level.
if content == "" {
if r != "" {
if contentExists {
content = fmt.Sprint(r, i.StringContent)
}
if i.FunctionCall != nil {
marshalAnyRole(i.FunctionCall)
}
} else {
if contentExists {
content = fmt.Sprint(i.StringContent)
}
if i.FunctionCall != nil {
marshalAny(i.FunctionCall)
}
if i.ToolCalls != nil {
marshalAny(i.ToolCalls)
}
}
// Special Handling: System. We care whether it was printed at all, not which branch handled it, so check separately
if contentExists && role == "system" {
suppressConfigSystemPrompt = true
}
}
mess = append(mess, content)
}
predInput = strings.Join(mess, "\n")
log.Debug().Msgf("Prompt (before templating): %s", predInput)
templateFile := ""
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
if oais.ml.ExistsInModelPath(fmt.Sprintf("%s.tmpl", bc.Model)) {
templateFile = bc.Model
}
if bc.TemplateConfig.Chat != "" && !processFunctions {
templateFile = bc.TemplateConfig.Chat
}
if bc.TemplateConfig.Functions != "" && processFunctions {
templateFile = bc.TemplateConfig.Functions
}
if templateFile != "" {
templatedInput, err := oais.ml.EvaluateTemplateForPrompt(model.ChatPromptTemplate, templateFile, model.PromptTemplateData{
SystemPrompt: bc.SystemPrompt,
SuppressSystemPrompt: suppressConfigSystemPrompt,
Input: predInput,
Functions: funcs,
})
if err == nil {
predInput = templatedInput
log.Debug().Msgf("Template found, input modified to: %s", predInput)
} else {
log.Debug().Msgf("Template failed loading: %s", err.Error())
}
}
}
log.Debug().Msgf("Prompt (after templating): %s", predInput)
if processFunctions {
log.Debug().Msgf("Grammar: %+v", bc.Grammar)
}
rawFinalResultChannel := make(chan concurrency.ErrorOr[*schema.OpenAIResponse])
var rawCompletionsChannel chan concurrency.ErrorOr[*schema.OpenAIResponse]
var rawTokenChannel chan concurrency.ErrorOr[*schema.OpenAIResponse]
if notifyOnPromptResult {
rawCompletionsChannel = make(chan concurrency.ErrorOr[*schema.OpenAIResponse])
}
if notifyOnToken {
rawTokenChannel = make(chan concurrency.ErrorOr[*schema.OpenAIResponse])
}
rawResultChannel, individualCompletionChannels, tokenChannels, err := oais.llmbs.GenerateText(predInput, request, bc, func(resp *backend.LLMResponse) schema.Choice {
return schema.Choice{
Index: 0, // ???
FinishReason: "stop",
Message: &schema.Message{
Role: "assistant",
Content: resp.Response,
},
}
}, notifyOnPromptResult, notifyOnToken)
chatSimpleMappingFn := func(resp concurrency.ErrorOr[*backend.LLMResponse]) concurrency.ErrorOr[*schema.OpenAIResponse] {
if resp.Error != nil || resp.Value == nil {
return concurrency.ErrorOr[*schema.OpenAIResponse]{Error: resp.Error}
}
return concurrency.ErrorOr[*schema.OpenAIResponse]{
Value: &schema.OpenAIResponse{
ID: traceID.ID,
Created: traceID.Created,
Model: request.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: []schema.Choice{
{
Delta: &schema.Message{
Role: "assistant",
Content: resp.Value.Response,
},
Index: 0,
},
},
Object: "chat.completion.chunk",
Usage: schema.OpenAIUsage{
PromptTokens: resp.Value.Usage.Prompt,
CompletionTokens: resp.Value.Usage.Completion,
TotalTokens: resp.Value.Usage.Prompt + resp.Value.Usage.Completion,
},
},
}
}
if notifyOnPromptResult {
concurrency.SliceOfChannelsRawMergerWithoutMapping(concurrency.SliceOfChannelsTransformer(individualCompletionChannels, chatSimpleMappingFn), rawCompletionsChannel, true)
}
if notifyOnToken {
concurrency.SliceOfChannelsRawMergerWithoutMapping(concurrency.SliceOfChannelsTransformer(tokenChannels, chatSimpleMappingFn), rawTokenChannel, true)
}
go func() {
rawResult := <-rawResultChannel
if rawResult.Error != nil {
log.Warn().Msgf("[OAIS GenerateFromMultipleMessagesChatRequest] GenerateText error [DEBUG THIS?] %q", rawResult.Error)
rawFinalResultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: rawResult.Error}
close(rawFinalResultChannel)
return
}
llmResponseChoices := rawResult.Value.Response
if processFunctions && len(llmResponseChoices) > 1 {
log.Warn().Msgf("chat functions response with %d choices in response, debug this?", len(llmResponseChoices))
log.Debug().Msgf("%+v", llmResponseChoices)
}
for _, result := range rawResult.Value.Response {
// If no functions, just return the raw result.
if !processFunctions {
resp := schema.OpenAIResponse{
ID: traceID.ID,
Created: traceID.Created,
Model: request.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: []schema.Choice{result},
Object: "chat.completion.chunk",
Usage: schema.OpenAIUsage{
PromptTokens: rawResult.Value.Usage.Prompt,
CompletionTokens: rawResult.Value.Usage.Completion,
TotalTokens: rawResult.Value.Usage.Prompt + rawResult.Value.Usage.Completion,
},
}
rawFinalResultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Value: &resp}
continue
}
// At this point, things are function specific!
// Oh no this can't be the right way to do this... but it works. Save us, mudler!
fString := fmt.Sprintf("%s", result.Message.Content)
results := parseFunctionCall(fString, bc.FunctionsConfig.ParallelCalls)
noActionToRun := (len(results) > 0 && results[0].name == noActionName)
if noActionToRun {
log.Debug().Msg("-- noActionToRun branch --")
initialMessage := schema.OpenAIResponse{
ID: traceID.ID,
Created: traceID.Created,
Model: request.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant", Content: ""}}},
Object: "stop",
}
rawFinalResultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Value: &initialMessage}
result, err := oais.handleQuestion(bc, request, results[0].arguments, predInput)
if err != nil {
log.Error().Msgf("error handling question: %s", err.Error())
return
}
resp := schema.OpenAIResponse{
ID: traceID.ID,
Created: traceID.Created,
Model: request.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: []schema.Choice{{Delta: &schema.Message{Content: &result}, Index: 0}},
Object: "chat.completion.chunk",
Usage: schema.OpenAIUsage{
PromptTokens: rawResult.Value.Usage.Prompt,
CompletionTokens: rawResult.Value.Usage.Completion,
TotalTokens: rawResult.Value.Usage.Prompt + rawResult.Value.Usage.Completion,
},
}
rawFinalResultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Value: &resp}
} else {
log.Debug().Msgf("[GenerateFromMultipleMessagesChatRequest] fnResultsBranch: %+v", results)
for i, ss := range results {
name, args := ss.name, ss.arguments
initialMessage := schema.OpenAIResponse{
ID: traceID.ID,
Created: traceID.Created,
Model: request.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: []schema.Choice{{
FinishReason: "function_call",
Message: &schema.Message{
Role: "assistant",
ToolCalls: []schema.ToolCall{
{
Index: i,
ID: traceID.ID,
Type: "function",
FunctionCall: schema.FunctionCall{
Name: name,
Arguments: args,
},
},
},
}}},
Object: "chat.completion.chunk",
}
rawFinalResultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Value: &initialMessage}
}
}
}
close(rawFinalResultChannel)
}()
finalResultChannel = rawFinalResultChannel
completionsChannel = rawCompletionsChannel
tokenChannel = rawTokenChannel
return
}
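// handleQuestion computes a plain-text reply after the model picked the no-action function.
// A hedged sketch of the args payload it expects (shape inferred from the no-action grammar
// defined above; only the "message" key is actually read):
//
//	{"message": "Hi! How can I help you today?"}
//
// If "message" is missing or empty, the model is asked again (without a grammar) to produce a free-form reply.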
func (oais *OpenAIService) handleQuestion(config *config.BackendConfig, input *schema.OpenAIRequest, args, prompt string) (string, error) {
log.Debug().Msgf("[handleQuestion called] nothing to do, computing a reply")
// If there is a message that the LLM already sends as part of the JSON reply, use it
arguments := map[string]interface{}{}
json.Unmarshal([]byte(args), &arguments)
m, exists := arguments["message"]
if exists {
switch message := m.(type) {
case string:
if message != "" {
log.Debug().Msgf("Reply received from LLM: %s", message)
message = oais.llmbs.Finetune(*config, prompt, message)
log.Debug().Msgf("Reply received from LLM(finetuned): %s", message)
return message, nil
}
}
}
log.Debug().Msgf("No action received from LLM, without a message, computing a reply")
// Otherwise ask the LLM to understand the JSON output and the context, and return a message
// Note: This costs (in terms of CPU/GPU) another computation
config.Grammar = ""
images := []string{}
for _, m := range input.Messages {
images = append(images, m.StringImages...)
}
resultChannel, _, err := oais.llmbs.Inference(input.Context, &backend.LLMRequest{
Text: prompt,
Images: images,
RawMessages: input.Messages, // Experimental
}, config, false)
if err != nil {
log.Error().Msgf("inference setup error: %s", err.Error())
return "", err
}
raw := <-resultChannel
if raw.Error != nil {
log.Error().Msgf("inference error: %q", raw.Error.Error())
return "", raw.Error
}
if raw.Value == nil {
log.Warn().Msgf("nil inference response")
return "", nil
}
return oais.llmbs.Finetune(*config, prompt, raw.Value.Response), nil
}
type funcCallResults struct {
name string
arguments string
}
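// parseFunctionCall extracts function-call results from the grammar-constrained LLM output.
// A hedged sketch of the input shapes it handles (inferred from the keys read below; the
// function names and argument payloads are illustrative only):
//
//	// multipleResults == false: a single object
//	{"function": "answer", "arguments": {"message": "hello"}}
//
//	// multipleResults == true: an array of such objects
//	[{"function": "search", "arguments": {"query": "weather"}}, {"function": "answer", "arguments": {"message": "..."}}]
//
// In both cases the "arguments" value is re-marshalled into the JSON string stored in funcCallResults.arguments.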
func parseFunctionCall(llmresult string, multipleResults bool) []funcCallResults {
results := []funcCallResults{}
// TODO: use generics to avoid this code duplication
if multipleResults {
ss := []map[string]interface{}{}
s := utils.EscapeNewLines(llmresult)
json.Unmarshal([]byte(s), &ss)
for _, s := range ss {
func_name, ok := s["function"]
if !ok {
continue
}
args, ok := s["arguments"]
if !ok {
continue
}
d, _ := json.Marshal(args)
funcName, ok := func_name.(string)
if !ok {
continue
}
results = append(results, funcCallResults{name: funcName, arguments: string(d)})
}
} else {
// As we have to change the result before processing, we can't stream the answer token-by-token (yet?)
ss := map[string]interface{}{}
// This prevents newlines from breaking JSON parsing for clients
// s := utils.EscapeNewLines(llmresult)
json.Unmarshal([]byte(llmresult), &ss)
// The grammar defines the function name as "function", while OpenAI returns "name"
func_name, ok := ss["function"]
if !ok {
log.Debug().Msg("ss[function] is not OK!")
return results
}
// Similarly, while arguments is a map[string]interface{} here, OpenAI actually wants a stringified object
args, ok := ss["arguments"] // arguments needs to be a string, but we return an object from the grammar result (TODO: fix)
if !ok {
log.Debug().Msg("ss[arguments] is not OK!")
return results
}
d, _ := json.Marshal(args)
funcName, ok := func_name.(string)
if !ok {
log.Debug().Msgf("unexpected func_name: %+v", func_name)
return results
}
results = append(results, funcCallResults{name: funcName, arguments: string(d)})
}
return results
}