From 35290e146b8b575cd691c844dd611ead3c111c0b Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@users.noreply.github.com>
Date: Sun, 31 Mar 2024 13:04:09 +0200
Subject: [PATCH] fix(grammar): respect JSONmode and grammar from user input
 (#1935)

* fix(grammar): Fix JSON mode and custom grammar

* tests(aio): add jsonmode test

* tests(aio): add functioncall test

* fix(aio): use hermes-2-pro-mistral as llm for CPU profile

* add phi-2-orange
---
 aio/cpu/text-to-text.yaml                | 31 ++++++++--
 core/http/endpoints/openai/chat.go       |  2 +
 core/http/endpoints/openai/completion.go |  2 +
 embedded/models/phi-2-orange.yaml        | 30 +++++++++
 tests/e2e-aio/e2e_test.go                | 79 +++++++++++++++++++++++-
 5 files changed, 139 insertions(+), 5 deletions(-)
 create mode 100644 embedded/models/phi-2-orange.yaml
diff --git a/aio/cpu/text-to-text.yaml b/aio/cpu/text-to-text.yaml
index 4fd88500..aeb3c842 100644
--- a/aio/cpu/text-to-text.yaml
+++ b/aio/cpu/text-to-text.yaml
@@ -1,25 +1,48 @@
 name: gpt-4
 mmap: true
 parameters:
-  model: huggingface://l3utterfly/phi-2-layla-v1-chatml-gguf/phi-2-layla-v1-chatml-Q8_0.gguf
+  model: huggingface://NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/Hermes-2-Pro-Mistral-7B.Q2_K.gguf
 
 template:
   chat_message: |
-    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
+    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
+    {{ if .FunctionCall }}<tool_call>{{end}}
+    {{ if eq .RoleName "tool" }}<tool_result>{{end}}
     {{if .Content}}{{.Content}}{{end}}
+    {{if .FunctionCall}}{{toJson .FunctionCall}}{{end}}
+    {{ if .FunctionCall }}</tool_call>{{end}}
+    {{ if eq .RoleName "tool" }}</tool_result>{{end}}
     <|im_end|>
+  # https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
+  function: |
+    <|im_start|>system
+    You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
+    <tools>
+    {{range .Functions}}
+    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
+    {{end}}
+    </tools>
+    Use the following pydantic model json schema for each tool call you will make:
+    {'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}
+    For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
+    <tool_call>
+    {'arguments': <args-dict>, 'name': <function-name>}
+    </tool_call><|im_end|>
+    {{.Input}}
+    <|im_start|>assistant
+    <tool_call>
   chat: |
     {{.Input}}
     <|im_start|>assistant
   completion: |
     {{.Input}}
-context_size: 2048
+context_size: 4096
 f16: true
 stopwords:
 - <|im_end|>
 - <dummy32000>
 usage: |
       curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
-          "model": "phi-2-chat",
+          "model": "gpt-4",
           "messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
       }'
diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go
index f5f03eb4..837b6e12 100644
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -185,6 +185,8 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 			input.Grammar = grammar.JSONBNF
 		}
 
+		config.Grammar = input.Grammar
+
 		// process functions if we have any defined or if we have a function call string
 		if len(input.Functions) > 0 && config.ShouldUseFunctions() {
 			log.Debug().Msgf("Response needs to process functions")
diff --git a/core/http/endpoints/openai/completion.go b/core/http/endpoints/openai/completion.go
index a67f0993..69923475 100644
--- a/core/http/endpoints/openai/completion.go
+++ b/core/http/endpoints/openai/completion.go
@@ -73,6 +73,8 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a
 			input.Grammar = grammar.JSONBNF
 		}
 
+		config.Grammar = input.Grammar
+
 		log.Debug().Msgf("Parameter Config: %+v", config)
 
 		if input.Stream {
diff --git a/embedded/models/phi-2-orange.yaml b/embedded/models/phi-2-orange.yaml
new file mode 100644
index 00000000..9207d283
--- /dev/null
+++ b/embedded/models/phi-2-orange.yaml
@@ -0,0 +1,30 @@
+name: phi-2-chat
+mmap: true
+parameters:
+  model: huggingface://l3utterfly/phi-2-orange-GGUF/phi-2-orange.Q6_K.gguf
+
+template:
+  chat_message: |
+    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
+    {{if .Content}}{{.Content}}{{end}}
+    <|im_end|>
+  chat: |
+    {{.Input}}
+    <|im_start|>assistant
+  completion: |
+    {{.Input}}
+context_size: 4096
+f16: true
+stopwords:
+- <|im_end|>
+- <dummy32000>
+
+description: |
+  This model is a chatbot that can be used for general conversation.
+  [Model card](https://huggingface.co/l3utterfly/phi-2-orange-GGUF)
+
+usage: |
+      curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+          "model": "phi-2-chat",
+          "messages": [{"role": "user", "content": "How are you doing?"}], "temperature": 0.1
+      }'
diff --git a/tests/e2e-aio/e2e_test.go b/tests/e2e-aio/e2e_test.go
index c52d789e..8fcd1280 100644
--- a/tests/e2e-aio/e2e_test.go
+++ b/tests/e2e-aio/e2e_test.go
@@ -2,6 +2,7 @@ package e2e_test
 
 import (
 	"context"
+	"encoding/json"
 	"fmt"
 	"io"
 	"net/http"
@@ -9,8 +10,8 @@ import (
 
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
-
 	"github.com/sashabaranov/go-openai"
+	"github.com/sashabaranov/go-openai/jsonschema"
 )
 
 var _ = Describe("E2E test", func() {
@@ -40,6 +41,82 @@ var _ = Describe("E2E test", func() {
 				Expect(resp.Choices[0].Message.Content).To(Or(ContainSubstring("4"), ContainSubstring("four")), fmt.Sprint(resp.Choices[0].Message.Content))
 			})
 		})
+
+		Context("function calls", func() {
+			It("correctly invoke", func() {
+				params := jsonschema.Definition{
+					Type: jsonschema.Object,
+					Properties: map[string]jsonschema.Definition{
+						"location": {
+							Type:        jsonschema.String,
+							Description: "The city and state, e.g. San Francisco, CA",
+						},
+						"unit": {
+							Type: jsonschema.String,
+							Enum: []string{"celsius", "fahrenheit"},
+						},
+					},
+					Required: []string{"location"},
+				}
+
+				f := openai.FunctionDefinition{
+					Name:        "get_current_weather",
+					Description: "Get the current weather in a given location",
+					Parameters:  params,
+				}
+				t := openai.Tool{
+					Type:     openai.ToolTypeFunction,
+					Function: &f,
+				}
+
+				dialogue := []openai.ChatCompletionMessage{
+					{Role: openai.ChatMessageRoleUser, Content: "What is the weather in Boston today?"},
+				}
+				resp, err := client.CreateChatCompletion(context.TODO(),
+					openai.ChatCompletionRequest{
+						Model:    openai.GPT4,
+						Messages: dialogue,
+						Tools:    []openai.Tool{t},
+					},
+				)
+				Expect(err).ToNot(HaveOccurred())
+				Expect(len(resp.Choices)).To(Equal(1), fmt.Sprint(resp))
+
+				msg := resp.Choices[0].Message
+				Expect(len(msg.ToolCalls)).To(Equal(1), fmt.Sprint(msg.ToolCalls))
+				Expect(msg.ToolCalls[0].Function.Name).To(Equal("get_current_weather"), fmt.Sprint(msg.ToolCalls[0].Function.Name))
+				Expect(msg.ToolCalls[0].Function.Arguments).To(ContainSubstring("Boston"), fmt.Sprint(msg.ToolCalls[0].Function.Arguments))
+			})
+		})
+		Context("json", func() {
+			It("correctly", func() {
+				model := "gpt-4"
+
+				req := openai.ChatCompletionRequest{
+					ResponseFormat: &openai.ChatCompletionResponseFormat{Type: openai.ChatCompletionResponseFormatTypeJSONObject},
+					Model:          model,
+					Messages: []openai.ChatCompletionMessage{
+						{
+
+							Role:    "user",
+							Content: "An animal with 'name', 'gender' and 'legs' fields",
+						},
+					},
+				}
+
+				resp, err := client.CreateChatCompletion(context.TODO(), req)
+				Expect(err).ToNot(HaveOccurred())
+				Expect(len(resp.Choices)).To(Equal(1), fmt.Sprint(resp))
+
+				var i map[string]interface{}
+				err = json.Unmarshal([]byte(resp.Choices[0].Message.Content), &i)
+				Expect(err).ToNot(HaveOccurred())
+				Expect(i).To(HaveKey("name"))
+				Expect(i).To(HaveKey("gender"))
+				Expect(i).To(HaveKey("legs"))
+			})
+		})
+
 		Context("images", func() {
 			It("correctly", func() {
 				resp, err := client.CreateImage(context.TODO(),