models(gallery): add mistral-0.3 and command-r, update functions (#2388)

* models(gallery): add mistral-0.3 and command-r, update functions Add also disable_parallel_new_lines to disable newlines in the JSON output when forcing parallel tools. Some models (like mistral) might be very sensitive to that when being used for function calling. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * models(gallery): add aya-23-8b Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-16 14:08:09 +00:00 · 2024-05-23 19:16:08 +02:00
parent eb11a46a73
commit ea330d452d
12 changed files with 266 additions and 9 deletions
--- a/.gitignore
+++ b/.gitignore
@ -6,6 +6,9 @@ get-sources
 prepare-sources
 /backend/cpp/llama/grpc-server
 /backend/cpp/llama/llama.cpp
+/backend/cpp/llama-*
+
+*.log

 go-ggml-transformers
 go-gpt2
--- a/aio/cpu/text-to-text.yaml
+++ b/aio/cpu/text-to-text.yaml
@ -2,6 +2,7 @@ name: gpt-4
 mmap: true
 parameters:
  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
+context_size: 8192

 stopwords:
 - "<|im_end|>"
--- a/aio/gpu-8g/text-to-text.yaml
+++ b/aio/gpu-8g/text-to-text.yaml
@ -2,6 +2,7 @@ name: gpt-4
 mmap: true
 parameters:
  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
+context_size: 8192

 stopwords:
 - "<|im_end|>"
--- a/aio/intel/text-to-text.yaml
+++ b/aio/intel/text-to-text.yaml
@ -1,5 +1,7 @@
 name: gpt-4
 mmap: false
+context_size: 8192
+
 f16: false
 parameters:
  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
--- a/gallery/command-r.yaml
+++ b/gallery/command-r.yaml
@ -0,0 +1,69 @@
+---
+name: "command-r"
+
+config_file: |
+  context_size: 131072
+  stopwords:
+  - "<|END_OF_TURN_TOKEN|>"
+
+  function:
+    # disable injecting the "answer" tool
+    disable_no_action: true
+
+    grammar:
+      # This allows the grammar to also return messages
+      mixed_mode: true
+      # Not all models have a sketchpad or something to write thoughts on.
+      # This one will OR reply to strings OR JSON, but not both in the same reply
+      #no_mixed_free_string: true
+      # Disable grammar
+      # Base instructor model doesn't work well with grammars
+      #disable: true
+      disable_parallel_new_lines: true
+    return_name_in_function_response: true
+    replace_function_results:
+    # Replace everything that is not JSON array or object
+    - key: '(?s)^[^{\[]*'
+      value: ""
+    - key: '(?s)[^}\]]*$'
+      value: ""
+    # Convert single quotes to double quotes
+    - key: "'([^']*?)'"
+      value: "_DQUOTE_${1}_DQUOTE_"
+    - key: '\\"'
+      value: "__TEMP_QUOTE__"
+    - key: "\'"
+      value: "'"
+    - key: "_DQUOTE_"
+      value: '"'
+    - key: "__TEMP_QUOTE__"
+      value: '"'
+
+  template:
+    join_chat_messages_by_character: "" ## No newlines between messages
+    chat: |-
+      {{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
+    # Renders a single chat message wrapped in Command-R turn tokens, picking the
+    # speaker token from the message role; function calls are serialized as JSON.
+    chat_message: |-
+      {{if eq .RoleName "user" -}}
+      <|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
+      {{- else if eq .RoleName "system" -}}
+      <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
+      {{- else if eq .RoleName "assistant" -}}
+      <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
+      {{- else if eq .RoleName "tool" -}}
+      <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
+      {{- else if .FunctionCall -}}
+      <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}<|END_OF_TURN_TOKEN|>
+      {{- end -}}
+
+    completion: |
+      {{.Input}}
+    function: |-
+      <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
+      You are a function calling AI model, you can call the following functions:
+      ## Available Tools
+      {{range .Functions}}
+      - {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
+      {{end}}
+      When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}}
+      <|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}
--- a/gallery/hermes-2-pro-mistral.yaml
+++ b/gallery/hermes-2-pro-mistral.yaml
@ -3,6 +3,7 @@ name: "hermes-2-pro-mistral"

 config_file: |
  mmap: true
+  context_size: 8192
  stopwords:
  - "<|im_end|>"
  - "<dummy32000>"
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@ -1,4 +1,35 @@
 ---
+## START Mistral
+- &mistral03
+  url: "github:mudler/LocalAI/gallery/mistral-0.3.yaml@master"
+  name: "mistral-7b-instruct-v0.3"
+  icon: https://cdn-avatars.huggingface.co/v1/production/uploads/62dac1c7a8ead43d20e3e17a/wrLf5yaGC6ng4XME70w6Z.png
+  license: apache-2.0
+  description: |
+    The Mistral-7B-Instruct-v0.3 Large Language Model (LLM) is an instruct fine-tuned version of the Mistral-7B-v0.3.
+
+    Mistral-7B-v0.3 has the following changes compared to Mistral-7B-v0.2
+
+        Extended vocabulary to 32768
+        Supports v3 Tokenizer
+        Supports function calling
+  urls:
+    - https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
+    - https://huggingface.co/MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF
+  tags:
+    - llm
+    - gguf
+    - gpu
+    - mistral
+    - cpu
+    - function-calling
+  overrides:
+    parameters:
+      model: Mistral-7B-Instruct-v0.3.Q4_K_M.gguf
+  files:
+    - filename: "Mistral-7B-Instruct-v0.3.Q4_K_M.gguf"
+      sha256: "14850c84ff9f06e9b51d505d64815d5cc0cea0257380353ac0b3d21b21f6e024"
+      uri: "huggingface://MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF/Mistral-7B-Instruct-v0.3.Q4_K_M.gguf"
 ### START mudler's LocalAI specific-models
 - &mudler
  url: "github:mudler/LocalAI/gallery/mudler.yaml@master"
@ -1134,6 +1165,46 @@
    - filename: Llama-3-Hercules-5.0-8B-Q4_K_M.gguf
      sha256: 83647caf4a23a91697585cff391e7d1236fac867392f9e49a6dab59f81b5f810
      uri: huggingface://bartowski/Llama-3-Hercules-5.0-8B-GGUF/Llama-3-Hercules-5.0-8B-Q4_K_M.gguf
+### START Command-r
+- &command-R
+  url: "github:mudler/LocalAI/gallery/command-r.yaml@master"
+  name: "command-r-v01:q1_s"
+  license: "cc-by-nc-4.0"
+  icon: https://cdn.sanity.io/images/rjtqmwfu/production/ae020d94b599cc453cc09ebc80be06d35d953c23-102x18.svg
+  urls:
+    - https://huggingface.co/CohereForAI/c4ai-command-r-v01
+    - https://huggingface.co/dranger003/c4ai-command-r-v01-iMat.GGUF
+  description: |
+    C4AI Command-R is a research release of a 35 billion parameter highly performant generative model. Command-R is a large language model with open weights optimized for a variety of use cases including reasoning, summarization, and question answering. Command-R has the capability for multilingual generation evaluated in 10 languages and highly performant RAG capabilities.
+  tags:
+    - llm
+    - gguf
+    - gpu
+    - command-r
+    - cpu
+  overrides:
+    parameters:
+      model: ggml-c4ai-command-r-v01-iq1_s.gguf
+  files:
+    - filename: "ggml-c4ai-command-r-v01-iq1_s.gguf"
+      sha256: "aad4594ee45402fe344d8825937d63b9fa1f00becc6d1cc912b016dbb020e0f0"
+      uri: "huggingface://dranger003/c4ai-command-r-v01-iMat.GGUF/ggml-c4ai-command-r-v01-iq1_s.gguf"
+- !!merge <<: *command-R
+  name: "aya-23-8b"
+  urls:
+    - https://huggingface.co/CohereForAI/aya-23-8B
+    - https://huggingface.co/bartowski/aya-23-8B-GGUF
+  description: |
+    Aya 23 is an open weights research release of an instruction fine-tuned model with highly advanced multilingual capabilities. Aya 23 focuses on pairing a highly performant pre-trained Command family of models with the recently released Aya Collection. The result is a powerful multilingual large language model serving 23 languages.
+
+    This model card corresponds to the 8-billion version of the Aya 23 model. We also released a 35-billion version which you can find here.
+  overrides:
+    parameters:
+      model: aya-23-8B-Q4_K_M.gguf
+  files:
+    - filename: "aya-23-8B-Q4_K_M.gguf"
+      sha256: "21b3aa3abf067f78f6fe08deb80660cc4ee8ad7b4ab873a98d87761f9f858b0f"
+      uri: "huggingface://bartowski/aya-23-8B-GGUF/aya-23-8B-Q4_K_M.gguf"
 - &phi-2-chat
  ### START Phi-2
  url: "github:mudler/LocalAI/gallery/phi-2-chat.yaml@master"
--- a/gallery/mistral-0.3.yaml
+++ b/gallery/mistral-0.3.yaml
@ -0,0 +1,67 @@
+---
+name: "mistral-0.3"
+
+config_file: |
+  context_size: 8192
+  mmap: true
+  stopwords:
+  - "<|im_end|>"
+  - "<dummy32000>"
+  - "</tool_call>"
+  - "<|eot_id|>"
+  - "<|end_of_text|>"
+  - "</s>"
+  - "[/TOOL_CALLS]"
+  - "[/ACTIONS]"
+
+  function:
+    # disable injecting the "answer" tool
+    disable_no_action: true
+
+    grammar:
+      # This allows the grammar to also return messages
+      #mixed_mode: true
+      # Not all models have a sketchpad or something to write thoughts on.
+      # This one will OR reply to strings OR JSON, but not both in the same reply
+      #no_mixed_free_string: true
+      # Disable grammar
+      # Base instructor model doesn't work well with grammars
+      disable: true
+      parallel_calls: true
+      disable_parallel_new_lines: true
+
+    return_name_in_function_response: true
+    # The grammar is disabled above, so function calls are extracted from the
+    # raw model output with the regex below.
+    # Warning: this relies only on the capability of the
+    # LLM to generate the correct function call.
+    json_regex_match:
+      - "(?s)\\[TOOL\\_CALLS\\](.*)"
+    replace_function_results:
+    # Replace everything that is not JSON array or object
+    - key: '(?s)^[^{\[]*'
+      value: ""
+    - key: '(?s)[^}\]]*$'
+      value: ""
+    - key: "(?s)\\[TOOL\\_CALLS\\]"
+      value: ""
+    - key: "(?s)\\[\\/TOOL\\_CALLS\\]"
+      value: ""
+
+  template:
+    join_chat_messages_by_character: "" ## No newlines between messages
+    chat: |
+      {{.Input -}}
+    chat_message: |-
+      {{if eq .RoleName "user" -}}
+      [INST] {{.Content }} [/INST]
+      {{- else if .FunctionCall -}}
+      [TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS]
+      {{- else if eq .RoleName "tool" -}}
+      [TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS]
+      {{- else -}}
+      {{ .Content -}}
+      {{ end -}}
+    completion: |
+      {{.Input}}
+    # Lists the available tools as a JSON array; entries are comma-separated so
+    # that more than one function still yields a well-formed list.
+    function: |-
+      [AVAILABLE_TOOLS] [{{range $i, $f := .Functions}}{{if $i}}, {{end}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}
--- a/pkg/functions/grammar_json_schema.go
+++ b/pkg/functions/grammar_json_schema.go
@ -50,6 +50,9 @@ var (
 			[^"\\] |
 			"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
 		  )* "\"" space`,
+		// TODO: we shouldn't forbid \" and \\ or all unicode and have this branch here,
+		// however, if we don't have it, the grammar will be ambiguous and
+		// empirically results are way worse.
 		"freestring": `(
 			[^"\\] |
 			"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
@ -111,12 +114,18 @@ func (sc *JSONSchemaConverter) addRule(name, rule string) string {
 	return key
 }

-const array = `arr  ::=
+const arrayNewLines = `arr  ::=
  "[\n"  (
 		realvalue
    (",\n"  realvalue)*
  )? "]"`

+const array = `arr  ::=
+  "["  (
+		realvalue
+    (","  realvalue)*
+  )? "]"`
+
 func (sc *JSONSchemaConverter) finalizeGrammar(options ...func(*GrammarOption)) string {

 	grammarOpts := &GrammarOption{}
@ -124,6 +133,7 @@ func (sc *JSONSchemaConverter) finalizeGrammar(options ...func(*GrammarOption))

 	suffix := grammarOpts.Suffix
 	maybeArray := grammarOpts.MaybeArray
+	disableParallelNewLines := grammarOpts.DisableParallelNewLines
 	maybeString := grammarOpts.MaybeString
 	noMixedFreeString := grammarOpts.NoMixedFreeString

@ -177,7 +187,11 @@ func (sc *JSONSchemaConverter) finalizeGrammar(options ...func(*GrammarOption))
 	}

 	lines = append(lines, fmt.Sprintf("%s ::= %s", "root", newRoot))
-	lines = append(lines, array)
+	if disableParallelNewLines {
+		lines = append(lines, array)
+	} else {
+		lines = append(lines, arrayNewLines)
+	}

 	if maybeArray {
 		lines = append(lines, `mixedstring ::= freestring | freestring arr | freestring realvalue | realvalue | arr`)
--- a/pkg/functions/grammar_json_schema_test.go
+++ b/pkg/functions/grammar_json_schema_test.go
@ -427,5 +427,22 @@ var _ = Describe("JSON schema grammar tests", func() {
 			}
 			Expect(len(results)).To(Equal(len(strings.Split(grammar, "\n"))), grammar)
 		})
+
+		It("generates parallel tools without newlines in JSON", func() {
+			structuredGrammar := JSONFunctionStructureName{
+				OneOf: testFunctionsName}
+			content := `arr  ::=
+"["  (
+realvalue
+(","  realvalue)*
+)? "]"`
+			grammar := structuredGrammar.Grammar(functions.EnableMaybeString, functions.EnableMaybeArray, functions.DisableParallelNewLines)
+			results := strings.Split(content, "\n")
+			for _, r := range results {
+				if r != "" {
+					Expect(grammar).To(ContainSubstring(r))
+				}
+			}
+		})
 	})
 })
--- a/pkg/functions/options.go
+++ b/pkg/functions/options.go
@ -1,11 +1,12 @@
 package functions

 type GrammarOption struct {
-	PropOrder         string
-	Suffix            string
-	MaybeArray        bool
-	MaybeString       bool
-	NoMixedFreeString bool
+	PropOrder               string
+	Suffix                  string
+	MaybeArray              bool
+	DisableParallelNewLines bool
+	MaybeString             bool
+	NoMixedFreeString       bool
 }

 func (o *GrammarOption) Apply(options ...func(*GrammarOption)) {
@ -18,6 +19,10 @@ var EnableMaybeArray = func(o *GrammarOption) {
 	o.MaybeArray = true
 }

+// DisableParallelNewLines is a GrammarOption setter that turns on the
+// DisableParallelNewLines flag, which finalizeGrammar reads to emit the array
+// rule without newlines between elements.
+var DisableParallelNewLines = func(o *GrammarOption) {
+	o.DisableParallelNewLines = true
+}
+
 var EnableMaybeString = func(o *GrammarOption) {
 	o.MaybeString = true
 }
--- a/pkg/functions/parse.go
+++ b/pkg/functions/parse.go
@ -12,6 +12,8 @@ type GrammarConfig struct {
 	// ParallelCalls enables the LLM to return multiple function calls in the same response
 	ParallelCalls bool `yaml:"parallel_calls"`

+	// DisableParallelNewLines, when set, adds the DisableParallelNewLines grammar option
+	// (see Options), which removes the newlines from the array rule of the generated grammar
+	DisableParallelNewLines bool `yaml:"disable_parallel_new_lines"`
+
 	// MixedMode enables the LLM to return strings and not only JSON objects
 	// This is useful to avoid constraining models to return only JSON, so they can also send messages back to the user
 	MixedMode bool `yaml:"mixed_mode"`
@ -81,6 +83,9 @@ func (g GrammarConfig) Options() []func(o *GrammarOption) {
 	if g.ParallelCalls {
 		opts = append(opts, EnableMaybeArray)
 	}
+	if g.DisableParallelNewLines {
+		opts = append(opts, DisableParallelNewLines)
+	}
 	if g.Prefix != "" {
 		opts = append(opts, SetPrefix(g.Prefix))
 	}
@ -134,7 +139,7 @@ func ParseFunctionCall(llmresult string, functionConfig FunctionsConfig) []FuncC
 			var singleObj map[string]interface{}
 			err = json.Unmarshal([]byte(s), &singleObj)
 			if err != nil {
-				log.Warn().Err(err).Str("escapedLLMResult", s).Msg("unable to unmarshal llm result")
+				log.Debug().Err(err).Str("escapedLLMResult", s).Msg("unable to unmarshal llm result in a single object or an array of JSON objects")
 			} else {
 				ss = []map[string]interface{}{singleObj}
 			}
@ -177,6 +182,7 @@ func ParseFunctionCall(llmresult string, functionConfig FunctionsConfig) []FuncC
 			match := respRegex.FindStringSubmatch(llmresult)
 			if len(match) >= 2 {
 				llmresult = match[1]
+				log.Debug().Msgf("LLM result(JSONRegexMatch): %s", llmresult)
 				break
 			}
 		}