From ea330d452d82bf8d6ec7d2d35eb0520daf368934 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Thu, 23 May 2024 19:16:08 +0200
Subject: [PATCH] models(gallery): add mistral-0.3 and command-r, update functions (#2388)

* models(gallery): add mistral-0.3 and command-r, update functions

Also add disable_parallel_new_lines to disable newlines in the JSON
output when forcing parallel tool calls. Some models (like Mistral) can
be very sensitive to those newlines when used for function calling.
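For instance, a model configuration can combine both options as in the
sketch below (a minimal example; the model name is hypothetical, while
the two grammar settings map to GrammarConfig in pkg/functions/parse.go
and are nested under function.grammar as in gallery/mistral-0.3.yaml):

    name: my-tool-model                  # hypothetical model name
    function:
      grammar:
        parallel_calls: true             # allow an array of tool calls
        disable_parallel_new_lines: true # keep that array on a single line

With disable_parallel_new_lines set, the generated GBNF uses the compact rule

    arr ::= "[" ( realvalue ("," realvalue)* )? "]"

instead of

    arr ::= "[\n" ( realvalue (",\n" realvalue)* )? "]"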
Signed-off-by: Ettore Di Giacinto

* models(gallery): add aya-23-8b

Signed-off-by: Ettore Di Giacinto

---------

Signed-off-by: Ettore Di Giacinto
---
 .gitignore                                |  5 +-
 aio/cpu/text-to-text.yaml                 |  1 +
 aio/gpu-8g/text-to-text.yaml              |  1 +
 aio/intel/text-to-text.yaml               |  2 +
 gallery/command-r.yaml                    | 69 ++++++++++++++++++++++
 gallery/hermes-2-pro-mistral.yaml         |  1 +
 gallery/index.yaml                        | 71 +++++++++++++++++++++++
 gallery/mistral-0.3.yaml                  | 67 +++++++++++++++++++++
 pkg/functions/grammar_json_schema.go      | 18 +++++-
 pkg/functions/grammar_json_schema_test.go | 17 ++++++
 pkg/functions/options.go                  | 15 +++--
 pkg/functions/parse.go                    |  8 ++-
 12 files changed, 266 insertions(+), 9 deletions(-)
 create mode 100644 gallery/command-r.yaml
 create mode 100644 gallery/mistral-0.3.yaml

diff --git a/.gitignore b/.gitignore
index 07b8dbff..a67a71c4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,9 @@ get-sources
 prepare-sources
 /backend/cpp/llama/grpc-server
 /backend/cpp/llama/llama.cpp
+/backend/cpp/llama-*
+
+*.log
 
 go-ggml-transformers
 go-gpt2
@@ -49,4 +52,4 @@ prepare
 .scannerwork
 
 # backend virtual environments
-**/venv
\ No newline at end of file
+**/venv
diff --git a/aio/cpu/text-to-text.yaml b/aio/cpu/text-to-text.yaml
index ec1f3753..74f46817 100644
--- a/aio/cpu/text-to-text.yaml
+++ b/aio/cpu/text-to-text.yaml
@@ -2,6 +2,7 @@ name: gpt-4
 mmap: true
 parameters:
   model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
+context_size: 8192
 
 stopwords:
 - "<|im_end|>"
diff --git a/aio/gpu-8g/text-to-text.yaml b/aio/gpu-8g/text-to-text.yaml
index a8904f93..62674a38 100644
--- a/aio/gpu-8g/text-to-text.yaml
+++ b/aio/gpu-8g/text-to-text.yaml
@@ -2,6 +2,7 @@ name: gpt-4
 mmap: true
 parameters:
   model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
+context_size: 8192
 
 stopwords:
 - "<|im_end|>"
diff --git a/aio/intel/text-to-text.yaml b/aio/intel/text-to-text.yaml
index 69693ec0..893b9acf 100644
--- a/aio/intel/text-to-text.yaml
+++ b/aio/intel/text-to-text.yaml
@@ -1,5 +1,7 @@
 name: gpt-4
 mmap: false
+context_size: 8192
+
 f16: false
 parameters:
   model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
diff --git a/gallery/command-r.yaml b/gallery/command-r.yaml
new file mode 100644
index 00000000..81a24fb1
--- /dev/null
+++ b/gallery/command-r.yaml
@@ -0,0 +1,69 @@
+---
+name: "command-r"
+
+config_file: |
+  context_size: 131072
+  stopwords:
+  - "<|END_OF_TURN_TOKEN|>"
+
+  function:
+    # disable injecting the "answer" tool
+    disable_no_action: true
+
+    grammar:
+      # This allows the grammar to also return messages
+      mixed_mode: true
+      # Not all models have a sketchpad or something to write thoughts on.
+      # This one will reply with either strings OR JSON, but not both in the same reply
+      #no_mixed_free_string: true
+      # Disable grammar
+      # Base instructor model doesn't work well with grammars
+      #disable: true
+      disable_parallel_new_lines: true
+    return_name_in_function_response: true
+    replace_function_results:
+      # Replace everything that is not JSON array or object
+      - key: '(?s)^[^{\[]*'
+        value: ""
+      - key: '(?s)[^}\]]*$'
+        value: ""
+      # Convert single quotes to double quotes
+      - key: "'([^']*?)'"
+        value: "_DQUOTE_${1}_DQUOTE_"
+      - key: '\\"'
+        value: "__TEMP_QUOTE__"
+      - key: "\'"
+        value: "'"
+      - key: "_DQUOTE_"
+        value: '"'
+      - key: "__TEMP_QUOTE__"
+        value: '"'
+
+  template:
+    join_chat_messages_by_character: "" ## No newlines between messages
+    chat: |-
+      {{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
+    chat_message: |-
+      {{if eq .RoleName "user" -}}
+      <|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
+      {{- else if eq .RoleName "system" -}}
+      <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
+      {{- else if eq .RoleName "assistant" -}}
+      <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
+      {{- else if eq .RoleName "tool" -}}
+      <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
+      {{- else if .FunctionCall -}}
+      <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}<|END_OF_TURN_TOKEN|>
+      {{- end -}}
+
+    completion: |
+      {{.Input}}
+    function: |-
+      <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
+      You are a function calling AI model; you can call the following functions:
+      ## Available Tools
+      {{range .Functions}}
+      - {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
+      {{end}}
+      When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}}
+      <|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}
diff --git a/gallery/hermes-2-pro-mistral.yaml b/gallery/hermes-2-pro-mistral.yaml
index 195ebbc9..200a28b8 100644
--- a/gallery/hermes-2-pro-mistral.yaml
+++ b/gallery/hermes-2-pro-mistral.yaml
@@ -3,6 +3,7 @@ name: "hermes-2-pro-mistral"
 
 config_file: |
   mmap: true
+  context_size: 8192
   stopwords:
   - "<|im_end|>"
   - "<dummy32000>"
diff --git a/gallery/index.yaml b/gallery/index.yaml
index 2ac515c0..c1892ade 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,4 +1,35 @@
 ---
+## START Mistral
+- &mistral03
+  url: "github:mudler/LocalAI/gallery/mistral-0.3.yaml@master"
+  name: "mistral-7b-instruct-v0.3"
+  icon: https://cdn-avatars.huggingface.co/v1/production/uploads/62dac1c7a8ead43d20e3e17a/wrLf5yaGC6ng4XME70w6Z.png
+  license: apache-2.0
+  description: |
+    The Mistral-7B-Instruct-v0.3 Large Language Model (LLM) is an instruct fine-tuned version of the Mistral-7B-v0.3.
+
+    Mistral-7B-v0.3 has the following changes compared to Mistral-7B-v0.2:
+
+    - Extended vocabulary to 32768
+    - Supports v3 Tokenizer
+    - Supports function calling
+  urls:
+  - https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
+  - https://huggingface.co/MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF
+  tags:
+  - llm
+  - gguf
+  - gpu
+  - mistral
+  - cpu
+  - function-calling
+  overrides:
+    parameters:
+      model: Mistral-7B-Instruct-v0.3.Q4_K_M.gguf
+  files:
+  - filename: "Mistral-7B-Instruct-v0.3.Q4_K_M.gguf"
+    sha256: "14850c84ff9f06e9b51d505d64815d5cc0cea0257380353ac0b3d21b21f6e024"
+    uri: "huggingface://MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF/Mistral-7B-Instruct-v0.3.Q4_K_M.gguf"
 ### START mudler's LocalAI specific-models
 - &mudler
   url: "github:mudler/LocalAI/gallery/mudler.yaml@master"
@@ -1134,6 +1165,46 @@
   - filename: Llama-3-Hercules-5.0-8B-Q4_K_M.gguf
     sha256: 83647caf4a23a91697585cff391e7d1236fac867392f9e49a6dab59f81b5f810
     uri: huggingface://bartowski/Llama-3-Hercules-5.0-8B-GGUF/Llama-3-Hercules-5.0-8B-Q4_K_M.gguf
+### START Command-r
+- &command-R
+  url: "github:mudler/LocalAI/gallery/command-r.yaml@master"
+  name: "command-r-v01:q1_s"
+  license: "cc-by-nc-4.0"
+  icon: https://cdn.sanity.io/images/rjtqmwfu/production/ae020d94b599cc453cc09ebc80be06d35d953c23-102x18.svg
+  urls:
+  - https://huggingface.co/CohereForAI/c4ai-command-r-v01
+  - https://huggingface.co/dranger003/c4ai-command-r-v01-iMat.GGUF
+  description: |
+    C4AI Command-R is a research release of a 35 billion parameter highly performant generative model. Command-R is a large language model with open weights optimized for a variety of use cases including reasoning, summarization, and question answering. Command-R has the capability for multilingual generation evaluated in 10 languages and highly performant RAG capabilities.
+  tags:
+  - llm
+  - gguf
+  - gpu
+  - command-r
+  - cpu
+  overrides:
+    parameters:
+      model: ggml-c4ai-command-r-v01-iq1_s.gguf
+  files:
+  - filename: "ggml-c4ai-command-r-v01-iq1_s.gguf"
+    sha256: "aad4594ee45402fe344d8825937d63b9fa1f00becc6d1cc912b016dbb020e0f0"
+    uri: "huggingface://dranger003/c4ai-command-r-v01-iMat.GGUF/ggml-c4ai-command-r-v01-iq1_s.gguf"
+- !!merge <<: *command-R
+  name: "aya-23-8b"
+  urls:
+  - https://huggingface.co/CohereForAI/aya-23-8B
+  - https://huggingface.co/bartowski/aya-23-8B-GGUF
+  description: |
+    Aya 23 is an open weights research release of an instruction fine-tuned model with highly advanced multilingual capabilities. Aya 23 focuses on pairing a highly performant pre-trained Command family of models with the recently released Aya Collection. The result is a powerful multilingual large language model serving 23 languages.
+
+    This model card corresponds to the 8-billion version of the Aya 23 model. We also released a 35-billion version which you can find here.
+  overrides:
+    parameters:
+      model: aya-23-8B-Q4_K_M.gguf
+  files:
+  - filename: "aya-23-8B-Q4_K_M.gguf"
+    sha256: "21b3aa3abf067f78f6fe08deb80660cc4ee8ad7b4ab873a98d87761f9f858b0f"
+    uri: "huggingface://bartowski/aya-23-8B-GGUF/aya-23-8B-Q4_K_M.gguf"
 - &phi-2-chat ### START Phi-2
   url: "github:mudler/LocalAI/gallery/phi-2-chat.yaml@master"
diff --git a/gallery/mistral-0.3.yaml b/gallery/mistral-0.3.yaml
new file mode 100644
index 00000000..502e7a5a
--- /dev/null
+++ b/gallery/mistral-0.3.yaml
@@ -0,0 +1,67 @@
+---
+name: "mistral-0.3"
+
+config_file: |
+  context_size: 8192
+  mmap: true
+  stopwords:
+  - "<|im_end|>"
+  - "<dummy32000>"
+  - "</tool_call>"
+  - "<|eot_id|>"
+  - "<|end_of_text|>"
+  - "</s>"
+  - "[/TOOL_CALLS]"
+  - "[/ACTIONS]"
+
+  function:
+    # disable injecting the "answer" tool
+    disable_no_action: true
+
+    grammar:
+      # This allows the grammar to also return messages
+      #mixed_mode: true
+      # Not all models have a sketchpad or something to write thoughts on.
+      # This one will reply with either strings OR JSON, but not both in the same reply
+      #no_mixed_free_string: true
+      # Disable grammar
+      # Base instructor model doesn't work well with grammars
+      disable: true
+      parallel_calls: true
+      disable_parallel_new_lines: true
+
+    return_name_in_function_response: true
+    # Without grammar, uncomment the lines below
+    # Warning: this relies only on the capability of the
+    # LLM to generate the correct function call.
+    json_regex_match:
+    - "(?s)\\[TOOL\\_CALLS\\](.*)"
+    replace_function_results:
+      # Replace everything that is not JSON array or object
+      - key: '(?s)^[^{\[]*'
+        value: ""
+      - key: '(?s)[^}\]]*$'
+        value: ""
+      - key: "(?s)\\[TOOL\\_CALLS\\]"
+        value: ""
+      - key: "(?s)\\[\\/TOOL\\_CALLS\\]"
+        value: ""
+
+  template:
+    join_chat_messages_by_character: "" ## No newlines between messages
+    chat: |
+      {{.Input -}}
+    chat_message: |-
+      {{if eq .RoleName "user" -}}
+      [INST] {{.Content }} [/INST]
+      {{- else if .FunctionCall -}}
+      [TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS]
+      {{- else if eq .RoleName "tool" -}}
+      [TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS]
+      {{- else -}}
+      {{ .Content -}}
+      {{ end -}}
+    completion: |
+      {{.Input}}
+    function: |-
+      [AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}
diff --git a/pkg/functions/grammar_json_schema.go b/pkg/functions/grammar_json_schema.go
index 30c1901f..9e602a76 100644
--- a/pkg/functions/grammar_json_schema.go
+++ b/pkg/functions/grammar_json_schema.go
@@ -50,6 +50,9 @@ var (
 		[^"\\] |
 		"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
 	)* "\"" space`,
+	// TODO: we shouldn't forbid \" and \\ or all unicode and have this branch here;
+	// however, if we don't have it, the grammar will be ambiguous and
+	// empirically results are way worse.
 	"freestring": `(
 		[^"\\] |
 		"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
@@ -111,12 +114,18 @@ func (sc *JSONSchemaConverter) addRule(name, rule string) string {
 	return key
 }
 
-const array = `arr ::=
+const arrayNewLines = `arr ::=
  "[\n"  (
            realvalue
    (",\n" realvalue)*
  )? "]"`
 
+const array = `arr ::=
+  "[" (
+            realvalue
+    ("," realvalue)*
+  )? "]"`
"]"` + func (sc *JSONSchemaConverter) finalizeGrammar(options ...func(*GrammarOption)) string { grammarOpts := &GrammarOption{} @@ -124,6 +133,7 @@ func (sc *JSONSchemaConverter) finalizeGrammar(options ...func(*GrammarOption)) suffix := grammarOpts.Suffix maybeArray := grammarOpts.MaybeArray + disableParallelNewLines := grammarOpts.DisableParallelNewLines maybeString := grammarOpts.MaybeString noMixedFreeString := grammarOpts.NoMixedFreeString @@ -177,7 +187,11 @@ func (sc *JSONSchemaConverter) finalizeGrammar(options ...func(*GrammarOption)) } lines = append(lines, fmt.Sprintf("%s ::= %s", "root", newRoot)) - lines = append(lines, array) + if disableParallelNewLines { + lines = append(lines, array) + } else { + lines = append(lines, arrayNewLines) + } if maybeArray { lines = append(lines, `mixedstring ::= freestring | freestring arr | freestring realvalue | realvalue | arr`) diff --git a/pkg/functions/grammar_json_schema_test.go b/pkg/functions/grammar_json_schema_test.go index 3a864488..672fada2 100644 --- a/pkg/functions/grammar_json_schema_test.go +++ b/pkg/functions/grammar_json_schema_test.go @@ -427,5 +427,22 @@ var _ = Describe("JSON schema grammar tests", func() { } Expect(len(results)).To(Equal(len(strings.Split(grammar, "\n"))), grammar) }) + + It("generates parallel tools without newlines in JSON", func() { + structuredGrammar := JSONFunctionStructureName{ + OneOf: testFunctionsName} + content := `arr ::= +"[" ( +realvalue +("," realvalue)* +)? "]"` + grammar := structuredGrammar.Grammar(functions.EnableMaybeString, functions.EnableMaybeArray, functions.DisableParallelNewLines) + results := strings.Split(content, "\n") + for _, r := range results { + if r != "" { + Expect(grammar).To(ContainSubstring(r)) + } + } + }) }) }) diff --git a/pkg/functions/options.go b/pkg/functions/options.go index 10bbe314..e6b4ef90 100644 --- a/pkg/functions/options.go +++ b/pkg/functions/options.go @@ -1,11 +1,12 @@ package functions type GrammarOption struct { - PropOrder string - Suffix string - MaybeArray bool - MaybeString bool - NoMixedFreeString bool + PropOrder string + Suffix string + MaybeArray bool + DisableParallelNewLines bool + MaybeString bool + NoMixedFreeString bool } func (o *GrammarOption) Apply(options ...func(*GrammarOption)) { @@ -18,6 +19,10 @@ var EnableMaybeArray = func(o *GrammarOption) { o.MaybeArray = true } +var DisableParallelNewLines = func(o *GrammarOption) { + o.DisableParallelNewLines = true +} + var EnableMaybeString = func(o *GrammarOption) { o.MaybeString = true } diff --git a/pkg/functions/parse.go b/pkg/functions/parse.go index 735263c7..d6e9d320 100644 --- a/pkg/functions/parse.go +++ b/pkg/functions/parse.go @@ -12,6 +12,8 @@ type GrammarConfig struct { // ParallelCalls enables the LLM to return multiple function calls in the same response ParallelCalls bool `yaml:"parallel_calls"` + DisableParallelNewLines bool `yaml:"disable_parallel_new_lines"` + // MixedMode enables the LLM to return strings and not only JSON objects // This is useful for models to not constraing returning only JSON and also messages back to the user MixedMode bool `yaml:"mixed_mode"` @@ -81,6 +83,9 @@ func (g GrammarConfig) Options() []func(o *GrammarOption) { if g.ParallelCalls { opts = append(opts, EnableMaybeArray) } + if g.DisableParallelNewLines { + opts = append(opts, DisableParallelNewLines) + } if g.Prefix != "" { opts = append(opts, SetPrefix(g.Prefix)) } @@ -134,7 +139,7 @@ func ParseFunctionCall(llmresult string, functionConfig FunctionsConfig) []FuncC var singleObj 
 		err = json.Unmarshal([]byte(s), &singleObj)
 		if err != nil {
-			log.Warn().Err(err).Str("escapedLLMResult", s).Msg("unable to unmarshal llm result")
+			log.Debug().Err(err).Str("escapedLLMResult", s).Msg("unable to unmarshal llm result in a single object or an array of JSON objects")
 		} else {
 			ss = []map[string]interface{}{singleObj}
 		}
@@ -177,6 +182,7 @@ func ParseFunctionCall(llmresult string, functionConfig FunctionsConfig) []FuncC
 			match := respRegex.FindStringSubmatch(llmresult)
 			if len(match) >= 2 {
 				llmresult = match[1]
+				log.Debug().Msgf("LLM result(JSONRegexMatch): %s", llmresult)
 				break
 			}
 		}