Mirror of https://github.com/mudler/LocalAI.git (synced 2025-01-29 15:44:17 +00:00)
models(gallery): add hermes-3-llama-3.1(8B,70B,405B) with vLLM (#3360)
models(gallery): add hermes-3-llama-3.1 with vLLM. This adds the 8B, 70B, and 405B variants to the gallery.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
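Once merged into the gallery index, the new entries can be installed on a running LocalAI instance through the gallery API. A minimal sketch, assuming the default listen address and that the bundled gallery is registered under the "localai" prefix (adjust the id and host to your setup):

    # install the 8B vLLM variant from the gallery (host/port and gallery prefix are assumptions)
    curl http://localhost:8080/models/apply \
      -H "Content-Type: application/json" \
      -d '{"id": "localai@hermes-3-llama-3.1-8b:vllm"}'

The 70B and 405B variants are installed the same way, using their respective gallery names.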
parent: fbaae8528d
commit: a913fd310d
gallery/hermes-vllm.yaml (new file, 91 lines)
@@ -0,0 +1,91 @@
---
name: "hermes-vllm"

config_file: |
  backend: vllm
  context_size: 8192
  stopwords:
  - "<|im_end|>"
  - "<dummy32000>"
  - "<|eot_id|>"
  - "<|end_of_text|>"
  function:
    disable_no_action: true
    grammar:
      # Uncomment the line below to enable grammar matching for JSON results if the model is breaking
      # the output. This will make the model more accurate and won't break the JSON output.
      # This however, will make parallel_calls not functional (it is a known bug)
      # mixed_mode: true
      disable: true
      parallel_calls: true
      expect_strings_after_json: true
    json_regex_match:
    - "(?s)<tool_call>(.*?)</tool_call>"
    - "(?s)<tool_call>(.*)"
    capture_llm_results:
    - (?s)<scratchpad>(.*?)</scratchpad>
    replace_llm_results:
    - key: (?s)<scratchpad>(.*?)</scratchpad>
      value: ""

  template:
    use_tokenizer_template: true
    chat: |
      {{.Input -}}
      <|im_start|>assistant
    chat_message: |
      <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
      {{- if .FunctionCall }}
      <tool_call>
      {{- else if eq .RoleName "tool" }}
      <tool_response>
      {{- end }}
      {{- if .Content}}
      {{.Content }}
      {{- end }}
      {{- if .FunctionCall}}
      {{toJson .FunctionCall}}
      {{- end }}
      {{- if .FunctionCall }}
      </tool_call>
      {{- else if eq .RoleName "tool" }}
      </tool_response>
      {{- end }}<|im_end|>
    completion: |
      {{.Input}}
    function: |
      <|im_start|>system
      You are a function calling AI model.
      Here are the available tools:
      <tools>
      {{range .Functions}}
      {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
      {{end}}
      </tools>
      You should call the tools provided to you sequentially
      Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
      <scratchpad>
      {step-by-step reasoning and plan in bullet points}
      </scratchpad>
      For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
      <tool_call>
      {"arguments": <args-dict>, "name": <function-name>}
      </tool_call><|im_end|>
      {{.Input -}}
      <|im_start|>assistant
  # Uncomment to specify a quantization method (optional)
  # quantization: "awq"
  # Uncomment to limit the GPU memory utilization (vLLM default is 0.9 for 90%)
  # gpu_memory_utilization: 0.5
  # Uncomment to trust remote code from huggingface
  # trust_remote_code: true
  # Uncomment to enable eager execution
  # enforce_eager: true
  # Uncomment to specify the size of the CPU swap space per GPU (in GiB)
  # swap_space: 2
  # Uncomment to specify the maximum length of a sequence (including prompt and output)
  # max_model_len: 32768
  # Uncomment and specify the number of Tensor divisions.
  # Allows you to partition and run large models. Performance gains are limited.
  # https://github.com/vllm-project/vllm/issues/1435
  # tensor_parallel_size: 2
gallery/index.yaml

@@ -4752,6 +4752,38 @@
    - filename: Hermes-3-Llama-3.1-70B.Q4_K_M.gguf
      sha256: 955c2f42caade4278f3c9dbffa32bb74572652b20e49e5340e782de3585bbe3f
      uri: huggingface://NousResearch/Hermes-3-Llama-3.1-70B-GGUF/Hermes-3-Llama-3.1-70B.Q4_K_M.gguf
- &hermes-vllm
  url: "github:mudler/LocalAI/gallery/hermes-vllm.yaml@master"
  name: "hermes-3-llama-3.1-8b:vllm"
  icon: https://cdn-uploads.huggingface.co/production/uploads/6317aade83d8d2fd903192d9/vG6j5WxHX09yj32vgjJlI.jpeg
  tags:
  - llm
  - vllm
  - gpu
  - function-calling
  license: llama-3
  urls:
  - https://huggingface.co/NousResearch/Hermes-3-Llama-3.1-8B
  description: |
    Hermes 3 is a generalist language model with many improvements over Hermes 2, including advanced agentic capabilities, much better roleplaying, reasoning, multi-turn conversation, long context coherence, and improvements across the board. It is designed to focus on aligning LLMs to the user, with powerful steering capabilities and control given to the end user. The model uses ChatML as the prompt format, opening up a much more structured system for engaging the LLM in multi-turn chat dialogue. It also supports function calling and structured output capabilities, generalist assistant capabilities, and improved code generation skills.
  overrides:
    parameters:
      model: NousResearch/Hermes-3-Llama-3.1-8B
- !!merge <<: *hermes-vllm
  name: "hermes-3-llama-3.1-70b:vllm"
  urls:
  - https://huggingface.co/NousResearch/Hermes-3-Llama-3.1-70B
  overrides:
    parameters:
      model: NousResearch/Hermes-3-Llama-3.1-70B
- !!merge <<: *hermes-vllm
  name: "hermes-3-llama-3.1-405b:vllm"
  icon: https://cdn-uploads.huggingface.co/production/uploads/6317aade83d8d2fd903192d9/-kj_KflXsdpcZoTQsvx7W.jpeg
  urls:
  - https://huggingface.co/NousResearch/Hermes-3-Llama-3.1-405B
  overrides:
    parameters:
      model: NousResearch/Hermes-3-Llama-3.1-405B
- !!merge <<: *hermes-2-pro-mistral
  name: "biomistral-7b"
  description: |
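The &hermes-vllm anchor together with the !!merge <<: *hermes-vllm entries means the 70B and 405B variants inherit every field from the 8B entry and only override what they list. Note that YAML merge keys replace whole keys rather than deep-merging them, so the urls and overrides given for the larger models fully supersede the anchor's values. After merging, the 70B entry is effectively equivalent to the following sketch (fields copied from the anchor above; the comments are annotations, not part of the index):

    - url: "github:mudler/LocalAI/gallery/hermes-vllm.yaml@master"   # from the anchor
      icon: https://cdn-uploads.huggingface.co/production/uploads/6317aade83d8d2fd903192d9/vG6j5WxHX09yj32vgjJlI.jpeg
      tags:                              # from the anchor
      - llm
      - vllm
      - gpu
      - function-calling
      license: llama-3                   # from the anchor
      # the description is likewise carried over verbatim from the 8B anchor entry
      name: "hermes-3-llama-3.1-70b:vllm"   # overridden
      urls:                                 # overridden
      - https://huggingface.co/NousResearch/Hermes-3-Llama-3.1-70B
      overrides:                            # overridden
        parameters:
          model: NousResearch/Hermes-3-Llama-3.1-70B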
gallery/vllm.yaml (new file, 29 lines)
@@ -0,0 +1,29 @@
---
name: "vllm"

config_file: |
  backend: vllm
  function:
    disable_no_action: true
    grammar:
      disable: true
      parallel_calls: true
      expect_strings_after_json: true
  template:
    use_tokenizer_template: true
  # Uncomment to specify a quantization method (optional)
  # quantization: "awq"
  # Uncomment to limit the GPU memory utilization (vLLM default is 0.9 for 90%)
  # gpu_memory_utilization: 0.5
  # Uncomment to trust remote code from huggingface
  # trust_remote_code: true
  # Uncomment to enable eager execution
  # enforce_eager: true
  # Uncomment to specify the size of the CPU swap space per GPU (in GiB)
  # swap_space: 2
  # Uncomment to specify the maximum length of a sequence (including prompt and output)
  # max_model_len: 32768
  # Uncomment and specify the number of Tensor divisions.
  # Allows you to partition and run large models. Performance gains are limited.
  # https://github.com/vllm-project/vllm/issues/1435
  # tensor_parallel_size: 2
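gallery/vllm.yaml is the same idea stripped down to a generic base for any vLLM-served model: no hard-coded stopwords or ChatML templates, it simply relies on the model's own tokenizer chat template via use_tokenizer_template. A gallery entry pointing at it would look roughly like the hermes entries above; the entry name and model below are placeholders for illustration, not part of this commit:

    - url: "github:mudler/LocalAI/gallery/vllm.yaml@master"
      name: "my-model:vllm"            # hypothetical entry name
      tags:
      - llm
      - vllm
      - gpu
      overrides:
        parameters:
          # any Hugging Face model id that vLLM can serve; placeholder value
          model: org/my-model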