LocalAI/gallery/hermes-vllm.yaml

---
name: "hermes-vllm"

config_file: |
    backend: vllm
    parameters:
      max_tokens: 8192
    context_size: 8192
    stopwords:
    - "<|im_end|>"
    - "<dummy32000>"
    - "<|eot_id|>"
    - "<|end_of_text|>"
    function:
      disable_no_action: true
      grammar:
        # Uncomment the line below to enable grammar matching for JSON results if the model is breaking
        # the output. This will make the model more accurate and won't break the JSON output.
        # This however, will make parallel_calls not functional (it is a known bug)
        # mixed_mode: true
        disable: true
        parallel_calls: true
        expect_strings_after_json: true
      json_regex_match:
      - "(?s)<tool_call>(.*?)</tool_call>"
      - "(?s)<tool_call>(.*)"
      capture_llm_results:
        - (?s)<scratchpad>(.*?)</scratchpad>
      replace_llm_results:
        - key: (?s)<scratchpad>(.*?)</scratchpad>
          value: ""

    template:
      use_tokenizer_template: true
      chat: |
        {{.Input -}}
        <|im_start|>assistant
      chat_message: |
        <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
        {{- if .FunctionCall }}
        <tool_call>
        {{- else if eq .RoleName "tool" }}
        <tool_response>
        {{- end }}
        {{- if .Content}}
        {{.Content }}
        {{- end }}
        {{- if .FunctionCall}}
        {{toJson .FunctionCall}}
        {{- end }}
        {{- if .FunctionCall }}
        </tool_call>
        {{- else if eq .RoleName "tool" }}
        </tool_response>
        {{- end }}<|im_end|>
      completion: |
        {{.Input}}
      function: |
        <|im_start|>system
        You are a function calling AI model.
        Here are the available tools:
        <tools>
        {{range .Functions}}
        {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
        {{end}}
        </tools>
        You should call the tools provided to you sequentially
        Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
        <scratchpad>
        {step-by-step reasoning and plan in bullet points}
        </scratchpad>
        For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
        <tool_call>
        {"arguments": <args-dict>, "name": <function-name>}
        </tool_call><|im_end|>
        {{.Input -}}
        <|im_start|>assistant
# Uncomment to specify a quantization method (optional)
# quantization: "awq"
# Uncomment to limit the GPU memory utilization (vLLM default is 0.9 for 90%)
# gpu_memory_utilization: 0.5
# Uncomment to trust remote code from huggingface
# trust_remote_code: true
# Uncomment to enable eager execution
# enforce_eager: true
# Uncomment to specify the size of the CPU swap space per GPU (in GiB)
# swap_space: 2
# Uncomment to specify the maximum length of a sequence (including prompt and output)
# max_model_len: 32768
# Uncomment and specify the number of Tensor divisions.
# Allows you to partition and run large models. Performance gains are limited.
# https://github.com/vllm-project/vllm/issues/1435
# tensor_parallel_size: 2
models(gallery): add hermes-3-llama-3.1(8B,70B,405B) with vLLM (#3360) models(gallery): add hermes-3-llama-3.1 with vLLM it adds 8b, 70b and 405b to the gallery Signed-off-by: Ettore Di Giacinto <mudler@localai.io> 2024-08-23 07:24:34 +00:00			`---`
			`name: "hermes-vllm"`

			`config_file: \|`
			`backend: vllm`
chore(model-gallery): add more quants for popular models (#3365) * models(gallery): add higher quants for some llama and hermes Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * models(gallery): vllm: specify a reasonable max_tokens Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> 2024-08-23 22:29:24 +00:00			`parameters:`
			`max_tokens: 8192`
models(gallery): add hermes-3-llama-3.1(8B,70B,405B) with vLLM (#3360) models(gallery): add hermes-3-llama-3.1 with vLLM it adds 8b, 70b and 405b to the gallery Signed-off-by: Ettore Di Giacinto <mudler@localai.io> 2024-08-23 07:24:34 +00:00			`context_size: 8192`
			`stopwords:`
			`- "<\|im_end\|>"`
			`- "<dummy32000>"`
			`- "<\|eot_id\|>"`
			`- "<\|end_of_text\|>"`
			`function:`
			`disable_no_action: true`
			`grammar:`
			`# Uncomment the line below to enable grammar matching for JSON results if the model is breaking`
			`# the output. This will make the model more accurate and won't break the JSON output.`
			`# This however, will make parallel_calls not functional (it is a known bug)`
			`# mixed_mode: true`
			`disable: true`
			`parallel_calls: true`
			`expect_strings_after_json: true`
			`json_regex_match:`
			`- "(?s)<tool_call>(.*?)</tool_call>"`
			`- "(?s)<tool_call>(.*)"`
			`capture_llm_results:`
			`- (?s)<scratchpad>(.*?)</scratchpad>`
			`replace_llm_results:`
			`- key: (?s)<scratchpad>(.*?)</scratchpad>`
			`value: ""`

			`template:`
			`use_tokenizer_template: true`
			`chat: \|`
			`{{.Input -}}`
			`<\|im_start\|>assistant`
			`chat_message: \|`
			`<\|im_start\|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}`
			`{{- if .FunctionCall }}`
			`<tool_call>`
			`{{- else if eq .RoleName "tool" }}`
			`<tool_response>`
			`{{- end }}`
			`{{- if .Content}}`
			`{{.Content }}`
			`{{- end }}`
			`{{- if .FunctionCall}}`
			`{{toJson .FunctionCall}}`
			`{{- end }}`
			`{{- if .FunctionCall }}`
			`</tool_call>`
			`{{- else if eq .RoleName "tool" }}`
			`</tool_response>`
			`{{- end }}<\|im_end\|>`
			`completion: \|`
			`{{.Input}}`
			`function: \|`
			`<\|im_start\|>system`
			`You are a function calling AI model.`
			`Here are the available tools:`
			`<tools>`
			`{{range .Functions}}`
			`{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}`
			`{{end}}`
			`</tools>`
			`You should call the tools provided to you sequentially`
			`Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:`
			`<scratchpad>`
			`{step-by-step reasoning and plan in bullet points}`
			`</scratchpad>`
			`For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:`
			`<tool_call>`
			`{"arguments": <args-dict>, "name": <function-name>}`
			`</tool_call><\|im_end\|>`
			`{{.Input -}}`
			`<\|im_start\|>assistant`
			`# Uncomment to specify a quantization method (optional)`
			`# quantization: "awq"`
			`# Uncomment to limit the GPU memory utilization (vLLM default is 0.9 for 90%)`
			`# gpu_memory_utilization: 0.5`
			`# Uncomment to trust remote code from huggingface`
			`# trust_remote_code: true`
			`# Uncomment to enable eager execution`
			`# enforce_eager: true`
			`# Uncomment to specify the size of the CPU swap space per GPU (in GiB)`
			`# swap_space: 2`
			`# Uncomment to specify the maximum length of a sequence (including prompt and output)`
			`# max_model_len: 32768`
			`# Uncomment and specify the number of Tensor divisions.`
			`# Allows you to partition and run large models. Performance gains are limited.`
			`# https://github.com/vllm-project/vllm/issues/1435`
			`# tensor_parallel_size: 2`