# LocalAI/gallery/vllm.yaml

---
# Name of this entry.
name: vllm

# Literal block scalar (`|`): the indented text below is the string value of
# config_file. Its "#" lines are part of that string, not comments of this
# document; the common leading indentation is stripped from every line.
config_file: |
  context_size: 8192
  parameters:
    max_tokens: 8192
  backend: vllm
  function:
    disable_no_action: true
    grammar:
      disable: true
      parallel_calls: true
      expect_strings_after_json: true
  template:
    use_tokenizer_template: true
  # Uncomment to specify a quantization method (optional)
  # quantization: "awq"
  # Uncomment to limit the GPU memory utilization (vLLM default is 0.9 for 90%)
  # gpu_memory_utilization: 0.5
  # Uncomment to trust remote code from huggingface
  # trust_remote_code: true
  # Uncomment to enable eager execution
  # enforce_eager: true
  # Uncomment to specify the size of the CPU swap space per GPU (in GiB)
  # swap_space: 2
  # Uncomment to specify the maximum length of a sequence (including prompt and output)
  # max_model_len: 32768
  # Uncomment and specify the number of Tensor divisions.
  # Allows you to partition and run large models. Performance gains are limited.
  # https://github.com/vllm-project/vllm/issues/1435
  # tensor_parallel_size: 2