feat: allow to specify a reply prefix (#4931)
Some checks are pending
Explorer deployment / build-linux (push) Waiting to run
GPU tests / ubuntu-latest (1.21.x) (push) Waiting to run
generate and publish intel docker caches / generate_caches (intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04, linux/amd64, ubuntu-latest) (push) Waiting to run
build container images / hipblas-jobs (-aio-gpu-hipblas, rocm/dev-ubuntu-22.04:6.1, hipblas, true, ubuntu:22.04, extras, latest-gpu-hipblas, latest-aio-gpu-hipblas, --jobs=3 --output-sync=target, linux/amd64, arc-runner-set, auto, -hipblas-ffmpeg) (push) Waiting to run
build container images / hipblas-jobs (rocm/dev-ubuntu-22.04:6.1, hipblas, false, ubuntu:22.04, core, --jobs=3 --output-sync=target, linux/amd64, arc-runner-set, false, -hipblas-core) (push) Waiting to run
build container images / hipblas-jobs (rocm/dev-ubuntu-22.04:6.1, hipblas, false, ubuntu:22.04, extras, --jobs=3 --output-sync=target, linux/amd64, arc-runner-set, false, -hipblas) (push) Waiting to run
build container images / hipblas-jobs (rocm/dev-ubuntu-22.04:6.1, hipblas, true, ubuntu:22.04, core, --jobs=3 --output-sync=target, linux/amd64, arc-runner-set, false, -hipblas-ffmpeg-core) (push) Waiting to run
build container images / self-hosted-jobs (-aio-gpu-intel-f16, quay.io/go-skynet/intel-oneapi-base:latest, sycl_f16, true, ubuntu:22.04, extras, latest-gpu-intel-f16, latest-aio-gpu-intel-f16, --jobs=3 --output-sync=target, linux/amd64, arc-runner-set, auto, -sycl-f16-ffmpeg) (push) Waiting to run
build container images / self-hosted-jobs (-aio-gpu-intel-f32, quay.io/go-skynet/intel-oneapi-base:latest, sycl_f32, true, ubuntu:22.04, extras, latest-gpu-intel-f32, latest-aio-gpu-intel-f32, --jobs=3 --output-sync=target, linux/amd64, arc-runner-set, auto, -sycl-f32-ffmpeg) (push) Waiting to run
build container images / self-hosted-jobs (-aio-gpu-nvidia-cuda-11, ubuntu:22.04, cublas, 11, 7, true, extras, latest-gpu-nvidia-cuda-11, latest-aio-gpu-nvidia-cuda-11, --jobs=3 --output-sync=target, linux/amd64, arc-runner-set, auto, -cublas-cuda11-ffmpeg) (push) Waiting to run
build container images / self-hosted-jobs (-aio-gpu-nvidia-cuda-12, ubuntu:22.04, cublas, 12, 0, true, extras, latest-gpu-nvidia-cuda-12, latest-aio-gpu-nvidia-cuda-12, --jobs=3 --output-sync=target, linux/amd64, arc-runner-set, auto, -cublas-cuda12-ffmpeg) (push) Waiting to run
build container images / self-hosted-jobs (quay.io/go-skynet/intel-oneapi-base:latest, sycl_f16, false, ubuntu:22.04, core, --jobs=3 --output-sync=target, linux/amd64, arc-runner-set, false, -sycl-f16-core) (push) Waiting to run
build container images / self-hosted-jobs (quay.io/go-skynet/intel-oneapi-base:latest, sycl_f16, true, ubuntu:22.04, core, --jobs=3 --output-sync=target, linux/amd64, arc-runner-set, false, -sycl-f16-ffmpeg-core) (push) Waiting to run
build container images / self-hosted-jobs (quay.io/go-skynet/intel-oneapi-base:latest, sycl_f32, false, ubuntu:22.04, core, --jobs=3 --output-sync=target, linux/amd64, arc-runner-set, false, -sycl-f32-core) (push) Waiting to run
build container images / self-hosted-jobs (quay.io/go-skynet/intel-oneapi-base:latest, sycl_f32, true, ubuntu:22.04, core, --jobs=3 --output-sync=target, linux/amd64, arc-runner-set, false, -sycl-f32-ffmpeg-core) (push) Waiting to run
build container images / self-hosted-jobs (ubuntu:22.04, , , extras, --jobs=3 --output-sync=target, linux/amd64, arc-runner-set, auto, ) (push) Waiting to run
build container images / self-hosted-jobs (ubuntu:22.04, , true, extras, --jobs=3 --output-sync=target, linux/amd64, arc-runner-set, auto, -ffmpeg) (push) Waiting to run
build container images / self-hosted-jobs (ubuntu:22.04, cublas, 11, 7, , extras, --jobs=3 --output-sync=target, linux/amd64, arc-runner-set, false, -cublas-cuda11) (push) Waiting to run
build container images / self-hosted-jobs (ubuntu:22.04, cublas, 12, 0, , extras, --jobs=3 --output-sync=target, linux/amd64, arc-runner-set, false, -cublas-cuda12) (push) Waiting to run
build container images / core-image-build (-aio-cpu, ubuntu:22.04, , true, core, latest-cpu, latest-aio-cpu, --jobs=4 --output-sync=target, linux/amd64,linux/arm64, arc-runner-set, false, auto, -ffmpeg-core) (push) Waiting to run
build container images / core-image-build (ubuntu:22.04, cublas, 11, 7, , core, --jobs=4 --output-sync=target, linux/amd64, arc-runner-set, false, false, -cublas-cuda11-core) (push) Waiting to run
build container images / core-image-build (ubuntu:22.04, cublas, 11, 7, true, core, --jobs=4 --output-sync=target, linux/amd64, arc-runner-set, false, false, -cublas-cuda11-ffmpeg-core) (push) Waiting to run
build container images / core-image-build (ubuntu:22.04, cublas, 12, 0, , core, --jobs=4 --output-sync=target, linux/amd64, arc-runner-set, false, false, -cublas-cuda12-core) (push) Waiting to run
build container images / core-image-build (ubuntu:22.04, cublas, 12, 0, true, core, --jobs=4 --output-sync=target, linux/amd64, arc-runner-set, false, false, -cublas-cuda12-ffmpeg-core) (push) Waiting to run
build container images / core-image-build (ubuntu:22.04, vulkan, true, core, latest-vulkan-ffmpeg-core, --jobs=4 --output-sync=target, linux/amd64, arc-runner-set, false, false, -vulkan-ffmpeg-core) (push) Waiting to run
build container images / gh-runner (nvcr.io/nvidia/l4t-jetpack:r36.4.0, cublas, 12, 0, true, core, latest-nvidia-l4t-arm64-core, --jobs=4 --output-sync=target, linux/arm64, ubuntu-24.04-arm, true, false, -nvidia-l4t-arm64-core) (push) Waiting to run
Security Scan / tests (push) Waiting to run
Tests extras backends / tests-transformers (push) Waiting to run
Tests extras backends / tests-rerankers (push) Waiting to run
Tests extras backends / tests-diffusers (push) Waiting to run
Tests extras backends / tests-coqui (push) Waiting to run
tests / tests-linux (1.21.x) (push) Waiting to run
tests / tests-aio-container (push) Waiting to run
tests / tests-apple (1.21.x) (push) Waiting to run

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto 2025-03-02 16:07:32 +01:00 committed by GitHub
parent ff85f01459
commit a7b4001b75
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 38 additions and 26 deletions

View File

@ -116,6 +116,11 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
}
if tokenCallback != nil {
if c.TemplateConfig.ReplyPrefix != "" {
tokenCallback(c.TemplateConfig.ReplyPrefix, tokenUsage)
}
ss := ""
var partialRune []byte
@ -165,8 +170,13 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
tokenUsage.TimingTokenGeneration = reply.TimingTokenGeneration
tokenUsage.TimingPromptProcessing = reply.TimingPromptProcessing
response := string(reply.Message)
if c.TemplateConfig.ReplyPrefix != "" {
response = c.TemplateConfig.ReplyPrefix + response
}
return LLMResponse{
Response: string(reply.Message),
Response: response,
Usage: tokenUsage,
}, err
}

View File

@ -130,28 +130,28 @@ type LLMConfig struct {
TrimSpace []string `yaml:"trimspace"`
TrimSuffix []string `yaml:"trimsuffix"`
ContextSize *int `yaml:"context_size"`
NUMA bool `yaml:"numa"`
LoraAdapter string `yaml:"lora_adapter"`
LoraBase string `yaml:"lora_base"`
LoraAdapters []string `yaml:"lora_adapters"`
LoraScales []float32 `yaml:"lora_scales"`
LoraScale float32 `yaml:"lora_scale"`
NoMulMatQ bool `yaml:"no_mulmatq"`
DraftModel string `yaml:"draft_model"`
NDraft int32 `yaml:"n_draft"`
Quantization string `yaml:"quantization"`
LoadFormat string `yaml:"load_format"`
GPUMemoryUtilization float32 `yaml:"gpu_memory_utilization"` // vLLM
TrustRemoteCode bool `yaml:"trust_remote_code"` // vLLM
EnforceEager bool `yaml:"enforce_eager"` // vLLM
SwapSpace int `yaml:"swap_space"` // vLLM
MaxModelLen int `yaml:"max_model_len"` // vLLM
TensorParallelSize int `yaml:"tensor_parallel_size"` // vLLM
DisableLogStatus bool `yaml:"disable_log_stats"` // vLLM
DType string `yaml:"dtype"` // vLLM
LimitMMPerPrompt LimitMMPerPrompt `yaml:"limit_mm_per_prompt"` // vLLM
MMProj string `yaml:"mmproj"`
ContextSize *int `yaml:"context_size"`
NUMA bool `yaml:"numa"`
LoraAdapter string `yaml:"lora_adapter"`
LoraBase string `yaml:"lora_base"`
LoraAdapters []string `yaml:"lora_adapters"`
LoraScales []float32 `yaml:"lora_scales"`
LoraScale float32 `yaml:"lora_scale"`
NoMulMatQ bool `yaml:"no_mulmatq"`
DraftModel string `yaml:"draft_model"`
NDraft int32 `yaml:"n_draft"`
Quantization string `yaml:"quantization"`
LoadFormat string `yaml:"load_format"`
GPUMemoryUtilization float32 `yaml:"gpu_memory_utilization"` // vLLM
TrustRemoteCode bool `yaml:"trust_remote_code"` // vLLM
EnforceEager bool `yaml:"enforce_eager"` // vLLM
SwapSpace int `yaml:"swap_space"` // vLLM
MaxModelLen int `yaml:"max_model_len"` // vLLM
TensorParallelSize int `yaml:"tensor_parallel_size"` // vLLM
DisableLogStatus bool `yaml:"disable_log_stats"` // vLLM
DType string `yaml:"dtype"` // vLLM
LimitMMPerPrompt LimitMMPerPrompt `yaml:"limit_mm_per_prompt"` // vLLM
MMProj string `yaml:"mmproj"`
FlashAttention bool `yaml:"flash_attention"`
NoKVOffloading bool `yaml:"no_kv_offloading"`
@ -171,9 +171,9 @@ type LLMConfig struct {
// LimitMMPerPrompt is a struct that holds the configuration for the limit-mm-per-prompt config in vLLM
type LimitMMPerPrompt struct {
LimitImagePerPrompt int `yaml:"image"`
LimitVideoPerPrompt int `yaml:"video"`
LimitAudioPerPrompt int `yaml:"audio"`
LimitImagePerPrompt int `yaml:"image"`
LimitVideoPerPrompt int `yaml:"video"`
LimitAudioPerPrompt int `yaml:"audio"`
}
// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
@ -213,6 +213,8 @@ type TemplateConfig struct {
Multimodal string `yaml:"multimodal"`
JinjaTemplate bool `yaml:"jinja_template"`
ReplyPrefix string `yaml:"reply_prefix"`
}
func (c *BackendConfig) UnmarshalYAML(value *yaml.Node) error {