From d4c1746c7db3d13ba97bb9d8a8b698d8a366a0a7 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 6 Dec 2024 10:23:59 +0100 Subject: [PATCH] feat(llama.cpp): expose cache_type_k and cache_type_v for quant of kv cache (#4329) Signed-off-by: Ettore Di Giacinto --- backend/backend.proto | 3 +++ backend/cpp/llama/grpc-server.cpp | 6 ++++++ core/backend/options.go | 2 ++ core/config/backend_config.go | 6 ++++-- 4 files changed, 15 insertions(+), 2 deletions(-) diff --git a/backend/backend.proto b/backend/backend.proto index 48b0101b..0a341ca2 100644 --- a/backend/backend.proto +++ b/backend/backend.proto @@ -242,6 +242,9 @@ message ModelOptions { repeated float LoraScales = 61; repeated string Options = 62; + + string CacheTypeKey = 63; + string CacheTypeValue = 64; } message Result { diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp index 0fde74cb..ea5c4e34 100644 --- a/backend/cpp/llama/grpc-server.cpp +++ b/backend/cpp/llama/grpc-server.cpp @@ -2241,6 +2241,12 @@ static void params_parse(const backend::ModelOptions* request, } // params.model_alias ?? params.model_alias = request->modelfile(); + if (!request->cachetypekey().empty()) { + params.cache_type_k = request->cachetypekey(); + } + if (!request->cachetypevalue().empty()) { + params.cache_type_v = request->cachetypevalue(); + } params.n_ctx = request->contextsize(); //params.memory_f16 = request->f16memory(); params.cpuparams.n_threads = request->threads(); diff --git a/core/backend/options.go b/core/backend/options.go index 1f88122f..f6247c60 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -151,6 +151,8 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions { TensorParallelSize: int32(c.TensorParallelSize), MMProj: c.MMProj, FlashAttention: c.FlashAttention, + CacheTypeKey: c.CacheTypeK, + CacheTypeValue: c.CacheTypeV, NoKVOffload: c.NoKVOffloading, YarnExtFactor: c.YarnExtFactor, YarnAttnFactor: c.YarnAttnFactor, diff --git a/core/config/backend_config.go b/core/config/backend_config.go index 1de540f9..0ff34769 100644 --- a/core/config/backend_config.go +++ b/core/config/backend_config.go @@ -155,8 +155,10 @@ type LLMConfig struct { TensorParallelSize int `yaml:"tensor_parallel_size"` // vLLM MMProj string `yaml:"mmproj"` - FlashAttention bool `yaml:"flash_attention"` - NoKVOffloading bool `yaml:"no_kv_offloading"` + FlashAttention bool `yaml:"flash_attention"` + NoKVOffloading bool `yaml:"no_kv_offloading"` + CacheTypeK string `yaml:"cache_type_k"` + CacheTypeV string `yaml:"cache_type_v"` RopeScaling string `yaml:"rope_scaling"` ModelType string `yaml:"type"`