mirror of
https://github.com/mudler/LocalAI.git
synced 2025-06-21 00:04:59 +00:00
feat(llama.cpp): expose cache_type_k and cache_type_v for quant of kv cache (#4329)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
committed by
GitHub
parent
88737e1d76
commit
d4c1746c7d
@ -151,6 +151,8 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
|
||||
TensorParallelSize: int32(c.TensorParallelSize),
|
||||
MMProj: c.MMProj,
|
||||
FlashAttention: c.FlashAttention,
|
||||
CacheTypeKey: c.CacheTypeK,
|
||||
CacheTypeValue: c.CacheTypeV,
|
||||
NoKVOffload: c.NoKVOffloading,
|
||||
YarnExtFactor: c.YarnExtFactor,
|
||||
YarnAttnFactor: c.YarnAttnFactor,
|
||||
|
Reference in New Issue
Block a user