Repository: https://github.com/mudler/LocalAI.git
feat(llama.cpp): expose cache_type_k and cache_type_v for quant of kv cache (#4329)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
commit d4c1746c7d
parent 88737e1d76
@@ -242,6 +242,9 @@ message ModelOptions {
   repeated float LoraScales = 61;
 
   repeated string Options = 62;
+
+  string CacheTypeKey = 63;
+  string CacheTypeValue = 64;
 }
 
 message Result {
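On the wire, the two new fields ride along on the usual ModelOptions message. A minimal client-side sketch, assuming the generated stubs are imported as pb from github.com/mudler/LocalAI/pkg/grpc/proto (the import path is an assumption; the field names come straight from the message above):

package main

import (
	"fmt"

	// Assumed import path for LocalAI's generated gRPC stubs.
	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)

func main() {
	// "q8_0" is one of the quantization type names llama.cpp accepts for
	// the KV cache; leaving the fields empty keeps the backend default.
	opts := &pb.ModelOptions{
		Model:          "model.gguf",
		CacheTypeKey:   "q8_0",
		CacheTypeValue: "q8_0",
	}
	fmt.Println(opts.CacheTypeKey, opts.CacheTypeValue)
}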
@@ -2241,6 +2241,12 @@ static void params_parse(const backend::ModelOptions* request,
     }
     // params.model_alias ??
     params.model_alias = request->modelfile();
+    if (!request->cachetypekey().empty()) {
+        params.cache_type_k = request->cachetypekey();
+    }
+    if (!request->cachetypevalue().empty()) {
+        params.cache_type_v = request->cachetypevalue();
+    }
     params.n_ctx = request->contextsize();
     //params.memory_f16 = request->f16memory();
     params.cpuparams.n_threads = request->threads();
@@ -151,6 +151,8 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		TensorParallelSize: int32(c.TensorParallelSize),
 		MMProj:             c.MMProj,
 		FlashAttention:     c.FlashAttention,
+		CacheTypeKey:       c.CacheTypeK,
+		CacheTypeValue:     c.CacheTypeV,
 		NoKVOffload:        c.NoKVOffloading,
 		YarnExtFactor:      c.YarnExtFactor,
 		YarnAttnFactor:     c.YarnAttnFactor,
@@ -155,8 +155,10 @@ type LLMConfig struct {
 	TensorParallelSize int    `yaml:"tensor_parallel_size"` // vLLM
 	MMProj             string `yaml:"mmproj"`
 
 	FlashAttention bool   `yaml:"flash_attention"`
 	NoKVOffloading bool   `yaml:"no_kv_offloading"`
+	CacheTypeK     string `yaml:"cache_type_k"`
+	CacheTypeV     string `yaml:"cache_type_v"`
 
 	RopeScaling string `yaml:"rope_scaling"`
 	ModelType   string `yaml:"type"`
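End to end, these changes let a model's YAML config opt into KV-cache quantization: the new keys flow from LLMConfig through grpcModelOpts into the proto request, and from there into llama.cpp's params. A minimal sketch of a config exercising the new keys, parsed with the same yaml tags as the struct above (the values are the quantization type names llama.cpp accepts, e.g. f16, q8_0, q4_0; llama.cpp generally requires flash attention to quantize the V cache, hence flash_attention: true here):

package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// Illustrative subset of LLMConfig; the yaml tags match the struct in this diff.
type llmConfig struct {
	FlashAttention bool   `yaml:"flash_attention"`
	CacheTypeK     string `yaml:"cache_type_k"`
	CacheTypeV     string `yaml:"cache_type_v"`
}

func main() {
	// Example model config: quantize both halves of the KV cache to q8_0.
	doc := `
flash_attention: true
cache_type_k: q8_0
cache_type_v: q8_0
`
	var cfg llmConfig
	if err := yaml.Unmarshal([]byte(doc), &cfg); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", cfg) // {FlashAttention:true CacheTypeK:q8_0 CacheTypeV:q8_0}
}

Leaving cache_type_k and cache_type_v unset keeps llama.cpp's default (f16), thanks to the empty-string guards in params_parse above.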