fix(llama.cpp): enable cont batching when parallel is set (#1622)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
commit 697c769b64 (parent 94261b1717)
Author: Ettore Di Giacinto <mudler@localai.io>
Date: 2024-01-21 14:59:48 +01:00, committed by GitHub

@@ -2465,10 +2465,10 @@ static void params_parse(const backend::ModelOptions* request,
 const char *env_parallel = std::getenv("LLAMACPP_PARALLEL");
 if (env_parallel != NULL) {
     params.n_parallel = std::stoi(env_parallel);
+    params.cont_batching = true;
 } else {
     params.n_parallel = 1;
 }
-params.cont_batching = true;
 // TODO: Add yarn

 if (!request->tensorsplit().empty()) {
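
The hunk moves `params.cont_batching = true;` from an unconditional assignment after the branch into the `LLAMACPP_PARALLEL` branch, so continuous batching is only enabled when the user actually requests parallel sequences. A minimal standalone sketch of the resulting logic follows; the `Params` struct, `parse_parallel` helper, and `main` are hypothetical stand-ins for the backend's real parameter struct and `params_parse`, and only the `LLAMACPP_PARALLEL` variable name and the two assignments come from the diff.

// Hedged sketch, not the actual backend source: Params, parse_parallel,
// and main are hypothetical; only LLAMACPP_PARALLEL and the two
// assignments mirror the patched hunk.
#include <cstdlib>   // std::getenv
#include <iostream>
#include <string>    // std::stoi

struct Params {
    int  n_parallel    = 1;      // number of parallel decode sequences
    bool cont_batching = false;  // continuous batching flag
};

// Same branch structure as the patch: cont_batching is enabled only
// together with a user-requested parallel slot count.
void parse_parallel(Params &params) {
    const char *env_parallel = std::getenv("LLAMACPP_PARALLEL");
    if (env_parallel != NULL) {
        params.n_parallel    = std::stoi(env_parallel);
        params.cont_batching = true;
    } else {
        params.n_parallel = 1;
    }
}

int main() {
    Params params;
    parse_parallel(params);  // try: LLAMACPP_PARALLEL=4 ./a.out
    std::cout << "n_parallel=" << params.n_parallel
              << " cont_batching=" << params.cont_batching << std::endl;
    return 0;
}

As the commit title suggests, the design rationale appears to be that continuous batching only matters when several sequences are decoded concurrently, so single-sequence deployments now keep it off by default.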