mirror of
https://github.com/mudler/LocalAI.git
synced 2024-12-18 20:27:57 +00:00
feat(llama.cpp): Bump llama.cpp, adapt grpc server (#1211)
* feat(llama.cpp): Bump llama.cpp, adapt grpc server

  Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* ci: fixups

  Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
parent
c132dbadce
commit
f227e918f9
3
.github/workflows/test-gpu.yml
vendored
3
.github/workflows/test-gpu.yml
vendored
@ -40,6 +40,8 @@ jobs:
|
|||||||
if [ ! -e /run/systemd/system ]; then
|
if [ ! -e /run/systemd/system ]; then
|
||||||
sudo mkdir /run/systemd/system
|
sudo mkdir /run/systemd/system
|
||||||
fi
|
fi
|
||||||
|
sudo mkdir -p /host/tests/${{ github.head_ref || github.ref }}
|
||||||
|
sudo chmod -R 777 /host/tests/${{ github.head_ref || github.ref }}
|
||||||
make \
|
make \
|
||||||
TEST_DIR="/host/tests/${{ github.head_ref || github.ref }}" \
|
TEST_DIR="/host/tests/${{ github.head_ref || github.ref }}" \
|
||||||
BUILD_TYPE=cublas \
|
BUILD_TYPE=cublas \
|
||||||
@ -57,4 +59,5 @@ jobs:
|
|||||||
make \
|
make \
|
||||||
TEST_DIR="/host/tests/${{ github.head_ref || github.ref }}" \
|
TEST_DIR="/host/tests/${{ github.head_ref || github.ref }}" \
|
||||||
teardown-e2e || true
|
teardown-e2e || true
|
||||||
|
sudo rm -rf /host/tests/${{ github.head_ref || github.ref }}
|
||||||
docker system prune -f -a --volumes || true
|
docker system prune -f -a --volumes || true
|
2
Makefile
2
Makefile
@ -8,7 +8,7 @@ GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0
|
|||||||
|
|
||||||
GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
|
GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
|
||||||
|
|
||||||
CPPLLAMA_VERSION?=96981f37b1e3f450d9e63e571514217bf60f0a7f
|
CPPLLAMA_VERSION?=9d02956443e5c1ded29b7b5ed8a21bc01ba6f563
|
||||||
|
|
||||||
# gpt4all version
|
# gpt4all version
|
||||||
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
|
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
|
||||||
|
@ -275,11 +275,11 @@ struct llama_server_context
|
|||||||
if (suff_rm_leading_spc && suffix_tokens[0] == space_token) {
|
if (suff_rm_leading_spc && suffix_tokens[0] == space_token) {
|
||||||
suffix_tokens.erase(suffix_tokens.begin());
|
suffix_tokens.erase(suffix_tokens.begin());
|
||||||
}
|
}
|
||||||
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
|
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
|
||||||
prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(ctx)); // always add BOS
|
prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
|
||||||
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
|
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
|
||||||
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
|
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
|
||||||
prefix_tokens.push_back(llama_token_middle(ctx));
|
prefix_tokens.push_back(llama_token_middle(model));
|
||||||
|
|
||||||
auto prompt_tokens = prefix_tokens;
|
auto prompt_tokens = prefix_tokens;
|
||||||
|
|
||||||
@ -419,7 +419,7 @@ struct llama_server_context
|
|||||||
if (params.n_predict == 0)
|
if (params.n_predict == 0)
|
||||||
{
|
{
|
||||||
has_next_token = false;
|
has_next_token = false;
|
||||||
result.tok = llama_token_eos(ctx);
|
result.tok = llama_token_eos(model);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -453,7 +453,7 @@ struct llama_server_context
|
|||||||
// decrement remaining sampling budget
|
// decrement remaining sampling budget
|
||||||
--n_remain;
|
--n_remain;
|
||||||
|
|
||||||
if (!embd.empty() && embd.back() == llama_token_eos(ctx))
|
if (!embd.empty() && embd.back() == llama_token_eos(model))
|
||||||
{
|
{
|
||||||
// stopping_word = llama_token_to_piece(ctx, embd.back());
|
// stopping_word = llama_token_to_piece(ctx, embd.back());
|
||||||
has_next_token = false;
|
has_next_token = false;
|
||||||
@ -594,7 +594,7 @@ static void parse_options_completion(bool streaming,const backend::PredictOption
|
|||||||
|
|
||||||
if (predict->ignoreeos())
|
if (predict->ignoreeos())
|
||||||
{
|
{
|
||||||
llama.params.sparams.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
|
llama.params.sparams.logit_bias[llama_token_eos(llama.model)] = -INFINITY;
|
||||||
}
|
}
|
||||||
|
|
||||||
// const auto &logit_bias = body.find("logit_bias");
|
// const auto &logit_bias = body.find("logit_bias");
|
||||||
@ -676,7 +676,7 @@ static void params_parse(const backend::ModelOptions* request,
|
|||||||
}
|
}
|
||||||
|
|
||||||
static bool is_at_eob(llama_server_context &server_context, const llama_token *tokens, const size_t n_tokens) {
|
static bool is_at_eob(llama_server_context &server_context, const llama_token *tokens, const size_t n_tokens) {
|
||||||
return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx);
|
return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.model);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Function matching type llama_beam_search_callback_fn_t.
|
// Function matching type llama_beam_search_callback_fn_t.
|
||||||
|
Loading…
Reference in New Issue
Block a user