Mirror of https://github.com/mudler/LocalAI.git
deps(llama.cpp): update, support Gemma models (#1734)

deps(llama.cpp): update

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
parent 54ec6348fa
commit 8292781045
Makefile:

@@ -8,7 +8,7 @@ GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0
 GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
-CPPLLAMA_VERSION?=9350a1cf21b1492c69b20175b73a419b897d6a3a
+CPPLLAMA_VERSION?=88c46cbdac05cebd936511b1d3c74112e721615f
 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
CMakeLists.txt (llama backend):

@@ -2,16 +2,20 @@
 ## XXX: In some versions of CMake clip wasn't being built before llama.
 ## This is an hack for now, but it should be fixed in the future.
 set(TARGET myclip)
-add_library(${TARGET} clip.cpp clip.h)
+add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
 install(TARGETS ${TARGET} LIBRARY)
-target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
+target_include_directories(myclip PUBLIC .)
+target_include_directories(myclip PUBLIC ../..)
+target_include_directories(myclip PUBLIC ../../common)
+target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if (NOT MSVC)
     target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
 endif()
+# END CLIP hack
+
 
 set(TARGET grpc-server)
-# END CLIP hack
 set(CMAKE_CXX_STANDARD 17)
 cmake_minimum_required(VERSION 3.15)
 set(TARGET grpc-server)
Makefile (llama backend):

@@ -45,6 +45,9 @@ llama.cpp/examples/grpc-server:
 ## XXX: In some versions of CMake clip wasn't being built before llama.
 ## This is an hack for now, but it should be fixed in the future.
 cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
+cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
+echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
+cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
 cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
 
 rebuild:
grpc-server.cpp (llama backend):

@@ -11,7 +11,8 @@
 #include <memory>
 #include <string>
 #include <getopt.h>
-#include "../llava/clip.h"
+#include "clip.h"
+#include "llava.h"
 #include "stb_image.h"
 #include "common.h"
 #include "json.hpp"
@@ -32,6 +33,7 @@
 #include <grpcpp/grpcpp.h>
 #include <grpcpp/health_check_service_interface.h>
 #include <atomic>
+#include <signal.h>
 
 using grpc::Server;
 using grpc::ServerBuilder;
@@ -51,10 +53,11 @@ struct server_params
     std::string hostname = "127.0.0.1";
     std::vector<std::string> api_keys;
     std::string public_path = "examples/server/public";
-    std::string chat_template = "chatml";
+    std::string chat_template = "";
     int32_t port = 8080;
     int32_t read_timeout = 600;
     int32_t write_timeout = 600;
+    bool slots_endpoint = true;
 };
 
 bool server_verbose = false;
@@ -173,6 +176,7 @@ struct llama_client_slot
     int32_t n_decoded = 0;
     int32_t n_remaining = -1;
     int32_t i_batch = -1;
+    int32_t n_predict = -1;
 
     int32_t num_prompt_tokens = 0;
     int32_t num_prompt_tokens_processed = 0;
@@ -424,6 +428,7 @@ struct llama_server_context
 
             slot.id = i;
             slot.n_ctx = n_ctx_slot;
+            slot.n_predict = params.n_predict;
 
             LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
 
@@ -451,10 +456,6 @@ struct llama_server_context
         default_generation_settings_for_props["seed"] = -1;
 
         batch = llama_batch_init(n_ctx, 0, params.n_parallel);
-
-        // empty system prompt
-        system_prompt = "";
-        system_tokens.clear();
     }
 
     std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
@@ -531,7 +532,7 @@
     bool launch_slot_with_data(llama_client_slot* &slot, json data) {
         slot_params default_params;
         llama_sampling_params default_sparams;
 
         slot->params.stream = json_value(data, "stream", false);
         slot->params.cache_prompt = json_value(data, "cache_prompt", false);
         slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
@@ -555,6 +556,16 @@ struct llama_server_context
         slot->params.seed = json_value(data, "seed", default_params.seed);
         slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
         slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
+        slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
+
+        if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
+            // Might be better to reject the request with a 400 ?
+            LOG_WARNING("Max tokens to predict exceeds server configuration", {
+                {"params.n_predict", slot->params.n_predict},
+                {"slot.n_predict", slot->n_predict},
+            });
+            slot->params.n_predict = slot->n_predict;
+        }
 
         // infill
         if (data.count("input_prefix") != 0)
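The hunk above reads optional request fields with server-side defaults and then clamps the requested n_predict to the per-slot limit instead of rejecting the request. A minimal standalone sketch of that pattern, using nlohmann::json directly (json_value() and LOG_WARNING are server helpers and are not used here; the cap value is a made-up example):

// sketch only: request-field defaulting and n_predict clamping
#include <nlohmann/json.hpp>
#include <iostream>

int main() {
    using json = nlohmann::json;
    const int slot_n_predict = 128;               // hypothetical server-side cap
    json data = json::parse(R"({"n_predict": 512, "min_keep": 5})");

    int n_predict = data.value("n_predict", -1);  // same role as json_value(data, "n_predict", ...)
    int min_keep  = data.value("min_keep", 0);

    if (slot_n_predict > 0 && n_predict > slot_n_predict) {
        n_predict = slot_n_predict;               // clamp rather than return a 400
    }
    std::cout << "n_predict=" << n_predict << " min_keep=" << min_keep << "\n";
    return 0;
}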
@@ -683,6 +694,24 @@ struct llama_server_context
             }
         }
 
+        const auto &samplers_sequence = data.find("samplers");
+        if (samplers_sequence != data.end() && samplers_sequence->is_array())
+        {
+            std::vector<std::string> sampler_names;
+            for (const auto &sampler_name : *samplers_sequence)
+            {
+                if (sampler_name.is_string())
+                {
+                    sampler_names.emplace_back(sampler_name);
+                }
+            }
+            slot->sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
+        }
+        else
+        {
+            slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
+        }
+
         if (multimodal)
         {
             const auto &images_data = data.find("image_data");
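This block lets a request choose the sampler order: string entries are collected and mapped to sampler types by sampler_types_from_names(), anything else is ignored. A hedged sketch of what such a request body could look like on the client side (the sampler names are the usual llama.cpp ones, e.g. "top_k"/"top_p"/"temperature", given here as an example):

// sketch only: building a completion request with a custom sampler order
#include <nlohmann/json.hpp>
#include <iostream>

int main() {
    nlohmann::json req = {
        {"prompt", "Hello"},
        {"n_predict", 32},
        {"samplers", {"top_k", "top_p", "temperature"}}  // applied in this order
    };
    std::cout << req.dump(2) << std::endl;
    return 0;
}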
@@ -772,27 +801,30 @@ struct llama_server_context
     }
 
     void update_system_prompt() {
-        system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
-
-        llama_batch_clear(batch);
-
         kv_cache_clear();
+        system_tokens.clear();
 
-        for (int i = 0; i < (int) system_tokens.size(); ++i)
-        {
-            llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
-        }
-
-        if (llama_decode(ctx, batch) != 0)
-        {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
-            return;
-        }
-
-        // assign the system KV cache to all parallel sequences
-        for (int32_t i = 1; i < params.n_parallel; ++i)
-        {
-            llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
+        if (!system_prompt.empty()) {
+            system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
+
+            llama_batch_clear(batch);
+
+            for (int i = 0; i < (int)system_tokens.size(); ++i)
+            {
+                llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
+            }
+
+            if (llama_decode(ctx, batch) != 0)
+            {
+                LOG_TEE("%s: llama_decode() failed\n", __func__);
+                return;
+            }
+
+            // assign the system KV cache to all parallel sequences
+            for (int32_t i = 1; i < params.n_parallel; ++i)
+            {
+                llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
+            }
         }
 
         LOG_TEE("system prompt updated\n");
@@ -814,10 +846,8 @@ struct llama_server_context
         name_user = sys_props.value("anti_prompt", "");
         name_assistant = sys_props.value("assistant_name", "");
 
-        if (slots.size() > 0)
-        {
-            notify_system_prompt_changed();
-        }
+        notify_system_prompt_changed();
     }
 
     static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
@@ -975,44 +1005,12 @@ struct llama_server_context
                 {
                     continue;
                 }
-                clip_image_f32_batch img_res_v;
-                img_res_v.size = 0;
-                img_res_v.data = nullptr;
-                if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v))
-                {
-                    LOG_TEE("Error processing the given image");
-                    clip_free(clp_ctx);
-                    clip_image_f32_batch_free(img_res_v);
-                    return false;
-                }
-                if (img_res_v.size == 0)
-                {
+
+                if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
                     LOG_TEE("Error processing the given image");
                     return false;
                 }
-
-                // note: assumes only one image was returned by clip_image_preprocess
-                clip_image_f32 * img_res = img_res_v.data;
-
-                img.image_tokens = clip_n_patches(clp_ctx);
-                img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx));
-                if (!img.image_embedding)
-                {
-                    LOG_TEE("Unable to allocate memory for image embeddings\n");
-                    clip_image_f32_batch_free(img_res_v);
-                    clip_free(clp_ctx);
-                    return false;
-                }
-                LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id);
-                if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding))
-                {
-                    LOG_TEE("Unable to encode image\n");
-                    clip_image_f32_batch_free(img_res_v);
-                    return false;
-                }
-
-                clip_image_f32_batch_free(img_res_v);
 
                 img.request_encode_image = false;
             }
 
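The old manual clip preprocess/encode/cleanup sequence is replaced by a single llava helper that allocates the embedding buffer and reports how many embedding positions the image uses. A hedged sketch of how that helper is typically called, as a standalone fragment rather than code from the commit (assumes clip.h/llava.h from llama.cpp's llava example are on the include path):

// sketch only: one-call image embedding via the llava helper
#include "clip.h"
#include "llava.h"
#include <cstdio>
#include <cstdlib>

static bool embed_image(clip_ctx * clp_ctx, clip_image_u8 * img, int n_threads) {
    float * image_embedding = nullptr;
    int     image_tokens    = 0;
    // preprocessing, encoding and buffer allocation now happen inside this call
    if (!llava_image_embed_make_with_clip_img(clp_ctx, n_threads, img,
                                              &image_embedding, &image_tokens)) {
        fprintf(stderr, "Error processing the given image\n");
        return false;
    }
    fprintf(stderr, "image encoded into %d embedding positions\n", image_tokens);
    free(image_embedding); // caller owns the buffer (the server keeps it in the slot instead)
    return true;
}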
@@ -1036,8 +1034,15 @@ struct llama_server_context
         const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
         const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
                                 eos_bias->second < 0.0f && std::isinf(eos_bias->second);
+        std::vector<std::string> samplers_sequence;
+        for (const auto &sampler_type : slot.sparams.samplers_sequence)
+        {
+            samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
+        }
+
         return json {
             {"n_ctx", slot.n_ctx},
+            {"n_predict", slot.n_predict},
             {"model", params.model_alias},
             {"seed", slot.params.seed},
             {"temperature", slot.sparams.temp},
@@ -1065,7 +1070,9 @@ struct llama_server_context
             {"stream", slot.params.stream},
             {"logit_bias", slot.sparams.logit_bias},
             {"n_probs", slot.sparams.n_probs},
+            {"min_keep", slot.sparams.min_keep},
             {"grammar", slot.sparams.grammar},
+            {"samplers", samplers_sequence}
         };
     }
 
@@ -1877,6 +1884,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
     }
 }
 
+std::function<void(int)> shutdown_handler;
+inline void signal_handler(int signal) { shutdown_handler(signal); }
+
 /////////////////////////////////
 ////////////////////////////////
 //////// LOCALAI code starts below here
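The new shutdown_handler/signal_handler pair gives the server a hook for graceful shutdown: the raw C handler just forwards the signal number to a std::function that can be swapped at runtime. A hedged sketch of how such a handler is commonly registered with sigaction, as a standalone example rather than code from the commit (real handlers should keep the work async-signal-safe, e.g. only flip a flag):

// sketch only: wiring a std::function-based shutdown handler to SIGINT
#include <csignal>
#include <cstdio>
#include <functional>
#include <unistd.h>

std::function<void(int)> shutdown_handler;
inline void signal_handler(int signal) { shutdown_handler(signal); }

int main() {
    volatile sig_atomic_t running = 1;
    shutdown_handler = [&](int) { running = 0; };  // flip a flag; keep handler work minimal

    struct sigaction sigint_action;
    sigint_action.sa_handler = signal_handler;
    sigemptyset(&sigint_action.sa_mask);
    sigint_action.sa_flags = 0;
    sigaction(SIGINT, &sigint_action, nullptr);

    while (running) {
        pause();                                   // returns once a signal was handled
    }
    printf("shutting down\n");
    return 0;
}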
@@ -2147,7 +2157,8 @@ public:
         gpt_params params;
         params_parse(request, params);
 
-        llama_backend_init(params.numa);
+        llama_backend_init();
+        llama_numa_init(params.numa);
 
         // load the model
         if (!llama.load_model(params))
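Upstream llama.cpp split backend initialization from NUMA setup: llama_backend_init() no longer takes a NUMA flag, and llama_numa_init() is called separately with a ggml_numa_strategy value (params.numa above). A hedged minimal sketch of the new startup/teardown sequence (the strategy value here is just an example):

// sketch only: the split init sequence after this llama.cpp update
#include "llama.h"

int main() {
    llama_backend_init();                          // no NUMA argument anymore
    llama_numa_init(GGML_NUMA_STRATEGY_DISABLED);  // NUMA policy set in its own call
    // ... load a model and serve requests ...
    llama_backend_free();
    return 0;
}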