// llama.cpp gRPC C++ backend server
//
// Ettore Di Giacinto
//
// This is a gRPC server for llama.cpp compatible with the LocalAI proto
// Note: this is a re-adaptation of the original llama.cpp example/server.cpp for HTTP,
// but modified to work with gRPC
//

#include <getopt.h>
#include <iostream>
#include <memory>
#include <mutex>
#include <sstream>
#include <string>
#include "common.h"
#include "llama.h"
#include "grammar-parser.h"
#include "backend.pb.h"
#include "backend.grpc.pb.h"

// include std::regex
#include <regex>
#include <grpcpp/ext/proto_server_reflection_plugin.h>
#include <grpcpp/grpcpp.h>
#include <grpcpp/health_check_service_interface.h>

using grpc::Server;
using grpc::ServerBuilder;
using grpc::ServerContext;
using grpc::Status;
using backend::HealthMessage;

// completion token output with probabilities
struct completion_token_output
{
    struct token_prob
    {
        llama_token tok;
        float prob;
    };

    std::vector<token_prob> probs;
    llama_token tok;
};

static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
{
    size_t i;
    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
    {
    }
    return i;
}

enum stop_type
{
    STOP_FULL,
    STOP_PARTIAL,
};

static bool ends_with(const std::string &str, const std::string &suffix)
{
    return str.size() >= suffix.size() &&
           0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
}

static size_t find_partial_stop_string(const std::string &stop,
                                       const std::string &text)
{
    if (!text.empty() && !stop.empty())
    {
        const char text_last_char = text.back();
        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
        {
            if (stop[char_index] == text_last_char)
            {
                const std::string current_partial = stop.substr(0, char_index + 1);
                if (ends_with(text, current_partial))
                {
                    return text.size() - char_index - 1;
                }
            }
        }
    }
    return std::string::npos;
}

template <class Iter>
static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
{
    std::string ret;
    for (; begin != end; ++begin)
    {
        ret += llama_token_to_piece(ctx, *begin);
    }
    return ret;
}

// format incomplete utf-8 multibyte character for output
static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
{
    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
    // if the size is 1 and first bit is 1, meaning it's a partial character
    // (size > 1 meaning it's already a known token)
    if (out.size() == 1 && (out[0] & 0x80) == 0x80)
    {
        std::stringstream ss;
        ss << std::hex << (out[0] & 0xff);
        std::string res(ss.str());
        out = "byte: \\x" + res;
    }
    return out;
}

struct llama_server_context
{
    bool stream = false;
    bool has_next_token = false;
    std::string generated_text;
    std::vector<completion_token_output> generated_token_probs;

    size_t num_prompt_tokens = 0;
    size_t num_tokens_predicted = 0;
    size_t n_past = 0;
    size_t n_remain = 0;

    std::vector<llama_token> embd;

    gpt_params params;

    llama_model *model = nullptr;
    llama_context *ctx = nullptr;
    llama_sampling_context *ctx_sampling = nullptr;

    int n_ctx;

    bool truncated = false;
    bool stopped_eos = false;
    bool stopped_word = false;
    bool stopped_limit = false;
    std::string stopping_word;
    int32_t multibyte_pending = 0;

    std::mutex mutex;

    std::unique_lock<std::mutex> lock()
    {
        return std::unique_lock<std::mutex>(mutex);
    }

    ~llama_server_context()
    {
        if (ctx)
        {
            llama_free(ctx);
            ctx = nullptr;
        }
        if (model)
        {
            llama_free_model(model);
            model = nullptr;
        }
    }

    void rewind()
    {
        params.antiprompt.clear();
        params.sparams.grammar.clear();
        num_prompt_tokens = 0;
        num_tokens_predicted = 0;
        generated_text = "";
        generated_text.reserve(n_ctx);
        generated_token_probs.clear();
        truncated = false;
        stopped_eos = false;
        stopped_word = false;
        stopped_limit = false;
        stopping_word = "";
        multibyte_pending = 0;
        n_remain = 0;
        n_past = 0;
        params.sparams.n_prev = n_ctx;
    }

    void initSampling()
    {
        if (ctx_sampling != nullptr)
        {
            llama_sampling_free(ctx_sampling);
        }
        ctx_sampling = llama_sampling_init(params.sparams);
    }

    bool loadModel(const gpt_params &params_)
    {
        params = params_;
        std::tie(model, ctx) = llama_init_from_gpt_params(params);
        if (model == nullptr)
        {
            return false;
        }
        n_ctx = llama_n_ctx(ctx);
        return true;
    }
    std::vector<llama_token> tokenize_string(const char *prompt, bool add_bos) const
    {
        // If `add_bos` is true, a BOS token is prepended to the tokenized prompt.
        std::vector<llama_token> prompt_tokens;
        auto s = std::string(prompt);
        prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
        return prompt_tokens;
    }

    std::vector<llama_token> tokenize_array(const char **prompts, bool add_bos) const
    {
        std::vector<llama_token> prompt_tokens;

        bool first = true;
        bool is_string = true;
        for (const char **p = prompts; *p != nullptr; ++p)
        {
            if (is_string)
            {
                auto s = std::string(*p);
                std::vector<llama_token> tokens;
                if (first)
                {
                    tokens = ::llama_tokenize(ctx, s, add_bos);
                    first = false;
                }
                else
                {
                    tokens = ::llama_tokenize(ctx, s, false);
                }
                prompt_tokens.insert(prompt_tokens.end(), tokens.begin(), tokens.end());
            }
            else
            {
                if (first)
                {
                    first = false;
                }
                // prompt_tokens.push_back(p.template get());
            }
        }

        return prompt_tokens;
    }

    void truncatePrompt(std::vector<llama_token> &prompt_tokens)
    {
        const int n_left = n_ctx - params.n_keep;
        const int n_block_size = n_left / 2;
        const int erased_blocks = (prompt_tokens.size() - params.n_keep - n_block_size) / n_block_size;

        // Keep n_keep tokens at start of prompt (at most n_ctx - 4)
        std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);

        new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());

        truncated = true;
        prompt_tokens = new_tokens;
    }
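    // Worked example for truncatePrompt() above (illustrative numbers only, not
    // taken from a real run): with n_ctx = 2048, params.n_keep = 256 and a prompt
    // of 3000 tokens, n_left = 1792, n_block_size = 896 and erased_blocks = 2, so
    // the first 256 tokens are kept, tokens [256, 2048) are dropped, and the
    // remaining 952 tail tokens are appended, giving 1208 tokens (< n_ctx).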
    void loadInfill()
    {
        bool suff_rm_leading_spc = true;
        if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1)
        {
            params.input_suffix.erase(0, 1);
            suff_rm_leading_spc = false;
        }

        auto prefix_tokens = tokenize_string(params.input_prefix.c_str(), false);
        auto suffix_tokens = tokenize_string(params.input_suffix.c_str(), false);
        const int space_token = 29871;
        if (suff_rm_leading_spc && suffix_tokens[0] == space_token)
        {
            suffix_tokens.erase(suffix_tokens.begin());
        }
        prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
        prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
        prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
        prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
        prefix_tokens.push_back(llama_token_middle(model));

        auto prompt_tokens = prefix_tokens;

        num_prompt_tokens = prompt_tokens.size();

        if (params.n_keep < 0)
        {
            params.n_keep = (int)num_prompt_tokens;
        }
        params.n_keep = std::min(params.n_ctx - 4, params.n_keep);

        // if input prompt is too big, truncate like normal
        if (num_prompt_tokens >= (size_t)n_ctx)
        {
            truncatePrompt(prompt_tokens);
            num_prompt_tokens = prompt_tokens.size();
            GGML_ASSERT(num_prompt_tokens < (size_t)n_ctx);
        }

        // push the prompt into the sampling context (do not apply grammar)
        for (auto &token : prompt_tokens)
        {
            llama_sampling_accept(ctx_sampling, ctx, token, false);
        }

        // compare the evaluated prompt with the new prompt
        n_past = common_part(embd, prompt_tokens);
        embd = prompt_tokens;

        if (n_past == num_prompt_tokens)
        {
            // we have to evaluate at least 1 token to generate logits.
            printf("we have to evaluate at least 1 token to generate logits\n");
            n_past--;
        }

        // since #3228 we now have to manually manage the KV cache
        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);

        has_next_token = true;
    }

    void loadPrompt(std::string prompt)
    {
        auto prompt_tokens = tokenize_string(prompt.c_str(), true); // always add BOS

        num_prompt_tokens = prompt_tokens.size();

        if (params.n_keep < 0)
        {
            params.n_keep = (int)num_prompt_tokens;
        }
        params.n_keep = std::min(n_ctx - 4, params.n_keep);

        // if input prompt is too big, truncate like normal
        if (num_prompt_tokens >= (size_t)n_ctx)
        {
            truncatePrompt(prompt_tokens);
            num_prompt_tokens = prompt_tokens.size();
            GGML_ASSERT(num_prompt_tokens < (size_t)n_ctx);
        }

        // push the prompt into the sampling context (do not apply grammar)
        for (auto &token : prompt_tokens)
        {
            llama_sampling_accept(ctx_sampling, ctx, token, false);
        }

        // compare the evaluated prompt with the new prompt
        n_past = common_part(embd, prompt_tokens);

        embd = prompt_tokens;
        if (n_past == num_prompt_tokens)
        {
            // we have to evaluate at least 1 token to generate logits.
            n_past--;
        }

        // since #3228 we now have to manually manage the KV cache
        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);

        has_next_token = true;
    }

    void beginCompletion()
    {
        // number of tokens to keep when resetting context
        n_remain = params.n_predict;
        llama_set_rng_seed(ctx, params.seed);
    }

    completion_token_output nextToken()
    {
        completion_token_output result;
        result.tok = -1;

        if (embd.size() >= (size_t)n_ctx)
        {
            // Shift context

            const int n_left    = n_past - params.n_keep - 1;
            const int n_discard = n_left / 2;

            llama_kv_cache_seq_rm   (ctx, 0, params.n_keep + 1, params.n_keep + n_discard + 1);
            llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);

            for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++)
            {
                embd[i - n_discard] = embd[i];
            }
            embd.resize(embd.size() - n_discard);

            n_past -= n_discard;

            truncated = true;
        }

        bool tg = true;
        while (n_past < embd.size())
        {
            int n_eval = (int)embd.size() - n_past;
            tg = n_eval == 1;
            if (n_eval > params.n_batch)
            {
                n_eval = params.n_batch;
            }

            if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0)))
            {
                has_next_token = false;
                return result;
            }
            n_past += n_eval;
        }

        if (params.n_predict == 0)
        {
            has_next_token = false;
            result.tok = llama_token_eos(model);
            return result;
        }

        {
            // out of user input, sample next token
            result.tok = llama_sampling_sample(ctx_sampling, ctx, NULL);

            llama_token_data_array cur_p = { ctx_sampling->cur.data(), ctx_sampling->cur.size(), false };

            const int32_t n_probs = params.sparams.n_probs;
            if (params.sparams.temp <= 0 && n_probs > 0)
            {
                // For llama_sample_token_greedy we need to sort candidates
                llama_sample_softmax(ctx, &cur_p);
            }

            for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
            {
                result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
            }

            llama_sampling_accept(ctx_sampling, ctx, result.tok, true);

            if (tg)
            {
                num_tokens_predicted++;
            }
        }

        // add it to the context
        embd.push_back(result.tok);
        // decrement remaining sampling budget
        --n_remain;

        if (!embd.empty() && embd.back() == llama_token_eos(model))
        {
            // stopping_word = llama_token_to_piece(ctx, embd.back());
            has_next_token = false;
            stopped_eos = true;
            return result;
        }

        has_next_token = params.n_predict == -1 || n_remain != 0;
        return result;
    }
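    // Worked example of the context shift in nextToken() above (illustrative
    // numbers only): with n_ctx = 8, params.n_keep = 2 and n_past = 8, we get
    // n_left = 8 - 2 - 1 = 5 and n_discard = 2. KV cache entries at positions
    // [3, 5) are removed, positions [5, 8) are shifted down by 2, embd[5..7] is
    // copied to embd[3..5], embd is resized to 6 and n_past becomes 6. The first
    // n_keep + 1 tokens are always preserved.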
    size_t findStoppingStrings(const std::string &text, const size_t last_token_size,
                               const stop_type type)
    {
        size_t stop_pos = std::string::npos;
        for (const std::string &word : params.antiprompt)
        {
            size_t pos;
            if (type == STOP_FULL)
            {
                const size_t tmp = word.size() + last_token_size;
                const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0;
                pos = text.find(word, from_pos);
            }
            else
            {
                pos = find_partial_stop_string(word, text);
            }
            if (pos != std::string::npos &&
                (stop_pos == std::string::npos || pos < stop_pos))
            {
                if (type == STOP_FULL)
                {
                    stopping_word = word;
                    stopped_word = true;
                    has_next_token = false;
                }
                stop_pos = pos;
            }
        }
        return stop_pos;
    }

    completion_token_output doCompletion()
    {
        auto token_with_probs = nextToken();

        const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
        generated_text += token_text;

        if (params.sparams.n_probs > 0)
        {
            generated_token_probs.push_back(token_with_probs);
        }

        if (multibyte_pending > 0)
        {
            multibyte_pending -= token_text.size();
        }
        else if (token_text.size() == 1)
        {
            const char c = token_text[0];
            if ((c & 0xE0) == 0xC0)
            {
                // 2-byte characters: 110xxxxx 10xxxxxx
                multibyte_pending = 1;
            }
            else if ((c & 0xF0) == 0xE0)
            {
                // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
                multibyte_pending = 2;
            }
            else if ((c & 0xF8) == 0xF0)
            {
                // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                multibyte_pending = 3;
            }
            else
            {
                multibyte_pending = 0;
            }
        }

        if (multibyte_pending > 0 && !has_next_token)
        {
            has_next_token = true;
            n_remain++;
        }

        if (!has_next_token && n_remain == 0)
        {
            stopped_limit = true;
        }

        return token_with_probs;
    }

    std::vector<float> getEmbedding()
    {
        static const int n_embd = llama_n_embd(model);
        if (!params.embedding)
        {
            return std::vector<float>(n_embd, 0.0f);
        }
        const float *data = llama_get_embeddings(ctx);
        std::vector<float> embedding(data, data + n_embd);
        return embedding;
    }
};
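// Example of the multibyte handling in doCompletion() above (illustration only):
// the euro sign U+20AC is the UTF-8 sequence 0xE2 0x82 0xAC. If the model emits
// it as three single-byte pieces, the first piece 0xE2 matches the 3-byte lead
// pattern 1110xxxx, so multibyte_pending is set to 2 and the two continuation
// bytes decrement it back to 0. While multibyte_pending > 0, the streaming
// handler below holds the partial character back instead of sending invalid UTF-8.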
static void parse_options_completion(bool streaming, const backend::PredictOptions* predict, llama_server_context &llama)
{
    gpt_params default_params;

    llama.stream = streaming;
    llama.params.n_predict = predict->tokens() == 0 ? -1 : predict->tokens();
    llama.params.sparams.top_k = predict->topk();
    llama.params.sparams.top_p = predict->topp();
    llama.params.sparams.tfs_z = predict->tailfreesamplingz();
    llama.params.sparams.typical_p = predict->typicalp();
    llama.params.sparams.penalty_last_n = predict->repeat();
    llama.params.sparams.temp = predict->temperature();
    llama.params.sparams.penalty_repeat = predict->penalty();
    llama.params.sparams.penalty_present = predict->presencepenalty();
    llama.params.sparams.penalty_freq = predict->frequencypenalty();
    llama.params.sparams.mirostat = predict->mirostat();
    llama.params.sparams.mirostat_tau = predict->mirostattau();
    llama.params.sparams.mirostat_eta = predict->mirostateta();
    llama.params.sparams.penalize_nl = predict->penalizenl();
    llama.params.n_keep = predict->nkeep();
    llama.params.seed = predict->seed();
    llama.params.sparams.grammar = predict->grammar();
    // llama.params.n_probs = predict->
    llama.params.prompt = predict->prompt();

    llama.params.sparams.logit_bias.clear();

    if (predict->ignoreeos())
    {
        llama.params.sparams.logit_bias[llama_token_eos(llama.model)] = -INFINITY;
    }

    // const auto &logit_bias = body.find("logit_bias");
    // if (logit_bias != body.end() && logit_bias->is_array())
    // {
    //     const int n_vocab = llama_n_vocab(llama.model);
    //     for (const auto &el : *logit_bias)
    //     {
    //         if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
    //         {
    //             llama_token tok = el[0].get();
    //             if (tok >= 0 && tok < n_vocab)
    //             {
    //                 if (el[1].is_number())
    //                 {
    //                     llama.params.logit_bias[tok] = el[1].get();
    //                 }
    //                 else if (el[1].is_boolean() && !el[1].get())
    //                 {
    //                     llama.params.logit_bias[tok] = -INFINITY;
    //                 }
    //             }
    //         }
    //     }
    // }

    llama.params.antiprompt.clear();
    for (const std::string &stopPrompt : predict->stopprompts())
    {
        if (!stopPrompt.empty())
        {
            llama.params.antiprompt.push_back(stopPrompt);
        }
    }
}

static void params_parse(const backend::ModelOptions* request,
                         gpt_params &params)
{
    params.model = request->modelfile();
    // params.model_alias ??
    params.model_alias = request->modelfile();
    params.n_ctx = request->contextsize();
    params.memory_f16 = request->f16memory();
    params.n_threads = request->threads();
    params.n_gpu_layers = request->ngpulayers();
    params.n_batch = request->nbatch();

    if (!request->tensorsplit().empty())
    {
        std::string arg_next = request->tensorsplit();

        // split string by , and /
        const std::regex regex{ R"([,/]+)" };
        std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
        std::vector<std::string> split_arg{ it, {} };

        GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);

        for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device)
        {
            if (i_device < split_arg.size())
            {
                params.tensor_split[i_device] = std::stof(split_arg[i_device]);
            }
            else
            {
                params.tensor_split[i_device] = 0.0f;
            }
        }
    }

    if (!request->maingpu().empty())
    {
        params.main_gpu = std::stoi(request->maingpu());
    }
    // TODO: lora needs also a scale factor
    // params.lora_adapter = request->loraadapter();
    // params.lora_base = request->lorabase();
    params.use_mlock = request->mlock();
    params.use_mmap = request->mmap();
    params.embedding = request->embeddings();
}

static bool is_at_eob(llama_server_context &server_context, const llama_token *tokens, const size_t n_tokens)
{
    return n_tokens && tokens[n_tokens - 1] == llama_token_eos(server_context.model);
}
// Function matching type llama_beam_search_callback_fn_t.
// Custom callback example is called each time the beam lengths increase:
//  * Show progress by printing ',' followed by number of convergent beam tokens if any.
//  * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
//    This is also called when the stop condition is met.
//    Collect tokens into std::vector<completion_token_output> response which is pointed to by callback_data.
static void beam_search_callback(void *callback_data, llama_beams_state beams_state)
{
    auto &llama = *static_cast<llama_server_context *>(callback_data);
    // Mark beams as EOS as needed.
    for (size_t i = 0; i < beams_state.n_beams; ++i)
    {
        llama_beam_view &beam_view = beams_state.beam_views[i];
        if (!beam_view.eob && is_at_eob(llama, beam_view.tokens, beam_view.n_tokens))
        {
            beam_view.eob = true;
        }
    }
    printf(","); // Show progress
    if (const size_t n = beams_state.common_prefix_length)
    {
        llama.generated_token_probs.resize(llama.generated_token_probs.size() + n);
        assert(0u < beams_state.n_beams);
        const llama_token *tokens = beams_state.beam_views[0].tokens;
        const auto map = [](llama_token tok) { return completion_token_output{{},tok}; };
        std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map);
        printf("%zu", n);
    }
    fflush(stdout);
#if 0 // DEBUG: print current beams for this iteration
    std::cout << "\n\nCurrent beams:\n";
    for (size_t i = 0; i < beams_state.n_beams; ++i) {
        std::cout << "beams[" << i << "]: p=" << beams_state.beam_views[i].p
                  << ", n_tokens=" << beams_state.beam_views[i].n_tokens << std::endl;
    }
#endif
}

// Append the pieces of the accumulated generated_token_probs to generated_text.
static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama)
{
    for (const completion_token_output &cto : llama.generated_token_probs)
    {
        llama.generated_text += llama_token_to_piece(llama.ctx, cto.tok);
    }
}

// gRPC service implementation backed by a single llama_server_context
class BackendServiceImpl final : public backend::Backend::Service {
private:
  llama_server_context llama;

public:
  grpc::Status Health(ServerContext* context, const HealthMessage* request, backend::Reply* reply) {
    // Implement Health RPC
    reply->set_message("OK");
    return Status::OK;
  }

  grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
    // Implement LoadModel RPC
    gpt_params params;
    params_parse(request, params);

    llama_backend_init(params.numa);

    // load the model
    if (!llama.loadModel(params)) {
      result->set_message("Failed loading model");
      result->set_success(false);
      return Status::CANCELLED;
    }
    result->set_message("Loading succeeded");
    result->set_success(true);
    return Status::OK;
  }

  grpc::Status PredictStream(grpc::ServerContext* context, const backend::PredictOptions* request, grpc::ServerWriter<backend::Reply>* writer) override {
    // Implement the streaming logic here based on the request options
    // You can use writer->Write(response) to send a reply to the client
    // and return grpc::Status::OK when the operation is complete.
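    // For illustration, a client would typically consume this stream roughly as
    // follows (sketch only, not part of this server; it assumes the stub generated
    // from backend.proto is backend::Backend::Stub):
    //
    //   grpc::ClientContext cctx;
    //   backend::PredictOptions opts;
    //   opts.set_prompt("Once upon a time");
    //   std::unique_ptr<grpc::ClientReader<backend::Reply>> reader(
    //       stub->PredictStream(&cctx, opts));
    //   backend::Reply chunk;
    //   while (reader->Read(&chunk)) {
    //       std::cout << chunk.message();
    //   }
    //   grpc::Status status = reader->Finish();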
    auto lock = llama.lock();

    llama.rewind();

    llama_reset_timings(llama.ctx);

    parse_options_completion(false, request, llama);

    llama.initSampling();
    llama.loadPrompt(request->prompt());
    llama.beginCompletion();

    size_t sent_count = 0;
    size_t sent_token_probs_index = 0;

    while (llama.has_next_token) {
        const completion_token_output token_with_probs = llama.doCompletion();
        if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) {
            continue;
        }
        const std::string token_text = llama_token_to_piece(llama.ctx, token_with_probs.tok);

        size_t pos = std::min(sent_count, llama.generated_text.size());

        const std::string str_test = llama.generated_text.substr(pos);
        bool is_stop_full = false;
        size_t stop_pos =
            llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL);
        if (stop_pos != std::string::npos) {
            is_stop_full = true;
            llama.generated_text.erase(
                llama.generated_text.begin() + pos + stop_pos,
                llama.generated_text.end());
            pos = std::min(sent_count, llama.generated_text.size());
        } else {
            is_stop_full = false;
            stop_pos = llama.findStoppingStrings(str_test, token_text.size(), STOP_PARTIAL);
        }

        if (
            stop_pos == std::string::npos ||
            // Send rest of the text if we are at the end of the generation
            (!llama.has_next_token && !is_stop_full && stop_pos > 0)
        ) {
            const std::string to_send = llama.generated_text.substr(pos, std::string::npos);

            sent_count += to_send.size();

            std::vector<completion_token_output> probs_output = {};

            if (llama.params.sparams.n_probs > 0) {
                const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
                size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
                size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
                if (probs_pos < probs_stop_pos) {
                    probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos);
                }
                sent_token_probs_index = probs_stop_pos;
            }

            backend::Reply reply;
            reply.set_message(to_send);

            // Send the reply
            writer->Write(reply);
        }
    }

    llama_print_timings(llama.ctx);

    llama.mutex.unlock();
    lock.release();
    return grpc::Status::OK;
  }

  grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) {
    auto lock = llama.lock();

    llama.rewind();

    llama_reset_timings(llama.ctx);

    parse_options_completion(false, request, llama);

    llama.initSampling();
    llama.loadPrompt(request->prompt());
    llama.beginCompletion();

    if (llama.params.n_beams) {
        // Fill llama.generated_token_probs vector with final beam.
        llama_beam_search(llama.ctx, beam_search_callback, &llama, llama.params.n_beams,
                          llama.n_past, llama.n_remain);
        // Translate llama.generated_token_probs to llama.generated_text.
        append_to_generated_text_from_generated_token_probs(llama);
    } else {
        size_t stop_pos = std::string::npos;

        while (llama.has_next_token) {
            const completion_token_output token_with_probs = llama.doCompletion();
            const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(llama.ctx, token_with_probs.tok);

            stop_pos = llama.findStoppingStrings(llama.generated_text,
                                                 token_text.size(), STOP_FULL);
        }

        if (stop_pos == std::string::npos) {
            stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
        }
        if (stop_pos != std::string::npos) {
            llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
                                       llama.generated_text.end());
        }
    }

    auto probs = llama.generated_token_probs;
    if (llama.params.sparams.n_probs > 0 && llama.stopped_word) {
        const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
        probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
    }
    reply->set_message(llama.generated_text);

    return grpc::Status::OK;
  }
};

void RunServer(const std::string& server_address) {
  BackendServiceImpl service;

  ServerBuilder builder;
  builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
  builder.RegisterService(&service);

  std::unique_ptr<Server> server(builder.BuildAndStart());
  std::cout << "Server listening on " << server_address << std::endl;
  server->Wait();
}

int main(int argc, char** argv) {
  std::string server_address("localhost:50051");

  // Define long and short options
  struct option long_options[] = {
      {"addr", required_argument, nullptr, 'a'},
      {nullptr, 0, nullptr, 0}
  };

  // Parse command-line arguments
  int option;
  int option_index = 0;
  while ((option = getopt_long(argc, argv, "a:", long_options, &option_index)) != -1) {
    switch (option) {
      case 'a':
        server_address = optarg;
        break;
      default:
        std::cerr << "Usage: " << argv[0] << " [--addr=<address>] or [-a <address>]" << std::endl;
        return 1;
    }
  }

  RunServer(server_address);
  return 0;
}
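
#if 0
// Usage sketch (not compiled): a minimal blocking client for this server.
// It assumes the service generated from backend.proto is backend::Backend and
// that the message setters/getters mirror the accessors used above
// (set_modelfile(), set_prompt(), message(), success()); adjust to the actual
// generated API if it differs. The model path below is a placeholder.
int client_example() {
  auto channel = grpc::CreateChannel("localhost:50051", grpc::InsecureChannelCredentials());
  std::unique_ptr<backend::Backend::Stub> stub = backend::Backend::NewStub(channel);

  // Load a model first.
  backend::ModelOptions model_opts;
  model_opts.set_modelfile("/path/to/model.gguf");
  backend::Result load_result;
  grpc::ClientContext load_ctx;
  grpc::Status status = stub->LoadModel(&load_ctx, model_opts, &load_result);
  if (!status.ok() || !load_result.success()) {
    return 1;
  }

  // Run a non-streaming completion.
  backend::PredictOptions predict_opts;
  predict_opts.set_prompt("Hello");
  backend::Reply reply;
  grpc::ClientContext predict_ctx;
  status = stub->Predict(&predict_ctx, predict_opts, &reply);
  if (status.ok()) {
    std::cout << reply.message() << std::endl;
  }
  return status.ok() ? 0 : 1;
}
#endif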