diff --git a/Dockerfile b/Dockerfile
index f90950d9..8fe5f8b6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -68,8 +68,7 @@ RUN curl -L "https://github.com/gabime/spdlog/archive/refs/tags/v${SPDLOG_VERSIO
     cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /usr/lib/ && \
     ln -s /usr/lib/libpiper_phonemize.so /usr/lib/libpiper_phonemize.so.1 && \
     cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/include/. /usr/include/ && \
-    rm spdlog-${SPDLOG_VERSION} -rf && \
-    rm /build/lib/Linux-$(uname -m)/piper_phonemize -rf
+    rm spdlog-${SPDLOG_VERSION} -rf

 # Extras requirements
 FROM requirements-core as requirements-extras
diff --git a/Makefile b/Makefile
index 89fdb42d..2d5fd0fb 100644
--- a/Makefile
+++ b/Makefile
@@ -8,7 +8,7 @@
 GOLLAMA_VERSION?=1676dcd7a139b6cdfbaea5fd67f46dc25d9d8bcf

 GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7

-CPPLLAMA_VERSION?=24ba3d829e31a6eda3fa1723f692608c2fa3adda
+CPPLLAMA_VERSION?=465219b9143ac01db0990bbcb0a081ef72ec2008

 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -129,7 +129,13 @@
 ifeq ($(findstring tts,$(GO_TAGS)),tts)
 	OPTIONAL_GRPC+=backend-assets/grpc/piper
 endif

-GRPC_BACKENDS?=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/llama-cpp backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/llama-cpp backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+GRPC_BACKENDS?=$(ALL_GRPC_BACKENDS) $(OPTIONAL_GRPC)
+
+# If empty, then we build all
+ifeq ($(GRPC_BACKENDS),)
+	GRPC_BACKENDS=$(ALL_GRPC_BACKENDS)
+endif

 .PHONY: all test build vendor
diff --git a/backend/cpp/llama/Makefile b/backend/cpp/llama/Makefile
index 77851eaa..fb64c556 100644
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -1,5 +1,5 @@
-LLAMA_VERSION?=24ba3d829e31a6eda3fa1723f692608c2fa3adda
+LLAMA_VERSION?=

 CMAKE_ARGS?=
 BUILD_TYPE?=
diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp
index 16b63aff..50d59ab1 100644
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -88,6 +88,7 @@ static size_t find_partial_stop_string(const std::string &stop,
     return std::string::npos;
 }

+
 template <class Iter>
 static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
 {
@@ -128,17 +129,15 @@ struct llama_server_context
     size_t n_past = 0;
     size_t n_remain = 0;

-    // json prompt;
     std::vector<llama_token> embd;
-    std::vector<llama_token> last_n_tokens;
+
+    gpt_params params;

     llama_model *model = nullptr;
     llama_context *ctx = nullptr;
-    gpt_params params;
-    int n_ctx;
+    llama_sampling_context *ctx_sampling = nullptr;

-    grammar_parser::parse_state parsed_grammar;
-    llama_grammar *grammar = nullptr;
+    int n_ctx;

     bool truncated = false;
     bool stopped_eos = false;
@@ -171,7 +170,7 @@ struct llama_server_context
     void rewind()
     {
         params.antiprompt.clear();
-        params.grammar.clear();
+        params.sparams.grammar.clear();
         num_prompt_tokens = 0;
         num_tokens_predicted = 0;
         generated_text = "";
@@ -185,100 +184,87 @@ struct llama_server_context
         multibyte_pending = 0;
         n_remain = 0;
         n_past = 0;
+        params.sparams.n_prev = n_ctx;
+    }

-        if (grammar != nullptr) {
-            llama_grammar_free(grammar);
-            grammar = nullptr;
+    void initSampling() {
+        if (ctx_sampling != nullptr) {
+            llama_sampling_free(ctx_sampling);
         }
+        ctx_sampling = llama_sampling_init(params.sparams);
     }

     bool loadModel(const gpt_params &params_)
     {
-        printf("load model %s\n", params_.model.c_str());
-
         params = params_;

         std::tie(model, ctx) = llama_init_from_gpt_params(params);
         if (model == nullptr)
         {
-            printf("unable to load model %s\n", params_.model.c_str());
             return false;
         }

         n_ctx = llama_n_ctx(ctx);
-        last_n_tokens.resize(n_ctx);
-        std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
         return true;
     }

-
-    std::vector<llama_token> tokenize_array(const char **prompts, bool add_bos) const
-    {
-        // If `add_bos` is true, we only add BOS, when json_prompt is a string,
+    std::vector<llama_token> tokenize_string(const char *prompt, bool add_bos) const {
+        // If `add_bos` is true, we only add BOS, when json_prompt is a string,
         // or the first element of the json_prompt array is a string.
-        std::vector<llama_token> prompt_tokens;
-
-
-        bool first = true;
-        // Iterate over prompts
-        for (const char **p = prompts; *p != nullptr; ++p)
-        {
-            auto s = std::string(*p);
-            std::vector<llama_token> pp;
-            if (first)
-            {
-                pp = ::llama_tokenize(ctx, s, add_bos);
-                first = false;
-            }
-            else
-            {
-                pp = ::llama_tokenize(ctx, s, false);
-            }
-            prompt_tokens.insert(prompt_tokens.end(), pp.begin(), pp.end());
-        }
-
-
-        return prompt_tokens;
-    }
-
-    std::vector<llama_token> tokenize_string(const char *prompt, bool add_bos) const
-    {
-        // If `add_bos` is true, we only add BOS, when json_prompt is a string,
-        // or the first element of the json_prompt array is a string.
-        std::vector<llama_token> prompt_tokens;
-
+        std::vector<llama_token> prompt_tokens;
         auto s = std::string(prompt);
         prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
-
         return prompt_tokens;
     }

+    std::vector<llama_token> tokenize_array(const char **prompts, bool add_bos) const {
+        std::vector<llama_token> prompt_tokens;

-    bool loadGrammar()
-    {
-        if (!params.grammar.empty()) {
-            parsed_grammar = grammar_parser::parse(params.grammar.c_str());
-            // will be empty (default) if there are parse errors
-            if (parsed_grammar.rules.empty()) {
-                printf("grammar parse error");
-                return false;
-            }
-            grammar_parser::print_grammar(stderr, parsed_grammar);
-
-            {
-                auto it = params.logit_bias.find(llama_token_eos(ctx));
-                if (it != params.logit_bias.end() && it->second == -INFINITY) {
-                    printf("EOS token is disabled, which will cause most grammars to fail");
+        bool first = true;
+        bool is_string = true;
+        for (const char **p = prompts; *p != nullptr; ++p)
+        {
+            if (is_string)
+            {
+                auto s = std::string(*p);
+                std::vector<llama_token> p;
+                if (first)
+                {
+                    p = ::llama_tokenize(ctx, s, add_bos);
+                    first = false;
+                }
+                else
+                {
+                    p = ::llama_tokenize(ctx, s, false);
+                }
+                prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
+            }
+            else
+            {
+                if (first)
+                {
+                    first = false;
+                }
+                //prompt_tokens.push_back(p.template get<llama_token>());
             }
         }
+        return prompt_tokens;
+    }

-            std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
-            grammar = llama_grammar_init(
-                grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
-        }
-        return true;
+    void truncatePrompt(std::vector<llama_token> &prompt_tokens) {
+        const int n_left = n_ctx - params.n_keep;
+        const int n_block_size = n_left / 2;
+        const int erased_blocks = (prompt_tokens.size() - params.n_keep - n_block_size) / n_block_size;
+
+        // Keep n_keep tokens at start of prompt (at most n_ctx - 4)
+        std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
+
+        new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
+
+        truncated = true;
+        prompt_tokens = new_tokens;
     }

     void loadInfill()
     {
         bool suff_rm_leading_spc = true;
-        if (params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
+        if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
             params.input_suffix.erase(0, 1);
             suff_rm_leading_spc = false;
         }
@@ -294,6 +280,7 @@ struct llama_server_context
         prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
         prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
         prefix_tokens.push_back(llama_token_middle(ctx));
+
         auto prompt_tokens = prefix_tokens;

         num_prompt_tokens = prompt_tokens.size();
@@ -305,29 +292,24 @@ struct llama_server_context
         params.n_keep = std::min(params.n_ctx - 4, params.n_keep);

         // if input prompt is too big, truncate like normal
-        if (num_prompt_tokens >= (size_t)params.n_ctx)
+        if (num_prompt_tokens >= (size_t) n_ctx)
         {
-            printf("Input prompt is too big, truncating. Can only take %d tokens but got %zu\n", params.n_ctx, num_prompt_tokens);
-            // todo we probably want to cut from both sides
-            const int n_left = (params.n_ctx - params.n_keep) / 2;
-            std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
-            const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
-            new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
-            std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin());
+            truncatePrompt(prompt_tokens);
+            num_prompt_tokens = prompt_tokens.size();

-            truncated = true;
-            prompt_tokens = new_tokens;
+            GGML_ASSERT(num_prompt_tokens < (size_t)n_ctx);
         }
-        else
+
+        // push the prompt into the sampling context (do not apply grammar)
+        for (auto & token : prompt_tokens)
         {
-            const size_t ps = num_prompt_tokens;
-            std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0);
-            std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps);
+            llama_sampling_accept(ctx_sampling, ctx, token, false);
         }

         // compare the evaluated prompt with the new prompt
         n_past = common_part(embd, prompt_tokens);
         embd = prompt_tokens;
+
         if (n_past == num_prompt_tokens)
         {
             // we have to evaluate at least 1 token to generate logits.
@@ -335,6 +317,7 @@ struct llama_server_context
             n_past--;
         }
+
         // since #3228 we now have to manually manage the KV cache
         llama_kv_cache_seq_rm(ctx, 0, n_past, -1);

         has_next_token = true;
@@ -352,38 +335,33 @@ struct llama_server_context
         params.n_keep = std::min(n_ctx - 4, params.n_keep);

         // if input prompt is too big, truncate like normal
-        if (num_prompt_tokens >= (size_t)n_ctx)
+        if (num_prompt_tokens >= (size_t) n_ctx)
        {
-            const int n_left = (n_ctx - params.n_keep) / 2;
-            std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
-            const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
-            new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
-            std::copy(prompt_tokens.end() - n_ctx, prompt_tokens.end(), last_n_tokens.begin());
+            truncatePrompt(prompt_tokens);
+            num_prompt_tokens = prompt_tokens.size();
-
-            truncated = true;
-            prompt_tokens = new_tokens;
+            GGML_ASSERT(num_prompt_tokens < (size_t)n_ctx);
         }
-        else
+
+        // push the prompt into the sampling context (do not apply grammar)
+        for (auto & token : prompt_tokens)
         {
-            const size_t ps = num_prompt_tokens;
-            std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0);
-            std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps);
+            llama_sampling_accept(ctx_sampling, ctx, token, false);
         }

         // compare the evaluated prompt with the new prompt
         n_past = common_part(embd, prompt_tokens);
-
         embd = prompt_tokens;
         if (n_past == num_prompt_tokens)
         {
             // we have to evaluate at least 1 token to generate logits.
             n_past--;
         }

-        // since #3228 we now have to manually manage the KV cache
+        // since #3228 we now have to manually manage the KV cache
         llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
         has_next_token = true;
     }
@@ -418,7 +396,6 @@ struct llama_server_context
             n_past -= n_discard;

             truncated = true;
-
         }

         bool tg = true;
@@ -433,7 +410,6 @@ struct llama_server_context

             if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0)))
             {
-
                 has_next_token = false;
                 return result;
             }
@@ -449,27 +425,24 @@ struct llama_server_context
         {
             // out of user input, sample next token
-            std::vector<llama_token_data> candidates;
-            candidates.reserve(llama_n_vocab(model));
+            result.tok = llama_sampling_sample(ctx_sampling, ctx, NULL);

-            result.tok = llama_sample_token(ctx, NULL, grammar, params, last_n_tokens, candidates);
+            llama_token_data_array cur_p = { ctx_sampling->cur.data(), ctx_sampling->cur.size(), false };

-            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-            const int32_t n_probs = params.n_probs;
-            if (params.temp <= 0 && n_probs > 0)
+            const int32_t n_probs = params.sparams.n_probs;
+            if (params.sparams.temp <= 0 && n_probs > 0)
             {
                 // For llama_sample_token_greedy we need to sort candidates
-                llama_sample_softmax(ctx, &candidates_p);
+                llama_sample_softmax(ctx, &cur_p);
             }

-            for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
+            for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
             {
-                result.probs.push_back({candidates_p.data[i].id, candidates_p.data[i].p});
+                result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
             }

-            last_n_tokens.erase(last_n_tokens.begin());
-            last_n_tokens.push_back(result.tok);
+            llama_sampling_accept(ctx_sampling, ctx, result.tok, true);
+
             if (tg) {
                 num_tokens_predicted++;
             }
@@ -531,7 +504,7 @@ struct llama_server_context
         const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
         generated_text += token_text;

-        if (params.n_probs > 0)
+        if (params.sparams.n_probs > 0)
         {
             generated_token_probs.push_back(token_with_probs);
         }
@@ -583,7 +556,6 @@ struct llama_server_context
         static const int n_embd = llama_n_embd(model);
         if (!params.embedding)
         {
-            printf("embedding disabled");
             return std::vector<float>(n_embd, 0.0f);
         }
         const float *data = llama_get_embeddings(ctx);
@@ -599,30 +571,30 @@ static void parse_options_completion(bool streaming,const backend::PredictOption
     llama.stream = streaming;
     llama.params.n_predict = predict->tokens() == 0 ? -1 : predict->tokens();
-    llama.params.top_k = predict->topk();
-    llama.params.top_p = predict->topp();
-    llama.params.tfs_z = predict->tailfreesamplingz();
-    llama.params.typical_p = predict->typicalp();
-    llama.params.repeat_last_n = predict->repeat();
-    llama.params.temp = predict->temperature();
-    llama.params.repeat_penalty = predict->penalty();
-    llama.params.presence_penalty = predict->presencepenalty();
-    llama.params.frequency_penalty = predict->frequencypenalty();
-    llama.params.mirostat = predict->mirostat();
-    llama.params.mirostat_tau = predict->mirostattau();
-    llama.params.mirostat_eta = predict->mirostateta();
-    llama.params.penalize_nl = predict->penalizenl();
+    llama.params.sparams.top_k = predict->topk();
+    llama.params.sparams.top_p = predict->topp();
+    llama.params.sparams.tfs_z = predict->tailfreesamplingz();
+    llama.params.sparams.typical_p = predict->typicalp();
+    llama.params.sparams.penalty_last_n = predict->repeat();
+    llama.params.sparams.temp = predict->temperature();
+    llama.params.sparams.penalty_repeat = predict->penalty();
+    llama.params.sparams.penalty_present = predict->presencepenalty();
+    llama.params.sparams.penalty_freq = predict->frequencypenalty();
+    llama.params.sparams.mirostat = predict->mirostat();
+    llama.params.sparams.mirostat_tau = predict->mirostattau();
+    llama.params.sparams.mirostat_eta = predict->mirostateta();
+    llama.params.sparams.penalize_nl = predict->penalizenl();
     llama.params.n_keep = predict->nkeep();
     llama.params.seed = predict->seed();
-    llama.params.grammar = predict->grammar();
+    llama.params.sparams.grammar = predict->grammar();
     // llama.params.n_probs = predict->
     llama.params.prompt = predict->prompt();

-    llama.params.logit_bias.clear();
+    llama.params.sparams.logit_bias.clear();

     if (predict->ignoreeos())
     {
-        llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
+        llama.params.sparams.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
     }

     // const auto &logit_bias = body.find("logit_bias");
@@ -801,12 +773,7 @@ public:

         parse_options_completion(false, request, llama);

-        if (!llama.loadGrammar())
-        {
-            //res.status = 400;
-            return Status::CANCELLED;
-        }
-
+        llama.initSampling();
         llama.loadPrompt(request->prompt());
         llama.beginCompletion();
         size_t sent_count = 0;
@@ -848,7 +815,7 @@ public:

                 std::vector<completion_token_output> probs_output = {};

-                if (llama.params.n_probs > 0) {
+                if (llama.params.sparams.n_probs > 0) {
                     const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
                     size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
                     size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
@@ -879,12 +846,7 @@ public:
         llama_reset_timings(llama.ctx);

         parse_options_completion(false, request, llama);
-        if (!llama.loadGrammar())
-        {
-            //res.status = 400;
-            return Status::CANCELLED;
-        }
-
+        llama.initSampling();
         llama.loadPrompt(request->prompt());
         llama.beginCompletion();

@@ -915,7 +877,7 @@ public:
         }

         auto probs = llama.generated_token_probs;
-        if (llama.params.n_probs > 0 && llama.stopped_word) {
+        if (llama.params.sparams.n_probs > 0 && llama.stopped_word) {
             const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
             probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
         }
diff --git a/examples/bruno/LocalAI Test Requests/image generation/Generate image.bru b/examples/bruno/LocalAI Test Requests/image generation/Generate image.bru
new file mode 100644
index 00000000..37d350ca
--- /dev/null
+++ b/examples/bruno/LocalAI Test Requests/image generation/Generate image.bru
@@ -0,0 +1,25 @@
+meta {
+  name: Generate image
+  type: http
+  seq: 1
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/images/generations
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+    "prompt": "|",
+    "model": "model-name",
+    "step": 51,
+    "size": "1024x1024",
+    "image": ""
+  }
+}
diff --git a/examples/bruno/LocalAI Test Requests/llm text/chat/chat completion -simple- 1 message-.bru b/examples/bruno/LocalAI Test Requests/llm text/chat/chat completion -simple- 1 message-.bru
index cf51bb7c..fd08aeee 100644
--- a/examples/bruno/LocalAI Test Requests/llm text/chat/chat completion -simple- 1 message-.bru
+++ b/examples/bruno/LocalAI Test Requests/llm text/chat/chat completion -simple- 1 message-.bru
@@ -15,10 +15,16 @@ headers {
 }

 body:json {
-  {
-    "model": "{{DEFAULT_MODEL}}",
-    "messages": [{"role": "user", "content": "How could one use friction to cook an egg?"}],
-    "max_tokens": 256,
-    "temperature": 0.2
+  {
+    "model": "{{DEFAULT_MODEL}}",
+    "messages": [
+      {
+        "role": "user",
+        "content": "How could one use friction to cook an egg?"
+      }
+    ],
+    "max_tokens": 256,
+    "temperature": 0.2,
+    "grammar": ""
   }
 }