feat: llama.cpp gRPC C++ backend (#1170)

* wip: llama.cpp c++ gRPC server Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * make it work, attach it to the build process Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * update deps Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix: add protobuf dep Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * try fix protobuf on cmake * cmake: workarounds Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * add packages * cmake: use fixed version of grpc Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * cmake(grpc): install locally * install grpc Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * install required deps for grpc on debian bullseye Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * debug * debug * Fixups * no need to install cmake manually Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * ci: fixup macOS * use brew whenever possible Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * macOS fixups * debug * fix container build Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * workaround * try mac https://stackoverflow.com/questions/23905661/on-mac-g-clang-fails-to-search-usr-local-include-and-usr-local-lib-by-def * Disable temp. arm64 docker image builds --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-06 17:21:36 +00:00 · 2023-10-16 21:46:29 +02:00 · 2023-10-16 21:46:29 +02:00 · 128694213f
commit 128694213f
parent 8034ed3473
10 changed files with 1145 additions and 16 deletions
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@ -12,6 +12,9 @@ jobs:
          - repository: "go-skynet/go-llama.cpp"
            variable: "GOLLAMA_VERSION"
            branch: "master"
+          - repository: "ggerganov/llama.cpp"
+            variable: "CPPLLAMA_VERSION"
+            branch: "master"
          - repository: "go-skynet/go-ggml-transformers.cpp"
            variable: "GOGGMLTRANSFORMERS_VERSION"
            branch: "master"
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@ -19,7 +19,8 @@ jobs:
      matrix:
        include:
          - build-type: ''
-            platforms: 'linux/amd64,linux/arm64'
+            #platforms: 'linux/amd64,linux/arm64'
+            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: ''
            ffmpeg: ''
@ -38,7 +39,7 @@ jobs:
            tag-suffix: '-cublas-cuda12'
            ffmpeg: ''
          - build-type: ''
-            platforms: 'linux/amd64,linux/arm64'
+            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-ffmpeg'
            ffmpeg: 'true'
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@ -29,6 +29,12 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
+
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+                -DgRPC_BUILD_TESTS=OFF \
+                ../.. && sudo make -j12 install
+
      - name: Build
        id: build
        env:
@ -66,12 +72,20 @@ jobs:
      - uses: actions/setup-go@v4
        with:
          go-version: '>=1.21.0'
+      - name: Dependencies
+        run: |
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+                -DgRPC_BUILD_TESTS=OFF \
+                ../.. && make -j12 install && rm -rf grpc
      - name: Build
        id: build
        env:
          CMAKE_ARGS: "${{ matrix.defines }}"
          BUILD_ID: "${{ matrix.build }}"
        run: |
+          export C_INCLUDE_PATH=/usr/local/include
+          export CPLUS_INCLUDE_PATH=/usr/local/include
          make dist
      - uses: actions/upload-artifact@v3
        with:
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -72,6 +72,10 @@ jobs:
          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
          sudo pip install -r extra/requirements.txt

+
+          # Pre-build stable diffusion before we install a newever version of abseil (not compatible with stablediffusion-ncn)
+          GO_TAGS="tts stablediffusion" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
+
          sudo mkdir /build && sudo chmod -R 777 /build && cd /build && \
          curl -L "https://github.com/gabime/spdlog/archive/refs/tags/v1.11.0.tar.gz" | \
          tar -xzvf - && \
@ -87,6 +91,12 @@ jobs:
          sudo cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /usr/lib/ && \
          sudo ln -s /usr/lib/libpiper_phonemize.so /usr/lib/libpiper_phonemize.so.1 && \
          sudo cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/include/. /usr/include/
+
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+                -DgRPC_BUILD_TESTS=OFF \
+                ../.. && sudo make -j12 install
+
      - name: Test
        run: |
          ESPEAK_DATA="/build/lib/Linux-$(uname -m)/piper_phonemize/lib/espeak-ng-data" GO_TAGS="tts stablediffusion" make test
@ -108,6 +118,14 @@ jobs:
      # You can test your matrix by printing the current Go version
      - name: Display Go version
        run: go version
+      - name: Dependencies
+        run: |
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+                -DgRPC_BUILD_TESTS=OFF \
+                ../.. && make -j12 install && rm -rf grpc
      - name: Test
        run: |
+          export C_INCLUDE_PATH=/usr/local/include
+          export CPLUS_INCLUDE_PATH=/usr/local/include
          CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make test
--- a/17
+++ b/17
@ -16,7 +16,8 @@ ENV GALLERIES='[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/i
 ARG GO_TAGS="stablediffusion tts"

 RUN apt-get update && \
-    apt-get install -y ca-certificates cmake curl patch pip
+    apt-get install -y ca-certificates curl patch pip cmake
+

 # Use the variables in subsequent instructions
 RUN echo "Target Architecture: $TARGETARCH"
@ -104,6 +105,15 @@ RUN make prepare
 COPY . .
 COPY .git .

+# stablediffusion does not tolerate a newer version of abseil, build it first
+RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
+
+RUN git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+    cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+      -DgRPC_BUILD_TESTS=OFF \
+       ../.. && make -j12 install && rm -rf grpc
+
+# Rebuild with defaults backends
 RUN ESPEAK_DATA=/build/lib/Linux-$(uname -m)/piper_phonemize/lib/espeak-ng-data make build

 ###################################
@ -132,8 +142,13 @@ WORKDIR /build
 # https://github.com/go-skynet/LocalAI/pull/434
 COPY . .
 RUN make prepare-sources
+
+# Copy the binary
 COPY --from=builder /build/local-ai ./

+# do not let piper rebuild (requires an older version of absl)
+COPY --from=builder /build/backend-assets/grpc/piper ./backend-assets/grpc/piper
+
 # Copy VALLE-X as it's not a real "lib"
 RUN cp -rfv /usr/lib/vall-e-x/* ./

--- a/24
+++ b/24
@ -8,6 +8,8 @@ GOLLAMA_VERSION?=1676dcd7a139b6cdfbaea5fd67f46dc25d9d8bcf

 GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7

+CPPLLAMA_VERSION?=24ba3d829e31a6eda3fa1723f692608c2fa3adda
+
 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
 GPT4ALL_VERSION?=27a8b020c36b0df8f8b82a252d261cda47cf44b8
@ -120,7 +122,7 @@ ifeq ($(findstring tts,$(GO_TAGS)),tts)
 	OPTIONAL_GRPC+=backend-assets/grpc/piper
 endif

-GRPC_BACKENDS?=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+GRPC_BACKENDS?=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/llama-cpp backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)

 .PHONY: all test build vendor

@ -223,7 +225,7 @@ go-llama/libbinding.a: go-llama
 go-llama-stable/libbinding.a: go-llama-stable
 	$(MAKE) -C go-llama-stable BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a

-go-piper/libpiper_binding.a:
+go-piper/libpiper_binding.a: go-piper
 	$(MAKE) -C go-piper libpiper_binding.a example/main

 get-sources: go-llama go-llama-stable go-ggllm go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion
@ -280,6 +282,7 @@ clean: ## Remove build related file
 	rm -rf ./go-ggllm
 	rm -rf $(BINARY_NAME)
 	rm -rf release/
+	$(MAKE) -C backend/cpp/llama clean

 ## Build:

@ -395,6 +398,16 @@ ifeq ($(BUILD_TYPE),metal)
 	cp go-llama/build/bin/ggml-metal.metal backend-assets/grpc/
 endif

+backend/cpp/llama/grpc-server:
+	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
+
+backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/grpc-server
+	cp -rfv backend/cpp/llama/grpc-server backend-assets/grpc/llama-cpp
+# TODO: every binary should have its own folder instead, so can have different metal implementations
+ifeq ($(BUILD_TYPE),metal)
+	cp backend/cpp/llama/llama.cpp/build/bin/ggml-metal.metal backend-assets/grpc/
+endif
+
 backend-assets/grpc/llama-stable: backend-assets/grpc go-llama-stable/libbinding.a
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama-stable
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama-stable LIBRARY_PATH=$(shell pwd)/go-llama \
@ -451,9 +464,12 @@ backend-assets/grpc/bert-embeddings: backend-assets/grpc go-bert/libgobert.a
 backend-assets/grpc/langchain-huggingface: backend-assets/grpc
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/langchain-huggingface ./cmd/grpc/langchain-huggingface/

-backend-assets/grpc/stablediffusion: backend-assets/grpc go-stable-diffusion/libstablediffusion.a
+backend-assets/grpc/stablediffusion: backend-assets/grpc
+	if [ ! -f backend-assets/grpc/stablediffusion ]; then \
+		$(MAKE) go-stable-diffusion/libstablediffusion.a; \
 		CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-stable-diffusion/ LIBRARY_PATH=$(shell pwd)/go-stable-diffusion/ \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./cmd/grpc/stablediffusion/
+		$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./cmd/grpc/stablediffusion/; \
+	fi

 backend-assets/grpc/piper: backend-assets/grpc backend-assets/espeak-ng-data go-piper/libpiper_binding.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(shell pwd)/go-piper \
--- a/backend/cpp/llama/CMakeLists.txt
+++ b/backend/cpp/llama/CMakeLists.txt
@ -0,0 +1,57 @@
+set(CMAKE_CXX_STANDARD 17)
+cmake_minimum_required(VERSION 3.15)
+set(TARGET grpc-server)
+set(_PROTOBUF_LIBPROTOBUF libprotobuf)
+set(_REFLECTION grpc++_reflection)
+
+find_package(absl CONFIG REQUIRED)
+find_package(Protobuf CONFIG REQUIRED)
+find_package(gRPC CONFIG REQUIRED)
+
+find_program(_PROTOBUF_PROTOC protoc)
+set(_GRPC_GRPCPP grpc++)
+find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${Protobuf_INCLUDE_DIRS})
+
+message(STATUS "Using protobuf ${Protobuf_VERSION} ${Protobuf_INCLUDE_DIRS} ${CMAKE_CURRENT_BINARY_DIR}")
+
+
+# Proto file
+get_filename_component(hw_proto "../../../../../../pkg/grpc/proto/backend.proto" ABSOLUTE)
+get_filename_component(hw_proto_path "${hw_proto}" PATH)
+
+# Generated sources
+set(hw_proto_srcs "${CMAKE_CURRENT_BINARY_DIR}/backend.pb.cc")
+set(hw_proto_hdrs "${CMAKE_CURRENT_BINARY_DIR}/backend.pb.h")
+set(hw_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/backend.grpc.pb.cc")
+set(hw_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/backend.grpc.pb.h")
+
+add_custom_command(
+      OUTPUT "${hw_proto_srcs}" "${hw_proto_hdrs}" "${hw_grpc_srcs}" "${hw_grpc_hdrs}"
+      COMMAND ${_PROTOBUF_PROTOC}
+      ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}"
+        --cpp_out "${CMAKE_CURRENT_BINARY_DIR}"
+        -I "${hw_proto_path}"
+        --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}"
+        "${hw_proto}"
+      DEPENDS "${hw_proto}")
+
+# hw_grpc_proto
+add_library(hw_grpc_proto
+  ${hw_grpc_srcs}
+  ${hw_grpc_hdrs}
+  ${hw_proto_srcs}
+  ${hw_proto_hdrs})
+
+add_executable(${TARGET} grpc-server.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
+  absl::flags_parse
+  gRPC::${_REFLECTION}
+  gRPC::${_GRPC_GRPCPP}
+  protobuf::${_PROTOBUF_LIBPROTOBUF})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@ -0,0 +1,44 @@
+
+LLAMA_VERSION?=24ba3d829e31a6eda3fa1723f692608c2fa3adda
+
+CMAKE_ARGS?=
+BUILD_TYPE?=
+
+# If build type is cublas, then we set -DLLAMA_CUBLAS=ON to CMAKE_ARGS automatically
+ifeq ($(BUILD_TYPE),cublas)
+	CMAKE_ARGS+=-DLLAMA_CUBLAS=ON
+# If build type is openblas then we set -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+# to CMAKE_ARGS automatically
+else ifeq ($(BUILD_TYPE),openblas)
+	CMAKE_ARGS+=-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+# If build type is clblast (openCL) we set -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
+else ifeq ($(BUILD_TYPE),clblast)
+	CMAKE_ARGS+=-DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
+# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ 
+else ifeq ($(BUILD_TYPE),hipblas)
+	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON
+endif
+
+llama.cpp:
+	git clone --recurse-submodules https://github.com/ggerganov/llama.cpp llama.cpp
+	cd llama.cpp && git checkout -b build $(LLAMA_VERSION) && git submodule update --init --recursive --depth 1
+
+llama.cpp/examples/grpc-server:
+	mkdir -p llama.cpp/examples/grpc-server
+	cp -r $(abspath ./)/CMakeLists.txt llama.cpp/examples/grpc-server/
+	cp -r $(abspath ./)/grpc-server.cpp llama.cpp/examples/grpc-server/
+	echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
+
+rebuild:
+	cp -rfv $(abspath ./)/CMakeLists.txt llama.cpp/examples/grpc-server/
+	cp -rfv $(abspath ./)/grpc-server.cpp llama.cpp/examples/grpc-server/
+	rm -rf grpc-server
+	$(MAKE) grpc-server
+
+clean:
+	rm -rf llama.cpp
+	rm -rf grpc-server
+
+grpc-server: llama.cpp llama.cpp/examples/grpc-server
+	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release
+	cp llama.cpp/build/bin/grpc-server .
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@ -0,0 +1,964 @@
+// llama.cpp gRPC C++ backend server
+//
+// Ettore Di Giacinto <mudler@localai.io>
+//
+// This is a gRPC server for llama.cpp compatible with the LocalAI proto
+// Note: this is a re-adaptation of the original llama.cpp example/server.cpp for HTTP, 
+// but modified to work with gRPC
+//
+
+#include <iostream>
+#include <memory>
+#include <string>
+#include <getopt.h>
+
+#include "common.h"
+#include "llama.h"
+#include "grammar-parser.h"
+#include "backend.pb.h"
+#include "backend.grpc.pb.h"
+
+// include std::regex
+#include <regex>
+#include <grpcpp/ext/proto_server_reflection_plugin.h>
+#include <grpcpp/grpcpp.h>
+#include <grpcpp/health_check_service_interface.h>
+
+using grpc::Server;
+using grpc::ServerBuilder;
+using grpc::ServerContext;
+using grpc::Status;
+
+
+using backend::HealthMessage;
+
+
+// completion token output with probabilities
+struct completion_token_output
+{
+    struct token_prob
+    {
+        llama_token tok;
+        float prob;
+    };
+
+    std::vector<token_prob> probs;
+    llama_token tok;
+};
+
+static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
+{
+    size_t i;
+    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
+    {
+    }
+    return i;
+}
+
+enum stop_type
+{
+    STOP_FULL,
+    STOP_PARTIAL,
+};
+
+static bool ends_with(const std::string &str, const std::string &suffix)
+{
+    return str.size() >= suffix.size() &&
+           0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
+}
+
+static size_t find_partial_stop_string(const std::string &stop,
+                                       const std::string &text)
+{
+    if (!text.empty() && !stop.empty())
+    {
+        const char text_last_char = text.back();
+        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
+        {
+            if (stop[char_index] == text_last_char)
+            {
+                const std::string current_partial = stop.substr(0, char_index + 1);
+                if (ends_with(text, current_partial))
+                {
+                    return text.size() - char_index - 1;
+                }
+            }
+        }
+    }
+    return std::string::npos;
+}
+
+template <class Iter>
+static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
+{
+    std::string ret;
+    for (; begin != end; ++begin)
+    {
+        ret += llama_token_to_piece(ctx, *begin);
+    }
+    return ret;
+}
+
+
+// format incomplete utf-8 multibyte character for output
+static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
+{
+    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
+    // if the size is 1 and first bit is 1, meaning it's a partial character
+    //   (size > 1 meaning it's already a known token)
+    if (out.size() == 1 && (out[0] & 0x80) == 0x80)
+    {
+        std::stringstream ss;
+        ss << std::hex << (out[0] & 0xff);
+        std::string res(ss.str());
+        out = "byte: \\x" + res;
+    }
+    return out;
+}
+
+struct llama_server_context
+{
+    bool stream = false;
+    bool has_next_token = false;
+    std::string generated_text;
+    std::vector<completion_token_output> generated_token_probs;
+
+    size_t num_prompt_tokens = 0;
+    size_t num_tokens_predicted = 0;
+    size_t n_past = 0;
+    size_t n_remain = 0;
+
+   // json prompt;
+    std::vector<llama_token> embd;
+    std::vector<llama_token> last_n_tokens;
+
+    llama_model *model = nullptr;
+    llama_context *ctx = nullptr;
+    gpt_params params;
+    int n_ctx;
+
+    grammar_parser::parse_state parsed_grammar;
+    llama_grammar *grammar = nullptr;
+
+    bool truncated = false;
+    bool stopped_eos = false;
+    bool stopped_word = false;
+    bool stopped_limit = false;
+    std::string stopping_word;
+    int32_t multibyte_pending = 0;
+
+    std::mutex mutex;
+
+    std::unique_lock<std::mutex> lock()
+    {
+        return std::unique_lock<std::mutex>(mutex);
+    }
+
+    ~llama_server_context()
+    {
+        if (ctx)
+        {
+            llama_free(ctx);
+            ctx = nullptr;
+        }
+        if (model)
+        {
+            llama_free_model(model);
+            model = nullptr;
+        }
+    }
+
+    void rewind()
+    {
+        params.antiprompt.clear();
+        params.grammar.clear();
+        num_prompt_tokens = 0;
+        num_tokens_predicted = 0;
+        generated_text = "";
+        generated_text.reserve(n_ctx);
+        generated_token_probs.clear();
+        truncated = false;
+        stopped_eos = false;
+        stopped_word = false;
+        stopped_limit = false;
+        stopping_word = "";
+        multibyte_pending = 0;
+        n_remain = 0;
+        n_past = 0;
+
+        if (grammar != nullptr) {
+            llama_grammar_free(grammar);
+            grammar = nullptr;
+        }
+    }
+
+    bool loadModel(const gpt_params &params_)
+    {
+                    printf("load model %s\n", params_.model.c_str());
+
+        params = params_;
+        std::tie(model, ctx) = llama_init_from_gpt_params(params);
+        if (model == nullptr)
+        {
+            printf("unable to load model %s\n", params_.model.c_str());
+            return false;
+        }
+        n_ctx = llama_n_ctx(ctx);
+        last_n_tokens.resize(n_ctx);
+        std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
+        return true;
+    }
+
+     std::vector<llama_token> tokenize_array(const char **prompts, bool add_bos) const
+    {
+        // If `add_bos` is true, we only add BOS, when json_prompt is a string,
+        // or the first element of the json_prompt array is a string.
+        std::vector<llama_token> prompt_tokens;
+
+  
+        bool first = true;
+        // Iterate over prompts
+        for (const char **p = prompts; *p != nullptr; ++p)
+        {
+            auto s = std::string(*p);
+            std::vector<llama_token> pp;
+            if (first)
+            {
+                pp = ::llama_tokenize(ctx, s, add_bos);
+                first = false;
+            }
+            else
+            {
+                pp = ::llama_tokenize(ctx, s, false);
+            }
+            prompt_tokens.insert(prompt_tokens.end(), pp.begin(), pp.end());
+        }
+        
+
+        return prompt_tokens;
+    }
+
+    std::vector<llama_token> tokenize_string(const char *prompt, bool add_bos) const
+    {
+        // If `add_bos` is true, we only add BOS, when json_prompt is a string,
+        // or the first element of the json_prompt array is a string.
+        std::vector<llama_token> prompt_tokens;
+  
+        auto s = std::string(prompt);
+        prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
+    
+        return prompt_tokens;
+    }
+
+    bool loadGrammar()
+    {
+        if (!params.grammar.empty()) {
+            parsed_grammar = grammar_parser::parse(params.grammar.c_str());
+            // will be empty (default) if there are parse errors
+            if (parsed_grammar.rules.empty()) {
+                printf("grammar parse error");
+                return false;
+            }
+            grammar_parser::print_grammar(stderr, parsed_grammar);
+
+            {
+                auto it = params.logit_bias.find(llama_token_eos(ctx));
+                if (it != params.logit_bias.end() && it->second == -INFINITY) {
+                    printf("EOS token is disabled, which will cause most grammars to fail");
+                }
+            }
+
+            std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
+            grammar = llama_grammar_init(
+                grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+        }
+        return true;
+    }
+
+    void loadInfill()
+    {
+        bool suff_rm_leading_spc = true;
+        if (params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
+            params.input_suffix.erase(0, 1);
+            suff_rm_leading_spc = false;
+        }
+
+        auto prefix_tokens = tokenize_string(params.input_prefix.c_str(), false);
+        auto suffix_tokens = tokenize_string(params.input_suffix.c_str(), false);
+        const int space_token = 29871;
+        if (suff_rm_leading_spc  && suffix_tokens[0] == space_token) {
+            suffix_tokens.erase(suffix_tokens.begin());
+        }
+        prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
+        prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(ctx)); // always add BOS
+        prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
+        prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
+        prefix_tokens.push_back(llama_token_middle(ctx));
+        auto prompt_tokens = prefix_tokens;
+
+        num_prompt_tokens = prompt_tokens.size();
+
+        if (params.n_keep < 0)
+        {
+            params.n_keep = (int)num_prompt_tokens;
+        }
+        params.n_keep = std::min(params.n_ctx - 4, params.n_keep);
+
+        // if input prompt is too big, truncate like normal
+        if (num_prompt_tokens >= (size_t)params.n_ctx)
+        {
+            printf("Input prompt is too big, truncating. Can only take %d tokens but got %zu\n", params.n_ctx, num_prompt_tokens);
+            // todo we probably want to cut from both sides
+            const int n_left = (params.n_ctx - params.n_keep) / 2;
+            std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
+            const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
+            new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
+            std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin());
+
+            truncated = true;
+            prompt_tokens = new_tokens;
+        }
+        else
+        {
+            const size_t ps = num_prompt_tokens;
+            std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0);
+            std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps);
+        }
+
+        // compare the evaluated prompt with the new prompt
+        n_past = common_part(embd, prompt_tokens);
+        embd = prompt_tokens;
+        if (n_past == num_prompt_tokens)
+        {
+            // we have to evaluate at least 1 token to generate logits.
+            printf("we have to evaluate at least 1 token to generate logits\n");
+            n_past--;
+        }
+
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
+        has_next_token = true;
+    }
+    void loadPrompt(std::string prompt)
+    {
+        auto prompt_tokens = tokenize_string(prompt.c_str(), true);  // always add BOS
+
+        num_prompt_tokens = prompt_tokens.size();
+
+        if (params.n_keep < 0)
+        {
+            params.n_keep = (int)num_prompt_tokens;
+        }
+        params.n_keep = std::min(n_ctx - 4, params.n_keep);
+
+        // if input prompt is too big, truncate like normal
+        if (num_prompt_tokens >= (size_t)n_ctx)
+        {
+            const int n_left = (n_ctx - params.n_keep) / 2;
+            std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
+            const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
+            new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
+            std::copy(prompt_tokens.end() - n_ctx, prompt_tokens.end(), last_n_tokens.begin());
+
+
+            truncated = true;
+            prompt_tokens = new_tokens;
+        }
+        else
+        {
+            const size_t ps = num_prompt_tokens;
+            std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0);
+            std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps);
+        }
+
+        // compare the evaluated prompt with the new prompt
+        n_past = common_part(embd, prompt_tokens);
+
+
+        embd = prompt_tokens;
+        if (n_past == num_prompt_tokens)
+        {
+            // we have to evaluate at least 1 token to generate logits.
+            n_past--;
+        }
+        // since #3228 we now have to manually manage the KV cache
+
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+        has_next_token = true;
+    }
+
+    void beginCompletion()
+    {
+        // number of tokens to keep when resetting context
+        n_remain = params.n_predict;
+        llama_set_rng_seed(ctx, params.seed);
+    }
+
+    completion_token_output nextToken()
+    {
+        completion_token_output result;
+        result.tok = -1;
+
+        if (embd.size() >= (size_t)n_ctx)
+        {
+            // Shift context
+
+            const int n_left    = n_past - params.n_keep - 1;
+            const int n_discard = n_left/2;
+
+            llama_kv_cache_seq_rm   (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
+            llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
+
+            for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++)
+            {
+                embd[i - n_discard] = embd[i];
+            }
+            embd.resize(embd.size() - n_discard);
+
+            n_past -= n_discard;
+
+            truncated = true;
+       
+        }
+
+        bool tg = true;
+        while (n_past < embd.size())
+        {
+            int n_eval = (int)embd.size() - n_past;
+            tg = n_eval == 1;
+            if (n_eval > params.n_batch)
+            {
+                n_eval = params.n_batch;
+            }
+
+            if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0)))
+            {
+             
+                has_next_token = false;
+                return result;
+            }
+            n_past += n_eval;
+        }
+
+        if (params.n_predict == 0)
+        {
+            has_next_token = false;
+            result.tok = llama_token_eos(ctx);
+            return result;
+        }
+
+        {
+            // out of user input, sample next token
+            std::vector<llama_token_data> candidates;
+            candidates.reserve(llama_n_vocab(model));
+
+            result.tok = llama_sample_token(ctx, NULL, grammar, params, last_n_tokens, candidates);
+
+            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+            const int32_t n_probs = params.n_probs;
+            if (params.temp <= 0 && n_probs > 0)
+            {
+                // For llama_sample_token_greedy we need to sort candidates
+                llama_sample_softmax(ctx, &candidates_p);
+            }
+
+            for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
+            {
+                result.probs.push_back({candidates_p.data[i].id, candidates_p.data[i].p});
+            }
+
+            last_n_tokens.erase(last_n_tokens.begin());
+            last_n_tokens.push_back(result.tok);
+            if (tg) {
+                num_tokens_predicted++;
+            }
+        }
+
+        // add it to the context
+        embd.push_back(result.tok);
+        // decrement remaining sampling budget
+        --n_remain;
+
+        if (!embd.empty() && embd.back() == llama_token_eos(ctx))
+        {
+            // stopping_word = llama_token_to_piece(ctx, embd.back());
+            has_next_token = false;
+            stopped_eos = true;
+            return result;
+        }
+
+        has_next_token = params.n_predict == -1 || n_remain != 0;
+        return result;
+    }
+
+    size_t findStoppingStrings(const std::string &text, const size_t last_token_size,
+                               const stop_type type)
+    {
+        size_t stop_pos = std::string::npos;
+        for (const std::string &word : params.antiprompt)
+        {
+            size_t pos;
+            if (type == STOP_FULL)
+            {
+                const size_t tmp = word.size() + last_token_size;
+                const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0;
+                pos = text.find(word, from_pos);
+            }
+            else
+            {
+                pos = find_partial_stop_string(word, text);
+            }
+            if (pos != std::string::npos &&
+                (stop_pos == std::string::npos || pos < stop_pos))
+            {
+                if (type == STOP_FULL)
+                {
+                    stopping_word = word;
+                    stopped_word = true;
+                    has_next_token = false;
+                }
+                stop_pos = pos;
+            }
+        }
+        return stop_pos;
+    }
+
+    completion_token_output doCompletion()
+    {
+        auto token_with_probs = nextToken();
+
+        const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
+        generated_text += token_text;
+
+        if (params.n_probs > 0)
+        {
+            generated_token_probs.push_back(token_with_probs);
+        }
+
+        if (multibyte_pending > 0)
+        {
+            multibyte_pending -= token_text.size();
+        }
+        else if (token_text.size() == 1)
+        {
+            const char c = token_text[0];
+            // 2-byte characters: 110xxxxx 10xxxxxx
+            if ((c & 0xE0) == 0xC0)
+            {
+                multibyte_pending = 1;
+                // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
+            }
+            else if ((c & 0xF0) == 0xE0)
+            {
+                multibyte_pending = 2;
+                // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+            }
+            else if ((c & 0xF8) == 0xF0)
+            {
+                multibyte_pending = 3;
+            }
+            else
+            {
+                multibyte_pending = 0;
+            }
+        }
+
+        if (multibyte_pending > 0 && !has_next_token)
+        {
+            has_next_token = true;
+            n_remain++;
+        }
+
+        if (!has_next_token && n_remain == 0)
+        {
+            stopped_limit = true;
+        }
+
+        return token_with_probs;
+    }
+
+    std::vector<float> getEmbedding()
+    {
+        static const int n_embd = llama_n_embd(model);
+        if (!params.embedding)
+        {
+            printf("embedding disabled");
+            return std::vector<float>(n_embd, 0.0f);
+        }
+        const float *data = llama_get_embeddings(ctx);
+        std::vector<float> embedding(data, data + n_embd);
+        return embedding;
+    }
+};
+
+
+static void parse_options_completion(bool streaming,const backend::PredictOptions* predict, llama_server_context &llama)
+{
+    gpt_params default_params;
+
+    llama.stream = streaming;
+    llama.params.n_predict = predict->tokens() == 0 ? -1 : predict->tokens();
+    llama.params.top_k = predict->topk();
+    llama.params.top_p = predict->topp();
+    llama.params.tfs_z = predict->tailfreesamplingz();
+    llama.params.typical_p = predict->typicalp();
+    llama.params.repeat_last_n = predict->repeat();
+    llama.params.temp = predict->temperature();
+    llama.params.repeat_penalty = predict->penalty();
+    llama.params.presence_penalty = predict->presencepenalty();
+    llama.params.frequency_penalty = predict->frequencypenalty();
+    llama.params.mirostat = predict->mirostat();
+    llama.params.mirostat_tau = predict->mirostattau();
+    llama.params.mirostat_eta = predict->mirostateta();
+    llama.params.penalize_nl = predict->penalizenl();
+    llama.params.n_keep = predict->nkeep();
+    llama.params.seed = predict->seed();
+    llama.params.grammar = predict->grammar();
+    // llama.params.n_probs = predict->
+    llama.params.prompt = predict->prompt();
+
+    llama.params.logit_bias.clear();
+
+    if (predict->ignoreeos())
+    {
+        llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
+    }
+
+    // const auto &logit_bias = body.find("logit_bias");
+    // if (logit_bias != body.end() && logit_bias->is_array())
+    // {
+    //     const int n_vocab = llama_n_vocab(llama.model);
+    //     for (const auto &el : *logit_bias)
+    //     {
+    //         if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
+    //         {
+    //             llama_token tok = el[0].get<llama_token>();
+    //             if (tok >= 0 && tok < n_vocab)
+    //             {
+    //                 if (el[1].is_number())
+    //                 {
+    //                     llama.params.logit_bias[tok] = el[1].get<float>();
+    //                 }
+    //                 else if (el[1].is_boolean() && !el[1].get<bool>())
+    //                 {
+    //                     llama.params.logit_bias[tok] = -INFINITY;
+    //                 }
+    //             }
+    //         }
+    //     }
+    // }
+
+    llama.params.antiprompt.clear();
+    for (const std::string& stopPrompt : predict->stopprompts()) {
+    if (!stopPrompt.empty())
+            {
+                llama.params.antiprompt.push_back(stopPrompt);
+            }
+    }
+}
+
+
+
+static void params_parse(const backend::ModelOptions* request,
+                                gpt_params & params) {
+   
+    params.model = request->modelfile();
+    //  params.model_alias ??
+    params.model_alias =  request->modelfile();
+    params.n_ctx = request->contextsize();
+    params.memory_f16 = request->f16memory();
+    params.n_threads = request->threads();
+    params.n_gpu_layers = request->ngpulayers();
+    params.n_batch = request->nbatch();
+
+    if (!request->tensorsplit().empty()) {
+        std::string arg_next = request->tensorsplit();
+
+        // split string by , and /
+        const std::regex regex{ R"([,/]+)" };
+        std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
+        std::vector<std::string> split_arg{ it, {} };
+
+        GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+
+        for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device) {
+            if (i_device < split_arg.size()) {
+                params.tensor_split[i_device] = std::stof(split_arg[i_device]);
+            }
+            else {
+                params.tensor_split[i_device] = 0.0f;
+            }
+        }
+    }
+
+    if (!request->maingpu().empty()) {
+        params.main_gpu = std::stoi(request->maingpu());
+    }
+    // TODO: lora needs also a scale factor
+    //params.lora_adapter = request->loraadapter();
+    //params.lora_base = request->lorabase();
+    params.use_mlock = request->mlock();
+    params.use_mmap = request->mmap();
+    params.embedding = request->embeddings();
+}
+
+static bool is_at_eob(llama_server_context &server_context, const llama_token *tokens, const size_t n_tokens) {
+    return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx);
+}
+
+// Function matching type llama_beam_search_callback_fn_t.
+// Custom callback example is called each time the beams lengths increase:
+//  * Show progress by printing ',' following by number of convergent beam tokens if any.
+//  * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
+//    This is also called when the stop condition is met.
+//    Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
+static void beam_search_callback(void *callback_data, llama_beams_state beams_state) {
+    auto & llama = *static_cast<llama_server_context*>(callback_data);
+    // Mark beams as EOS as needed.
+    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
+        llama_beam_view& beam_view = beams_state.beam_views[i];
+        if (!beam_view.eob && is_at_eob(llama, beam_view.tokens, beam_view.n_tokens)) {
+            beam_view.eob = true;
+        }
+    }
+    printf(",");  // Show progress
+    if (const size_t n = beams_state.common_prefix_length) {
+        llama.generated_token_probs.resize(llama.generated_token_probs.size() + n);
+        assert(0u < beams_state.n_beams);
+        const llama_token * tokens = beams_state.beam_views[0].tokens;
+        const auto map = [](llama_token tok) { return completion_token_output{{},tok}; };
+        std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map);
+        printf("%zu", n);
+    }
+    fflush(stdout);
+#if 0 // DEBUG: print current beams for this iteration
+    std::cout << "\n\nCurrent beams:\n";
+    for (size_t i=0 ; i < beams_state.n_beams ; ++i) {
+        std::cout << "beams["<<i<<"]: " << ostream_beam_view{state.ctx,beams_state.beam_views[i]} << std::endl;
+    }
+#endif
+}
+struct token_translator {
+    llama_context * ctx;
+    std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
+    std::string operator()(const completion_token_output & cto) const { return (*this)(cto.tok); }
+};
+
+
+static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama)
+{
+    auto & gtps = llama.generated_token_probs;
+    auto translator = token_translator{llama.ctx};
+    auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); };
+    const size_t len = std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen);
+    if (llama.generated_text.capacity() < llama.generated_text.size() + len) {
+        llama.generated_text.reserve(llama.generated_text.size() + len);
+    }
+    for (const completion_token_output & cto : gtps) {
+        llama.generated_text += translator(cto);
+    }
+}
+
+// GRPC Server start
+class BackendServiceImpl final : public backend::Backend::Service {
+  // The class has a llama instance that is shared across all RPCs
+  llama_server_context llama;
+public:
+  grpc::Status Health(ServerContext* context, const backend::HealthMessage* request, backend::Reply* reply) {
+    // Implement Health RPC
+    reply->set_message("OK");
+    return Status::OK;
+  }
+
+  grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
+    // Implement LoadModel RPC
+    gpt_params params;
+    params_parse(request, params);
+
+    llama_backend_init(params.numa);
+
+    // load the model
+    if (!llama.loadModel(params))
+    {
+        result->set_message("Failed loading model");
+        result->set_success(false);
+        return Status::CANCELLED;
+    }
+    result->set_message("Loading succeeded");
+    result->set_success(true);
+    return Status::OK;
+  }
+  grpc::Status PredictStream(grpc::ServerContext* context, const backend::PredictOptions* request, grpc::ServerWriter<backend::Reply>* writer) override {
+        // Implement the streaming logic here based on the request options
+        // You can use writer->Write(response) to send a reply to the client
+        // and return grpc::Status::OK when the operation is complete.
+        auto lock = llama.lock();
+
+        llama.rewind();
+
+        llama_reset_timings(llama.ctx);
+
+        parse_options_completion(false, request, llama);
+
+        if (!llama.loadGrammar())
+        {
+            //res.status = 400;
+            return Status::CANCELLED;
+        }
+
+        llama.loadPrompt(request->prompt());
+        llama.beginCompletion();
+        size_t sent_count = 0;
+        size_t sent_token_probs_index = 0;
+
+        while (llama.has_next_token) {
+            const completion_token_output token_with_probs = llama.doCompletion();
+            if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) {
+                continue;
+            }
+            const std::string token_text = llama_token_to_piece(llama.ctx, token_with_probs.tok);
+
+            size_t pos = std::min(sent_count, llama.generated_text.size());
+
+            const std::string str_test = llama.generated_text.substr(pos);
+            bool is_stop_full = false;
+            size_t stop_pos =
+                llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL);
+            if (stop_pos != std::string::npos) {
+                is_stop_full = true;
+                llama.generated_text.erase(
+                    llama.generated_text.begin() + pos + stop_pos,
+                    llama.generated_text.end());
+                pos = std::min(sent_count, llama.generated_text.size());
+            } else {
+                is_stop_full = false;
+                stop_pos = llama.findStoppingStrings(str_test, token_text.size(),
+                    STOP_PARTIAL);
+            }
+
+            if (
+                stop_pos == std::string::npos ||
+                // Send rest of the text if we are at the end of the generation
+                (!llama.has_next_token && !is_stop_full && stop_pos > 0)
+            ) {
+                const std::string to_send = llama.generated_text.substr(pos, std::string::npos);
+
+                sent_count += to_send.size();
+
+                std::vector<completion_token_output> probs_output = {};
+
+                if (llama.params.n_probs > 0) {
+                    const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
+                    size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
+                    size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
+                    if (probs_pos < probs_stop_pos) {
+                        probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos);
+                    }
+                    sent_token_probs_index = probs_stop_pos;
+                }
+                backend::Reply reply;
+                reply.set_message(to_send);
+
+                // Send the reply
+                writer->Write(reply);
+            }
+        }
+
+        llama_print_timings(llama.ctx);
+
+        llama.mutex.unlock();
+        lock.release();
+        return grpc::Status::OK;
+    }
+
+
+    grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) {
+        auto lock = llama.lock();
+        llama.rewind();
+        llama_reset_timings(llama.ctx);
+        parse_options_completion(false, request, llama);
+
+        if (!llama.loadGrammar())
+        {
+            //res.status = 400;
+            return Status::CANCELLED;
+        }
+
+        llama.loadPrompt(request->prompt());
+        llama.beginCompletion();
+
+        if (llama.params.n_beams) {
+            // Fill llama.generated_token_probs vector with final beam.
+            llama_beam_search(llama.ctx, beam_search_callback, &llama, llama.params.n_beams,
+                                llama.n_past, llama.n_remain);
+            // Translate llama.generated_token_probs to llama.generated_text.
+            append_to_generated_text_from_generated_token_probs(llama);
+        } else {
+            size_t stop_pos = std::string::npos;
+
+            while (llama.has_next_token) {
+                const completion_token_output token_with_probs = llama.doCompletion();
+                const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(llama.ctx, token_with_probs.tok);
+
+                stop_pos = llama.findStoppingStrings(llama.generated_text,
+                    token_text.size(), STOP_FULL);
+            }
+
+            if (stop_pos == std::string::npos) {
+                stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
+            }
+            if (stop_pos != std::string::npos) {
+                llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
+                    llama.generated_text.end());
+            }
+        }
+
+        auto probs = llama.generated_token_probs;
+        if (llama.params.n_probs > 0 && llama.stopped_word) {
+            const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
+            probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
+        }
+        reply->set_message(llama.generated_text);
+        return grpc::Status::OK;
+    }
+};
+
+void RunServer(const std::string& server_address) {
+  BackendServiceImpl service;
+
+  ServerBuilder builder;
+  builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
+  builder.RegisterService(&service);
+
+  std::unique_ptr<Server> server(builder.BuildAndStart());
+  std::cout << "Server listening on " << server_address << std::endl;
+  server->Wait();
+}
+
+int main(int argc, char** argv) {
+  std::string server_address("localhost:50051");
+
+  // Define long and short options
+  struct option long_options[] = {
+      {"addr", required_argument, nullptr, 'a'},
+      {nullptr, 0, nullptr, 0}
+  };
+
+  // Parse command-line arguments
+  int option;
+  int option_index = 0;
+  while ((option = getopt_long(argc, argv, "a:", long_options, &option_index)) != -1) {
+    switch (option) {
+      case 'a':
+        server_address = optarg;
+        break;
+      default:
+        std::cerr << "Usage: " << argv[0] << " [--addr=<address>] or [-a <address>]" << std::endl;
+        return 1;
+    }
+  }
+
+  RunServer(server_address);
+  return 0;
+}
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@ -17,6 +17,7 @@ import (
 const (
 	LlamaBackend        = "llama"
 	LlamaStableBackend  = "llama-stable"
+	LLamaCPP            = "llama-cpp"
 	BloomzBackend       = "bloomz"
 	StarcoderBackend    = "starcoder"
 	GPTJBackend         = "gptj"
@ -41,8 +42,9 @@ const (
 )

 var AutoLoadBackends []string = []string{
-	LlamaBackend,
+	LLamaCPP,
 	LlamaStableBackend,
+	LlamaBackend,
 	Gpt4All,
 	FalconBackend,
 	GPTNeoXBackend,
@ -175,11 +177,6 @@ func (ml *ModelLoader) BackendLoader(opts ...Option) (model *grpc.Client, err er
 	}

 	switch backend {
-	case LlamaBackend, LlamaStableBackend, GPTJBackend, DollyBackend,
-		MPTBackend, Gpt2Backend, FalconBackend,
-		GPTNeoXBackend, ReplitBackend, StarcoderBackend, BloomzBackend,
-		RwkvBackend, LCHuggingFaceBackend, BertEmbeddingsBackend, FalconGGMLBackend, StableDiffusionBackend, WhisperBackend:
-		return ml.LoadModel(o.model, ml.grpcModel(backend, o))
 	case Gpt4AllLlamaBackend, Gpt4AllMptBackend, Gpt4AllJBackend, Gpt4All:
 		o.gRPCOptions.LibrarySearchPath = filepath.Join(o.assetDir, "backend-assets", "gpt4all")
 		return ml.LoadModel(o.model, ml.grpcModel(Gpt4All, o))
@ -187,7 +184,7 @@ func (ml *ModelLoader) BackendLoader(opts ...Option) (model *grpc.Client, err er
 		o.gRPCOptions.LibrarySearchPath = filepath.Join(o.assetDir, "backend-assets", "espeak-ng-data")
 		return ml.LoadModel(o.model, ml.grpcModel(PiperBackend, o))
 	default:
-		return nil, fmt.Errorf("backend unsupported: %s", o.backendString)
+		return ml.LoadModel(o.model, ml.grpcModel(backend, o))
 	}
 }