From 61cc76c4558d933e312f48b5220635b03eb9255d Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 19 Apr 2025 15:52:29 +0200 Subject: [PATCH] chore(autogptq): drop archived backend (#5214) Signed-off-by: Ettore Di Giacinto --- .github/dependabot.yml | 4 - Dockerfile | 5 +- Makefile | 13 +- backend/backend.proto | 6 +- backend/python/autogptq/Makefile | 17 -- backend/python/autogptq/README.md | 5 - backend/python/autogptq/backend.py | 158 ------------------ backend/python/autogptq/install.sh | 14 -- .../python/autogptq/requirements-cublas11.txt | 2 - .../python/autogptq/requirements-cublas12.txt | 1 - .../python/autogptq/requirements-hipblas.txt | 2 - .../python/autogptq/requirements-intel.txt | 6 - backend/python/autogptq/requirements.txt | 6 - backend/python/autogptq/run.sh | 4 - backend/python/autogptq/test.sh | 6 - core/backend/options.go | 5 - core/config/backend_config.go | 11 -- core/http/middleware/request.go | 8 - core/schema/openai.go | 1 - core/schema/prediction.go | 2 - docs/content/docs/advanced/advanced-usage.md | 8 - .../content/docs/features/GPU-acceleration.md | 1 - docs/content/docs/features/text-generation.md | 42 +---- 23 files changed, 5 insertions(+), 322 deletions(-) delete mode 100644 backend/python/autogptq/Makefile delete mode 100644 backend/python/autogptq/README.md delete mode 100755 backend/python/autogptq/backend.py delete mode 100755 backend/python/autogptq/install.sh delete mode 100644 backend/python/autogptq/requirements-cublas11.txt delete mode 100644 backend/python/autogptq/requirements-cublas12.txt delete mode 100644 backend/python/autogptq/requirements-hipblas.txt delete mode 100644 backend/python/autogptq/requirements-intel.txt delete mode 100644 backend/python/autogptq/requirements.txt delete mode 100755 backend/python/autogptq/run.sh delete mode 100755 backend/python/autogptq/test.sh diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 570ac569..5e8f919b 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -29,10 +29,6 @@ updates: schedule: # Check for updates to GitHub Actions every weekday interval: "weekly" - - package-ecosystem: "pip" - directory: "/backend/python/autogptq" - schedule: - interval: "weekly" - package-ecosystem: "pip" directory: "/backend/python/bark" schedule: diff --git a/Dockerfile b/Dockerfile index 64861a8a..796a0d69 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,7 +15,7 @@ ARG TARGETARCH ARG TARGETVARIANT ENV DEBIAN_FRONTEND=noninteractive -ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh" +ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh" RUN apt-get update && \ apt-get install -y --no-install-recommends \ @@ -431,9 +431,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMA RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ make -C backend/python/vllm \ ; fi && \ - if [[ ( "${EXTRA_BACKENDS}" =~ "autogptq" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ - make -C backend/python/autogptq \ - ; fi && \ if [[ ( "${EXTRA_BACKENDS}" =~ "bark" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ make -C backend/python/bark \ ; fi && \ diff --git a/Makefile b/Makefile index d858e574..394d3772 100644 --- a/Makefile +++ b/Makefile @@ -505,18 +505,10 @@ protogen-go-clean: $(RM) bin/* .PHONY: protogen-python -protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen +protogen-python: bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen .PHONY: protogen-python-clean -protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean - -.PHONY: autogptq-protogen -autogptq-protogen: - $(MAKE) -C backend/python/autogptq protogen - -.PHONY: autogptq-protogen-clean -autogptq-protogen-clean: - $(MAKE) -C backend/python/autogptq protogen-clean +protogen-python-clean: bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean .PHONY: bark-protogen bark-protogen: @@ -593,7 +585,6 @@ vllm-protogen-clean: ## GRPC # Note: it is duplicated in the Dockerfile prepare-extra-conda-environments: protogen-python - $(MAKE) -C backend/python/autogptq $(MAKE) -C backend/python/bark $(MAKE) -C backend/python/coqui $(MAKE) -C backend/python/diffusers diff --git a/backend/backend.proto b/backend/backend.proto index cbb81c66..d5028efa 100644 --- a/backend/backend.proto +++ b/backend/backend.proto @@ -190,11 +190,7 @@ message ModelOptions { int32 NGQA = 20; string ModelFile = 21; - // AutoGPTQ - string Device = 22; - bool UseTriton = 23; - string ModelBaseName = 24; - bool UseFastTokenizer = 25; + // Diffusers string PipelineType = 26; diff --git a/backend/python/autogptq/Makefile b/backend/python/autogptq/Makefile deleted file mode 100644 index e2662b7a..00000000 --- a/backend/python/autogptq/Makefile +++ /dev/null @@ -1,17 +0,0 @@ -.PHONY: autogptq -autogptq: protogen - bash install.sh - -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - -.PHONY: protogen-clean -protogen-clean: - $(RM) backend_pb2_grpc.py backend_pb2.py - -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto - -.PHONY: clean -clean: protogen-clean - rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/autogptq/README.md b/backend/python/autogptq/README.md deleted file mode 100644 index 4a5480f1..00000000 --- a/backend/python/autogptq/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Creating a separate environment for the autogptq project - -``` -make autogptq -``` diff --git a/backend/python/autogptq/backend.py b/backend/python/autogptq/backend.py deleted file mode 100755 index 3b5515cb..00000000 --- a/backend/python/autogptq/backend.py +++ /dev/null @@ -1,158 +0,0 @@ -#!/usr/bin/env python3 -from concurrent import futures -import argparse -import signal -import sys -import os -import time -import base64 - -import grpc -import backend_pb2 -import backend_pb2_grpc - -from auto_gptq import AutoGPTQForCausalLM -from transformers import AutoTokenizer, AutoModelForCausalLM -from transformers import TextGenerationPipeline - -_ONE_DAY_IN_SECONDS = 60 * 60 * 24 - -# If MAX_WORKERS are specified in the environment use it, otherwise default to 1 -MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) - -# Implement the BackendServicer class with the service methods -class BackendServicer(backend_pb2_grpc.BackendServicer): - def Health(self, request, context): - return backend_pb2.Reply(message=bytes("OK", 'utf-8')) - def LoadModel(self, request, context): - try: - device = "cuda:0" - if request.Device != "": - device = request.Device - - # support loading local model files - model_path = os.path.join(os.environ.get('MODELS_PATH', './'), request.Model) - tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=request.TrustRemoteCode) - - # support model `Qwen/Qwen-VL-Chat-Int4` - if "qwen-vl" in request.Model.lower(): - self.model_name = "Qwen-VL-Chat" - model = AutoModelForCausalLM.from_pretrained(model_path, - trust_remote_code=request.TrustRemoteCode, - device_map="auto").eval() - else: - model = AutoGPTQForCausalLM.from_quantized(model_path, - model_basename=request.ModelBaseName, - use_safetensors=True, - trust_remote_code=request.TrustRemoteCode, - device=device, - use_triton=request.UseTriton, - quantize_config=None) - - self.model = model - self.tokenizer = tokenizer - except Exception as err: - return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") - return backend_pb2.Result(message="Model loaded successfully", success=True) - - def Predict(self, request, context): - penalty = 1.0 - if request.Penalty != 0.0: - penalty = request.Penalty - tokens = 512 - if request.Tokens != 0: - tokens = request.Tokens - top_p = 0.95 - if request.TopP != 0.0: - top_p = request.TopP - - - prompt_images = self.recompile_vl_prompt(request) - compiled_prompt = prompt_images[0] - print(f"Prompt: {compiled_prompt}", file=sys.stderr) - - # Implement Predict RPC - pipeline = TextGenerationPipeline( - model=self.model, - tokenizer=self.tokenizer, - max_new_tokens=tokens, - temperature=request.Temperature, - top_p=top_p, - repetition_penalty=penalty, - ) - t = pipeline(compiled_prompt)[0]["generated_text"] - print(f"generated_text: {t}", file=sys.stderr) - - if compiled_prompt in t: - t = t.replace(compiled_prompt, "") - # house keeping. Remove the image files from /tmp folder - for img_path in prompt_images[1]: - try: - os.remove(img_path) - except Exception as e: - print(f"Error removing image file: {img_path}, {e}", file=sys.stderr) - - return backend_pb2.Result(message=bytes(t, encoding='utf-8')) - - def PredictStream(self, request, context): - # Implement PredictStream RPC - #for reply in some_data_generator(): - # yield reply - # Not implemented yet - return self.Predict(request, context) - - def recompile_vl_prompt(self, request): - prompt = request.Prompt - image_paths = [] - - if "qwen-vl" in self.model_name.lower(): - # request.Images is an array which contains base64 encoded images. Iterate the request.Images array, decode and save each image to /tmp folder with a random filename. - # Then, save the image file paths to an array "image_paths". - # read "request.Prompt", replace "[img-%d]" with the image file paths in the order they appear in "image_paths". Save the new prompt to "prompt". - for i, img in enumerate(request.Images): - timestamp = str(int(time.time() * 1000)) # Generate timestamp - img_path = f"/tmp/vl-{timestamp}.jpg" # Use timestamp in filename - with open(img_path, "wb") as f: - f.write(base64.b64decode(img)) - image_paths.append(img_path) - prompt = prompt.replace(f"[img-{i}]", "" + img_path + ",") - else: - prompt = request.Prompt - return (prompt, image_paths) - -def serve(address): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), - options=[ - ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB - ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB - ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB - ]) - backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) - server.add_insecure_port(address) - server.start() - print("Server started. Listening on: " + address, file=sys.stderr) - - # Define the signal handler function - def signal_handler(sig, frame): - print("Received termination signal. Shutting down...") - server.stop(0) - sys.exit(0) - - # Set the signal handlers for SIGINT and SIGTERM - signal.signal(signal.SIGINT, signal_handler) - signal.signal(signal.SIGTERM, signal_handler) - - try: - while True: - time.sleep(_ONE_DAY_IN_SECONDS) - except KeyboardInterrupt: - server.stop(0) - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Run the gRPC server.") - parser.add_argument( - "--addr", default="localhost:50051", help="The address to bind the server to." - ) - args = parser.parse_args() - - serve(args.addr) \ No newline at end of file diff --git a/backend/python/autogptq/install.sh b/backend/python/autogptq/install.sh deleted file mode 100755 index 36443ef1..00000000 --- a/backend/python/autogptq/install.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -set -e - -source $(dirname $0)/../common/libbackend.sh - -# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links. -# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match. -# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index -# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index -if [ "x${BUILD_PROFILE}" == "xintel" ]; then - EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match" -fi - -installRequirements diff --git a/backend/python/autogptq/requirements-cublas11.txt b/backend/python/autogptq/requirements-cublas11.txt deleted file mode 100644 index cf469472..00000000 --- a/backend/python/autogptq/requirements-cublas11.txt +++ /dev/null @@ -1,2 +0,0 @@ ---extra-index-url https://download.pytorch.org/whl/cu118 -torch==2.4.1+cu118 diff --git a/backend/python/autogptq/requirements-cublas12.txt b/backend/python/autogptq/requirements-cublas12.txt deleted file mode 100644 index 20f84cf7..00000000 --- a/backend/python/autogptq/requirements-cublas12.txt +++ /dev/null @@ -1 +0,0 @@ -torch==2.4.1 \ No newline at end of file diff --git a/backend/python/autogptq/requirements-hipblas.txt b/backend/python/autogptq/requirements-hipblas.txt deleted file mode 100644 index ecd817dc..00000000 --- a/backend/python/autogptq/requirements-hipblas.txt +++ /dev/null @@ -1,2 +0,0 @@ ---extra-index-url https://download.pytorch.org/whl/rocm6.0 -torch==2.4.1+rocm6.0 \ No newline at end of file diff --git a/backend/python/autogptq/requirements-intel.txt b/backend/python/autogptq/requirements-intel.txt deleted file mode 100644 index 07b502eb..00000000 --- a/backend/python/autogptq/requirements-intel.txt +++ /dev/null @@ -1,6 +0,0 @@ ---extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ -intel-extension-for-pytorch==2.3.110+xpu -torch==2.3.1+cxx11.abi -oneccl_bind_pt==2.3.100+xpu -optimum[openvino] -setuptools \ No newline at end of file diff --git a/backend/python/autogptq/requirements.txt b/backend/python/autogptq/requirements.txt deleted file mode 100644 index 4b879746..00000000 --- a/backend/python/autogptq/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -accelerate -auto-gptq==0.7.1 -grpcio==1.71.0 -protobuf -certifi -transformers \ No newline at end of file diff --git a/backend/python/autogptq/run.sh b/backend/python/autogptq/run.sh deleted file mode 100755 index 375c07e5..00000000 --- a/backend/python/autogptq/run.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -source $(dirname $0)/../common/libbackend.sh - -startBackend $@ \ No newline at end of file diff --git a/backend/python/autogptq/test.sh b/backend/python/autogptq/test.sh deleted file mode 100755 index 6940b066..00000000 --- a/backend/python/autogptq/test.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -set -e - -source $(dirname $0)/../common/libbackend.sh - -runUnittests diff --git a/core/backend/options.go b/core/backend/options.go index 7a7a69bb..56cf3385 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -184,11 +184,6 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions { MainGPU: c.MainGPU, Threads: int32(*c.Threads), TensorSplit: c.TensorSplit, - // AutoGPTQ - ModelBaseName: c.AutoGPTQ.ModelBaseName, - Device: c.AutoGPTQ.Device, - UseTriton: c.AutoGPTQ.Triton, - UseFastTokenizer: c.AutoGPTQ.UseFastTokenizer, // RWKV Tokenizer: c.Tokenizer, } diff --git a/core/config/backend_config.go b/core/config/backend_config.go index 47ba4958..2c022912 100644 --- a/core/config/backend_config.go +++ b/core/config/backend_config.go @@ -50,9 +50,6 @@ type BackendConfig struct { // LLM configs (GPT4ALL, Llama.cpp, ...) LLMConfig `yaml:",inline"` - // AutoGPTQ specifics - AutoGPTQ AutoGPTQ `yaml:"autogptq"` - // Diffusers Diffusers Diffusers `yaml:"diffusers"` Step int `yaml:"step"` @@ -176,14 +173,6 @@ type LimitMMPerPrompt struct { LimitAudioPerPrompt int `yaml:"audio"` } -// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend -type AutoGPTQ struct { - ModelBaseName string `yaml:"model_base_name"` - Device string `yaml:"device"` - Triton bool `yaml:"triton"` - UseFastTokenizer bool `yaml:"use_fast_tokenizer"` -} - // TemplateConfig is a struct that holds the configuration of the templating system type TemplateConfig struct { // Chat is the template used in the chat completion endpoint diff --git a/core/http/middleware/request.go b/core/http/middleware/request.go index ae357e7b..b6934a82 100644 --- a/core/http/middleware/request.go +++ b/core/http/middleware/request.go @@ -203,18 +203,10 @@ func mergeOpenAIRequestAndBackendConfig(config *config.BackendConfig, input *sch config.Diffusers.ClipSkip = input.ClipSkip } - if input.ModelBaseName != "" { - config.AutoGPTQ.ModelBaseName = input.ModelBaseName - } - if input.NegativePromptScale != 0 { config.NegativePromptScale = input.NegativePromptScale } - if input.UseFastTokenizer { - config.UseFastTokenizer = input.UseFastTokenizer - } - if input.NegativePrompt != "" { config.NegativePrompt = input.NegativePrompt } diff --git a/core/schema/openai.go b/core/schema/openai.go index e445bee1..8eb20364 100644 --- a/core/schema/openai.go +++ b/core/schema/openai.go @@ -202,7 +202,6 @@ type OpenAIRequest struct { Backend string `json:"backend" yaml:"backend"` - // AutoGPTQ ModelBaseName string `json:"model_base_name" yaml:"model_base_name"` } diff --git a/core/schema/prediction.go b/core/schema/prediction.go index 15785f19..a75c7ab1 100644 --- a/core/schema/prediction.go +++ b/core/schema/prediction.go @@ -41,8 +41,6 @@ type PredictionOptions struct { RopeFreqBase float32 `json:"rope_freq_base" yaml:"rope_freq_base"` RopeFreqScale float32 `json:"rope_freq_scale" yaml:"rope_freq_scale"` NegativePromptScale float32 `json:"negative_prompt_scale" yaml:"negative_prompt_scale"` - // AutoGPTQ - UseFastTokenizer bool `json:"use_fast_tokenizer" yaml:"use_fast_tokenizer"` // Diffusers ClipSkip int `json:"clip_skip" yaml:"clip_skip"` diff --git a/docs/content/docs/advanced/advanced-usage.md b/docs/content/docs/advanced/advanced-usage.md index 62c19aba..3a370054 100644 --- a/docs/content/docs/advanced/advanced-usage.md +++ b/docs/content/docs/advanced/advanced-usage.md @@ -268,14 +268,6 @@ yarn_ext_factor: 0 yarn_attn_factor: 0 yarn_beta_fast: 0 yarn_beta_slow: 0 - -# AutoGPT-Q settings, for configurations specific to GPT models. -autogptq: - model_base_name: "" # Base name of the model. - device: "" # Device to run the model on. - triton: false # Whether to use Triton Inference Server. - use_fast_tokenizer: false # Whether to use a fast tokenizer for quicker processing. - # configuration for diffusers model diffusers: cuda: false # Whether to use CUDA diff --git a/docs/content/docs/features/GPU-acceleration.md b/docs/content/docs/features/GPU-acceleration.md index c4160738..9dc81aad 100644 --- a/docs/content/docs/features/GPU-acceleration.md +++ b/docs/content/docs/features/GPU-acceleration.md @@ -147,7 +147,6 @@ The devices in the following list have been tested with `hipblas` images running | diffusers | yes | Radeon VII (gfx906) | | piper | yes | Radeon VII (gfx906) | | whisper | no | none | -| autogptq | no | none | | bark | no | none | | coqui | no | none | | transformers | no | none | diff --git a/docs/content/docs/features/text-generation.md b/docs/content/docs/features/text-generation.md index 342b8e76..c4e637f7 100644 --- a/docs/content/docs/features/text-generation.md +++ b/docs/content/docs/features/text-generation.md @@ -74,49 +74,9 @@ curl http://localhost:8080/v1/models ## Backends -### AutoGPTQ - -[AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) is an easy-to-use LLMs quantization package with user-friendly apis, based on GPTQ algorithm. - -#### Prerequisites - -This is an extra backend - in the container images is already available and there is nothing to do for the setup. - -If you are building LocalAI locally, you need to install [AutoGPTQ manually](https://github.com/PanQiWei/AutoGPTQ#quick-installation). - - -#### Model setup - -The models are automatically downloaded from `huggingface` if not present the first time. It is possible to define models via `YAML` config file, or just by querying the endpoint with the `huggingface` repository model name. For example, create a `YAML` config file in `models/`: - -``` -name: orca -backend: autogptq -model_base_name: "orca_mini_v2_13b-GPTQ-4bit-128g.no-act.order" -parameters: - model: "TheBloke/orca_mini_v2_13b-GPTQ" -# ... -``` - -Test with: - -```bash -curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ - "model": "orca", - "messages": [{"role": "user", "content": "How are you?"}], - "temperature": 0.1 - }' -``` ### RWKV -A full example on how to run a rwkv model is in the [examples](https://github.com/go-skynet/LocalAI/tree/master/examples/rwkv). - -Note: rwkv models needs to specify the backend `rwkv` in the YAML config files and have an associated tokenizer along that needs to be provided with it: - -``` -36464540 -rw-r--r-- 1 mudler mudler 1.2G May 3 10:51 rwkv_small -36464543 -rw-r--r-- 1 mudler mudler 2.4M May 3 10:51 rwkv_small.tokenizer.json -``` +RWKV support is available through llama.cpp (see below) ### llama.cpp