Mirror of https://github.com/mudler/LocalAI.git
build(Makefile): add back single target to build native llama-cpp (#2448)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Parent: 10c64dbb55
Commit: ff8a6962cd
Changed: Makefile (8 lines)
@@ -672,6 +672,14 @@ else
 	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/${VARIANT} grpc-server
 endif
 
+# This target is for manually building a variant with-auto detected flags
+backend-assets/grpc/llama-cpp: backend-assets/grpc
+	cp -rf backend/cpp/llama backend/cpp/llama-cpp
+	$(MAKE) -C backend/cpp/llama-cpp purge
+	$(info ${GREEN}I llama-cpp build info:avx2${RESET})
+	$(MAKE) VARIANT="llama-cpp" build-llama-cpp-grpc-server
+	cp -rfv backend/cpp/llama-cpp/grpc-server backend-assets/grpc/llama-cpp
+
 backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-avx2
 	$(MAKE) -C backend/cpp/llama-avx2 purge
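The restored `backend-assets/grpc/llama-cpp` target can be invoked directly; a minimal usage sketch, assuming a source checkout with the usual build prerequisites (Go, cmake, a C++ toolchain) already in place:

```bash
# Build only the natively optimized llama.cpp gRPC backend;
# the Makefile picks the CPU flags of the host on its own.
make backend-assets/grpc/llama-cpp
```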
@@ -351,7 +351,7 @@ For example, to start vllm manually after compiling LocalAI (also assuming runni
 ./local-ai --external-grpc-backends "vllm:$PWD/backend/python/vllm/run.sh"
 ```
 
-Note that first is is necessary to create the conda environment with:
+Note that first it is necessary to create the environment with:
 
 ```bash
 make -C backend/python/vllm
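Putting the two steps from this hunk together, a minimal sketch of the manual vLLM flow (paths assume the default repository layout):

```bash
# Prepare the vLLM backend environment first...
make -C backend/python/vllm

# ...then start LocalAI and register the backend's run script.
./local-ai --external-grpc-backends "vllm:$PWD/backend/python/vllm/run.sh"
```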
@@ -369,7 +369,7 @@ there are additional environment variables available that modify the behavior of
 | `BUILD_TYPE` | | Build type. Available: `cublas`, `openblas`, `clblas` |
 | `GO_TAGS` | | Go tags. Available: `stablediffusion` |
 | `HUGGINGFACEHUB_API_TOKEN` | | Special token for interacting with HuggingFace Inference API, required only when using the `langchain-huggingface` backend |
-| `EXTRA_BACKENDS` | | A space separated list of backends to prepare. For example `EXTRA_BACKENDS="backend/python/diffusers backend/python/transformers"` prepares the conda environment on start |
+| `EXTRA_BACKENDS` | | A space separated list of backends to prepare. For example `EXTRA_BACKENDS="backend/python/diffusers backend/python/transformers"` prepares the python environment on start |
 | `DISABLE_AUTODETECT` | `false` | Disable autodetect of CPU flagset on start |
 | `LLAMACPP_GRPC_SERVERS` | | A list of llama.cpp workers to distribute the workload. For example `LLAMACPP_GRPC_SERVERS="address1:port,address2:port"` |
 
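As an illustration of how these variables are typically combined, a hedged sketch (the chosen values and the `build` target are assumptions for the example, not part of this diff):

```bash
# Build with CUDA (cuBLAS) support and the stablediffusion Go tag.
BUILD_TYPE=cublas GO_TAGS=stablediffusion make build

# Start while preparing an extra Python backend on boot and
# skipping CPU flagset autodetection; values are illustrative.
EXTRA_BACKENDS="backend/python/diffusers" DISABLE_AUTODETECT=true ./local-ai
```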
@@ -475,7 +475,7 @@ If you wish to build a custom container image with extra backends, you can use t
 ```Dockerfile
 FROM quay.io/go-skynet/local-ai:master-ffmpeg-core
 
-RUN PATH=$PATH:/opt/conda/bin make -C backend/python/diffusers
+RUN make -C backend/python/diffusers
 ```
 
 Remember also to set the `EXTERNAL_GRPC_BACKENDS` environment variable (or `--external-grpc-backends` as CLI flag) to point to the backends you are using (`EXTERNAL_GRPC_BACKENDS="backend_name:/path/to/backend"`), for example with diffusers:
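A hedged sketch of how such an image might be built and run, passing the backend registration at runtime as the paragraph above describes (image name and port mapping are illustrative):

```bash
# Build the custom image from the Dockerfile above.
docker build -t localai-extra-backends .

# Run it, registering the prepared diffusers backend at runtime.
docker run -p 8080:8080 \
  -e EXTERNAL_GRPC_BACKENDS="diffusers:/build/backend/python/diffusers/run.sh" \
  localai-extra-backends
```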
@@ -483,7 +483,7 @@ Remember also to set the `EXTERNAL_GRPC_BACKENDS` environment variable (or `--ex
 ```Dockerfile
 FROM quay.io/go-skynet/local-ai:master-ffmpeg-core
 
-RUN PATH=$PATH:/opt/conda/bin make -C backend/python/diffusers
+RUN make -C backend/python/diffusers
 
 ENV EXTERNAL_GRPC_BACKENDS="diffusers:/build/backend/python/diffusers/run.sh"
 ```
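With the `ENV` line baked into the image as in this variant, no runtime flag is needed; a minimal run sketch (the image name and the models volume mount are assumptions about the usual container layout):

```bash
docker build -t localai-diffusers .
# EXTERNAL_GRPC_BACKENDS is already set inside the image.
docker run -p 8080:8080 -v $PWD/models:/build/models localai-diffusers
```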
@@ -525,3 +525,8 @@ A list of the environment variable that tweaks parallelism is the following:
 
 Note that, for llama.cpp you need to set accordingly `LLAMACPP_PARALLEL` to the number of parallel processes your GPU/CPU can handle. For python-based backends (like vLLM) you can set `PYTHON_GRPC_MAX_WORKERS` to the number of parallel requests.
 
+### Disable CPU flagset auto detection in llama.cpp
+
+LocalAI will automatically discover the CPU flagset available in your host and will use the most optimized version of the backends.
+
+If you want to disable this behavior, you can set `DISABLE_AUTODETECT` to `true` in the environment variables.