docs: update to include installer and update advanced YAML options (#2631)

* docs: update quickstart and advanced sections

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* docs: improvements

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* examples(kubernete): add nvidia example

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Ettore Di Giacinto 2024-06-22 12:00:38 +02:00 committed by GitHub
parent 9fb3e4040b
commit 9a7ad75bff
11 changed files with 667 additions and 447 deletions

@@ -106,118 +106,202 @@ local-ai github://mudler/LocalAI/examples/configurations/phi-2.yaml@master
### Full config model file reference
```yaml
# Model name.
# The model name is used to identify the model in the API calls.
name: gpt-3.5-turbo
# Main configuration of the model, template, and system features.
name: "" # Model name, used to identify the model in API calls.
# Default model parameters.
# These options can also be specified in the API calls
parameters:
# Relative to the models path
model: luna-ai-llama2-uncensored.ggmlv3.q5_K_M.bin
# temperature
temperature: 0.3
# all the OpenAI request options here..
top_k:
top_p:
max_tokens:
ignore_eos: true
n_keep: 10
seed:
mode:
step:
negative_prompt:
typical_p:
tfz:
frequency_penalty:
# Precision settings for the model, reducing precision can enhance performance on some hardware.
f16: null # Whether to use 16-bit floating-point precision.
rope_freq_base:
rope_freq_scale:
negative_prompt_scale:
# Concurrency settings for the application.
threads: null # Number of threads to use for processing.
mirostat_eta:
mirostat_tau:
mirostat:
# Default context size
context_size: 512
# Default number of threads
threads: 10
# Define a backend (optional). By default it will try to guess the backend the first time the model is interacted with.
backend: llama-stable # available: llama, stablelm, gpt2, gptj rwkv
# stopwords (if supported by the backend)
stopwords:
- "HUMAN:"
- "### Response:"
# string to trim space to
trimspace:
- string
# Strings to cut from the response
cutstrings:
- "string"
# Roles define how different entities interact in a conversational model.
# It can be used to map roles to specific parts of the conversation.
roles: {} # Roles for entities like user, system, assistant, etc.
# Directory used to store additional assets
asset_dir: ""
# Backend to use for computation (like llama-cpp, diffusers, whisper).
backend: "" # Backend for AI computations.
# define chat roles
roles:
user: "HUMAN:"
system: "GPT:"
assistant: "ASSISTANT:"
# Templates for various types of model interactions.
template:
# template file ".tmpl" with the prompt template to use by default on the endpoint call. Note there is no extension in the files
completion: completion
chat: chat
edit: edit_template
function: function_template
chat: "" # Template for chat interactions. Uses golang templates with Sprig functions.
chat_message: "" # Template for individual chat messages. Uses golang templates with Sprig functions.
completion: "" # Template for generating text completions. Uses golang templates with Sprig functions.
edit: "" # Template for edit operations. Uses golang templates with Sprig functions.
function: "" # Template for function calls. Uses golang templates with Sprig functions.
use_tokenizer_template: false # Whether to use a specific tokenizer template. (vLLM)
join_chat_messages_by_character: null # Character to join chat messages, if applicable. Defaults to newline.
# Function-related settings to control behavior of specific function calls.
function:
disable_no_action: true
no_action_function_name: "reply"
no_action_description_name: "Reply to the AI assistant"
disable_no_action: false # Whether to disable the no-action behavior.
grammar:
parallel_calls: false # Allow returning parallel tool calls
disable_parallel_new_lines: false # Disable parallel processing for new lines in grammar checks.
mixed_mode: false # Allow mixed-mode grammar enforcing
no_mixed_free_string: false # Disallow free strings in mixed mode.
disable: false # Completely disable grammar enforcing functionality.
prefix: "" # Prefix to add before grammar rules.
expect_strings_after_json: false # Expect strings after JSON data.
no_action_function_name: "" # Function name to call when no action is determined.
no_action_description_name: "" # Description name for no-action functions.
response_regex: [] # Regular expressions to match the response from the LLM
json_regex_match: [] # Regular expressions to match JSON data when in tool mode
replace_function_results: [] # Placeholder to replace function call results with arbitrary strings or patterns.
replace_llm_results: [] # Replace language model results with arbitrary strings or patterns.
capture_llm_results: [] # Capture language model results as a text result alongside the JSON output of function calls. For instance, if a model returns a "thinking" block and a "response" block, this allows you to capture the thinking block.
return_name_in_function_response: false # Some models might prefer to use "name" rather than "function" when returning JSON data. This allows using "name" as a key in the JSON response.
system_prompt:
rms_norm_eps:
# Set it to 8 for llama2 70b
ngqa: 1
## LLAMA specific options
# Enable F16 if backend supports it
f16: true
# Enable debugging
debug: true
# Enable embeddings
embeddings: true
# Mirostat configuration (llama.cpp only)
mirostat_eta: 0.8
mirostat_tau: 0.9
mirostat: 1
# GPU Layers (only used when built with cublas)
gpu_layers: 22
# Enable memory lock
mmlock: true
# GPU setting to split the tensor in multiple parts and define a main GPU
# see llama.cpp for usage
# Feature gating flags to enable experimental or optional features.
feature_flags: {}
# System prompt to use by default.
system_prompt: ""
# Configuration for splitting tensors across GPUs.
tensor_split: ""
main_gpu: ""
# Define a prompt cache path (relative to the models)
prompt_cache_path: "prompt-cache"
# Cache all the prompts
prompt_cache_all: true
# Read only
prompt_cache_ro: false
# Enable mmap
mmap: true
# Enable low vram mode (GPU only)
low_vram: true
# Set NUMA mode (CPU only)
numa: true
# Lora settings
lora_adapter: "/path/to/lora/adapter"
lora_base: "/path/to/lora/base"
# Disable mulmatq (CUDA)
no_mulmatq: true
# Diffusers/transformers
cuda: true
# Identifier for the main GPU used in multi-GPU setups.
main_gpu: ""
# Small value added to the denominator in RMS normalization to prevent division by zero.
rms_norm_eps: 0
# Grouped-query attention parameter (e.g. set to 8 for llama2 70b).
ngqa: 0
# Path where prompt cache is stored.
prompt_cache_path: ""
# Whether to cache all prompts.
prompt_cache_all: false
# Whether the prompt cache is read-only.
prompt_cache_ro: false
# Mirostat sampling settings.
mirostat_eta: null
mirostat_tau: null
mirostat: null
# GPU-specific layers configuration.
gpu_layers: null
# Memory mapping for efficient I/O operations.
mmap: null
# Memory locking to ensure data remains in RAM.
mmlock: null
# Mode to use minimal VRAM for GPU operations.
low_vram: null
# Words or phrases that halt processing.
stopwords: []
# Strings to cut from responses to maintain context or relevance.
cutstrings: []
# Strings to trim from responses for cleaner outputs.
trimspace: []
trimsuffix: []
# Default context size for the model's understanding of the conversation or text.
context_size: null
# Non-uniform memory access settings, useful for systems with multiple CPUs.
numa: false
# Configuration for LoRA
lora_adapter: ""
lora_base: ""
lora_scale: 0
# Disable matrix multiplication queuing in GPU operations.
no_mulmatq: false
# Draft model used for speculative decoding.
draft_model: ""
n_draft: 0
# Quantization settings for the model, impacting memory and processing speed.
quantization: ""
# Utilization percentage of GPU memory to allocate for the model. (vLLM)
gpu_memory_utilization: 0
# Whether to trust and execute remote code.
trust_remote_code: false
# Force eager execution, disabling CUDA graph capture. (vLLM)
enforce_eager: false
# Space allocated for swapping data in and out of memory. (vLLM)
swap_space: 0
# Maximum model context length, in tokens. (vLLM)
max_model_len: 0
# Size of the tensor parallelism in distributed computing environments. (vLLM)
tensor_parallel_size: 0
# Multimodal projector file to use for vision models (e.g. llava).
mmproj: ""
# Disables offloading of key/value pairs in transformer models to save memory.
no_kv_offloading: false
# RoPE scaling configuration, used to extend the usable context window.
rope_scaling: ""
# Type of configuration, often related to the type of task or model architecture.
type: ""
# YARN settings
yarn_ext_factor: 0
yarn_attn_factor: 0
yarn_beta_fast: 0
yarn_beta_slow: 0
# AutoGPTQ settings, for models quantized with AutoGPTQ.
autogptq:
model_base_name: "" # Base name of the model.
device: "" # Device to run the model on.
triton: false # Whether to use Triton Inference Server.
use_fast_tokenizer: false # Whether to use a fast tokenizer for quicker processing.
# configuration for diffusers model
diffusers:
cuda: false # Whether to use CUDA
pipeline_type: "" # Type of pipeline to use.
scheduler_type: "" # Type of scheduler for controlling operations.
enable_parameters: "" # Parameters to enable in the diffuser.
cfg_scale: 0 # Scale for CFG in the diffuser setup.
img2img: false # Whether image-to-image transformation is supported.
clip_skip: 0 # Number of steps to skip in CLIP operations.
clip_model: "" # Model to use for CLIP operations.
clip_subfolder: "" # Subfolder for storing CLIP-related data.
control_net: "" # Control net to use
# Step count, usually for image processing models
step: 0
# Configuration for gRPC communication.
grpc:
attempts: 0 # Number of retry attempts for gRPC calls.
attempts_sleep_time: 0 # Sleep time between retries.
# Text-to-Speech (TTS) configuration.
tts:
voice: "" # Voice setting for TTS.
vall-e:
audio_path: "" # Path to audio files for Vall-E.
# Whether to use CUDA for GPU-based operations.
cuda: false
# List of files to download as part of the setup or operations.
download_files: []
```
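Most of these fields are optional. As a point of reference, a minimal configuration for a chat model could look like the following sketch (the model file, template name, and stopword values are illustrative):
```yaml
# Minimal example configuration (illustrative values)
name: my-model                  # Name used in API calls ("model": "my-model")
parameters:
  model: my-model.Q4_K_M.gguf   # Model file, relative to the models path
  temperature: 0.7
context_size: 4096
threads: 4
template:
  # Template files are ".tmpl" files next to the model, referenced without extension
  chat: my-model-chat
stopwords:
- "<|im_end|>"
```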
### Prompt templates

@@ -0,0 +1,33 @@
+++
disableToc = false
title = "Installer options"
weight = 24
+++
An installation script is available for quick and hassle-free installations, streamlining the setup process for new users.
It can be used with the following command:
```bash
curl https://localai.io/install.sh | sh
```
Installation can be configured with environment variables, for example:
```bash
curl https://localai.io/install.sh | VAR=value sh
```
List of the environment variables:
| Environment Variable | Description |
|----------------------|--------------------------------------------------------------|
| **DOCKER_INSTALL** | Set to "true" to enable the installation of Docker images. |
| **USE_AIO** | Set to "true" to use the all-in-one LocalAI Docker image. |
| **API_KEY** | Specify an API key for accessing LocalAI, if required. |
| **CORE_IMAGES** | Set to "true" to download core LocalAI images. |
| **PORT** | Specifies the port on which LocalAI will run (default is 8080). |
| **THREADS** | Number of processor threads the application should use. Defaults to the number of logical cores minus one. |
| **VERSION** | Specifies the version of LocalAI to install. Defaults to the latest available version. |
| **MODELS_PATH** | Directory path where LocalAI models are stored (default is /usr/share/local-ai/models). |
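For example, to run LocalAI on a different port with a fixed thread count and a custom models directory (the values below are purely illustrative):
```bash
curl https://localai.io/install.sh | PORT=9090 THREADS=8 MODELS_PATH=/opt/local-ai/models sh
```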
We are looking into improving the installer and, as this is a first iteration, any feedback is welcome! Open an [issue](https://github.com/mudler/LocalAI/issues/new/choose) if something doesn't work for you!

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "Run other Models"
weight = 3
weight = 23
icon = "rocket_launch"
+++

@@ -23,6 +23,20 @@ For GPU Acceleration support for Nvidia video graphic cards, use the Nvidia/CUDA
{{% /alert %}}
#### Prerequisites
Before you begin, ensure you have a container engine installed if you are not using the binaries. Suitable options include Docker or Podman. For installation instructions, refer to the following guides:
- [Install Docker Desktop (Mac, Windows, Linux)](https://docs.docker.com/get-docker/)
- [Install Podman (Linux)](https://podman.io/getting-started/installation)
- [Install Docker engine (Servers)](https://docs.docker.com/engine/install/#get-started)
{{% alert icon="💡" %}}
**Hardware Requirements:** The hardware requirements for LocalAI vary based on the model size and quantization method used. For performance benchmarks with different backends, such as `llama.cpp`, visit [this link](https://github.com/ggerganov/llama.cpp#memorydisk-requirements). The `rwkv` backend is noted for its lower resource consumption.
{{% /alert %}}
## All-in-one images
All-In-One (AIO) images come pre-configured with a set of models and backends to fully leverage almost all of the LocalAI feature set. These images are available for both CPU and GPU environments. The AIO images are designed to be easy to use and require no configuration. Model configurations can be found [here](https://github.com/mudler/LocalAI/tree/master/aio), separated by size.
@@ -45,11 +59,72 @@ Select the image (CPU or GPU) and start the container with Docker:
```bash
# CPU example
docker run -p 8080:8080 --name local-ai -ti localai/localai:latest-aio-cpu
# For Nvidia GPUs:
# docker run -p 8080:8080 --gpus all --name local-ai -ti localai/localai:latest-aio-gpu-nvidia-cuda-11
# docker run -p 8080:8080 --gpus all --name local-ai -ti localai/localai:latest-aio-gpu-nvidia-cuda-12
```
LocalAI will automatically download all the required models, and the API will be available at [localhost:8080](http://localhost:8080/v1/models).
### Available images
Or with a docker-compose file:
```yaml
version: "3.9"
services:
api:
image: localai/localai:latest-aio-cpu
# For a specific version:
# image: localai/localai:{{< version >}}-aio-cpu
# For Nvidia GPUs uncomment one of the following (cuda11 or cuda12):
# image: localai/localai:{{< version >}}-aio-gpu-nvidia-cuda-11
# image: localai/localai:{{< version >}}-aio-gpu-nvidia-cuda-12
# image: localai/localai:latest-aio-gpu-nvidia-cuda-11
# image: localai/localai:latest-aio-gpu-nvidia-cuda-12
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8080/readyz"]
interval: 1m
timeout: 20m
retries: 5
ports:
- 8080:8080
environment:
- DEBUG=true
# ...
volumes:
- ./models:/build/models:cached
# Uncomment the following section if running with Nvidia GPUs
# deploy:
# resources:
# reservations:
# devices:
# - driver: nvidia
# count: 1
# capabilities: [gpu]
```
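Assuming the file above is saved as `docker-compose.yaml` in the current directory, the stack can then be started and checked with:
```bash
docker compose up -d
# Once the healthcheck passes, the API answers on port 8080:
curl http://localhost:8080/readyz
curl http://localhost:8080/v1/models
```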
{{% alert icon="💡" %}}
**Models caching**: The **AIO** image will download the required models on the first run if they are not already present, and store them in `/build/models` inside the container. The AIO models are automatically updated with new versions of the AIO images.
You can change the directory inside the container by specifying a `MODELS_PATH` environment variable (or `--models-path`).
If you want to use a local directory or a named volume, you can mount it to `/build/models`:
```bash
docker run -p 8080:8080 --name local-ai -ti -v $PWD/models:/build/models localai/localai:latest-aio-cpu
```
or use a named volume:
```bash
docker volume create localai-models
docker run -p 8080:8080 --name local-ai -ti -v localai-models:/build/models localai/localai:latest-aio-cpu
```
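As a sketch, changing the directory used inside the container with `MODELS_PATH` while mounting a host path to it could look like this (paths are illustrative):
```bash
docker run -p 8080:8080 --name local-ai -ti \
  -e MODELS_PATH=/models -v $PWD/models:/models \
  localai/localai:latest-aio-cpu
```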
{{% /alert %}}
### Available AIO images
| Description | Quay | Docker Hub |
| --- | --- |-----------------------------------------------|
@@ -68,7 +143,7 @@ The AIO Images are inheriting the same environment variables as the base images
| Variable | Default | Description |
| ---------------------| ------- | ----------- |
| `PROFILE` | Auto-detected | The size of the model to use. Available: `cpu`, `gpu-8g` |
| `MODELS` | Auto-detected | A list of models YAML Configuration file URI/URL (see also [running models]({{%relref "docs/getting-started/run-other-models" %}})) |
| `MODELS` | Auto-detected | A list of models YAML Configuration file URI/URL (see also [running models]({{%relref "docs/advanced/run-other-models" %}})) |
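For instance, a profile and a model list can be set explicitly when starting the container (the model URI below is illustrative):
```bash
docker run -p 8080:8080 --gpus all --name local-ai -ti \
  -e PROFILE=gpu-8g \
  -e MODELS="github://mudler/LocalAI/examples/configurations/phi-2.yaml@master" \
  localai/localai:latest-aio-gpu-nvidia-cuda-12
```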
## Standard container images

@@ -1,7 +1,7 @@
+++
disableToc = false
title = "Customizing the Model"
weight = 4
weight = 5
icon = "rocket_launch"
+++

@@ -1,31 +1,39 @@
+++
disableToc = false
title = "Run models manually"
weight = 5
icon = "rocket_launch"
---
+++
disableToc: false
title: "Run models manually"
weight: 5
icon: "rocket_launch"
---
1. Ensure you have a model file, a configuration YAML file, or both. Customize model defaults and specific settings with a configuration file. For advanced configurations, refer to the [Advanced Documentation](docs/advanced).
# Run Models Manually
2. For GPU Acceleration instructions, visit [GPU acceleration](docs/features/gpu-acceleration).
Follow these steps to manually run models using LocalAI:
1. **Prepare Your Model and Configuration Files**:
Ensure you have a model file and a configuration YAML file, if necessary. Customize model defaults and specific settings with a configuration file. For advanced configurations, refer to the [Advanced Documentation]({{% relref "docs/advanced" %}}).
2. **GPU Acceleration**:
For instructions on GPU acceleration, visit the [GPU acceleration]({{% relref "docs/features/gpu-acceleration" %}}) page.
3. **Run LocalAI**:
Choose one of the following methods to run LocalAI:
{{< tabs tabTotal="5" >}}
{{% tab tabName="Docker" %}}
```bash
# Prepare the models into the `model` directory
# Prepare the models into the `models` directory
mkdir models
# copy your models to it
# Copy your models to the directory
cp your-model.gguf models/
# run the LocalAI container
# Run the LocalAI container
docker run -p 8080:8080 -v $PWD/models:/models -ti --rm quay.io/go-skynet/local-ai:latest --models-path /models --context-size 700 --threads 4
# You should see:
#
# Expected output:
# ┌───────────────────────────────────────────────────┐
# │ Fiber v2.42.0 │
# │ http://127.0.0.1:8080 │
@@ -35,7 +43,7 @@ docker run -p 8080:8080 -v $PWD/models:/models -ti --rm quay.io/go-skynet/local-
# │ Prefork ....... Disabled PID ................. 1 │
# └───────────────────────────────────────────────────┘
# Try the endpoint with curl
# Test the endpoint with curl
curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
"model": "your-model.gguf",
"prompt": "A long time ago in a galaxy far, far away",
@@ -44,15 +52,12 @@ curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d
```
{{% alert icon="💡" %}}
**Other Docker Images**:
For other Docker images, please see the table in
https://localai.io/basics/getting_started/#container-images.
For other Docker images, please refer to the table in [the container images section]({{% relref "docs/getting-started/container-images" %}}).
{{% /alert %}}
Here is a more specific example:
### Example:
```bash
mkdir models
@@ -60,12 +65,12 @@ mkdir models
# Download luna-ai-llama2 to models/
wget https://huggingface.co/TheBloke/Luna-AI-Llama2-Uncensored-GGUF/resolve/main/luna-ai-llama2-uncensored.Q4_0.gguf -O models/luna-ai-llama2
# Use a template from the examples
# Use a template from the examples, if needed
cp -rf prompt-templates/getting_started.tmpl models/luna-ai-llama2.tmpl
docker run -p 8080:8080 -v $PWD/models:/models -ti --rm quay.io/go-skynet/local-ai:latest --models-path /models --context-size 700 --threads 4
# Now API is accessible at localhost:8080
# Now the API is accessible at localhost:8080
curl http://localhost:8080/v1/models
# {"object":"list","data":[{"id":"luna-ai-llama2","object":"model"}]}
@@ -78,12 +83,12 @@ curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/jso
```
{{% alert note %}}
- If running on Apple Silicon (ARM) it is **not** suggested to run on Docker due to emulation. Follow the [build instructions]({{%relref "docs/getting-started/build" %}}) to use Metal acceleration for full GPU support.
- If you are running Apple x86_64 you can use `docker`, there is no additional gain into building it from source.
- If running on Apple Silicon (ARM), it is **not** recommended to run on Docker due to emulation. Follow the [build instructions]({{% relref "docs/getting-started/build" %}}) to use Metal acceleration for full GPU support.
- If you are running on Apple x86_64, you can use Docker without additional gain from building it from source.
{{% /alert %}}
{{% /tab %}}
{{% tab tabName="Docker compose" %}}
{{% tab tabName="Docker Compose" %}}
```bash
# Clone LocalAI
@@ -91,21 +96,21 @@ git clone https://github.com/go-skynet/LocalAI
cd LocalAI
# (optional) Checkout a specific LocalAI tag
# (Optional) Checkout a specific LocalAI tag
# git checkout -b build <TAG>
# copy your models to models/
# Copy your models to the models directory
cp your-model.gguf models/
# (optional) Edit the .env file to set things like context size and threads
# (Optional) Edit the .env file to set parameters like context size and threads
# vim .env
# start with docker compose
# Start with Docker Compose
docker compose up -d --pull always
# or you can build the images with:
# Or build the images with:
# docker compose up -d --build
# Now API is accessible at localhost:8080
# Now the API is accessible at localhost:8080
curl http://localhost:8080/v1/models
# {"object":"list","data":[{"id":"your-model.gguf","object":"model"}]}
@@ -117,48 +122,43 @@ curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d
```
{{% alert icon="💡" %}}
**Other Docker Images**:
For other Docker images, please see the table in
https://localai.io/basics/getting_started/#container-images.
For other Docker images, please refer to the table in [Getting Started](https://localai.io/basics/getting_started/#container-images).
{{% /alert %}}
Note: If you are on Windows, please make sure the project is on the Linux Filesystem, otherwise loading models might be slow. For more Info: [Microsoft Docs](https://learn.microsoft.com/en-us/windows/wsl/filesystems)
Note: If you are on Windows, ensure the project is on the Linux filesystem to avoid slow model loading. For more information, see the [Microsoft Docs](https://learn.microsoft.com/en-us/windows/wsl/filesystems).
{{% /tab %}}
{{% tab tabName="Kubernetes" %}}
See the [Kubernetes section]({{%relref "docs/getting-started/kubernetes" %}}).
For Kubernetes deployment, see the [Kubernetes section]({{% relref "docs/getting-started/kubernetes" %}}).
{{% /tab %}}
{{% tab tabName="From binary" %}}
{{% tab tabName="From Binary" %}}
LocalAI binary releases are available in [Github](https://github.com/go-skynet/LocalAI/releases).
LocalAI binary releases are available on [GitHub](https://github.com/go-skynet/LocalAI/releases).
{{% alert icon="⚠️" %}}
If you are installing on MacOS, when you excecute the binary, you will get a message saying:
If installing on macOS, you might encounter a message saying:
> "local-ai-git-Darwin-arm64" (or whatever name you gave to the binary) can't be opened because Apple cannot check it for malicious software.
> "local-ai-git-Darwin-arm64" (or the name you gave the binary) can't be opened because Apple cannot check it for malicious software.
Hit OK, and go to Settings > Privacy & Security > Security and look for the message:
Hit OK, then go to Settings > Privacy & Security > Security and look for the message:
> "local-ai-git-Darwin-arm64" was blocked from use because it is not from an identified developer.
And press "Allow Anyway"
Press "Allow Anyway."
{{% /alert %}}
{{% /tab %}}
{{% tab tabName="From Source" %}}
For instructions on building LocalAI from source, see the [Build Section]({{% relref "docs/getting-started/build" %}}).
{{% /tab %}}
{{% tab tabName="From source" %}}
See the [build section]({{%relref "docs/getting-started/build" %}}).
{{% /tab %}}
{{< /tabs >}}
For more model configurations, visit the [Examples Section](https://github.com/mudler/LocalAI/tree/master/examples/configurations).
---

@@ -1,42 +1,42 @@
+++
disableToc = false
title = "Quickstart"
weight = 3
url = '/basics/getting_started/'
icon = "rocket_launch"
+++
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that's compatible with OpenAI API specifications for local inferencing. It allows you to run [LLMs]({{%relref "docs/features/text-generation" %}}), generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families and architectures.
**LocalAI** is a free, open-source alternative to OpenAI (Anthropic, etc.), functioning as a drop-in replacement REST API for local inferencing. It allows you to run [LLMs]({{% relref "docs/features/text-generation" %}}), generate images, and produce audio, all locally or on-premises with consumer-grade hardware, supporting multiple model families and architectures.
LocalAI is available as a container image and binary, compatible with various container engines like Docker, Podman, and Kubernetes. Container images are published on [quay.io](https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest) and [Docker Hub](https://hub.docker.com/r/localai/localai). Binaries can be downloaded from [GitHub](https://github.com/mudler/LocalAI/releases).
## Using the Bash Installer
## Prerequisites
Install LocalAI easily using the bash installer with the following command:
Before you begin, ensure you have a container engine installed if you are not using the binaries. Suitable options include Docker or Podman. For installation instructions, refer to the following guides:
```sh
curl https://localai.io/install.sh | sh
```
- [Install Docker Desktop (Mac, Windows, Linux)](https://docs.docker.com/get-docker/)
- [Install Podman (Linux)](https://podman.io/getting-started/installation)
- [Install Docker engine (Servers)](https://docs.docker.com/engine/install/#get-started)
For a full list of options, refer to the [Installer Options]({{% relref "docs/advanced/installer" %}}) documentation.
{{% alert icon="💡" %}}
Binaries can also be [manually downloaded]({{% relref "docs/reference/binaries" %}}).
**Hardware Requirements:** The hardware requirements for LocalAI vary based on the model size and quantization method used. For performance benchmarks with different backends, such as `llama.cpp`, visit [this link](https://github.com/ggerganov/llama.cpp#memorydisk-requirements). The `rwkv` backend is noted for its lower resource consumption.
## Using Container Images or Kubernetes
{{% /alert %}}
LocalAI is available as a container image compatible with various container engines such as Docker, Podman, and Kubernetes. Container images are published on [quay.io](https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest) and [Docker Hub](https://hub.docker.com/r/localai/localai).
For detailed instructions, see [Using container images]({{% relref "docs/getting-started/container-images" %}}). For Kubernetes deployment, see [Run with Kubernetes]({{% relref "docs/getting-started/kubernetes" %}}).
## Running LocalAI with All-in-One (AIO) Images
> _Do you have already a model file? Skip to [Run models manually]({{%relref "docs/getting-started/manual" %}}) or [Run other models]({{%relref "docs/getting-started/run-other-models" %}}) to use an already-configured model_.
> _Already have a model file? Skip to [Run models manually]({{% relref "docs/getting-started/manual" %}})_.
LocalAI's All-in-One (AIO) images are pre-configured with a set of models and backends to fully leverage almost all the LocalAI featureset. If you don't need models pre-configured, you can use the standard [images]({{%relref "docs/getting-started/container-images" %}}).
LocalAI's All-in-One (AIO) images are pre-configured with a set of models and backends to fully leverage almost all the features of LocalAI. If pre-configured models are not required, you can use the standard [images]({{% relref "docs/getting-started/container-images" %}}).
These images are available for both CPU and GPU environments. The AIO images are designed to be easy to use and requires no configuration.
These images are available for both CPU and GPU environments. AIO images are designed for ease of use and require no additional configuration.
It suggested to use the AIO images if you don't want to configure the models to run on LocalAI. If you want to run specific models, you can use the [manual method]({{%relref "docs/getting-started/manual" %}}).
It is recommended to use AIO images if you prefer not to configure the models manually or via the web interface. For running specific models, refer to the [manual method]({{% relref "docs/getting-started/manual" %}}).
The AIO Images comes pre-configured with the following features:
The AIO images come pre-configured with the following features:
- Text to Speech (TTS)
- Speech to Text
- Function calling
@@ -44,282 +44,17 @@ The AIO Images comes pre-configured with the following features:
- Image generation
- Embedding server
For instructions on using AIO images, see [Using container images]({{% relref "docs/getting-started/container-images#all-in-one-images" %}}).
Start the image with Docker:
## What's Next?
```bash
docker run -p 8080:8080 --name local-ai -ti localai/localai:latest-aio-cpu
# For Nvidia GPUs:
# docker run -p 8080:8080 --gpus all --name local-ai -ti localai/localai:latest-aio-gpu-nvidia-cuda-11
# docker run -p 8080:8080 --gpus all --name local-ai -ti localai/localai:latest-aio-gpu-nvidia-cuda-12
```
There is much more to explore with LocalAI! You can run any model from Hugging Face, perform video generation, and also voice cloning. For a comprehensive overview, check out the [features]({{% relref "docs/features" %}}) section.
Explore additional resources and community contributions:
Or with a docker-compose file:
```yaml
version: "3.9"
services:
api:
image: localai/localai:latest-aio-cpu
# For a specific version:
# image: localai/localai:{{< version >}}-aio-cpu
# For Nvidia GPUs decomment one of the following (cuda11 or cuda12):
# image: localai/localai:{{< version >}}-aio-gpu-nvidia-cuda-11
# image: localai/localai:{{< version >}}-aio-gpu-nvidia-cuda-12
# image: localai/localai:latest-aio-gpu-nvidia-cuda-11
# image: localai/localai:latest-aio-gpu-nvidia-cuda-12
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8080/readyz"]
interval: 1m
timeout: 20m
retries: 5
ports:
- 8080:8080
environment:
- DEBUG=true
# ...
volumes:
- ./models:/build/models:cached
# decomment the following piece if running with Nvidia GPUs
# deploy:
# resources:
# reservations:
# devices:
# - driver: nvidia
# count: 1
# capabilities: [gpu]
```
For a list of all the container-images available, see [Container images]({{%relref "docs/getting-started/container-images" %}}). To learn more about All-in-one images instead, see [All-in-one Images]({{%relref "docs/getting-started/container-images" %}}).
{{% alert icon="💡" %}}
**Models caching**: The **AIO** image will download the needed models on the first run if not already present and store those in `/build/models` inside the container. The AIO models will be automatically updated with new versions of AIO images.
You can change the directory inside the container by specifying a `MODELS_PATH` environment variable (or `--models-path`).
If you want to use a named model or a local directory, you can mount it as a volume to `/build/models`:
```bash
docker run -p 8080:8080 --name local-ai -ti -v $PWD/models:/build/models localai/localai:latest-aio-cpu
```
or associate a volume:
```bash
docker volume create localai-models
docker run -p 8080:8080 --name local-ai -ti -v localai-models:/build/models localai/localai:latest-aio-cpu
```
{{% /alert %}}
## Running LocalAI from Binaries
LocalAI binaries are available for both Linux and MacOS platforms and can be executed directly from your command line. These binaries are continuously updated and hosted on [our GitHub Releases page](https://github.com/mudler/LocalAI/releases). This method also supports Windows users via the Windows Subsystem for Linux (WSL).
Use the following one-liner command in your terminal to download and run LocalAI on Linux or MacOS:
```bash
curl -Lo local-ai "https://github.com/mudler/LocalAI/releases/download/{{< version >}}/local-ai-$(uname -s)-$(uname -m)" && chmod +x local-ai && ./local-ai
```
Otherwise, here are the links to the binaries:
| OS | Link |
| --- | --- |
| Linux | [Download](https://github.com/mudler/LocalAI/releases/download/{{< version >}}/local-ai-Linux-x86_64) |
| MacOS | [Download](https://github.com/mudler/LocalAI/releases/download/{{< version >}}/local-ai-Darwin-arm64) |
## Try it out
Connect to LocalAI, by default the WebUI should be accessible from http://localhost:8080 . You can also use 3rd party projects to interact with LocalAI as you would use OpenAI (see also [Integrations]({{%relref "docs/integrations" %}}) ).
You can also test out the API endpoints using `curl`, examples below.
### Text Generation
Creates a model response for the given chat conversation. [OpenAI documentation](https://platform.openai.com/docs/api-reference/chat/create).
<details>
```bash
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{ "model": "gpt-4", "messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}] }'
```
</details>
### GPT Vision
Understand images.
<details>
```bash
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-4-vision-preview",
"messages": [
{
"role": "user", "content": [
{"type":"text", "text": "What is in the image?"},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
}
}
],
"temperature": 0.9
}
]
}'
```
</details>
### Function calling
Call functions
<details>
```bash
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-4",
"messages": [
{
"role": "user",
"content": "What is the weather like in Boston?"
}
],
"tools": [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA"
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"]
}
},
"required": ["location"]
}
}
}
],
"tool_choice": "auto"
}'
```
</details>
### Image Generation
Creates an image given a prompt. [OpenAI documentation](https://platform.openai.com/docs/api-reference/images/create).
<details>
```bash
curl http://localhost:8080/v1/images/generations \
-H "Content-Type: application/json" -d '{
"prompt": "A cute baby sea otter",
"size": "256x256"
}'
```
</details>
### Text to speech
Generates audio from the input text. [OpenAI documentation](https://platform.openai.com/docs/api-reference/audio/createSpeech).
<details>
```bash
curl http://localhost:8080/v1/audio/speech \
-H "Content-Type: application/json" \
-d '{
"model": "tts-1",
"input": "The quick brown fox jumped over the lazy dog.",
"voice": "alloy"
}' \
--output speech.mp3
```
</details>
### Audio Transcription
Transcribes audio into the input language. [OpenAI Documentation](https://platform.openai.com/docs/api-reference/audio/createTranscription).
<details>
Download first a sample to transcribe:
```bash
wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
```
Send the example audio file to the transcriptions endpoint :
```bash
curl http://localhost:8080/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F file="@$PWD/gb1.ogg" -F model="whisper-1"
```
</details>
### Embeddings Generation
Get a vector representation of a given input that can be easily consumed by machine learning models and algorithms. [OpenAI Embeddings](https://platform.openai.com/docs/api-reference/embeddings).
<details>
```bash
curl http://localhost:8080/embeddings \
-X POST -H "Content-Type: application/json" \
-d '{
"input": "Your text string goes here",
"model": "text-embedding-ada-002"
}'
```
</details>
{{% alert icon="💡" %}}
Don't use the model file as `model` in the request unless you want to handle the prompt template for yourself.
Use the model names like you would do with OpenAI like in the examples below. For instance `gpt-4-vision-preview`, or `gpt-4`.
{{% /alert %}}
## What's next?
There is much more to explore! run any model from huggingface, video generation, and voice cloning with LocalAI, check out the [features]({{%relref "docs/features" %}}) section for a full overview.
Explore further resources and community contributions:
- [Build LocalAI and the container image]({{%relref "docs/getting-started/build" %}})
- [Run models manually]({{%relref "docs/getting-started/manual" %}})
- [Run other models]({{%relref "docs/getting-started/run-other-models" %}})
- [Container images]({{%relref "docs/getting-started/container-images" %}})
- [All-in-one Images]({{%relref "docs/getting-started/container-images" %}})
- [Installer Options]({{% relref "docs/advanced/installer" %}})
- [Run from Container images]({{% relref "docs/getting-started/container-images" %}})
- [Examples to try from the CLI]({{% relref "docs/getting-started/try-it-out" %}})
- [Build LocalAI and the container image]({{% relref "docs/getting-started/build" %}})
- [Run models manually]({{% relref "docs/getting-started/manual" %}})
- [Examples](https://github.com/mudler/LocalAI/tree/master/examples#examples)

@@ -0,0 +1,196 @@
+++
disableToc = false
title = "Try it out"
weight = 4
url = '/basics/try/'
icon = "rocket_launch"
+++
Once LocalAI is installed, you can start it (either with Docker, the CLI, or the systemd service).
By default, the LocalAI WebUI is accessible at http://localhost:8080. You can also use third-party projects to interact with LocalAI as you would with OpenAI (see also [Integrations]({{%relref "docs/integrations" %}})).
After installation, you can add new models by browsing the model gallery or by using the `local-ai` CLI.
{{% alert icon="🚀" %}}
To install models with the WebUI, see the [Models section]({{%relref "docs/features/model-gallery" %}}).
With the CLI you can list the models with `local-ai models list` and install them with `local-ai models install <model-name>` (see the sketch after this note).
You can also [run models manually]({{%relref "docs/getting-started/manual" %}}) by copying files into the `models` directory.
{{% /alert %}}
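A minimal CLI session might look like the following sketch (the model name is illustrative, pick one from the list output):
```bash
# List the models available in the gallery
local-ai models list
# Install one of them (the name here is illustrative)
local-ai models install llava
```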
You can test out the API endpoints using `curl`; a few examples are listed below. The models referred to here (`gpt-4`, `gpt-4-vision-preview`, `tts-1`, `whisper-1`) are the default models that come with the AIO images, but you can also use any other model you have installed.
### Text Generation
Creates a model response for the given chat conversation. [OpenAI documentation](https://platform.openai.com/docs/api-reference/chat/create).
<details>
```bash
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{ "model": "gpt-4", "messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}] }'
```
</details>
### GPT Vision
Understand images.
<details>
```bash
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-4-vision-preview",
"messages": [
{
"role": "user", "content": [
{"type":"text", "text": "What is in the image?"},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
}
}
],
"temperature": 0.9
}
]
}'
```
</details>
### Function calling
Call functions
<details>
```bash
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-4",
"messages": [
{
"role": "user",
"content": "What is the weather like in Boston?"
}
],
"tools": [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA"
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"]
}
},
"required": ["location"]
}
}
}
],
"tool_choice": "auto"
}'
```
</details>
### Image Generation
Creates an image given a prompt. [OpenAI documentation](https://platform.openai.com/docs/api-reference/images/create).
<details>
```bash
curl http://localhost:8080/v1/images/generations \
-H "Content-Type: application/json" -d '{
"prompt": "A cute baby sea otter",
"size": "256x256"
}'
```
</details>
### Text to speech
Generates audio from the input text. [OpenAI documentation](https://platform.openai.com/docs/api-reference/audio/createSpeech).
<details>
```bash
curl http://localhost:8080/v1/audio/speech \
-H "Content-Type: application/json" \
-d '{
"model": "tts-1",
"input": "The quick brown fox jumped over the lazy dog.",
"voice": "alloy"
}' \
--output speech.mp3
```
</details>
### Audio Transcription
Transcribes audio into the input language. [OpenAI Documentation](https://platform.openai.com/docs/api-reference/audio/createTranscription).
<details>
First, download a sample to transcribe:
```bash
wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
```
Send the example audio file to the transcriptions endpoint:
```bash
curl http://localhost:8080/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F file="@$PWD/gb1.ogg" -F model="whisper-1"
```
</details>
### Embeddings Generation
Get a vector representation of a given input that can be easily consumed by machine learning models and algorithms. [OpenAI Embeddings](https://platform.openai.com/docs/api-reference/embeddings).
<details>
```bash
curl http://localhost:8080/embeddings \
-X POST -H "Content-Type: application/json" \
-d '{
"input": "Your text string goes here",
"model": "text-embedding-ada-002"
}'
```
</details>
{{% alert icon="💡" %}}
Don't use the model file name as `model` in the request unless you want to handle the prompt template yourself.
Use model names as you would with OpenAI, as in the examples above. For instance, `gpt-4-vision-preview` or `gpt-4`.
{{% /alert %}}

@@ -72,6 +72,12 @@ docker run -p 8080:8080 --name local-ai -ti localai/localai:latest-aio-cpu
# docker run -p 8080:8080 --gpus all --name local-ai -ti localai/localai:latest-aio-gpu-nvidia-cuda-12
```
Or just use the bash installer:
```bash
curl https://localai.io/install.sh | sh
```
See the [💻 Quickstart](https://localai.io/basics/getting_started/) for all the options and ways you can run LocalAI!
## What is LocalAI?

@@ -0,0 +1,22 @@
+++
disableToc = false
title = "LocalAI binaries"
weight = 26
+++
LocalAI binaries are available for both Linux and MacOS platforms and can be executed directly from your command line. These binaries are continuously updated and hosted on [our GitHub Releases page](https://github.com/mudler/LocalAI/releases). This method also supports Windows users via the Windows Subsystem for Linux (WSL).
Use the following one-liner command in your terminal to download and run LocalAI on Linux or MacOS:
```bash
curl -Lo local-ai "https://github.com/mudler/LocalAI/releases/download/{{< version >}}/local-ai-$(uname -s)-$(uname -m)" && chmod +x local-ai && ./local-ai
```
Otherwise, here are the links to the binaries:
| OS | Link |
| --- | --- |
| Linux (amd64) | [Download](https://github.com/mudler/LocalAI/releases/download/{{< version >}}/local-ai-Linux-x86_64) |
| Linux (arm64) | [Download](https://github.com/mudler/LocalAI/releases/download/{{< version >}}/local-ai-Linux-arm64) |
| MacOS (arm64) | [Download](https://github.com/mudler/LocalAI/releases/download/{{< version >}}/local-ai-Darwin-arm64) |
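After downloading, the binary only needs to be made executable and pointed at a models directory; a sketch for Linux x86_64 (flag values are illustrative):
```bash
curl -Lo local-ai "https://github.com/mudler/LocalAI/releases/download/{{< version >}}/local-ai-Linux-x86_64"
chmod +x local-ai
./local-ai --models-path ./models --context-size 700 --threads 4
```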

@@ -0,0 +1,69 @@
apiVersion: v1
kind: Namespace
metadata:
name: local-ai
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: models-pvc
namespace: local-ai
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 50Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: local-ai
namespace: local-ai
labels:
app: local-ai
spec:
selector:
matchLabels:
app: local-ai
replicas: 1
template:
metadata:
labels:
app: local-ai
name: local-ai
spec:
runtimeClassName: "nvidia"
containers:
- args:
- phi-2
env:
- name: DEBUG
value: "true"
name: local-ai
image: quay.io/go-skynet/local-ai:master-cublas-cuda12
imagePullPolicy: IfNotPresent
resources:
limits:
nvidia.com/gpu: 1
volumeMounts:
- name: models-volume
mountPath: /build/models
volumes:
- name: models-volume
persistentVolumeClaim:
claimName: models-pvc
---
apiVersion: v1
kind: Service
metadata:
name: local-ai
namespace: local-ai
spec:
selector:
app: local-ai
type: NodePort
ports:
- protocol: TCP
targetPort: 8080
port: 8080
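Assuming the manifest above is saved as `deployment-nvidia.yaml`, a minimal deployment workflow could be:
```bash
kubectl apply -f deployment-nvidia.yaml
kubectl -n local-ai get pods
# Reach the API from your workstation while testing
kubectl -n local-ai port-forward svc/local-ai 8080:8080
curl http://localhost:8080/v1/models
```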