Merge branch 'master' into ci/public-runner

Ettore Di Giacinto, 2025-02-08 11:00:45 +01:00, committed by GitHub
commit d6ea1a67cf
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
89 changed files with 1245 additions and 1548 deletions

View File

@@ -14,7 +14,7 @@ jobs:
 steps:
 - name: Dependabot metadata
 id: metadata
-uses: dependabot/fetch-metadata@v2.2.0
+uses: dependabot/fetch-metadata@v2.3.0
 with:
 github-token: "${{ secrets.GITHUB_TOKEN }}"
 skip-commit-verification: true

View File

@@ -18,7 +18,7 @@ jobs:
 with:
 model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
 # Check the PR diff using the current branch and the base branch of the PR
-- uses: GrantBirki/git-diff-action@v2.7.0
+- uses: GrantBirki/git-diff-action@v2.8.0
 id: git-diff-action
 with:
 json_diff_file_output: diff.json
@@ -99,7 +99,7 @@ jobs:
 docker run -e -ti -d --name local-ai -p 8080:8080 localai/localai:master-ffmpeg-core run --debug $MODEL_NAME
 until [ "`docker inspect -f {{.State.Health.Status}} local-ai`" == "healthy" ]; do echo "Waiting for container to be ready"; docker logs --tail 10 local-ai; sleep 2; done
 # Check the PR diff using the current branch and the base branch of the PR
-- uses: GrantBirki/git-diff-action@v2.7.0
+- uses: GrantBirki/git-diff-action@v2.8.0
 id: git-diff-action
 with:
 json_diff_file_output: diff.json

View File

@@ -303,7 +303,7 @@ RUN make prepare
 ## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
 ## (both will use CUDA or hipblas for the actual computation)
 RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
-SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
+SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
 else \
 make build; \
 fi

View File

@@ -6,9 +6,7 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true
 # llama.cpp versions
-GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
-GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=6152129d05870cb38162c422c6ba80434e021e9f
+CPPLLAMA_VERSION?=d2fe216fb2fb7ca8627618c9ea3a2e7886325780
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
@@ -24,7 +22,7 @@ BARKCPP_VERSION?=v1.0.0
 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=5eb15ef4d022bef4a391de4f5f6556e81fbb5024
+STABLEDIFFUSION_GGML_VERSION?=d46ed5e184b97c2018dc2e8105925bdb8775e02c
 ONNX_VERSION?=1.20.0
 ONNX_ARCH?=x64
@@ -151,7 +149,6 @@ ifeq ($(BUILD_TYPE),hipblas)
 LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
 export CXX=$(ROCM_HOME)/llvm/bin/clang++
 export CC=$(ROCM_HOME)/llvm/bin/clang
-# llama-ggml has no hipblas support, so override it here.
 export STABLE_BUILD_TYPE=
 export GGML_HIP=1
 GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
@@ -186,8 +183,8 @@ endif
 ALL_GRPC_BACKENDS=backend-assets/grpc/huggingface
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
+ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
-ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
 ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
 ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
@@ -221,19 +218,6 @@ endif
 all: help
-## go-llama.cpp
-sources/go-llama.cpp:
-mkdir -p sources/go-llama.cpp
-cd sources/go-llama.cpp && \
-git init && \
-git remote add origin $(GOLLAMA_REPO) && \
-git fetch origin && \
-git checkout $(GOLLAMA_VERSION) && \
-git submodule update --init --recursive --depth 1 --single-branch
-sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
-$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
 ## bark.cpp
 sources/bark.cpp:
 git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
@@ -309,19 +293,17 @@ sources/whisper.cpp:
 sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
 cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
-get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
+get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
 replace:
 $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
 $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
 $(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
-$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
 dropreplace:
 $(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
 $(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
 $(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
-$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
 prepare-sources: get-sources replace
 $(GOCMD) mod download
@@ -329,7 +311,6 @@ prepare-sources: get-sources replace
 ## GENERIC
 rebuild: ## Rebuilds the project
 $(GOCMD) clean -cache
-$(MAKE) -C sources/go-llama.cpp clean
 $(MAKE) -C sources/whisper.cpp clean
 $(MAKE) -C sources/go-piper clean
 $(MAKE) build
@@ -433,7 +414,7 @@ run: prepare ## run local-ai
 test-models/testmodel.ggml:
 mkdir test-models
 mkdir test-dir
-wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
+wget -q https://huggingface.co/RichardErkhov/Qwen_-_Qwen2-1.5B-Instruct-gguf/resolve/main/Qwen2-1.5B-Instruct.Q2_K.gguf -O test-models/testmodel.ggml
 wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
 wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
 wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
@@ -448,8 +429,7 @@ test: prepare test-models/testmodel.ggml grpcs
 export GO_TAGS="tts debug"
 $(MAKE) prepare-test
 HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
+$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
-$(MAKE) test-llama
 $(MAKE) test-llama-gguf
 $(MAKE) test-tts
 $(MAKE) test-stablediffusion
@@ -478,10 +458,6 @@ teardown-e2e:
 rm -rf $(TEST_DIR) || true
 docker stop $$(docker ps -q --filter ancestor=localai-tests)
-test-llama: prepare-test
-TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
 test-llama-gguf: prepare-test
 TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
@@ -699,6 +675,13 @@ backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc backend/cpp/llama/llama.cpp
 CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
 cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2
+backend-assets/grpc/llama-cpp-avx512: backend-assets/grpc backend/cpp/llama/llama.cpp
+cp -rf backend/cpp/llama backend/cpp/llama-avx512
+$(MAKE) -C backend/cpp/llama-avx512 purge
+$(info ${GREEN}I llama-cpp build info:avx512${RESET})
+CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx512" build-llama-cpp-grpc-server
+cp -rfv backend/cpp/llama-avx512/grpc-server backend-assets/grpc/llama-cpp-avx512
 backend-assets/grpc/llama-cpp-avx: backend-assets/grpc backend/cpp/llama/llama.cpp
 cp -rf backend/cpp/llama backend/cpp/llama-avx
 $(MAKE) -C backend/cpp/llama-avx purge
@@ -752,13 +735,6 @@ backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
 mkdir -p backend-assets/util/
 cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
-backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
-CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
-$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
-ifneq ($(UPX),)
-$(UPX) backend-assets/grpc/llama-ggml
-endif
 backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc
 CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \
 $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/
@@ -853,7 +829,7 @@ swagger:
 .PHONY: gen-assets
 gen-assets:
-$(GOCMD) run core/dependencies_manager/manager.go embedded/webui_static.yaml core/http/static/assets
+$(GOCMD) run core/dependencies_manager/manager.go webui_static.yaml core/http/static/assets
 ## Documentation
 docs/layouts/_default:

View File

@@ -163,6 +163,11 @@ message Reply {
 double timing_token_generation = 5;
 }
+message GrammarTrigger {
+string word = 1;
+bool at_start = 2;
+}
 message ModelOptions {
 string Model = 1;
 int32 ContextSize = 2;
@@ -247,6 +252,8 @@ message ModelOptions {
 string CacheTypeKey = 63;
 string CacheTypeValue = 64;
+repeated GrammarTrigger GrammarTriggers = 65;
 }
 message Result {
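
The new `GrammarTrigger` message and the `GrammarTriggers` field on `ModelOptions` carry lazy-grammar trigger words to the backend. A minimal Go sketch (not part of the diff) of how the generated bindings might be populated, mirroring the mapping used later in this commit in `grpcModelOpts`; the trigger word is a made-up example:

```go
package main

import (
	"fmt"

	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)

func main() {
	// "<tool_call>" is a hypothetical trigger word, used only for illustration.
	opts := &pb.ModelOptions{
		GrammarTriggers: []*pb.GrammarTrigger{
			{Word: "<tool_call>", AtStart: true},
		},
	}
	fmt.Println(len(opts.GrammarTriggers)) // 1
}
```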

View File

@@ -468,6 +468,9 @@ struct llama_server_context
 bool add_bos_token = true;
 bool has_eos_token = true;
+bool grammar_lazy = false;
+std::vector<common_grammar_trigger> grammar_trigger_words;
 int32_t n_ctx; // total context for all clients / slots
 // system prompt
@@ -706,6 +709,8 @@ struct llama_server_context
 slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
 slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
 slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
+slot->sparams.grammar_trigger_words = grammar_trigger_words;
+slot->sparams.grammar_lazy = grammar_lazy;
 if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
 // Might be better to reject the request with a 400 ?
@@ -2374,6 +2379,21 @@ static void params_parse(const backend::ModelOptions* request,
 if ( request->ropefreqscale() != 0.0f ) {
 params.rope_freq_scale = request->ropefreqscale();
 }
+if (request->grammartriggers_size() > 0) {
+LOG_INFO("configuring grammar triggers", {});
+llama.grammar_lazy = true;
+for (int i = 0; i < request->grammartriggers_size(); i++) {
+common_grammar_trigger trigger;
+trigger.word = request->grammartriggers(i).word();
+trigger.at_start = request->grammartriggers(i).at_start();
+llama.grammar_trigger_words.push_back(trigger);
+LOG_INFO("grammar trigger", {
+{ "word", trigger.word },
+{ "at_start", trigger.at_start }
+});
+}
+}
 }
@@ -2522,6 +2542,18 @@ public:
 return grpc::Status::OK;
 }
+grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response){
+json data = parse_options(false, request, llama);
+std::vector<llama_token> tokens = llama.tokenize(data["prompt"],false);
+for (int i=0 ; i< tokens.size(); i++){
+response->add_tokens(tokens[i]);
+}
+return grpc::Status::OK;
+}
 grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
 llama_client_slot* active_slot = llama.get_active_slot();

View File

@@ -1,204 +0,0 @@
package main
// This is a wrapper to statisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"fmt"
"github.com/go-skynet/go-llama.cpp"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
type LLM struct {
base.SingleThread
llama *llama.LLama
}
func (llm *LLM) Load(opts *pb.ModelOptions) error {
ropeFreqBase := float32(10000)
ropeFreqScale := float32(1)
if opts.RopeFreqBase != 0 {
ropeFreqBase = opts.RopeFreqBase
}
if opts.RopeFreqScale != 0 {
ropeFreqScale = opts.RopeFreqScale
}
llamaOpts := []llama.ModelOption{
llama.WithRopeFreqBase(ropeFreqBase),
llama.WithRopeFreqScale(ropeFreqScale),
}
if opts.NGQA != 0 {
llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
}
if opts.RMSNormEps != 0 {
llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
}
if opts.ContextSize != 0 {
llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
}
if opts.F16Memory {
llamaOpts = append(llamaOpts, llama.EnableF16Memory)
}
if opts.Embeddings {
llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
}
if opts.NGPULayers != 0 {
llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
}
llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
if opts.NBatch != 0 {
llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
} else {
llamaOpts = append(llamaOpts, llama.SetNBatch(512))
}
if opts.NUMA {
llamaOpts = append(llamaOpts, llama.EnableNUMA)
}
if opts.LowVRAM {
llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
}
model, err := llama.New(opts.ModelFile, llamaOpts...)
llm.llama = model
return err
}
func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
ropeFreqBase := float32(10000)
ropeFreqScale := float32(1)
if opts.RopeFreqBase != 0 {
ropeFreqBase = opts.RopeFreqBase
}
if opts.RopeFreqScale != 0 {
ropeFreqScale = opts.RopeFreqScale
}
predictOptions := []llama.PredictOption{
llama.SetTemperature(opts.Temperature),
llama.SetTopP(opts.TopP),
llama.SetTopK(int(opts.TopK)),
llama.SetTokens(int(opts.Tokens)),
llama.SetThreads(int(opts.Threads)),
llama.WithGrammar(opts.Grammar),
llama.SetRopeFreqBase(ropeFreqBase),
llama.SetRopeFreqScale(ropeFreqScale),
llama.SetNegativePromptScale(opts.NegativePromptScale),
llama.SetNegativePrompt(opts.NegativePrompt),
}
if opts.PromptCacheAll {
predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
}
if opts.PromptCacheRO {
predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
}
// Expected absolute path
if opts.PromptCachePath != "" {
predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
}
if opts.Mirostat != 0 {
predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
}
if opts.MirostatETA != 0 {
predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
}
if opts.MirostatTAU != 0 {
predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
}
if opts.Debug {
predictOptions = append(predictOptions, llama.Debug)
}
predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
if opts.PresencePenalty != 0 {
predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
}
if opts.NKeep != 0 {
predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
}
if opts.Batch != 0 {
predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
}
if opts.F16KV {
predictOptions = append(predictOptions, llama.EnableF16KV)
}
if opts.IgnoreEOS {
predictOptions = append(predictOptions, llama.IgnoreEOS)
}
if opts.Seed != 0 {
predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
}
//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
return predictOptions
}
func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
}
func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
predictOptions := buildPredictOptions(opts)
predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
results <- token
return true
}))
go func() {
_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
if err != nil {
fmt.Println("err: ", err)
}
close(results)
}()
return nil
}
func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
predictOptions := buildPredictOptions(opts)
if len(opts.EmbeddingTokens) > 0 {
tokens := []int{}
for _, t := range opts.EmbeddingTokens {
tokens = append(tokens, int(t))
}
return llm.llama.TokenEmbeddings(tokens, predictOptions...)
}
return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
}

View File

@@ -1,19 +0,0 @@
package main
import (
"flag"
grpc "github.com/mudler/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &LLM{}); err != nil {
panic(err)
}
}

View File

@@ -1,6 +1,6 @@
 accelerate
 auto-gptq==0.7.1
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 certifi
 transformers

View File

@@ -1,4 +1,4 @@
 bark==0.1.5
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 certifi

View File

@@ -1,3 +1,3 @@
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 grpcio-tools

View File

@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 certifi
 packaging==24.1

View File

@@ -1,5 +1,5 @@
 setuptools
-grpcio==1.69.0
+grpcio==1.70.0
 pillow
 protobuf
 certifi

View File

@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 certifi
 wheel

View File

@@ -1,3 +1,3 @@
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 grpcio-tools

View File

@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 phonemizer
 scipy

View File

@@ -1,3 +1,3 @@
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 certifi

View File

@@ -5,4 +5,4 @@ accelerate
 transformers
 bitsandbytes
 outetts
-sentence-transformers==3.3.1
+sentence-transformers==3.4.1

View File

@@ -6,4 +6,4 @@ accelerate
 transformers
 bitsandbytes
 outetts
-sentence-transformers==3.3.1
+sentence-transformers==3.4.1

View File

@@ -5,4 +5,4 @@ numba==0.60.0
 transformers
 bitsandbytes
 outetts
-sentence-transformers==3.3.1
+sentence-transformers==3.4.1

View File

@@ -7,4 +7,4 @@ numba==0.60.0
 bitsandbytes
 outetts
 bitsandbytes
-sentence-transformers==3.3.1
+sentence-transformers==3.4.1

View File

@@ -8,4 +8,4 @@ numba==0.60.0
 intel-extension-for-transformers
 bitsandbytes
 outetts
-sentence-transformers==3.3.1
+sentence-transformers==3.4.1

View File

@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 certifi
 setuptools

View File

@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.70.0
 protobuf
 certifi
 setuptools

View File

@@ -62,7 +62,7 @@ func New(opts ...config.AppOption) (*Application, error) {
 }
 }
-if err := pkgStartup.InstallModels(options.Galleries, options.ModelLibraryURL, options.ModelPath, options.EnforcePredownloadScans, nil, options.ModelsURL...); err != nil {
+if err := pkgStartup.InstallModels(options.Galleries, options.ModelPath, options.EnforcePredownloadScans, nil, options.ModelsURL...); err != nil {
 log.Error().Err(err).Msg("error installing models")
 }

View File

@@ -118,9 +118,19 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 nGPULayers = *c.NGPULayers
 }
+triggers := make([]*pb.GrammarTrigger, 0)
+for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers {
+triggers = append(triggers, &pb.GrammarTrigger{
+Word: t.Word,
+AtStart: t.AtStart,
+})
+}
 return &pb.ModelOptions{
 CUDA: c.CUDA || c.Diffusers.CUDA,
 SchedulerType: c.Diffusers.SchedulerType,
+GrammarTriggers: triggers,
 PipelineType: c.Diffusers.PipelineType,
 CFGScale: c.CFGScale,
 LoraAdapter: c.LoraAdapter,

View File

@@ -16,12 +16,7 @@ func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.Bac
 opts := ModelOptions(backendConfig, appConfig, model.WithModel(modelFile))
-if backendConfig.Backend == "" {
-inferenceModel, err = loader.Load(opts...)
-} else {
-opts = append(opts, model.WithBackendString(backendConfig.Backend))
-inferenceModel, err = loader.Load(opts...)
-}
+inferenceModel, err = loader.Load(opts...)
 if err != nil {
 return schema.TokenizeResponse{}, err
 }
@@ -35,6 +30,10 @@ func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.Bac
 return schema.TokenizeResponse{}, err
 }
+if resp.Tokens == nil {
+resp.Tokens = make([]int32, 0)
+}
 return schema.TokenizeResponse{
 Tokens: resp.Tokens,
 }, nil
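
One note on the nil guard added above: in Go, encoding/json marshals a nil slice as `null` but an empty slice as `[]`, so pre-allocating `resp.Tokens` keeps the tokenize response a stable JSON array. A small illustrative sketch (the struct and its `tokens` tag are stand-ins, not the actual schema types):

```go
package main

import (
	"encoding/json"
	"fmt"
)

type tokenizeResponse struct {
	Tokens []int32 `json:"tokens"` // stand-in for schema.TokenizeResponse
}

func main() {
	var nilTokens []int32
	a, _ := json.Marshal(tokenizeResponse{Tokens: nilTokens})
	b, _ := json.Marshal(tokenizeResponse{Tokens: []int32{}})
	fmt.Println(string(a)) // {"tokens":null}
	fmt.Println(string(b)) // {"tokens":[]}
}
```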

View File

@@ -100,7 +100,7 @@ func (mi *ModelsInstall) Run(ctx *cliContext.Context) error {
 log.Info().Str("model", modelName).Str("license", model.License).Msg("installing model")
 }
-err = startup.InstallModels(galleries, "", mi.ModelsPath, !mi.DisablePredownloadScan, progressCallback, modelName)
+err = startup.InstallModels(galleries, mi.ModelsPath, !mi.DisablePredownloadScan, progressCallback, modelName)
 if err != nil {
 return err
 }

View File

@@ -32,7 +32,6 @@ type RunCMD struct {
 Galleries string `env:"LOCALAI_GALLERIES,GALLERIES" help:"JSON list of galleries" group:"models" default:"${galleries}"`
 AutoloadGalleries bool `env:"LOCALAI_AUTOLOAD_GALLERIES,AUTOLOAD_GALLERIES" group:"models"`
-RemoteLibrary string `env:"LOCALAI_REMOTE_LIBRARY,REMOTE_LIBRARY" default:"${remoteLibraryURL}" help:"A LocalAI remote library URL" group:"models"`
 PreloadModels string `env:"LOCALAI_PRELOAD_MODELS,PRELOAD_MODELS" help:"A List of models to apply in JSON at start" group:"models"`
 Models []string `env:"LOCALAI_MODELS,MODELS" help:"A List of model configuration URLs to load" group:"models"`
 PreloadModelsConfig string `env:"LOCALAI_PRELOAD_MODELS_CONFIG,PRELOAD_MODELS_CONFIG" help:"A List of models to apply at startup. Path to a YAML config file" group:"models"`
@@ -90,7 +89,6 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 config.WithDynamicConfigDirPollInterval(r.LocalaiConfigDirPollInterval),
 config.WithF16(r.F16),
 config.WithStringGalleries(r.Galleries),
-config.WithModelLibraryURL(r.RemoteLibrary),
 config.WithCors(r.CORS),
 config.WithCorsAllowOrigins(r.CORSAllowOrigins),
 config.WithCsrf(r.CSRF),

View File

@@ -44,8 +44,6 @@ type ApplicationConfig struct {
 DisableGalleryEndpoint bool
 LoadToMemory []string
-ModelLibraryURL string
 Galleries []Gallery
 BackendAssets embed.FS
@@ -126,12 +124,6 @@ func WithP2PToken(s string) AppOption {
 }
 }
-func WithModelLibraryURL(url string) AppOption {
-return func(o *ApplicationConfig) {
-o.ModelLibraryURL = url
-}
-}
 func WithLibPath(path string) AppOption {
 return func(o *ApplicationConfig) {
 o.LibPath = path

View File

@@ -287,7 +287,8 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
 defaultTopP := 0.95
 defaultTopK := 40
 defaultTemp := 0.9
-defaultMirostat := 2
+// https://github.com/mudler/LocalAI/issues/2780
+defaultMirostat := 0
 defaultMirostatTAU := 5.0
 defaultMirostatETA := 0.1
 defaultTypicalP := 1.0

View File

@@ -48,9 +48,9 @@ parameters:
 Expect(config.Name).To(Equal("bar-baz"))
 Expect(config.Validate()).To(BeTrue())
-// download https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml
+// download https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml
 httpClient := http.Client{}
-resp, err := httpClient.Get("https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml")
+resp, err := httpClient.Get("https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml")
 Expect(err).To(BeNil())
 defer resp.Body.Close()
 tmp, err = os.CreateTemp("", "config.yaml")

View File

@@ -48,8 +48,10 @@ var _ = Describe("Model test", func() {
 defer os.RemoveAll(tempdir)
 gallery := []GalleryModel{{
-Name: "bert",
-URL: bertEmbeddingsURL,
+Metadata: Metadata{
+Name: "bert",
+URL: bertEmbeddingsURL,
+},
 }}
 out, err := yaml.Marshal(gallery)
 Expect(err).ToNot(HaveOccurred())

View File

@@ -11,6 +11,14 @@ import (
 // It is used to install the model by resolving the URL and downloading the files.
 // The other fields are used to override the configuration of the model.
 type GalleryModel struct {
+Metadata `json:",inline" yaml:",inline"`
+// config_file is read in the situation where URL is blank - and therefore this is a base config.
+ConfigFile map[string]interface{} `json:"config_file,omitempty" yaml:"config_file,omitempty"`
+// Overrides are used to override the configuration of the model located at URL
+Overrides map[string]interface{} `json:"overrides,omitempty" yaml:"overrides,omitempty"`
+}
+
+type Metadata struct {
 URL string `json:"url,omitempty" yaml:"url,omitempty"`
 Name string `json:"name,omitempty" yaml:"name,omitempty"`
 Description string `json:"description,omitempty" yaml:"description,omitempty"`
@@ -18,10 +26,6 @@ type GalleryModel struct {
 URLs []string `json:"urls,omitempty" yaml:"urls,omitempty"`
 Icon string `json:"icon,omitempty" yaml:"icon,omitempty"`
 Tags []string `json:"tags,omitempty" yaml:"tags,omitempty"`
-// config_file is read in the situation where URL is blank - and therefore this is a base config.
-ConfigFile map[string]interface{} `json:"config_file,omitempty" yaml:"config_file,omitempty"`
-// Overrides are used to override the configuration of the model located at URL
-Overrides map[string]interface{} `json:"overrides,omitempty" yaml:"overrides,omitempty"`
 // AdditionalFiles are used to add additional files to the model
 AdditionalFiles []File `json:"files,omitempty" yaml:"files,omitempty"`
 // Gallery is a reference to the gallery which contains the model

View File

@@ -9,7 +9,11 @@ import (
 var _ = Describe("Gallery API tests", func() {
 Context("requests", func() {
 It("parses github with a branch", func() {
-req := GalleryModel{URL: "github:go-skynet/model-gallery/gpt4all-j.yaml@main"}
+req := GalleryModel{
+Metadata: Metadata{
+URL: "github:go-skynet/model-gallery/gpt4all-j.yaml@main",
+},
+}
 e, err := GetGalleryConfigFromURL(req.URL, "")
 Expect(err).ToNot(HaveOccurred())
 Expect(e.Name).To(Equal("gpt4all-j"))

View File

@@ -299,14 +299,18 @@ var _ = Describe("API test", func() {
 g := []gallery.GalleryModel{
 {
-Name: "bert",
-URL: bertEmbeddingsURL,
+Metadata: gallery.Metadata{
+Name: "bert",
+URL: bertEmbeddingsURL,
+},
 },
 {
-Name: "bert2",
-URL: bertEmbeddingsURL,
-Overrides: map[string]interface{}{"foo": "bar"},
-AdditionalFiles: []gallery.File{{Filename: "foo.yaml", URI: bertEmbeddingsURL}},
+Metadata: gallery.Metadata{
+Name: "bert2",
+URL: bertEmbeddingsURL,
+AdditionalFiles: []gallery.File{{Filename: "foo.yaml", URI: bertEmbeddingsURL}},
+},
+Overrides: map[string]interface{}{"foo": "bar"},
 },
 }
 out, err := yaml.Marshal(g)
@@ -476,7 +480,7 @@ var _ = Describe("API test", func() {
 })
 It("apply models from config", func() {
 response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
-ConfigURL: "https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml",
+ConfigURL: "https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml",
 })
 Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
@@ -522,77 +526,6 @@ var _ = Describe("API test", func() {
 Expect(content["usage"]).To(ContainSubstring("You can test this model with curl like this"))
 })
It("runs openllama(llama-ggml backend)", Label("llama"), func() {
if runtime.GOOS != "linux" {
Skip("test supported only on linux")
}
response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
URL: "github:go-skynet/model-gallery/openllama_3b.yaml",
Name: "openllama_3b",
Overrides: map[string]interface{}{"backend": "llama-ggml", "mmap": true, "f16": true, "context_size": 128},
})
Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
uuid := response["uuid"].(string)
Eventually(func() bool {
response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
return response["processed"].(bool)
}, "360s", "10s").Should(Equal(true))
By("testing completion")
resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "openllama_3b", Prompt: "Count up to five: one, two, three, four, "})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp.Choices)).To(Equal(1))
Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
By("testing functions")
resp2, err := client.CreateChatCompletion(
context.TODO(),
openai.ChatCompletionRequest{
Model: "openllama_3b",
Messages: []openai.ChatCompletionMessage{
{
Role: "user",
Content: "What is the weather like in San Francisco (celsius)?",
},
},
Functions: []openai.FunctionDefinition{
openai.FunctionDefinition{
Name: "get_current_weather",
Description: "Get the current weather",
Parameters: jsonschema.Definition{
Type: jsonschema.Object,
Properties: map[string]jsonschema.Definition{
"location": {
Type: jsonschema.String,
Description: "The city and state, e.g. San Francisco, CA",
},
"unit": {
Type: jsonschema.String,
Enum: []string{"celcius", "fahrenheit"},
},
},
Required: []string{"location"},
},
},
},
})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp2.Choices)).To(Equal(1))
Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
var res map[string]string
err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
Expect(err).ToNot(HaveOccurred())
Expect(res["location"]).To(ContainSubstring("San Francisco"), fmt.Sprint(res))
Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
})
It("runs openllama gguf(llama-cpp)", Label("llama-gguf"), func() { It("runs openllama gguf(llama-cpp)", Label("llama-gguf"), func() {
if runtime.GOOS != "linux" { if runtime.GOOS != "linux" {
Skip("test supported only on linux") Skip("test supported only on linux")
@ -600,7 +533,7 @@ var _ = Describe("API test", func() {
modelName := "hermes-2-pro-mistral" modelName := "hermes-2-pro-mistral"
response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{ response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
ConfigURL: "https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml", ConfigURL: "https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml",
}) })
Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response)) Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))

View File

@@ -117,19 +117,25 @@ func (mgs *ModelGalleryEndpointService) DeleteModelGalleryEndpoint() func(c *fib
 // @Router /models/available [get]
 func (mgs *ModelGalleryEndpointService) ListModelFromGalleryEndpoint() func(c *fiber.Ctx) error {
 return func(c *fiber.Ctx) error {
-log.Debug().Msgf("Listing models from galleries: %+v", mgs.galleries)
 models, err := gallery.AvailableGalleryModels(mgs.galleries, mgs.modelPath)
 if err != nil {
 return err
 }
-log.Debug().Msgf("Models found from galleries: %+v", models)
-for _, m := range models {
-log.Debug().Msgf("Model found from galleries: %+v", m)
-}
-dat, err := json.Marshal(models)
+log.Debug().Msgf("Available %d models from %d galleries\n", len(models), len(mgs.galleries))
+m := []gallery.Metadata{}
+for _, mm := range models {
+m = append(m, mm.Metadata)
+}
+log.Debug().Msgf("Models %#v", m)
+dat, err := json.Marshal(m)
 if err != nil {
-return err
+return fmt.Errorf("could not marshal models: %w", err)
 }
 return c.Send(dat)
 }

View File

@@ -12,6 +12,7 @@ import (
 // TokenizeEndpoint exposes a REST API to tokenize the content
 // @Summary Tokenize the input.
+// @Param request body schema.TokenizeRequest true "Request"
 // @Success 200 {object} schema.TokenizeResponse "Response"
 // @Router /v1/tokenize [post]
 func TokenizeEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
@@ -51,8 +52,6 @@ func TokenizeEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, app
 return err
 }
-c.JSON(tokenResponse)
-return nil
+return c.JSON(tokenResponse)
 }
 }
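
For reference, a hedged sketch of exercising the documented `/v1/tokenize` route from Go; the JSON field names (`model`, `content`, `tokens`) are assumptions about `schema.TokenizeRequest`/`schema.TokenizeResponse` rather than values taken from this diff:

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// Assumed request shape: {"model": "...", "content": "..."}.
	body, _ := json.Marshal(map[string]string{
		"model":   "hermes-2-pro-mistral",
		"content": "Hello, LocalAI!",
	})
	resp, err := http.Post("http://localhost:8080/v1/tokenize", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Assumed response shape: {"tokens": [ ... ]}.
	var out struct {
		Tokens []int32 `json:"tokens"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	fmt.Println(out.Tokens)
}
```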

View File

@@ -401,6 +401,11 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 log.Debug().Msgf("Text content to return: %s", textContentToReturn)
 noActionsToRun := len(results) > 0 && results[0].Name == noActionName || len(results) == 0
+finishReason := "stop"
+if len(input.Tools) > 0 {
+finishReason = "tool_calls"
+}
 switch {
 case noActionsToRun:
 result, err := handleQuestion(config, input, ml, startupOptions, results, s, predInput)
@@ -408,19 +413,18 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 log.Error().Err(err).Msg("error handling question")
 return
 }
 *c = append(*c, schema.Choice{
-Message: &schema.Message{Role: "assistant", Content: &result}})
+FinishReason: finishReason,
+Message: &schema.Message{Role: "assistant", Content: &result}})
 default:
 toolChoice := schema.Choice{
+FinishReason: finishReason,
 Message: &schema.Message{
 Role: "assistant",
 },
 }
-if len(input.Tools) > 0 {
-toolChoice.FinishReason = "tool_calls"
-}
 for _, ss := range results {
 name, args := ss.Name, ss.Arguments
 if len(input.Tools) > 0 {
@@ -438,7 +442,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 },
 )
 } else {
-// otherwise we return more choices directly
+// otherwise we return more choices directly (deprecated)
 *c = append(*c, schema.Choice{
 FinishReason: "function_call",
 Message: &schema.Message{

View File

@@ -129,7 +129,7 @@ func (g *GalleryService) Start(c context.Context, cl *config.BackendConfigLoader
 if op.GalleryModelName != "" {
 err = gallery.InstallModelFromGallery(op.Galleries, op.GalleryModelName, g.appConfig.ModelPath, op.Req, progressCallback, g.appConfig.EnforcePredownloadScans)
 } else if op.ConfigURL != "" {
-err = startup.InstallModels(op.Galleries, op.ConfigURL, g.appConfig.ModelPath, g.appConfig.EnforcePredownloadScans, progressCallback, op.ConfigURL)
+err = startup.InstallModels(op.Galleries, g.appConfig.ModelPath, g.appConfig.EnforcePredownloadScans, progressCallback, op.ConfigURL)
 if err != nil {
 updateError(err)
 continue

View File

@@ -148,6 +148,9 @@ function:
 no_action_function_name: "" # Function name to call when no action is determined.
 no_action_description_name: "" # Description name for no-action functions.
 response_regex: [] # Regular expressions to match response from
+argument_regex: [] # Named regular to extract function arguments from the response.
+argument_regex_key_name: "key" # Name of the named regex capture to capture the key of the function arguments
+argument_regex_value_name: "value" # Name of the named regex capture to capture the value of the function arguments
 json_regex_match: [] # Regular expressions to match JSON data when in tool mode
 replace_function_results: [] # Placeholder to replace function call results with arbitrary strings or patterns.
 replace_llm_results: [] # Replace language model results with arbitrary strings or patterns.
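
The `argument_regex*` options added above describe a named-capture regular expression whose `key`/`value` groups are collected into function-call arguments. A standalone Go sketch of that extraction pattern (illustrative only; the regex and input are made up, and LocalAI's actual parsing code is not shown in this diff):

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Named captures matching the default capture names "key" and "value"
	// described by argument_regex_key_name / argument_regex_value_name.
	re := regexp.MustCompile(`(?P<key>\w+)="(?P<value>[^"]*)"`)

	args := map[string]string{}
	for _, m := range re.FindAllStringSubmatch(`location="San Francisco" unit="celsius"`, -1) {
		args[m[re.SubexpIndex("key")]] = m[re.SubexpIndex("value")]
	}
	fmt.Println(args) // map[location:San Francisco unit:celsius]
}
```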

View File

@@ -1,126 +0,0 @@
+++
disableToc = false
title = "Run other Models"
weight = 23
icon = "rocket_launch"
+++
## Running other models
> _Do you have already a model file? Skip to [Run models manually]({{%relref "docs/getting-started/models" %}})_.
To load models into LocalAI, you can either [use models manually]({{%relref "docs/getting-started/models" %}}) or configure LocalAI to pull the models from external sources, like Huggingface and configure the model.
To do that, you can point LocalAI to an URL to a YAML configuration file - however - LocalAI does also have some popular model configuration embedded in the binary as well. Below you can find a list of the models configuration that LocalAI has pre-built, see [Model customization]({{%relref "docs/getting-started/customize-model" %}}) on how to configure models from URLs.
There are different categories of models: [LLMs]({{%relref "docs/features/text-generation" %}}), [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) , [Embeddings]({{%relref "docs/features/embeddings" %}}), [Audio to Text]({{%relref "docs/features/audio-to-text" %}}), and [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) depending on the backend being used and the model architecture.
{{% alert icon="💡" %}}
To customize the models, see [Model customization]({{%relref "docs/getting-started/customize-model" %}}). For more model configurations, visit the [Examples Section](https://github.com/mudler/LocalAI-examples/tree/main/configurations) and the configurations for the models below is available [here](https://github.com/mudler/LocalAI/tree/master/embedded/models).
{{% /alert %}}
{{< tabs tabTotal="3" >}}
{{% tab tabName="CPU-only" %}}
> 💡Don't need GPU acceleration? use the CPU images which are lighter and do not have Nvidia dependencies
| Model | Category | Docker command |
| --- | --- | --- |
| [phi-2](https://huggingface.co/microsoft/phi-2) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core phi-2``` |
| 🌋 [bakllava](https://github.com/SkunkworksAI/BakLLaVA) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core bakllava``` |
| 🌋 [llava-1.5](https://llava-vl.github.io/) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava-1.5``` |
| 🌋 [llava-1.6-mistral](https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava-1.6-mistral``` |
| 🌋 [llava-1.6-vicuna](https://huggingface.co/cmp-nct/llava-1.6-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava-1.6-vicuna``` |
| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core mistral-openorca``` |
| [bert-cpp](https://github.com/skeskinen/bert.cpp) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core bert-cpp``` |
| [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg all-minilm-l6-v2``` |
| whisper-base | [Audio to Text]({{%relref "docs/features/audio-to-text" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core whisper-base``` |
| rhasspy-voice-en-us-amy | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core rhasspy-voice-en-us-amy``` |
| 🐸 [coqui](https://github.com/coqui-ai/TTS) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg coqui``` |
| 🐶 [bark](https://github.com/suno-ai/bark) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg bark``` |
| 🔊 [vall-e-x](https://github.com/Plachtaa/VALL-E-X) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg vall-e-x``` |
| mixtral-instruct Mixtral-8x7B-Instruct-v0.1 | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core mixtral-instruct``` |
| [tinyllama-chat](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF) [original model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core tinyllama-chat``` |
| [dolphin-2.5-mixtral-8x7b](https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core dolphin-2.5-mixtral-8x7b``` |
| 🐍 [mamba](https://github.com/state-spaces/mamba) | [LLM]({{%relref "docs/features/text-generation" %}}) | GPU-only |
| animagine-xl | [Text to Image]({{%relref "docs/features/image-generation" %}}) | GPU-only |
| transformers-tinyllama | [LLM]({{%relref "docs/features/text-generation" %}}) | GPU-only |
| [codellama-7b](https://huggingface.co/codellama/CodeLlama-7b-hf) (with transformers) | [LLM]({{%relref "docs/features/text-generation" %}}) | GPU-only |
| [codellama-7b-gguf](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF) (with llama.cpp) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core codellama-7b-gguf``` |
| [hermes-2-pro-mistral](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core hermes-2-pro-mistral``` |
{{% /tab %}}
{{% tab tabName="GPU (CUDA 11)" %}}
> To know which version of CUDA do you have available, you can check with `nvidia-smi` or `nvcc --version` see also [GPU acceleration]({{%relref "docs/features/gpu-acceleration" %}}).
| Model | Category | Docker command |
| --- | --- | --- |
| [phi-2](https://huggingface.co/microsoft/phi-2) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core phi-2``` |
| 🌋 [bakllava](https://github.com/SkunkworksAI/BakLLaVA) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core bakllava``` |
| 🌋 [llava-1.5](https://llava-vl.github.io/) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-cublas-cuda11-core llava-1.5``` |
| 🌋 [llava-1.6-mistral](https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-cublas-cuda11-core llava-1.6-mistral``` |
| 🌋 [llava-1.6-vicuna](https://huggingface.co/cmp-nct/llava-1.6-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-cublas-cuda11-core llava-1.6-vicuna``` |
| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core mistral-openorca``` |
| [bert-cpp](https://github.com/skeskinen/bert.cpp) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core bert-cpp``` |
| [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 all-minilm-l6-v2``` |
| whisper-base | [Audio to Text]({{%relref "docs/features/audio-to-text" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core whisper-base``` |
| rhasspy-voice-en-us-amy | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core rhasspy-voice-en-us-amy``` |
| 🐸 [coqui](https://github.com/coqui-ai/TTS) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 coqui``` |
| 🐶 [bark](https://github.com/suno-ai/bark) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 bark``` |
| 🔊 [vall-e-x](https://github.com/Plachtaa/VALL-E-X) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 vall-e-x``` |
| mixtral-instruct Mixtral-8x7B-Instruct-v0.1 | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core mixtral-instruct``` |
| [tinyllama-chat](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF) [original model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core tinyllama-chat``` |
| [dolphin-2.5-mixtral-8x7b](https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core dolphin-2.5-mixtral-8x7b``` |
| 🐍 [mamba](https://github.com/state-spaces/mamba) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 mamba-chat``` |
| animagine-xl | [Text to Image]({{%relref "docs/features/image-generation" %}}) | ```docker run -ti -p 8080:8080 -e COMPEL=0 --gpus all localai/localai:{{< version >}}-cublas-cuda11 animagine-xl``` |
| transformers-tinyllama | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 transformers-tinyllama``` |
| [codellama-7b](https://huggingface.co/codellama/CodeLlama-7b-hf) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 codellama-7b``` |
| [codellama-7b-gguf](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core codellama-7b-gguf``` |
| [hermes-2-pro-mistral](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core hermes-2-pro-mistral``` |
{{% /tab %}}
{{% tab tabName="GPU (CUDA 12)" %}}
> To check which version of CUDA you have available, run `nvidia-smi` or `nvcc --version`; see also [GPU acceleration]({{%relref "docs/features/gpu-acceleration" %}}).
| Model | Category | Docker command |
| --- | --- | --- |
| [phi-2](https://huggingface.co/microsoft/phi-2) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core phi-2``` |
| 🌋 [bakllava](https://github.com/SkunkworksAI/BakLLaVA) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core bakllava``` |
| 🌋 [llava-1.5](https://llava-vl.github.io/) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core llava-1.5``` |
| 🌋 [llava-1.6-mistral](https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core llava-1.6-mistral``` |
| 🌋 [llava-1.6-vicuna](https://huggingface.co/cmp-nct/llava-1.6-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core llava-1.6-vicuna``` |
| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core mistral-openorca``` |
| [bert-cpp](https://github.com/skeskinen/bert.cpp) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core bert-cpp``` |
| [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 all-minilm-l6-v2``` |
| whisper-base | [Audio to Text]({{%relref "docs/features/audio-to-text" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core whisper-base``` |
| rhasspy-voice-en-us-amy | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core rhasspy-voice-en-us-amy``` |
| 🐸 [coqui](https://github.com/coqui-ai/TTS) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 coqui``` |
| 🐶 [bark](https://github.com/suno-ai/bark) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 bark``` |
| 🔊 [vall-e-x](https://github.com/Plachtaa/VALL-E-X) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 vall-e-x``` |
| mixtral-instruct Mixtral-8x7B-Instruct-v0.1 | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core mixtral-instruct``` |
| [tinyllama-chat](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF) [original model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core tinyllama-chat``` |
| [dolphin-2.5-mixtral-8x7b](https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core dolphin-2.5-mixtral-8x7b``` |
| 🐍 [mamba](https://github.com/state-spaces/mamba) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 mamba-chat``` |
| animagine-xl | [Text to Image]({{%relref "docs/features/image-generation" %}}) | ```docker run -ti -p 8080:8080 -e COMPEL=0 --gpus all localai/localai:{{< version >}}-cublas-cuda12 animagine-xl``` |
| transformers-tinyllama | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 transformers-tinyllama``` |
| [codellama-7b](https://huggingface.co/codellama/CodeLlama-7b-hf) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 codellama-7b``` |
| [codellama-7b-gguf](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core codellama-7b-gguf``` |
| [hermes-2-pro-mistral](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core hermes-2-pro-mistral``` |
{{% /tab %}}
{{< /tabs >}}
{{% alert icon="💡" %}}
**Tip** You can specify multiple models when starting the instance so that all of them are loaded, for example to have both llava and phi-2 configured (a quick way to verify the loaded models is shown right after this tip):
```bash
docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava phi-2
```
{{% /alert %}}
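To confirm that both models from the tip above were loaded, you can query the OpenAI-compatible models endpoint of the running instance (a quick check, assuming the default port mapping used in the examples on this page):
```bash
# List the models known to the running LocalAI instance
curl http://localhost:8080/v1/models
```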

View File

@ -134,12 +134,12 @@ curl $LOCALAI/models/apply -H "Content-Type: application/json" -d '{
}' }'
``` ```
An example that installs openllama can be: An example that installs hermes-2-pro-mistral can be:
```bash ```bash
LOCALAI=http://localhost:8080 LOCALAI=http://localhost:8080
curl $LOCALAI/models/apply -H "Content-Type: application/json" -d '{ curl $LOCALAI/models/apply -H "Content-Type: application/json" -d '{
"config_url": "https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml" "config_url": "https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml"
}' }'
``` ```

View File

@ -124,7 +124,7 @@ Note: rwkv models needs to specify the backend `rwkv` in the YAML config files a
{{% alert note %}} {{% alert note %}}
The `ggml` file format has been deprecated. If you are using `ggml` models and you are configuring your model with a YAML file, use the `llama-ggml` backend. If you are relying on automatic detection of the model, you should be fine. For `gguf` models, use the `llama` backend. The go backend is deprecated as well but still available as `go-llama`. The go backend still supports features not available in the mainline: speculative sampling and embeddings. The `ggml` file format has been deprecated. If you are using `ggml` models and you are configuring your model with a YAML file, use a LocalAI version older than v2.25.0. For `gguf` models, use the `llama` backend. The go backend is deprecated as well but still available as `go-llama`.
{{% /alert %}} {{% /alert %}}
@ -175,25 +175,12 @@ name: llama
backend: llama backend: llama
parameters: parameters:
# Relative to the models path # Relative to the models path
model: file.gguf.bin model: file.gguf
```
In the example above we specify `llama` as the backend to restrict loading `gguf` models only.
For instance, to use the `llama-ggml` backend for `ggml` models:
```yaml
name: llama
backend: llama-ggml
parameters:
# Relative to the models path
model: file.ggml.bin
``` ```
#### Reference #### Reference
- [llama](https://github.com/ggerganov/llama.cpp) - [llama](https://github.com/ggerganov/llama.cpp)
- [binding](https://github.com/go-skynet/go-llama.cpp)
### exllama/2 ### exllama/2

View File

@ -143,7 +143,7 @@ The AIO Images are inheriting the same environment variables as the base images
| Variable | Default | Description | | Variable | Default | Description |
| ---------------------| ------- | ----------- | | ---------------------| ------- | ----------- |
| `PROFILE` | Auto-detected | The size of the model to use. Available: `cpu`, `gpu-8g` | | `PROFILE` | Auto-detected | The size of the model to use. Available: `cpu`, `gpu-8g` |
| `MODELS` | Auto-detected | A list of model YAML configuration file URIs/URLs (see also [running models]({{%relref "docs/advanced/run-other-models" %}})) | | `MODELS` | Auto-detected | A list of model YAML configuration file URIs/URLs (see also [running models]({{%relref "docs/getting-started/models" %}})) |
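As a minimal sketch of how the variables in the table above can be combined, the following command forces the CPU profile and preloads a single model; both the AIO image tag and the model configuration URL are illustrative assumptions, not values taken from this page:
```bash
# Hypothetical example: run an AIO image with an explicit profile and model list
docker run -ti -p 8080:8080 \
  -e PROFILE=cpu \
  -e MODELS="https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml" \
  localai/localai:latest-aio-cpu
```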
## Standard container images ## Standard container images
@ -154,7 +154,7 @@ Images are available with and without python dependencies. Note that images with
Images with `core` in the tag are smaller and do not contain any python dependencies. Images with `core` in the tag are smaller and do not contain any python dependencies.
{{< tabs tabTotal="7" >}} {{< tabs tabTotal="8" >}}
{{% tab tabName="Vanilla / CPU Images" %}} {{% tab tabName="Vanilla / CPU Images" %}}
| Description | Quay | Docker Hub | | Description | Quay | Docker Hub |
@ -236,6 +236,18 @@ Images with `core` in the tag are smaller and do not contain any python dependen
| Versioned image including FFMpeg, no python | `quay.io/go-skynet/local-ai:{{< version >}}-vulkan-fmpeg-core` | `localai/localai:{{< version >}}-vulkan-fmpeg-core` | | Versioned image including FFMpeg, no python | `quay.io/go-skynet/local-ai:{{< version >}}-vulkan-fmpeg-core` | `localai/localai:{{< version >}}-vulkan-fmpeg-core` |
{{% /tab %}} {{% /tab %}}
{{% tab tabName="Nvidia Linux for tegra" %}}
These images are compatible with Nvidia ARM64 devices, such as the Jetson Nano, Jetson Xavier NX, and Jetson AGX Xavier. For more information, see the [Nvidia L4T guide]({{%relref "docs/reference/nvidia-l4t" %}}).
| Description | Quay | Docker Hub |
| --- | --- |-------------------------------------------------------------|
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-core` | `localai/localai:master-nvidia-l4t-arm64-core` |
| Latest tag | `quay.io/go-skynet/local-ai:latest-nvidia-l4t-arm64-core` | `localai/localai:latest-nvidia-l4t-arm64-core` |
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-nvidia-l4t-arm64-core` | `localai/localai:{{< version >}}-nvidia-l4t-arm64-core` |
{{% /tab %}}
{{< /tabs >}} {{< /tabs >}}
## See Also ## See Also

View File

@ -40,6 +40,10 @@ icon = "info"
</a> </a>
</p> </p>
<p align="center">
<a href="https://trendshift.io/repositories/5539" target="_blank"><img src="https://trendshift.io/api/badge/repositories/5539" alt="mudler%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</p>
<p align="center"> <p align="center">
<a href="https://twitter.com/LocalAI_API" target="blank"> <a href="https://twitter.com/LocalAI_API" target="blank">
<img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/> <img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
@ -118,7 +122,24 @@ To help the project you can:
## 🌟 Star history ## 🌟 Star history
[![LocalAI Star history Chart](https://api.star-history.com/svg?repos=go-skynet/LocalAI&type=Date)](https://star-history.com/#go-skynet/LocalAI&Date) [![LocalAI Star history Chart](https://api.star-history.com/svg?repos=mudler/LocalAI&type=Date)](https://star-history.com/#mudler/LocalAI&Date)
## ❤️ Sponsors
> Do you find LocalAI useful?
Support the project by becoming [a backer or sponsor](https://github.com/sponsors/mudler). Your logo will show up here with a link to your website.
A huge thank you to our generous sponsors who support this project by covering CI expenses, and to everyone on our [Sponsor list](https://github.com/sponsors/mudler):
<p align="center">
<a href="https://www.spectrocloud.com/" target="blank">
<img width=200 src="https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512">
</a>
<a href="https://www.premai.io/" target="blank">
<img width=200 src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>
</a>
</p>
## 📖 License ## 📖 License

View File

@ -21,7 +21,13 @@ git clone https://github.com/mudler/LocalAI
cd LocalAI cd LocalAI
docker build --build-arg SKIP_DRIVERS=true --build-arg BUILD_TYPE=cublas --build-arg BASE_IMAGE=nvcr.io/nvidia/l4t-jetpack:r36.4.0 --build-arg IMAGE_TYPE=core -t localai-orin . docker build --build-arg SKIP_DRIVERS=true --build-arg BUILD_TYPE=cublas --build-arg BASE_IMAGE=nvcr.io/nvidia/l4t-jetpack:r36.4.0 --build-arg IMAGE_TYPE=core -t quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-core .
```
Otherwise, prebuilt images are available on quay.io and Docker Hub:
```bash
docker pull quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-core
``` ```
## Usage ## Usage
@ -29,7 +35,7 @@ docker build --build-arg SKIP_DRIVERS=true --build-arg BUILD_TYPE=cublas --build
Run the LocalAI container on Nvidia ARM64 devices using the following command, where `/data/models` is the directory containing the models: Run the LocalAI container on Nvidia ARM64 devices using the following command, where `/data/models` is the directory containing the models:
```bash ```bash
docker run -e DEBUG=true -p 8080:8080 -v /data/models:/build/models -ti --restart=always --name local-ai --runtime nvidia --gpus all localai-orin docker run -e DEBUG=true -p 8080:8080 -v /data/models:/build/models -ti --restart=always --name local-ai --runtime nvidia --gpus all quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-core
``` ```
Note: `/data/models` is the directory containing the models. You can replace it with the directory containing your models. Note: `/data/models` is the directory containing the models. You can replace it with the directory containing your models.

@ -1 +1 @@
Subproject commit 8dad5ee419e5bb2a0b380aa72d7a7389af4945f6 Subproject commit 66bc366c4727a958f3873f409550daa36932c03f

View File

@ -1,72 +0,0 @@
package embedded
import (
"embed"
"fmt"
"slices"
"strings"
"github.com/mudler/LocalAI/pkg/downloader"
"github.com/rs/zerolog/log"
"github.com/mudler/LocalAI/pkg/assets"
"gopkg.in/yaml.v3"
)
var modelShorteners map[string]string
//go:embed model_library.yaml
var modelLibrary []byte
//go:embed models/*
var embeddedModels embed.FS
func ModelShortURL(s string) string {
if _, ok := modelShorteners[s]; ok {
s = modelShorteners[s]
}
return s
}
func init() {
err := yaml.Unmarshal(modelLibrary, &modelShorteners)
if err != nil {
log.Error().Err(err).Msg("error while unmarshalling embedded modelLibrary")
}
}
func GetRemoteLibraryShorteners(url string, basePath string) (map[string]string, error) {
remoteLibrary := map[string]string{}
uri := downloader.URI(url)
err := uri.DownloadWithCallback(basePath, func(_ string, i []byte) error {
return yaml.Unmarshal(i, &remoteLibrary)
})
if err != nil {
return nil, fmt.Errorf("error downloading remote library: %s", err.Error())
}
return remoteLibrary, err
}
// ExistsInModelsLibrary checks if a model exists in the embedded models library
func ExistsInModelsLibrary(s string) bool {
f := fmt.Sprintf("%s.yaml", s)
a := []string{}
for _, j := range assets.ListFiles(embeddedModels) {
a = append(a, strings.TrimPrefix(j, "models/"))
}
return slices.Contains(a, f)
}
// ResolveContent returns the content in the embedded model library
func ResolveContent(s string) ([]byte, error) {
if ExistsInModelsLibrary(s) {
return embeddedModels.ReadFile(fmt.Sprintf("models/%s.yaml", s))
}
return nil, fmt.Errorf("cannot find model %s", s)
}

View File

@ -1,9 +0,0 @@
###
###
### This file contains the list of models that are available in the library
### The URLs are automatically expanded when local-ai is being called with the key as argument
###
### For models with an entire YAML file to be embededd, put the file inside the `models`
### directory, it will be automatically available with the file name as key (without the .yaml extension)
phi-2: "github://mudler/LocalAI-examples/configurations/phi-2.yaml@main"

View File

@ -1,13 +0,0 @@
name: all-minilm-l6-v2
backend: sentencetransformers
embeddings: true
parameters:
model: all-MiniLM-L6-v2
usage: |
You can test this model with curl like this:
curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
"input": "Your text string goes here",
"model": "all-minilm-l6-v2"
}'

View File

@ -1,17 +0,0 @@
name: animagine-xl
parameters:
model: Linaqruf/animagine-xl
backend: diffusers
f16: true
diffusers:
scheduler_type: euler_a
usage: |
curl http://localhost:8080/v1/images/generations \
-H "Content-Type: application/json" \
-d '{
"prompt": "<positive prompt>|<negative prompt>",
"model": "animagine-xl",
"step": 51,
"size": "1024x1024"
}'

View File

@ -1,40 +0,0 @@
backend: llama-cpp
context_size: 4096
f16: true
gpu_layers: 90
mmap: true
name: bakllava
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: bakllava-mmproj.gguf
parameters:
model: bakllava.gguf
temperature: 0.2
top_k: 40
top_p: 0.95
seed: -1
mirostat: 2
mirostat_eta: 1.0
mirostat_tau: 1.0
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: bakllava.gguf
uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
- filename: bakllava-mmproj.gguf
uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "bakllava",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View File

@ -1,8 +0,0 @@
usage: |
bark works without any configuration, to test it, you can run the following curl command:
curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
"backend": "bark",
"input":"Hello, this is a test!"
}' | aplay
# TODO: This is a placeholder until we manage to pre-load HF/Transformers models

View File

@ -1,24 +0,0 @@
backend: llama
context_size: 8192
f16: false
gpu_layers: 90
name: cerbero
mmap: false
parameters:
model: huggingface://galatolo/cerbero-7b-gguf/ggml-model-Q8_0.gguf
top_k: 80
temperature: 0.2
top_p: 0.7
template:
completion: "{{.Input}}"
chat: "Questa è una conversazione tra un umano ed un assistente AI.\n{{.Input}}\n[|Assistente|] "
roles:
user: "[|Umano|] "
system: "[|Umano|] "
assistant: "[|Assistente|] "
stopwords:
- "[|Umano|]"
trimsuffix:
- "\n"

View File

@ -1,20 +0,0 @@
name: codellama-7b-gguf
backend: transformers
parameters:
model: huggingface://TheBloke/CodeLlama-7B-GGUF/codellama-7b.Q4_K_M.gguf
temperature: 0.5
top_k: 40
seed: -1
top_p: 0.95
mirostat: 2
mirostat_eta: 1.0
mirostat_tau: 1.0
context_size: 4096
f16: true
gpu_layers: 90
usage: |
curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
"model": "codellama-7b-gguf",
"prompt": "import socket\n\ndef ping_exponential_backoff(host: str):"
}'

View File

@ -1,14 +0,0 @@
name: codellama-7b
backend: transformers
type: AutoModelForCausalLM
parameters:
model: codellama/CodeLlama-7b-hf
temperature: 0.2
top_k: 40
top_p: 0.95
usage: |
curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
"model": "codellama-7b",
"prompt": "import socket\n\ndef ping_exponential_backoff(host: str):"
}'

View File

@ -1,9 +0,0 @@
usage: |
coqui works without any configuration, to test it, you can run the following curl command:
curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
"backend": "coqui",
"model": "tts_models/en/ljspeech/glow-tts",
"input":"Hello, this is a test!"
}'
# TODO: This is a placeholder until we manage to pre-load HF/Transformers models

View File

@ -1,31 +0,0 @@
name: dolphin-mixtral-8x7b
mmap: true
parameters:
model: huggingface://TheBloke/dolphin-2.5-mixtral-8x7b-GGUF/dolphin-2.5-mixtral-8x7b.Q2_K.gguf
temperature: 0.5
top_k: 40
top_p: 0.95
seed: -1
mirostat: 2
mirostat_eta: 1.0
mirostat_tau: 1.0
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
{{if .Content}}{{.Content}}{{end}}<|im_end|>
chat: |
{{.Input}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
gpu_layers: 90
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "dolphin-mixtral-8x7b",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

View File

@ -1,59 +0,0 @@
name: hermes-2-pro-mistral
mmap: true
parameters:
model: huggingface://NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/Hermes-2-Pro-Mistral-7B.Q6_K.gguf
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{- if .FunctionCall }}
<tool_call>
{{- else if eq .RoleName "tool" }}
<tool_response>
{{- end }}
{{- if .Content}}
{{.Content }}
{{- end }}
{{- if .FunctionCall}}
{{toJson .FunctionCall}}
{{- end }}
{{- if .FunctionCall }}
</tool_call>
{{- else if eq .RoleName "tool" }}
</tool_response>
{{- end }}<|im_end|>
# https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
function: |
<|im_start|>system
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
<tools>
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
</tools>
Use the following pydantic model json schema for each tool call you will make:
{'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}
For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
<tool_call>
{'arguments': <args-dict>, 'name': <function-name>}
</tool_call><|im_end|>
{{.Input -}}
<|im_start|>assistant
<tool_call>
chat: |
{{.Input -}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
- "\n</tool_call>"
- "\n\n\n"
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "hermes-2-pro-mistral",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

View File

@ -1,48 +0,0 @@
name: llama3-8b-instruct
mmap: true
parameters:
model: huggingface://second-state/Llama-3-8B-Instruct-GGUF/Meta-Llama-3-8B-Instruct-Q5_K_M.gguf
template:
chat_message: |
<|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content -}}
{{ else if .FunctionCall -}}
{{ toJson .FunctionCall -}}
{{ end -}}
<|eot_id|>
function: |
<|start_header_id|>system<|end_header_id|>
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
<tools>
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
</tools>
Use the following pydantic model json schema for each tool call you will make:
{'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
Function call:
chat: |
<|begin_of_text|>{{.Input }}
<|start_header_id|>assistant<|end_header_id|>
completion: |
{{.Input}}
context_size: 8192
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
- "<|eot_id|>"
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llama3-8b-instruct",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

View File

@ -1,33 +0,0 @@
backend: llama-cpp
context_size: 4096
f16: true
gpu_layers: 90
mmap: true
name: llava-1.5
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: llava-v1.5-7b-mmproj-Q8_0.gguf
parameters:
model: llava-v1.5-7b-Q4_K.gguf
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: llava-v1.5-7b-Q4_K.gguf
uri: huggingface://jartine/llava-v1.5-7B-GGUF/llava-v1.5-7b-Q4_K.gguf
- filename: llava-v1.5-7b-mmproj-Q8_0.gguf
uri: huggingface://jartine/llava-v1.5-7B-GGUF/llava-v1.5-7b-mmproj-Q8_0.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llava-1.5",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View File

@ -1,33 +0,0 @@
backend: llama-cpp
context_size: 4096
f16: true
gpu_layers: 90
mmap: true
name: llava-1.6-mistral
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: llava-v1.6-7b-mmproj-f16.gguf
parameters:
model: llava-v1.6-mistral-7b.gguf
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: llava-v1.6-mistral-7b.gguf
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q6_K.gguf
- filename: llava-v1.6-7b-mmproj-f16.gguf
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llava-1.6-mistral",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View File

@ -1,37 +0,0 @@
backend: llama-cpp
context_size: 4096
f16: true
gpu_layers: 90
mmap: true
name: llava-1.6-vicuna
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: mmproj-vicuna7b-f16.gguf
parameters:
model: vicuna-7b-q5_k.gguf
temperature: 0.2
top_k: 40
top_p: 0.95
seed: -1
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: vicuna-7b-q5_k.gguf
uri: https://huggingface.co/cmp-nct/llava-1.6-gguf/resolve/main/vicuna-7b-q5_k.gguf
- filename: mmproj-vicuna7b-f16.gguf
uri: https://huggingface.co/cmp-nct/llava-1.6-gguf/resolve/main/mmproj-vicuna7b-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llava-1.6-vicuna",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View File

@ -1,40 +0,0 @@
backend: llama-cpp
context_size: 4096
f16: true
gpu_layers: 90
mmap: true
name: llava
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: bakllava-mmproj.gguf
parameters:
model: bakllava.gguf
temperature: 0.2
top_k: 40
top_p: 0.95
seed: -1
mirostat: 2
mirostat_eta: 1.0
mirostat_tau: 1.0
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: bakllava.gguf
uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
- filename: bakllava-mmproj.gguf
uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llava",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View File

@ -1,21 +0,0 @@
name: bagel
backend: mamba
parameters:
model: "jondurbin/bagel-dpo-2.8b-v0.2"
systemPrompt: "You are a helpful, unbiased, uncensored assistant."
template:
chat_message: |
{{if eq .RoleName "assistant"}}{{.Content}}{{else}}
[INST]
{{if .SystemPrompt}}{{.SystemPrompt}}{{else if eq .RoleName "system"}}<<SYS>>{{.Content}}<</SYS>>
{{else if .Content}}{{.Content}}{{end}}
[/INST]
{{end}}
completion: |
{{.Input}}
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "bagel",
"messages": [{"role": "user", "content": "how are you doing"}],
}'

View File

@ -1,28 +0,0 @@
name: mamba-chat
backend: mamba
parameters:
model: "havenhq/mamba-chat"
trimsuffix:
- <|endoftext|>
# https://huggingface.co/HuggingFaceH4/zephyr-7b-beta/blob/main/tokenizer_config.json
# "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
template:
chat_message: |
{{if eq .RoleName "assistant"}}<|assistant|>{{else if eq .RoleName "system"}}<|system|>{{else if eq .RoleName "user"}}<|user|>{{end}}
{{if .Content}}{{.Content}}{{end}}
</s>
chat: |
{{.Input}}
<|assistant|>
completion: |
{{.Input}}
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "mamba-chat",
"messages": [{"role": "user", "content": "how are you doing"}],
"temperature": 0.7
}'

View File

@ -1,32 +0,0 @@
name: mistral-openorca
mmap: true
parameters:
model: huggingface://TheBloke/Mistral-7B-OpenOrca-GGUF/mistral-7b-openorca.Q6_K.gguf
temperature: 0.2
top_k: 40
top_p: 0.95
seed: -1
mirostat: 2
mirostat_eta: 1.0
mirostat_tau: 1.0
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
{{if .Content}}{{.Content}}{{end}}
<|im_end|>
chat: |
{{.Input}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "mistral-openorca",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

View File

@ -1,25 +0,0 @@
name: mixtral-instruct
mmap: true
parameters:
model: huggingface://TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/mixtral-8x7b-instruct-v0.1.Q2_K.gguf
temperature: 0.2
top_k: 40
seed: -1
top_p: 0.95
mirostat: 2
mirostat_eta: 1.0
mirostat_tau: 1.0
template:
chat: &chat |
[INST] {{.Input}} [/INST]
completion: *chat
context_size: 4096
f16: true
gpu_layers: 90
usage: |
curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
"model": "mixtral-instruct",
"prompt": "How are you doing?"
}'

View File

@ -1,25 +0,0 @@
name: phi-2-chat
mmap: true
parameters:
model: huggingface://l3utterfly/phi-2-layla-v1-chatml-gguf/phi-2-layla-v1-chatml-Q8_0.gguf
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
{{if .Content}}{{.Content}}{{end}}
<|im_end|>
chat: |
{{.Input}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "phi-2-chat",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

View File

@ -1,30 +0,0 @@
name: phi-2-orange
mmap: true
parameters:
model: huggingface://l3utterfly/phi-2-orange-GGUF/phi-2-orange.Q6_K.gguf
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
{{if .Content}}{{.Content}}{{end}}
<|im_end|>
chat: |
{{.Input}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
description: |
This model is a chatbot that can be used for general conversation.
[Model card](https://huggingface.co/TheBloke/phi-2-orange-GGUF)
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "phi-2-orange",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

View File

@ -1,13 +0,0 @@
name: voice-en-us-amy-low
download_files:
- filename: voice-en-us-amy-low.tar.gz
uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
usage: |
To test if this model works as expected, you can use the following curl command:
curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
"model":"en-us-amy-low.onnx",
"input": "Hi, this is a test."
}'

View File

@ -1,29 +0,0 @@
name: tinyllama-chat
mmap: true
parameters:
model: huggingface://TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF/tinyllama-1.1b-chat-v0.3.Q8_0.gguf
temperature: 0.2
top_k: 40
seed: -1
top_p: 0.95
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
{{if .Content}}{{.Content}}{{end}}<|im_end|>
chat: |
{{.Input}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
gpu_layers: 90
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "tinyllama-chat",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

View File

@ -1,31 +0,0 @@
name: tinyllama-chat
backend: transformers
type: AutoModelForCausalLM
parameters:
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
temperature: 0.2
top_k: 40
top_p: 0.95
max_tokens: 4096
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
{{if .Content}}{{.Content}}{{end}}<|im_end|>
chat: |
{{.Input}}
<|im_start|>assistant
completion: |
{{.Input}}
stopwords:
- <|im_end|>
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "tinyllama-chat",
"messages": [{"role": "user", "content": "Say this is a test!"}],
"temperature": 0.7
}'

View File

@ -1,8 +0,0 @@
usage: |
Vall-e-x works without any configuration, to test it, you can run the following curl command:
curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
"backend": "vall-e-x",
"input":"Hello, this is a test!"
}' | aplay
# TODO: This is a placeholder until we manage to pre-load HF/Transformers models

View File

@ -1,18 +0,0 @@
name: whisper
backend: whisper
parameters:
model: ggml-whisper-base.bin
usage: |
## example audio file
wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
## Send the example audio file to the transcriptions endpoint
curl http://localhost:8080/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F file="@$PWD/gb1.ogg" -F model="whisper"
download_files:
- filename: "ggml-whisper-base.bin"
sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"
uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"

23
gallery/deepseek-r1.yaml Normal file
View File

@ -0,0 +1,23 @@
---
name: "deepseek-r1"
config_file: |
context_size: 131072
mmap: true
f16: true
stopwords:
- <begin▁of▁sentence>
- <end▁of▁sentence>
- <User>
- <Assistant>
template:
chat_message: |
{{if eq .RoleName "system" -}}{{.Content }}
{{ end -}}
{{if eq .RoleName "user" -}}<User>{{.Content}}
{{end -}}
{{if eq .RoleName "assistant" -}}<Assistant>{{.Content}}<end▁of▁sentence>{{end}}
completion: |
{{.Input}}
chat: |
{{.Input -}}<Assistant>

File diff suppressed because it is too large

View File

@ -0,0 +1,49 @@
---
name: "llama3.2-fcall"
config_file: |
mmap: true
function:
json_regex_match:
- "(?s)<Output>(.*?)</Output>"
capture_llm_results:
- (?s)<Thought>(.*?)</Thought>
replace_llm_results:
- key: (?s)<Thought>(.*?)</Thought>
value: ""
grammar:
properties_order: "name,arguments"
function_arguments_key: "arguments"
template:
chat: |
<|start_header_id|>system<|end_header_id|>
You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
{{.Input }}
<|start_header_id|>assistant<|end_header_id|>
chat_message: |
<|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
{{ if .FunctionCall -}}
{{ else if eq .RoleName "tool" -}}
{{ end -}}
{{ if .Content -}}
{{.Content -}}
{{ else if .FunctionCall -}}
{{ toJson .FunctionCall -}}
{{ end -}}
<|eot_id|>
completion: |
{{.Input}}
function: |
<|start_header_id|>system<|end_header_id|>
You are an AI assistant that executes function calls, and these are the tools at your disposal:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
<|eot_id|>{{.Input}}<|start_header_id|>assistant<|end_header_id|>
context_size: 8192
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
- "<|eot_id|>"
- <|end_of_text|>

View File

@ -0,0 +1,55 @@
---
name: "llama3.2-quantized"
config_file: |
mmap: true
function:
disable_no_action: true
grammar:
disable: true
response_regex:
- \[(?P<name>\w+)\((?P<arguments>.*)\)\]
argument_regex:
- (?P<key>[^ '\(=,]+)[='"]+(?P<value>[^=,"']+)['"]?
template:
chat: |
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
{{.Input }}
<|start_header_id|>assistant<|end_header_id|>
chat_message: |
<|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
{{ if .FunctionCall -}}
{{ else if eq .RoleName "tool" -}}
The Function was executed and the response was:
{{ end -}}
{{ if .Content -}}
{{.Content -}}
{{ else if .FunctionCall -}}
{{ range .FunctionCall }}
[{{.FunctionCall.Name}}({{.FunctionCall.Arguments}})]
{{ end }}
{{ end -}}
<|eot_id|>
completion: |
{{.Input}}
function: |
<|start_header_id|>system<|end_header_id|>
You are an expert in composing functions. You are given a question and a set of possible functions.
Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
If none of the functions can be used, point it out. If the given question lacks the parameters required by the function, also point it out. You should only return the function call in tools call sections.
If you decide to invoke any of the function(s), you MUST put it in the format as follows:
[func_name1(params_name1=params_value1,params_name2=params_value2,...),func_name2(params_name1=params_value1,params_name2=params_value2,...)]
You SHOULD NOT include any other text in the response.
Here is a list of functions in JSON format that you can invoke.
{{toJson .Functions}}
<|eot_id|><|start_header_id|>user<|end_header_id|>
{{.Input}}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
context_size: 8192
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
- "<|eot_id|>"
- <|end_of_text|>

View File

@ -21,14 +21,16 @@ import (
) )
const ( const (
HuggingFacePrefix = "huggingface://" HuggingFacePrefix = "huggingface://"
OCIPrefix = "oci://" HuggingFacePrefix1 = "hf://"
OllamaPrefix = "ollama://" HuggingFacePrefix2 = "hf.co/"
HTTPPrefix = "http://" OCIPrefix = "oci://"
HTTPSPrefix = "https://" OllamaPrefix = "ollama://"
GithubURI = "github:" HTTPPrefix = "http://"
GithubURI2 = "github://" HTTPSPrefix = "https://"
LocalPrefix = "file://" GithubURI = "github:"
GithubURI2 = "github://"
LocalPrefix = "file://"
) )
type URI string type URI string
@ -127,6 +129,8 @@ func (u URI) LooksLikeURL() bool {
return strings.HasPrefix(string(u), HTTPPrefix) || return strings.HasPrefix(string(u), HTTPPrefix) ||
strings.HasPrefix(string(u), HTTPSPrefix) || strings.HasPrefix(string(u), HTTPSPrefix) ||
strings.HasPrefix(string(u), HuggingFacePrefix) || strings.HasPrefix(string(u), HuggingFacePrefix) ||
strings.HasPrefix(string(u), HuggingFacePrefix1) ||
strings.HasPrefix(string(u), HuggingFacePrefix2) ||
strings.HasPrefix(string(u), GithubURI) || strings.HasPrefix(string(u), GithubURI) ||
strings.HasPrefix(string(u), OllamaPrefix) || strings.HasPrefix(string(u), OllamaPrefix) ||
strings.HasPrefix(string(u), OCIPrefix) || strings.HasPrefix(string(u), OCIPrefix) ||
@ -170,8 +174,10 @@ func (s URI) ResolveURL() string {
projectPath := strings.Join(repoPath[2:], "/") projectPath := strings.Join(repoPath[2:], "/")
return fmt.Sprintf("https://raw.githubusercontent.com/%s/%s/%s/%s", org, project, branch, projectPath) return fmt.Sprintf("https://raw.githubusercontent.com/%s/%s/%s/%s", org, project, branch, projectPath)
case strings.HasPrefix(string(s), HuggingFacePrefix): case strings.HasPrefix(string(s), HuggingFacePrefix) || strings.HasPrefix(string(s), HuggingFacePrefix1) || strings.HasPrefix(string(s), HuggingFacePrefix2):
repository := strings.Replace(string(s), HuggingFacePrefix, "", 1) repository := strings.Replace(string(s), HuggingFacePrefix, "", 1)
repository = strings.Replace(repository, HuggingFacePrefix1, "", 1)
repository = strings.Replace(repository, HuggingFacePrefix2, "", 1)
// convert repository to a full URL. // convert repository to a full URL.
// e.g. TheBloke/Mixtral-8x7B-v0.1-GGUF/mixtral-8x7b-v0.1.Q2_K.gguf@main -> https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q2_K.gguf // e.g. TheBloke/Mixtral-8x7B-v0.1-GGUF/mixtral-8x7b-v0.1.Q2_K.gguf@main -> https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q2_K.gguf
owner := strings.Split(repository, "/")[0] owner := strings.Split(repository, "/")[0]
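As a rough usage sketch of the new aliases (an assumption: the `run` command accepts the same URI forms documented for `huggingface://`), the following invocations should all resolve to the same download URL:
```bash
# Sketch only: hf:// and hf.co/ are handled as aliases of huggingface://
local-ai run huggingface://TheBloke/Mixtral-8x7B-v0.1-GGUF/mixtral-8x7b-v0.1.Q2_K.gguf@main
local-ai run hf://TheBloke/Mixtral-8x7B-v0.1-GGUF/mixtral-8x7b-v0.1.Q2_K.gguf@main
local-ai run hf.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/mixtral-8x7b-v0.1.Q2_K.gguf@main
# each resolves to:
# https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q2_K.gguf
```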

View File

@ -5,6 +5,7 @@ import (
"errors" "errors"
"io" "io"
"regexp" "regexp"
"slices"
"strings" "strings"
"github.com/mudler/LocalAI/pkg/functions/grammars" "github.com/mudler/LocalAI/pkg/functions/grammars"
@ -46,6 +47,14 @@ type GrammarConfig struct {
// SchemaType can be configured to use a specific schema type to force the grammar // SchemaType can be configured to use a specific schema type to force the grammar
// available : json, llama3.1 // available : json, llama3.1
SchemaType string `yaml:"schema_type"` SchemaType string `yaml:"schema_type"`
GrammarTriggers []GrammarTrigger `yaml:"triggers"`
}
type GrammarTrigger struct {
// Trigger is the string that triggers the grammar
Word string `yaml:"word"`
AtStart bool `yaml:"at_start"`
} }
// FunctionsConfig is the configuration for the tool/function call. // FunctionsConfig is the configuration for the tool/function call.
@ -71,6 +80,12 @@ type FunctionsConfig struct {
// JSONRegexMatch is a regex to extract the JSON object from the response // JSONRegexMatch is a regex to extract the JSON object from the response
JSONRegexMatch []string `yaml:"json_regex_match"` JSONRegexMatch []string `yaml:"json_regex_match"`
// ArgumentRegex is a named regex to extract the arguments from the response. Use ArgumentRegexKey and ArgumentRegexValue to set the names of the named regex for key and value of the arguments.
ArgumentRegex []string `yaml:"argument_regex"`
// ArgumentRegex named regex names for key and value extractions. default: key and value
ArgumentRegexKey string `yaml:"argument_regex_key_name"` // default: key
ArgumentRegexValue string `yaml:"argument_regex_value_name"` // default: value
// ReplaceFunctionResults allow to replace strings in the results before parsing them // ReplaceFunctionResults allow to replace strings in the results before parsing them
ReplaceFunctionResults []ReplaceResult `yaml:"replace_function_results"` ReplaceFunctionResults []ReplaceResult `yaml:"replace_function_results"`
@ -310,7 +325,7 @@ func ParseFunctionCall(llmresult string, functionConfig FunctionsConfig) []FuncC
if functionName == "" { if functionName == "" {
return results return results
} }
results = append(results, FuncCallResults{Name: result[functionNameKey], Arguments: result[functionArgumentsKey]}) results = append(results, FuncCallResults{Name: result[functionNameKey], Arguments: ParseFunctionCallArgs(result[functionArgumentsKey], functionConfig)})
} }
} }
} else { } else {
@ -322,3 +337,38 @@ func ParseFunctionCall(llmresult string, functionConfig FunctionsConfig) []FuncC
return results return results
} }
func ParseFunctionCallArgs(functionArguments string, functionConfig FunctionsConfig) string {
if len(functionConfig.ArgumentRegex) == 0 {
return functionArguments
}
// We use named regexes here to extract the function argument key value pairs and convert this to valid json.
// TODO: there might be responses where an object as a value is expected/required. This is currently not handled.
args := make(map[string]string)
agrsRegexKeyName := "key"
agrsRegexValueName := "value"
if functionConfig.ArgumentRegexKey != "" {
agrsRegexKeyName = functionConfig.ArgumentRegexKey
}
if functionConfig.ArgumentRegexValue != "" {
agrsRegexValueName = functionConfig.ArgumentRegexValue
}
for _, r := range functionConfig.ArgumentRegex {
var respRegex = regexp.MustCompile(r)
var nameRange []string = respRegex.SubexpNames()
var keyIndex = slices.Index(nameRange, agrsRegexKeyName)
var valueIndex = slices.Index(nameRange, agrsRegexValueName)
matches := respRegex.FindAllStringSubmatch(functionArguments, -1)
for _, match := range matches {
args[match[keyIndex]] = match[valueIndex]
}
}
jsonBytes, _ := json.Marshal(args)
return string(jsonBytes)
}
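For context, a hedged end-to-end sketch of how a model configured with `argument_regex` (such as the `llama3.2-quantized` gallery entry above) could be exercised through the OpenAI-compatible tools API; the model name and the tool definition are illustrative assumptions:
```bash
# Hypothetical request: if the model emits [get_weather(city='Rome')],
# ParseFunctionCallArgs converts the arguments to {"city": "Rome"} for the tool call
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
  "model": "llama3.2-quantized",
  "messages": [{"role": "user", "content": "What is the weather like in Rome?"}],
  "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get the current weather for a city", "parameters": {"type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"]}}}]
}'
```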

View File

@ -43,11 +43,10 @@ var TypeAlias map[string]string = map[string]string{
var AutoDetect = os.Getenv("DISABLE_AUTODETECT") != "true" var AutoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"
const ( const (
LlamaGGML = "llama-ggml"
LLamaCPP = "llama-cpp" LLamaCPP = "llama-cpp"
LLamaCPPAVX2 = "llama-cpp-avx2" LLamaCPPAVX2 = "llama-cpp-avx2"
LLamaCPPAVX512 = "llama-cpp-avx512"
LLamaCPPAVX = "llama-cpp-avx" LLamaCPPAVX = "llama-cpp-avx"
LLamaCPPFallback = "llama-cpp-fallback" LLamaCPPFallback = "llama-cpp-fallback"
LLamaCPPCUDA = "llama-cpp-cuda" LLamaCPPCUDA = "llama-cpp-cuda"
@ -66,6 +65,18 @@ const (
LocalStoreBackend = "local-store" LocalStoreBackend = "local-store"
) )
var llamaCPPVariants = []string{
LLamaCPPAVX2,
LLamaCPPAVX512,
LLamaCPPAVX,
LLamaCPPFallback,
LLamaCPPCUDA,
LLamaCPPHipblas,
LLamaCPPSycl16,
LLamaCPPSycl32,
LLamaCPPGRPC,
}
func backendPath(assetDir, backend string) string { func backendPath(assetDir, backend string) string {
return filepath.Join(assetDir, "backend-assets", "grpc", backend) return filepath.Join(assetDir, "backend-assets", "grpc", backend)
} }
@ -107,40 +118,14 @@ ENTRY:
if AutoDetect { if AutoDetect {
// if we find the llama.cpp variants, show them of as a single backend (llama-cpp) as later we are going to pick that up // if we find the llama.cpp variants, show them of as a single backend (llama-cpp) as later we are going to pick that up
// when starting the service // when starting the service
foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas, foundSycl16, foundSycl32 := false, false, false, false, false, false, false, false foundVariants := map[string]bool{}
if _, ok := backends[LLamaCPP]; !ok { if _, ok := backends[LLamaCPP]; !ok {
for _, e := range entry { for _, e := range entry {
if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 { for _, v := range llamaCPPVariants {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPAVX2) if strings.Contains(e.Name(), v) && !foundVariants[v] {
foundLCPPAVX2 = true backends[LLamaCPP] = append(backends[LLamaCPP], v)
} foundVariants[v] = true
if strings.Contains(e.Name(), LLamaCPPAVX) && !foundLCPPAVX { }
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPAVX)
foundLCPPAVX = true
}
if strings.Contains(e.Name(), LLamaCPPFallback) && !foundLCPPFallback {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPFallback)
foundLCPPFallback = true
}
if strings.Contains(e.Name(), LLamaCPPGRPC) && !foundLCPPGRPC {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPGRPC)
foundLCPPGRPC = true
}
if strings.Contains(e.Name(), LLamaCPPCUDA) && !foundLCPPCuda {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPCUDA)
foundLCPPCuda = true
}
if strings.Contains(e.Name(), LLamaCPPHipblas) && !foundLCPPHipblas {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPHipblas)
foundLCPPHipblas = true
}
if strings.Contains(e.Name(), LLamaCPPSycl16) && !foundSycl16 {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPSycl16)
foundSycl16 = true
}
if strings.Contains(e.Name(), LLamaCPPSycl32) && !foundSycl32 {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPSycl32)
foundSycl32 = true
} }
} }
} }
@ -156,10 +141,10 @@ func orderBackends(backends map[string][]string) ([]string, error) {
// sets a priority list - first has more priority // sets a priority list - first has more priority
priorityList := []string{ priorityList := []string{
// First llama.cpp(variants) and llama-ggml to follow. // First llama.cpp(variants)
// We keep the fallback to prevent that if the llama.cpp variants // We keep the fallback to prevent that if the llama.cpp variants
// that depends on shared libs if breaks have still a safety net. // that depends on shared libs if breaks have still a safety net.
LLamaCPP, LlamaGGML, LLamaCPPFallback, LLamaCPP, LLamaCPPFallback,
} }
toTheEnd := []string{ toTheEnd := []string{
@ -283,6 +268,12 @@ func selectGRPCProcessByHostCapabilities(backend, assetDir string, f16 bool) str
log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend) log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
selectedProcess = p selectedProcess = p
} }
} else if xsysinfo.HasCPUCaps(cpuid.AVX512F) {
p := backendPath(assetDir, LLamaCPPAVX512)
if _, err := os.Stat(p); err == nil {
log.Info().Msgf("[%s] attempting to load with AVX512 variant", backend)
selectedProcess = p
}
} else if xsysinfo.HasCPUCaps(cpuid.AVX) { } else if xsysinfo.HasCPUCaps(cpuid.AVX) {
p := backendPath(assetDir, LLamaCPPAVX) p := backendPath(assetDir, LLamaCPPAVX)
if _, err := os.Stat(p); err == nil { if _, err := os.Stat(p); err == nil {

View File

@ -9,7 +9,6 @@ import (
"github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/gallery" "github.com/mudler/LocalAI/core/gallery"
"github.com/mudler/LocalAI/embedded"
"github.com/mudler/LocalAI/pkg/downloader" "github.com/mudler/LocalAI/pkg/downloader"
"github.com/mudler/LocalAI/pkg/utils" "github.com/mudler/LocalAI/pkg/utils"
"github.com/rs/zerolog/log" "github.com/rs/zerolog/log"
@ -18,42 +17,17 @@ import (
// InstallModels will preload models from the given list of URLs and galleries // InstallModels will preload models from the given list of URLs and galleries
// It will download the model if it is not already present in the model path // It will download the model if it is not already present in the model path
// It will also try to resolve if the model is an embedded model YAML configuration // It will also try to resolve if the model is an embedded model YAML configuration
func InstallModels(galleries []config.Gallery, modelLibraryURL string, modelPath string, enforceScan bool, downloadStatus func(string, string, string, float64), models ...string) error { func InstallModels(galleries []config.Gallery, modelPath string, enforceScan bool, downloadStatus func(string, string, string, float64), models ...string) error {
// create an error that groups all errors // create an error that groups all errors
var err error var err error
lib, _ := embedded.GetRemoteLibraryShorteners(modelLibraryURL, modelPath)
for _, url := range models { for _, url := range models {
// As a best effort, try to resolve the model from the remote library // As a best effort, try to resolve the model from the remote library
// if it's not resolved we try with the other method below // if it's not resolved we try with the other method below
if modelLibraryURL != "" {
if lib[url] != "" {
log.Debug().Msgf("[startup] model configuration is defined remotely: %s (%s)", url, lib[url])
url = lib[url]
}
}
url = embedded.ModelShortURL(url)
uri := downloader.URI(url) uri := downloader.URI(url)
switch { switch {
case embedded.ExistsInModelsLibrary(url):
modelYAML, e := embedded.ResolveContent(url)
// If we resolve something, just save it to disk and continue
if e != nil {
log.Error().Err(e).Msg("error resolving model content")
err = errors.Join(err, e)
continue
}
log.Debug().Msgf("[startup] resolved embedded model: %s", url)
md5Name := utils.MD5(url)
modelDefinitionFilePath := filepath.Join(modelPath, md5Name) + ".yaml"
if e := os.WriteFile(modelDefinitionFilePath, modelYAML, 0600); err != nil {
log.Error().Err(e).Str("filepath", modelDefinitionFilePath).Msg("error writing model definition")
err = errors.Join(err, e)
}
case uri.LooksLikeOCI(): case uri.LooksLikeOCI():
log.Debug().Msgf("[startup] resolved OCI model to download: %s", url) log.Debug().Msgf("[startup] resolved OCI model to download: %s", url)
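With the modelLibraryURL parameter and the embedded-library resolution removed, callers now pass only the galleries, the model path, the scan flag, the progress callback and the model references. A small usage sketch against the new signature follows; the model path and the meaning of the callback's string arguments are assumptions.

package main

import (
	"log"

	"github.com/mudler/LocalAI/core/config"
	"github.com/mudler/LocalAI/pkg/startup"
)

func main() {
	// Progress callback; the string arguments are assumed to be the file
	// name and the downloaded/total sizes, followed by a percentage.
	progress := func(fileName, current, total string, percent float64) {
		log.Printf("%s: %s/%s (%.1f%%)", fileName, current, total, percent)
	}

	// No modelLibraryURL argument anymore: models are resolved from
	// galleries, OCI references or plain URLs.
	err := startup.InstallModels(
		[]config.Gallery{}, // no galleries configured in this sketch
		"/tmp/models",      // model path (assumption)
		false,              // enforceScan disabled
		progress,
		"https://raw.githubusercontent.com/mudler/LocalAI-examples/main/configurations/phi-2.yaml",
	)
	if err != nil {
		log.Fatal(err)
	}
}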
View File
@ -7,7 +7,6 @@ import (
"github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/config"
. "github.com/mudler/LocalAI/pkg/startup" . "github.com/mudler/LocalAI/pkg/startup"
"github.com/mudler/LocalAI/pkg/utils"
. "github.com/onsi/ginkgo/v2" . "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega" . "github.com/onsi/gomega"
@ -16,29 +15,13 @@ import (
var _ = Describe("Preload test", func() { var _ = Describe("Preload test", func() {
Context("Preloading from strings", func() { Context("Preloading from strings", func() {
It("loads from remote url", func() {
tmpdir, err := os.MkdirTemp("", "")
Expect(err).ToNot(HaveOccurred())
libraryURL := "https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/model_library.yaml"
fileName := fmt.Sprintf("%s.yaml", "phi-2")
InstallModels([]config.Gallery{}, libraryURL, tmpdir, true, nil, "phi-2")
resultFile := filepath.Join(tmpdir, fileName)
content, err := os.ReadFile(resultFile)
Expect(err).ToNot(HaveOccurred())
Expect(string(content)).To(ContainSubstring("name: phi-2"))
})
It("loads from embedded full-urls", func() { It("loads from embedded full-urls", func() {
tmpdir, err := os.MkdirTemp("", "") tmpdir, err := os.MkdirTemp("", "")
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
url := "https://raw.githubusercontent.com/mudler/LocalAI-examples/main/configurations/phi-2.yaml" url := "https://raw.githubusercontent.com/mudler/LocalAI-examples/main/configurations/phi-2.yaml"
fileName := fmt.Sprintf("%s.yaml", "phi-2") fileName := fmt.Sprintf("%s.yaml", "phi-2")
InstallModels([]config.Gallery{}, "", tmpdir, true, nil, url) InstallModels([]config.Gallery{}, tmpdir, true, nil, url)
resultFile := filepath.Join(tmpdir, fileName) resultFile := filepath.Join(tmpdir, fileName)
@ -47,45 +30,13 @@ var _ = Describe("Preload test", func() {
Expect(string(content)).To(ContainSubstring("name: phi-2")) Expect(string(content)).To(ContainSubstring("name: phi-2"))
}) })
It("loads from embedded short-urls", func() {
tmpdir, err := os.MkdirTemp("", "")
Expect(err).ToNot(HaveOccurred())
url := "phi-2"
InstallModels([]config.Gallery{}, "", tmpdir, true, nil, url)
entry, err := os.ReadDir(tmpdir)
Expect(err).ToNot(HaveOccurred())
Expect(entry).To(HaveLen(1))
resultFile := entry[0].Name()
content, err := os.ReadFile(filepath.Join(tmpdir, resultFile))
Expect(err).ToNot(HaveOccurred())
Expect(string(content)).To(ContainSubstring("name: phi-2"))
})
It("loads from embedded models", func() {
tmpdir, err := os.MkdirTemp("", "")
Expect(err).ToNot(HaveOccurred())
url := "mistral-openorca"
fileName := fmt.Sprintf("%s.yaml", utils.MD5(url))
InstallModels([]config.Gallery{}, "", tmpdir, true, nil, url)
resultFile := filepath.Join(tmpdir, fileName)
content, err := os.ReadFile(resultFile)
Expect(err).ToNot(HaveOccurred())
Expect(string(content)).To(ContainSubstring("name: mistral-openorca"))
})
It("downloads from urls", func() { It("downloads from urls", func() {
tmpdir, err := os.MkdirTemp("", "") tmpdir, err := os.MkdirTemp("", "")
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
url := "huggingface://TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF/tinyllama-1.1b-chat-v0.3.Q2_K.gguf" url := "huggingface://TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF/tinyllama-1.1b-chat-v0.3.Q2_K.gguf"
fileName := fmt.Sprintf("%s.gguf", "tinyllama-1.1b-chat-v0.3.Q2_K") fileName := fmt.Sprintf("%s.gguf", "tinyllama-1.1b-chat-v0.3.Q2_K")
err = InstallModels([]config.Gallery{}, "", tmpdir, false, nil, url) err = InstallModels([]config.Gallery{}, tmpdir, false, nil, url)
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
resultFile := filepath.Join(tmpdir, fileName) resultFile := filepath.Join(tmpdir, fileName)
View File
@ -765,6 +765,17 @@ const docTemplate = `{
"/v1/tokenize": { "/v1/tokenize": {
"post": { "post": {
"summary": "Tokenize the input.", "summary": "Tokenize the input.",
"parameters": [
{
"description": "Request",
"name": "request",
"in": "body",
"required": true,
"schema": {
"$ref": "#/definitions/schema.TokenizeRequest"
}
}
],
"responses": { "responses": {
"200": { "200": {
"description": "Response", "description": "Response",
@ -1838,6 +1849,17 @@ const docTemplate = `{
} }
} }
}, },
"schema.TokenizeRequest": {
"type": "object",
"properties": {
"content": {
"type": "string"
},
"model": {
"type": "string"
}
}
},
"schema.TokenizeResponse": { "schema.TokenizeResponse": {
"type": "object", "type": "object",
"properties": { "properties": {
View File
@ -758,6 +758,17 @@
"/v1/tokenize": { "/v1/tokenize": {
"post": { "post": {
"summary": "Tokenize the input.", "summary": "Tokenize the input.",
"parameters": [
{
"description": "Request",
"name": "request",
"in": "body",
"required": true,
"schema": {
"$ref": "#/definitions/schema.TokenizeRequest"
}
}
],
"responses": { "responses": {
"200": { "200": {
"description": "Response", "description": "Response",
@ -1831,6 +1842,17 @@
} }
} }
}, },
"schema.TokenizeRequest": {
"type": "object",
"properties": {
"content": {
"type": "string"
},
"model": {
"type": "string"
}
}
},
"schema.TokenizeResponse": { "schema.TokenizeResponse": {
"type": "object", "type": "object",
"properties": { "properties": {
View File
@ -705,6 +705,13 @@ definitions:
description: voice audio file or speaker id description: voice audio file or speaker id
type: string type: string
type: object type: object
schema.TokenizeRequest:
properties:
content:
type: string
model:
type: string
type: object
schema.TokenizeResponse: schema.TokenizeResponse:
properties: properties:
tokens: tokens:
@ -1216,6 +1223,13 @@ paths:
summary: Get TokenMetrics for Active Slot. summary: Get TokenMetrics for Active Slot.
/v1/tokenize: /v1/tokenize:
post: post:
parameters:
- description: Request
in: body
name: request
required: true
schema:
$ref: '#/definitions/schema.TokenizeRequest'
responses: responses:
"200": "200":
description: Response description: Response
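The three generated files above (docs.go, swagger.json and swagger.yaml) all document the same addition: /v1/tokenize now declares a request body with content and model fields. A minimal client sketch follows; the base URL, the model name and the element type of the tokens array in the response are assumptions.

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

// Request and response shapes copied from the schema definitions above;
// the element type of "tokens" is an assumption (integers are typical).
type tokenizeRequest struct {
	Content string `json:"content"`
	Model   string `json:"model"`
}

type tokenizeResponse struct {
	Tokens []int `json:"tokens"`
}

func main() {
	body, _ := json.Marshal(tokenizeRequest{
		Content: "Hello, LocalAI!",
		Model:   "phi-2", // any installed model name (assumption)
	})

	// Base URL is an assumption for a locally running instance.
	resp, err := http.Post("http://localhost:8080/v1/tokenize", "application/json", bytes.NewReader(body))
	if err != nil {
		fmt.Println("request failed:", err)
		return
	}
	defer resp.Body.Close()

	var out tokenizeResponse
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		fmt.Println("decode failed:", err)
		return
	}
	fmt.Println("token count:", len(out.Tokens))
}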