feat(llama.cpp): support embeddings endpoints (#2871)
* feat(llama.cpp): add embeddings

  Also enable embeddings by default for llama.cpp models

  Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(Makefile): prepare llama.cpp sources only once

  Otherwise we keep cloning llama.cpp for each of the variants

  Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* do not set embeddings to false

  Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* docs: add embeddings to the YAML config reference

  Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
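This adds embedding generation to the llama.cpp backend, which is exposed through LocalAI's OpenAI-compatible embeddings endpoint. A minimal Go sketch of calling that endpoint, assuming a LocalAI instance on localhost:8080 and a model configured with embeddings enabled under the hypothetical name "my-embedding-model" (the address and the model name are assumptions, not part of this commit):

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

type embeddingRequest struct {
	Model string `json:"model"`
	Input string `json:"input"`
}

type embeddingResponse struct {
	Data []struct {
		Embedding []float32 `json:"embedding"`
	} `json:"data"`
}

func main() {
	// Build an OpenAI-style embeddings request.
	payload, err := json.Marshal(embeddingRequest{
		Model: "my-embedding-model", // hypothetical model name
		Input: "LocalAI can now produce embeddings with the llama.cpp backend",
	})
	if err != nil {
		panic(err)
	}

	// POST to the OpenAI-compatible endpoint exposed by LocalAI.
	resp, err := http.Post("http://localhost:8080/v1/embeddings", "application/json", bytes.NewReader(payload))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var out embeddingResponse
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	if len(out.Data) > 0 {
		fmt.Printf("embedding dimension: %d\n", len(out.Data[0].Embedding))
	}
}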
This commit is contained in:
parent 6564e7ea01
commit 35561edb6e
Makefile (20 lines changed)
@@ -310,7 +310,7 @@ sources/whisper.cpp:
 sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
 	cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
 
-get-sources: sources/go-llama.cpp sources/gpt4all sources/go-piper sources/go-rwkv.cpp sources/whisper.cpp sources/go-bert.cpp sources/go-stable-diffusion sources/go-tiny-dream
+get-sources: sources/go-llama.cpp sources/gpt4all sources/go-piper sources/go-rwkv.cpp sources/whisper.cpp sources/go-bert.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp
 
 replace:
 	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv.cpp
@@ -767,28 +767,28 @@ else
 endif
 
 # This target is for manually building a variant with-auto detected flags
-backend-assets/grpc/llama-cpp: backend-assets/grpc
+backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-cpp
 	$(MAKE) -C backend/cpp/llama-cpp purge
 	$(info ${GREEN}I llama-cpp build info:avx2${RESET})
 	$(MAKE) VARIANT="llama-cpp" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-cpp/grpc-server backend-assets/grpc/llama-cpp
 
-backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc
+backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-avx2
 	$(MAKE) -C backend/cpp/llama-avx2 purge
 	$(info ${GREEN}I llama-cpp build info:avx2${RESET})
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2
 
-backend-assets/grpc/llama-cpp-avx: backend-assets/grpc
+backend-assets/grpc/llama-cpp-avx: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-avx
 	$(MAKE) -C backend/cpp/llama-avx purge
 	$(info ${GREEN}I llama-cpp build info:avx${RESET})
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-avx" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-avx/grpc-server backend-assets/grpc/llama-cpp-avx
 
-backend-assets/grpc/llama-cpp-fallback: backend-assets/grpc
+backend-assets/grpc/llama-cpp-fallback: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-fallback
 	$(MAKE) -C backend/cpp/llama-fallback purge
 	$(info ${GREEN}I llama-cpp build info:fallback${RESET})
@@ -799,35 +799,35 @@ ifeq ($(BUILD_TYPE),metal)
 	cp backend/cpp/llama-fallback/llama.cpp/build/bin/default.metallib backend-assets/grpc/
 endif
 
-backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc
+backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-cuda
 	$(MAKE) -C backend/cpp/llama-cuda purge
 	$(info ${GREEN}I llama-cpp build info:cuda${RESET})
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-cuda/grpc-server backend-assets/grpc/llama-cpp-cuda
 
-backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc
+backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-hipblas
 	$(MAKE) -C backend/cpp/llama-hipblas purge
 	$(info ${GREEN}I llama-cpp build info:hipblas${RESET})
 	BUILD_TYPE="hipblas" $(MAKE) VARIANT="llama-hipblas" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-hipblas/grpc-server backend-assets/grpc/llama-cpp-hipblas
 
-backend-assets/grpc/llama-cpp-sycl_f16: backend-assets/grpc
+backend-assets/grpc/llama-cpp-sycl_f16: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-sycl_f16
 	$(MAKE) -C backend/cpp/llama-sycl_f16 purge
 	$(info ${GREEN}I llama-cpp build info:sycl_f16${RESET})
 	BUILD_TYPE="sycl_f16" $(MAKE) VARIANT="llama-sycl_f16" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-sycl_f16/grpc-server backend-assets/grpc/llama-cpp-sycl_f16
 
-backend-assets/grpc/llama-cpp-sycl_f32: backend-assets/grpc
+backend-assets/grpc/llama-cpp-sycl_f32: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-sycl_f32
 	$(MAKE) -C backend/cpp/llama-sycl_f32 purge
 	$(info ${GREEN}I llama-cpp build info:sycl_f32${RESET})
 	BUILD_TYPE="sycl_f32" $(MAKE) VARIANT="llama-sycl_f32" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-sycl_f32/grpc-server backend-assets/grpc/llama-cpp-sycl_f32
 
-backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc
+backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-grpc
 	$(MAKE) -C backend/cpp/llama-grpc purge
 	$(info ${GREEN}I llama-cpp build info:grpc${RESET})
@@ -2108,6 +2108,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
     data["grammar"] = predict->grammar();
     data["prompt"] = predict->prompt();
     data["ignore_eos"] = predict->ignoreeos();
+    data["embeddings"] = predict->embeddings();
 
     // for each image in the request, add the image data
     //
@@ -2385,6 +2386,31 @@ public:
 
         return grpc::Status::OK;
     }
 
+    /// https://github.com/ggerganov/llama.cpp/blob/aa2341298924ac89778252015efcb792f2df1e20/examples/server/server.cpp#L2969
+    grpc::Status Embedding(ServerContext* context, const backend::PredictOptions* request, backend::EmbeddingResult* embeddingResult) {
+        json data = parse_options(false, request, llama);
+        const int task_id = llama.queue_tasks.get_new_id();
+        llama.queue_results.add_waiting_task_id(task_id);
+        llama.request_completion(task_id, { {"prompt", data["embeddings"]}, { "n_predict", 0}, {"image_data", ""} }, false, true, -1);
+        // get the result
+        task_result result = llama.queue_results.recv(task_id);
+        //std::cout << "Embedding result JSON" << result.result_json.dump() << std::endl;
+        llama.queue_results.remove_waiting_task_id(task_id);
+        if (!result.error && result.stop) {
+            std::vector<float> embeddings = result.result_json.value("embedding", std::vector<float>());
+            // loop the vector and set the embeddings results
+            for (int i = 0; i < embeddings.size(); i++) {
+                embeddingResult->add_embeddings(embeddings[i]);
+            }
+        }
+        else
+        {
+            return grpc::Status::OK;
+        }
+
+        return grpc::Status::OK;
+    }
 };
 
 void RunServer(const std::string& server_address) {
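The new Embedding handler above reuses the server's existing task queue: it creates a task id, registers a waiting slot for it, submits a completion request with n_predict set to 0, blocks on the result, and copies the "embedding" array into the gRPC response. The following Go sketch illustrates the same submit-and-wait pattern with hypothetical types; it is not LocalAI code, only a simplified model of the flow:

package main

import "fmt"

// Hypothetical result and queue types; a simplified model of the flow above, not LocalAI code.
type taskResult struct {
	embedding []float32
	err       error
}

type taskQueue struct {
	nextID  int
	waiting map[int]chan taskResult
}

func newTaskQueue() *taskQueue {
	return &taskQueue{waiting: map[int]chan taskResult{}}
}

func (q *taskQueue) newID() int             { q.nextID++; return q.nextID }
func (q *taskQueue) addWaiting(id int)      { q.waiting[id] = make(chan taskResult, 1) }
func (q *taskQueue) removeWaiting(id int)   { delete(q.waiting, id) }
func (q *taskQueue) recv(id int) taskResult { return <-q.waiting[id] }

// submit stands in for request_completion: the work runs elsewhere and publishes its result.
func (q *taskQueue) submit(id int, prompt string) {
	ch := q.waiting[id]
	go func() {
		// pretend an embedding was computed for the prompt
		_ = prompt
		ch <- taskResult{embedding: []float32{0.1, 0.2, 0.3}}
	}()
}

func main() {
	q := newTaskQueue()
	id := q.newID()
	q.addWaiting(id)              // register interest in the result
	q.submit(id, "text to embed") // enqueue the work
	res := q.recv(id)             // block until the result arrives
	q.removeWaiting(id)           // unregister the waiting slot
	fmt.Println(res.embedding, res.err)
}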
@@ -91,7 +91,7 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		Type:          c.ModelType,
 		RopeFreqScale: c.RopeFreqScale,
 		NUMA:          c.NUMA,
-		Embeddings:    c.Embeddings,
+		Embeddings:    *c.Embeddings,
 		LowVRAM:       *c.LowVRAM,
 		NGPULayers:    int32(*c.NGPULayers),
 		MMap:          *c.MMap,
@@ -32,7 +32,7 @@ type BackendConfig struct {
 	Threads        *int              `yaml:"threads"`
 	Debug          *bool             `yaml:"debug"`
 	Roles          map[string]string `yaml:"roles"`
-	Embeddings     bool              `yaml:"embeddings"`
+	Embeddings     *bool             `yaml:"embeddings"`
 	Backend        string            `yaml:"backend"`
 	TemplateConfig TemplateConfig    `yaml:"template"`
 
@@ -338,6 +338,10 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
 		cfg.LowVRAM = &falseV
 	}
 
+	if cfg.Embeddings == nil {
+		cfg.Embeddings = &falseV
+	}
+
 	// Value passed by the top level are treated as default (no implicit defaults)
 	// defaults are set by the user
 	if ctx == 0 {
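Changing Embeddings from bool to *bool follows the nil-means-unset convention the struct already uses for fields like LowVRAM and MMap: only an omitted option receives the implicit default, while a value set explicitly in the config survives. A standalone Go sketch of that pattern, using a hypothetical struct rather than LocalAI's own types:

package main

import "fmt"

// hypothetical config struct, used only to illustrate the pointer-based default pattern
type modelConfig struct {
	Embeddings *bool
}

// setDefaults fills only fields the user never set (nil); explicit values survive untouched.
func (c *modelConfig) setDefaults() {
	falseV := false
	if c.Embeddings == nil {
		c.Embeddings = &falseV
	}
}

func main() {
	trueV := true

	unset := modelConfig{}                      // option omitted by the user
	explicit := modelConfig{Embeddings: &trueV} // option enabled explicitly

	unset.setDefaults()
	explicit.setDefaults()

	fmt.Println(*unset.Embeddings)    // false: implicit default applied
	fmt.Println(*explicit.Embeddings) // true: explicit choice preserved
}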
@@ -112,6 +112,8 @@ name: "" # Model name, used to identify the model in API calls.
 # Precision settings for the model, reducing precision can enhance performance on some hardware.
 f16: null # Whether to use 16-bit floating-point precision.
 
+embeddings: true # Enable embeddings for the model.
+
 # Concurrency settings for the application.
 threads: null # Number of threads to use for processing.
 
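To tie the documented key back to the config struct, here is a hedged Go sketch of how an embeddings: true entry ends up in a *bool field. The struct below is an illustrative subset, and the gopkg.in/yaml.v3 dependency is a choice made for this example, not taken from the diff:

package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// illustrative subset of a model config; not the full LocalAI schema
type modelConfig struct {
	Name       string `yaml:"name"`
	F16        *bool  `yaml:"f16"`
	Embeddings *bool  `yaml:"embeddings"`
	Threads    *int   `yaml:"threads"`
}

func main() {
	doc := []byte("name: my-embedding-model\nembeddings: true # Enable embeddings for the model.\n")

	var cfg modelConfig
	if err := yaml.Unmarshal(doc, &cfg); err != nil {
		panic(err)
	}

	// Embeddings was set explicitly; the other pointer fields stay nil (unset).
	fmt.Println("embeddings set:", cfg.Embeddings != nil && *cfg.Embeddings)
	fmt.Println("threads set:", cfg.Threads != nil)
}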