Mirror of https://github.com/mudler/LocalAI.git (synced 2024-12-18 20:27:57 +00:00)
feat: more embedded models, coqui fixes, add model usage and description (#1556)
* feat: add model descriptions and usage
* remove default model gallery
* models: add embeddings and tts
* docs: update table
* docs: updates
* images: cleanup pip cache after install
* images: always run apt-get clean
* ux: improve gRPC connection errors
* ux: improve some messages
* fix: fix coqui when no AudioPath is passed by
* embedded: add more models
* Add usage
* Reorder table
Parent: 0843fe6c65
Commit: e19d7226f8
@@ -15,7 +15,6 @@ ENV BUILD_TYPE=${BUILD_TYPE}

 ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh"

-ENV GALLERIES='[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]'
 ARG GO_TAGS="stablediffusion tinydream tts"

 RUN apt-get update && \
@@ -64,12 +63,12 @@ RUN curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmo
 echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list && \
 echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list && \
 apt-get update && \
-apt-get install -y conda
+apt-get install -y conda && apt-get clean

 ENV PATH="/root/.cargo/bin:${PATH}"
 RUN pip install --upgrade pip
 RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
-RUN apt-get install -y espeak-ng espeak
+RUN apt-get install -y espeak-ng espeak && apt-get clean

 ###################################
 ###################################
@@ -127,10 +126,11 @@ ARG CUDA_MAJOR_VERSION=11
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
 ENV NVIDIA_VISIBLE_DEVICES=all
+ENV PIP_CACHE_PURGE=true

 # Add FFmpeg
 RUN if [ "${FFMPEG}" = "true" ]; then \
-    apt-get install -y ffmpeg \
+    apt-get install -y ffmpeg && apt-get clean \
     ; fi

 WORKDIR /build
@@ -55,6 +55,9 @@ type Config struct {
     CUDA bool `yaml:"cuda"`

     DownloadFiles []File `yaml:"download_files"`
+
+    Description string `yaml:"description"`
+    Usage string `yaml:"usage"`
 }

 type File struct {
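The two new fields are what the embedded model files added later in this commit populate. As a minimal sketch of how a model configuration could use them (the name, backend, and text below are illustrative placeholders, not taken from the commit):

```yaml
# Hypothetical model config illustrating the new description/usage fields.
name: my-model              # placeholder model name
backend: llama              # placeholder backend
parameters:
  model: my-model.gguf      # placeholder weights file
description: |
  Short human-readable summary, logged when the model is preloaded.
usage: |
  curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
    "model": "my-model",
    "messages": [{"role": "user", "content": "Hello"}]
  }'
```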
@@ -326,6 +329,15 @@ func (cm *ConfigLoader) Preload(modelPath string) error {
             c.PredictionOptions.Model = md5Name
             cm.configs[i] = *c
         }
+        if cm.configs[i].Name != "" {
+            log.Info().Msgf("Model name: %s", cm.configs[i].Name)
+        }
+        if cm.configs[i].Description != "" {
+            log.Info().Msgf("Model description: %s", cm.configs[i].Description)
+        }
+        if cm.configs[i].Usage != "" {
+            log.Info().Msgf("Model usage: \n%s", cm.configs[i].Usage)
+        }
     }
     return nil
 }
@@ -13,3 +13,12 @@ if conda_env_exists "transformers" ; then
 else
     echo "Virtual environment already exists."
 fi
+
+if [ "$PIP_CACHE_PURGE" = true ] ; then
+    export PATH=$PATH:/opt/conda/bin
+
+    # Activate conda environment
+    source activate transformers
+
+    pip cache purge
+fi
@@ -21,7 +21,7 @@ _ONE_DAY_IN_SECONDS = 60 * 60 * 24

 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
-COQUI_LANGUAGE = os.environ.get('COQUI_LANGUAGE', 'en')
+COQUI_LANGUAGE = os.environ.get('COQUI_LANGUAGE', None)

 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
@@ -38,6 +38,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         if not torch.cuda.is_available() and request.CUDA:
             return backend_pb2.Result(success=False, message="CUDA is not available")

+        self.AudioPath = None
         # List available 🐸TTS models
         print(TTS().list_models())
         if os.path.isabs(request.AudioPath):
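With the `en` fallback removed, the coqui backend now picks up a synthesis language only when `COQUI_LANGUAGE` is explicitly set in its environment. A hedged example of pinning it at container start (this assumes, as with other LocalAI settings, that environment variables passed to the container reach the backend process):

```sh
# Illustrative only: force English synthesis for the coqui embedded model.
docker run -ti -p 8080:8080 -e COQUI_LANGUAGE=en localai/localai:{{< version >}}-ffmpeg coqui
```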
@@ -12,4 +12,8 @@ echo $CONDA_PREFIX

 git clone https://github.com/turboderp/exllama $CONDA_PREFIX/exllama && pushd $CONDA_PREFIX/exllama && pip install -r requirements.txt && popd

 cp -rfv $CONDA_PREFIX/exllama/* ./
+
+if [ "$PIP_CACHE_PURGE" = true ] ; then
+    pip cache purge
+fi
@@ -11,4 +11,8 @@ echo $CONDA_PREFIX

 git clone https://github.com/turboderp/exllamav2 $CONDA_PREFIX/exllamav2 && pushd $CONDA_PREFIX/exllamav2 && pip install -r requirements.txt && popd

 cp -rfv $CONDA_PREFIX/exllamav2/* ./
+
+if [ "$PIP_CACHE_PURGE" = true ] ; then
+    pip cache purge
+fi
@@ -12,4 +12,8 @@ echo $CONDA_PREFIX

 git clone https://github.com/Plachtaa/VALL-E-X.git $CONDA_PREFIX/vall-e-x && pushd $CONDA_PREFIX/vall-e-x && git checkout -b build $SHA && pip install -r requirements.txt && popd

 cp -rfv $CONDA_PREFIX/vall-e-x/* ./
+
+if [ "$PIP_CACHE_PURGE" = true ] ; then
+    pip cache purge
+fi
@@ -143,39 +143,60 @@ Note: this feature currently is available only on master builds.
 You can run `local-ai` directly with a model name, and it will download the model and start the API with the model loaded.

 > Don't need GPU acceleration? use the CPU images which are lighter and do not have Nvidia dependencies
+> To know which version of CUDA do you have available, you can check with `nvidia-smi` or `nvcc --version`

 {{< tabs >}}
 {{% tab name="CPU-only" %}}

-| Model | Docker command |
-| --- | --- |
-| phi2 | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core phi-2``` |
-| llava | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava``` |
-| mistral-openorca | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core mistral-openorca``` |
+| Model | Category | Docker command |
+| --- | --- | --- |
+| [phi-2](https://huggingface.co/microsoft/phi-2) | LLM | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core phi-2``` |
+| [llava](https://github.com/SkunkworksAI/BakLLaVA) | Multimodal LLM | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava``` |
+| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | LLM | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core mistral-openorca``` |
+| [bert-cpp](https://github.com/skeskinen/bert.cpp) | Embeddings | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core bert-cpp``` |
+| all-minilm-l6-v2 | Embeddings | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg all-minilm-l6-v2``` |
+| whisper-base | Audio to Text | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core whisper-base``` |
+| rhasspy-voice-en-us-amy | Text to Audio | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core rhasspy-voice-en-us-amy``` |
+| coqui | Text to Audio | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg coqui``` |
+| bark | Text to Audio | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg bark``` |
+| vall-e-x | Text to Audio | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg vall-e-x``` |

 {{% /tab %}}
 {{% tab name="GPU (CUDA 11)" %}}

-> To know which version of CUDA do you have available, you can check with `nvidia-smi` or `nvcc --version`
-
-| Model | Docker command |
-| --- | --- |
-| phi-2 | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core phi-2``` |
-| llava | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core llava``` |
-| mistral-openorca | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core mistral-openorca``` |
+| Model | Category | Docker command |
+| --- | --- | --- |
+| [phi-2](https://huggingface.co/microsoft/phi-2) | LLM | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core phi-2``` |
+| [llava](https://github.com/SkunkworksAI/BakLLaVA) | Multimodal LLM | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core llava``` |
+| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | LLM | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core mistral-openorca``` |
+| [bert-cpp](https://github.com/skeskinen/bert.cpp) | Embeddings | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core bert-cpp``` |
+| [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | Embeddings | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 all-minilm-l6-v2``` |
+| whisper-base | Audio to Text | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core whisper-base``` |
+| rhasspy-voice-en-us-amy | Text to Audio | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core rhasspy-voice-en-us-amy``` |
+| coqui | Text to Audio | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 coqui``` |
+| bark | Text to Audio | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 bark``` |
+| vall-e-x | Text to Audio | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 vall-e-x``` |

 {{% /tab %}}


 {{% tab name="GPU (CUDA 12)" %}}

-> To know which version of CUDA do you have available, you can check with `nvidia-smi` or `nvcc --version`
-
-| Model | Docker command |
-| --- | --- |
-| phi-2 | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core phi-2``` |
-| llava | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core llava``` |
-| mistral-openorca | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core mistral-openorca``` |
+| Model | Category | Docker command |
+| --- | --- | --- |
+| [phi-2](https://huggingface.co/microsoft/phi-2) | LLM | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core phi-2``` |
+| [llava](https://github.com/SkunkworksAI/BakLLaVA) | Multimodal LLM | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core llava``` |
+| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | LLM | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core mistral-openorca``` |
+| bert-cpp | Embeddings | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core bert-cpp``` |
+| all-minilm-l6-v2 | Embeddings | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 all-minilm-l6-v2``` |
+| whisper-base | Audio to Text | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core whisper-base``` |
+| rhasspy-voice-en-us-amy | Text to Audio | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core rhasspy-voice-en-us-amy``` |
+| coqui | Text to Audio | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 coqui``` |
+| bark | Text to Audio | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 bark``` |
+| vall-e-x | Text to Audio | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 vall-e-x``` |

 {{% /tab %}}
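As a quick end-to-end check of the tables above (illustrative only, not part of the commit: it assumes the embedded phi-2 model registers under the name `phi-2`, that the API listens on port 8080, and mirrors the chat-completions call used in the embedded mistral-openorca config):

```sh
# Start LocalAI with the embedded phi-2 model (CPU image)
docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core phi-2

# From another shell, send a test chat completion
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
  "model": "phi-2",
  "messages": [{"role": "user", "content": "How are you doing?"}]
}'
```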
@@ -201,7 +222,7 @@ For example, to start localai with phi-2, it's possible for instance to also use
 docker run -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core https://gist.githubusercontent.com/mudler/ad601a0488b497b69ec549150d9edd18/raw/a8a8869ef1bb7e3830bf5c0bae29a0cce991ff8d/phi-2.yaml
 ```

-The file should be a valid YAML configuration file, for the full syntax see [advanced]({{%relref "advanced" %}}).
+The file should be a valid LocalAI YAML configuration file, for the full syntax see [advanced]({{%relref "advanced" %}}).
 {{% /notice %}}

 ### Container images
@@ -43,15 +43,18 @@ Besides llama based models, LocalAI is compatible also with other architectures.
 | [langchain-huggingface](https://github.com/tmc/langchaingo) | Any text generators available on HuggingFace through API | yes | GPT | no | no | N/A |
 | [piper](https://github.com/rhasspy/piper) ([binding](https://github.com/mudler/go-piper)) | Any piper onnx model | no | Text to voice | no | no | N/A |
 | [falcon](https://github.com/cmp-nct/ggllm.cpp/tree/c12b2d65f732a0d8846db2244e070f0f3e73505c) ([binding](https://github.com/mudler/go-ggllm.cpp)) | Falcon *** | yes | GPT | no | yes | CUDA |
-| `huggingface-embeddings` [sentence-transformers](https://github.com/UKPLab/sentence-transformers) | BERT | no | Embeddings only | yes | no | N/A |
+| [sentencetransformers](https://github.com/UKPLab/sentence-transformers) | BERT | no | Embeddings only | yes | no | N/A |
 | `bark` | bark | no | Audio generation | no | no | yes |
-| `AutoGPTQ` | GPTQ | yes | GPT | yes | no | N/A |
+| `autogptq` | GPTQ | yes | GPT | yes | no | N/A |
 | `exllama` | GPTQ | yes | GPT only | no | no | N/A |
 | `diffusers` | SD,... | no | Image generation | no | no | N/A |
 | `vall-e-x` | Vall-E | no | Audio generation and Voice cloning | no | no | CPU/CUDA |
 | `vllm` | Various GPTs and quantization formats | yes | GPT | no | no | CPU/CUDA |
 | `exllama2` | GPTQ | yes | GPT only | no | no | N/A |
 | `transformers-musicgen` | | no | Audio generation | no | no | N/A |
+| [tinydream](https://github.com/symisc/tiny-dream#tiny-dreaman-embedded-header-only-stable-diffusion-inference-c-librarypixlabiotiny-dream) | stablediffusion | no | Image | no | no | N/A |
+| `coqui` | Coqui | no | Audio generation and Voice cloning | no | no | CPU/CUDA |
+| `petals` | Various GPTs and quantization formats | yes | GPT | no | no | CPU/CUDA |

 Note: any backend name listed above can be used in the `backend` field of the model configuration file (See [the advanced section]({{%relref "advanced" %}})).
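To make the note above concrete, selecting one of these backends is a one-line `backend` entry in a model configuration. A minimal sketch modelled on the embedded configs added below (the name is a placeholder; the model value is the one referenced in the embedded coqui usage example):

```yaml
# Hypothetical config: route a TTS model through the coqui backend.
name: my-tts-voice
backend: coqui
parameters:
  model: tts_models/en/ljspeech/glow-tts
```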
embedded/models/all-minilm-l6-v2.yaml (new file, 13 lines)
@@ -0,0 +1,13 @@
+name: all-minilm-l6-v2
+backend: sentencetransformers
+embeddings: true
+parameters:
+  model: all-MiniLM-L6-v2
+
+usage: |
+  You can test this model with curl like this:
+
+  curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
+    "input": "Your text string goes here",
+    "model": "all-minilm-l6-v2"
+  }'
embedded/models/bark.yaml (new file, 8 lines)
@@ -0,0 +1,8 @@
+usage: |
+  bark works without any configuration, to test it, you can run the following curl command:
+
+  curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
+    "backend": "bark",
+    "input":"Hello, this is a test!"
+  }' | aplay
+# TODO: This is a placeholder until we manage to pre-load HF/Transformers models
embedded/models/bert-cpp.yaml (new file, 23 lines)
@@ -0,0 +1,23 @@
+backend: bert-embeddings
+embeddings: true
+f16: true
+
+gpu_layers: 90
+mmap: true
+name: bert-cpp-minilm-v6
+
+parameters:
+  model: bert-MiniLM-L6-v2q4_0.bin
+
+download_files:
+- filename: "bert-MiniLM-L6-v2q4_0.bin"
+  sha256: "a5a174d8772c8a569faf9f3136c441f2c3855b5bf35ed32274294219533feaad"
+  uri: "https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin"
+
+usage: |
+  You can test this model with curl like this:
+
+  curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
+    "input": "Your text string goes here",
+    "model": "bert-cpp-minilm-v6"
+  }'
embedded/models/coqui.yaml (new file, 9 lines)
@@ -0,0 +1,9 @@
+usage: |
+  coqui works without any configuration, to test it, you can run the following curl command:
+
+  curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
+    "backend": "coqui",
+    "model": "tts_models/en/ljspeech/glow-tts",
+    "input":"Hello, this is a test!"
+  }'
+# TODO: This is a placeholder until we manage to pre-load HF/Transformers models
@@ -28,4 +28,9 @@ download_files:
 - filename: bakllava.gguf
   uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
 - filename: bakllava-mmproj.gguf
   uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
+
+usage: |
+  curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+    "model": "llava",
+    "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
@@ -21,3 +21,9 @@ context_size: 4096
 f16: true
 stopwords:
 - <|im_end|>
+
+usage: |
+  curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+    "model": "mistral-openorca",
+    "messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
+  }'
embedded/models/rhasspy-voice-en-us-amy.yaml (new file, 13 lines)
@@ -0,0 +1,13 @@
+name: voice-en-us-amy-low
+download_files:
+- filename: voice-en-us-amy-low.tar.gz
+  uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
+
+
+usage: |
+  To test if this model works as expected, you can use the following curl command:
+
+  curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
+    "model":"en-us-amy-low.onnx",
+    "input": "Hi, this is a test."
+  }'
embedded/models/vall-e-x.yaml (new file, 8 lines)
@@ -0,0 +1,8 @@
+usage: |
+  Vall-e-x works without any configuration, to test it, you can run the following curl command:
+
+  curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
+    "backend": "vall-e-x",
+    "input":"Hello, this is a test!"
+  }' | aplay
+# TODO: This is a placeholder until we manage to pre-load HF/Transformers models
embedded/models/whisper-base.yaml (new file, 18 lines)
@@ -0,0 +1,18 @@
+name: whisper
+backend: whisper
+parameters:
+  model: ggml-whisper-base.bin
+
+usage: |
+  ## example audio file
+  wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
+
+  ## Send the example audio file to the transcriptions endpoint
+  curl http://localhost:8080/v1/audio/transcriptions \
+    -H "Content-Type: multipart/form-data" \
+    -F file="@$PWD/gb1.ogg" -F model="whisper"
+
+download_files:
+- filename: "ggml-whisper-base.bin"
+  sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"
+  uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"
@@ -50,7 +50,7 @@ func (c *Client) setBusy(v bool) {
     c.Unlock()
 }

-func (c *Client) HealthCheck(ctx context.Context) bool {
+func (c *Client) HealthCheck(ctx context.Context) (bool, error) {
     if !c.parallel {
         c.opMutex.Lock()
         defer c.opMutex.Unlock()
@@ -59,8 +59,7 @@ func (c *Client) HealthCheck(ctx context.Context) bool {
     defer c.setBusy(false)
     conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
     if err != nil {
-        fmt.Println(err)
-        return false
+        return false, err
     }
     defer conn.Close()
     client := pb.NewBackendClient(conn)
@@ -71,15 +70,14 @@ func (c *Client) HealthCheck(ctx context.Context) bool {

     res, err := client.Health(ctx, &pb.HealthMessage{})
     if err != nil {
-        fmt.Println(err)
-
-        return false
+        return false, err
     }

     if string(res.Message) == "OK" {
-        return true
+        return true, nil
     }
-    return false
+
+    return false, fmt.Errorf("health check failed: %s", res.Message)
 }

 func (c *Client) Embeddings(ctx context.Context, in *pb.PredictOptions, opts ...grpc.CallOption) (*pb.EmbeddingResult, error) {
@@ -131,11 +131,15 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
         // Wait for the service to start up
         ready := false
         for i := 0; i < o.grpcAttempts; i++ {
-            if client.GRPC(o.parallelRequests, ml.wd).HealthCheck(context.Background()) {
+            alive, err := client.GRPC(o.parallelRequests, ml.wd).HealthCheck(context.Background())
+            if alive {
                 log.Debug().Msgf("GRPC Service Ready")
                 ready = true
                 break
             }
+            if err != nil && i == o.grpcAttempts-1 {
+                log.Error().Msgf("Failed starting/connecting to the gRPC service: %s", err.Error())
+            }
             time.Sleep(time.Duration(o.grpcAttemptsDelay) * time.Second)
         }

|
|||||||
func (ml *ModelLoader) BackendLoader(opts ...Option) (client *grpc.Client, err error) {
|
func (ml *ModelLoader) BackendLoader(opts ...Option) (client *grpc.Client, err error) {
|
||||||
o := NewOptions(opts...)
|
o := NewOptions(opts...)
|
||||||
|
|
||||||
log.Info().Msgf("Loading model '%s' with backend %s", o.model, o.backendString)
|
if o.model != "" {
|
||||||
|
log.Info().Msgf("Loading model '%s' with backend %s", o.model, o.backendString)
|
||||||
|
} else {
|
||||||
|
log.Info().Msgf("Loading model with backend %s", o.backendString)
|
||||||
|
}
|
||||||
|
|
||||||
backend := strings.ToLower(o.backendString)
|
backend := strings.ToLower(o.backendString)
|
||||||
if realBackend, exists := Aliases[backend]; exists {
|
if realBackend, exists := Aliases[backend]; exists {
|
||||||
@ -239,7 +247,10 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (*grpc.Client, error) {
|
|||||||
for _, b := range o.externalBackends {
|
for _, b := range o.externalBackends {
|
||||||
allBackendsToAutoLoad = append(allBackendsToAutoLoad, b)
|
allBackendsToAutoLoad = append(allBackendsToAutoLoad, b)
|
||||||
}
|
}
|
||||||
log.Info().Msgf("Loading model '%s' greedly from all the available backends: %s", o.model, strings.Join(allBackendsToAutoLoad, ", "))
|
|
||||||
|
if o.model != "" {
|
||||||
|
log.Info().Msgf("Trying to load the model '%s' with all the available backends: %s", o.model, strings.Join(allBackendsToAutoLoad, ", "))
|
||||||
|
}
|
||||||
|
|
||||||
for _, b := range allBackendsToAutoLoad {
|
for _, b := range allBackendsToAutoLoad {
|
||||||
log.Info().Msgf("[%s] Attempting to load", b)
|
log.Info().Msgf("[%s] Attempting to load", b)
|
||||||
|
@@ -171,9 +171,10 @@ func (ml *ModelLoader) CheckIsLoaded(s string) ModelAddress {
         } else {
             client = m.GRPC(false, ml.wd)
         }
-        if !client.HealthCheck(context.Background()) {
-            log.Debug().Msgf("GRPC Model not responding: %s", s)
+        alive, err := client.HealthCheck(context.Background())
+        if !alive {
+            log.Warn().Msgf("GRPC Model not responding: %s", err.Error())
+            log.Warn().Msgf("Deleting the process in order to recreate it")
             if !ml.grpcProcesses[s].IsAlive() {
                 log.Debug().Msgf("GRPC Process is not responding: %s", s)
                 // stop and delete the process, this forces to re-load the model and re-create again the service