From 887b3dff0486097f757b2414e941cbed315f6cff Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Fri, 8 Dec 2023 15:45:04 +0100
Subject: [PATCH] feat: cuda transformers (#1401)

* Use cuda in transformers if available

tensorflow probably needs a different check.

Signed-off-by: Erich Schubert

* feat: expose CUDA at top level

Signed-off-by: Ettore Di Giacinto

* tests: add to tests and create workflow for py extra backends

* doc: update note on how to use core images

---------

Signed-off-by: Erich Schubert
Signed-off-by: Ettore Di Giacinto
Co-authored-by: Erich Schubert
---
 .github/workflows/test-extra.yml              | 75 +++++++++++++++++++
 Makefile                                      |  5 ++
 api/backend/image.go                          |  2 +-
 api/backend/options.go                        |  1 +
 api/config/config.go                          |  5 +-
 ...formers.py => test_transformers_server.py} |  9 ++-
 .../transformers/transformers_server.py       | 43 +++++++++-
 docs/content/advanced/_index.en.md            | 33 +++++++-
 docs/content/getting_started/_index.en.md     |  1 +
 9 files changed, 163 insertions(+), 11 deletions(-)
 create mode 100644 .github/workflows/test-extra.yml
 rename backend/python/transformers/{test_transformers.py => test_transformers_server.py} (93%)

diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml
new file mode 100644
index 00000000..36621386
--- /dev/null
+++ b/.github/workflows/test-extra.yml
@@ -0,0 +1,75 @@
+---
+name: 'Tests extras backends'
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+    tags:
+      - '*'
+
+concurrency:
+  group: ci-tests-extra-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
+jobs:
+  tests-linux:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Release space from worker
+        run: |
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          df -h
+          echo
+          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
+          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+          sudo rm -rf /usr/local/lib/android
+          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+          sudo rm -rf /usr/share/dotnet
+          sudo apt-get remove -y '^mono-.*' || true
+          sudo apt-get remove -y '^ghc-.*' || true
+          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+          sudo apt-get remove -y 'php.*' || true
+          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+          sudo apt-get remove -y '^google-.*' || true
+          sudo apt-get remove -y azure-cli || true
+          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+          sudo apt-get remove -y '^gfortran-.*' || true
+          sudo apt-get autoremove -y
+          sudo apt-get clean
+          echo
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          sudo rm -rfv build || true
+          df -h
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+          sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+          gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+          sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
+          sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
+          sudo apt-get update && \
+          sudo apt-get install -y conda
+          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
+
+          sudo rm -rfv /usr/bin/conda || true
+
+      - name: Test
+        run: |
+          PATH=$PATH:/opt/conda/bin make test-extra
+          
\ No newline at end of file
diff --git a/Makefile b/Makefile
index db4e1940..1b8ad7af 100644
--- a/Makefile
+++ b/Makefile
@@ -414,6 +414,11 @@ prepare-extra-conda-environments:
 	$(MAKE) -C backend/python/petals
 	$(MAKE) -C backend/python/exllama2
 
+prepare-test-extra:
+	$(MAKE) -C backend/python/transformers
+
+test-extra: prepare-test-extra
+	$(MAKE) -C backend/python/transformers test
 
 backend-assets/grpc:
 	mkdir -p backend-assets/grpc
diff --git a/api/backend/image.go b/api/backend/image.go
index d21b9803..75e145b4 100644
--- a/api/backend/image.go
+++ b/api/backend/image.go
@@ -16,7 +16,7 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
 		model.WithContext(o.Context),
 		model.WithModel(c.Model),
 		model.WithLoadGRPCLoadModelOpts(&proto.ModelOptions{
-			CUDA:          c.Diffusers.CUDA,
+			CUDA:          c.CUDA,
 			SchedulerType: c.Diffusers.SchedulerType,
 			PipelineType:  c.Diffusers.PipelineType,
 			CFGScale:      c.Diffusers.CFGScale,
diff --git a/api/backend/options.go b/api/backend/options.go
index c83cb92b..3266d602 100644
--- a/api/backend/options.go
+++ b/api/backend/options.go
@@ -46,6 +46,7 @@ func gRPCModelOpts(c config.Config) *pb.ModelOptions {
 		Seed:         int32(c.Seed),
 		NBatch:       int32(b),
 		NoMulMatQ:    c.NoMulMatQ,
+		CUDA:         c.CUDA, // diffusers, transformers
 		DraftModel:   c.DraftModel,
 		AudioPath:    c.VallE.AudioPath,
 		Quantization: c.Quantization,
diff --git a/api/config/config.go b/api/config/config.go
index c16d63b6..39f572f2 100644
--- a/api/config/config.go
+++ b/api/config/config.go
@@ -46,6 +46,10 @@ type Config struct {
 
 	// Vall-e-x
 	VallE VallE `yaml:"vall-e"`
+
+	// CUDA
+	// Explicitly enable CUDA or not (some backends might need it)
+	CUDA bool `yaml:"cuda"`
 }
 
 type VallE struct {
@@ -67,7 +71,6 @@ type GRPC struct {
 type Diffusers struct {
 	PipelineType     string  `yaml:"pipeline_type"`
 	SchedulerType    string  `yaml:"scheduler_type"`
-	CUDA             bool    `yaml:"cuda"`
 	EnableParameters string  `yaml:"enable_parameters"` // A list of comma separated parameters to specify
 	CFGScale         float32 `yaml:"cfg_scale"`          // Classifier-Free Guidance Scale
 	IMG2IMG          bool    `yaml:"img2img"`            // Image to Image Diffuser
diff --git a/backend/python/transformers/test_transformers.py b/backend/python/transformers/test_transformers_server.py
similarity index 93%
rename from backend/python/transformers/test_transformers.py
rename to backend/python/transformers/test_transformers_server.py
index e52022ca..ac4fac28 100644
--- a/backend/python/transformers/test_transformers.py
+++ b/backend/python/transformers/test_transformers_server.py
@@ -31,7 +31,7 @@ class TestBackendServicer(unittest.TestCase):
         """
         This method tests if the server starts up successfully
         """
-        time.sleep(2)
+        time.sleep(10)
         try:
             self.setUp()
             with grpc.insecure_channel("localhost:50051") as channel:
@@ -48,11 +48,12 @@ class TestBackendServicer(unittest.TestCase):
         """
         This method tests if the model is loaded successfully
         """
+        time.sleep(10)
         try:
             self.setUp()
             with grpc.insecure_channel("localhost:50051") as channel:
                 stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="bert-base-nli-mean-tokens"))
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="bert-base-cased"))
                 self.assertTrue(response.success)
                 self.assertEqual(response.message, "Model loaded successfully")
         except Exception as err:
@@ -65,11 +66,13 @@ class TestBackendServicer(unittest.TestCase):
         """
         This method tests if the embeddings are generated successfully
         """
+        time.sleep(10)
         try:
             self.setUp()
             with grpc.insecure_channel("localhost:50051") as channel:
                 stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="bert-base-nli-mean-tokens"))
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="bert-base-cased"))
+                print(response.message)
                 self.assertTrue(response.success)
                 embedding_request = backend_pb2.PredictOptions(Embeddings="This is a test sentence.")
                 embedding_response = stub.Embedding(embedding_request)
diff --git a/backend/python/transformers/transformers_server.py b/backend/python/transformers/transformers_server.py
index 7d8d872b..e87e75cf 100755
--- a/backend/python/transformers/transformers_server.py
+++ b/backend/python/transformers/transformers_server.py
@@ -14,14 +14,27 @@ import backend_pb2
 import backend_pb2_grpc
 
 import grpc
+import torch
 
-from transformers import AutoModel
+from transformers import AutoTokenizer, AutoModel
 
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 
 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
 
+
+def mean_pooling(model_output, attention_mask):
+    """
+    Mean pooling to get sentence embeddings. See:
+    https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1
+    """
+    token_embeddings = model_output[0]
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)  # Sum columns
+    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+    return sum_embeddings / sum_mask
+
 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
     """
@@ -56,9 +69,19 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         model_name = request.Model
         try:
             self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True)  # trust_remote_code is needed to use the encode method with embeddings models like jinai-v2
+            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+            if request.CUDA:
+                try:
+                    # TODO: also tensorflow, make configurable
+                    import torch.cuda
+                    if torch.cuda.is_available():
+                        print("Loading model", model_name, "to CUDA.", file=sys.stderr)
+                        self.model = self.model.to("cuda")
+                except Exception as err:
+                    print("Not using CUDA:", err, file=sys.stderr)
         except Exception as err:
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
-        # Implement your logic here for the LoadModel service
         # Replace this with your desired response
         return backend_pb2.Result(message="Model loaded successfully", success=True)
 
@@ -74,10 +97,20 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 
         Returns:
             An EmbeddingResult object that contains the calculated embeddings.
         """
-        # Implement your logic here for the Embedding service
-        # Replace this with your desired response
+
+        # Tokenize input
+        max_length = 512
+        if request.Tokens != 0:
+            max_length = request.Tokens
+        encoded_input = self.tokenizer(request.Embeddings, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
+
+        # Create word embeddings
+        model_output = self.model(**encoded_input)
+
+        # Pool to get sentence embeddings; i.e. generate one 1024-dimensional vector for the entire sentence
+        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']).detach().numpy()
         print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr)
-        sentence_embeddings = self.model.encode(request.Embeddings)
+        print("Embeddings:", sentence_embeddings, file=sys.stderr)
         return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings)
 
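The mean pooling added above is the usual sentence-transformers recipe: average the per-token embeddings, weighted by the attention mask. The snippet below is a minimal standalone sketch of that step; the model name and sentence are illustrative (the model mirrors the one used in the tests), and only the pooling logic follows the code introduced in this patch.

```python
# Minimal sketch of attention-mask mean pooling over transformer token embeddings.
# Assumes torch and transformers are installed; "bert-base-cased" is the model used in the tests.
import torch
from transformers import AutoTokenizer, AutoModel


def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # (batch, tokens, hidden)
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    # Sum token vectors where the mask is 1, then divide by the number of real tokens.
    return torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)


tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModel.from_pretrained("bert-base-cased")

encoded = tokenizer("This is a test sentence.", padding=True, truncation=True,
                    max_length=512, return_tensors="pt")
with torch.no_grad():
    output = model(**encoded)

sentence_embedding = mean_pooling(output, encoded["attention_mask"])
print(sentence_embedding.shape)  # one vector per input sentence, e.g. torch.Size([1, 768])
```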
""" - # Implement your logic here for the Embedding service - # Replace this with your desired response + + # Tokenize input + max_length = 512 + if request.Tokens != 0: + max_length = request.Tokens + encoded_input = self.tokenizer(request.Embeddings, padding=True, truncation=True, max_length=max_length, return_tensors="pt") + + # Create word embeddings + model_output = self.model(**encoded_input) + + # Pool to get sentence embeddings; i.e. generate one 1024 vector for the entire sentence + sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']).detach().numpy() print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr) - sentence_embeddings = self.model.encode(request.Embeddings) + print("Embeddings:", sentence_embeddings, file=sys.stderr) return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings) diff --git a/docs/content/advanced/_index.en.md b/docs/content/advanced/_index.en.md index 5f6332ee..dd521996 100644 --- a/docs/content/advanced/_index.en.md +++ b/docs/content/advanced/_index.en.md @@ -207,6 +207,9 @@ lora_adapter: "/path/to/lora/adapter" lora_base: "/path/to/lora/base" # Disable mulmatq (CUDA) no_mulmatq: true + +# Diffusers/transformers +cuda: true ``` ### Prompt templates @@ -363,4 +366,32 @@ You can control the backends that are built by setting the `GRPC_BACKENDS` envir make GRPC_BACKENDS=backend-assets/grpc/llama-cpp build ``` -By default, all the backends are built. \ No newline at end of file +By default, all the backends are built. + +### Extra backends + +LocalAI can be extended with extra backends. The backends are implemented as `gRPC` services and can be written in any language. The container images that are built and published on [quay.io](https://quay.io/repository/go-skynet/local-ai?tab=tags) contain a set of images split in core and extra. By default Images bring all the dependencies and backends supported by LocalAI (we call those `extra` images). The `-core` images instead bring only the strictly necessary dependencies to run LocalAI without only a core set of backends. + +If you wish to build a custom container image with extra backends, you can use the core images and build only the backends you are interested into. For instance, to use the diffusers backend: + +```Dockerfile +FROM quay.io/go-skynet/local-ai:master-ffmpeg-core + +RUN PATH=$PATH:/opt/conda/bin make -C backend/python/diffusers +``` + +Remember also to set the `EXTERNAL_GRPC_BACKENDS` environment variable (or `--external-grpc-backends` as CLI flag) to point to the backends you are using (`EXTERNAL_GRPC_BACKENDS="backend_name:/path/to/backend"`), for example with diffusers: + +```Dockerfile +FROM quay.io/go-skynet/local-ai:master-ffmpeg-core + +RUN PATH=$PATH:/opt/conda/bin make -C backend/python/diffusers + +ENV EXTERNAL_GRPC_BACKENDS="diffusers:/build/backend/python/diffusers/run.sh" +``` + +{{% notice note %}} + +You can specify remote external backends or path to local files. The syntax is `backend-name:/path/to/backend` or `backend-name:host:port`. + +{{% /notice %}} diff --git a/docs/content/getting_started/_index.en.md b/docs/content/getting_started/_index.en.md index c5986be2..255bd625 100644 --- a/docs/content/getting_started/_index.en.md +++ b/docs/content/getting_started/_index.en.md @@ -178,6 +178,7 @@ You can control LocalAI with command line arguments, to specify a binding addres | --watchdog-busy-timeout value | $WATCHDOG_BUSY_TIMEOUT | 5m | Watchdog timeout. This will restart the backend if it crashes. 
diff --git a/docs/content/getting_started/_index.en.md b/docs/content/getting_started/_index.en.md
index c5986be2..255bd625 100644
--- a/docs/content/getting_started/_index.en.md
+++ b/docs/content/getting_started/_index.en.md
@@ -178,6 +178,7 @@ You can control LocalAI with command line arguments, to specify a binding addres
 | --watchdog-busy-timeout value | $WATCHDOG_BUSY_TIMEOUT | 5m | Watchdog timeout. This will restart the backend if it crashes. |
 | --watchdog-idle-timeout value | $WATCHDOG_IDLE_TIMEOUT | 15m | Watchdog idle timeout. This will restart the backend if it crashes. |
 | --preload-backend-only | $PRELOAD_BACKEND_ONLY | false | If set, the api is NOT launched, and only the preloaded models / backends are started. This is intended for multi-node setups. |
+| --external-grpc-backends | $EXTERNAL_GRPC_BACKENDS | none | Comma-separated list of external gRPC backends to use. Format: `name:host:port` or `name:/path/to/file` |
 
 ### Container images
 
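Taken together, the behavior this patch enables is: a model config with `cuda: true` is forwarded to the backend as `request.CUDA`, and the transformers backend moves the model to the GPU only when CUDA is actually available, falling back to CPU otherwise. The sketch below condenses that decision; `load_model` is an illustrative helper under those assumptions, not a function from the patch.

```python
# Condensed sketch of the opt-in CUDA path added to LoadModel in transformers_server.py.
import sys

import torch
from transformers import AutoModel


def load_model(model_name: str, cuda_requested: bool):
    # trust_remote_code mirrors the server, which needs it for some embeddings models.
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
    if cuda_requested:
        try:
            if torch.cuda.is_available():
                print("Loading model", model_name, "to CUDA.", file=sys.stderr)
                model = model.to("cuda")
        except Exception as err:
            # Keep serving on CPU if the GPU cannot be used.
            print("Not using CUDA:", err, file=sys.stderr)
    return model
```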