From dd982acf2cc61cc322aaf84c8ecad3a4b0ed113e Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 15 Dec 2023 18:06:20 -0500 Subject: [PATCH] feat(img2vid,txt2vid): Initial support for img2vid,txt2vid (#1442) * feat(img2vid): Initial support for img2vid * doc(SD): fix SDXL Example * Minor fixups for img2vid * docs(img2img): fix example curl call * feat(txt2vid): initial support Signed-off-by: Ettore Di Giacinto * diffusers: be retro-compatible with CUDA settings * docs(img2vid, txt2vid): examples * Add notice on docs --------- Signed-off-by: Ettore Di Giacinto --- api/backend/image.go | 2 +- api/config/config.go | 1 + api/openai/image.go | 53 ++++++++++++-- backend/python/diffusers/backend_diffusers.py | 46 ++++++++++-- docs/content/features/image-generation.md | 1 - docs/content/howtos/easy-setup-sd.md | 1 - docs/content/model-compatibility/diffusers.md | 73 ++++++++++++++++--- 7 files changed, 150 insertions(+), 27 deletions(-) diff --git a/api/backend/image.go b/api/backend/image.go index 2528611e..6183269f 100644 --- a/api/backend/image.go +++ b/api/backend/image.go @@ -16,7 +16,7 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat model.WithContext(o.Context), model.WithModel(c.Model), model.WithLoadGRPCLoadModelOpts(&proto.ModelOptions{ - CUDA: c.CUDA, + CUDA: c.CUDA || c.Diffusers.CUDA, SchedulerType: c.Diffusers.SchedulerType, PipelineType: c.Diffusers.PipelineType, CFGScale: c.Diffusers.CFGScale, diff --git a/api/config/config.go b/api/config/config.go index b50fcae2..84c1f784 100644 --- a/api/config/config.go +++ b/api/config/config.go @@ -68,6 +68,7 @@ type GRPC struct { } type Diffusers struct { + CUDA bool `yaml:"cuda"` PipelineType string `yaml:"pipeline_type"` SchedulerType string `yaml:"scheduler_type"` EnableParameters string `yaml:"enable_parameters"` // A list of comma separated parameters to specify diff --git a/api/openai/image.go b/api/openai/image.go index 2fe6c5a7..8f806275 100644 --- a/api/openai/image.go +++ b/api/openai/image.go @@ -5,6 +5,8 @@ import ( "encoding/base64" "encoding/json" "fmt" + "io" + "net/http" "os" "path/filepath" "strconv" @@ -22,6 +24,26 @@ import ( "github.com/rs/zerolog/log" ) +func downloadFile(url string) (string, error) { + // Get the data + resp, err := http.Get(url) + if err != nil { + return "", err + } + defer resp.Body.Close() + + // Create the file + out, err := os.CreateTemp("", "image") + if err != nil { + return "", err + } + defer out.Close() + + // Write the body to file + _, err = io.Copy(out, resp.Body) + return out.Name(), err +} + // https://platform.openai.com/docs/api-reference/images/create /* @@ -56,12 +78,31 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx src := "" if input.File != "" { - //base 64 decode the file and write it somewhere - // that we will cleanup - decoded, err := base64.StdEncoding.DecodeString(input.File) - if err != nil { - return err + + fileData := []byte{} + // check if input.File is an URL, if so download it and save it + // to a temporary file + if strings.HasPrefix(input.File, "http://") || strings.HasPrefix(input.File, "https://") { + out, err := downloadFile(input.File) + if err != nil { + return fmt.Errorf("failed downloading file:%w", err) + } + defer os.RemoveAll(out) + + fileData, err = os.ReadFile(out) + if err != nil { + return fmt.Errorf("failed reading file:%w", err) + } + + } else { + // base 64 decode the file and write it somewhere + // that we will cleanup + fileData, err = base64.StdEncoding.DecodeString(input.File) + if err != nil { + return err + } } + // Create a temporary file outputFile, err := os.CreateTemp(o.ImageDir, "b64") if err != nil { @@ -69,7 +110,7 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx } // write the base64 result writer := bufio.NewWriter(outputFile) - _, err = writer.Write(decoded) + _, err = writer.Write(fileData) if err != nil { outputFile.Close() return err diff --git a/backend/python/diffusers/backend_diffusers.py b/backend/python/diffusers/backend_diffusers.py index f8569584..c66b2476 100755 --- a/backend/python/diffusers/backend_diffusers.py +++ b/backend/python/diffusers/backend_diffusers.py @@ -18,9 +18,9 @@ import backend_pb2_grpc import grpc from diffusers import StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, EulerAncestralDiscreteScheduler -from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel +from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline from diffusers.pipelines.stable_diffusion import safety_checker -from diffusers.utils import load_image +from diffusers.utils import load_image,export_to_video from compel import Compel from transformers import CLIPTextModel @@ -31,6 +31,10 @@ _ONE_DAY_IN_SECONDS = 60 * 60 * 24 COMPEL=os.environ.get("COMPEL", "1") == "1" CLIPSKIP=os.environ.get("CLIPSKIP", "1") == "1" SAFETENSORS=os.environ.get("SAFETENSORS", "1") == "1" +CHUNK_SIZE=os.environ.get("CHUNK_SIZE", "8") +FPS=os.environ.get("FPS", "7") +DISABLE_CPU_OFFLOAD=os.environ.get("DISABLE_CPU_OFFLOAD", "0") == "1" +FRAMES=os.environ.get("FRAMES", "64") # If MAX_WORKERS are specified in the environment use it, otherwise default to 1 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) @@ -163,7 +167,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): modelFile = request.ModelFile fromSingleFile = request.Model.startswith("http") or request.Model.startswith("/") or local - + self.img2vid=False + self.txt2vid=False ## img2img if (request.PipelineType == "StableDiffusionImg2ImgPipeline") or (request.IMG2IMG and request.PipelineType == ""): if fromSingleFile: @@ -179,6 +184,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): self.pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(request.Model, torch_dtype=torchType, guidance_scale=cfg_scale) + ## img2vid + elif request.PipelineType == "StableVideoDiffusionPipeline": + self.img2vid=True + self.pipe = StableVideoDiffusionPipeline.from_pretrained( + request.Model, torch_dtype=torchType, variant=variant + ) + if not DISABLE_CPU_OFFLOAD: + self.pipe.enable_model_cpu_offload() ## text2img elif request.PipelineType == "AutoPipelineForText2Image" or request.PipelineType == "": self.pipe = AutoPipelineForText2Image.from_pretrained(request.Model, @@ -199,6 +212,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): self.pipe = DiffusionPipeline.from_pretrained(request.Model, torch_dtype=torchType, guidance_scale=cfg_scale) + elif request.PipelineType == "VideoDiffusionPipeline": + self.txt2vid=True + self.pipe = DiffusionPipeline.from_pretrained(request.Model, + torch_dtype=torchType, + guidance_scale=cfg_scale) elif request.PipelineType == "StableDiffusionXLPipeline": if fromSingleFile: self.pipe = StableDiffusionXLPipeline.from_single_file(modelFile, @@ -222,7 +240,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): if request.SchedulerType != "": self.pipe.scheduler = get_scheduler(request.SchedulerType, self.pipe.scheduler.config) - self.compel = Compel(tokenizer=self.pipe.tokenizer, text_encoder=self.pipe.text_encoder) + if not self.img2vid: + self.compel = Compel(tokenizer=self.pipe.tokenizer, text_encoder=self.pipe.text_encoder) if request.ControlNet: @@ -331,7 +350,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): "num_inference_steps": steps, } - if request.src != "" and not self.controlnet: + if request.src != "" and not self.controlnet and not self.img2vid: image = Image.open(request.src) options["image"] = image elif self.controlnet and request.src: @@ -359,6 +378,21 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): request.seed ) + if self.img2vid: + # Load the conditioning image + image = load_image(request.src) + image = image.resize((1024, 576)) + + generator = torch.manual_seed(request.seed) + frames = self.pipe(image, decode_chunk_size=CHUNK_SIZE, generator=generator).frames[0] + export_to_video(frames, request.dst, fps=FPS) + return backend_pb2.Result(message="Media generated successfully", success=True) + + if self.txt2vid: + video_frames = self.pipe(prompt, num_inference_steps=steps, num_frames=int(FRAMES)).frames + export_to_video(video_frames, request.dst) + return backend_pb2.Result(message="Media generated successfully", success=True) + image = {} if COMPEL: conditioning = self.compel.build_conditioning_tensor(prompt) @@ -377,7 +411,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): # save the result image.save(request.dst) - return backend_pb2.Result(message="Model loaded successfully", success=True) + return backend_pb2.Result(message="Media generated", success=True) def serve(address): server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)) diff --git a/docs/content/features/image-generation.md b/docs/content/features/image-generation.md index b7099368..efa98555 100644 --- a/docs/content/features/image-generation.md +++ b/docs/content/features/image-generation.md @@ -147,7 +147,6 @@ backend: diffusers # Force CPU usage - set to true for GPU f16: false diffusers: - pipeline_type: StableDiffusionXLPipeline cuda: false # Enable for GPU usage (CUDA) scheduler_type: euler_a ``` diff --git a/docs/content/howtos/easy-setup-sd.md b/docs/content/howtos/easy-setup-sd.md index 0e8735fc..4dd83505 100644 --- a/docs/content/howtos/easy-setup-sd.md +++ b/docs/content/howtos/easy-setup-sd.md @@ -15,7 +15,6 @@ backend: diffusers # Force CPU usage - set to true for GPU f16: false diffusers: - pipeline_type: StableDiffusionXLPipeline cuda: false # Enable for GPU usage (CUDA) scheduler_type: dpm_2_a ``` diff --git a/docs/content/model-compatibility/diffusers.md b/docs/content/model-compatibility/diffusers.md index 3cddbedd..fb076887 100644 --- a/docs/content/model-compatibility/diffusers.md +++ b/docs/content/model-compatibility/diffusers.md @@ -27,12 +27,9 @@ name: animagine-xl parameters: model: Linaqruf/animagine-xl backend: diffusers - -# Force CPU usage - set to true for GPU -f16: false +cuda: true +f16: true diffusers: - pipeline_type: StableDiffusionXLPipeline - cuda: false # Enable for GPU usage (CUDA) scheduler_type: euler_a ``` @@ -47,9 +44,9 @@ parameters: backend: diffusers step: 30 f16: true +cuda: true diffusers: pipeline_type: StableDiffusionPipeline - cuda: true enable_parameters: "negative_prompt,num_inference_steps,clip_skip" scheduler_type: "k_dpmpp_sde" cfg_scale: 8 @@ -69,7 +66,7 @@ The following parameters are available in the configuration file: | `scheduler_type` | Scheduler type | `k_dpp_sde` | | `cfg_scale` | Configuration scale | `8` | | `clip_skip` | Clip skip | None | -| `pipeline_type` | Pipeline type | `StableDiffusionPipeline` | +| `pipeline_type` | Pipeline type | `AutoPipelineForText2Image` | There are available several types of schedulers: @@ -131,17 +128,16 @@ parameters: model: nitrosocke/Ghibli-Diffusion backend: diffusers step: 25 - +cuda: true f16: true diffusers: pipeline_type: StableDiffusionImg2ImgPipeline - cuda: true enable_parameters: "negative_prompt,num_inference_steps,image" ``` ```bash IMAGE_PATH=/path/to/your/image -(echo -n '{"image": "'; base64 $IMAGE_PATH; echo '", "prompt": "a sky background","size": "512x512","model":"stablediffusion-edit"}') | +(echo -n '{"file": "'; base64 $IMAGE_PATH; echo '", "prompt": "a sky background","size": "512x512","model":"stablediffusion-edit"}') | curl -H "Content-Type: application/json" -d @- http://localhost:8080/v1/images/generations ``` @@ -157,14 +153,67 @@ backend: diffusers step: 50 # Force CPU usage f16: true +cuda: true diffusers: pipeline_type: StableDiffusionDepth2ImgPipeline - cuda: true enable_parameters: "negative_prompt,num_inference_steps,image" cfg_scale: 6 ``` ```bash -(echo -n '{"image": "'; base64 ~/path/to/image.jpeg; echo '", "prompt": "a sky background","size": "512x512","model":"stablediffusion-depth"}') | +(echo -n '{"file": "'; base64 ~/path/to/image.jpeg; echo '", "prompt": "a sky background","size": "512x512","model":"stablediffusion-depth"}') | curl -H "Content-Type: application/json" -d @- http://localhost:8080/v1/images/generations ``` + +## img2vid + +{{% notice note %}} + +Experimental and available only on master builds. See: https://github.com/mudler/LocalAI/pull/1442 + +{{% /notice %}} + +```yaml +name: img2vid +parameters: + model: stabilityai/stable-video-diffusion-img2vid +backend: diffusers +step: 25 +# Force CPU usage +f16: true +cuda: true +diffusers: + pipeline_type: StableVideoDiffusionPipeline +``` + +```bash +(echo -n '{"file": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png?download=true","size": "512x512","model":"img2vid"}') | +curl -H "Content-Type: application/json" -X POST -d @- http://localhost:8080/v1/images/generations +``` + +## txt2vid + +{{% notice note %}} + +Experimental and available only on master builds. See: https://github.com/mudler/LocalAI/pull/1442 + +{{% /notice %}} + +```yaml +name: txt2vid +parameters: + model: damo-vilab/text-to-video-ms-1.7b +backend: diffusers +step: 25 +# Force CPU usage +f16: true +cuda: true +diffusers: + pipeline_type: VideoDiffusionPipeline + cuda: true +``` + +```bash +(echo -n '{"prompt": "spiderman surfing","size": "512x512","model":"txt2vid"}') | +curl -H "Content-Type: application/json" -X POST -d @- http://localhost:8080/v1/images/generations +``` \ No newline at end of file