update examples

2026-03-20 23:58:12 +00:00 · 2024-10-24 15:42:46 +08:00
parent aa054db1c7
commit 105fe3961c
6 changed files with 455 additions and 52 deletions
--- a/diffsynth/configs/model_config.py
+++ b/diffsynth/configs/model_config.py
@@ -204,7 +204,6 @@ preset_models_on_huggingface = {
        ("lllyasviel/omost-llama-3-8b-4bits", "model.safetensors.index.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
        ("lllyasviel/omost-llama-3-8b-4bits", "special_tokens_map.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
    ],
-    
    # Translator
    "opus-mt-zh-en": [
        ("Helsinki-NLP/opus-mt-zh-en", "config.json", "models/translator/opus-mt-zh-en"),
@@ -346,6 +345,24 @@ preset_models_on_modelscope = {
        ("AI-ModelScope/controlnet-union-sdxl-1.0", "diffusion_pytorch_model_promax.safetensors", "models/ControlNet/controlnet_union"),
        ("sd_lora/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators")
    ],
+    "Annotators:Depth": [
+        ("lllyasviel/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators"),
+    ],
+    "Annotators:Softedge": [
+        ("lllyasviel/Annotators", "ControlNetHED.pth", "models/Annotators"),
+    ],
+    "Annotators:Lineart": [
+        ("lllyasviel/Annotators", "sk_model.pth", "models/Annotators"),
+        ("lllyasviel/Annotators", "sk_model2.pth", "models/Annotators"),
+    ],
+    "Annotators:Normal": [
+        ("lllyasviel/Annotators", "scannet.pt", "models/Annotators"),
+    ],
+    "Annotators:Openpose": [
+        ("lllyasviel/Annotators", "body_pose_model.pth", "models/Annotators"),
+        ("lllyasviel/Annotators", "facenet.pth", "models/Annotators"),
+        ("lllyasviel/Annotators", "hand_pose_model.pth", "models/Annotators"),
+    ],
    # AnimateDiff
    "AnimateDiff_v2": [
        ("Shanghai_AI_Laboratory/animatediff", "mm_sd_v15_v2.ckpt", "models/AnimateDiff"),
@@ -487,6 +504,30 @@ preset_models_on_modelscope = {
            "models/FLUX/FLUX.1-schnell/flux1-schnell.safetensors"
        ],
    },
+    "InstantX/FLUX.1-dev-Controlnet-Union-alpha": [
+        ("InstantX/FLUX.1-dev-Controlnet-Union-alpha", "diffusion_pytorch_model.safetensors", "models/ControlNet/InstantX/FLUX.1-dev-Controlnet-Union-alpha"),
+    ],
+    "jasperai/Flux.1-dev-Controlnet-Depth": [
+        ("jasperai/Flux.1-dev-Controlnet-Depth", "diffusion_pytorch_model.safetensors", "models/ControlNet/jasperai/Flux.1-dev-Controlnet-Depth"),
+    ],
+    "jasperai/Flux.1-dev-Controlnet-Surface-Normals": [
+        ("jasperai/Flux.1-dev-Controlnet-Surface-Normals", "diffusion_pytorch_model.safetensors", "models/ControlNet/jasperai/Flux.1-dev-Controlnet-Surface-Normals"),
+    ],
+    "jasperai/Flux.1-dev-Controlnet-Upscaler": [
+        ("jasperai/Flux.1-dev-Controlnet-Upscaler", "diffusion_pytorch_model.safetensors", "models/ControlNet/jasperai/Flux.1-dev-Controlnet-Upscaler"),
+    ],
+    "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha": [
+        ("alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha", "diffusion_pytorch_model.safetensors", "models/ControlNet/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha"),
+    ],
+    "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta": [
+        ("alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta", "diffusion_pytorch_model.safetensors", "models/ControlNet/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta"),
+    ],
+    "Shakker-Labs/FLUX.1-dev-ControlNet-Depth": [
+        ("Shakker-Labs/FLUX.1-dev-ControlNet-Depth", "diffusion_pytorch_model.safetensors", "models/ControlNet/Shakker-Labs/FLUX.1-dev-ControlNet-Depth"),
+    ],
+    "Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro": [
+        ("Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro", "diffusion_pytorch_model.safetensors", "models/ControlNet/Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro"),
+    ],
    # ESRGAN
    "ESRGAN_x4": [
        ("AI-ModelScope/Real-ESRGAN", "RealESRGAN_x4.pth", "models/ESRGAN"),
@@ -546,10 +587,23 @@ Preset_model_id: TypeAlias = Literal[
    "ControlNet_union_sdxl_promax",
    "FLUX.1-dev",
    "FLUX.1-schnell",
+    "InstantX/FLUX.1-dev-Controlnet-Union-alpha",
+    "jasperai/Flux.1-dev-Controlnet-Depth",
+    "jasperai/Flux.1-dev-Controlnet-Surface-Normals",
+    "jasperai/Flux.1-dev-Controlnet-Upscaler",
+    "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha",
+    "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta",
+    "Shakker-Labs/FLUX.1-dev-ControlNet-Depth",
+    "Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro",
    "SDXL_lora_zyd232_ChineseInkStyle_SDXL_v1_0",
    "QwenPrompt",
    "OmostPrompt",
    "ESRGAN_x4",
    "RIFE",
    "CogVideoX-5B",
+    "Annotators:Depth",
+    "Annotators:Softedge",
+    "Annotators:Lineart",
+    "Annotators:Normal",
+    "Annotators:Openpose",
 ]
--- a/diffsynth/models/tiler.py
+++ b/diffsynth/models/tiler.py
@@ -107,6 +107,60 @@ class TileWorker:
    


+class FastTileWorker:
+    def __init__(self):
+        pass
+
+
+    def build_mask(self, data, is_bound):
+        _, _, H, W = data.shape
+        h = repeat(torch.arange(H), "H -> H W", H=H, W=W)
+        w = repeat(torch.arange(W), "W -> H W", H=H, W=W)
+        border_width = (H + W) // 4
+        pad = torch.ones_like(h) * border_width
+        mask = torch.stack([
+            pad if is_bound[0] else h + 1,
+            pad if is_bound[1] else H - h,
+            pad if is_bound[2] else w + 1,
+            pad if is_bound[3] else W - w
+        ]).min(dim=0).values
+        mask = mask.clip(1, border_width)
+        mask = (mask / border_width).to(dtype=data.dtype, device=data.device)
+        mask = rearrange(mask, "H W -> 1 H W")
+        return mask
+
+
+    def tiled_forward(self, forward_fn, model_input, tile_size, tile_stride, tile_device="cpu", tile_dtype=torch.float32, border_width=None):
+        # Prepare
+        B, C, H, W = model_input.shape
+        border_width = int(tile_stride*0.5) if border_width is None else border_width
+        weight = torch.zeros((1, 1, H, W), dtype=tile_dtype, device=tile_device)
+        values = torch.zeros((B, C, H, W), dtype=tile_dtype, device=tile_device)
+
+        # Split tasks
+        tasks = []
+        for h in range(0, H, tile_stride):
+            for w in range(0, W, tile_stride):
+                if (h-tile_stride >= 0 and h-tile_stride+tile_size >= H) or (w-tile_stride >= 0 and w-tile_stride+tile_size >= W):
+                    continue
+                h_, w_ = h + tile_size, w + tile_size
+                if h_ > H: h, h_ = H - tile_size, H
+                if w_ > W: w, w_ = W - tile_size, W
+                tasks.append((h, h_, w, w_))
+        
+        # Run
+        for hl, hr, wl, wr in tasks:
+            # Forward
+            hidden_states_batch = forward_fn(hl, hr, wl, wr).to(dtype=tile_dtype, device=tile_device)
+
+            mask = self.build_mask(hidden_states_batch, is_bound=(hl==0, hr>=H, wl==0, wr>=W))
+            values[:, :, hl:hr, wl:wr] += hidden_states_batch * mask
+            weight[:, :, hl:hr, wl:wr] += mask
+        values /= weight
+        return values
+
+
+
 class TileWorker2Dto3D:
    """
    Process 3D tensors, but only enable TileWorker on 2D.
--- a/diffsynth/pipelines/base.py
+++ b/diffsynth/pipelines/base.py
@@ -47,9 +47,12 @@ class BasePipeline(torch.nn.Module):
        return value


-    def control_noise_via_local_prompts(self, prompt_emb_global, prompt_emb_locals, masks, mask_scales, inference_callback):
-        noise_pred_global = inference_callback(prompt_emb_global)
-        noise_pred_locals = [inference_callback(prompt_emb_local) for prompt_emb_local in prompt_emb_locals]
+    def control_noise_via_local_prompts(self, prompt_emb_global, prompt_emb_locals, masks, mask_scales, inference_callback, special_kwargs={}, special_local_kwargs_list=None):
+        noise_pred_global = inference_callback(prompt_emb_global, special_kwargs)
+        if special_local_kwargs_list is None:
+            noise_pred_locals = [inference_callback(prompt_emb_local) for prompt_emb_local in prompt_emb_locals]
+        else:
+            noise_pred_locals = [inference_callback(prompt_emb_local, special_kwargs) for prompt_emb_local, special_kwargs in zip(prompt_emb_locals, special_local_kwargs_list)]
        noise_pred = self.merge_latents(noise_pred_global, noise_pred_locals, masks, mask_scales)
        return noise_pred
    
--- a/diffsynth/pipelines/flux_image.py
+++ b/diffsynth/pipelines/flux_image.py
@@ -8,6 +8,7 @@ import torch
 from tqdm import tqdm
 import numpy as np
 from PIL import Image
+from ..models.tiler import FastTileWorker



@@ -142,6 +143,7 @@ class FluxImagePipeline(BasePipeline):
        input_image=None,
        controlnet_image=None,
        controlnet_inpaint_mask=None,
+        enable_controlnet_on_negative=False,
        denoising_strength=1.0,
        height=1024,
        width=1024,
@@ -186,8 +188,13 @@ class FluxImagePipeline(BasePipeline):
        # Prepare ControlNets
        if controlnet_image is not None:
            controlnet_kwargs = {"controlnet_frames": self.prepare_controlnet_input(controlnet_image, controlnet_inpaint_mask, tiler_kwargs)}
+            if len(masks) > 0 and controlnet_inpaint_mask is not None:
+                print("The controlnet_inpaint_mask will be overridden by masks.")
+                local_controlnet_kwargs = [{"controlnet_frames": self.prepare_controlnet_input(controlnet_image, mask, tiler_kwargs)} for mask in masks]
+            else:
+                local_controlnet_kwargs = None
        else:
-            controlnet_kwargs = {"controlnet_frames": None}
+            controlnet_kwargs, local_controlnet_kwargs = {"controlnet_frames": None}, [{}] * len(masks)

        # Denoise
        self.load_models_to_device(['dit', 'controlnet'])
@@ -195,17 +202,21 @@ class FluxImagePipeline(BasePipeline):
            timestep = timestep.unsqueeze(0).to(self.device)

            # Classifier-free guidance
-            inference_callback = lambda prompt_emb_posi: lets_dance_flux(
+            inference_callback = lambda prompt_emb_posi, controlnet_kwargs: lets_dance_flux(
                dit=self.dit, controlnet=self.controlnet,
                hidden_states=latents, timestep=timestep,
                **prompt_emb_posi, **tiler_kwargs, **extra_input, **controlnet_kwargs
            )
-            noise_pred_posi = self.control_noise_via_local_prompts(prompt_emb_posi, prompt_emb_locals, masks, mask_scales, inference_callback)
+            noise_pred_posi = self.control_noise_via_local_prompts(
+                prompt_emb_posi, prompt_emb_locals, masks, mask_scales, inference_callback,
+                special_kwargs=controlnet_kwargs, special_local_kwargs_list=local_controlnet_kwargs
+            )
            if cfg_scale != 1.0:
+                negative_controlnet_kwargs = controlnet_kwargs if enable_controlnet_on_negative else {}
                noise_pred_nega = lets_dance_flux(
                    dit=self.dit, controlnet=self.controlnet,
                    hidden_states=latents, timestep=timestep,
-                    **prompt_emb_nega, **tiler_kwargs, **extra_input, **controlnet_kwargs
+                    **prompt_emb_nega, **tiler_kwargs, **extra_input, **negative_controlnet_kwargs,
                )
                noise_pred = noise_pred_nega + cfg_scale * (noise_pred_posi - noise_pred_nega)
            else:
@@ -244,6 +255,32 @@ def lets_dance_flux(
    tile_stride=64,
    **kwargs
 ):
+    if tiled:
+        def flux_forward_fn(hl, hr, wl, wr):
+            return lets_dance_flux(
+                dit=dit,
+                controlnet=controlnet,
+                hidden_states=hidden_states[:, :, hl: hr, wl: wr],
+                timestep=timestep,
+                prompt_emb=prompt_emb,
+                pooled_prompt_emb=pooled_prompt_emb,
+                guidance=guidance,
+                text_ids=text_ids,
+                image_ids=None,
+                controlnet_frames=[f[:, :, hl: hr, wl: wr] for f in controlnet_frames],
+                tiled=False,
+                **kwargs
+            )
+        return FastTileWorker().tiled_forward(
+            flux_forward_fn,
+            hidden_states,
+            tile_size=tile_size,
+            tile_stride=tile_stride,
+            tile_device=hidden_states.device,
+            tile_dtype=hidden_states.dtype
+        )
+
+
    # ControlNet
    if controlnet is not None and controlnet_frames is not None:
        controlnet_extra_kwargs = {