mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-03-20 23:58:12 +00:00
update examples
This commit is contained in:
@@ -204,7 +204,6 @@ preset_models_on_huggingface = {
|
||||
("lllyasviel/omost-llama-3-8b-4bits", "model.safetensors.index.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
|
||||
("lllyasviel/omost-llama-3-8b-4bits", "special_tokens_map.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
|
||||
],
|
||||
|
||||
# Translator
|
||||
"opus-mt-zh-en": [
|
||||
("Helsinki-NLP/opus-mt-zh-en", "config.json", "models/translator/opus-mt-zh-en"),
|
||||
@@ -346,6 +345,24 @@ preset_models_on_modelscope = {
|
||||
("AI-ModelScope/controlnet-union-sdxl-1.0", "diffusion_pytorch_model_promax.safetensors", "models/ControlNet/controlnet_union"),
|
||||
("sd_lora/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators")
|
||||
],
|
||||
"Annotators:Depth": [
|
||||
("lllyasviel/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators"),
|
||||
],
|
||||
"Annotators:Softedge": [
|
||||
("lllyasviel/Annotators", "ControlNetHED.pth", "models/Annotators"),
|
||||
],
|
||||
"Annotators:Lineart": [
|
||||
("lllyasviel/Annotators", "sk_model.pth", "models/Annotators"),
|
||||
("lllyasviel/Annotators", "sk_model2.pth", "models/Annotators"),
|
||||
],
|
||||
"Annotators:Normal": [
|
||||
("lllyasviel/Annotators", "scannet.pt", "models/Annotators"),
|
||||
],
|
||||
"Annotators:Openpose": [
|
||||
("lllyasviel/Annotators", "body_pose_model.pth", "models/Annotators"),
|
||||
("lllyasviel/Annotators", "facenet.pth", "models/Annotators"),
|
||||
("lllyasviel/Annotators", "hand_pose_model.pth", "models/Annotators"),
|
||||
],
|
||||
# AnimateDiff
|
||||
"AnimateDiff_v2": [
|
||||
("Shanghai_AI_Laboratory/animatediff", "mm_sd_v15_v2.ckpt", "models/AnimateDiff"),
|
||||
@@ -487,6 +504,30 @@ preset_models_on_modelscope = {
|
||||
"models/FLUX/FLUX.1-schnell/flux1-schnell.safetensors"
|
||||
],
|
||||
},
|
||||
"InstantX/FLUX.1-dev-Controlnet-Union-alpha": [
|
||||
("InstantX/FLUX.1-dev-Controlnet-Union-alpha", "diffusion_pytorch_model.safetensors", "models/ControlNet/InstantX/FLUX.1-dev-Controlnet-Union-alpha"),
|
||||
],
|
||||
"jasperai/Flux.1-dev-Controlnet-Depth": [
|
||||
("jasperai/Flux.1-dev-Controlnet-Depth", "diffusion_pytorch_model.safetensors", "models/ControlNet/jasperai/Flux.1-dev-Controlnet-Depth"),
|
||||
],
|
||||
"jasperai/Flux.1-dev-Controlnet-Surface-Normals": [
|
||||
("jasperai/Flux.1-dev-Controlnet-Surface-Normals", "diffusion_pytorch_model.safetensors", "models/ControlNet/jasperai/Flux.1-dev-Controlnet-Surface-Normals"),
|
||||
],
|
||||
"jasperai/Flux.1-dev-Controlnet-Upscaler": [
|
||||
("jasperai/Flux.1-dev-Controlnet-Upscaler", "diffusion_pytorch_model.safetensors", "models/ControlNet/jasperai/Flux.1-dev-Controlnet-Upscaler"),
|
||||
],
|
||||
"alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha": [
|
||||
("alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha", "diffusion_pytorch_model.safetensors", "models/ControlNet/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha"),
|
||||
],
|
||||
"alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta": [
|
||||
("alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta", "diffusion_pytorch_model.safetensors", "models/ControlNet/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta"),
|
||||
],
|
||||
"Shakker-Labs/FLUX.1-dev-ControlNet-Depth": [
|
||||
("Shakker-Labs/FLUX.1-dev-ControlNet-Depth", "diffusion_pytorch_model.safetensors", "models/ControlNet/Shakker-Labs/FLUX.1-dev-ControlNet-Depth"),
|
||||
],
|
||||
"Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro": [
|
||||
("Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro", "diffusion_pytorch_model.safetensors", "models/ControlNet/Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro"),
|
||||
],
|
||||
# ESRGAN
|
||||
"ESRGAN_x4": [
|
||||
("AI-ModelScope/Real-ESRGAN", "RealESRGAN_x4.pth", "models/ESRGAN"),
|
||||
@@ -546,10 +587,23 @@ Preset_model_id: TypeAlias = Literal[
|
||||
"ControlNet_union_sdxl_promax",
|
||||
"FLUX.1-dev",
|
||||
"FLUX.1-schnell",
|
||||
"InstantX/FLUX.1-dev-Controlnet-Union-alpha",
|
||||
"jasperai/Flux.1-dev-Controlnet-Depth",
|
||||
"jasperai/Flux.1-dev-Controlnet-Surface-Normals",
|
||||
"jasperai/Flux.1-dev-Controlnet-Upscaler",
|
||||
"alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha",
|
||||
"alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta",
|
||||
"Shakker-Labs/FLUX.1-dev-ControlNet-Depth",
|
||||
"Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro",
|
||||
"SDXL_lora_zyd232_ChineseInkStyle_SDXL_v1_0",
|
||||
"QwenPrompt",
|
||||
"OmostPrompt",
|
||||
"ESRGAN_x4",
|
||||
"RIFE",
|
||||
"CogVideoX-5B",
|
||||
"Annotators:Depth",
|
||||
"Annotators:Softedge",
|
||||
"Annotators:Lineart",
|
||||
"Annotators:Normal",
|
||||
"Annotators:Openpose",
|
||||
]
|
||||
|
||||
@@ -107,6 +107,60 @@ class TileWorker:
|
||||
|
||||
|
||||
|
||||
class FastTileWorker:
    """Run a function tile-by-tile over a 4-D tensor and blend the tile outputs.

    Each tile output is multiplied by a mask that ramps linearly from the tile
    edge towards its interior, accumulated into a full-size buffer, and finally
    divided by the accumulated mask weight, so overlapping tiles are averaged
    with smooth seams.
    """

    def __init__(self):
        pass

    def build_mask(self, data, is_bound, border_width=None):
        """Build the blending mask for one tile.

        Args:
            data: The tile output; only its last two dimensions (H, W), dtype
                and device are used.
            is_bound: Four booleans (top, bottom, left, right). A True entry
                means that side of the tile lies on the border of the full
                input, so no ramp is applied on that side.
            border_width: Ramp width in pixels. ``None`` keeps the historical
                per-tile default ``(H + W) // 4``.

        Returns:
            A tensor of shape (1, H, W) with values in (0, 1], using the dtype
            and device of ``data``.
        """
        _, _, H, W = data.shape
        if border_width is None:
            border_width = (H + W) // 4
        # A width of 0 would make the clip range empty and divide by zero below.
        border_width = max(int(border_width), 1)

        # Per-pixel row / column index grids, both of shape (H, W).
        rows = torch.arange(H).unsqueeze(1).expand(H, W)
        cols = torch.arange(W).unsqueeze(0).expand(H, W)
        pad = torch.full((H, W), border_width, dtype=rows.dtype)

        # Distance (in pixels, 1-based) to the nearest tile edge that is NOT on
        # the border of the full input; bound sides contribute the maximum value.
        distance = torch.stack([
            pad if is_bound[0] else rows + 1,
            pad if is_bound[1] else H - rows,
            pad if is_bound[2] else cols + 1,
            pad if is_bound[3] else W - cols,
        ]).min(dim=0).values
        distance = distance.clip(1, border_width)

        mask = (distance / border_width).to(dtype=data.dtype, device=data.device)
        return mask.unsqueeze(0)

    def tiled_forward(self, forward_fn, model_input, tile_size, tile_stride, tile_device="cpu", tile_dtype=torch.float32, border_width=None):
        """Apply ``forward_fn`` to overlapping tiles and blend the results.

        Args:
            forward_fn: Callable ``forward_fn(hl, hr, wl, wr)`` returning the
                output for the window rows ``hl:hr`` and columns ``wl:wr``. Its
                result is accumulated into a buffer shaped like ``model_input``,
                so it must be broadcast-compatible with that window.
            model_input: 4-D tensor; only its shape (B, C, H, W) is read here.
            tile_size: Side length of each square tile.
            tile_stride: Step between consecutive tile origins.
            tile_device: Device on which tiles are accumulated.
            tile_dtype: Dtype in which tiles are accumulated.
            border_width: Optional ramp width forwarded to ``build_mask``.
                ``None`` keeps the per-tile default ``(H + W) // 4``.

        Returns:
            The blended tensor of shape (B, C, H, W) on ``tile_device`` with
            dtype ``tile_dtype``.
        """
        # Prepare
        B, C, H, W = model_input.shape
        weight = torch.zeros((1, 1, H, W), dtype=tile_dtype, device=tile_device)
        values = torch.zeros((B, C, H, W), dtype=tile_dtype, device=tile_device)

        # Split tasks
        tasks = []
        for h in range(0, H, tile_stride):
            for w in range(0, W, tile_stride):
                # Skip origins whose predecessor tile already reached the far edge.
                if (h - tile_stride >= 0 and h - tile_stride + tile_size >= H) or (w - tile_stride >= 0 and w - tile_stride + tile_size >= W):
                    continue
                hl, hr = h, h + tile_size
                wl, wr = w, w + tile_size
                # Shift the last tile back inside the input; clamp at 0 so a
                # tile larger than the input never yields a negative start,
                # which would slice from the end and leave pixels uncovered.
                if hr > H:
                    hl, hr = max(H - tile_size, 0), H
                if wr > W:
                    wl, wr = max(W - tile_size, 0), W
                tasks.append((hl, hr, wl, wr))

        # Run
        for hl, hr, wl, wr in tasks:
            # Forward
            tile_output = forward_fn(hl, hr, wl, wr).to(dtype=tile_dtype, device=tile_device)

            mask = self.build_mask(
                tile_output,
                is_bound=(hl == 0, hr >= H, wl == 0, wr >= W),
                border_width=border_width,
            )
            values[:, :, hl:hr, wl:wr] += tile_output * mask
            weight[:, :, hl:hr, wl:wr] += mask

        # Normalise by the accumulated mask weight to average the overlaps.
        values /= weight
        return values
|
||||
|
||||
|
||||
|
||||
class TileWorker2Dto3D:
|
||||
"""
|
||||
Process 3D tensors, but only enable TileWorker on 2D.
|
||||
|
||||
@@ -47,9 +47,12 @@ class BasePipeline(torch.nn.Module):
|
||||
return value
|
||||
|
||||
|
||||
def control_noise_via_local_prompts(self, prompt_emb_global, prompt_emb_locals, masks, mask_scales, inference_callback):
|
||||
noise_pred_global = inference_callback(prompt_emb_global)
|
||||
noise_pred_locals = [inference_callback(prompt_emb_local) for prompt_emb_local in prompt_emb_locals]
|
||||
def control_noise_via_local_prompts(self, prompt_emb_global, prompt_emb_locals, masks, mask_scales, inference_callback, special_kwargs=None, special_local_kwargs_list=None):
    """Predict noise for the global prompt and each local prompt, then merge them.

    Args:
        prompt_emb_global: Prompt embedding passed to the global prediction.
        prompt_emb_locals: Iterable of prompt embeddings, one per local prompt.
        masks: Forwarded unchanged to ``self.merge_latents``.
        mask_scales: Forwarded unchanged to ``self.merge_latents``.
        inference_callback: Called as ``inference_callback(prompt_emb)`` when no
            extra kwargs apply, or ``inference_callback(prompt_emb, kwargs)``
            when they do.
        special_kwargs: Optional dict handed to the callback as its second
            argument for the global prediction. ``None`` (the default) keeps
            the one-argument call so single-argument callbacks still work.
        special_local_kwargs_list: Optional list of dicts, paired with
            ``prompt_emb_locals`` via ``zip``. When ``None``, local predictions
            reuse ``special_kwargs`` if it was given.

    Returns:
        Whatever ``self.merge_latents`` returns for the collected predictions.
    """
    # Global prediction: only pass the extra dict when the caller supplied one.
    if special_kwargs is None:
        noise_pred_global = inference_callback(prompt_emb_global)
    else:
        noise_pred_global = inference_callback(prompt_emb_global, special_kwargs)

    # Local predictions: per-prompt kwargs win; otherwise fall back to the
    # global kwargs so a two-argument callback is never called with one.
    if special_local_kwargs_list is not None:
        noise_pred_locals = [
            inference_callback(prompt_emb_local, local_kwargs)
            for prompt_emb_local, local_kwargs in zip(prompt_emb_locals, special_local_kwargs_list)
        ]
    elif special_kwargs is not None:
        noise_pred_locals = [inference_callback(prompt_emb_local, special_kwargs) for prompt_emb_local in prompt_emb_locals]
    else:
        noise_pred_locals = [inference_callback(prompt_emb_local) for prompt_emb_local in prompt_emb_locals]

    noise_pred = self.merge_latents(noise_pred_global, noise_pred_locals, masks, mask_scales)
    return noise_pred
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@ import torch
|
||||
from tqdm import tqdm
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from ..models.tiler import FastTileWorker
|
||||
|
||||
|
||||
|
||||
@@ -142,6 +143,7 @@ class FluxImagePipeline(BasePipeline):
|
||||
input_image=None,
|
||||
controlnet_image=None,
|
||||
controlnet_inpaint_mask=None,
|
||||
enable_controlnet_on_negative=False,
|
||||
denoising_strength=1.0,
|
||||
height=1024,
|
||||
width=1024,
|
||||
@@ -186,8 +188,13 @@ class FluxImagePipeline(BasePipeline):
|
||||
# Prepare ControlNets
|
||||
if controlnet_image is not None:
|
||||
controlnet_kwargs = {"controlnet_frames": self.prepare_controlnet_input(controlnet_image, controlnet_inpaint_mask, tiler_kwargs)}
|
||||
if len(masks) > 0 and controlnet_inpaint_mask is not None:
|
||||
print("The controlnet_inpaint_mask will be overridden by masks.")
|
||||
local_controlnet_kwargs = [{"controlnet_frames": self.prepare_controlnet_input(controlnet_image, mask, tiler_kwargs)} for mask in masks]
|
||||
else:
|
||||
local_controlnet_kwargs = None
|
||||
else:
|
||||
controlnet_kwargs = {"controlnet_frames": None}
|
||||
controlnet_kwargs, local_controlnet_kwargs = {"controlnet_frames": None}, [{}] * len(masks)
|
||||
|
||||
# Denoise
|
||||
self.load_models_to_device(['dit', 'controlnet'])
|
||||
@@ -195,17 +202,21 @@ class FluxImagePipeline(BasePipeline):
|
||||
timestep = timestep.unsqueeze(0).to(self.device)
|
||||
|
||||
# Classifier-free guidance
|
||||
inference_callback = lambda prompt_emb_posi: lets_dance_flux(
|
||||
inference_callback = lambda prompt_emb_posi, controlnet_kwargs: lets_dance_flux(
|
||||
dit=self.dit, controlnet=self.controlnet,
|
||||
hidden_states=latents, timestep=timestep,
|
||||
**prompt_emb_posi, **tiler_kwargs, **extra_input, **controlnet_kwargs
|
||||
)
|
||||
noise_pred_posi = self.control_noise_via_local_prompts(prompt_emb_posi, prompt_emb_locals, masks, mask_scales, inference_callback)
|
||||
noise_pred_posi = self.control_noise_via_local_prompts(
|
||||
prompt_emb_posi, prompt_emb_locals, masks, mask_scales, inference_callback,
|
||||
special_kwargs=controlnet_kwargs, special_local_kwargs_list=local_controlnet_kwargs
|
||||
)
|
||||
if cfg_scale != 1.0:
|
||||
negative_controlnet_kwargs = controlnet_kwargs if enable_controlnet_on_negative else {}
|
||||
noise_pred_nega = lets_dance_flux(
|
||||
dit=self.dit, controlnet=self.controlnet,
|
||||
hidden_states=latents, timestep=timestep,
|
||||
**prompt_emb_nega, **tiler_kwargs, **extra_input, **controlnet_kwargs
|
||||
**prompt_emb_nega, **tiler_kwargs, **extra_input, **negative_controlnet_kwargs,
|
||||
)
|
||||
noise_pred = noise_pred_nega + cfg_scale * (noise_pred_posi - noise_pred_nega)
|
||||
else:
|
||||
@@ -244,6 +255,32 @@ def lets_dance_flux(
|
||||
tile_stride=64,
|
||||
**kwargs
|
||||
):
|
||||
if tiled:
|
||||
def flux_forward_fn(hl, hr, wl, wr):
    """Run ``lets_dance_flux`` on one spatial window of the latents.

    Crops ``hidden_states`` (and every ControlNet frame, when present) to
    rows ``hl:hr`` and columns ``wl:wr`` and re-enters ``lets_dance_flux``
    with ``tiled=False`` so the window itself is not tiled again.
    """
    # controlnet_frames may be None when no ControlNet input was supplied;
    # iterating None would raise TypeError, so crop only when frames exist.
    if controlnet_frames is None:
        tile_controlnet_frames = None
    else:
        tile_controlnet_frames = [f[:, :, hl: hr, wl: wr] for f in controlnet_frames]
    return lets_dance_flux(
        dit=dit,
        controlnet=controlnet,
        hidden_states=hidden_states[:, :, hl: hr, wl: wr],
        timestep=timestep,
        prompt_emb=prompt_emb,
        pooled_prompt_emb=pooled_prompt_emb,
        guidance=guidance,
        text_ids=text_ids,
        # Passed as None for every window, as in the original call.
        image_ids=None,
        controlnet_frames=tile_controlnet_frames,
        tiled=False,
        **kwargs
    )
|
||||
return FastTileWorker().tiled_forward(
|
||||
flux_forward_fn,
|
||||
hidden_states,
|
||||
tile_size=tile_size,
|
||||
tile_stride=tile_stride,
|
||||
tile_device=hidden_states.device,
|
||||
tile_dtype=hidden_states.dtype
|
||||
)
|
||||
|
||||
|
||||
# ControlNet
|
||||
if controlnet is not None and controlnet_frames is not None:
|
||||
controlnet_extra_kwargs = {
|
||||
|
||||
Reference in New Issue
Block a user