support z-image-omni-base-i2L

This commit is contained in:
Artiprocher
2026-01-07 20:36:53 +08:00
parent bac39b1cd2
commit dd479e5bff
6 changed files with 413 additions and 6 deletions

View File

@@ -541,11 +541,18 @@ z_image_series = [
"model_class": "diffsynth.models.siglip2_image_encoder.Siglip2ImageEncoder428M",
},
{
# ModelConfig(model_id="PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1", origin_file_pattern="Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.safetensors")
# Example: ModelConfig(model_id="PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1", origin_file_pattern="Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.safetensors")
"model_hash": "1677708d40029ab380a95f6c731a57d7",
"model_name": "z_image_controlnet",
"model_class": "diffsynth.models.z_image_controlnet.ZImageControlNet",
}
},
{
# Example: ???
"model_hash": "9510cb8cd1dd34ee0e4f111c24905510",
"model_name": "z_image_image2lora_style",
"model_class": "diffsynth.models.z_image_image2lora.ZImageImage2LoRAModel",
"extra_kwargs": {"compress_dim": 128},
},
]
MODEL_CONFIGS = qwen_image_series + wan_series + flux_series + flux2_series + z_image_series

View File

@@ -235,6 +235,7 @@ class BasePipeline(torch.nn.Module):
alpha=1,
hotload=None,
state_dict=None,
verbose=1,
):
if state_dict is None:
if isinstance(lora_config, str):
@@ -261,12 +262,13 @@ class BasePipeline(torch.nn.Module):
updated_num += 1
module.lora_A_weights.append(lora[lora_a_name] * alpha)
module.lora_B_weights.append(lora[lora_b_name])
print(f"{updated_num} tensors are patched by LoRA. You can use `pipe.clear_lora()` to clear all LoRA layers.")
if verbose >= 1:
print(f"{updated_num} tensors are patched by LoRA. You can use `pipe.clear_lora()` to clear all LoRA layers.")
else:
lora_loader.fuse_lora_to_base_model(module, lora, alpha=alpha)
def clear_lora(self):
def clear_lora(self, verbose=1):
cleared_num = 0
for name, module in self.named_modules():
if isinstance(module, AutoWrappedLinear):
@@ -276,7 +278,8 @@ class BasePipeline(torch.nn.Module):
module.lora_A_weights.clear()
if hasattr(module, "lora_B_weights"):
module.lora_B_weights.clear()
print(f"{cleared_num} LoRA layers are cleared.")
if verbose >= 1:
print(f"{cleared_num} LoRA layers are cleared.")
def download_and_load_models(self, model_configs: list[ModelConfig] = [], vram_limit: float = None):
@@ -304,8 +307,13 @@ class BasePipeline(torch.nn.Module):
def cfg_guided_model_fn(self, model_fn, cfg_scale, inputs_shared, inputs_posi, inputs_nega, **inputs_others):
if inputs_shared.get("positive_only_lora", None) is not None:
self.clear_lora(verbose=0)
self.load_lora(self.dit, state_dict=inputs_shared["positive_only_lora"], verbose=0)
noise_pred_posi = model_fn(**inputs_posi, **inputs_shared, **inputs_others)
if cfg_scale != 1.0:
if inputs_shared.get("positive_only_lora", None) is not None:
self.clear_lora(verbose=0)
noise_pred_nega = model_fn(**inputs_nega, **inputs_shared, **inputs_others)
noise_pred = noise_pred_nega + cfg_scale * (noise_pred_posi - noise_pred_nega)
else:

View File

@@ -0,0 +1,189 @@
import torch
from .qwen_image_image2lora import ImageEmbeddingToLoraMatrix, SequencialMLP
class LoRATrainerBlock(torch.nn.Module):
    """Predicts LoRA A/B weight tensors for one transformer block from an image embedding.

    One `ImageEmbeddingToLoraMatrix` head is created per entry of `lora_patterns`
    (a list of `(layer_name, lora_a_dim, lora_b_dim)` tuples). The optional
    residual path compresses an extra feature sequence before it is handed to
    each head.
    """
    def __init__(self, lora_patterns, in_dim=1536+4096, compress_dim=128, rank=4, block_id=0, use_residual=True, residual_length=64+7, residual_dim=3584, residual_mid_dim=1024, prefix="transformer_blocks"):
        super().__init__()
        self.prefix = prefix
        self.lora_patterns = lora_patterns
        self.block_id = block_id
        # One generator head per targeted layer in this block.
        self.layers = torch.nn.ModuleList(
            ImageEmbeddingToLoraMatrix(in_dim, compress_dim, a_dim, b_dim, rank)
            for _, a_dim, b_dim in self.lora_patterns
        )
        self.proj_residual = (
            SequencialMLP(residual_length, residual_dim, residual_mid_dim, compress_dim)
            if use_residual else None
        )

    def forward(self, x, residual=None):
        """Return a dict mapping fully qualified LoRA weight names to predicted tensors."""
        if self.proj_residual is not None:
            residual = self.proj_residual(residual)
        lora = {}
        for (layer_name, _, _), head in zip(self.lora_patterns, self.layers):
            lora_a, lora_b = head(x, residual=residual)
            key_base = f"{self.prefix}.{self.block_id}.{layer_name}"
            lora[f"{key_base}.lora_A.default.weight"] = lora_a
            lora[f"{key_base}.lora_B.default.weight"] = lora_b
        return lora
class ZImageImage2LoRAComponent(torch.nn.Module):
    """Stack of `LoRATrainerBlock`s covering `num_blocks` transformer blocks per pattern group."""
    def __init__(self, lora_patterns, prefix, num_blocks=60, use_residual=True, compress_dim=128, rank=4, residual_length=64+7, residual_mid_dim=1024):
        super().__init__()
        self.lora_patterns = lora_patterns
        self.num_blocks = num_blocks
        # One trainer block per (pattern group, block id) pair; pattern groups
        # form the outer loop so the ordering matches the original layout.
        self.blocks = torch.nn.ModuleList(
            LoRATrainerBlock(
                pattern_group,
                block_id=block_id,
                use_residual=use_residual,
                compress_dim=compress_dim,
                rank=rank,
                residual_length=residual_length,
                residual_mid_dim=residual_mid_dim,
                prefix=prefix,
            )
            for pattern_group in self.lora_patterns
            for block_id in range(self.num_blocks)
        )
        self.residual_scale = 0.05  # dampens the residual before it reaches the blocks
        self.use_residual = use_residual

    def forward(self, x, residual=None):
        """Merge the LoRA dicts produced by every block into one dict."""
        if residual is not None:
            # Scale the residual when enabled; discard it entirely otherwise.
            residual = residual * self.residual_scale if self.use_residual else None
        lora = {}
        for block in self.blocks:
            lora.update(block(x, residual))
        return lora
class ZImageImage2LoRAModel(torch.nn.Module):
    """Predicts LoRA weights for all three Z-Image DiT sub-stacks.

    Three `ZImageImage2LoRAComponent`s share one configuration and target the
    main `layers` stack (30 blocks) plus the `context_refiner` and
    `noise_refiner` stacks (2 blocks each).
    """
    def __init__(self, use_residual=False, compress_dim=64, rank=4, residual_length=64+7, residual_mid_dim=1024):
        super().__init__()
        attention_patterns = [
            ("attention.to_q", 3840, 3840),
            ("attention.to_k", 3840, 3840),
            ("attention.to_v", 3840, 3840),
            ("attention.to_out.0", 3840, 3840),
        ]
        feed_forward_patterns = [
            ("feed_forward.w1", 3840, 10240),
            ("feed_forward.w2", 10240, 3840),
            ("feed_forward.w3", 3840, 10240),
        ]
        shared_kwargs = dict(
            lora_patterns=[attention_patterns, feed_forward_patterns],
            use_residual=use_residual,
            compress_dim=compress_dim,
            rank=rank,
            residual_length=residual_length,
            residual_mid_dim=residual_mid_dim,
        )
        self.layers_lora = ZImageImage2LoRAComponent(prefix="layers", num_blocks=30, **shared_kwargs)
        self.context_refiner_lora = ZImageImage2LoRAComponent(prefix="context_refiner", num_blocks=2, **shared_kwargs)
        self.noise_refiner_lora = ZImageImage2LoRAComponent(prefix="noise_refiner", num_blocks=2, **shared_kwargs)

    def forward(self, x, residual=None):
        """Return the union of the LoRA dicts produced by the three components."""
        lora = {}
        for component in (self.layers_lora, self.context_refiner_lora, self.noise_refiner_lora):
            lora.update(component(x, residual=residual))
        return lora

    def initialize_weights(self):
        """Rescale selected freshly-initialized parameters so predicted LoRA deltas start small."""
        state_dict = self.state_dict()
        # First matching marker wins, mirroring the original if/elif chain.
        scales = ((".proj_a.", 0.3), (".proj_b.proj_out.", 0), (".proj_residual.proj_out.", 0.3))
        for name in state_dict:
            for marker, scale in scales:
                if marker in name:
                    state_dict[name] = state_dict[name] * scale
                    break
        self.load_state_dict(state_dict)
class ImageEmb2LoRAWeightCompressed(torch.nn.Module):
    """Compressed image-to-LoRA head.

    The embedding is projected to a `rank x rank` mixing matrix that modulates
    a shared LoRA-A basis; LoRA-B is a static learned parameter returned as-is.
    """
    def __init__(self, in_dim, out_dim, emb_dim, rank):
        super().__init__()
        self.lora_a = torch.nn.Parameter(torch.randn((rank, in_dim)))
        self.lora_b = torch.nn.Parameter(torch.randn((out_dim, rank)))
        self.proj = torch.nn.Linear(emb_dim, rank * rank, bias=True)
        self.rank = rank

    def forward(self, x):
        """Return `(lora_a, lora_b)` where only `lora_a` depends on the input embedding."""
        mixing = self.proj(x).view(self.rank, self.rank)
        return mixing @ self.lora_a, self.lora_b
class ZImageImage2LoRAModelCompressed(torch.nn.Module):
    """Compressed variant of the image-to-LoRA model.

    Keeps one `ImageEmb2LoRAWeightCompressed` head per targeted layer across the
    `layers` (30), `context_refiner` (2) and `noise_refiner` (2) stacks.
    ModuleDict keys cannot contain dots, so "." is stored as "___" and restored
    at lookup time in `forward`.
    """
    def __init__(self, emb_dim=1536+4096, rank=32):
        super().__init__()
        target_layers = [
            ("attention.to_q", 3840, 3840),
            ("attention.to_k", 3840, 3840),
            ("attention.to_v", 3840, 3840),
            ("attention.to_out.0", 3840, 3840),
            ("feed_forward.w1", 3840, 10240),
            ("feed_forward.w2", 10240, 3840),
            ("feed_forward.w3", 3840, 10240),
        ]
        self.lora_patterns = [
            {
                "prefix": "layers",
                "num_layers": 30,
                "target_layers": target_layers,
            },
            {
                "prefix": "context_refiner",
                "num_layers": 2,
                "target_layers": target_layers,
            },
            {
                "prefix": "noise_refiner",
                "num_layers": 2,
                "target_layers": target_layers,
            },
        ]
        module_dict = {}
        for lora_pattern in self.lora_patterns:
            prefix, num_layers, target_layers = lora_pattern["prefix"], lora_pattern["num_layers"], lora_pattern["target_layers"]
            for layer_id in range(num_layers):
                for layer_name, in_dim, out_dim in target_layers:
                    # "." is illegal in ModuleDict keys; encode it as "___".
                    name = f"{prefix}.{layer_id}.{layer_name}".replace(".", "___")
                    model = ImageEmb2LoRAWeightCompressed(in_dim, out_dim, emb_dim, rank)
                    module_dict[name] = model
        self.module_dict = torch.nn.ModuleDict(module_dict)

    def forward(self, x, residual=None):
        """Return a dict mapping fully qualified LoRA weight names to predicted tensors.

        `residual` is accepted for interface parity with `ZImageImage2LoRAModel`
        but is unused by this compressed variant.
        """
        lora = {}
        for name, module in self.module_dict.items():
            name = name.replace("___", ".")
            name_a, name_b = f"{name}.lora_A.default.weight", f"{name}.lora_B.default.weight"
            lora_a, lora_b = module(x)
            lora[name_a] = lora_a
            lora[name_b] = lora_b
        return lora

    def initialize_weights(self):
        """Rescale freshly-initialized parameters so predicted LoRA deltas start near zero.

        `lora_b` is zeroed so the initial LoRA update is exactly zero; `lora_a`
        and the projection weights are shrunk for training stability.
        (Fix: removed a leftover debug `print(name)` from the proj.weight branch.)
        """
        state_dict = self.state_dict()
        for name in state_dict:
            if "lora_b" in name:
                state_dict[name] = state_dict[name] * 0
            elif "lora_a" in name:
                state_dict[name] = state_dict[name] * 0.2
            elif "proj.weight" in name:
                state_dict[name] = state_dict[name] * 0.2
        self.load_state_dict(state_dict)

View File

@@ -4,12 +4,13 @@ from typing import Union
from tqdm import tqdm
from einops import rearrange
import numpy as np
from typing import Union, List, Optional, Tuple, Iterable
from typing import Union, List, Optional, Tuple, Iterable, Dict
from ..diffusion import FlowMatchScheduler
from ..core import ModelConfig, gradient_checkpoint_forward
from ..core.data.operators import ImageCropAndResize
from ..diffusion.base_pipeline import BasePipeline, PipelineUnit, ControlNetInput
from ..utils.lora import merge_lora
from transformers import AutoTokenizer
from ..models.z_image_text_encoder import ZImageTextEncoder
@@ -17,6 +18,9 @@ from ..models.z_image_dit import ZImageDiT
from ..models.flux_vae import FluxVAEEncoder, FluxVAEDecoder
from ..models.siglip2_image_encoder import Siglip2ImageEncoder428M
from ..models.z_image_controlnet import ZImageControlNet
from ..models.siglip2_image_encoder import Siglip2ImageEncoder
from ..models.dinov3_image_encoder import DINOv3ImageEncoder
from ..models.z_image_image2lora import ZImageImage2LoRAModel
class ZImagePipeline(BasePipeline):
@@ -33,6 +37,9 @@ class ZImagePipeline(BasePipeline):
self.vae_decoder: FluxVAEDecoder = None
self.image_encoder: Siglip2ImageEncoder428M = None
self.controlnet: ZImageControlNet = None
self.siglip2_image_encoder: Siglip2ImageEncoder = None
self.dinov3_image_encoder: DINOv3ImageEncoder = None
self.image2lora_style: ZImageImage2LoRAModel = None
self.tokenizer: AutoTokenizer = None
self.in_iteration_models = ("dit", "controlnet")
self.units = [
@@ -67,6 +74,9 @@ class ZImagePipeline(BasePipeline):
pipe.vae_decoder = model_pool.fetch_model("flux_vae_decoder")
pipe.image_encoder = model_pool.fetch_model("siglip_vision_model_428m")
pipe.controlnet = model_pool.fetch_model("z_image_controlnet")
pipe.siglip2_image_encoder = model_pool.fetch_model("siglip2_image_encoder")
pipe.dinov3_image_encoder = model_pool.fetch_model("dinov3_image_encoder")
pipe.image2lora_style = model_pool.fetch_model("z_image_image2lora_style")
if tokenizer_config is not None:
tokenizer_config.download_if_necessary()
pipe.tokenizer = AutoTokenizer.from_pretrained(tokenizer_config.path)
@@ -100,6 +110,9 @@ class ZImagePipeline(BasePipeline):
sigma_shift: float = None,
# ControlNet
controlnet_inputs: List[ControlNetInput] = None,
# Image to LoRA
image2lora_images: List[Image.Image] = None,
positive_only_lora: Dict[str, torch.Tensor] = None,
# Progress bar
progress_bar_cmd = tqdm,
):
@@ -121,6 +134,7 @@ class ZImagePipeline(BasePipeline):
"num_inference_steps": num_inference_steps,
"edit_image": edit_image, "edit_image_auto_resize": edit_image_auto_resize,
"controlnet_inputs": controlnet_inputs,
"image2lora_images": image2lora_images, "positive_only_lora": positive_only_lora,
}
for unit in self.units:
inputs_shared, inputs_posi, inputs_nega = self.unit_runner(unit, self, inputs_shared, inputs_posi, inputs_nega)
@@ -480,6 +494,71 @@ def model_fn_z_image(
return model_output
class ZImageUnit_Image2LoRAEncode(PipelineUnit):
    """Pipeline unit: encodes reference style images into a joint SigLIP2 + DINOv3 embedding.

    Fix: the SigLIP2 and DINOv3 encoding paths were duplicated code; they now
    share a single private helper.
    """
    def __init__(self):
        super().__init__(
            input_params=("image2lora_images",),
            output_params=("image2lora_x",),
            onload_model_names=("siglip2_image_encoder", "dinov3_image_encoder",),
        )
        from ..core.data.operators import ImageCropAndResize
        self.processor_highres = ImageCropAndResize(height=1024, width=1024)

    def _encode_with(self, pipe: ZImagePipeline, model_name: str, encoder, images: list[Image.Image]):
        """Crop/resize each image to 1024x1024 and stack the embeddings from `encoder`."""
        pipe.load_models_to_device([model_name])
        embs = [
            encoder(self.processor_highres(image)).to(pipe.torch_dtype)
            for image in images
        ]
        return torch.stack(embs)

    def encode_images_using_siglip2(self, pipe: ZImagePipeline, images: list[Image.Image]):
        return self._encode_with(pipe, "siglip2_image_encoder", pipe.siglip2_image_encoder, images)

    def encode_images_using_dinov3(self, pipe: ZImagePipeline, images: list[Image.Image]):
        return self._encode_with(pipe, "dinov3_image_encoder", pipe.dinov3_image_encoder, images)

    def encode_images(self, pipe: ZImagePipeline, images: list[Image.Image]):
        """Return concatenated SigLIP2 + DINOv3 embeddings, or {} when no images are given."""
        if images is None:
            return {}
        if not isinstance(images, list):
            images = [images]
        embs_siglip2 = self.encode_images_using_siglip2(pipe, images)
        embs_dinov3 = self.encode_images_using_dinov3(pipe, images)
        # Feature dims are concatenated; the image2lora model expects this joint embedding.
        return torch.concat([embs_siglip2, embs_dinov3], dim=-1)

    def process(self, pipe: ZImagePipeline, image2lora_images):
        if image2lora_images is None:
            return {}
        return {"image2lora_x": self.encode_images(pipe, image2lora_images)}
class ZImageUnit_Image2LoRADecode(PipelineUnit):
    """Pipeline unit: turns per-image embeddings into one averaged LoRA state dict."""
    def __init__(self):
        super().__init__(
            input_params=("image2lora_x",),
            output_params=("lora",),
            onload_model_names=("image2lora_style",),
        )

    def process(self, pipe: ZImagePipeline, image2lora_x):
        if image2lora_x is None:
            return {}
        # Fix: the original referenced `lora` without binding it when
        # `pipe.image2lora_style` was None, raising NameError at the return.
        # Treat a missing image2lora model as a no-op instead.
        if pipe.image2lora_style is None:
            return {}
        pipe.load_models_to_device(["image2lora_style"])
        loras = [pipe.image2lora_style(x=x, residual=None) for x in image2lora_x]
        # Average the per-image LoRA dicts so multiple references contribute equally.
        lora = merge_lora(loras, alpha=1 / len(image2lora_x))
        return {"lora": lora}
def model_fn_z_image_turbo(
dit: ZImageDiT,
controlnet: ZImageControlNet = None,