mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-03-18 22:08:13 +00:00
support z-image-omni-base-i2L
This commit is contained in:
@@ -541,11 +541,18 @@ z_image_series = [
|
||||
"model_class": "diffsynth.models.siglip2_image_encoder.Siglip2ImageEncoder428M",
|
||||
},
|
||||
{
|
||||
# ModelConfig(model_id="PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1", origin_file_pattern="Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.safetensors")
|
||||
# Example: ModelConfig(model_id="PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1", origin_file_pattern="Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.safetensors")
|
||||
"model_hash": "1677708d40029ab380a95f6c731a57d7",
|
||||
"model_name": "z_image_controlnet",
|
||||
"model_class": "diffsynth.models.z_image_controlnet.ZImageControlNet",
|
||||
}
|
||||
},
|
||||
{
|
||||
# Example: ???
|
||||
"model_hash": "9510cb8cd1dd34ee0e4f111c24905510",
|
||||
"model_name": "z_image_image2lora_style",
|
||||
"model_class": "diffsynth.models.z_image_image2lora.ZImageImage2LoRAModel",
|
||||
"extra_kwargs": {"compress_dim": 128},
|
||||
},
|
||||
]
|
||||
|
||||
MODEL_CONFIGS = qwen_image_series + wan_series + flux_series + flux2_series + z_image_series
|
||||
|
||||
@@ -235,6 +235,7 @@ class BasePipeline(torch.nn.Module):
|
||||
alpha=1,
|
||||
hotload=None,
|
||||
state_dict=None,
|
||||
verbose=1,
|
||||
):
|
||||
if state_dict is None:
|
||||
if isinstance(lora_config, str):
|
||||
@@ -261,12 +262,13 @@ class BasePipeline(torch.nn.Module):
|
||||
updated_num += 1
|
||||
module.lora_A_weights.append(lora[lora_a_name] * alpha)
|
||||
module.lora_B_weights.append(lora[lora_b_name])
|
||||
print(f"{updated_num} tensors are patched by LoRA. You can use `pipe.clear_lora()` to clear all LoRA layers.")
|
||||
if verbose >= 1:
|
||||
print(f"{updated_num} tensors are patched by LoRA. You can use `pipe.clear_lora()` to clear all LoRA layers.")
|
||||
else:
|
||||
lora_loader.fuse_lora_to_base_model(module, lora, alpha=alpha)
|
||||
|
||||
|
||||
def clear_lora(self):
|
||||
def clear_lora(self, verbose=1):
|
||||
cleared_num = 0
|
||||
for name, module in self.named_modules():
|
||||
if isinstance(module, AutoWrappedLinear):
|
||||
@@ -276,7 +278,8 @@ class BasePipeline(torch.nn.Module):
|
||||
module.lora_A_weights.clear()
|
||||
if hasattr(module, "lora_B_weights"):
|
||||
module.lora_B_weights.clear()
|
||||
print(f"{cleared_num} LoRA layers are cleared.")
|
||||
if verbose >= 1:
|
||||
print(f"{cleared_num} LoRA layers are cleared.")
|
||||
|
||||
|
||||
def download_and_load_models(self, model_configs: list[ModelConfig] = [], vram_limit: float = None):
|
||||
@@ -304,8 +307,13 @@ class BasePipeline(torch.nn.Module):
|
||||
|
||||
|
||||
def cfg_guided_model_fn(self, model_fn, cfg_scale, inputs_shared, inputs_posi, inputs_nega, **inputs_others):
|
||||
if inputs_shared.get("positive_only_lora", None) is not None:
|
||||
self.clear_lora(verbose=0)
|
||||
self.load_lora(self.dit, state_dict=inputs_shared["positive_only_lora"], verbose=0)
|
||||
noise_pred_posi = model_fn(**inputs_posi, **inputs_shared, **inputs_others)
|
||||
if cfg_scale != 1.0:
|
||||
if inputs_shared.get("positive_only_lora", None) is not None:
|
||||
self.clear_lora(verbose=0)
|
||||
noise_pred_nega = model_fn(**inputs_nega, **inputs_shared, **inputs_others)
|
||||
noise_pred = noise_pred_nega + cfg_scale * (noise_pred_posi - noise_pred_nega)
|
||||
else:
|
||||
|
||||
189
diffsynth/models/z_image_image2lora.py
Normal file
189
diffsynth/models/z_image_image2lora.py
Normal file
@@ -0,0 +1,189 @@
|
||||
import torch
|
||||
from .qwen_image_image2lora import ImageEmbeddingToLoraMatrix, SequencialMLP
|
||||
|
||||
|
||||
class LoRATrainerBlock(torch.nn.Module):
    """Generates the LoRA A/B matrices for every target layer of one DiT block.

    `lora_patterns` is a list of `(layer_name, lora_a_dim, lora_b_dim)` tuples;
    one `ImageEmbeddingToLoraMatrix` generator is created per pattern.
    """

    def __init__(self, lora_patterns, in_dim=1536+4096, compress_dim=128, rank=4, block_id=0, use_residual=True, residual_length=64+7, residual_dim=3584, residual_mid_dim=1024, prefix="transformer_blocks"):
        super().__init__()
        self.prefix = prefix
        self.lora_patterns = lora_patterns
        self.block_id = block_id
        # One generator per (layer name, A-dim, B-dim) pattern.
        self.layers = torch.nn.ModuleList(
            ImageEmbeddingToLoraMatrix(in_dim, compress_dim, lora_a_dim, lora_b_dim, rank)
            for _, lora_a_dim, lora_b_dim in self.lora_patterns
        )
        # Optional projection that maps the residual features to `compress_dim`.
        self.proj_residual = (
            SequencialMLP(residual_length, residual_dim, residual_mid_dim, compress_dim)
            if use_residual
            else None
        )

    def forward(self, x, residual=None):
        """Return a dict of state-dict keys -> LoRA tensors for this block."""
        if self.proj_residual is not None:
            residual = self.proj_residual(residual)
        weights = {}
        for (layer_name, _, _), generator in zip(self.lora_patterns, self.layers):
            lora_a, lora_b = generator(x, residual=residual)
            key_root = f"{self.prefix}.{self.block_id}.{layer_name}"
            weights[f"{key_root}.lora_A.default.weight"] = lora_a
            weights[f"{key_root}.lora_B.default.weight"] = lora_b
        return weights
|
||||
|
||||
|
||||
class ZImageImage2LoRAComponent(torch.nn.Module):
    """A stack of per-block LoRA generators covering one group of DiT blocks.

    For every pattern group in `lora_patterns`, one `LoRATrainerBlock` is
    created per block id in `range(num_blocks)`.
    """

    def __init__(self, lora_patterns, prefix, num_blocks=60, use_residual=True, compress_dim=128, rank=4, residual_length=64+7, residual_mid_dim=1024):
        super().__init__()
        self.lora_patterns = lora_patterns
        self.num_blocks = num_blocks
        # Renamed the loop variable (was shadowing `self.lora_patterns`).
        self.blocks = torch.nn.ModuleList(
            LoRATrainerBlock(
                pattern_group,
                block_id=block_id,
                use_residual=use_residual,
                compress_dim=compress_dim,
                rank=rank,
                residual_length=residual_length,
                residual_mid_dim=residual_mid_dim,
                prefix=prefix,
            )
            for pattern_group in self.lora_patterns
            for block_id in range(self.num_blocks)
        )
        self.residual_scale = 0.05
        self.use_residual = use_residual

    def forward(self, x, residual=None):
        """Merge the LoRA dicts produced by every per-block generator."""
        if residual is not None:
            # Down-weight the residual, or drop it entirely when disabled.
            residual = residual * self.residual_scale if self.use_residual else None
        merged = {}
        for block in self.blocks:
            merged.update(block(x, residual))
        return merged
|
||||
|
||||
|
||||
class ZImageImage2LoRAModel(torch.nn.Module):
    """Turns an image embedding into a LoRA state dict for the Z-Image DiT.

    Three components cover the main transformer stack (`layers`) and the two
    refiner stacks (`context_refiner`, `noise_refiner`).
    """

    def __init__(self, use_residual=False, compress_dim=64, rank=4, residual_length=64+7, residual_mid_dim=1024):
        super().__init__()
        # (layer name inside a DiT block, lora_A dim, lora_B dim)
        attention_patterns = [
            ("attention.to_q", 3840, 3840),
            ("attention.to_k", 3840, 3840),
            ("attention.to_v", 3840, 3840),
            ("attention.to_out.0", 3840, 3840),
        ]
        feed_forward_patterns = [
            ("feed_forward.w1", 3840, 10240),
            ("feed_forward.w2", 10240, 3840),
            ("feed_forward.w3", 3840, 10240),
        ]
        config = dict(
            lora_patterns=[attention_patterns, feed_forward_patterns],
            use_residual=use_residual,
            compress_dim=compress_dim,
            rank=rank,
            residual_length=residual_length,
            residual_mid_dim=residual_mid_dim,
        )
        self.layers_lora = ZImageImage2LoRAComponent(prefix="layers", num_blocks=30, **config)
        self.context_refiner_lora = ZImageImage2LoRAComponent(prefix="context_refiner", num_blocks=2, **config)
        self.noise_refiner_lora = ZImageImage2LoRAComponent(prefix="noise_refiner", num_blocks=2, **config)

    def forward(self, x, residual=None):
        """Collect the LoRA weights from all three components into one dict."""
        lora = {}
        for component in (self.layers_lora, self.context_refiner_lora, self.noise_refiner_lora):
            lora.update(component(x, residual=residual))
        return lora

    def initialize_weights(self):
        """Rescale selected parameters so a fresh model emits near-zero LoRA.

        Matching order mirrors the original if/elif chain: the first matching
        marker wins.
        """
        markers = (
            (".proj_a.", 0.3),
            (".proj_b.proj_out.", 0),
            (".proj_residual.proj_out.", 0.3),
        )
        state_dict = self.state_dict()
        for name in state_dict:
            for marker, scale in markers:
                if marker in name:
                    state_dict[name] = state_dict[name] * scale
                    break
        self.load_state_dict(state_dict)
|
||||
|
||||
|
||||
class ImageEmb2LoRAWeightCompressed(torch.nn.Module):
    """Predicts one LoRA (A, B) pair for a single linear layer.

    Only the A matrix is conditioned on the embedding: `proj(x)` yields a
    (rank, rank) mixing matrix that recombines the learned base `lora_a`.
    The B matrix is a plain learned parameter, returned unchanged.
    """

    def __init__(self, in_dim, out_dim, emb_dim, rank):
        super().__init__()
        self.lora_a = torch.nn.Parameter(torch.randn((rank, in_dim)))
        self.lora_b = torch.nn.Parameter(torch.randn((out_dim, rank)))
        self.proj = torch.nn.Linear(emb_dim, rank * rank, bias=True)
        self.rank = rank

    def forward(self, x):
        """Return (lora_a, lora_b); `x` is a 1-D embedding of size `emb_dim`."""
        mixing = self.proj(x).view(self.rank, self.rank)
        return mixing @ self.lora_a, self.lora_b
|
||||
|
||||
|
||||
class ZImageImage2LoRAModelCompressed(torch.nn.Module):
    """Compressed image-to-LoRA model: one small generator per target linear layer.

    `forward(x)` returns a dict mapping PEFT-style state-dict keys
    (`<prefix>.<layer_id>.<layer_name>.lora_{A,B}.default.weight`) to tensors.
    """

    def __init__(self, emb_dim=1536+4096, rank=32):
        super().__init__()
        # (layer name inside a DiT block, in_dim, out_dim)
        target_layers = [
            ("attention.to_q", 3840, 3840),
            ("attention.to_k", 3840, 3840),
            ("attention.to_v", 3840, 3840),
            ("attention.to_out.0", 3840, 3840),
            ("feed_forward.w1", 3840, 10240),
            ("feed_forward.w2", 10240, 3840),
            ("feed_forward.w3", 3840, 10240),
        ]
        self.lora_patterns = [
            {
                "prefix": "layers",
                "num_layers": 30,
                "target_layers": target_layers,
            },
            {
                "prefix": "context_refiner",
                "num_layers": 2,
                "target_layers": target_layers,
            },
            {
                "prefix": "noise_refiner",
                "num_layers": 2,
                "target_layers": target_layers,
            },
        ]
        module_dict = {}
        for lora_pattern in self.lora_patterns:
            prefix, num_layers, layers = lora_pattern["prefix"], lora_pattern["num_layers"], lora_pattern["target_layers"]
            for layer_id in range(num_layers):
                for layer_name, in_dim, out_dim in layers:
                    # ModuleDict keys cannot contain ".", so it is encoded as "___".
                    name = f"{prefix}.{layer_id}.{layer_name}".replace(".", "___")
                    module_dict[name] = ImageEmb2LoRAWeightCompressed(in_dim, out_dim, emb_dim, rank)
        self.module_dict = torch.nn.ModuleDict(module_dict)

    def forward(self, x, residual=None):
        """Generate the full LoRA state dict from image embedding `x`.

        `residual` is accepted for interface parity with ZImageImage2LoRAModel
        but is unused here.
        """
        lora = {}
        for name, module in self.module_dict.items():
            name = name.replace("___", ".")  # decode back to the real key
            lora_a, lora_b = module(x)
            lora[f"{name}.lora_A.default.weight"] = lora_a
            lora[f"{name}.lora_B.default.weight"] = lora_b
        return lora

    def initialize_weights(self):
        """Shrink initial weights so the model starts from a near-zero LoRA.

        `lora_b` is zeroed so every predicted LoRA delta is exactly zero at
        initialization; the remaining factors are only scaled down.
        """
        state_dict = self.state_dict()
        for name in state_dict:
            if "lora_b" in name:
                state_dict[name] = state_dict[name] * 0
            elif "lora_a" in name:
                state_dict[name] = state_dict[name] * 0.2
            elif "proj.weight" in name:
                # Fix: removed a stray debug `print(name)` left in this branch.
                state_dict[name] = state_dict[name] * 0.2
        self.load_state_dict(state_dict)
|
||||
@@ -4,12 +4,13 @@ from typing import Union
|
||||
from tqdm import tqdm
|
||||
from einops import rearrange
|
||||
import numpy as np
|
||||
from typing import Union, List, Optional, Tuple, Iterable
|
||||
from typing import Union, List, Optional, Tuple, Iterable, Dict
|
||||
|
||||
from ..diffusion import FlowMatchScheduler
|
||||
from ..core import ModelConfig, gradient_checkpoint_forward
|
||||
from ..core.data.operators import ImageCropAndResize
|
||||
from ..diffusion.base_pipeline import BasePipeline, PipelineUnit, ControlNetInput
|
||||
from ..utils.lora import merge_lora
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
from ..models.z_image_text_encoder import ZImageTextEncoder
|
||||
@@ -17,6 +18,9 @@ from ..models.z_image_dit import ZImageDiT
|
||||
from ..models.flux_vae import FluxVAEEncoder, FluxVAEDecoder
|
||||
from ..models.siglip2_image_encoder import Siglip2ImageEncoder428M
|
||||
from ..models.z_image_controlnet import ZImageControlNet
|
||||
from ..models.siglip2_image_encoder import Siglip2ImageEncoder
|
||||
from ..models.dinov3_image_encoder import DINOv3ImageEncoder
|
||||
from ..models.z_image_image2lora import ZImageImage2LoRAModel
|
||||
|
||||
|
||||
class ZImagePipeline(BasePipeline):
|
||||
@@ -33,6 +37,9 @@ class ZImagePipeline(BasePipeline):
|
||||
self.vae_decoder: FluxVAEDecoder = None
|
||||
self.image_encoder: Siglip2ImageEncoder428M = None
|
||||
self.controlnet: ZImageControlNet = None
|
||||
self.siglip2_image_encoder: Siglip2ImageEncoder = None
|
||||
self.dinov3_image_encoder: DINOv3ImageEncoder = None
|
||||
self.image2lora_style: ZImageImage2LoRAModel = None
|
||||
self.tokenizer: AutoTokenizer = None
|
||||
self.in_iteration_models = ("dit", "controlnet")
|
||||
self.units = [
|
||||
@@ -67,6 +74,9 @@ class ZImagePipeline(BasePipeline):
|
||||
pipe.vae_decoder = model_pool.fetch_model("flux_vae_decoder")
|
||||
pipe.image_encoder = model_pool.fetch_model("siglip_vision_model_428m")
|
||||
pipe.controlnet = model_pool.fetch_model("z_image_controlnet")
|
||||
pipe.siglip2_image_encoder = model_pool.fetch_model("siglip2_image_encoder")
|
||||
pipe.dinov3_image_encoder = model_pool.fetch_model("dinov3_image_encoder")
|
||||
pipe.image2lora_style = model_pool.fetch_model("z_image_image2lora_style")
|
||||
if tokenizer_config is not None:
|
||||
tokenizer_config.download_if_necessary()
|
||||
pipe.tokenizer = AutoTokenizer.from_pretrained(tokenizer_config.path)
|
||||
@@ -100,6 +110,9 @@ class ZImagePipeline(BasePipeline):
|
||||
sigma_shift: float = None,
|
||||
# ControlNet
|
||||
controlnet_inputs: List[ControlNetInput] = None,
|
||||
# Image to LoRA
|
||||
image2lora_images: List[Image.Image] = None,
|
||||
positive_only_lora: Dict[str, torch.Tensor] = None,
|
||||
# Progress bar
|
||||
progress_bar_cmd = tqdm,
|
||||
):
|
||||
@@ -121,6 +134,7 @@ class ZImagePipeline(BasePipeline):
|
||||
"num_inference_steps": num_inference_steps,
|
||||
"edit_image": edit_image, "edit_image_auto_resize": edit_image_auto_resize,
|
||||
"controlnet_inputs": controlnet_inputs,
|
||||
"image2lora_images": image2lora_images, "positive_only_lora": positive_only_lora,
|
||||
}
|
||||
for unit in self.units:
|
||||
inputs_shared, inputs_posi, inputs_nega = self.unit_runner(unit, self, inputs_shared, inputs_posi, inputs_nega)
|
||||
@@ -480,6 +494,71 @@ def model_fn_z_image(
|
||||
return model_output
|
||||
|
||||
|
||||
class ZImageUnit_Image2LoRAEncode(PipelineUnit):
    """Pipeline unit: encode reference images into joint SigLIP2+DINOv3 embeddings.

    Consumes `image2lora_images` and produces `image2lora_x`, which is later
    decoded into LoRA weights by ZImageUnit_Image2LoRADecode.
    """

    def __init__(self):
        super().__init__(
            input_params=("image2lora_images",),
            output_params=("image2lora_x",),
            onload_model_names=("siglip2_image_encoder", "dinov3_image_encoder",),
        )
        # NOTE(review): this local import duplicates the module-level import of
        # ImageCropAndResize — redundant but harmless; confirm and remove one.
        from ..core.data.operators import ImageCropAndResize
        # All images are cropped/resized to a fixed 1024x1024 before encoding.
        self.processor_highres = ImageCropAndResize(height=1024, width=1024)

    def encode_images_using_siglip2(self, pipe: ZImagePipeline, images: list[Image.Image]):
        """Encode each image with the SigLIP2 encoder; returns a stacked tensor."""
        pipe.load_models_to_device(["siglip2_image_encoder"])
        embs = []
        for image in images:
            image = self.processor_highres(image)
            embs.append(pipe.siglip2_image_encoder(image).to(pipe.torch_dtype))
        embs = torch.stack(embs)
        return embs

    def encode_images_using_dinov3(self, pipe: ZImagePipeline, images: list[Image.Image]):
        """Encode each image with the DINOv3 encoder; returns a stacked tensor."""
        pipe.load_models_to_device(["dinov3_image_encoder"])
        embs = []
        for image in images:
            image = self.processor_highres(image)
            embs.append(pipe.dinov3_image_encoder(image).to(pipe.torch_dtype))
        embs = torch.stack(embs)
        return embs

    def encode_images(self, pipe: ZImagePipeline, images: list[Image.Image]):
        """Concatenate SigLIP2 and DINOv3 embeddings along the last dimension."""
        if images is None:
            # NOTE(review): returns {} here but a tensor below; this branch is
            # unreachable via process(), which already guards against None —
            # confirm the intended sentinel.
            return {}
        if not isinstance(images, list):
            images = [images]
        embs_siglip2 = self.encode_images_using_siglip2(pipe, images)
        embs_dinov3 = self.encode_images_using_dinov3(pipe, images)
        x = torch.concat([embs_siglip2, embs_dinov3], dim=-1)
        return x

    def process(self, pipe: ZImagePipeline, image2lora_images):
        """Unit entry point: skip silently when no reference images are given."""
        if image2lora_images is None:
            return {}
        x = self.encode_images(pipe, image2lora_images)
        return {"image2lora_x": x}
|
||||
|
||||
|
||||
class ZImageUnit_Image2LoRADecode(PipelineUnit):
    """Pipeline unit: decode image embeddings (`image2lora_x`) into LoRA weights.

    Runs the image2lora_style model on each embedding and averages the
    resulting LoRA dicts via merge_lora.
    """

    def __init__(self):
        super().__init__(
            input_params=("image2lora_x",),
            output_params=("lora",),
            onload_model_names=("image2lora_style",),
        )

    def process(self, pipe: ZImagePipeline, image2lora_x):
        """Return {"lora": merged_state_dict}, or {} when there is no input."""
        if image2lora_x is None:
            return {}
        loras = []
        if pipe.image2lora_style is not None:
            pipe.load_models_to_device(["image2lora_style"])
            # One LoRA dict per image embedding.
            for x in image2lora_x:
                loras.append(pipe.image2lora_style(x=x, residual=None))
        # Average the per-image LoRAs with equal weight.
        # NOTE(review): if pipe.image2lora_style is None, `loras` is empty and
        # merge_lora receives an empty list — confirm that case is handled.
        lora = merge_lora(loras, alpha=1 / len(image2lora_x))
        return {"lora": lora}
|
||||
|
||||
|
||||
def model_fn_z_image_turbo(
|
||||
dit: ZImageDiT,
|
||||
controlnet: ZImageControlNet = None,
|
||||
|
||||
62
examples/z_image/model_inference/Z-Image-Omni-Base-i2L.py
Normal file
62
examples/z_image/model_inference/Z-Image-Omni-Base-i2L.py
Normal file
@@ -0,0 +1,62 @@
|
||||
# Example: build a style LoRA from reference images with Z-Image-Omni-Base-i2L,
# then apply it to the positive branch only during CFG sampling.
from diffsynth.pipelines.z_image import (
    ZImagePipeline, ModelConfig,
    ZImageUnit_Image2LoRAEncode, ZImageUnit_Image2LoRADecode
)
from modelscope import snapshot_download
from safetensors.torch import save_file
import torch
from PIL import Image

# Use `vram_config` to enable LoRA hot-loading
vram_config = {
    "offload_dtype": torch.bfloat16,
    "offload_device": "cuda",
    "onload_dtype": torch.bfloat16,
    "onload_device": "cuda",
    "preparing_dtype": torch.bfloat16,
    "preparing_device": "cuda",
    "computation_dtype": torch.bfloat16,
    "computation_device": "cuda",
}

# Load models: the DiT gets vram_config (hot-loadable); encoders/VAE/tokenizer
# are loaded with default settings.
pipe = ZImagePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="Tongyi-MAI/Z-Image-Omni-Base", origin_file_pattern="transformer/*.safetensors", **vram_config),
        ModelConfig(model_id="Tongyi-MAI/Z-Image-Omni-Base", origin_file_pattern="siglip/model.safetensors"),
        ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="text_encoder/*.safetensors"),
        ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
        ModelConfig(model_id="DiffSynth-Studio/General-Image-Encoders", origin_file_pattern="SigLIP2-G384/model.safetensors"),
        ModelConfig(model_id="DiffSynth-Studio/General-Image-Encoders", origin_file_pattern="DINOv3-7B/model.safetensors"),
        ModelConfig(model_id="DiffSynth-Studio/Z-Image-Omni-Base-i2L", origin_file_pattern="model.safetensors"),
    ],
    tokenizer_config=ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="tokenizer/"),
)

# Load images
snapshot_download(
    model_id="DiffSynth-Studio/Qwen-Image-i2L",
    allow_file_pattern="assets/style/*",
    local_dir="data/examples"
)
# NOTE(review): assets are downloaded under "data/examples" but read from
# "data/style/1/..." — confirm these paths actually line up on disk.
images = [Image.open(f"data/style/1/{i}.jpg") for i in range(5)]

# Image to LoRA: run the encode/decode units directly and save the result.
with torch.no_grad():
    embs = ZImageUnit_Image2LoRAEncode().process(pipe, image2lora_images=images)
    lora = ZImageUnit_Image2LoRADecode().process(pipe, **embs)["lora"]
    save_file(lora, "lora.safetensors")

# Generate images; `positive_only_lora` applies the LoRA to the positive
# CFG branch only.
prompt = "a cat"
negative_prompt = "泛黄,发绿,模糊,低分辨率,低质量图像,扭曲的肢体,诡异的外观,丑陋,AI感,噪点,网格感,JPEG压缩条纹,异常的肢体,水印,乱码,意义不明的字符"
image = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    seed=0, cfg_scale=7, num_inference_steps=50,
    positive_only_lora=lora,
    sigma_shift=8
)
image.save("image.jpg")
|
||||
@@ -0,0 +1,62 @@
|
||||
# Example (CPU-offload variant): build a style LoRA from reference images with
# Z-Image-Omni-Base-i2L, then apply it to the positive CFG branch only.
from diffsynth.pipelines.z_image import (
    ZImagePipeline, ModelConfig,
    ZImageUnit_Image2LoRAEncode, ZImageUnit_Image2LoRADecode
)
from modelscope import snapshot_download
from safetensors.torch import save_file
import torch
from PIL import Image

# Use `vram_config` to enable LoRA hot-loading
# Models are kept on CPU when idle and moved to CUDA for computation.
vram_config = {
    "offload_dtype": torch.bfloat16,
    "offload_device": "cpu",
    "onload_dtype": torch.bfloat16,
    "onload_device": "cpu",
    "preparing_dtype": torch.bfloat16,
    "preparing_device": "cuda",
    "computation_dtype": torch.bfloat16,
    "computation_device": "cuda",
}

# Load models; unlike the non-offload example, every model gets vram_config.
pipe = ZImagePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="Tongyi-MAI/Z-Image-Omni-Base", origin_file_pattern="transformer/*.safetensors", **vram_config),
        ModelConfig(model_id="Tongyi-MAI/Z-Image-Omni-Base", origin_file_pattern="siglip/model.safetensors", **vram_config),
        ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="text_encoder/*.safetensors", **vram_config),
        ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
        ModelConfig(model_id="DiffSynth-Studio/General-Image-Encoders", origin_file_pattern="SigLIP2-G384/model.safetensors", **vram_config),
        ModelConfig(model_id="DiffSynth-Studio/General-Image-Encoders", origin_file_pattern="DINOv3-7B/model.safetensors", **vram_config),
        ModelConfig(model_id="DiffSynth-Studio/Z-Image-Omni-Base-i2L", origin_file_pattern="model.safetensors", **vram_config),
    ],
    tokenizer_config=ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="tokenizer/"),
)

# Load images
snapshot_download(
    model_id="DiffSynth-Studio/Qwen-Image-i2L",
    allow_file_pattern="assets/style/*",
    local_dir="data/examples"
)
# NOTE(review): assets are downloaded under "data/examples" but read from
# "data/style/1/..." — confirm these paths actually line up on disk.
images = [Image.open(f"data/style/1/{i}.jpg") for i in range(5)]

# Image to LoRA: run the encode/decode units directly and save the result.
with torch.no_grad():
    embs = ZImageUnit_Image2LoRAEncode().process(pipe, image2lora_images=images)
    lora = ZImageUnit_Image2LoRADecode().process(pipe, **embs)["lora"]
    save_file(lora, "lora.safetensors")

# Generate images; `positive_only_lora` applies the LoRA to the positive
# CFG branch only.
prompt = "a cat"
negative_prompt = "泛黄,发绿,模糊,低分辨率,低质量图像,扭曲的肢体,诡异的外观,丑陋,AI感,噪点,网格感,JPEG压缩条纹,异常的肢体,水印,乱码,意义不明的字符"
image = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    seed=0, cfg_scale=7, num_inference_steps=50,
    positive_only_lora=lora,
    sigma_shift=8
)
image.save("image.jpg")
|
||||
Reference in New Issue
Block a user