support z-image and z-image-i2L

This commit is contained in:
Artiprocher
2026-01-27 10:56:15 +08:00
parent ffb7a138f7
commit d12bf71bcc
12 changed files with 266 additions and 4 deletions

View File

@@ -0,0 +1,61 @@
from diffsynth.pipelines.z_image import (
ZImagePipeline, ModelConfig,
ZImageUnit_Image2LoRAEncode, ZImageUnit_Image2LoRADecode
)
from modelscope import snapshot_download
from safetensors.torch import save_file
import torch
from PIL import Image
# Use `vram_config` to enable LoRA hot-loading
vram_config = {
"offload_dtype": torch.bfloat16,
"offload_device": "cpu",
"onload_dtype": torch.bfloat16,
"onload_device": "cpu",
"preparing_dtype": torch.bfloat16,
"preparing_device": "cuda",
"computation_dtype": torch.bfloat16,
"computation_device": "cuda",
}
# Load models
pipe = ZImagePipeline.from_pretrained(
torch_dtype=torch.bfloat16,
device="cuda",
model_configs=[
ModelConfig(model_id="Tongyi-MAI/Z-Image", origin_file_pattern="transformer/*.safetensors", **vram_config),
ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="text_encoder/*.safetensors", **vram_config),
ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
ModelConfig(model_id="DiffSynth-Studio/General-Image-Encoders", origin_file_pattern="SigLIP2-G384/model.safetensors", **vram_config),
ModelConfig(model_id="DiffSynth-Studio/General-Image-Encoders", origin_file_pattern="DINOv3-7B/model.safetensors", **vram_config),
ModelConfig(model_id="DiffSynth-Studio/Z-Image-i2L", origin_file_pattern="model.safetensors", **vram_config),
],
tokenizer_config=ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="tokenizer/"),
)
# Load images
snapshot_download(
model_id="DiffSynth-Studio/Z-Image-i2L",
allow_file_pattern="assets/style/*",
local_dir="data/style_input"
)
images = [Image.open(f"data/style_input/assets/style/1/{i}.jpg") for i in range(6)]
# Image to LoRA
with torch.no_grad():
embs = ZImageUnit_Image2LoRAEncode().process(pipe, image2lora_images=images)
lora = ZImageUnit_Image2LoRADecode().process(pipe, **embs)["lora"]
save_file(lora, "lora.safetensors")
# Generate images
prompt = "a cat"
negative_prompt = "泛黄发绿模糊低分辨率低质量图像扭曲的肢体诡异的外观丑陋AI感噪点网格感JPEG压缩条纹异常的肢体水印乱码意义不明的字符"
image = pipe(
prompt=prompt,
negative_prompt=negative_prompt,
seed=0, cfg_scale=7, num_inference_steps=50,
positive_only_lora=lora,
sigma_shift=8
)
image.save("image.jpg")

View File

@@ -0,0 +1,26 @@
from diffsynth.pipelines.z_image import ZImagePipeline, ModelConfig
import torch
vram_config = {
"offload_dtype": torch.bfloat16,
"offload_device": "cpu",
"onload_dtype": torch.bfloat16,
"onload_device": "cpu",
"preparing_dtype": torch.bfloat16,
"preparing_device": "cuda",
"computation_dtype": torch.bfloat16,
"computation_device": "cuda",
}
pipe = ZImagePipeline.from_pretrained(
torch_dtype=torch.bfloat16,
device="cuda",
model_configs=[
ModelConfig(model_id="Tongyi-MAI/Z-Image", origin_file_pattern="transformer/*.safetensors", **vram_config),
ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="text_encoder/*.safetensors", **vram_config),
ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
],
tokenizer_config=ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="tokenizer/"),
)
prompt = "Young Chinese woman in red Hanfu, intricate embroidery. Impeccable makeup, red floral forehead pattern. Elaborate high bun, golden phoenix headdress, red flowers, beads. Holds round folding fan with lady, trees, bird. Neon lightning-bolt lamp (⚡️), bright yellow glow, above extended left palm. Soft-lit outdoor night background, silhouetted tiered pagoda (西安大雁塔), blurred colorful distant lights."
image = pipe(prompt=prompt, seed=42, rand_device="cuda", num_inference_steps=50, cfg_scale=4)
image.save("image_Z-Image.jpg")