From 1d76d5e828ec108b80885e2bfa90dbe9129df16a Mon Sep 17 00:00:00 2001 From: Artiprocher Date: Tue, 12 Aug 2025 17:17:08 +0800 Subject: [PATCH] support qwen-image controlnet --- README.md | 4 + README_zh.md | 3 + diffsynth/configs/model_config.py | 2 + diffsynth/models/qwen_image_controlnet.py | 71 ++++++++++++++ diffsynth/pipelines/qwen_image.py | 97 ++++++++++++++++++- examples/qwen_image/README.md | 1 + examples/qwen_image/README_zh.md | 1 + .../Qwen-Image-Blockwise-ControlNet-Canny.py | 31 ++++++ 8 files changed, 208 insertions(+), 2 deletions(-) create mode 100644 diffsynth/models/qwen_image_controlnet.py create mode 100644 examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py diff --git a/README.md b/README.md index 41a8801..80e70bc 100644 --- a/README.md +++ b/README.md @@ -93,6 +93,8 @@ image.save("image.jpg") |[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./examples/qwen_image/model_inference/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-Full.py)| |[DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA)|[code](./examples/qwen_image/model_inference/Qwen-Image-Distill-LoRA.py)|-|-|-|-| |[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)| 
+|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny)|[code](./examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py)|-|-|-|-| + ### FLUX Series @@ -365,6 +367,8 @@ https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/59fb2f7b-8de0-44 ## Update History +- **August 12, 2025** We trained and open-sourced the ControlNet model for Qwen-Image, [DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny), which adopts a lightweight architectural design. Please refer to [our sample code](./examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py). + - **August 11, 2025** We released another distilled acceleration model for Qwen-Image, [DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA). It uses the same training process as [DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full), but the model structure is changed to LoRA. This makes it work better with other open-source models. - **August 7, 2025** We open-sourced the entity control LoRA of Qwen-Image, [DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen). Qwen-Image-EliGen is able to achieve entity-level controlled text-to-image generation. See the [paper](https://arxiv.org/abs/2501.01097) for technical details. Training dataset: [EliGenTrainSet](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet). 
diff --git a/README_zh.md b/README_zh.md index 005d76a..22a28b3 100644 --- a/README_zh.md +++ b/README_zh.md @@ -95,6 +95,7 @@ image.save("image.jpg") |[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./examples/qwen_image/model_inference/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-Full.py)| |[DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA)|[code](./examples/qwen_image/model_inference/Qwen-Image-Distill-LoRA.py)|-|-|-|-| |[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)| +|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny)|[code](./examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py)|-|-|-|-| @@ -382,6 +383,8 @@ https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/59fb2f7b-8de0-44 ## 更新历史 +- **2025年8月12日** 我们训练并开源了 Qwen-Image 的 ControlNet 模型 [DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny),模型结构采用了轻量化的设计,请参考[我们的示例代码](./examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py)。 + - **2025年8月11日** 我们开源了 Qwen-Image 的蒸馏加速模型 
[DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA),沿用了与 [DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full) 相同的训练流程,但模型结构修改为了 LoRA,因此能够更好地与其他开源生态模型兼容。 - **2025年8月7日** 我们开源了 Qwen-Image 的实体控制 LoRA 模型 [DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)。Qwen-Image-EliGen 能够实现实体级可控的文生图。技术细节请参见[论文](https://arxiv.org/abs/2501.01097)。训练数据集:[EliGenTrainSet](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet)。 diff --git a/diffsynth/configs/model_config.py b/diffsynth/configs/model_config.py index e328593..f2da874 100644 --- a/diffsynth/configs/model_config.py +++ b/diffsynth/configs/model_config.py @@ -75,6 +75,7 @@ from ..models.nexus_gen import NexusGenAutoregressiveModel from ..models.qwen_image_dit import QwenImageDiT from ..models.qwen_image_text_encoder import QwenImageTextEncoder from ..models.qwen_image_vae import QwenImageVAE +from ..models.qwen_image_controlnet import QwenImageBlockWiseControlNet model_loader_configs = [ # These configs are provided for detecting model type automatically. @@ -167,6 +168,7 @@ model_loader_configs = [ (None, "0319a1cb19835fb510907dd3367c95ff", ["qwen_image_dit"], [QwenImageDiT], "civitai"), (None, "8004730443f55db63092006dd9f7110e", ["qwen_image_text_encoder"], [QwenImageTextEncoder], "diffusers"), (None, "ed4ea5824d55ec3107b09815e318123a", ["qwen_image_vae"], [QwenImageVAE], "diffusers"), + (None, "073bce9cf969e317e5662cd570c3e79c", ["qwen_image_blockwise_controlnet"], [QwenImageBlockWiseControlNet], "civitai"), ] huggingface_model_loader_configs = [ # These configs are provided for detecting model type automatically. 
diff --git a/diffsynth/models/qwen_image_controlnet.py b/diffsynth/models/qwen_image_controlnet.py new file mode 100644 index 0000000..31a8148 --- /dev/null +++ b/diffsynth/models/qwen_image_controlnet.py @@ -0,0 +1,71 @@ +import torch +import torch.nn as nn +from .qwen_image_dit import QwenEmbedRope, QwenImageTransformerBlock +from ..vram_management import gradient_checkpoint_forward +from einops import rearrange +from .sd3_dit import TimestepEmbeddings, RMSNorm + + + +class BlockWiseControlBlock(torch.nn.Module): + # [linear, gelu, linear] + def __init__(self, dim: int = 3072): + super().__init__() + self.x_rms = RMSNorm(dim, eps=1e-6) + self.y_rms = RMSNorm(dim, eps=1e-6) + self.input_proj = nn.Linear(dim, dim) + self.act = nn.GELU() + self.output_proj = nn.Linear(dim, dim) + + def forward(self, x, y): + x, y = self.x_rms(x), self.y_rms(y) + x = self.input_proj(x + y) + x = self.act(x) + x = self.output_proj(x) + return x + + def init_weights(self): + # zero initialize output_proj + nn.init.zeros_(self.output_proj.weight) + nn.init.zeros_(self.output_proj.bias) + + +class QwenImageBlockWiseControlNet(torch.nn.Module): + def __init__( + self, + num_layers: int = 60, + in_dim: int = 64, + dim: int = 3072, + ): + super().__init__() + self.img_in = nn.Linear(in_dim, dim) + self.controlnet_blocks = nn.ModuleList( + [ + BlockWiseControlBlock(dim) + for _ in range(num_layers) + ] + ) + + def init_weight(self): + nn.init.zeros_(self.img_in.weight) + nn.init.zeros_(self.img_in.bias) + for block in self.controlnet_blocks: + block.init_weights() + + def process_controlnet_conditioning(self, controlnet_conditioning): + return self.img_in(controlnet_conditioning) + + def blockwise_forward(self, img, controlnet_conditioning, block_id): + return self.controlnet_blocks[block_id](img, controlnet_conditioning) + + @staticmethod + def state_dict_converter(): + return QwenImageBlockWiseControlNetStateDictConverter() + + +class QwenImageBlockWiseControlNetStateDictConverter(): + def 
__init__(self): + pass + + def from_civitai(self, state_dict): + return state_dict diff --git a/diffsynth/pipelines/qwen_image.py b/diffsynth/pipelines/qwen_image.py index 3e952c0..c475415 100644 --- a/diffsynth/pipelines/qwen_image.py +++ b/diffsynth/pipelines/qwen_image.py @@ -4,18 +4,46 @@ from typing import Union from PIL import Image from tqdm import tqdm from einops import rearrange +import numpy as np from ..models import ModelManager, load_state_dict from ..models.qwen_image_dit import QwenImageDiT from ..models.qwen_image_text_encoder import QwenImageTextEncoder from ..models.qwen_image_vae import QwenImageVAE +from ..models.qwen_image_controlnet import QwenImageBlockWiseControlNet from ..schedulers import FlowMatchScheduler from ..utils import BasePipeline, ModelConfig, PipelineUnitRunner, PipelineUnit from ..lora import GeneralLoRALoader +from .flux_image_new import ControlNetInput from ..vram_management import gradient_checkpoint_forward, enable_vram_management, AutoWrappedModule, AutoWrappedLinear +class QwenImageBlockwiseMultiControlNet(torch.nn.Module): + def __init__(self, models: list[QwenImageBlockWiseControlNet]): + super().__init__() + if not isinstance(models, list): + models = [models] + self.models = torch.nn.ModuleList(models) + + def preprocess(self, controlnet_inputs: list[ControlNetInput], conditionings: list[torch.Tensor], **kwargs): + processed_conditionings = [] + for controlnet_input, conditioning in zip(controlnet_inputs, conditionings): + conditioning = rearrange(conditioning, "B C (H P) (W Q) -> B (H W) (C P Q)", P=2, Q=2) + model_output = self.models[controlnet_input.controlnet_id].process_controlnet_conditioning(conditioning) + processed_conditionings.append(model_output) + return processed_conditionings + + def blockwise_forward(self, image, conditionings: list[torch.Tensor], controlnet_inputs: list[ControlNetInput], progress_id, num_inference_steps, block_id, **kwargs): + res = 0 + for controlnet_input, conditioning in 
zip(controlnet_inputs, conditionings): + progress = (num_inference_steps - 1 - progress_id) / max(num_inference_steps - 1, 1) + if progress > controlnet_input.start + (1e-4) or progress < controlnet_input.end - (1e-4): + continue + model_output = self.models[controlnet_input.controlnet_id].blockwise_forward(image, conditioning, block_id) + res = res + model_output * controlnet_input.scale + return res + class QwenImagePipeline(BasePipeline): @@ -30,15 +58,17 @@ class QwenImagePipeline(BasePipeline): self.text_encoder: QwenImageTextEncoder = None self.dit: QwenImageDiT = None self.vae: QwenImageVAE = None + self.blockwise_controlnet: QwenImageBlockwiseMultiControlNet = None self.tokenizer: Qwen2Tokenizer = None self.unit_runner = PipelineUnitRunner() - self.in_iteration_models = ("dit",) + self.in_iteration_models = ("dit", "blockwise_controlnet") self.units = [ QwenImageUnit_ShapeChecker(), QwenImageUnit_NoiseInitializer(), QwenImageUnit_InputImageEmbedder(), QwenImageUnit_PromptEmbedder(), QwenImageUnit_EntityControl(), + QwenImageUnit_BlockwiseControlNet(), ] self.model_fn = model_fn_qwen_image @@ -187,6 +217,7 @@ class QwenImagePipeline(BasePipeline): pipe.text_encoder = model_manager.fetch_model("qwen_image_text_encoder") pipe.dit = model_manager.fetch_model("qwen_image_dit") pipe.vae = model_manager.fetch_model("qwen_image_vae") + pipe.blockwise_controlnet = QwenImageBlockwiseMultiControlNet(model_manager.fetch_model("qwen_image_blockwise_controlnet", index="all")) if tokenizer_config is not None and pipe.text_encoder is not None: tokenizer_config.download_if_necessary() from transformers import Qwen2Tokenizer @@ -212,6 +243,8 @@ class QwenImagePipeline(BasePipeline): rand_device: str = "cpu", # Steps num_inference_steps: int = 30, + # Blockwise ControlNet + blockwise_controlnet_inputs: list[ControlNetInput] = None, # EliGen eligen_entity_prompts: list[str] = None, eligen_entity_masks: list[Image.Image] = None, @@ -241,6 +274,8 @@ class 
QwenImagePipeline(BasePipeline): "height": height, "width": width, "seed": seed, "rand_device": rand_device, "enable_fp8_attention": enable_fp8_attention, + "num_inference_steps": num_inference_steps, + "blockwise_controlnet_inputs": blockwise_controlnet_inputs, "tiled": tiled, "tile_size": tile_size, "tile_stride": tile_stride, "eligen_entity_prompts": eligen_entity_prompts, "eligen_entity_masks": eligen_entity_masks, "eligen_enable_on_negative": eligen_enable_on_negative, } @@ -431,14 +466,62 @@ class QwenImageUnit_EntityControl(PipelineUnit): return inputs_shared, inputs_posi, inputs_nega + +class QwenImageUnit_BlockwiseControlNet(PipelineUnit): + def __init__(self): + super().__init__( + input_params=("blockwise_controlnet_inputs", "tiled", "tile_size", "tile_stride"), + onload_model_names=("vae",) + ) + + def apply_controlnet_mask_on_latents(self, pipe, latents, mask): + mask = (pipe.preprocess_image(mask) + 1) / 2 + mask = mask.mean(dim=1, keepdim=True) + mask = 1 - torch.nn.functional.interpolate(mask, size=latents.shape[-2:]) + latents = torch.concat([latents, mask], dim=1) + return latents + + def apply_controlnet_mask_on_image(self, pipe, image, mask): + mask = mask.resize(image.size) + mask = pipe.preprocess_image(mask).mean(dim=[0, 1]).cpu() + image = np.array(image) + image[mask > 0] = 0 + image = Image.fromarray(image) + return image + + def process(self, pipe: QwenImagePipeline, blockwise_controlnet_inputs: list[ControlNetInput], tiled, tile_size, tile_stride): + if blockwise_controlnet_inputs is None: + return {} + pipe.load_models_to_device(self.onload_model_names) + conditionings = [] + for controlnet_input in blockwise_controlnet_inputs: + image = controlnet_input.image + if controlnet_input.inpaint_mask is not None: + image = self.apply_controlnet_mask_on_image(pipe, image, controlnet_input.inpaint_mask) + + image = pipe.preprocess_image(image).to(device=pipe.device, dtype=pipe.torch_dtype) + image = pipe.vae.encode(image, tiled=tiled, 
tile_size=tile_size, tile_stride=tile_stride) + + if controlnet_input.inpaint_mask is not None: + image = self.apply_controlnet_mask_on_latents(pipe, image, controlnet_input.inpaint_mask) + conditionings.append(image) + + return {"blockwise_controlnet_conditioning": conditionings} + + def model_fn_qwen_image( dit: QwenImageDiT = None, + blockwise_controlnet: QwenImageBlockwiseMultiControlNet = None, latents=None, timestep=None, prompt_emb=None, prompt_emb_mask=None, height=None, width=None, + blockwise_controlnet_conditioning=None, + blockwise_controlnet_inputs=None, + progress_id=0, + num_inference_steps=1, entity_prompt_emb=None, entity_prompt_emb_mask=None, entity_masks=None, @@ -465,8 +548,12 @@ def model_fn_qwen_image( text = dit.txt_in(dit.txt_norm(prompt_emb)) image_rotary_emb = dit.pos_embed(img_shapes, txt_seq_lens, device=latents.device) attention_mask = None + + if blockwise_controlnet_conditioning is not None: + blockwise_controlnet_conditioning = blockwise_controlnet.preprocess( + blockwise_controlnet_inputs, blockwise_controlnet_conditioning) - for block in dit.transformer_blocks: + for block_id, block in enumerate(dit.transformer_blocks): text, image = gradient_checkpoint_forward( block, use_gradient_checkpointing, @@ -478,6 +565,12 @@ def model_fn_qwen_image( attention_mask=attention_mask, enable_fp8_attention=enable_fp8_attention, ) + if blockwise_controlnet_conditioning is not None: + image = image + blockwise_controlnet.blockwise_forward( + image=image, conditionings=blockwise_controlnet_conditioning, + controlnet_inputs=blockwise_controlnet_inputs, block_id=block_id, + progress_id=progress_id, num_inference_steps=num_inference_steps, + ) image = dit.norm_out(image, conditioning) image = dit.proj_out(image) diff --git a/examples/qwen_image/README.md b/examples/qwen_image/README.md index bd5c11d..bfc1126 100644 --- a/examples/qwen_image/README.md +++ b/examples/qwen_image/README.md @@ -46,6 +46,7 @@ image.save("image.jpg") 
|[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./model_inference/Qwen-Image-Distill-Full.py)|[code](./model_training/full/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_lora/Qwen-Image-Distill-Full.py)| |[DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA)|[code](./model_inference/Qwen-Image-Distill-LoRA.py)|-|-|-|-| |[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./model_inference/Qwen-Image-EliGen.py)|-|-|[code](./model_training/lora/Qwen-Image-EliGen.sh)|[code](./model_training/validate_lora/Qwen-Image-EliGen.py)| +|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny)|[code](./model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py)|-|-|-|-| ## Model Inference diff --git a/examples/qwen_image/README_zh.md b/examples/qwen_image/README_zh.md index 6d7c8e6..70534ef 100644 --- a/examples/qwen_image/README_zh.md +++ b/examples/qwen_image/README_zh.md @@ -46,6 +46,7 @@ image.save("image.jpg") |[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./model_inference/Qwen-Image-Distill-Full.py)|[code](./model_training/full/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_lora/Qwen-Image-Distill-Full.py)| |[DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA)|[code](./model_inference/Qwen-Image-Distill-LoRA.py)|-|-|-|-| 
|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./model_inference/Qwen-Image-EliGen.py)|-|-|[code](./model_training/lora/Qwen-Image-EliGen.sh)|[code](./model_training/validate_lora/Qwen-Image-EliGen.py)| +|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny)|[code](./model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py)|-|-|-|-| ## 模型推理 diff --git a/examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py b/examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py new file mode 100644 index 0000000..5ae223a --- /dev/null +++ b/examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py @@ -0,0 +1,31 @@ +from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig, ControlNetInput +from PIL import Image +import torch +from modelscope import dataset_snapshot_download + + +pipe = QwenImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), + ModelConfig(model_id="DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny", origin_file_pattern="model.safetensors"), + ], + tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), +) + +dataset_snapshot_download( + dataset_id="DiffSynth-Studio/example_image_dataset", + local_dir="./", + allow_file_pattern="data/example_image_dataset/canny/image_1.jpg" +) +controlnet_image = Image.open("data/example_image_dataset/canny/image_1.jpg").resize((1328, 1328)) + +prompt = "一只小狗,毛发光洁柔顺,眼神灵动,背景是樱花纷飞的春日庭院,唯美温馨。" +image = pipe( 
+ prompt, seed=0, + blockwise_controlnet_inputs=[ControlNetInput(image=controlnet_image)] +) +image.save("image.jpg")