mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-03-23 17:38:10 +00:00
support qwen-image controlnet
This commit is contained in:
@@ -93,6 +93,8 @@ image.save("image.jpg")
|
|||||||
|[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./examples/qwen_image/model_inference/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-Full.py)|
|
|[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./examples/qwen_image/model_inference/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-Full.py)|
|
||||||
|[DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA)|[code](./examples/qwen_image/model_inference/Qwen-Image-Distill-LoRA.py)|-|-|-|-|
|
|[DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA)|[code](./examples/qwen_image/model_inference/Qwen-Image-Distill-LoRA.py)|-|-|-|-|
|
||||||
|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)|
|
|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)|
|
||||||
|
|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny)|[code](./examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py)|-|-|-|-|
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
### FLUX Series
|
### FLUX Series
|
||||||
@@ -365,6 +367,8 @@ https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/59fb2f7b-8de0-44
|
|||||||
|
|
||||||
## Update History
|
## Update History
|
||||||
|
|
||||||
|
- **August 12, 2025**: We trained and open-sourced the ControlNet model for Qwen-Image, [DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny), which adopts a lightweight architectural design. Please refer to [our sample code](./examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py).
|
||||||
|
|
||||||
- **August 11, 2025** We released another distilled acceleration model for Qwen-Image, [DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA). It uses the same training process as [DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full), but the model structure is changed to LoRA. This makes it work better with other open-source models.
|
- **August 11, 2025** We released another distilled acceleration model for Qwen-Image, [DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA). It uses the same training process as [DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full), but the model structure is changed to LoRA. This makes it work better with other open-source models.
|
||||||
|
|
||||||
- **August 7, 2025** We open-sourced the entity control LoRA of Qwen-Image, [DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen). Qwen-Image-EliGen is able to achieve entity-level controlled text-to-image generation. See the [paper](https://arxiv.org/abs/2501.01097) for technical details. Training dataset: [EliGenTrainSet](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet).
|
- **August 7, 2025** We open-sourced the entity control LoRA of Qwen-Image, [DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen). Qwen-Image-EliGen is able to achieve entity-level controlled text-to-image generation. See the [paper](https://arxiv.org/abs/2501.01097) for technical details. Training dataset: [EliGenTrainSet](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet).
|
||||||
|
|||||||
@@ -95,6 +95,7 @@ image.save("image.jpg")
|
|||||||
|[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./examples/qwen_image/model_inference/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-Full.py)|
|
|[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./examples/qwen_image/model_inference/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-Full.py)|
|
||||||
|[DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA)|[code](./examples/qwen_image/model_inference/Qwen-Image-Distill-LoRA.py)|-|-|-|-|
|
|[DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA)|[code](./examples/qwen_image/model_inference/Qwen-Image-Distill-LoRA.py)|-|-|-|-|
|
||||||
|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)|
|
|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)|
|
||||||
|
|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny)|[code](./examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py)|-|-|-|-|
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
@@ -382,6 +383,8 @@ https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/59fb2f7b-8de0-44
|
|||||||
|
|
||||||
## 更新历史
|
## 更新历史
|
||||||
|
|
||||||
|
- **2025年8月12日** 我们训练并开源了 Qwen-Image 的 ControlNet 模型 [DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny),模型结构采用了轻量化的设计,请参考[我们的示例代码](./examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py)。
|
||||||
|
|
||||||
- **2025年8月11日** 我们开源了 Qwen-Image 的蒸馏加速模型 [DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA),沿用了与 [DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full) 相同的训练流程,但模型结构修改为了 LoRA,因此能够更好地与其他开源生态模型兼容。
|
- **2025年8月11日** 我们开源了 Qwen-Image 的蒸馏加速模型 [DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA),沿用了与 [DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full) 相同的训练流程,但模型结构修改为了 LoRA,因此能够更好地与其他开源生态模型兼容。
|
||||||
|
|
||||||
- **2025年8月7日** 我们开源了 Qwen-Image 的实体控制 LoRA 模型 [DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)。Qwen-Image-EliGen 能够实现实体级可控的文生图。技术细节请参见[论文](https://arxiv.org/abs/2501.01097)。训练数据集:[EliGenTrainSet](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet)。
|
- **2025年8月7日** 我们开源了 Qwen-Image 的实体控制 LoRA 模型 [DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)。Qwen-Image-EliGen 能够实现实体级可控的文生图。技术细节请参见[论文](https://arxiv.org/abs/2501.01097)。训练数据集:[EliGenTrainSet](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet)。
|
||||||
|
|||||||
@@ -75,6 +75,7 @@ from ..models.nexus_gen import NexusGenAutoregressiveModel
|
|||||||
from ..models.qwen_image_dit import QwenImageDiT
|
from ..models.qwen_image_dit import QwenImageDiT
|
||||||
from ..models.qwen_image_text_encoder import QwenImageTextEncoder
|
from ..models.qwen_image_text_encoder import QwenImageTextEncoder
|
||||||
from ..models.qwen_image_vae import QwenImageVAE
|
from ..models.qwen_image_vae import QwenImageVAE
|
||||||
|
from ..models.qwen_image_controlnet import QwenImageBlockWiseControlNet
|
||||||
|
|
||||||
model_loader_configs = [
|
model_loader_configs = [
|
||||||
# These configs are provided for detecting model type automatically.
|
# These configs are provided for detecting model type automatically.
|
||||||
@@ -167,6 +168,7 @@ model_loader_configs = [
|
|||||||
(None, "0319a1cb19835fb510907dd3367c95ff", ["qwen_image_dit"], [QwenImageDiT], "civitai"),
|
(None, "0319a1cb19835fb510907dd3367c95ff", ["qwen_image_dit"], [QwenImageDiT], "civitai"),
|
||||||
(None, "8004730443f55db63092006dd9f7110e", ["qwen_image_text_encoder"], [QwenImageTextEncoder], "diffusers"),
|
(None, "8004730443f55db63092006dd9f7110e", ["qwen_image_text_encoder"], [QwenImageTextEncoder], "diffusers"),
|
||||||
(None, "ed4ea5824d55ec3107b09815e318123a", ["qwen_image_vae"], [QwenImageVAE], "diffusers"),
|
(None, "ed4ea5824d55ec3107b09815e318123a", ["qwen_image_vae"], [QwenImageVAE], "diffusers"),
|
||||||
|
(None, "073bce9cf969e317e5662cd570c3e79c", ["qwen_image_blockwise_controlnet"], [QwenImageBlockWiseControlNet], "civitai"),
|
||||||
]
|
]
|
||||||
huggingface_model_loader_configs = [
|
huggingface_model_loader_configs = [
|
||||||
# These configs are provided for detecting model type automatically.
|
# These configs are provided for detecting model type automatically.
|
||||||
|
|||||||
71
diffsynth/models/qwen_image_controlnet.py
Normal file
71
diffsynth/models/qwen_image_controlnet.py
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from .qwen_image_dit import QwenEmbedRope, QwenImageTransformerBlock
|
||||||
|
from ..vram_management import gradient_checkpoint_forward
|
||||||
|
from einops import rearrange
|
||||||
|
from .sd3_dit import TimestepEmbeddings, RMSNorm
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class BlockWiseControlBlock(torch.nn.Module):
    """Lightweight per-layer control block: RMSNorm both streams, then
    linear -> GELU -> linear on their sum.

    One of these is applied after each DiT transformer block to inject the
    ControlNet conditioning signal into the image stream.
    """

    def __init__(self, dim: int = 3072):
        super().__init__()
        self.x_rms = RMSNorm(dim, eps=1e-6)
        self.y_rms = RMSNorm(dim, eps=1e-6)
        self.input_proj = nn.Linear(dim, dim)
        self.act = nn.GELU()
        self.output_proj = nn.Linear(dim, dim)

    def forward(self, x, y):
        # Normalize the image stream (x) and conditioning stream (y)
        # independently, fuse by addition, then run the small MLP.
        x, y = self.x_rms(x), self.y_rms(y)
        return self.output_proj(self.act(self.input_proj(x + y)))

    def init_weights(self):
        # Zero-initialize the output projection so the block contributes
        # nothing at the start of training (standard ControlNet-style init).
        nn.init.zeros_(self.output_proj.weight)
        nn.init.zeros_(self.output_proj.bias)
|
class QwenImageBlockWiseControlNet(torch.nn.Module):
    """Blockwise ControlNet for Qwen-Image: one control block per DiT layer.

    The conditioning latents are embedded once (``img_in``) and then fed to a
    ``BlockWiseControlBlock`` matching each transformer block of the base DiT.
    """

    def __init__(
        self,
        num_layers: int = 60,
        in_dim: int = 64,
        dim: int = 3072,
    ):
        super().__init__()
        # Projects packed conditioning latents (in_dim channels) into the
        # DiT hidden size.
        self.img_in = nn.Linear(in_dim, dim)
        # One control block per transformer block of the base model.
        self.controlnet_blocks = nn.ModuleList(
            BlockWiseControlBlock(dim) for _ in range(num_layers)
        )

    def init_weight(self):
        # NOTE(review): singular name kept for backward compatibility even
        # though BlockWiseControlBlock uses ``init_weights``.
        nn.init.zeros_(self.img_in.weight)
        nn.init.zeros_(self.img_in.bias)
        for control_block in self.controlnet_blocks:
            control_block.init_weights()

    def process_controlnet_conditioning(self, controlnet_conditioning):
        """Embed the packed conditioning latents into the DiT hidden space."""
        return self.img_in(controlnet_conditioning)

    def blockwise_forward(self, img, controlnet_conditioning, block_id):
        """Run the control block that matches DiT layer ``block_id``."""
        return self.controlnet_blocks[block_id](img, controlnet_conditioning)

    @staticmethod
    def state_dict_converter():
        return QwenImageBlockWiseControlNetStateDictConverter()
|
|
||||||
|
|
||||||
|
class QwenImageBlockWiseControlNetStateDictConverter:
    """State-dict converter for QwenImageBlockWiseControlNet checkpoints."""

    def from_civitai(self, state_dict):
        # Checkpoint keys already match the module layout; pass through as-is.
        return state_dict
@@ -4,18 +4,46 @@ from typing import Union
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
from einops import rearrange
|
from einops import rearrange
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
from ..models import ModelManager, load_state_dict
|
from ..models import ModelManager, load_state_dict
|
||||||
from ..models.qwen_image_dit import QwenImageDiT
|
from ..models.qwen_image_dit import QwenImageDiT
|
||||||
from ..models.qwen_image_text_encoder import QwenImageTextEncoder
|
from ..models.qwen_image_text_encoder import QwenImageTextEncoder
|
||||||
from ..models.qwen_image_vae import QwenImageVAE
|
from ..models.qwen_image_vae import QwenImageVAE
|
||||||
|
from ..models.qwen_image_controlnet import QwenImageBlockWiseControlNet
|
||||||
from ..schedulers import FlowMatchScheduler
|
from ..schedulers import FlowMatchScheduler
|
||||||
from ..utils import BasePipeline, ModelConfig, PipelineUnitRunner, PipelineUnit
|
from ..utils import BasePipeline, ModelConfig, PipelineUnitRunner, PipelineUnit
|
||||||
from ..lora import GeneralLoRALoader
|
from ..lora import GeneralLoRALoader
|
||||||
|
from .flux_image_new import ControlNetInput
|
||||||
|
|
||||||
from ..vram_management import gradient_checkpoint_forward, enable_vram_management, AutoWrappedModule, AutoWrappedLinear
|
from ..vram_management import gradient_checkpoint_forward, enable_vram_management, AutoWrappedModule, AutoWrappedLinear
|
||||||
|
|
||||||
|
|
||||||
|
class QwenImageBlockwiseMultiControlNet(torch.nn.Module):
    """Wraps one or more ``QwenImageBlockWiseControlNet`` models and routes
    each ``ControlNetInput`` to the model selected by its ``controlnet_id``.
    """

    def __init__(self, models: list[QwenImageBlockWiseControlNet]):
        super().__init__()
        # Accept a bare model as a convenience; always store a ModuleList.
        if not isinstance(models, list):
            models = [models]
        self.models = torch.nn.ModuleList(models)

    def preprocess(self, controlnet_inputs: list[ControlNetInput], conditionings: list[torch.Tensor], **kwargs):
        """Pack each conditioning latent (2x2 patchify) and embed it with its model."""
        processed = []
        for cn_input, cond in zip(controlnet_inputs, conditionings):
            # B C (H 2) (W 2) -> B (H W) (C 2 2): the same packing the DiT
            # applies to its image latents.
            packed = rearrange(cond, "B C (H P) (W Q) -> B (H W) (C P Q)", P=2, Q=2)
            embedded = self.models[cn_input.controlnet_id].process_controlnet_conditioning(packed)
            processed.append(embedded)
        return processed

    def blockwise_forward(self, image, conditionings: list[torch.Tensor], controlnet_inputs: list[ControlNetInput], progress_id, num_inference_steps, block_id, **kwargs):
        """Sum the scaled residuals of every active ControlNet at this DiT block."""
        res = 0
        for cn_input, cond in zip(controlnet_inputs, conditionings):
            # progress runs from 1 down to 0 across the denoising schedule.
            progress = (num_inference_steps - 1 - progress_id) / max(num_inference_steps - 1, 1)
            # Skip this ControlNet outside its [end, start] active window;
            # the epsilon guards float comparison at the window edges.
            if progress > cn_input.start + 1e-4 or progress < cn_input.end - 1e-4:
                continue
            contribution = self.models[cn_input.controlnet_id].blockwise_forward(image, cond, block_id)
            res = res + contribution * cn_input.scale
        return res
|
|
||||||
|
|
||||||
class QwenImagePipeline(BasePipeline):
|
class QwenImagePipeline(BasePipeline):
|
||||||
|
|
||||||
@@ -30,15 +58,17 @@ class QwenImagePipeline(BasePipeline):
|
|||||||
self.text_encoder: QwenImageTextEncoder = None
|
self.text_encoder: QwenImageTextEncoder = None
|
||||||
self.dit: QwenImageDiT = None
|
self.dit: QwenImageDiT = None
|
||||||
self.vae: QwenImageVAE = None
|
self.vae: QwenImageVAE = None
|
||||||
|
self.blockwise_controlnet: QwenImageBlockwiseMultiControlNet = None
|
||||||
self.tokenizer: Qwen2Tokenizer = None
|
self.tokenizer: Qwen2Tokenizer = None
|
||||||
self.unit_runner = PipelineUnitRunner()
|
self.unit_runner = PipelineUnitRunner()
|
||||||
self.in_iteration_models = ("dit",)
|
self.in_iteration_models = ("dit", "blockwise_controlnet")
|
||||||
self.units = [
|
self.units = [
|
||||||
QwenImageUnit_ShapeChecker(),
|
QwenImageUnit_ShapeChecker(),
|
||||||
QwenImageUnit_NoiseInitializer(),
|
QwenImageUnit_NoiseInitializer(),
|
||||||
QwenImageUnit_InputImageEmbedder(),
|
QwenImageUnit_InputImageEmbedder(),
|
||||||
QwenImageUnit_PromptEmbedder(),
|
QwenImageUnit_PromptEmbedder(),
|
||||||
QwenImageUnit_EntityControl(),
|
QwenImageUnit_EntityControl(),
|
||||||
|
QwenImageUnit_BlockwiseControlNet(),
|
||||||
]
|
]
|
||||||
self.model_fn = model_fn_qwen_image
|
self.model_fn = model_fn_qwen_image
|
||||||
|
|
||||||
@@ -187,6 +217,7 @@ class QwenImagePipeline(BasePipeline):
|
|||||||
pipe.text_encoder = model_manager.fetch_model("qwen_image_text_encoder")
|
pipe.text_encoder = model_manager.fetch_model("qwen_image_text_encoder")
|
||||||
pipe.dit = model_manager.fetch_model("qwen_image_dit")
|
pipe.dit = model_manager.fetch_model("qwen_image_dit")
|
||||||
pipe.vae = model_manager.fetch_model("qwen_image_vae")
|
pipe.vae = model_manager.fetch_model("qwen_image_vae")
|
||||||
|
pipe.blockwise_controlnet = QwenImageBlockwiseMultiControlNet(model_manager.fetch_model("qwen_image_blockwise_controlnet", index="all"))
|
||||||
if tokenizer_config is not None and pipe.text_encoder is not None:
|
if tokenizer_config is not None and pipe.text_encoder is not None:
|
||||||
tokenizer_config.download_if_necessary()
|
tokenizer_config.download_if_necessary()
|
||||||
from transformers import Qwen2Tokenizer
|
from transformers import Qwen2Tokenizer
|
||||||
@@ -212,6 +243,8 @@ class QwenImagePipeline(BasePipeline):
|
|||||||
rand_device: str = "cpu",
|
rand_device: str = "cpu",
|
||||||
# Steps
|
# Steps
|
||||||
num_inference_steps: int = 30,
|
num_inference_steps: int = 30,
|
||||||
|
# Blockwise ControlNet
|
||||||
|
blockwise_controlnet_inputs: list[ControlNetInput] = None,
|
||||||
# EliGen
|
# EliGen
|
||||||
eligen_entity_prompts: list[str] = None,
|
eligen_entity_prompts: list[str] = None,
|
||||||
eligen_entity_masks: list[Image.Image] = None,
|
eligen_entity_masks: list[Image.Image] = None,
|
||||||
@@ -241,6 +274,8 @@ class QwenImagePipeline(BasePipeline):
|
|||||||
"height": height, "width": width,
|
"height": height, "width": width,
|
||||||
"seed": seed, "rand_device": rand_device,
|
"seed": seed, "rand_device": rand_device,
|
||||||
"enable_fp8_attention": enable_fp8_attention,
|
"enable_fp8_attention": enable_fp8_attention,
|
||||||
|
"num_inference_steps": num_inference_steps,
|
||||||
|
"blockwise_controlnet_inputs": blockwise_controlnet_inputs,
|
||||||
"tiled": tiled, "tile_size": tile_size, "tile_stride": tile_stride,
|
"tiled": tiled, "tile_size": tile_size, "tile_stride": tile_stride,
|
||||||
"eligen_entity_prompts": eligen_entity_prompts, "eligen_entity_masks": eligen_entity_masks, "eligen_enable_on_negative": eligen_enable_on_negative,
|
"eligen_entity_prompts": eligen_entity_prompts, "eligen_entity_masks": eligen_entity_masks, "eligen_enable_on_negative": eligen_enable_on_negative,
|
||||||
}
|
}
|
||||||
@@ -431,14 +466,62 @@ class QwenImageUnit_EntityControl(PipelineUnit):
|
|||||||
return inputs_shared, inputs_posi, inputs_nega
|
return inputs_shared, inputs_posi, inputs_nega
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class QwenImageUnit_BlockwiseControlNet(PipelineUnit):
    """Pipeline unit that VAE-encodes blockwise-ControlNet condition images
    into latents, with optional inpaint-mask handling.
    """

    def __init__(self):
        super().__init__(
            input_params=("blockwise_controlnet_inputs", "tiled", "tile_size", "tile_stride"),
            onload_model_names=("vae",)
        )

    def apply_controlnet_mask_on_latents(self, pipe, latents, mask):
        # Map mask pixels from [-1, 1] to [0, 1], collapse RGB to a single
        # channel, resize to the latent grid, invert, and append as an extra
        # latent channel carrying the inpaint region.
        mask = (pipe.preprocess_image(mask) + 1) / 2
        mask = mask.mean(dim=1, keepdim=True)
        mask = 1 - torch.nn.functional.interpolate(mask, size=latents.shape[-2:])
        return torch.concat([latents, mask], dim=1)

    def apply_controlnet_mask_on_image(self, pipe, image, mask):
        # Zero out the masked (to-be-inpainted) region of the condition image.
        mask = mask.resize(image.size)
        mask = pipe.preprocess_image(mask).mean(dim=[0, 1]).cpu()
        pixels = np.array(image)
        pixels[mask > 0] = 0
        return Image.fromarray(pixels)

    def process(self, pipe: QwenImagePipeline, blockwise_controlnet_inputs: list[ControlNetInput], tiled, tile_size, tile_stride):
        if blockwise_controlnet_inputs is None:
            return {}
        pipe.load_models_to_device(self.onload_model_names)
        conditionings = []
        for cn_input in blockwise_controlnet_inputs:
            image = cn_input.image
            if cn_input.inpaint_mask is not None:
                image = self.apply_controlnet_mask_on_image(pipe, image, cn_input.inpaint_mask)

            image = pipe.preprocess_image(image).to(device=pipe.device, dtype=pipe.torch_dtype)
            latents = pipe.vae.encode(image, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride)

            if cn_input.inpaint_mask is not None:
                latents = self.apply_controlnet_mask_on_latents(pipe, latents, cn_input.inpaint_mask)
            conditionings.append(latents)

        return {"blockwise_controlnet_conditioning": conditionings}
|
|
||||||
|
|
||||||
def model_fn_qwen_image(
|
def model_fn_qwen_image(
|
||||||
dit: QwenImageDiT = None,
|
dit: QwenImageDiT = None,
|
||||||
|
blockwise_controlnet: QwenImageBlockwiseMultiControlNet = None,
|
||||||
latents=None,
|
latents=None,
|
||||||
timestep=None,
|
timestep=None,
|
||||||
prompt_emb=None,
|
prompt_emb=None,
|
||||||
prompt_emb_mask=None,
|
prompt_emb_mask=None,
|
||||||
height=None,
|
height=None,
|
||||||
width=None,
|
width=None,
|
||||||
|
blockwise_controlnet_conditioning=None,
|
||||||
|
blockwise_controlnet_inputs=None,
|
||||||
|
progress_id=0,
|
||||||
|
num_inference_steps=1,
|
||||||
entity_prompt_emb=None,
|
entity_prompt_emb=None,
|
||||||
entity_prompt_emb_mask=None,
|
entity_prompt_emb_mask=None,
|
||||||
entity_masks=None,
|
entity_masks=None,
|
||||||
@@ -465,8 +548,12 @@ def model_fn_qwen_image(
|
|||||||
text = dit.txt_in(dit.txt_norm(prompt_emb))
|
text = dit.txt_in(dit.txt_norm(prompt_emb))
|
||||||
image_rotary_emb = dit.pos_embed(img_shapes, txt_seq_lens, device=latents.device)
|
image_rotary_emb = dit.pos_embed(img_shapes, txt_seq_lens, device=latents.device)
|
||||||
attention_mask = None
|
attention_mask = None
|
||||||
|
|
||||||
|
if blockwise_controlnet_conditioning is not None:
|
||||||
|
blockwise_controlnet_conditioning = blockwise_controlnet.preprocess(
|
||||||
|
blockwise_controlnet_inputs, blockwise_controlnet_conditioning)
|
||||||
|
|
||||||
for block in dit.transformer_blocks:
|
for block_id, block in enumerate(dit.transformer_blocks):
|
||||||
text, image = gradient_checkpoint_forward(
|
text, image = gradient_checkpoint_forward(
|
||||||
block,
|
block,
|
||||||
use_gradient_checkpointing,
|
use_gradient_checkpointing,
|
||||||
@@ -478,6 +565,12 @@ def model_fn_qwen_image(
|
|||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
enable_fp8_attention=enable_fp8_attention,
|
enable_fp8_attention=enable_fp8_attention,
|
||||||
)
|
)
|
||||||
|
if blockwise_controlnet_conditioning is not None:
|
||||||
|
image = image + blockwise_controlnet.blockwise_forward(
|
||||||
|
image=image, conditionings=blockwise_controlnet_conditioning,
|
||||||
|
controlnet_inputs=blockwise_controlnet_inputs, block_id=block_id,
|
||||||
|
progress_id=progress_id, num_inference_steps=num_inference_steps,
|
||||||
|
)
|
||||||
|
|
||||||
image = dit.norm_out(image, conditioning)
|
image = dit.norm_out(image, conditioning)
|
||||||
image = dit.proj_out(image)
|
image = dit.proj_out(image)
|
||||||
|
|||||||
@@ -46,6 +46,7 @@ image.save("image.jpg")
|
|||||||
|[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./model_inference/Qwen-Image-Distill-Full.py)|[code](./model_training/full/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_lora/Qwen-Image-Distill-Full.py)|
|
|[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./model_inference/Qwen-Image-Distill-Full.py)|[code](./model_training/full/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_lora/Qwen-Image-Distill-Full.py)|
|
||||||
|[DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA)|[code](./model_inference/Qwen-Image-Distill-LoRA.py)|-|-|-|-|
|
|[DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA)|[code](./model_inference/Qwen-Image-Distill-LoRA.py)|-|-|-|-|
|
||||||
|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./model_inference/Qwen-Image-EliGen.py)|-|-|[code](./model_training/lora/Qwen-Image-EliGen.sh)|[code](./model_training/validate_lora/Qwen-Image-EliGen.py)|
|
|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./model_inference/Qwen-Image-EliGen.py)|-|-|[code](./model_training/lora/Qwen-Image-EliGen.sh)|[code](./model_training/validate_lora/Qwen-Image-EliGen.py)|
|
||||||
|
|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny)|[code](./model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py)|-|-|-|-|
|
||||||
|
|
||||||
## Model Inference
|
## Model Inference
|
||||||
|
|
||||||
|
|||||||
@@ -46,6 +46,7 @@ image.save("image.jpg")
|
|||||||
|[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./model_inference/Qwen-Image-Distill-Full.py)|[code](./model_training/full/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_lora/Qwen-Image-Distill-Full.py)|
|
|[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./model_inference/Qwen-Image-Distill-Full.py)|[code](./model_training/full/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_lora/Qwen-Image-Distill-Full.py)|
|
||||||
|[DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA)|[code](./model_inference/Qwen-Image-Distill-LoRA.py)|-|-|-|-|
|
|[DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA)|[code](./model_inference/Qwen-Image-Distill-LoRA.py)|-|-|-|-|
|
||||||
|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./model_inference/Qwen-Image-EliGen.py)|-|-|[code](./model_training/lora/Qwen-Image-EliGen.sh)|[code](./model_training/validate_lora/Qwen-Image-EliGen.py)|
|
|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./model_inference/Qwen-Image-EliGen.py)|-|-|[code](./model_training/lora/Qwen-Image-EliGen.sh)|[code](./model_training/validate_lora/Qwen-Image-EliGen.py)|
|
||||||
|
|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny)|[code](./model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py)|-|-|-|-|
|
||||||
|
|
||||||
## 模型推理
|
## 模型推理
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,31 @@
|
|||||||
|
from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig, ControlNetInput
from PIL import Image
import torch
from modelscope import dataset_snapshot_download


# Load the Qwen-Image pipeline together with the blockwise Canny ControlNet.
pipe = QwenImagePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"),
        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"),
        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
        ModelConfig(model_id="DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny", origin_file_pattern="model.safetensors"),
    ],
    tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"),
)

# Fetch the example Canny condition image and resize it to the target size.
dataset_snapshot_download(
    dataset_id="DiffSynth-Studio/example_image_dataset",
    local_dir="./",
    allow_file_pattern="data/example_image_dataset/canny/image_1.jpg"
)
controlnet_image = Image.open("data/example_image_dataset/canny/image_1.jpg").resize((1328, 1328))

# Generate an image guided by the Canny condition.
prompt = "一只小狗,毛发光洁柔顺,眼神灵动,背景是樱花纷飞的春日庭院,唯美温馨。"
image = pipe(
    prompt, seed=0,
    blockwise_controlnet_inputs=[ControlNetInput(image=controlnet_image)]
)
image.save("image.jpg")
Reference in New Issue
Block a user