From 07d70a6a56d6668d340f219ab12afcf78c0b7cbe Mon Sep 17 00:00:00 2001
From: Artiprocher <wangye87v5@hotmail.com>
Date: Tue, 22 Oct 2024 18:52:24 +0800
Subject: [PATCH] support flux-controlnet

---
 diffsynth/configs/model_config.py           |   5 +
 diffsynth/controlnets/__init__.py           |   2 +-
 diffsynth/controlnets/controlnet_unit.py    |  29 ++-
 diffsynth/controlnets/processors.py         |  43 ++--
 diffsynth/models/flux_controlnet.py         | 226 ++++++++++++++++++++
 diffsynth/models/model_manager.py           |  45 +---
 diffsynth/models/utils.py                   |  38 ++++
 diffsynth/pipelines/flux_image.py           | 166 ++++++++++++--
 examples/image_synthesis/flux_controlnet.py |  44 ++++
 9 files changed, 522 insertions(+), 76 deletions(-)
 create mode 100644 diffsynth/models/flux_controlnet.py
 create mode 100644 examples/image_synthesis/flux_controlnet.py

diff --git a/diffsynth/configs/model_config.py b/diffsynth/configs/model_config.py
index 27223e9..963af72 100644
--- a/diffsynth/configs/model_config.py
+++ b/diffsynth/configs/model_config.py
@@ -35,6 +35,7 @@ from ..models.hunyuan_dit import HunyuanDiT
 from ..models.flux_dit import FluxDiT
 from ..models.flux_text_encoder import FluxTextEncoder1, FluxTextEncoder2
 from ..models.flux_vae import FluxVAEEncoder, FluxVAEDecoder
+from ..models.flux_controlnet import FluxControlNet
 
 from ..models.cog_vae import CogVAEEncoder, CogVAEDecoder
 from ..models.cog_dit import CogDiT
@@ -80,6 +81,10 @@ model_loader_configs = [
     (None, "280189ee084bca10f70907bf6ce1649d", ["cog_vae_encoder", "cog_vae_decoder"], [CogVAEEncoder, CogVAEDecoder], "diffusers"),
     (None, "9b9313d104ac4df27991352fec013fd4", ["rife"], [IFNet], "civitai"),
     (None, "6b7116078c4170bfbeaedc8fe71f6649", ["esrgan"], [RRDBNet], "civitai"),
+    (None, "78d18b9101345ff695f312e7e62538c0", ["flux_controlnet"], [FluxControlNet], "diffusers"),
+    (None, "b001c89139b5f053c715fe772362dd2a", ["flux_controlnet"], [FluxControlNet], "diffusers"),
+    (None, "52357cb26250681367488a8954c271e8", ["flux_controlnet"], [FluxControlNet], "diffusers"),
+    (None, "0cfd1740758423a2a854d67c136d1e8c", ["flux_controlnet"], [FluxControlNet], "diffusers"),
 ]
 huggingface_model_loader_configs = [
     # These configs are provided for detecting model type automatically.
diff --git a/diffsynth/controlnets/__init__.py b/diffsynth/controlnets/__init__.py
index b08ba4c..a3e15ad 100644
--- a/diffsynth/controlnets/__init__.py
+++ b/diffsynth/controlnets/__init__.py
@@ -1,2 +1,2 @@
-from .controlnet_unit import ControlNetConfigUnit, ControlNetUnit, MultiControlNetManager
+from .controlnet_unit import ControlNetConfigUnit, ControlNetUnit, MultiControlNetManager, FluxMultiControlNetManager
 from .processors import Annotator
diff --git a/diffsynth/controlnets/controlnet_unit.py b/diffsynth/controlnets/controlnet_unit.py
index f03fec5..fba09b6 100644
--- a/diffsynth/controlnets/controlnet_unit.py
+++ b/diffsynth/controlnets/controlnet_unit.py
@@ -4,10 +4,11 @@ from .processors import Processor_id
 
 
 class ControlNetConfigUnit:
-    def __init__(self, processor_id: Processor_id, model_path, scale=1.0):
+    def __init__(self, processor_id: Processor_id, model_path, scale=1.0, skip_processor=False):
         self.processor_id = processor_id
         self.model_path = model_path
         self.scale = scale
+        self.skip_processor = skip_processor
 
 
 class ControlNetUnit:
@@ -60,3 +61,29 @@ class MultiControlNetManager:
             else:
                 res_stack = [i + j for i, j in zip(res_stack, res_stack_)]
         return res_stack
+
+
+class FluxMultiControlNetManager(MultiControlNetManager):  # manager for ControlNets whose models return (res_stack, single_res_stack) pairs
+    def __init__(self, controlnet_units=None):  # None (instead of a shared []) means "no units"
+        super().__init__(controlnet_units=[] if controlnet_units is None else controlnet_units)
+
+    def process_image(self, image, processor_id=None):  # processor_id is an index into self.processors here
+        if processor_id is None:
+            processed_image = [processor(image) for processor in self.processors]  # run every annotator on the same image
+        else:
+            processed_image = [self.processors[processor_id](image)]  # single annotator, still returned as a list
+        return processed_image
+
+    def __call__(self, conditionings, **kwargs):  # conditionings: one entry per unit; kwargs are forwarded to every model
+        res_stack, single_res_stack = None, None  # both stay None if the loop body never runs
+        for processor, conditioning, model, scale in zip(self.processors, conditionings, self.models, self.scales):
+            res_stack_, single_res_stack_ = model(controlnet_conditioning=conditioning, processor_id=processor.processor_id, **kwargs)
+            res_stack_ = [res * scale for res in res_stack_]  # weight this unit's residuals by its scale
+            single_res_stack_ = [res * scale for res in single_res_stack_]
+            if res_stack is None:
+                res_stack = res_stack_
+                single_res_stack = single_res_stack_
+            else:
+                res_stack = [i + j for i, j in zip(res_stack, res_stack_)]  # element-wise sum across units
+                single_res_stack = [i + j for i, j in zip(single_res_stack, single_res_stack_)]
+        return res_stack, single_res_stack
diff --git a/diffsynth/controlnets/processors.py b/diffsynth/controlnets/processors.py
index 1d23c73..71e47da 100644
--- a/diffsynth/controlnets/processors.py
+++ b/diffsynth/controlnets/processors.py
@@ -3,37 +3,42 @@ import warnings
 with warnings.catch_warnings():
     warnings.simplefilter("ignore")
     from controlnet_aux.processor import (
-        CannyDetector, MidasDetector, HEDdetector, LineartDetector, LineartAnimeDetector, OpenposeDetector
+        CannyDetector, MidasDetector, HEDdetector, LineartDetector, LineartAnimeDetector, OpenposeDetector, NormalBaeDetector
     )
 
 
 Processor_id: TypeAlias = Literal[
-    "canny", "depth", "softedge", "lineart", "lineart_anime", "openpose", "tile"
+    "canny", "depth", "softedge", "lineart", "lineart_anime", "openpose", "normal", "tile", "none", "inpaint"
 ]
 
 class Annotator:
-    def __init__(self, processor_id: Processor_id, model_path="models/Annotators", detect_resolution=None, device='cuda'):
-        if processor_id == "canny":
-            self.processor = CannyDetector()
-        elif processor_id == "depth":
-            self.processor = MidasDetector.from_pretrained(model_path).to(device)
-        elif processor_id == "softedge":
-            self.processor = HEDdetector.from_pretrained(model_path).to(device)
-        elif processor_id == "lineart":
-            self.processor = LineartDetector.from_pretrained(model_path).to(device)
-        elif processor_id == "lineart_anime":
-            self.processor = LineartAnimeDetector.from_pretrained(model_path).to(device)
-        elif processor_id == "openpose":
-            self.processor = OpenposeDetector.from_pretrained(model_path).to(device)
-        elif processor_id == "tile":
-            self.processor = None
+    def __init__(self, processor_id: Processor_id, model_path="models/Annotators", detect_resolution=None, device='cuda', skip_processor=False):
+        if not skip_processor:
+            if processor_id == "canny":
+                self.processor = CannyDetector()
+            elif processor_id == "depth":
+                self.processor = MidasDetector.from_pretrained(model_path).to(device)
+            elif processor_id == "softedge":
+                self.processor = HEDdetector.from_pretrained(model_path).to(device)
+            elif processor_id == "lineart":
+                self.processor = LineartDetector.from_pretrained(model_path).to(device)
+            elif processor_id == "lineart_anime":
+                self.processor = LineartAnimeDetector.from_pretrained(model_path).to(device)
+            elif processor_id == "openpose":
+                self.processor = OpenposeDetector.from_pretrained(model_path).to(device)
+            elif processor_id == "normal":
+                self.processor = NormalBaeDetector.from_pretrained(model_path).to(device)
+            elif processor_id == "tile" or processor_id == "none" or processor_id == "inpaint":
+                self.processor = None
+            else:
+                raise ValueError(f"Unsupported processor_id: {processor_id}")
         else:
-            raise ValueError(f"Unsupported processor_id: {processor_id}")
+            self.processor = None
 
         self.processor_id = processor_id
         self.detect_resolution = detect_resolution
 
-    def __call__(self, image):
+    def __call__(self, image, mask=None):
         width, height = image.size
         if self.processor_id == "openpose":
             kwargs = {
diff --git a/diffsynth/models/flux_controlnet.py b/diffsynth/models/flux_controlnet.py
new file mode 100644
index 0000000..d6053b1
--- /dev/null
+++ b/diffsynth/models/flux_controlnet.py
@@ -0,0 +1,226 @@
+import torch
+from einops import rearrange, repeat
+from .flux_dit import RoPEEmbedding, TimestepEmbeddings, FluxJointTransformerBlock, FluxSingleTransformerBlock
+from .utils import hash_state_dict_keys
+
+
+
+class FluxControlNet(torch.nn.Module):  # maps latents plus a conditioning tensor to per-block residual stacks
+    def __init__(self, disable_guidance_embedder=False, num_joint_blocks=5, num_single_blocks=10, num_mode=0, mode_dict=None, additional_input_dim=0):
+        super().__init__()
+        self.pos_embedder = RoPEEmbedding(3072, 10000, [16, 56, 56])
+        self.time_embedder = TimestepEmbeddings(256, 3072)
+        self.guidance_embedder = None if disable_guidance_embedder else TimestepEmbeddings(256, 3072)
+        self.pooled_text_embedder = torch.nn.Sequential(torch.nn.Linear(768, 3072), torch.nn.SiLU(), torch.nn.Linear(3072, 3072))
+        self.context_embedder = torch.nn.Linear(4096, 3072)
+        self.x_embedder = torch.nn.Linear(64, 3072)
+
+        self.blocks = torch.nn.ModuleList([FluxJointTransformerBlock(3072, 24) for _ in range(num_joint_blocks)])
+        self.single_blocks = torch.nn.ModuleList([FluxSingleTransformerBlock(3072, 24) for _ in range(num_single_blocks)])
+
+        self.controlnet_blocks = torch.nn.ModuleList([torch.nn.Linear(3072, 3072) for _ in range(num_joint_blocks)])  # one output projection per joint block
+        self.controlnet_single_blocks = torch.nn.ModuleList([torch.nn.Linear(3072, 3072) for _ in range(num_single_blocks)])  # one output projection per single block
+        
+        self.mode_dict = {} if mode_dict is None else mode_dict  # None default avoids sharing one dict object between instances
+        self.controlnet_mode_embedder = torch.nn.Embedding(num_mode, 3072) if len(self.mode_dict) > 0 else None  # only built when modes are defined
+        self.controlnet_x_embedder = torch.nn.Linear(64 + additional_input_dim, 3072)  # additional_input_dim widens the patchified conditioning input
+
+
+    def prepare_image_ids(self, latents):  # builds one 3-component id per 2x2 latent patch
+        batch_size, _, height, width = latents.shape
+        latent_image_ids = torch.zeros(height // 2, width // 2, 3)  # component 0 is left at zero
+        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]  # component 1: row index
+        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]  # component 2: column index
+
+        latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
+
+        latent_image_ids = latent_image_ids[None, :].repeat(batch_size, 1, 1, 1)  # same ids for every batch element
+        latent_image_ids = latent_image_ids.reshape(
+            batch_size, latent_image_id_height * latent_image_id_width, latent_image_id_channels
+        )
+        latent_image_ids = latent_image_ids.to(device=latents.device, dtype=latents.dtype)
+
+        return latent_image_ids
+    
+
+    def patchify(self, hidden_states):  # folds each 2x2 spatial patch into the last axis: (B, C, H, W) -> (B, H/2 * W/2, C*4)
+        hidden_states = rearrange(hidden_states, "B C (H P) (W Q) -> B (H W) (C P Q)", P=2, Q=2)
+        return hidden_states
+    
+
+    def align_res_stack_to_original_blocks(self, res_stack, num_blocks, hidden_states):  # stretches res_stack to num_blocks entries
+        if len(res_stack) == 0:
+            return [torch.zeros_like(hidden_states)] * num_blocks  # the same zero tensor object repeated num_blocks times
+        interval = (num_blocks + len(res_stack) - 1) // len(res_stack)  # ceil(num_blocks / len(res_stack))
+        aligned_res_stack = [res_stack[block_id // interval] for block_id in range(num_blocks)]  # each residual serves `interval` consecutive blocks
+        return aligned_res_stack
+
+
+    def forward(
+        self,
+        hidden_states,
+        controlnet_conditioning,
+        timestep, prompt_emb, pooled_prompt_emb, guidance, text_ids, image_ids=None,
+        processor_id=None,
+        tiled=False, tile_size=128, tile_stride=64,
+        **kwargs
+    ):
+        if image_ids is None:
+            image_ids = self.prepare_image_ids(hidden_states)
+
+        conditioning = self.time_embedder(timestep, hidden_states.dtype) + self.pooled_text_embedder(pooled_prompt_emb)
+        if self.guidance_embedder is not None:
+            guidance = guidance * 1000
+            conditioning = conditioning + self.guidance_embedder(guidance, hidden_states.dtype)
+        prompt_emb = self.context_embedder(prompt_emb)
+        if self.controlnet_mode_embedder is not None: # Different from FluxDiT
+            processor_id = torch.tensor([self.mode_dict[processor_id]], dtype=torch.int)  # raises KeyError for a processor_id missing from mode_dict
+            processor_id = repeat(processor_id, "D -> B D", B=prompt_emb.shape[0]).to(text_ids.device)  # one mode id per batch element so the concat below works for any batch size
+            prompt_emb = torch.concat([self.controlnet_mode_embedder(processor_id), prompt_emb], dim=1)  # mode embedding becomes an extra leading prompt token
+            text_ids = torch.cat([text_ids[:, :1], text_ids], dim=1)  # duplicate the first text id for that extra token
+        image_rotary_emb = self.pos_embedder(torch.cat((text_ids, image_ids), dim=1))
+
+        hidden_states = self.patchify(hidden_states)
+        hidden_states = self.x_embedder(hidden_states)
+        controlnet_conditioning = self.patchify(controlnet_conditioning) # Different from FluxDiT
+        hidden_states = hidden_states + self.controlnet_x_embedder(controlnet_conditioning) # Different from FluxDiT
+
+        controlnet_res_stack = []
+        for block, controlnet_block in zip(self.blocks, self.controlnet_blocks):
+            hidden_states, prompt_emb = block(hidden_states, prompt_emb, conditioning, image_rotary_emb)
+            controlnet_res_stack.append(controlnet_block(hidden_states))  # projected image tokens after each joint block
+
+        controlnet_single_res_stack = []
+        hidden_states = torch.cat([prompt_emb, hidden_states], dim=1)  # prompt tokens first, image tokens after
+        for block, controlnet_block in zip(self.single_blocks, self.controlnet_single_blocks):
+            hidden_states, prompt_emb = block(hidden_states, prompt_emb, conditioning, image_rotary_emb)
+            controlnet_single_res_stack.append(controlnet_block(hidden_states[:, prompt_emb.shape[1]:]))  # slice off the prompt tokens before projecting
+
+        controlnet_res_stack = self.align_res_stack_to_original_blocks(controlnet_res_stack, 19, hidden_states[:, prompt_emb.shape[1]:])  # 19: presumably the joint-block count of the FluxDiT being controlled - confirm
+        controlnet_single_res_stack = self.align_res_stack_to_original_blocks(controlnet_single_res_stack, 38, hidden_states[:, prompt_emb.shape[1]:])  # 38: presumably its single-block count - confirm
+
+        return controlnet_res_stack, controlnet_single_res_stack
+
+
+    @staticmethod
+    def state_dict_converter():
+        return FluxControlNetStateDictConverter()
+    
+
+
+class FluxControlNetStateDictConverter:  # renames and fuses checkpoint keys to match FluxControlNet's parameter names
+    def __init__(self):
+        pass
+
+    def from_diffusers(self, state_dict):  # returns (converted state dict, extra constructor kwargs)
+        hash_value = hash_state_dict_keys(state_dict)  # digest of the incoming key layout; selects extra_kwargs at the end
+        global_rename_dict = {  # exact-prefix renames for top-level modules
+            "context_embedder": "context_embedder",
+            "x_embedder": "x_embedder",
+            "time_text_embed.timestep_embedder.linear_1": "time_embedder.timestep_embedder.0",
+            "time_text_embed.timestep_embedder.linear_2": "time_embedder.timestep_embedder.2",
+            "time_text_embed.guidance_embedder.linear_1": "guidance_embedder.timestep_embedder.0",
+            "time_text_embed.guidance_embedder.linear_2": "guidance_embedder.timestep_embedder.2",
+            "time_text_embed.text_embedder.linear_1": "pooled_text_embedder.0",
+            "time_text_embed.text_embedder.linear_2": "pooled_text_embedder.2",
+            "norm_out.linear": "final_norm_out.linear",
+            "proj_out": "final_proj_out",
+        }
+        rename_dict = {  # renames applied to the part after "transformer_blocks.<i>."
+            "proj_out": "proj_out",
+            "norm1.linear": "norm1_a.linear",
+            "norm1_context.linear": "norm1_b.linear",
+            "attn.to_q": "attn.a_to_q",
+            "attn.to_k": "attn.a_to_k",
+            "attn.to_v": "attn.a_to_v",
+            "attn.to_out.0": "attn.a_to_out",
+            "attn.add_q_proj": "attn.b_to_q",
+            "attn.add_k_proj": "attn.b_to_k",
+            "attn.add_v_proj": "attn.b_to_v",
+            "attn.to_add_out": "attn.b_to_out",
+            "ff.net.0.proj": "ff_a.0",
+            "ff.net.2": "ff_a.2",
+            "ff_context.net.0.proj": "ff_b.0",
+            "ff_context.net.2": "ff_b.2",
+            "attn.norm_q": "attn.norm_q_a",
+            "attn.norm_k": "attn.norm_k_a",
+            "attn.norm_added_q": "attn.norm_q_b",
+            "attn.norm_added_k": "attn.norm_k_b",
+        }
+        rename_dict_single = {  # renames applied to the part after "single_transformer_blocks.<i>."
+            "attn.to_q": "a_to_q",
+            "attn.to_k": "a_to_k",
+            "attn.to_v": "a_to_v",
+            "attn.norm_q": "norm_q_a",
+            "attn.norm_k": "norm_k_a",
+            "norm.linear": "norm.linear",
+            "proj_mlp": "proj_in_besides_attn",
+            "proj_out": "proj_out",
+        }
+        state_dict_ = {}
+        for name, param in state_dict.items():
+            if name.endswith(".weight") or name.endswith(".bias"):  # keys with any other ending are dropped
+                suffix = ".weight" if name.endswith(".weight") else ".bias"
+                prefix = name[:-len(suffix)]
+                if prefix in global_rename_dict:
+                    state_dict_[global_rename_dict[prefix] + suffix] = param
+                elif prefix.startswith("transformer_blocks."):
+                    names = prefix.split(".")
+                    names[0] = "blocks"
+                    middle = ".".join(names[2:])
+                    if middle in rename_dict:  # NOTE(review): unmatched joint-block keys are dropped, while the single-block branch below keeps them - confirm intended
+                        name_ = ".".join(names[:2] + [rename_dict[middle]] + [suffix[1:]])
+                        state_dict_[name_] = param
+                elif prefix.startswith("single_transformer_blocks."):
+                    names = prefix.split(".")
+                    names[0] = "single_blocks"
+                    middle = ".".join(names[2:])
+                    if middle in rename_dict_single:
+                        name_ = ".".join(names[:2] + [rename_dict_single[middle]] + [suffix[1:]])
+                        state_dict_[name_] = param
+                    else:
+                        state_dict_[name] = param  # kept under its original (unrenamed) key
+                else:
+                    state_dict_[name] = param  # anything else (e.g. controlnet_* modules) passes through unchanged
+        for name in list(state_dict_.keys()):  # iterate over a snapshot because entries are added and popped below
+            if ".proj_in_besides_attn." in name:
+                name_ = name.replace(".proj_in_besides_attn.", ".to_qkv_mlp.")
+                param = torch.concat([
+                    state_dict_[name.replace(".proj_in_besides_attn.", f".a_to_q.")],
+                    state_dict_[name.replace(".proj_in_besides_attn.", f".a_to_k.")],
+                    state_dict_[name.replace(".proj_in_besides_attn.", f".a_to_v.")],
+                    state_dict_[name],
+                ], dim=0)  # q, k, v and proj_in_besides_attn stacked along dim 0, in that order
+                state_dict_[name_] = param
+                state_dict_.pop(name.replace(".proj_in_besides_attn.", f".a_to_q."))
+                state_dict_.pop(name.replace(".proj_in_besides_attn.", f".a_to_k."))
+                state_dict_.pop(name.replace(".proj_in_besides_attn.", f".a_to_v."))
+                state_dict_.pop(name)
+        for name in list(state_dict_.keys()):  # entries fused above were already popped, so they are not seen again here
+            for component in ["a", "b"]:
+                if f".{component}_to_q." in name:
+                    name_ = name.replace(f".{component}_to_q.", f".{component}_to_qkv.")
+                    param = torch.concat([
+                        state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_q.")],  # replace is a no-op: this is the q entry itself
+                        state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_k.")],
+                        state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_v.")],
+                    ], dim=0)  # q, k, v stacked along dim 0 into one *_to_qkv entry
+                    state_dict_[name_] = param
+                    state_dict_.pop(name.replace(f".{component}_to_q.", f".{component}_to_q."))
+                    state_dict_.pop(name.replace(f".{component}_to_q.", f".{component}_to_k."))
+                    state_dict_.pop(name.replace(f".{component}_to_q.", f".{component}_to_v."))
+        if hash_value == "78d18b9101345ff695f312e7e62538c0":  # FluxControlNet constructor kwargs chosen by key-layout digest
+            extra_kwargs = {"num_mode": 10, "mode_dict": {"canny": 0, "tile": 1, "depth": 2, "blur": 3, "pose": 4, "gray": 5, "lq": 6}}
+        elif hash_value == "b001c89139b5f053c715fe772362dd2a":
+            extra_kwargs = {"num_single_blocks": 0}
+        elif hash_value == "52357cb26250681367488a8954c271e8":
+            extra_kwargs = {"num_joint_blocks": 6, "num_single_blocks": 0, "additional_input_dim": 4}
+        elif hash_value == "0cfd1740758423a2a854d67c136d1e8c":
+            extra_kwargs = {"num_joint_blocks": 4, "num_single_blocks": 1}
+        else:
+            extra_kwargs = {}  # unknown layout: fall back to the constructor defaults
+        return state_dict_, extra_kwargs
+    
+
+    def from_civitai(self, state_dict):  # same conversion as from_diffusers
+        return self.from_diffusers(state_dict)
diff --git a/diffsynth/models/model_manager.py b/diffsynth/models/model_manager.py
index 9a5c4f1..c68bdde 100644
--- a/diffsynth/models/model_manager.py
+++ b/diffsynth/models/model_manager.py
@@ -1,7 +1,4 @@
-import os, torch, hashlib, json, importlib
-from safetensors import safe_open
-from torch import Tensor
-from typing_extensions import Literal, TypeAlias
+import os, torch, json, importlib
 from typing import List
 
 from .downloader import download_models, download_customized_models, Preset_model_id, Preset_model_website
@@ -50,45 +47,7 @@ from ..extensions.RIFE import IFNet
 from ..extensions.ESRGAN import RRDBNet
 
 from ..configs.model_config import model_loader_configs, huggingface_model_loader_configs, patch_model_loader_configs
-from .utils import load_state_dict, init_weights_on_device
-
-
-
-def convert_state_dict_keys_to_single_str(state_dict, with_shape=True):
-    keys = []
-    for key, value in state_dict.items():
-        if isinstance(key, str):
-            if isinstance(value, Tensor):
-                if with_shape:
-                    shape = "_".join(map(str, list(value.shape)))
-                    keys.append(key + ":" + shape)
-                keys.append(key)
-            elif isinstance(value, dict):
-                keys.append(key + "|" + convert_state_dict_keys_to_single_str(value, with_shape=with_shape))
-    keys.sort()
-    keys_str = ",".join(keys)
-    return keys_str
-
-
-def split_state_dict_with_prefix(state_dict):
-    keys = sorted([key for key in state_dict if isinstance(key, str)])
-    prefix_dict = {}
-    for key in  keys:
-        prefix = key if "." not in key else key.split(".")[0]
-        if prefix not in prefix_dict:
-            prefix_dict[prefix] = []
-        prefix_dict[prefix].append(key)
-    state_dicts = []
-    for prefix, keys in prefix_dict.items():
-        sub_state_dict = {key: state_dict[key] for key in keys}
-        state_dicts.append(sub_state_dict)
-    return state_dicts
-
-
-def hash_state_dict_keys(state_dict, with_shape=True):
-    keys_str = convert_state_dict_keys_to_single_str(state_dict, with_shape=with_shape)
-    keys_str = keys_str.encode(encoding="UTF-8")
-    return hashlib.md5(keys_str).hexdigest()
+from .utils import load_state_dict, init_weights_on_device, hash_state_dict_keys, split_state_dict_with_prefix
 
 
 def load_model_from_single_file(state_dict, model_names, model_classes, model_resource, torch_dtype, device):
diff --git a/diffsynth/models/utils.py b/diffsynth/models/utils.py
index bd579e4..e18e2dd 100644
--- a/diffsynth/models/utils.py
+++ b/diffsynth/models/utils.py
@@ -1,6 +1,7 @@
 import torch, os
 from safetensors import safe_open
 from contextlib import contextmanager
+import hashlib
 
 @contextmanager
 def init_weights_on_device(device = torch.device("meta"), include_buffers :bool = False):
@@ -142,3 +143,40 @@ def search_for_files(folder, extensions):
                 files.append(folder)
                 break
     return files
+
+
+def convert_state_dict_keys_to_single_str(state_dict, with_shape=True):  # flattens a (possibly nested) state dict's key layout into one string
+    keys = []
+    for key, value in state_dict.items():
+        if isinstance(key, str):  # entries with non-string keys are ignored
+            if isinstance(value, torch.Tensor):
+                if with_shape:
+                    shape = "_".join(map(str, list(value.shape)))
+                    keys.append(key + ":" + shape)
+                keys.append(key)  # appended regardless of with_shape, so a tensor contributes two entries when with_shape is True
+            elif isinstance(value, dict):
+                keys.append(key + "|" + convert_state_dict_keys_to_single_str(value, with_shape=with_shape))  # recurse into nested dicts
+    keys.sort()  # makes the result independent of insertion order
+    keys_str = ",".join(keys)
+    return keys_str
+
+
+def split_state_dict_with_prefix(state_dict):  # splits a state dict into sub-dicts grouped by the key text before the first "."
+    keys = sorted([key for key in state_dict if isinstance(key, str)])  # non-string keys are left out
+    prefix_dict = {}
+    for key in  keys:
+        prefix = key if "." not in key else key.split(".")[0]  # a key without "." is its own prefix
+        if prefix not in prefix_dict:
+            prefix_dict[prefix] = []
+        prefix_dict[prefix].append(key)
+    state_dicts = []
+    for prefix, keys in prefix_dict.items():
+        sub_state_dict = {key: state_dict[key] for key in keys}
+        state_dicts.append(sub_state_dict)
+    return state_dicts  # list of dicts, one per prefix
+
+
+def hash_state_dict_keys(state_dict, with_shape=True):  # MD5 hex digest of the state dict's key layout (not of the tensor values)
+    keys_str = convert_state_dict_keys_to_single_str(state_dict, with_shape=with_shape)
+    keys_str = keys_str.encode(encoding="UTF-8")
+    return hashlib.md5(keys_str).hexdigest()
\ No newline at end of file
diff --git a/diffsynth/pipelines/flux_image.py b/diffsynth/pipelines/flux_image.py
index 06f5649..176651e 100644
--- a/diffsynth/pipelines/flux_image.py
+++ b/diffsynth/pipelines/flux_image.py
@@ -1,9 +1,13 @@
 from ..models import ModelManager, FluxDiT, FluxTextEncoder1, FluxTextEncoder2, FluxVAEDecoder, FluxVAEEncoder
+from ..controlnets import FluxMultiControlNetManager, ControlNetUnit, ControlNetConfigUnit, Annotator
 from ..prompters import FluxPrompter
 from ..schedulers import FlowMatchScheduler
 from .base import BasePipeline
+from typing import List
 import torch
 from tqdm import tqdm
+import numpy as np
+from PIL import Image
 
 
 
@@ -19,14 +23,15 @@ class FluxImagePipeline(BasePipeline):
         self.dit: FluxDiT = None
         self.vae_decoder: FluxVAEDecoder = None
         self.vae_encoder: FluxVAEEncoder = None
-        self.model_names = ['text_encoder_1', 'text_encoder_2', 'dit', 'vae_decoder', 'vae_encoder']
+        self.controlnet: FluxMultiControlNetManager = None
+        self.model_names = ['text_encoder_1', 'text_encoder_2', 'dit', 'vae_decoder', 'vae_encoder', 'controlnet']
 
 
     def denoising_model(self):
         return self.dit
 
 
-    def fetch_models(self, model_manager: ModelManager, prompt_refiner_classes=[], prompt_extender_classes=[]):
+    def fetch_models(self, model_manager: ModelManager, controlnet_config_units: List[ControlNetConfigUnit]=[], prompt_refiner_classes=[], prompt_extender_classes=[]):
         self.text_encoder_1 = model_manager.fetch_model("flux_text_encoder_1")
         self.text_encoder_2 = model_manager.fetch_model("flux_text_encoder_2")
         self.dit = model_manager.fetch_model("flux_dit")
@@ -36,14 +41,25 @@ class FluxImagePipeline(BasePipeline):
         self.prompter.load_prompt_refiners(model_manager, prompt_refiner_classes)
         self.prompter.load_prompt_extenders(model_manager, prompt_extender_classes)
 
+        # ControlNets
+        controlnet_units = []
+        for config in controlnet_config_units:
+            controlnet_unit = ControlNetUnit(
+                Annotator(config.processor_id, device=self.device, skip_processor=config.skip_processor),
+                model_manager.fetch_model("flux_controlnet", config.model_path),
+                config.scale
+            )
+            controlnet_units.append(controlnet_unit)
+        self.controlnet = FluxMultiControlNetManager(controlnet_units)
+
 
     @staticmethod
-    def from_model_manager(model_manager: ModelManager, prompt_refiner_classes=[], prompt_extender_classes=[], device=None):
+    def from_model_manager(model_manager: ModelManager, controlnet_config_units: List[ControlNetConfigUnit]=[], prompt_refiner_classes=[], prompt_extender_classes=[], device=None):
         pipe = FluxImagePipeline(
             device=model_manager.device if device is None else device,
             torch_dtype=model_manager.torch_dtype,
         )
-        pipe.fetch_models(model_manager, prompt_refiner_classes,prompt_extender_classes)
+        pipe.fetch_models(model_manager, controlnet_config_units, prompt_refiner_classes, prompt_extender_classes)
         return pipe
     
 
@@ -71,17 +87,61 @@ class FluxImagePipeline(BasePipeline):
         return {"image_ids": latent_image_ids, "guidance": guidance}
     
 
+    def apply_controlnet_mask_on_latents(self, latents, mask):  # appends the inverted, resized mask to latents as one extra channel
+        mask = (self.preprocess_image(mask) + 1) / 2  # presumably maps preprocess_image's output from [-1, 1] to [0, 1] - confirm
+        mask = mask.mean(dim=1, keepdim=True)  # collapse dim 1 to a single channel
+        mask = mask.to(dtype=self.torch_dtype, device=self.device)
+        mask = 1 - torch.nn.functional.interpolate(mask, size=latents.shape[-2:])  # resize to the latent grid, then invert
+        latents = torch.concat([latents, mask], dim=1)
+        return latents
+    
+
+    def apply_controlnet_mask_on_image(self, image, mask):  # returns a copy of image with the masked pixels set to 0
+        mask = mask.resize(image.size)  # bring the mask to the image's size
+        mask = self.preprocess_image(mask).mean(dim=[0, 1])  # average over the first two axes (presumably batch and channel - confirm)
+        image = np.array(image)
+        image[mask > 0] = 0  # zero every pixel whose averaged mask value is positive
+        image = Image.fromarray(image)
+        return image
+    
+
+    def prepare_controlnet_input(self, controlnet_image, controlnet_inpaint_mask, tiler_kwargs):  # returns one encoded conditioning per ControlNet processor
+        if isinstance(controlnet_image, Image.Image):
+            controlnet_image = [controlnet_image] * len(self.controlnet.processors)  # a single image is reused for every processor
+
+        controlnet_frames = []
+        for i in range(len(self.controlnet.processors)):
+            # image annotator
+            image = self.controlnet.process_image(controlnet_image[i], processor_id=i)[0]  # processor_id=i selects the i-th annotator by index
+            if controlnet_inpaint_mask is not None and self.controlnet.processors[i].processor_id == "inpaint":
+                image = self.apply_controlnet_mask_on_image(image, controlnet_inpaint_mask)  # blank the masked pixels before encoding
+
+            # image to tensor
+            image = self.preprocess_image(image).to(device=self.device, dtype=self.torch_dtype)
+
+            # vae encoder
+            image = self.encode_image(image, **tiler_kwargs)
+            if controlnet_inpaint_mask is not None and self.controlnet.processors[i].processor_id == "inpaint":
+                image = self.apply_controlnet_mask_on_latents(image, controlnet_inpaint_mask)  # adds the mask as an extra channel
+            
+            # store it
+            controlnet_frames.append(image)
+        return controlnet_frames
+    
+
     @torch.no_grad()
     def __call__(
         self,
         prompt,
-        local_prompts= None,
-        masks= None,        
-        mask_scales= None,
+        local_prompts=None,
+        masks=None,        
+        mask_scales=None,
         negative_prompt="",
         cfg_scale=1.0,
         embedded_guidance=3.5,
         input_image=None,
+        controlnet_image=None,
+        controlnet_inpaint_mask=None,
         denoising_strength=1.0,
         height=1024,
         width=1024,
@@ -123,19 +183,29 @@ class FluxImagePipeline(BasePipeline):
         # Extra input
         extra_input = self.prepare_extra_input(latents, guidance=embedded_guidance)
 
+        # Prepare ControlNets
+        if controlnet_image is not None:
+            controlnet_kwargs = {"controlnet_frames": self.prepare_controlnet_input(controlnet_image, controlnet_inpaint_mask, tiler_kwargs)}
+        else:
+            controlnet_kwargs = {"controlnet_frames": None}
+
         # Denoise
-        self.load_models_to_device(['dit'])
+        self.load_models_to_device(['dit', 'controlnet'])
         for progress_id, timestep in enumerate(progress_bar_cmd(self.scheduler.timesteps)):
             timestep = timestep.unsqueeze(0).to(self.device)
 
             # Classifier-free guidance
-            inference_callback = lambda prompt_emb_posi: self.dit(
-                latents, timestep=timestep, **prompt_emb_posi, **tiler_kwargs, **extra_input
+            inference_callback = lambda prompt_emb_posi: lets_dance_flux(
+                dit=self.dit, controlnet=self.controlnet,
+                hidden_states=latents, timestep=timestep,
+                **prompt_emb_posi, **tiler_kwargs, **extra_input, **controlnet_kwargs
             )
             noise_pred_posi = self.control_noise_via_local_prompts(prompt_emb_posi, prompt_emb_locals, masks, mask_scales, inference_callback)
             if cfg_scale != 1.0:
-                noise_pred_nega = self.dit(
-                    latents, timestep=timestep, **prompt_emb_nega, **tiler_kwargs, **extra_input
+                noise_pred_nega = lets_dance_flux(
+                    dit=self.dit, controlnet=self.controlnet,
+                    hidden_states=latents, timestep=timestep,
+                    **prompt_emb_nega, **tiler_kwargs, **extra_input, **controlnet_kwargs
                 )
                 noise_pred = noise_pred_nega + cfg_scale * (noise_pred_posi - noise_pred_nega)
             else:
@@ -155,3 +225,75 @@ class FluxImagePipeline(BasePipeline):
         # Offload all models
         self.load_models_to_device([])
         return image
+
+
+
+def lets_dance_flux(
+    dit: FluxDiT,
+    controlnet: FluxMultiControlNetManager = None,  # optional; only used when controlnet_frames is also given
+    hidden_states=None,
+    timestep=None,
+    prompt_emb=None,
+    pooled_prompt_emb=None,
+    guidance=None,
+    text_ids=None,
+    image_ids=None,  # filled in from dit.prepare_image_ids(hidden_states) when None
+    controlnet_frames=None,
+    tiled=False,  # NOTE(review): tiled / tile_size / tile_stride are only forwarded to the controlnet call below
+    tile_size=128,
+    tile_stride=64,
+    **kwargs  # accepted but never read in this function
+):
+    # Forward pass through `dit` using its submodules directly, adding ControlNet residuals after each block when given.
+    if controlnet is not None and controlnet_frames is not None:
+        controlnet_extra_kwargs = {
+            "hidden_states": hidden_states,
+            "timestep": timestep,
+            "prompt_emb": prompt_emb,
+            "pooled_prompt_emb": pooled_prompt_emb,
+            "guidance": guidance,  # passed before the `* 1000` scaling applied further down
+            "text_ids": text_ids,
+            "image_ids": image_ids,  # caller's value (may still be None); the defaulting below runs after this call
+            "tiled": tiled,
+            "tile_size": tile_size,
+            "tile_stride": tile_stride,
+        }
+        controlnet_res_stack, controlnet_single_res_stack = controlnet(  # indexed by joint-block id / single-block id below
+            controlnet_frames, **controlnet_extra_kwargs
+        )
+
+    if image_ids is None:
+        image_ids = dit.prepare_image_ids(hidden_states)
+    # Conditioning = dit.time_embedder output + dit.pooled_text_embedder output, plus dit.guidance_embedder output if present.
+    conditioning = dit.time_embedder(timestep, hidden_states.dtype) + dit.pooled_text_embedder(pooled_prompt_emb)
+    if dit.guidance_embedder is not None:
+        guidance = guidance * 1000  # NOTE(review): guidance defaults to None, which would raise here; callers must pass it
+        conditioning = conditioning + dit.guidance_embedder(guidance, hidden_states.dtype)
+    prompt_emb = dit.context_embedder(prompt_emb)
+    image_rotary_emb = dit.pos_embedder(torch.cat((text_ids, image_ids), dim=1))  # text_ids must not be None here
+
+    height, width = hidden_states.shape[-2:]  # taken before patchify so unpatchify can receive them at the end
+    hidden_states = dit.patchify(hidden_states)
+    hidden_states = dit.x_embedder(hidden_states)
+    
+    # Joint Blocks: each block returns updated hidden_states and prompt_emb.
+    for block_id, block in enumerate(dit.blocks):
+        hidden_states, prompt_emb = block(hidden_states, prompt_emb, conditioning, image_rotary_emb)
+        # ControlNet: add this block's residual to hidden_states only (needs one stack entry per item of dit.blocks).
+        if controlnet is not None and controlnet_frames is not None:
+            hidden_states = hidden_states + controlnet_res_stack[block_id]
+
+    # Single Blocks: prompt_emb is concatenated in front, so the first prompt_emb.shape[1] positions on dim 1 hold the prompt.
+    hidden_states = torch.cat([prompt_emb, hidden_states], dim=1)
+    for block_id, block in enumerate(dit.single_blocks):
+        hidden_states, prompt_emb = block(hidden_states, prompt_emb, conditioning, image_rotary_emb)
+        # ControlNet: add this block's residual, via slice assignment, to the positions after the prompt part.
+        if controlnet is not None and controlnet_frames is not None:
+            hidden_states[:, prompt_emb.shape[1]:] = hidden_states[:, prompt_emb.shape[1]:] + controlnet_single_res_stack[block_id]
+    hidden_states = hidden_states[:, prompt_emb.shape[1]:]  # drop the prompt positions again
+
+    hidden_states = dit.final_norm_out(hidden_states, conditioning)
+    hidden_states = dit.final_proj_out(hidden_states)
+    hidden_states = dit.unpatchify(hidden_states, height, width)
+
+    return hidden_states
diff --git a/examples/image_synthesis/flux_controlnet.py b/examples/image_synthesis/flux_controlnet.py
new file mode 100644
index 0000000..80be320
--- /dev/null
+++ b/examples/image_synthesis/flux_controlnet.py
@@ -0,0 +1,44 @@
+from diffsynth.models.flux_controlnet import FluxControlNet  # NOTE(review): not referenced in this script
+from diffsynth import load_state_dict, ModelManager, FluxImagePipeline, hash_state_dict_keys, ControlNetConfigUnit  # NOTE(review): load_state_dict and hash_state_dict_keys are not referenced in this script
+import torch
+from PIL import Image
+import numpy as np  # NOTE(review): not referenced in this script
+
+# Example: run FluxImagePipeline on FLUX.1-dev (bfloat16) with eight ControlNet units configured.
+model_manager = ModelManager(torch_dtype=torch.bfloat16, model_id_list=["FLUX.1-dev"])
+model_manager.load_models([  # ControlNet checkpoint files; every path here is reused as a model_path below
+    "models/ControlNet/InstantX/FLUX___1-dev-Controlnet-Union-alpha/diffusion_pytorch_model.safetensors",
+    "models/ControlNet/jasperai/Flux___1-dev-Controlnet-Depth/diffusion_pytorch_model.safetensors",
+    "models/ControlNet/jasperai/Flux___1-dev-Controlnet-Surface-Normals/diffusion_pytorch_model.safetensors",
+    "models/ControlNet/jasperai/Flux___1-dev-Controlnet-Upscaler/diffusion_pytorch_model.safetensors",
+    "models/ControlNet/alimama-creative/FLUX___1-dev-Controlnet-Inpainting-Alpha/diffusion_pytorch_model.safetensors",
+    "models/ControlNet/alimama-creative/FLUX___1-dev-Controlnet-Inpainting-Beta/diffusion_pytorch_model.safetensors",
+    "models/ControlNet/Shakker-Labs/FLUX___1-dev-ControlNet-Depth/diffusion_pytorch_model.safetensors",
+    "models/ControlNet/Shakker-Labs/FLUX___1-dev-ControlNet-Union-Pro/diffusion_pytorch_model.safetensors"
+])
+pipe = FluxImagePipeline.from_model_manager(model_manager, controlnet_config_units=[  # one unit per checkpoint: processor_id, model_path, scale
+    ControlNetConfigUnit(processor_id="canny", model_path="models/ControlNet/InstantX/FLUX___1-dev-Controlnet-Union-alpha/diffusion_pytorch_model.safetensors", scale=0.3),
+    ControlNetConfigUnit(processor_id="depth", model_path="models/ControlNet/jasperai/Flux___1-dev-Controlnet-Depth/diffusion_pytorch_model.safetensors", scale=0.1),
+    ControlNetConfigUnit(processor_id="normal", model_path="models/ControlNet/jasperai/Flux___1-dev-Controlnet-Surface-Normals/diffusion_pytorch_model.safetensors", scale=0.1),
+    ControlNetConfigUnit(processor_id="tile", model_path="models/ControlNet/jasperai/Flux___1-dev-Controlnet-Upscaler/diffusion_pytorch_model.safetensors", scale=0.05),
+    ControlNetConfigUnit(processor_id="inpaint", model_path="models/ControlNet/alimama-creative/FLUX___1-dev-Controlnet-Inpainting-Alpha/diffusion_pytorch_model.safetensors", scale=0.01),
+    ControlNetConfigUnit(processor_id="inpaint", model_path="models/ControlNet/alimama-creative/FLUX___1-dev-Controlnet-Inpainting-Beta/diffusion_pytorch_model.safetensors", scale=0.01),
+    ControlNetConfigUnit(processor_id="depth", model_path="models/ControlNet/Shakker-Labs/FLUX___1-dev-ControlNet-Depth/diffusion_pytorch_model.safetensors", scale=0.05),
+    ControlNetConfigUnit(processor_id="canny", model_path="models/ControlNet/Shakker-Labs/FLUX___1-dev-ControlNet-Union-Pro/diffusion_pytorch_model.safetensors", scale=0.3),
+])
+
+torch.manual_seed(0)  # fixed seed for the global torch RNG
+# PIL's resize takes (width, height): 768x1024 here matches width=768, height=1024 passed to pipe below.
+control_image = Image.open("controlnet_input.jpeg").resize((768, 1024))
+control_mask = Image.open("controlnet_mask.jpg").resize((768, 1024))
+
+prompt = "masterpiece, best quality, a beautiful girl, CG, blue sky, long red hair, black clothes"
+negative_prompt = "oil painting, worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,"
+# NOTE(review): cfg_scale is not passed; the pipeline only runs its negative-prompt pass when cfg_scale != 1.0 (confirm the default).
+image = pipe(
+    prompt=prompt, negative_prompt=negative_prompt,
+    embedded_guidance=3.5, num_inference_steps=50,
+    height=1024, width=768,
+    controlnet_image=control_image, controlnet_inpaint_mask=control_mask,
+)
+image.save("image.jpg")