flux

2026-04-08 08:58:20 +00:00 · 2025-11-17 14:37:10 +08:00
parent 47246060d6
commit 416ff5df74
24 changed files with 2387 additions and 18 deletions
--- a/diffsynth/configs/model_configs.py
+++ b/diffsynth/configs/model_configs.py
@@ -285,6 +285,34 @@ flux_series = [
        "model_class": "diffsynth.models.flux_text_encoder_t5.FluxTextEncoderT5",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_text_encoder_t5.FluxTextEncoderT5StateDictConverter",
    },
+    {
+        # Example: ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors")
+        "model_hash": "21ea55f476dfc4fd135587abb59dfe5d",
+        "model_name": "flux_vae_encoder",
+        "model_class": "diffsynth.models.flux_vae.FluxVAEEncoder",
+        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_vae.FluxVAEEncoderStateDictConverter",
+    },
+    {
+        # Example: ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors")
+        "model_hash": "21ea55f476dfc4fd135587abb59dfe5d",
+        "model_name": "flux_vae_decoder",
+        "model_class": "diffsynth.models.flux_vae.FluxVAEDecoder",
+        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_vae.FluxVAEDecoderStateDictConverter",
+    },
+    {
+        # Example: ModelConfig(model_id="ostris/Flex.2-preview", origin_file_pattern="Flex.2-preview.safetensors")
+        "model_hash": "d02f41c13549fa5093d3521f62a5570a",
+        "model_name": "flux_dit",
+        "model_class": "diffsynth.models.flux_dit.FluxDiT",
+        "extra_kwargs": {'input_dim': 196, 'num_blocks': 8},
+        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_dit.FluxDiTStateDictConverter",
+    },
+    {
+        # Example: ModelConfig(model_id="DiffSynth-Studio/AttriCtrl-FLUX.1-Dev", origin_file_pattern="models/brightness.safetensors")
+        "model_hash": "0629116fce1472503a66992f96f3eb1a",
+        "model_name": "flux_value_controller",
+        "model_class": "diffsynth.models.flux_value_control.SingleValueEncoder",
+    }
 ]

 MODEL_CONFIGS = qwen_image_series + wan_series + flux_series
--- a/diffsynth/models/flux_ipadapter.py
+++ b/diffsynth/models/flux_ipadapter.py
@@ -1,9 +1,38 @@
-from .svd_image_encoder import SVDImageEncoder
-from .sd3_dit import RMSNorm
-from transformers import CLIPImageProcessor
+from .general_modules import RMSNorm
+from transformers import SiglipVisionModel, SiglipVisionConfig
 import torch


+class SiglipVisionModelSO400M(SiglipVisionModel):
+    """SigLIP vision model instantiated from a hard-coded configuration.
+
+    The hyper-parameters below are the vision-tower values (hidden size 1152,
+    27 layers, 16 heads, 384px images, 14px patches). No weights are loaded
+    here; the module is created with freshly initialised parameters.
+    """
+
+    def __init__(self):
+        # SiglipVisionConfig reads the vision hyper-parameters from top-level
+        # keyword arguments. Passing a composite config (with nested
+        # "vision_config" / "text_config" dicts) would only store those dicts
+        # as extra attributes and leave hidden_size, num_hidden_layers, etc.
+        # at the library defaults, so the values are passed flat instead.
+        config = SiglipVisionConfig(
+            hidden_size=1152,
+            image_size=384,
+            intermediate_size=4304,
+            num_attention_heads=16,
+            num_hidden_layers=27,
+            patch_size=14,
+            architectures=["SiglipModel"],
+            initializer_factor=1.0,
+            torch_dtype="float32",
+            transformers_version="4.37.0.dev0",
+        )
+        super().__init__(config)
+
+
 class MLPProjModel(torch.nn.Module):
    def __init__(self, cross_attention_dim=768, id_embeddings_dim=512, num_tokens=4):
        super().__init__()
--- a/diffsynth/models/flux_vae.py
+++ b/diffsynth/models/flux_vae.py
@@ -106,7 +106,7 @@ class TileWorker:
        return model_output


-class Attention(torch.nn.Module):
+class ConvAttention(torch.nn.Module):

    def __init__(self, q_dim, num_heads, head_dim, kv_dim=None, bias_q=False, bias_kv=False, bias_out=False):
        super().__init__()
@@ -115,10 +115,10 @@ class Attention(torch.nn.Module):
        self.num_heads = num_heads
        self.head_dim = head_dim

-        self.to_q = torch.nn.Linear(q_dim, dim_inner, bias=bias_q)
-        self.to_k = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv)
-        self.to_v = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv)
-        self.to_out = torch.nn.Linear(dim_inner, q_dim, bias=bias_out)
+        self.to_q = torch.nn.Conv2d(q_dim, dim_inner, kernel_size=(1, 1), bias=bias_q)
+        self.to_k = torch.nn.Conv2d(kv_dim, dim_inner, kernel_size=(1, 1), bias=bias_kv)
+        self.to_v = torch.nn.Conv2d(kv_dim, dim_inner, kernel_size=(1, 1), bias=bias_kv)
+        self.to_out = torch.nn.Conv2d(dim_inner, q_dim, kernel_size=(1, 1), bias=bias_out)

    def forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None):
        if encoder_hidden_states is None:
@@ -126,9 +126,14 @@ class Attention(torch.nn.Module):

        batch_size = encoder_hidden_states.shape[0]

-        q = self.to_q(hidden_states)
-        k = self.to_k(encoder_hidden_states)
-        v = self.to_v(encoder_hidden_states)
+        conv_input = rearrange(hidden_states, "B L C -> B C L 1")
+        q = self.to_q(conv_input)
+        q = rearrange(q[:, :, :, 0], "B C L -> B L C")
+        conv_input = rearrange(encoder_hidden_states, "B L C -> B C L 1")
+        k = self.to_k(conv_input)
+        v = self.to_v(conv_input)
+        k = rearrange(k[:, :, :, 0], "B C L -> B L C")
+        v = rearrange(v[:, :, :, 0], "B C L -> B L C")

        q = q.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        k = k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
@@ -138,7 +143,9 @@ class Attention(torch.nn.Module):
        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)
        hidden_states = hidden_states.to(q.dtype)

-        hidden_states = self.to_out(hidden_states)
+        conv_input = rearrange(hidden_states, "B L C -> B C L 1")
+        hidden_states = self.to_out(conv_input)
+        hidden_states = rearrange(hidden_states[:, :, :, 0], "B C L -> B L C")

        return hidden_states

@@ -152,7 +159,7 @@ class VAEAttentionBlock(torch.nn.Module):
        self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=eps, affine=True)

        self.transformer_blocks = torch.nn.ModuleList([
-            Attention(
+            ConvAttention(
                inner_dim,
                num_attention_heads,
                attention_head_dim,
@@ -236,7 +243,7 @@ class DownSampler(torch.nn.Module):
        return hidden_states, time_emb, text_emb, res_stack


-class SD3VAEDecoder(torch.nn.Module):
+class FluxVAEDecoder(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.scaling_factor = 0.3611
@@ -308,7 +315,7 @@ class SD3VAEDecoder(torch.nn.Module):
        return hidden_states


-class SD3VAEEncoder(torch.nn.Module):
+class FluxVAEEncoder(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.scaling_factor = 0.3611
--- a/diffsynth/models/flux_value_control.py
+++ b/diffsynth/models/flux_value_control.py
@@ -1,10 +1,12 @@
 import torch
-from diffsynth.models.svd_unet import TemporalTimesteps
+from .general_modules import TemporalTimesteps


 class MultiValueEncoder(torch.nn.Module):
    def __init__(self, encoders=()):
        super().__init__()
+        # Accept either a sequence of encoder modules or a single module.
+        # Tuples must be treated as sequences too: the default value is the
+        # empty tuple, and wrapping it as [()] would make ModuleList reject
+        # the tuple because it is not a torch.nn.Module.
+        if not isinstance(encoders, (list, tuple)):
+            encoders = [encoders]
        self.encoders = torch.nn.ModuleList(encoders)

    def __call__(self, values, dtype):
--- a/diffsynth/pipelines/flux_image.py
+++ b/diffsynth/pipelines/flux_image.py
--- a/diffsynth/utils/lora/flux.py
+++ b/diffsynth/utils/lora/flux.py
@@ -0,0 +1,204 @@
+from .general import GeneralLoRALoader
+import torch, math
+
+
+class FluxLoRALoader(GeneralLoRALoader):
+    """Load FLUX LoRA weights, converting third-party key layouts first.
+
+    ``convert_state_dict`` recognises two source layouts: keys containing
+    ``"lora_unet_"`` (translated with ``civitai_rename_dict``) and keys
+    starting with ``"transformer."`` (translated with
+    ``diffusers_rename_dict``). In both tables the literal token ``blockid``
+    is a placeholder for the numeric block index.
+    """
+
+    def __init__(self, device="cpu", torch_dtype=torch.float32):
+        """Forward ``device``/``torch_dtype`` to the parent and build the rename tables."""
+        super().__init__(device=device, torch_dtype=torch_dtype)
+
+        self.diffusers_rename_dict = {
+            "transformer.single_transformer_blocks.blockid.attn.to_k.lora_A.weight":"single_blocks.blockid.a_to_k.lora_A.weight",
+            "transformer.single_transformer_blocks.blockid.attn.to_k.lora_B.weight":"single_blocks.blockid.a_to_k.lora_B.weight",
+            "transformer.single_transformer_blocks.blockid.attn.to_q.lora_A.weight":"single_blocks.blockid.a_to_q.lora_A.weight",
+            "transformer.single_transformer_blocks.blockid.attn.to_q.lora_B.weight":"single_blocks.blockid.a_to_q.lora_B.weight",
+            "transformer.single_transformer_blocks.blockid.attn.to_v.lora_A.weight":"single_blocks.blockid.a_to_v.lora_A.weight",
+            "transformer.single_transformer_blocks.blockid.attn.to_v.lora_B.weight":"single_blocks.blockid.a_to_v.lora_B.weight",
+            "transformer.single_transformer_blocks.blockid.norm.linear.lora_A.weight":"single_blocks.blockid.norm.linear.lora_A.weight",
+            "transformer.single_transformer_blocks.blockid.norm.linear.lora_B.weight":"single_blocks.blockid.norm.linear.lora_B.weight",
+            "transformer.single_transformer_blocks.blockid.proj_mlp.lora_A.weight":"single_blocks.blockid.proj_in_besides_attn.lora_A.weight",
+            "transformer.single_transformer_blocks.blockid.proj_mlp.lora_B.weight":"single_blocks.blockid.proj_in_besides_attn.lora_B.weight",
+            "transformer.single_transformer_blocks.blockid.proj_out.lora_A.weight":"single_blocks.blockid.proj_out.lora_A.weight",
+            "transformer.single_transformer_blocks.blockid.proj_out.lora_B.weight":"single_blocks.blockid.proj_out.lora_B.weight",
+            "transformer.transformer_blocks.blockid.attn.add_k_proj.lora_A.weight":"blocks.blockid.attn.b_to_k.lora_A.weight",
+            "transformer.transformer_blocks.blockid.attn.add_k_proj.lora_B.weight":"blocks.blockid.attn.b_to_k.lora_B.weight",
+            "transformer.transformer_blocks.blockid.attn.add_q_proj.lora_A.weight":"blocks.blockid.attn.b_to_q.lora_A.weight",
+            "transformer.transformer_blocks.blockid.attn.add_q_proj.lora_B.weight":"blocks.blockid.attn.b_to_q.lora_B.weight",
+            "transformer.transformer_blocks.blockid.attn.add_v_proj.lora_A.weight":"blocks.blockid.attn.b_to_v.lora_A.weight",
+            "transformer.transformer_blocks.blockid.attn.add_v_proj.lora_B.weight":"blocks.blockid.attn.b_to_v.lora_B.weight",
+            "transformer.transformer_blocks.blockid.attn.to_add_out.lora_A.weight":"blocks.blockid.attn.b_to_out.lora_A.weight",
+            "transformer.transformer_blocks.blockid.attn.to_add_out.lora_B.weight":"blocks.blockid.attn.b_to_out.lora_B.weight",
+            "transformer.transformer_blocks.blockid.attn.to_k.lora_A.weight":"blocks.blockid.attn.a_to_k.lora_A.weight",
+            "transformer.transformer_blocks.blockid.attn.to_k.lora_B.weight":"blocks.blockid.attn.a_to_k.lora_B.weight",
+            "transformer.transformer_blocks.blockid.attn.to_out.0.lora_A.weight":"blocks.blockid.attn.a_to_out.lora_A.weight",
+            "transformer.transformer_blocks.blockid.attn.to_out.0.lora_B.weight":"blocks.blockid.attn.a_to_out.lora_B.weight",
+            "transformer.transformer_blocks.blockid.attn.to_q.lora_A.weight":"blocks.blockid.attn.a_to_q.lora_A.weight",
+            "transformer.transformer_blocks.blockid.attn.to_q.lora_B.weight":"blocks.blockid.attn.a_to_q.lora_B.weight",
+            "transformer.transformer_blocks.blockid.attn.to_v.lora_A.weight":"blocks.blockid.attn.a_to_v.lora_A.weight",
+            "transformer.transformer_blocks.blockid.attn.to_v.lora_B.weight":"blocks.blockid.attn.a_to_v.lora_B.weight",
+            "transformer.transformer_blocks.blockid.ff.net.0.proj.lora_A.weight":"blocks.blockid.ff_a.0.lora_A.weight",
+            "transformer.transformer_blocks.blockid.ff.net.0.proj.lora_B.weight":"blocks.blockid.ff_a.0.lora_B.weight",
+            "transformer.transformer_blocks.blockid.ff.net.2.lora_A.weight":"blocks.blockid.ff_a.2.lora_A.weight",
+            "transformer.transformer_blocks.blockid.ff.net.2.lora_B.weight":"blocks.blockid.ff_a.2.lora_B.weight",
+            "transformer.transformer_blocks.blockid.ff_context.net.0.proj.lora_A.weight":"blocks.blockid.ff_b.0.lora_A.weight",
+            "transformer.transformer_blocks.blockid.ff_context.net.0.proj.lora_B.weight":"blocks.blockid.ff_b.0.lora_B.weight",
+            "transformer.transformer_blocks.blockid.ff_context.net.2.lora_A.weight":"blocks.blockid.ff_b.2.lora_A.weight",
+            "transformer.transformer_blocks.blockid.ff_context.net.2.lora_B.weight":"blocks.blockid.ff_b.2.lora_B.weight",
+            "transformer.transformer_blocks.blockid.norm1.linear.lora_A.weight":"blocks.blockid.norm1_a.linear.lora_A.weight",
+            "transformer.transformer_blocks.blockid.norm1.linear.lora_B.weight":"blocks.blockid.norm1_a.linear.lora_B.weight",
+            "transformer.transformer_blocks.blockid.norm1_context.linear.lora_A.weight":"blocks.blockid.norm1_b.linear.lora_A.weight",
+            "transformer.transformer_blocks.blockid.norm1_context.linear.lora_B.weight":"blocks.blockid.norm1_b.linear.lora_B.weight",
+        }
+
+        self.civitai_rename_dict = {
+            "lora_unet_double_blocks_blockid_img_mod_lin.lora_down.weight": "blocks.blockid.norm1_a.linear.lora_A.weight",
+            "lora_unet_double_blocks_blockid_img_mod_lin.lora_up.weight": "blocks.blockid.norm1_a.linear.lora_B.weight",
+            "lora_unet_double_blocks_blockid_txt_mod_lin.lora_down.weight": "blocks.blockid.norm1_b.linear.lora_A.weight",
+            "lora_unet_double_blocks_blockid_txt_mod_lin.lora_up.weight": "blocks.blockid.norm1_b.linear.lora_B.weight",
+            "lora_unet_double_blocks_blockid_img_attn_qkv.lora_down.weight": "blocks.blockid.attn.a_to_qkv.lora_A.weight",
+            "lora_unet_double_blocks_blockid_img_attn_qkv.lora_up.weight": "blocks.blockid.attn.a_to_qkv.lora_B.weight",
+            "lora_unet_double_blocks_blockid_txt_attn_qkv.lora_down.weight": "blocks.blockid.attn.b_to_qkv.lora_A.weight",
+            "lora_unet_double_blocks_blockid_txt_attn_qkv.lora_up.weight": "blocks.blockid.attn.b_to_qkv.lora_B.weight",
+            "lora_unet_double_blocks_blockid_img_attn_proj.lora_down.weight": "blocks.blockid.attn.a_to_out.lora_A.weight",
+            "lora_unet_double_blocks_blockid_img_attn_proj.lora_up.weight": "blocks.blockid.attn.a_to_out.lora_B.weight",
+            "lora_unet_double_blocks_blockid_txt_attn_proj.lora_down.weight": "blocks.blockid.attn.b_to_out.lora_A.weight",
+            "lora_unet_double_blocks_blockid_txt_attn_proj.lora_up.weight": "blocks.blockid.attn.b_to_out.lora_B.weight",
+            "lora_unet_double_blocks_blockid_img_mlp_0.lora_down.weight": "blocks.blockid.ff_a.0.lora_A.weight",
+            "lora_unet_double_blocks_blockid_img_mlp_0.lora_up.weight": "blocks.blockid.ff_a.0.lora_B.weight",
+            "lora_unet_double_blocks_blockid_img_mlp_2.lora_down.weight": "blocks.blockid.ff_a.2.lora_A.weight",
+            "lora_unet_double_blocks_blockid_img_mlp_2.lora_up.weight": "blocks.blockid.ff_a.2.lora_B.weight",
+            "lora_unet_double_blocks_blockid_txt_mlp_0.lora_down.weight": "blocks.blockid.ff_b.0.lora_A.weight",
+            "lora_unet_double_blocks_blockid_txt_mlp_0.lora_up.weight": "blocks.blockid.ff_b.0.lora_B.weight",
+            "lora_unet_double_blocks_blockid_txt_mlp_2.lora_down.weight": "blocks.blockid.ff_b.2.lora_A.weight",
+            "lora_unet_double_blocks_blockid_txt_mlp_2.lora_up.weight": "blocks.blockid.ff_b.2.lora_B.weight",
+            "lora_unet_single_blocks_blockid_modulation_lin.lora_down.weight": "single_blocks.blockid.norm.linear.lora_A.weight",
+            "lora_unet_single_blocks_blockid_modulation_lin.lora_up.weight": "single_blocks.blockid.norm.linear.lora_B.weight",
+            "lora_unet_single_blocks_blockid_linear1.lora_down.weight": "single_blocks.blockid.to_qkv_mlp.lora_A.weight",
+            "lora_unet_single_blocks_blockid_linear1.lora_up.weight": "single_blocks.blockid.to_qkv_mlp.lora_B.weight",
+            "lora_unet_single_blocks_blockid_linear2.lora_down.weight": "single_blocks.blockid.proj_out.lora_A.weight",
+            "lora_unet_single_blocks_blockid_linear2.lora_up.weight": "single_blocks.blockid.proj_out.lora_B.weight",
+        }
+
+    def fuse_lora_to_base_model(self, model: torch.nn.Module, state_dict_lora, alpha=1.0):
+        """Delegate to the parent loader and pass its return value through."""
+        return super().fuse_lora_to_base_model(model, state_dict_lora, alpha)
+
+    def convert_state_dict(self, state_dict):
+        """Convert a FLUX LoRA state dict to this project's parameter names.
+
+        Steps, in order:
+
+        1. Detect the source layout from the key names. If no key contains
+           ``"lora_unet_"`` or starts with ``"transformer."``, the input dict
+           is returned as is (same object).
+        2. If an ``*.alpha`` entry has a matching ``*.lora_down.weight`` or
+           ``*.lora_A.weight`` entry, every tensor is multiplied by
+           ``sqrt(alpha / weight.shape[0])`` (first match only). The scaling
+           is out-of-place, so the caller's tensors are left untouched.
+        3. Keys found in the rename table are renamed; all other keys are
+           kept under their original name.
+        4. For the ``"transformer."`` layout only, separate q/k/v weights
+           (plus the MLP input projection for single blocks) are merged into
+           the fused ``to_qkv_mlp`` / ``a_to_qkv`` / ``b_to_qkv`` entries.
+
+        Args:
+            state_dict: Mapping from parameter name to tensor.
+
+        Returns:
+            A new dict with converted names, or ``state_dict`` itself when the
+            layout is not recognised.
+
+        Raises:
+            KeyError: If a q weight is present without its k or v counterpart
+                during the merge step.
+        """
+
+        def guess_block_id(name, model_resource):
+            # Return (block index as str, name with that index replaced by
+            # "blockid"), or (None, None) when no purely numeric part exists.
+            if model_resource == 'civitai':
+                for part in name.split("_"):
+                    if part.isdigit():
+                        return part, name.replace(f"_{part}_", "_blockid_")
+            if model_resource == 'diffusers':
+                for part in name.split("."):
+                    if part.isdigit():
+                        return part, name.replace(f"transformer_blocks.{part}.", "transformer_blocks.blockid.")
+            return None, None
+
+        def guess_resource(state_dict):
+            # The first key that matches either layout decides.
+            for k in state_dict:
+                if "lora_unet_" in k:
+                    return 'civitai'
+                if k.startswith("transformer."):
+                    return 'diffusers'
+            return None
+
+        def guess_alpha(state_dict):
+            # Every tensor is multiplied by this same factor, so the product
+            # of two scaled factors carries the full alpha / shape[0] ratio.
+            for name, param in state_dict.items():
+                if ".alpha" in name:
+                    for suffix in [".lora_down.weight", ".lora_A.weight"]:
+                        name_ = name.replace(".alpha", suffix)
+                        if name_ in state_dict:
+                            lora_alpha = param.item() / state_dict[name_].shape[0]
+                            return math.sqrt(lora_alpha)
+            return 1
+
+        def stack_rows(parts):
+            # Fuse by concatenating along dim 0.
+            return torch.concat(parts, dim=0)
+
+        def block_diagonal(parts):
+            # Fuse 2-D tensors into one block-diagonal matrix, zero elsewhere.
+            rows = sum(p.shape[0] for p in parts)
+            cols = sum(p.shape[1] for p in parts)
+            fused = torch.zeros((rows, cols), dtype=parts[0].dtype, device=parts[0].device)
+            row, col = 0, 0
+            for p in parts:
+                fused[row:row + p.shape[0], col:col + p.shape[1]] = p
+                row += p.shape[0]
+                col += p.shape[1]
+            return fused
+
+        model_resource = guess_resource(state_dict)
+        if model_resource is None:
+            return state_dict
+
+        rename_dict = self.diffusers_rename_dict if model_resource == 'diffusers' else self.civitai_rename_dict
+        alpha = guess_alpha(state_dict)
+
+        state_dict_ = {}
+        for name, param in state_dict.items():
+            block_id, source_name = guess_block_id(name, model_resource)
+            if alpha != 1:
+                # Out-of-place on purpose: an in-place multiply would rescale
+                # the caller's tensors and compound on repeated conversion.
+                param = param * alpha
+            if source_name in rename_dict:
+                target_name = rename_dict[source_name]
+                target_name = target_name.replace(".blockid.", f".{block_id}.")
+                state_dict_[target_name] = param
+            else:
+                state_dict_[name] = param
+
+        if model_resource == 'diffusers':
+            # Single blocks: merge q, k, v and the MLP input projection.
+            for name in list(state_dict_.keys()):
+                if "single_blocks." in name and ".a_to_q." in name:
+                    is_down = 'lora_A' in name
+                    is_up = (not is_down) and 'lora_B' in name
+                    mlp = state_dict_.pop(name.replace(".a_to_q.", ".proj_in_besides_attn."), None)
+                    q = state_dict_.pop(name)
+                    k = state_dict_.pop(name.replace(".a_to_q.", ".a_to_k."))
+                    v = state_dict_.pop(name.replace(".a_to_q.", ".a_to_v."))
+                    if mlp is None:
+                        # No MLP LoRA in the checkpoint: fill its slot with
+                        # zeros (1x the q rows for lora_A, 4x otherwise),
+                        # created on q's device so the fuse below cannot hit
+                        # a device mismatch.
+                        dim = 1 if is_down else 4
+                        mlp = torch.zeros(dim * q.shape[0], *q.shape[1:], dtype=q.dtype, device=q.device)
+                    parts = [q, k, v, mlp]
+                    param = block_diagonal(parts) if is_up else stack_rows(parts)
+                    state_dict_[name.replace(".a_to_q.", ".to_qkv_mlp.")] = param
+            # Double blocks: merge q, k, v for both attention streams.
+            for name in list(state_dict_.keys()):
+                for component in ["a", "b"]:
+                    q_tag = f".{component}_to_q."
+                    if q_tag not in name:
+                        continue
+                    is_up = 'lora_A' not in name and 'lora_B' in name
+                    q = state_dict_.pop(name)
+                    k = state_dict_.pop(name.replace(q_tag, f".{component}_to_k."))
+                    v = state_dict_.pop(name.replace(q_tag, f".{component}_to_v."))
+                    parts = [q, k, v]
+                    param = block_diagonal(parts) if is_up else stack_rows(parts)
+                    state_dict_[name.replace(q_tag, f".{component}_to_qkv.")] = param
+        return state_dict_
--- a/diffsynth/utils/state_dict_converters/flux_vae.py
+++ b/diffsynth/utils/state_dict_converters/flux_vae.py
@@ -0,0 +1,264 @@
+def FluxVAEEncoderStateDictConverter(state_dict):
+    """Rename FLUX VAE encoder checkpoint keys to this project's module names.
+
+    Keys listed in the table below are copied to the result under their new
+    name, in the order they appear in ``state_dict``; every other key is
+    dropped. Values are passed through untouched.
+    """
+    key_map = {
+        "encoder.conv_in.bias": "conv_in.bias",
+        "encoder.conv_in.weight": "conv_in.weight",
+        "encoder.conv_out.bias": "conv_out.bias",
+        "encoder.conv_out.weight": "conv_out.weight",
+        "encoder.down.0.block.0.conv1.bias": "blocks.0.conv1.bias",
+        "encoder.down.0.block.0.conv1.weight": "blocks.0.conv1.weight",
+        "encoder.down.0.block.0.conv2.bias": "blocks.0.conv2.bias",
+        "encoder.down.0.block.0.conv2.weight": "blocks.0.conv2.weight",
+        "encoder.down.0.block.0.norm1.bias": "blocks.0.norm1.bias",
+        "encoder.down.0.block.0.norm1.weight": "blocks.0.norm1.weight",
+        "encoder.down.0.block.0.norm2.bias": "blocks.0.norm2.bias",
+        "encoder.down.0.block.0.norm2.weight": "blocks.0.norm2.weight",
+        "encoder.down.0.block.1.conv1.bias": "blocks.1.conv1.bias",
+        "encoder.down.0.block.1.conv1.weight": "blocks.1.conv1.weight",
+        "encoder.down.0.block.1.conv2.bias": "blocks.1.conv2.bias",
+        "encoder.down.0.block.1.conv2.weight": "blocks.1.conv2.weight",
+        "encoder.down.0.block.1.norm1.bias": "blocks.1.norm1.bias",
+        "encoder.down.0.block.1.norm1.weight": "blocks.1.norm1.weight",
+        "encoder.down.0.block.1.norm2.bias": "blocks.1.norm2.bias",
+        "encoder.down.0.block.1.norm2.weight": "blocks.1.norm2.weight",
+        "encoder.down.0.downsample.conv.bias": "blocks.2.conv.bias",
+        "encoder.down.0.downsample.conv.weight": "blocks.2.conv.weight",
+        "encoder.down.1.block.0.conv1.bias": "blocks.3.conv1.bias",
+        "encoder.down.1.block.0.conv1.weight": "blocks.3.conv1.weight",
+        "encoder.down.1.block.0.conv2.bias": "blocks.3.conv2.bias",
+        "encoder.down.1.block.0.conv2.weight": "blocks.3.conv2.weight",
+        "encoder.down.1.block.0.nin_shortcut.bias": "blocks.3.conv_shortcut.bias",
+        "encoder.down.1.block.0.nin_shortcut.weight": "blocks.3.conv_shortcut.weight",
+        "encoder.down.1.block.0.norm1.bias": "blocks.3.norm1.bias",
+        "encoder.down.1.block.0.norm1.weight": "blocks.3.norm1.weight",
+        "encoder.down.1.block.0.norm2.bias": "blocks.3.norm2.bias",
+        "encoder.down.1.block.0.norm2.weight": "blocks.3.norm2.weight",
+        "encoder.down.1.block.1.conv1.bias": "blocks.4.conv1.bias",
+        "encoder.down.1.block.1.conv1.weight": "blocks.4.conv1.weight",
+        "encoder.down.1.block.1.conv2.bias": "blocks.4.conv2.bias",
+        "encoder.down.1.block.1.conv2.weight": "blocks.4.conv2.weight",
+        "encoder.down.1.block.1.norm1.bias": "blocks.4.norm1.bias",
+        "encoder.down.1.block.1.norm1.weight": "blocks.4.norm1.weight",
+        "encoder.down.1.block.1.norm2.bias": "blocks.4.norm2.bias",
+        "encoder.down.1.block.1.norm2.weight": "blocks.4.norm2.weight",
+        "encoder.down.1.downsample.conv.bias": "blocks.5.conv.bias",
+        "encoder.down.1.downsample.conv.weight": "blocks.5.conv.weight",
+        "encoder.down.2.block.0.conv1.bias": "blocks.6.conv1.bias",
+        "encoder.down.2.block.0.conv1.weight": "blocks.6.conv1.weight",
+        "encoder.down.2.block.0.conv2.bias": "blocks.6.conv2.bias",
+        "encoder.down.2.block.0.conv2.weight": "blocks.6.conv2.weight",
+        "encoder.down.2.block.0.nin_shortcut.bias": "blocks.6.conv_shortcut.bias",
+        "encoder.down.2.block.0.nin_shortcut.weight": "blocks.6.conv_shortcut.weight",
+        "encoder.down.2.block.0.norm1.bias": "blocks.6.norm1.bias",
+        "encoder.down.2.block.0.norm1.weight": "blocks.6.norm1.weight",
+        "encoder.down.2.block.0.norm2.bias": "blocks.6.norm2.bias",
+        "encoder.down.2.block.0.norm2.weight": "blocks.6.norm2.weight",
+        "encoder.down.2.block.1.conv1.bias": "blocks.7.conv1.bias",
+        "encoder.down.2.block.1.conv1.weight": "blocks.7.conv1.weight",
+        "encoder.down.2.block.1.conv2.bias": "blocks.7.conv2.bias",
+        "encoder.down.2.block.1.conv2.weight": "blocks.7.conv2.weight",
+        "encoder.down.2.block.1.norm1.bias": "blocks.7.norm1.bias",
+        "encoder.down.2.block.1.norm1.weight": "blocks.7.norm1.weight",
+        "encoder.down.2.block.1.norm2.bias": "blocks.7.norm2.bias",
+        "encoder.down.2.block.1.norm2.weight": "blocks.7.norm2.weight",
+        "encoder.down.2.downsample.conv.bias": "blocks.8.conv.bias",
+        "encoder.down.2.downsample.conv.weight": "blocks.8.conv.weight",
+        "encoder.down.3.block.0.conv1.bias": "blocks.9.conv1.bias",
+        "encoder.down.3.block.0.conv1.weight": "blocks.9.conv1.weight",
+        "encoder.down.3.block.0.conv2.bias": "blocks.9.conv2.bias",
+        "encoder.down.3.block.0.conv2.weight": "blocks.9.conv2.weight",
+        "encoder.down.3.block.0.norm1.bias": "blocks.9.norm1.bias",
+        "encoder.down.3.block.0.norm1.weight": "blocks.9.norm1.weight",
+        "encoder.down.3.block.0.norm2.bias": "blocks.9.norm2.bias",
+        "encoder.down.3.block.0.norm2.weight": "blocks.9.norm2.weight",
+        "encoder.down.3.block.1.conv1.bias": "blocks.10.conv1.bias",
+        "encoder.down.3.block.1.conv1.weight": "blocks.10.conv1.weight",
+        "encoder.down.3.block.1.conv2.bias": "blocks.10.conv2.bias",
+        "encoder.down.3.block.1.conv2.weight": "blocks.10.conv2.weight",
+        "encoder.down.3.block.1.norm1.bias": "blocks.10.norm1.bias",
+        "encoder.down.3.block.1.norm1.weight": "blocks.10.norm1.weight",
+        "encoder.down.3.block.1.norm2.bias": "blocks.10.norm2.bias",
+        "encoder.down.3.block.1.norm2.weight": "blocks.10.norm2.weight",
+        "encoder.mid.attn_1.k.bias": "blocks.12.transformer_blocks.0.to_k.bias",
+        "encoder.mid.attn_1.k.weight": "blocks.12.transformer_blocks.0.to_k.weight",
+        "encoder.mid.attn_1.norm.bias": "blocks.12.norm.bias",
+        "encoder.mid.attn_1.norm.weight": "blocks.12.norm.weight",
+        "encoder.mid.attn_1.proj_out.bias": "blocks.12.transformer_blocks.0.to_out.bias",
+        "encoder.mid.attn_1.proj_out.weight": "blocks.12.transformer_blocks.0.to_out.weight",
+        "encoder.mid.attn_1.q.bias": "blocks.12.transformer_blocks.0.to_q.bias",
+        "encoder.mid.attn_1.q.weight": "blocks.12.transformer_blocks.0.to_q.weight",
+        "encoder.mid.attn_1.v.bias": "blocks.12.transformer_blocks.0.to_v.bias",
+        "encoder.mid.attn_1.v.weight": "blocks.12.transformer_blocks.0.to_v.weight",
+        "encoder.mid.block_1.conv1.bias": "blocks.11.conv1.bias",
+        "encoder.mid.block_1.conv1.weight": "blocks.11.conv1.weight",
+        "encoder.mid.block_1.conv2.bias": "blocks.11.conv2.bias",
+        "encoder.mid.block_1.conv2.weight": "blocks.11.conv2.weight",
+        "encoder.mid.block_1.norm1.bias": "blocks.11.norm1.bias",
+        "encoder.mid.block_1.norm1.weight": "blocks.11.norm1.weight",
+        "encoder.mid.block_1.norm2.bias": "blocks.11.norm2.bias",
+        "encoder.mid.block_1.norm2.weight": "blocks.11.norm2.weight",
+        "encoder.mid.block_2.conv1.bias": "blocks.13.conv1.bias",
+        "encoder.mid.block_2.conv1.weight": "blocks.13.conv1.weight",
+        "encoder.mid.block_2.conv2.bias": "blocks.13.conv2.bias",
+        "encoder.mid.block_2.conv2.weight": "blocks.13.conv2.weight",
+        "encoder.mid.block_2.norm1.bias": "blocks.13.norm1.bias",
+        "encoder.mid.block_2.norm1.weight": "blocks.13.norm1.weight",
+        "encoder.mid.block_2.norm2.bias": "blocks.13.norm2.bias",
+        "encoder.mid.block_2.norm2.weight": "blocks.13.norm2.weight",
+        "encoder.norm_out.bias": "conv_norm_out.bias",
+        "encoder.norm_out.weight": "conv_norm_out.weight",
+    }
+    # Keep only the keys the table knows about, under their new names.
+    return {key_map[key]: state_dict[key] for key in state_dict if key in key_map}
+
+
+def FluxVAEDecoderStateDictConverter(state_dict):
+    """Rename the ``decoder.*`` checkpoint keys listed below; keys without a mapping are dropped."""
+    rename_dict = {
+        "decoder.conv_in.bias": "conv_in.bias",
+        "decoder.conv_in.weight": "conv_in.weight",
+        "decoder.conv_out.bias": "conv_out.bias",
+        "decoder.conv_out.weight": "conv_out.weight",
+        "decoder.mid.attn_1.k.bias": "blocks.1.transformer_blocks.0.to_k.bias",
+        "decoder.mid.attn_1.k.weight": "blocks.1.transformer_blocks.0.to_k.weight",
+        "decoder.mid.attn_1.norm.bias": "blocks.1.norm.bias",
+        "decoder.mid.attn_1.norm.weight": "blocks.1.norm.weight",
+        "decoder.mid.attn_1.proj_out.bias": "blocks.1.transformer_blocks.0.to_out.bias",
+        "decoder.mid.attn_1.proj_out.weight": "blocks.1.transformer_blocks.0.to_out.weight",
+        "decoder.mid.attn_1.q.bias": "blocks.1.transformer_blocks.0.to_q.bias",
+        "decoder.mid.attn_1.q.weight": "blocks.1.transformer_blocks.0.to_q.weight",
+        "decoder.mid.attn_1.v.bias": "blocks.1.transformer_blocks.0.to_v.bias",
+        "decoder.mid.attn_1.v.weight": "blocks.1.transformer_blocks.0.to_v.weight",
+        "decoder.mid.block_1.conv1.bias": "blocks.0.conv1.bias",
+        "decoder.mid.block_1.conv1.weight": "blocks.0.conv1.weight",
+        "decoder.mid.block_1.conv2.bias": "blocks.0.conv2.bias",
+        "decoder.mid.block_1.conv2.weight": "blocks.0.conv2.weight",
+        "decoder.mid.block_1.norm1.bias": "blocks.0.norm1.bias",
+        "decoder.mid.block_1.norm1.weight": "blocks.0.norm1.weight",
+        "decoder.mid.block_1.norm2.bias": "blocks.0.norm2.bias",
+        "decoder.mid.block_1.norm2.weight": "blocks.0.norm2.weight",
+        "decoder.mid.block_2.conv1.bias": "blocks.2.conv1.bias",
+        "decoder.mid.block_2.conv1.weight": "blocks.2.conv1.weight",
+        "decoder.mid.block_2.conv2.bias": "blocks.2.conv2.bias",
+        "decoder.mid.block_2.conv2.weight": "blocks.2.conv2.weight",
+        "decoder.mid.block_2.norm1.bias": "blocks.2.norm1.bias",
+        "decoder.mid.block_2.norm1.weight": "blocks.2.norm1.weight",
+        "decoder.mid.block_2.norm2.bias": "blocks.2.norm2.bias",
+        "decoder.mid.block_2.norm2.weight": "blocks.2.norm2.weight",
+        "decoder.norm_out.bias": "conv_norm_out.bias",
+        "decoder.norm_out.weight": "conv_norm_out.weight",
+        "decoder.up.0.block.0.conv1.bias": "blocks.15.conv1.bias",
+        "decoder.up.0.block.0.conv1.weight": "blocks.15.conv1.weight",
+        "decoder.up.0.block.0.conv2.bias": "blocks.15.conv2.bias",
+        "decoder.up.0.block.0.conv2.weight": "blocks.15.conv2.weight",
+        "decoder.up.0.block.0.nin_shortcut.bias": "blocks.15.conv_shortcut.bias",
+        "decoder.up.0.block.0.nin_shortcut.weight": "blocks.15.conv_shortcut.weight",
+        "decoder.up.0.block.0.norm1.bias": "blocks.15.norm1.bias",
+        "decoder.up.0.block.0.norm1.weight": "blocks.15.norm1.weight",
+        "decoder.up.0.block.0.norm2.bias": "blocks.15.norm2.bias",
+        "decoder.up.0.block.0.norm2.weight": "blocks.15.norm2.weight",
+        "decoder.up.0.block.1.conv1.bias": "blocks.16.conv1.bias",
+        "decoder.up.0.block.1.conv1.weight": "blocks.16.conv1.weight",
+        "decoder.up.0.block.1.conv2.bias": "blocks.16.conv2.bias",
+        "decoder.up.0.block.1.conv2.weight": "blocks.16.conv2.weight",
+        "decoder.up.0.block.1.norm1.bias": "blocks.16.norm1.bias",
+        "decoder.up.0.block.1.norm1.weight": "blocks.16.norm1.weight",
+        "decoder.up.0.block.1.norm2.bias": "blocks.16.norm2.bias",
+        "decoder.up.0.block.1.norm2.weight": "blocks.16.norm2.weight",
+        "decoder.up.0.block.2.conv1.bias": "blocks.17.conv1.bias",
+        "decoder.up.0.block.2.conv1.weight": "blocks.17.conv1.weight",
+        "decoder.up.0.block.2.conv2.bias": "blocks.17.conv2.bias",
+        "decoder.up.0.block.2.conv2.weight": "blocks.17.conv2.weight",
+        "decoder.up.0.block.2.norm1.bias": "blocks.17.norm1.bias",
+        "decoder.up.0.block.2.norm1.weight": "blocks.17.norm1.weight",
+        "decoder.up.0.block.2.norm2.bias": "blocks.17.norm2.bias",
+        "decoder.up.0.block.2.norm2.weight": "blocks.17.norm2.weight",
+        "decoder.up.1.block.0.conv1.bias": "blocks.11.conv1.bias",
+        "decoder.up.1.block.0.conv1.weight": "blocks.11.conv1.weight",
+        "decoder.up.1.block.0.conv2.bias": "blocks.11.conv2.bias",
+        "decoder.up.1.block.0.conv2.weight": "blocks.11.conv2.weight",
+        "decoder.up.1.block.0.nin_shortcut.bias": "blocks.11.conv_shortcut.bias",
+        "decoder.up.1.block.0.nin_shortcut.weight": "blocks.11.conv_shortcut.weight",
+        "decoder.up.1.block.0.norm1.bias": "blocks.11.norm1.bias",
+        "decoder.up.1.block.0.norm1.weight": "blocks.11.norm1.weight",
+        "decoder.up.1.block.0.norm2.bias": "blocks.11.norm2.bias",
+        "decoder.up.1.block.0.norm2.weight": "blocks.11.norm2.weight",
+        "decoder.up.1.block.1.conv1.bias": "blocks.12.conv1.bias",
+        "decoder.up.1.block.1.conv1.weight": "blocks.12.conv1.weight",
+        "decoder.up.1.block.1.conv2.bias": "blocks.12.conv2.bias",
+        "decoder.up.1.block.1.conv2.weight": "blocks.12.conv2.weight",
+        "decoder.up.1.block.1.norm1.bias": "blocks.12.norm1.bias",
+        "decoder.up.1.block.1.norm1.weight": "blocks.12.norm1.weight",
+        "decoder.up.1.block.1.norm2.bias": "blocks.12.norm2.bias",
+        "decoder.up.1.block.1.norm2.weight": "blocks.12.norm2.weight",
+        "decoder.up.1.block.2.conv1.bias": "blocks.13.conv1.bias",
+        "decoder.up.1.block.2.conv1.weight": "blocks.13.conv1.weight",
+        "decoder.up.1.block.2.conv2.bias": "blocks.13.conv2.bias",
+        "decoder.up.1.block.2.conv2.weight": "blocks.13.conv2.weight",
+        "decoder.up.1.block.2.norm1.bias": "blocks.13.norm1.bias",
+        "decoder.up.1.block.2.norm1.weight": "blocks.13.norm1.weight",
+        "decoder.up.1.block.2.norm2.bias": "blocks.13.norm2.bias",
+        "decoder.up.1.block.2.norm2.weight": "blocks.13.norm2.weight",
+        "decoder.up.1.upsample.conv.bias": "blocks.14.conv.bias",
+        "decoder.up.1.upsample.conv.weight": "blocks.14.conv.weight",
+        "decoder.up.2.block.0.conv1.bias": "blocks.7.conv1.bias",
+        "decoder.up.2.block.0.conv1.weight": "blocks.7.conv1.weight",
+        "decoder.up.2.block.0.conv2.bias": "blocks.7.conv2.bias",
+        "decoder.up.2.block.0.conv2.weight": "blocks.7.conv2.weight",
+        "decoder.up.2.block.0.norm1.bias": "blocks.7.norm1.bias",
+        "decoder.up.2.block.0.norm1.weight": "blocks.7.norm1.weight",
+        "decoder.up.2.block.0.norm2.bias": "blocks.7.norm2.bias",
+        "decoder.up.2.block.0.norm2.weight": "blocks.7.norm2.weight",
+        "decoder.up.2.block.1.conv1.bias": "blocks.8.conv1.bias",
+        "decoder.up.2.block.1.conv1.weight": "blocks.8.conv1.weight",
+        "decoder.up.2.block.1.conv2.bias": "blocks.8.conv2.bias",
+        "decoder.up.2.block.1.conv2.weight": "blocks.8.conv2.weight",
+        "decoder.up.2.block.1.norm1.bias": "blocks.8.norm1.bias",
+        "decoder.up.2.block.1.norm1.weight": "blocks.8.norm1.weight",
+        "decoder.up.2.block.1.norm2.bias": "blocks.8.norm2.bias",
+        "decoder.up.2.block.1.norm2.weight": "blocks.8.norm2.weight",
+        "decoder.up.2.block.2.conv1.bias": "blocks.9.conv1.bias",
+        "decoder.up.2.block.2.conv1.weight": "blocks.9.conv1.weight",
+        "decoder.up.2.block.2.conv2.bias": "blocks.9.conv2.bias",
+        "decoder.up.2.block.2.conv2.weight": "blocks.9.conv2.weight",
+        "decoder.up.2.block.2.norm1.bias": "blocks.9.norm1.bias",
+        "decoder.up.2.block.2.norm1.weight": "blocks.9.norm1.weight",
+        "decoder.up.2.block.2.norm2.bias": "blocks.9.norm2.bias",
+        "decoder.up.2.block.2.norm2.weight": "blocks.9.norm2.weight",
+        "decoder.up.2.upsample.conv.bias": "blocks.10.conv.bias",
+        "decoder.up.2.upsample.conv.weight": "blocks.10.conv.weight",
+        "decoder.up.3.block.0.conv1.bias": "blocks.3.conv1.bias",
+        "decoder.up.3.block.0.conv1.weight": "blocks.3.conv1.weight",
+        "decoder.up.3.block.0.conv2.bias": "blocks.3.conv2.bias",
+        "decoder.up.3.block.0.conv2.weight": "blocks.3.conv2.weight",
+        "decoder.up.3.block.0.norm1.bias": "blocks.3.norm1.bias",
+        "decoder.up.3.block.0.norm1.weight": "blocks.3.norm1.weight",
+        "decoder.up.3.block.0.norm2.bias": "blocks.3.norm2.bias",
+        "decoder.up.3.block.0.norm2.weight": "blocks.3.norm2.weight",
+        "decoder.up.3.block.1.conv1.bias": "blocks.4.conv1.bias",
+        "decoder.up.3.block.1.conv1.weight": "blocks.4.conv1.weight",
+        "decoder.up.3.block.1.conv2.bias": "blocks.4.conv2.bias",
+        "decoder.up.3.block.1.conv2.weight": "blocks.4.conv2.weight",
+        "decoder.up.3.block.1.norm1.bias": "blocks.4.norm1.bias",
+        "decoder.up.3.block.1.norm1.weight": "blocks.4.norm1.weight",
+        "decoder.up.3.block.1.norm2.bias": "blocks.4.norm2.bias",
+        "decoder.up.3.block.1.norm2.weight": "blocks.4.norm2.weight",
+        "decoder.up.3.block.2.conv1.bias": "blocks.5.conv1.bias",
+        "decoder.up.3.block.2.conv1.weight": "blocks.5.conv1.weight",
+        "decoder.up.3.block.2.conv2.bias": "blocks.5.conv2.bias",
+        "decoder.up.3.block.2.conv2.weight": "blocks.5.conv2.weight",
+        "decoder.up.3.block.2.norm1.bias": "blocks.5.norm1.bias",
+        "decoder.up.3.block.2.norm1.weight": "blocks.5.norm1.weight",
+        "decoder.up.3.block.2.norm2.bias": "blocks.5.norm2.bias",
+        "decoder.up.3.block.2.norm2.weight": "blocks.5.norm2.weight",
+        "decoder.up.3.upsample.conv.bias": "blocks.6.conv.bias",
+        "decoder.up.3.upsample.conv.weight": "blocks.6.conv.weight",
+    }
+    return {
+        rename_dict[name]: state_dict[name]
+        for name in state_dict
+        if name in rename_dict
+    }
--- a/docs/Model_Details/Overview.md
+++ b/docs/Model_Details/Overview.md
@@ -101,7 +101,7 @@ graph LR;

 ```python
 import torch
-from diffsynth.pipelines.flux_image_new import FluxImagePipeline, ModelConfig
+from diffsynth.pipelines.flux_image import FluxImagePipeline, ModelConfig

 pipe = FluxImagePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
@@ -109,7 +109,7 @@ pipe = FluxImagePipeline.from_pretrained(
    model_configs=[
        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors"),
        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"),
-        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/*.safetensors"),
        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"),
    ],
 )
--- a/examples/flux/model_inference/FLEX.2-preview.py
+++ b/examples/flux/model_inference/FLEX.2-preview.py
@@ -0,0 +1,50 @@
+import torch
+from diffsynth.pipelines.flux_image import FluxImagePipeline, ModelConfig
+from diffsynth.utils.controlnet import Annotator
+import numpy as np
+from PIL import Image
+
+
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="ostris/Flex.2-preview", origin_file_pattern="Flex.2-preview.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/*.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"),
+    ],
+)
+# Step 1: text-to-image generation.
+image = pipe(
+    prompt="portrait of a beautiful Asian girl, long hair, red t-shirt, sunshine, beach",
+    num_inference_steps=50, embedded_guidance=3.5,
+    seed=0
+)
+image.save("image_1.jpg")
+# Step 2: inpainting; the mask is 255 inside rows 200:400, columns 400:700 and 0 elsewhere.
+mask = np.zeros((1024, 1024, 3), dtype=np.uint8)  # NOTE(review): assumes the step-1 image is 1024x1024 - confirm
+mask[200:400, 400:700] = 255
+mask = Image.fromarray(mask)
+mask.save("image_mask.jpg")
+
+inpaint_image = image
+
+image = pipe(
+    prompt="portrait of a beautiful Asian girl with sunglasses, long hair, red t-shirt, sunshine, beach",
+    num_inference_steps=50, embedded_guidance=3.5,
+    flex_inpaint_image=inpaint_image, flex_inpaint_mask=mask,
+    seed=4
+)
+image.save("image_2_new.jpg")
+# Step 3: generation guided by a canny annotation of the inpainted image.
+control_image = Annotator("canny")(image)
+control_image.save("image_control.jpg")
+
+image = pipe(
+    prompt="portrait of a beautiful Asian girl with sunglasses, long hair, yellow t-shirt, sunshine, beach",
+    num_inference_steps=50, embedded_guidance=3.5,
+    flex_control_image=control_image,
+    seed=4
+)
+image.save("image_3_new.jpg")
--- a/examples/flux/model_inference/FLUX.1-Kontext-dev.py
+++ b/examples/flux/model_inference/FLUX.1-Kontext-dev.py
@@ -0,0 +1,54 @@
+import torch
+from diffsynth.pipelines.flux_image import FluxImagePipeline, ModelConfig
+from PIL import Image
+
+
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="black-forest-labs/FLUX.1-Kontext-dev", origin_file_pattern="flux1-kontext-dev.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/*.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"),
+    ],
+)
+# Generate the base image from a text prompt only.
+image_1 = pipe(
+    prompt="a beautiful Asian long-haired female college student.",
+    embedded_guidance=2.5,
+    seed=1,
+)
+image_1.save("image_1.jpg")
+# Every call below passes image_1 as kontext_images, with a different instruction prompt and seed.
+image_2 = pipe(
+    prompt="transform the style to anime style.",
+    kontext_images=image_1,
+    embedded_guidance=2.5,
+    seed=2,
+)
+image_2.save("image_2.jpg")
+
+image_3 = pipe(
+    prompt="let her smile.",
+    kontext_images=image_1,
+    embedded_guidance=2.5,
+    seed=3,
+)
+image_3.save("image_3.jpg")
+
+image_4 = pipe(
+    prompt="let the girl play basketball.",
+    kontext_images=image_1,
+    embedded_guidance=2.5,
+    seed=4,
+)
+image_4.save("image_4.jpg")
+
+image_5 = pipe(
+    prompt="move the girl to a park, let her sit on a chair.",
+    kontext_images=image_1,
+    embedded_guidance=2.5,
+    seed=5,
+)
+image_5.save("image_5.jpg")
--- a/examples/flux/model_inference/FLUX.1-Krea-dev.py
+++ b/examples/flux/model_inference/FLUX.1-Krea-dev.py
@@ -0,0 +1,27 @@
+import torch
+from diffsynth.pipelines.flux_image import FluxImagePipeline, ModelConfig
+
+
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="black-forest-labs/FLUX.1-Krea-dev", origin_file_pattern="flux1-krea-dev.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/*.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"),
+    ],
+)
+
+prompt = "An beautiful woman is riding a bicycle in a park, wearing a red dress"
+negative_prompt = "worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,"
+# First run: prompt, seed and embedded_guidance only (no negative_prompt or cfg_scale passed).
+image = pipe(prompt=prompt, seed=0, embedded_guidance=4.5)
+image.save("flux_krea.jpg")
+# Second run: same prompt and seed, now with negative_prompt, cfg_scale=2 and 50 inference steps.
+image = pipe(
+    prompt=prompt, negative_prompt=negative_prompt,
+    seed=0, cfg_scale=2, num_inference_steps=50,
+    embedded_guidance=4.5
+)
+image.save("flux_krea_cfg.jpg")
--- a/examples/flux/model_inference/FLUX.1-dev-AttriCtrl.py
+++ b/examples/flux/model_inference/FLUX.1-dev-AttriCtrl.py
@@ -0,0 +1,19 @@
+import torch
+from diffsynth.pipelines.flux_image import FluxImagePipeline, ModelConfig
+
+
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/*.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"),
+        ModelConfig(model_id="DiffSynth-Studio/AttriCtrl-FLUX.1-Dev", origin_file_pattern="models/brightness.safetensors")
+    ],
+)
+# Same prompt and seed, once per value passed as value_controller_inputs (brightness.safetensors is loaded above).
+for i in [0.1, 0.3, 0.5, 0.7, 0.9]:
+    image = pipe(prompt="a cat on the beach", seed=2, value_controller_inputs=[i])
+    image.save(f"value_control_{i}.jpg")
--- a/examples/flux/model_inference/FLUX.1-dev-Controlnet-Inpainting-Beta.py
+++ b/examples/flux/model_inference/FLUX.1-dev-Controlnet-Inpainting-Beta.py
@@ -0,0 +1,37 @@
+import torch
+from diffsynth.pipelines.flux_image import FluxImagePipeline, ModelConfig, ControlNetInput
+import numpy as np
+from PIL import Image
+
+
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/*.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"),
+        ModelConfig(model_id="alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta", origin_file_pattern="diffusion_pytorch_model.safetensors"),
+    ],
+)
+# Generate the 1024x1024 base image that the second run receives as ControlNet input.
+image_1 = pipe(
+    prompt="a cat sitting on a chair",
+    height=1024, width=1024,
+    seed=8, rand_device="cuda",
+)
+image_1.save("image_1.jpg")
+# Rectangular mask: 255 inside rows 100:350, columns 350:-300; 0 elsewhere.
+mask = np.zeros((1024, 1024, 3), dtype=np.uint8)
+mask[100:350, 350: -300] = 255
+mask = Image.fromarray(mask)
+mask.save("mask.jpg")
+# Second run: edited prompt, with image_1 and the mask passed through ControlNetInput (scale=0.9).
+image_2 = pipe(
+    prompt="a cat sitting on a chair, wearing sunglasses",
+    controlnet_inputs=[ControlNetInput(image=image_1, inpaint_mask=mask, scale=0.9)],
+    height=1024, width=1024,
+    seed=9, rand_device="cuda",
+)
+image_2.save("image_2.jpg")
--- a/examples/flux/model_inference/FLUX.1-dev-Controlnet-Union-alpha.py
+++ b/examples/flux/model_inference/FLUX.1-dev-Controlnet-Union-alpha.py
@@ -0,0 +1,40 @@
+import torch
+from diffsynth.pipelines.flux_image import FluxImagePipeline, ModelConfig, ControlNetInput
+from diffsynth.utils.controlnet import Annotator
+from modelscope import snapshot_download
+
+
+
+snapshot_download("sd_lora/Annotators", allow_file_pattern="dpt_hybrid-midas-501f0c75.pt", local_dir="models/Annotators")
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/*.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"),
+        ModelConfig(model_id="InstantX/FLUX.1-dev-Controlnet-Union-alpha", origin_file_pattern="diffusion_pytorch_model.safetensors"),
+    ],
+)
+# Generate the reference image from text only.
+image_1 = pipe(
+    prompt="a beautiful Asian girl, full body, red dress, summer",
+    height=1024, width=1024,
+    seed=6, rand_device="cuda",
+)
+image_1.save("image_1.jpg")
+# Build canny and depth control images from the reference image.
+image_canny = Annotator("canny")(image_1)
+image_depth = Annotator("depth")(image_1)
+# Second run: changed prompt, with both control images passed at scale 0.3 each.
+image_2 = pipe(
+    prompt="a beautiful Asian girl, full body, red dress, winter",
+    controlnet_inputs=[
+        ControlNetInput(image=image_canny, scale=0.3, processor_id="canny"),
+        ControlNetInput(image=image_depth, scale=0.3, processor_id="depth"),
+    ],
+    height=1024, width=1024,
+    seed=7, rand_device="cuda",
+)
+image_2.save("image_2.jpg")
--- a/examples/flux/model_inference/FLUX.1-dev-Controlnet-Upscaler.py
+++ b/examples/flux/model_inference/FLUX.1-dev-Controlnet-Upscaler.py
@@ -0,0 +1,33 @@
+import torch
+from diffsynth.pipelines.flux_image import FluxImagePipeline, ModelConfig, ControlNetInput
+
+
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/*.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"),
+        ModelConfig(model_id="jasperai/Flux.1-dev-Controlnet-Upscaler", origin_file_pattern="diffusion_pytorch_model.safetensors"),
+    ],
+)
+# Generate a 768x768 base image.
+image_1 = pipe(
+    prompt="a photo of a cat, highly detailed",
+    height=768, width=768,
+    seed=0, rand_device="cuda",
+)
+image_1.save("image_1.jpg")
+# Resize it to 2048x2048 and pass it as both the ControlNet image and input_image for a tiled run.
+image_1 = image_1.resize((2048, 2048))
+image_2 = pipe(
+    prompt="a photo of a cat, highly detailed",
+    controlnet_inputs=[ControlNetInput(image=image_1, scale=0.7)],
+    input_image=image_1,
+    denoising_strength=0.99,
+    height=2048, width=2048, tiled=True,
+    seed=1, rand_device="cuda",
+)
+image_2.save("image_2.jpg")
--- a/examples/flux/model_inference/FLUX.1-dev-EliGen.py
+++ b/examples/flux/model_inference/FLUX.1-dev-EliGen.py
@@ -0,0 +1,133 @@
+import random
+import torch
+from PIL import Image, ImageDraw, ImageFont
+from diffsynth.pipelines.flux_image import FluxImagePipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+def visualize_masks(image, masks, mask_prompts, output_path, font_size=35, use_random_colors=False):
+    """Overlay tinted entity masks with their prompt labels on `image`.
+
+    Pure-white mask pixels are tinted (fixed palette reused cyclically, or random
+    colors when `use_random_colors` is set) and each prompt is drawn near the
+    top-left of its mask's bounding box. The RGBA result is saved to
+    `output_path` and returned.
+    """
+    # Create a blank image for overlays
+    overlay = Image.new('RGBA', image.size, (0, 0, 0, 0))
+
+    colors = [
+        (165, 238, 173, 80),
+        (76, 102, 221, 80),
+        (221, 160, 77, 80),
+        (204, 93, 71, 80),
+        (145, 187, 149, 80),
+        (134, 141, 172, 80),
+        (157, 137, 109, 80),
+        (153, 104, 95, 80),
+    ]
+    # Generate random colors for each mask
+    if use_random_colors:
+        colors = [(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), 80) for _ in range(len(masks))]
+
+    # Font settings
+    try:
+        font = ImageFont.truetype("arial", font_size)  # Adjust as needed
+    except IOError:
+        font = ImageFont.load_default(font_size)
+
+    # Overlay each mask onto the overlay image
+    for index, (mask, mask_prompt) in enumerate(zip(masks, mask_prompts)):
+        # Wrap around the palette so masks beyond its length are still drawn
+        color = colors[index % len(colors)]
+        mask_rgba = mask.convert('RGBA')
+        mask_data = mask_rgba.getdata()
+        new_data = [(color if item[:3] == (255, 255, 255) else (0, 0, 0, 0)) for item in mask_data]
+        mask_rgba.putdata(new_data)
+
+        # Draw the mask prompt text on the mask
+        draw = ImageDraw.Draw(mask_rgba)
+        mask_bbox = mask.getbbox()  # None when the mask has no non-zero pixels
+        if mask_bbox is not None:
+            text_position = (mask_bbox[0] + 10, mask_bbox[1] + 10)  # Adjust text position based on mask position
+            draw.text(text_position, mask_prompt, fill=(255, 255, 255, 255), font=font)
+
+        # Alpha composite the overlay with this mask
+        overlay = Image.alpha_composite(overlay, mask_rgba)
+
+    # Composite the overlay onto the original image
+    result = Image.alpha_composite(image.convert('RGBA'), overlay)
+
+    # Save the resulting image
+    result.save(output_path)
+    return result
+
+def example(pipe, seeds, example_id, global_prompt, entity_prompts):
+    """Fetch the entity masks of one example, then generate and save an image plus mask overlay per seed."""
+    example_dir = f"data/examples/eligen/entity_control/example_{example_id}"
+    dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern=f"{example_dir}/*.png")
+    masks = [Image.open(f"./{example_dir}/{mask_index}.png").convert('RGB') for mask_index in range(len(entity_prompts))]
+    negative_prompt = "worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,"
+    for seed in seeds:
+        # One generation per seed; the same masks and prompts are reused each time.
+        image = pipe(
+            prompt=global_prompt,
+            cfg_scale=3.0,
+            negative_prompt=negative_prompt,
+            num_inference_steps=50, embedded_guidance=3.5,
+            seed=seed,
+            height=1024, width=1024,
+            eligen_entity_prompts=entity_prompts,
+            eligen_entity_masks=masks,
+        )
+        image.save(f"eligen_example_{example_id}_{seed}.png")
+        visualize_masks(image, masks, entity_prompts, f"eligen_example_{example_id}_mask_{seed}.png")
+
+
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/*.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"),
+    ],
+)
+pipe.load_lora(pipe.dit, ModelConfig(model_id="DiffSynth-Studio/Eligen", origin_file_pattern="model_bf16.safetensors"), alpha=1)
+
+# Example 1: six entity prompts, seed 0
+global_prompt = "A breathtaking beauty of Raja Ampat by the late-night moonlight , one beautiful woman from behind wearing a pale blue long dress with soft glow, sitting at the top of a cliff looking towards the beach,pastell light colors, a group of small distant birds flying in far sky, a boat sailing on the sea, best quality, realistic, whimsical, fantastic, splash art, intricate detailed, hyperdetailed, maximalist style, photorealistic, concept art, sharp focus, harmony, serenity, tranquility, soft pastell colors,ambient occlusion, cozy ambient lighting, masterpiece, liiv1, linquivera, metix, mentixis, masterpiece, award winning, view from above\n"
+entity_prompts = ["cliff", "sea", "moon", "sailing boat", "a seated beautiful woman", "pale blue long dress with soft glow"]
+example(pipe, [0], 1, global_prompt, entity_prompts)
+
+# Example 2: four entity prompts, seed 0
+global_prompt = "samurai girl wearing a kimono, she's holding a sword  glowing with red flame, her long hair is flowing in the wind, she is looking at a small bird perched on the back of her hand. ultra realist style. maximum image detail. maximum realistic render."
+entity_prompts = ["flowing hair", "sword glowing with red flame", "A cute bird", "blue belt"]
+example(pipe, [0], 2, global_prompt, entity_prompts)
+
+# Example 3: four entity prompts (the monk prompt appears twice), seed 27
+global_prompt = "Image of a neverending staircase up to a mysterious palace in the sky, The ancient palace stood majestically atop a mist-shrouded mountain, sunrise, two traditional monk walk in the stair looking at the sunrise, fog,see-through, best quality, whimsical, fantastic, splash art, intricate detailed, hyperdetailed, photorealistic, concept art, harmony, serenity, tranquility, ambient occlusion, halation, cozy ambient lighting, dynamic lighting,masterpiece, liiv1, linquivera, metix, mentixis, masterpiece, award winning,"
+entity_prompts = ["ancient palace", "stone staircase with railings", "a traditional monk", "a traditional monk"]
+example(pipe, [27], 3, global_prompt, entity_prompts)
+
+# Example 4: four entity prompts, seed 21
+global_prompt = "A beautiful girl wearing shirt and shorts in the street,  holding a sign 'Entity Control'"
+entity_prompts = ["A beautiful girl", "sign 'Entity Control'", "shorts", "shirt"]
+example(pipe, [21], 4, global_prompt, entity_prompts)
+
+# Example 5: four entity prompts, seed 0
+global_prompt = "A captivating, dramatic scene in a painting that exudes mystery and foreboding. A white sky, swirling blue clouds, and a crescent yellow moon illuminate a solitary woman standing near the water's edge. Her long dress flows in the wind, silhouetted against the eerie glow. The water mirrors the fiery sky and moonlight, amplifying the uneasy atmosphere."
+entity_prompts = ["crescent yellow moon", "a solitary woman", "water", "swirling blue clouds"]
+example(pipe, [0], 5, global_prompt, entity_prompts)
+
+# Example 6: seven entity prompts, seed 8
+global_prompt = "Snow White and the 6 Dwarfs."
+entity_prompts = ["Dwarf 1", "Dwarf 2", "Dwarf 3", "Snow White", "Dwarf 4", "Dwarf 5", "Dwarf 6"]
+example(pipe, [8], 6, global_prompt, entity_prompts)
+
+# Example 7: same prompt rendered once per seed in 5, 6, 7, 8
+seeds = range(5, 9)
+global_prompt = "A beautiful woman wearing white dress, holding a mirror, with a warm light background;"
+entity_prompts = ["A beautiful woman", "mirror", "necklace", "glasses", "earring", "white dress", "jewelry headpiece"]
+example(pipe, seeds, 7, global_prompt, entity_prompts)
--- a/examples/flux/model_inference/FLUX.1-dev-IP-Adapter.py
+++ b/examples/flux/model_inference/FLUX.1-dev-IP-Adapter.py
@@ -0,0 +1,24 @@
+import torch
+from diffsynth.pipelines.flux_image import FluxImagePipeline, ModelConfig
+
+
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/*.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"),
+        ModelConfig(model_id="InstantX/FLUX.1-dev-IP-Adapter", origin_file_pattern="ip-adapter.bin"),
+        ModelConfig(model_id="google/siglip-so400m-patch14-384", origin_file_pattern="model.safetensors"),
+    ],
+)
+# First run: generate the image that is passed as ipadapter_images below.
+origin_prompt = "a rabbit in a garden, colorful flowers"
+image = pipe(prompt=origin_prompt, height=1280, width=960, seed=42)
+image.save("style image.jpg")
+# Second run: new prompt, same size and seed, with the first image as ipadapter_images (scale 0.7).
+image = pipe(prompt="A piggy", height=1280, width=960, seed=42,
+    ipadapter_images=[image], ipadapter_scale=0.7)
+image.save("A piggy.jpg")
--- a/examples/flux/model_inference/FLUX.1-dev-InfiniteYou.py
+++ b/examples/flux/model_inference/FLUX.1-dev-InfiniteYou.py
@@ -0,0 +1,59 @@
+import torch
+from diffsynth.pipelines.flux_image import FluxImagePipeline, ModelConfig, ControlNetInput
+from modelscope import dataset_snapshot_download
+from modelscope import snapshot_download
+from PIL import Image
+import numpy as np
+
+
+snapshot_download(
+    "ByteDance/InfiniteYou",
+    allow_file_pattern="supports/insightface/models/antelopev2/*",
+    local_dir="models/ByteDance/InfiniteYou",
+)
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/*.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"),
+        ModelConfig(model_id="ByteDance/InfiniteYou", origin_file_pattern="infu_flux_v1.0/aes_stage2/image_proj_model.bin"),
+        ModelConfig(model_id="ByteDance/InfiniteYou", origin_file_pattern="infu_flux_v1.0/aes_stage2/InfuseNetModel/*.safetensors"),
+    ],
+)
+# Download the example identity images opened below.
+dataset_snapshot_download(
+    dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+    local_dir="./",
+    allow_file_pattern="data/examples/infiniteyou/*",
+)
+# An all-zero image is passed as the ControlNet input for both runs.
+height, width = 1024, 1024
+controlnet_image = Image.fromarray(np.zeros([height, width, 3], dtype=np.uint8))
+controlnet_inputs = [ControlNetInput(image=controlnet_image, scale=1.0, processor_id="None")]
+# Run 1: man.jpg as the identity image.
+prompt = "A man, portrait, cinematic"
+id_image = "data/examples/infiniteyou/man.jpg"
+id_image = Image.open(id_image).convert('RGB')
+image = pipe(
+    prompt=prompt, seed=1,
+    infinityou_id_image=id_image, infinityou_guidance=1.0,
+    controlnet_inputs=controlnet_inputs,
+    num_inference_steps=50, embedded_guidance=3.5,
+    height=height, width=width,
+)
+image.save("man.jpg")
+# Run 2: same settings with woman.jpg as the identity image.
+prompt = "A woman, portrait, cinematic"
+id_image = "data/examples/infiniteyou/woman.jpg"
+id_image = Image.open(id_image).convert('RGB')
+image = pipe(
+    prompt=prompt, seed=1,
+    infinityou_id_image=id_image, infinityou_guidance=1.0,
+    controlnet_inputs=controlnet_inputs,
+    num_inference_steps=50, embedded_guidance=3.5,
+    height=height, width=width,
+)
+image.save("woman.jpg")
--- a/examples/flux/model_inference/FLUX.1-dev-LoRA-Encoder.py
+++ b/examples/flux/model_inference/FLUX.1-dev-LoRA-Encoder.py
@@ -0,0 +1,40 @@
+import torch
+from diffsynth.pipelines.flux_image import FluxImagePipeline, ModelConfig
+
+
+# Base FLUX.1-dev checkpoints followed by the LoRA-Encoder checkpoint.
+model_configs = [
+    ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors"),
+    ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"),
+    ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/*.safetensors"),
+    ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"),
+    ModelConfig(model_id="DiffSynth-Studio/LoRA-Encoder-FLUX.1-Dev", origin_file_pattern="model.safetensors"),
+]
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16, device="cuda",
+    model_configs=model_configs,
+)
+pipe.enable_lora_magic()
+
+# Hot-load the LoRA onto `pipe.dit`. Use `pipe.clear_lora()` to drop the loaded LoRA.
+lora = ModelConfig(model_id="VoidOc/flux_animal_forest1", origin_file_pattern="20.safetensors")
+pipe.load_lora(pipe.dit, lora, hotload=True)
+
+# Empty prompt can automatically activate LoRA capabilities.
+with_lora = pipe(prompt="", seed=0, lora_encoder_inputs=lora)
+with_lora.save("image_1.jpg")
+# Same prompt and seed without `lora_encoder_inputs`, saved for comparison.
+baseline = pipe(prompt="", seed=0)
+baseline.save("image_1_origin.jpg")
+
+# Prompt without trigger words can also activate LoRA capabilities.
+with_lora = pipe(prompt="a car", seed=0, lora_encoder_inputs=lora)
+with_lora.save("image_2.jpg")
+baseline = pipe(prompt="a car", seed=0)
+baseline.save("image_2_origin.jpg")
+
+# Adjust the activation intensity through the scale parameter.
+full_scale = pipe(prompt="a cat", seed=0, lora_encoder_inputs=lora, lora_encoder_scale=1.0)
+full_scale.save("image_3.jpg")
+half_scale = pipe(prompt="a cat", seed=0, lora_encoder_inputs=lora, lora_encoder_scale=0.5)
+half_scale.save("image_3_scale.jpg")
--- a/examples/flux/model_inference/FLUX.1-dev-LoRA-Fusion.py
+++ b/examples/flux/model_inference/FLUX.1-dev-LoRA-Fusion.py
@@ -0,0 +1,29 @@
+import torch
+from diffsynth.pipelines.flux_image import FluxImagePipeline, ModelConfig
+
+        
+# Base FLUX.1-dev checkpoints followed by the LoRAFusion-preview checkpoint.
+model_configs = [
+    ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors"),
+    ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"),
+    ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/*.safetensors"),
+    ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"),
+    ModelConfig(model_id="DiffSynth-Studio/LoRAFusion-preview-FLUX.1-dev", origin_file_pattern="model.safetensors"),
+]
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16, device="cuda",
+    model_configs=model_configs,
+)
+pipe.enable_lora_magic()
+
+# Hot-load two LoRAs onto `pipe.dit`, one after the other, before generating.
+for lora_model_id, lora_file in (
+    ("cancel13/cxsk", "30.safetensors"),
+    ("DiffSynth-Studio/ArtAug-lora-FLUX.1dev-v1", "merged_lora.safetensors"),
+):
+    lora_config = ModelConfig(model_id=lora_model_id, origin_file_pattern=lora_file)
+    pipe.load_lora(pipe.dit, lora_config, hotload=True)
+
+# Generate once with both LoRAs loaded.
+fused_image = pipe(prompt="a cat", seed=0)
+fused_image.save("image_fused.jpg")
--- a/examples/flux/model_inference/FLUX.1-dev.py
+++ b/examples/flux/model_inference/FLUX.1-dev.py
@@ -0,0 +1,26 @@
+import torch
+from diffsynth.pipelines.flux_image import FluxImagePipeline, ModelConfig
+
+
+# FLUX.1-dev checkpoints handed to the pipeline loader.
+model_configs = [
+    ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors"),
+    ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"),
+    ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/*.safetensors"),
+    ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"),
+]
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16, device="cuda", model_configs=model_configs,
+)
+
+prompt = "CG, masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait. The girl's flowing silver hair shimmers with every color of the rainbow and cascades down, merging with the floating flora around her."
+negative_prompt = "worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,"
+
+# First pass: only the prompt and the seed are passed to the pipeline.
+image_default = pipe(prompt=prompt, seed=0)
+image_default.save("flux.jpg")
+
+# Second pass: same prompt and seed, plus a negative prompt, cfg_scale=2 and 50 inference steps.
+cfg_options = dict(negative_prompt=negative_prompt, cfg_scale=2, num_inference_steps=50)
+image_cfg = pipe(prompt=prompt, seed=0, **cfg_options)
+image_cfg.save("flux_cfg.jpg")
--- a/examples/flux/model_inference/Nexus-Gen-Editing.py
+++ b/examples/flux/model_inference/Nexus-Gen-Editing.py
@@ -0,0 +1,37 @@
+import importlib
+import torch
+from PIL import Image
+from diffsynth.pipelines.flux_image import FluxImagePipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+import importlib.util  # `import importlib` alone does not guarantee the `util` submodule is loaded
+# Fail fast with an install hint. `raise` is used because `assert` is stripped by `python -O`.
+if importlib.util.find_spec("transformers") is None:
+    raise ImportError("You are using Nexus-GenV2. It depends on transformers, which is not installed. Please install it with `pip install transformers==4.49.0`.")
+import transformers
+if transformers.__version__ != "4.49.0":
+    raise ImportError("Nexus-GenV2 requires transformers==4.49.0, please install it with `pip install transformers==4.49.0`.")
+
+
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="model*.safetensors"),
+        ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="edit_decoder.bin"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/*.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"),
+    ],
+    nexus_gen_processor_config=ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="processor/"),
+)
+
+# Download the reference image to edit; a plain string suffices, the pattern has no placeholders.
+dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern="data/examples/nexusgen/cat.jpg")
+ref_image = Image.open("data/examples/nexusgen/cat.jpg").convert("RGB")
+image = pipe(
+    prompt="Add a crown.", negative_prompt="", nexus_gen_reference_image=ref_image,
+    seed=42, cfg_scale=2.0, num_inference_steps=50, height=512, width=512,
+)
+image.save("cat_crown.jpg")
--- a/examples/flux/model_inference/Nexus-Gen-Generation.py
+++ b/examples/flux/model_inference/Nexus-Gen-Generation.py
@@ -0,0 +1,32 @@
+import importlib
+import torch
+from diffsynth.pipelines.flux_image import FluxImagePipeline, ModelConfig
+
+
+import importlib.util  # `import importlib` alone does not guarantee the `util` submodule is loaded
+# Fail fast with an install hint. `raise` is used because `assert` is stripped by `python -O`.
+if importlib.util.find_spec("transformers") is None:
+    raise ImportError("You are using Nexus-GenV2. It depends on transformers, which is not installed. Please install it with `pip install transformers==4.49.0`.")
+import transformers
+if transformers.__version__ != "4.49.0":
+    raise ImportError("Nexus-GenV2 requires transformers==4.49.0, please install it with `pip install transformers==4.49.0`.")
+
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="model*.safetensors"),
+        ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="generation_decoder.bin"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/*.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"),
+    ],
+    # Repo id by keyword and the "processor/" pattern match Nexus-Gen-Editing.py; TODO confirm against ModelConfig.
+    nexus_gen_processor_config=ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="processor/"),
+)
+
+image = pipe(
+    prompt="一只可爱的猫咪", negative_prompt="",
+    seed=0, cfg_scale=3, num_inference_steps=50, height=1024, width=1024,
+)
+image.save("cat.jpg")
--- a/examples/flux/model_inference/Step1X-Edit.py
+++ b/examples/flux/model_inference/Step1X-Edit.py
@@ -0,0 +1,32 @@
+import torch
+from diffsynth.pipelines.flux_image import FluxImagePipeline, ModelConfig
+from PIL import Image
+import numpy as np
+
+
+# Checkpoints handed to the pipeline loader.
+model_configs = [
+    ModelConfig(model_id="Qwen/Qwen2.5-VL-7B-Instruct", origin_file_pattern="model-*.safetensors"),
+    ModelConfig(model_id="stepfun-ai/Step1X-Edit", origin_file_pattern="step1x-edit-i1258.safetensors"),
+    ModelConfig(model_id="stepfun-ai/Step1X-Edit", origin_file_pattern="vae.safetensors"),
+]
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16, device="cuda", model_configs=model_configs,
+)
+
+width, height = 832, 1248
+# Start from an all-white canvas; each edit's output becomes the next edit's reference image.
+image = Image.fromarray(np.full((height, width, 3), 255, dtype=np.uint8))
+# (prompt, seed, output file) for each successive edit.
+edits = (
+    ("draw red flowers in Chinese ink painting style", 1, "image_1.jpg"),
+    ("add more flowers in Chinese ink painting style", 2, "image_2.jpg"),
+)
+for edit_prompt, edit_seed, output_path in edits:
+    image = pipe(
+        prompt=edit_prompt,
+        step1x_reference_image=image,
+        width=width, height=height, cfg_scale=6,
+        seed=edit_seed, rand_device='cuda'
+    )
+    image.save(output_path)