DiffSynth-Studio 2.0 major update

2026-03-20 15:48:20 +00:00 · 2025-12-04 16:33:07 +08:00
parent afd101f345
commit 72af7122b3
758 changed files with 26462 additions and 2221398 deletions
--- a/diffsynth/models/flux_dit.py
+++ b/diffsynth/models/flux_dit.py
@@ -1,8 +1,7 @@
 import torch
-from .sd3_dit import TimestepEmbeddings, AdaLayerNorm, RMSNorm
+from .general_modules import TimestepEmbeddings, AdaLayerNorm, RMSNorm
 from einops import rearrange
-from .tiler import TileWorker
-from .utils import init_weights_on_device, hash_state_dict_keys
+

 def interact_with_ipadapter(hidden_states, q, ip_k, ip_v, scale=1.0):
    batch_size, num_tokens = hidden_states.shape[0:2]
@@ -269,7 +268,7 @@ class AdaLayerNormContinuous(torch.nn.Module):

    def forward(self, x, conditioning):
        emb = self.linear(self.silu(conditioning))
-        scale, shift = torch.chunk(emb, 2, dim=1)
+        shift, scale = torch.chunk(emb, 2, dim=1)
        x = self.norm(x) * (1 + scale)[:, None] + shift[:, None]
        return x

@@ -321,25 +320,6 @@ class FluxDiT(torch.nn.Module):
        return latent_image_ids


-    def tiled_forward(
-        self,
-        hidden_states,
-        timestep, prompt_emb, pooled_prompt_emb, guidance, text_ids,
-        tile_size=128, tile_stride=64,
-        **kwargs
-    ):
-        # Due to the global positional embedding, we cannot implement layer-wise tiled forward.
-        hidden_states = TileWorker().tiled_forward(
-            lambda x: self.forward(x, timestep, prompt_emb, pooled_prompt_emb, guidance, text_ids, image_ids=None),
-            hidden_states,
-            tile_size,
-            tile_stride,
-            tile_device=hidden_states.device,
-            tile_dtype=hidden_states.dtype
-        )
-        return hidden_states
-
-
    def construct_mask(self, entity_masks, prompt_seq_len, image_seq_len):
        N = len(entity_masks)
        batch_size = entity_masks[0].shape[0]
@@ -411,338 +391,5 @@ class FluxDiT(torch.nn.Module):
        use_gradient_checkpointing=False,
        **kwargs
    ):
-        if tiled:
-            return self.tiled_forward(
-                hidden_states,
-                timestep, prompt_emb, pooled_prompt_emb, guidance, text_ids,
-                tile_size=tile_size, tile_stride=tile_stride,
-                **kwargs
-            )
-
-        if image_ids is None:
-            image_ids = self.prepare_image_ids(hidden_states)
-
-        conditioning = self.time_embedder(timestep, hidden_states.dtype) + self.pooled_text_embedder(pooled_prompt_emb)
-        if self.guidance_embedder is not None:
-            guidance = guidance * 1000
-            conditioning = conditioning + self.guidance_embedder(guidance, hidden_states.dtype)
-
-        height, width = hidden_states.shape[-2:]
-        hidden_states = self.patchify(hidden_states)
-        hidden_states = self.x_embedder(hidden_states)
-
-        if entity_prompt_emb is not None and entity_masks is not None:
-            prompt_emb, image_rotary_emb, attention_mask = self.process_entity_masks(hidden_states, prompt_emb, entity_prompt_emb, entity_masks, text_ids, image_ids)
-        else:
-            prompt_emb = self.context_embedder(prompt_emb)
-            image_rotary_emb = self.pos_embedder(torch.cat((text_ids, image_ids), dim=1))
-            attention_mask = None
-
-        def create_custom_forward(module):
-            def custom_forward(*inputs):
-                return module(*inputs)
-            return custom_forward
-
-        for block in self.blocks:
-            if self.training and use_gradient_checkpointing:
-                hidden_states, prompt_emb = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(block),
-                    hidden_states, prompt_emb, conditioning, image_rotary_emb, attention_mask,
-                    use_reentrant=False,
-                )
-            else:
-                hidden_states, prompt_emb = block(hidden_states, prompt_emb, conditioning, image_rotary_emb, attention_mask)
-
-        hidden_states = torch.cat([prompt_emb, hidden_states], dim=1)
-        for block in self.single_blocks:
-            if self.training and use_gradient_checkpointing:
-                hidden_states, prompt_emb = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(block),
-                    hidden_states, prompt_emb, conditioning, image_rotary_emb, attention_mask,
-                    use_reentrant=False,
-                )
-            else:
-                hidden_states, prompt_emb = block(hidden_states, prompt_emb, conditioning, image_rotary_emb, attention_mask)
-        hidden_states = hidden_states[:, prompt_emb.shape[1]:]
-
-        hidden_states = self.final_norm_out(hidden_states, conditioning)
-        hidden_states = self.final_proj_out(hidden_states)
-        hidden_states = self.unpatchify(hidden_states, height, width)
-
-        return hidden_states
-
-
-    def quantize(self):
-        def cast_to(weight, dtype=None, device=None, copy=False):
-            if device is None or weight.device == device:
-                if not copy:
-                    if dtype is None or weight.dtype == dtype:
-                        return weight
-                return weight.to(dtype=dtype, copy=copy)
-
-            r = torch.empty_like(weight, dtype=dtype, device=device)
-            r.copy_(weight)
-            return r
-
-        def cast_weight(s, input=None, dtype=None, device=None):
-            if input is not None:
-                if dtype is None:
-                    dtype = input.dtype
-                if device is None:
-                    device = input.device
-            weight = cast_to(s.weight, dtype, device)
-            return weight
-
-        def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None):
-            if input is not None:
-                if dtype is None:
-                    dtype = input.dtype
-                if bias_dtype is None:
-                    bias_dtype = dtype
-                if device is None:
-                    device = input.device
-            bias = None
-            weight = cast_to(s.weight, dtype, device)
-            bias = cast_to(s.bias, bias_dtype, device)
-            return weight, bias
-
-        class quantized_layer:
-            class Linear(torch.nn.Linear):
-                def __init__(self, *args, **kwargs):
-                    super().__init__(*args, **kwargs)
-
-                def forward(self,input,**kwargs):
-                    weight,bias= cast_bias_weight(self,input)
-                    return torch.nn.functional.linear(input,weight,bias)
-
-            class RMSNorm(torch.nn.Module):
-                def __init__(self, module):
-                    super().__init__()
-                    self.module = module
-
-                def forward(self,hidden_states,**kwargs):
-                    weight= cast_weight(self.module,hidden_states)
-                    input_dtype = hidden_states.dtype
-                    variance = hidden_states.to(torch.float32).square().mean(-1, keepdim=True)
-                    hidden_states = hidden_states * torch.rsqrt(variance + self.module.eps)
-                    hidden_states = hidden_states.to(input_dtype) * weight
-                    return hidden_states
-
-        def replace_layer(model):
-            for name, module in model.named_children():
-                if isinstance(module, torch.nn.Linear):
-                    with init_weights_on_device():
-                        new_layer = quantized_layer.Linear(module.in_features,module.out_features)
-                    new_layer.weight = module.weight
-                    if module.bias is not None:
-                        new_layer.bias = module.bias
-                    # del module
-                    setattr(model, name, new_layer)
-                elif isinstance(module, RMSNorm):
-                    if hasattr(module,"quantized"):
-                        continue
-                    module.quantized= True
-                    new_layer = quantized_layer.RMSNorm(module)
-                    setattr(model, name, new_layer)
-                else:
-                    replace_layer(module)
-
-        replace_layer(self)
-
-
-    @staticmethod
-    def state_dict_converter():
-        return FluxDiTStateDictConverter()
-
-
-class FluxDiTStateDictConverter:
-    def __init__(self):
-        pass
-
-    def from_diffusers(self, state_dict):
-        global_rename_dict = {
-            "context_embedder": "context_embedder",
-            "x_embedder": "x_embedder",
-            "time_text_embed.timestep_embedder.linear_1": "time_embedder.timestep_embedder.0",
-            "time_text_embed.timestep_embedder.linear_2": "time_embedder.timestep_embedder.2",
-            "time_text_embed.guidance_embedder.linear_1": "guidance_embedder.timestep_embedder.0",
-            "time_text_embed.guidance_embedder.linear_2": "guidance_embedder.timestep_embedder.2",
-            "time_text_embed.text_embedder.linear_1": "pooled_text_embedder.0",
-            "time_text_embed.text_embedder.linear_2": "pooled_text_embedder.2",
-            "norm_out.linear": "final_norm_out.linear",
-            "proj_out": "final_proj_out",
-        }
-        rename_dict = {
-            "proj_out": "proj_out",
-            "norm1.linear": "norm1_a.linear",
-            "norm1_context.linear": "norm1_b.linear",
-            "attn.to_q": "attn.a_to_q",
-            "attn.to_k": "attn.a_to_k",
-            "attn.to_v": "attn.a_to_v",
-            "attn.to_out.0": "attn.a_to_out",
-            "attn.add_q_proj": "attn.b_to_q",
-            "attn.add_k_proj": "attn.b_to_k",
-            "attn.add_v_proj": "attn.b_to_v",
-            "attn.to_add_out": "attn.b_to_out",
-            "ff.net.0.proj": "ff_a.0",
-            "ff.net.2": "ff_a.2",
-            "ff_context.net.0.proj": "ff_b.0",
-            "ff_context.net.2": "ff_b.2",
-            "attn.norm_q": "attn.norm_q_a",
-            "attn.norm_k": "attn.norm_k_a",
-            "attn.norm_added_q": "attn.norm_q_b",
-            "attn.norm_added_k": "attn.norm_k_b",
-        }
-        rename_dict_single = {
-            "attn.to_q": "a_to_q",
-            "attn.to_k": "a_to_k",
-            "attn.to_v": "a_to_v",
-            "attn.norm_q": "norm_q_a",
-            "attn.norm_k": "norm_k_a",
-            "norm.linear": "norm.linear",
-            "proj_mlp": "proj_in_besides_attn",
-            "proj_out": "proj_out",
-        }
-        state_dict_ = {}
-        for name, param in state_dict.items():
-            if name.endswith(".weight") or name.endswith(".bias"):
-                suffix = ".weight" if name.endswith(".weight") else ".bias"
-                prefix = name[:-len(suffix)]
-                if prefix in global_rename_dict:
-                    state_dict_[global_rename_dict[prefix] + suffix] = param
-                elif prefix.startswith("transformer_blocks."):
-                    names = prefix.split(".")
-                    names[0] = "blocks"
-                    middle = ".".join(names[2:])
-                    if middle in rename_dict:
-                        name_ = ".".join(names[:2] + [rename_dict[middle]] + [suffix[1:]])
-                        state_dict_[name_] = param
-                elif prefix.startswith("single_transformer_blocks."):
-                    names = prefix.split(".")
-                    names[0] = "single_blocks"
-                    middle = ".".join(names[2:])
-                    if middle in rename_dict_single:
-                        name_ = ".".join(names[:2] + [rename_dict_single[middle]] + [suffix[1:]])
-                        state_dict_[name_] = param
-                    else:
-                        pass
-                else:
-                    pass
-        for name in list(state_dict_.keys()):
-            if "single_blocks." in name and ".a_to_q." in name:
-                mlp = state_dict_.get(name.replace(".a_to_q.", ".proj_in_besides_attn."), None)
-                if mlp is None:
-                    mlp = torch.zeros(4 * state_dict_[name].shape[0],
-                                      *state_dict_[name].shape[1:],
-                                      dtype=state_dict_[name].dtype)
-                else:
-                    state_dict_.pop(name.replace(".a_to_q.", ".proj_in_besides_attn."))
-                param = torch.concat([
-                    state_dict_.pop(name),
-                    state_dict_.pop(name.replace(".a_to_q.", ".a_to_k.")),
-                    state_dict_.pop(name.replace(".a_to_q.", ".a_to_v.")),
-                    mlp,
-                ], dim=0)
-                name_ = name.replace(".a_to_q.", ".to_qkv_mlp.")
-                state_dict_[name_] = param
-        for name in list(state_dict_.keys()):
-            for component in ["a", "b"]:
-                if f".{component}_to_q." in name:
-                    name_ = name.replace(f".{component}_to_q.", f".{component}_to_qkv.")
-                    param = torch.concat([
-                        state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_q.")],
-                        state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_k.")],
-                        state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_v.")],
-                    ], dim=0)
-                    state_dict_[name_] = param
-                    state_dict_.pop(name.replace(f".{component}_to_q.", f".{component}_to_q."))
-                    state_dict_.pop(name.replace(f".{component}_to_q.", f".{component}_to_k."))
-                    state_dict_.pop(name.replace(f".{component}_to_q.", f".{component}_to_v."))
-        return state_dict_
-
-    def from_civitai(self, state_dict):
-        if hash_state_dict_keys(state_dict, with_shape=True) in ["3e6c61b0f9471135fc9c6d6a98e98b6d", "63c969fd37cce769a90aa781fbff5f81"]:
-            dit_state_dict = {key.replace("pipe.dit.", ""): value for key, value in state_dict.items() if key.startswith('pipe.dit.')}
-            return dit_state_dict
-        rename_dict = {
-            "time_in.in_layer.bias": "time_embedder.timestep_embedder.0.bias",
-            "time_in.in_layer.weight": "time_embedder.timestep_embedder.0.weight",
-            "time_in.out_layer.bias": "time_embedder.timestep_embedder.2.bias",
-            "time_in.out_layer.weight": "time_embedder.timestep_embedder.2.weight",
-            "txt_in.bias": "context_embedder.bias",
-            "txt_in.weight": "context_embedder.weight",
-            "vector_in.in_layer.bias": "pooled_text_embedder.0.bias",
-            "vector_in.in_layer.weight": "pooled_text_embedder.0.weight",
-            "vector_in.out_layer.bias": "pooled_text_embedder.2.bias",
-            "vector_in.out_layer.weight": "pooled_text_embedder.2.weight",
-            "final_layer.linear.bias": "final_proj_out.bias",
-            "final_layer.linear.weight": "final_proj_out.weight",
-            "guidance_in.in_layer.bias": "guidance_embedder.timestep_embedder.0.bias",
-            "guidance_in.in_layer.weight": "guidance_embedder.timestep_embedder.0.weight",
-            "guidance_in.out_layer.bias": "guidance_embedder.timestep_embedder.2.bias",
-            "guidance_in.out_layer.weight": "guidance_embedder.timestep_embedder.2.weight",
-            "img_in.bias": "x_embedder.bias",
-            "img_in.weight": "x_embedder.weight",
-            "final_layer.adaLN_modulation.1.weight": "final_norm_out.linear.weight",
-            "final_layer.adaLN_modulation.1.bias": "final_norm_out.linear.bias",
-        }
-        suffix_rename_dict = {
-            "img_attn.norm.key_norm.scale": "attn.norm_k_a.weight",
-            "img_attn.norm.query_norm.scale": "attn.norm_q_a.weight",
-            "img_attn.proj.bias": "attn.a_to_out.bias",
-            "img_attn.proj.weight": "attn.a_to_out.weight",
-            "img_attn.qkv.bias": "attn.a_to_qkv.bias",
-            "img_attn.qkv.weight": "attn.a_to_qkv.weight",
-            "img_mlp.0.bias": "ff_a.0.bias",
-            "img_mlp.0.weight": "ff_a.0.weight",
-            "img_mlp.2.bias": "ff_a.2.bias",
-            "img_mlp.2.weight": "ff_a.2.weight",
-            "img_mod.lin.bias": "norm1_a.linear.bias",
-            "img_mod.lin.weight": "norm1_a.linear.weight",
-            "txt_attn.norm.key_norm.scale": "attn.norm_k_b.weight",
-            "txt_attn.norm.query_norm.scale": "attn.norm_q_b.weight",
-            "txt_attn.proj.bias": "attn.b_to_out.bias",
-            "txt_attn.proj.weight": "attn.b_to_out.weight",
-            "txt_attn.qkv.bias": "attn.b_to_qkv.bias",
-            "txt_attn.qkv.weight": "attn.b_to_qkv.weight",
-            "txt_mlp.0.bias": "ff_b.0.bias",
-            "txt_mlp.0.weight": "ff_b.0.weight",
-            "txt_mlp.2.bias": "ff_b.2.bias",
-            "txt_mlp.2.weight": "ff_b.2.weight",
-            "txt_mod.lin.bias": "norm1_b.linear.bias",
-            "txt_mod.lin.weight": "norm1_b.linear.weight",
-
-            "linear1.bias": "to_qkv_mlp.bias",
-            "linear1.weight": "to_qkv_mlp.weight",
-            "linear2.bias": "proj_out.bias",
-            "linear2.weight": "proj_out.weight",
-            "modulation.lin.bias": "norm.linear.bias",
-            "modulation.lin.weight": "norm.linear.weight",
-            "norm.key_norm.scale": "norm_k_a.weight",
-            "norm.query_norm.scale": "norm_q_a.weight",
-        }
-        state_dict_ = {}
-        for name, param in state_dict.items():
-            if name.startswith("model.diffusion_model."):
-                name = name[len("model.diffusion_model."):]
-            names = name.split(".")
-            if name in rename_dict:
-                rename = rename_dict[name]
-                if name.startswith("final_layer.adaLN_modulation.1."):
-                    param = torch.concat([param[3072:], param[:3072]], dim=0)
-                state_dict_[rename] = param
-            elif names[0] == "double_blocks":
-                rename = f"blocks.{names[1]}." + suffix_rename_dict[".".join(names[2:])]
-                state_dict_[rename] = param
-            elif names[0] == "single_blocks":
-                if ".".join(names[2:]) in suffix_rename_dict:
-                    rename = f"single_blocks.{names[1]}." + suffix_rename_dict[".".join(names[2:])]
-                    state_dict_[rename] = param
-            else:
-                pass
-        if "guidance_embedder.timestep_embedder.0.weight" not in state_dict_:
-            return state_dict_, {"disable_guidance_embedder": True}
-        elif "blocks.8.attn.norm_k_a.weight" not in state_dict_:
-            return state_dict_, {"input_dim": 196, "num_blocks": 8}
-        else:
-            return state_dict_
+        # Deprecated stub: this method no longer runs the model and returns None; the real forward is in `pipelines.flux_image`.
+        return None