mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-03-20 23:58:12 +00:00
DiffSynth-Studio 2.0 major update
This commit is contained in:
0
diffsynth/utils/state_dict_converters/__init__.py
Normal file
0
diffsynth/utils/state_dict_converters/__init__.py
Normal file
17
diffsynth/utils/state_dict_converters/flux2_text_encoder.py
Normal file
17
diffsynth/utils/state_dict_converters/flux2_text_encoder.py
Normal file
@@ -0,0 +1,17 @@
|
||||
def Flux2TextEncoderStateDictConverter(state_dict):
    """Map Flux2 text-encoder checkpoint keys onto this project's layout.

    Two prefix substitutions are applied to every key, then a handful of
    special keys get an exact rename; all parameters are kept.
    """
    rename_dict = {
        "multi_modal_projector.linear_1.weight": "model.multi_modal_projector.linear_1.weight",
        "multi_modal_projector.linear_2.weight": "model.multi_modal_projector.linear_2.weight",
        "multi_modal_projector.norm.weight": "model.multi_modal_projector.norm.weight",
        "multi_modal_projector.patch_merger.merging_layer.weight": "model.multi_modal_projector.patch_merger.merging_layer.weight",
        "language_model.lm_head.weight": "lm_head.weight",
    }
    converted = {}
    for original_key, param in state_dict.items():
        key = original_key.replace("language_model.model", "model.language_model")
        key = key.replace("vision_tower", "model.vision_tower")
        # Exact renames win over the generic prefix rewrites.
        converted[rename_dict.get(key, key)] = param
    return converted
|
||||
103
diffsynth/utils/state_dict_converters/flux_controlnet.py
Normal file
103
diffsynth/utils/state_dict_converters/flux_controlnet.py
Normal file
@@ -0,0 +1,103 @@
|
||||
import torch
|
||||
|
||||
|
||||
def FluxControlNetStateDictConverter(state_dict):
    """Convert a diffusers-style FLUX ControlNet state dict to this project's layout.

    Three stages:
      1. Rename parameters: top-level names via ``global_rename_dict``,
         double-stream block names via ``rename_dict``, single-stream block
         names via ``rename_dict_single``. Unrecognized double-block keys are
         dropped; unrecognized single-block and top-level keys pass through.
      2. In single blocks, fuse ``proj_in_besides_attn`` (ex ``proj_mlp``) with
         the separate q/k/v projections into one ``to_qkv_mlp`` tensor
         (concatenated along dim 0 in q, k, v, mlp order).
      3. Fuse any remaining separate q/k/v projections of the "a" (image) and
         "b" (text) streams into a single ``*_to_qkv`` tensor.

    Fixes over the previous revision: removed the no-op
    ``name.replace(".a_to_q.", ".a_to_q.")`` calls and the spurious f-string
    prefixes on placeholder-free literals; behavior is unchanged.
    """
    # Top-level (non-block) parameter names.
    global_rename_dict = {
        "context_embedder": "context_embedder",
        "x_embedder": "x_embedder",
        "time_text_embed.timestep_embedder.linear_1": "time_embedder.timestep_embedder.0",
        "time_text_embed.timestep_embedder.linear_2": "time_embedder.timestep_embedder.2",
        "time_text_embed.guidance_embedder.linear_1": "guidance_embedder.timestep_embedder.0",
        "time_text_embed.guidance_embedder.linear_2": "guidance_embedder.timestep_embedder.2",
        "time_text_embed.text_embedder.linear_1": "pooled_text_embedder.0",
        "time_text_embed.text_embedder.linear_2": "pooled_text_embedder.2",
        "norm_out.linear": "final_norm_out.linear",
        "proj_out": "final_proj_out",
    }
    # Per-parameter names inside "transformer_blocks.*" (double-stream blocks).
    rename_dict = {
        "proj_out": "proj_out",
        "norm1.linear": "norm1_a.linear",
        "norm1_context.linear": "norm1_b.linear",
        "attn.to_q": "attn.a_to_q",
        "attn.to_k": "attn.a_to_k",
        "attn.to_v": "attn.a_to_v",
        "attn.to_out.0": "attn.a_to_out",
        "attn.add_q_proj": "attn.b_to_q",
        "attn.add_k_proj": "attn.b_to_k",
        "attn.add_v_proj": "attn.b_to_v",
        "attn.to_add_out": "attn.b_to_out",
        "ff.net.0.proj": "ff_a.0",
        "ff.net.2": "ff_a.2",
        "ff_context.net.0.proj": "ff_b.0",
        "ff_context.net.2": "ff_b.2",
        "attn.norm_q": "attn.norm_q_a",
        "attn.norm_k": "attn.norm_k_a",
        "attn.norm_added_q": "attn.norm_q_b",
        "attn.norm_added_k": "attn.norm_k_b",
    }
    # Per-parameter names inside "single_transformer_blocks.*" (single-stream blocks).
    rename_dict_single = {
        "attn.to_q": "a_to_q",
        "attn.to_k": "a_to_k",
        "attn.to_v": "a_to_v",
        "attn.norm_q": "norm_q_a",
        "attn.norm_k": "norm_k_a",
        "norm.linear": "norm.linear",
        "proj_mlp": "proj_in_besides_attn",
        "proj_out": "proj_out",
    }
    state_dict_ = {}

    # Stage 1: rename.
    for name, param in state_dict.items():
        if name.endswith(".weight") or name.endswith(".bias"):
            suffix = ".weight" if name.endswith(".weight") else ".bias"
            prefix = name[:-len(suffix)]
            if prefix in global_rename_dict:
                state_dict_[global_rename_dict[prefix] + suffix] = param
            elif prefix.startswith("transformer_blocks."):
                names = prefix.split(".")
                names[0] = "blocks"
                middle = ".".join(names[2:])
                if middle in rename_dict:
                    # Unrecognized double-block parameters are intentionally dropped.
                    state_dict_[".".join(names[:2] + [rename_dict[middle], suffix[1:]])] = param
            elif prefix.startswith("single_transformer_blocks."):
                names = prefix.split(".")
                names[0] = "single_blocks"
                middle = ".".join(names[2:])
                if middle in rename_dict_single:
                    state_dict_[".".join(names[:2] + [rename_dict_single[middle], suffix[1:]])] = param
                else:
                    state_dict_[name] = param
            else:
                state_dict_[name] = param
        else:
            state_dict_[name] = param

    # Stage 2: fuse single-block q/k/v + mlp projections into one to_qkv_mlp tensor.
    for name in list(state_dict_.keys()):
        if ".proj_in_besides_attn." in name:
            fused_name = name.replace(".proj_in_besides_attn.", ".to_qkv_mlp.")
            qkv_names = [name.replace(".proj_in_besides_attn.", ".a_to_%s." % x) for x in "qkv"]
            state_dict_[fused_name] = torch.concat(
                [state_dict_[qkv_name] for qkv_name in qkv_names] + [state_dict_[name]],
                dim=0,
            )
            for qkv_name in qkv_names:
                state_dict_.pop(qkv_name)
            state_dict_.pop(name)

    # Stage 3: fuse the remaining separate q/k/v projections per stream.
    for name in list(state_dict_.keys()):
        for component in ["a", "b"]:
            q_tag = f".{component}_to_q."
            if q_tag in name:
                k_name = name.replace(q_tag, f".{component}_to_k.")
                v_name = name.replace(q_tag, f".{component}_to_v.")
                fused_name = name.replace(q_tag, f".{component}_to_qkv.")
                state_dict_[fused_name] = torch.concat(
                    [state_dict_[name], state_dict_[k_name], state_dict_[v_name]],
                    dim=0,
                )
                state_dict_.pop(name)
                state_dict_.pop(k_name)
                state_dict_.pop(v_name)

    return state_dict_
|
||||
92
diffsynth/utils/state_dict_converters/flux_dit.py
Normal file
92
diffsynth/utils/state_dict_converters/flux_dit.py
Normal file
@@ -0,0 +1,92 @@
|
||||
import torch
|
||||
|
||||
|
||||
def FluxDiTStateDictConverter(state_dict):
    """Convert a FLUX DiT checkpoint to this project's parameter names.

    Handles two input layouts:
      * NexusGen checkpoints whose keys are prefixed with ``pipe.dit.`` —
        the prefix is stripped and the ``final_norm_out.linear`` parameter's
        two 3072-row halves are swapped;
      * original BFL-style checkpoints (optionally prefixed with
        ``model.diffusion_model.``) — keys are renamed via the tables below.
    Keys that match neither table are dropped.
    """
    # NexusGen detection: any key under the "pipe.dit." namespace.
    is_nexus_gen = sum([key.startswith("pipe.dit.") for key in state_dict]) > 0
    if is_nexus_gen:
        dit_state_dict = {}
        for key in state_dict:
            if key.startswith('pipe.dit.'):
                param = state_dict[key]
                new_key = key.replace("pipe.dit.", "")
                if new_key.startswith("final_norm_out.linear."):
                    # Swap the two 3072-row halves — presumably the shift/scale
                    # ordering of the adaLN projection differs between the two
                    # layouts. TODO(review): confirm against the model definition.
                    param = torch.concat([param[3072:], param[:3072]], dim=0)
                dit_state_dict[new_key] = param
        return dit_state_dict

    # Exact renames for top-level (non-block) parameters.
    rename_dict = {
        "time_in.in_layer.bias": "time_embedder.timestep_embedder.0.bias",
        "time_in.in_layer.weight": "time_embedder.timestep_embedder.0.weight",
        "time_in.out_layer.bias": "time_embedder.timestep_embedder.2.bias",
        "time_in.out_layer.weight": "time_embedder.timestep_embedder.2.weight",
        "txt_in.bias": "context_embedder.bias",
        "txt_in.weight": "context_embedder.weight",
        "vector_in.in_layer.bias": "pooled_text_embedder.0.bias",
        "vector_in.in_layer.weight": "pooled_text_embedder.0.weight",
        "vector_in.out_layer.bias": "pooled_text_embedder.2.bias",
        "vector_in.out_layer.weight": "pooled_text_embedder.2.weight",
        "final_layer.linear.bias": "final_proj_out.bias",
        "final_layer.linear.weight": "final_proj_out.weight",
        "guidance_in.in_layer.bias": "guidance_embedder.timestep_embedder.0.bias",
        "guidance_in.in_layer.weight": "guidance_embedder.timestep_embedder.0.weight",
        "guidance_in.out_layer.bias": "guidance_embedder.timestep_embedder.2.bias",
        "guidance_in.out_layer.weight": "guidance_embedder.timestep_embedder.2.weight",
        "img_in.bias": "x_embedder.bias",
        "img_in.weight": "x_embedder.weight",
        "final_layer.adaLN_modulation.1.weight": "final_norm_out.linear.weight",
        "final_layer.adaLN_modulation.1.bias": "final_norm_out.linear.bias",
    }
    # Per-block suffix renames; the first group applies to double-stream
    # ("double_blocks") parameters, the last group to single-stream
    # ("single_blocks") parameters.
    suffix_rename_dict = {
        "img_attn.norm.key_norm.scale": "attn.norm_k_a.weight",
        "img_attn.norm.query_norm.scale": "attn.norm_q_a.weight",
        "img_attn.proj.bias": "attn.a_to_out.bias",
        "img_attn.proj.weight": "attn.a_to_out.weight",
        "img_attn.qkv.bias": "attn.a_to_qkv.bias",
        "img_attn.qkv.weight": "attn.a_to_qkv.weight",
        "img_mlp.0.bias": "ff_a.0.bias",
        "img_mlp.0.weight": "ff_a.0.weight",
        "img_mlp.2.bias": "ff_a.2.bias",
        "img_mlp.2.weight": "ff_a.2.weight",
        "img_mod.lin.bias": "norm1_a.linear.bias",
        "img_mod.lin.weight": "norm1_a.linear.weight",
        "txt_attn.norm.key_norm.scale": "attn.norm_k_b.weight",
        "txt_attn.norm.query_norm.scale": "attn.norm_q_b.weight",
        "txt_attn.proj.bias": "attn.b_to_out.bias",
        "txt_attn.proj.weight": "attn.b_to_out.weight",
        "txt_attn.qkv.bias": "attn.b_to_qkv.bias",
        "txt_attn.qkv.weight": "attn.b_to_qkv.weight",
        "txt_mlp.0.bias": "ff_b.0.bias",
        "txt_mlp.0.weight": "ff_b.0.weight",
        "txt_mlp.2.bias": "ff_b.2.bias",
        "txt_mlp.2.weight": "ff_b.2.weight",
        "txt_mod.lin.bias": "norm1_b.linear.bias",
        "txt_mod.lin.weight": "norm1_b.linear.weight",

        "linear1.bias": "to_qkv_mlp.bias",
        "linear1.weight": "to_qkv_mlp.weight",
        "linear2.bias": "proj_out.bias",
        "linear2.weight": "proj_out.weight",
        "modulation.lin.bias": "norm.linear.bias",
        "modulation.lin.weight": "norm.linear.weight",
        "norm.key_norm.scale": "norm_k_a.weight",
        "norm.query_norm.scale": "norm_q_a.weight",
    }
    state_dict_ = {}
    for name in state_dict:
        original_name = name
        # Some checkpoints wrap everything in a "model.diffusion_model." namespace.
        if name.startswith("model.diffusion_model."):
            name = name[len("model.diffusion_model."):]
        names = name.split(".")
        if name in rename_dict:
            rename = rename_dict[name]
            state_dict_[rename] = state_dict[original_name]
        elif names[0] == "double_blocks":
            # names[1] is the block index. NOTE(review): unlike the
            # single-block branch below, an unknown suffix here raises
            # KeyError rather than being skipped.
            rename = f"blocks.{names[1]}." + suffix_rename_dict[".".join(names[2:])]
            state_dict_[rename] = state_dict[original_name]
        elif names[0] == "single_blocks":
            if ".".join(names[2:]) in suffix_rename_dict:
                rename = f"single_blocks.{names[1]}." + suffix_rename_dict[".".join(names[2:])]
                state_dict_[rename] = state_dict[original_name]
        else:
            # Unrecognized keys are silently dropped.
            pass
    return state_dict_
|
||||
@@ -0,0 +1,2 @@
|
||||
def FluxInfiniteYouImageProjectorStateDictConverter(state_dict):
    """Unwrap the checkpoint: the projector's weights live under ``image_proj``."""
    return state_dict['image_proj']
|
||||
32
diffsynth/utils/state_dict_converters/flux_ipadapter.py
Normal file
32
diffsynth/utils/state_dict_converters/flux_ipadapter.py
Normal file
@@ -0,0 +1,32 @@
|
||||
def FluxIpAdapterStateDictConverter(state_dict):
    """Convert a FLUX IP-Adapter checkpoint to this project's naming scheme.

    Supports two checkpoint layouts:
      * nested — top-level ``"ip_adapter"`` / ``"image_proj"`` sub-dicts;
      * flat — keys already prefixed with ``"ip_adapter."`` / ``"image_proj."``.
    In both cases ``ip_adapter`` parameters are re-prefixed to
    ``ipadapter_modules`` and ``image_proj`` parameters keep (or gain) the
    ``image_proj.`` prefix. Unrecognized flat keys are dropped.

    Fix: the flat-layout loop was previously unreachable — the function
    returned unconditionally after the nested-layout handling, so flat
    checkpoints produced an empty dict.
    """
    state_dict_ = {}

    # Nested layout: sub-dicts under "ip_adapter" / "image_proj".
    is_nested = False
    if "ip_adapter" in state_dict and isinstance(state_dict["ip_adapter"], dict):
        is_nested = True
        for name, param in state_dict["ip_adapter"].items():
            state_dict_['ipadapter_modules.' + name] = param
    if "image_proj" in state_dict and isinstance(state_dict["image_proj"], dict):
        is_nested = True
        for name, param in state_dict["image_proj"].items():
            state_dict_["image_proj." + name] = param
    if is_nested:
        return state_dict_

    # Flat layout: prefixed keys in a single-level dict.
    for key, value in state_dict.items():
        if key.startswith("image_proj."):
            state_dict_[key] = value
        elif key.startswith("ip_adapter."):
            state_dict_[key.replace("ip_adapter.", "ipadapter_modules.")] = value

    return state_dict_
|
||||
|
||||
|
||||
def SiglipStateDictConverter(state_dict):
    """Keep only parameters under the ``vision_model.`` prefix, names unchanged."""
    return {key: state_dict[key] for key in state_dict if key.startswith("vision_model.")}
|
||||
@@ -0,0 +1,31 @@
|
||||
def FluxTextEncoderClipStateDictConverter(state_dict):
    """Convert a CLIP text-encoder checkpoint to this project's naming scheme.

    Top-level embedding/norm parameters get exact renames (the position
    embedding also gains a leading singleton dimension); encoder-layer
    parameters are remapped per sub-module. Everything else is dropped.
    """
    direct_renames = {
        "text_model.embeddings.token_embedding.weight": "token_embedding.weight",
        "text_model.embeddings.position_embedding.weight": "position_embeds",
        "text_model.final_layer_norm.weight": "final_layer_norm.weight",
        "text_model.final_layer_norm.bias": "final_layer_norm.bias",
    }
    layer_renames = {
        "self_attn.q_proj": "attn.to_q",
        "self_attn.k_proj": "attn.to_k",
        "self_attn.v_proj": "attn.to_v",
        "self_attn.out_proj": "attn.to_out",
        "layer_norm1": "layer_norm1",
        "layer_norm2": "layer_norm2",
        "mlp.fc1": "fc1",
        "mlp.fc2": "fc2",
    }
    layer_prefix = "text_model.encoder.layers."
    converted = {}
    for name, param in state_dict.items():
        if name in direct_renames:
            if name == "text_model.embeddings.position_embedding.weight":
                # (num_positions, dim) -> (1, num_positions, dim).
                param = param.reshape((1, param.shape[0], param.shape[1]))
            converted[direct_renames[name]] = param
        elif name.startswith(layer_prefix):
            pieces = name.split(".")
            layer_id = pieces[3]
            module = ".".join(pieces[4:-1])
            converted[".".join(["encoders", layer_id, layer_renames[module], pieces[-1]])] = param
    return converted
|
||||
@@ -0,0 +1,4 @@
|
||||
def FluxTextEncoderT5StateDictConverter(state_dict):
    """Copy the state dict, aliasing the shared token embedding to the encoder slot."""
    converted = dict(state_dict)
    converted["encoder.embed_tokens.weight"] = state_dict["shared.weight"]
    return converted
|
||||
382
diffsynth/utils/state_dict_converters/flux_vae.py
Normal file
382
diffsynth/utils/state_dict_converters/flux_vae.py
Normal file
@@ -0,0 +1,382 @@
|
||||
def FluxVAEEncoderStateDictConverter(state_dict):
    """Convert a CompVis-style FLUX VAE encoder state dict to this project's
    flat ``blocks.N`` naming scheme.

    Only keys listed in ``rename_dict`` are kept; anything else (e.g. decoder
    weights in a combined checkpoint) is silently dropped.
    """
    # Exhaustive one-to-one key mapping. Per the mapping below, the down-path
    # resnets/downsamplers become blocks 0-10 and the mid
    # resnet/attention/resnet become blocks 11/12/13.
    rename_dict = {
        "encoder.conv_in.bias": "conv_in.bias",
        "encoder.conv_in.weight": "conv_in.weight",
        "encoder.conv_out.bias": "conv_out.bias",
        "encoder.conv_out.weight": "conv_out.weight",
        "encoder.down.0.block.0.conv1.bias": "blocks.0.conv1.bias",
        "encoder.down.0.block.0.conv1.weight": "blocks.0.conv1.weight",
        "encoder.down.0.block.0.conv2.bias": "blocks.0.conv2.bias",
        "encoder.down.0.block.0.conv2.weight": "blocks.0.conv2.weight",
        "encoder.down.0.block.0.norm1.bias": "blocks.0.norm1.bias",
        "encoder.down.0.block.0.norm1.weight": "blocks.0.norm1.weight",
        "encoder.down.0.block.0.norm2.bias": "blocks.0.norm2.bias",
        "encoder.down.0.block.0.norm2.weight": "blocks.0.norm2.weight",
        "encoder.down.0.block.1.conv1.bias": "blocks.1.conv1.bias",
        "encoder.down.0.block.1.conv1.weight": "blocks.1.conv1.weight",
        "encoder.down.0.block.1.conv2.bias": "blocks.1.conv2.bias",
        "encoder.down.0.block.1.conv2.weight": "blocks.1.conv2.weight",
        "encoder.down.0.block.1.norm1.bias": "blocks.1.norm1.bias",
        "encoder.down.0.block.1.norm1.weight": "blocks.1.norm1.weight",
        "encoder.down.0.block.1.norm2.bias": "blocks.1.norm2.bias",
        "encoder.down.0.block.1.norm2.weight": "blocks.1.norm2.weight",
        "encoder.down.0.downsample.conv.bias": "blocks.2.conv.bias",
        "encoder.down.0.downsample.conv.weight": "blocks.2.conv.weight",
        "encoder.down.1.block.0.conv1.bias": "blocks.3.conv1.bias",
        "encoder.down.1.block.0.conv1.weight": "blocks.3.conv1.weight",
        "encoder.down.1.block.0.conv2.bias": "blocks.3.conv2.bias",
        "encoder.down.1.block.0.conv2.weight": "blocks.3.conv2.weight",
        "encoder.down.1.block.0.nin_shortcut.bias": "blocks.3.conv_shortcut.bias",
        "encoder.down.1.block.0.nin_shortcut.weight": "blocks.3.conv_shortcut.weight",
        "encoder.down.1.block.0.norm1.bias": "blocks.3.norm1.bias",
        "encoder.down.1.block.0.norm1.weight": "blocks.3.norm1.weight",
        "encoder.down.1.block.0.norm2.bias": "blocks.3.norm2.bias",
        "encoder.down.1.block.0.norm2.weight": "blocks.3.norm2.weight",
        "encoder.down.1.block.1.conv1.bias": "blocks.4.conv1.bias",
        "encoder.down.1.block.1.conv1.weight": "blocks.4.conv1.weight",
        "encoder.down.1.block.1.conv2.bias": "blocks.4.conv2.bias",
        "encoder.down.1.block.1.conv2.weight": "blocks.4.conv2.weight",
        "encoder.down.1.block.1.norm1.bias": "blocks.4.norm1.bias",
        "encoder.down.1.block.1.norm1.weight": "blocks.4.norm1.weight",
        "encoder.down.1.block.1.norm2.bias": "blocks.4.norm2.bias",
        "encoder.down.1.block.1.norm2.weight": "blocks.4.norm2.weight",
        "encoder.down.1.downsample.conv.bias": "blocks.5.conv.bias",
        "encoder.down.1.downsample.conv.weight": "blocks.5.conv.weight",
        "encoder.down.2.block.0.conv1.bias": "blocks.6.conv1.bias",
        "encoder.down.2.block.0.conv1.weight": "blocks.6.conv1.weight",
        "encoder.down.2.block.0.conv2.bias": "blocks.6.conv2.bias",
        "encoder.down.2.block.0.conv2.weight": "blocks.6.conv2.weight",
        "encoder.down.2.block.0.nin_shortcut.bias": "blocks.6.conv_shortcut.bias",
        "encoder.down.2.block.0.nin_shortcut.weight": "blocks.6.conv_shortcut.weight",
        "encoder.down.2.block.0.norm1.bias": "blocks.6.norm1.bias",
        "encoder.down.2.block.0.norm1.weight": "blocks.6.norm1.weight",
        "encoder.down.2.block.0.norm2.bias": "blocks.6.norm2.bias",
        "encoder.down.2.block.0.norm2.weight": "blocks.6.norm2.weight",
        "encoder.down.2.block.1.conv1.bias": "blocks.7.conv1.bias",
        "encoder.down.2.block.1.conv1.weight": "blocks.7.conv1.weight",
        "encoder.down.2.block.1.conv2.bias": "blocks.7.conv2.bias",
        "encoder.down.2.block.1.conv2.weight": "blocks.7.conv2.weight",
        "encoder.down.2.block.1.norm1.bias": "blocks.7.norm1.bias",
        "encoder.down.2.block.1.norm1.weight": "blocks.7.norm1.weight",
        "encoder.down.2.block.1.norm2.bias": "blocks.7.norm2.bias",
        "encoder.down.2.block.1.norm2.weight": "blocks.7.norm2.weight",
        "encoder.down.2.downsample.conv.bias": "blocks.8.conv.bias",
        "encoder.down.2.downsample.conv.weight": "blocks.8.conv.weight",
        "encoder.down.3.block.0.conv1.bias": "blocks.9.conv1.bias",
        "encoder.down.3.block.0.conv1.weight": "blocks.9.conv1.weight",
        "encoder.down.3.block.0.conv2.bias": "blocks.9.conv2.bias",
        "encoder.down.3.block.0.conv2.weight": "blocks.9.conv2.weight",
        "encoder.down.3.block.0.norm1.bias": "blocks.9.norm1.bias",
        "encoder.down.3.block.0.norm1.weight": "blocks.9.norm1.weight",
        "encoder.down.3.block.0.norm2.bias": "blocks.9.norm2.bias",
        "encoder.down.3.block.0.norm2.weight": "blocks.9.norm2.weight",
        "encoder.down.3.block.1.conv1.bias": "blocks.10.conv1.bias",
        "encoder.down.3.block.1.conv1.weight": "blocks.10.conv1.weight",
        "encoder.down.3.block.1.conv2.bias": "blocks.10.conv2.bias",
        "encoder.down.3.block.1.conv2.weight": "blocks.10.conv2.weight",
        "encoder.down.3.block.1.norm1.bias": "blocks.10.norm1.bias",
        "encoder.down.3.block.1.norm1.weight": "blocks.10.norm1.weight",
        "encoder.down.3.block.1.norm2.bias": "blocks.10.norm2.bias",
        "encoder.down.3.block.1.norm2.weight": "blocks.10.norm2.weight",
        "encoder.mid.attn_1.k.bias": "blocks.12.transformer_blocks.0.to_k.bias",
        "encoder.mid.attn_1.k.weight": "blocks.12.transformer_blocks.0.to_k.weight",
        "encoder.mid.attn_1.norm.bias": "blocks.12.norm.bias",
        "encoder.mid.attn_1.norm.weight": "blocks.12.norm.weight",
        "encoder.mid.attn_1.proj_out.bias": "blocks.12.transformer_blocks.0.to_out.bias",
        "encoder.mid.attn_1.proj_out.weight": "blocks.12.transformer_blocks.0.to_out.weight",
        "encoder.mid.attn_1.q.bias": "blocks.12.transformer_blocks.0.to_q.bias",
        "encoder.mid.attn_1.q.weight": "blocks.12.transformer_blocks.0.to_q.weight",
        "encoder.mid.attn_1.v.bias": "blocks.12.transformer_blocks.0.to_v.bias",
        "encoder.mid.attn_1.v.weight": "blocks.12.transformer_blocks.0.to_v.weight",
        "encoder.mid.block_1.conv1.bias": "blocks.11.conv1.bias",
        "encoder.mid.block_1.conv1.weight": "blocks.11.conv1.weight",
        "encoder.mid.block_1.conv2.bias": "blocks.11.conv2.bias",
        "encoder.mid.block_1.conv2.weight": "blocks.11.conv2.weight",
        "encoder.mid.block_1.norm1.bias": "blocks.11.norm1.bias",
        "encoder.mid.block_1.norm1.weight": "blocks.11.norm1.weight",
        "encoder.mid.block_1.norm2.bias": "blocks.11.norm2.bias",
        "encoder.mid.block_1.norm2.weight": "blocks.11.norm2.weight",
        "encoder.mid.block_2.conv1.bias": "blocks.13.conv1.bias",
        "encoder.mid.block_2.conv1.weight": "blocks.13.conv1.weight",
        "encoder.mid.block_2.conv2.bias": "blocks.13.conv2.bias",
        "encoder.mid.block_2.conv2.weight": "blocks.13.conv2.weight",
        "encoder.mid.block_2.norm1.bias": "blocks.13.norm1.bias",
        "encoder.mid.block_2.norm1.weight": "blocks.13.norm1.weight",
        "encoder.mid.block_2.norm2.bias": "blocks.13.norm2.bias",
        "encoder.mid.block_2.norm2.weight": "blocks.13.norm2.weight",
        "encoder.norm_out.bias": "conv_norm_out.bias",
        "encoder.norm_out.weight": "conv_norm_out.weight",
    }
    state_dict_ = {}
    for name in state_dict:
        # Keys absent from the table are intentionally dropped.
        if name in rename_dict:
            param = state_dict[name]
            state_dict_[rename_dict[name]] = param
    return state_dict_
|
||||
|
||||
|
||||
def FluxVAEDecoderStateDictConverter(state_dict):
    """Convert a CompVis-style FLUX VAE decoder state dict to this project's
    flat ``blocks.N`` naming scheme.

    Only keys listed in ``rename_dict`` are kept; anything else (e.g. encoder
    weights in a combined checkpoint) is silently dropped.
    """
    # Exhaustive one-to-one key mapping. Per the mapping below, the mid
    # resnet/attention/resnet become blocks 0/1/2 and the up-path blocks
    # become blocks 3-17 (note "up" indices map in reverse: up.3 -> blocks
    # 3-6, ..., up.0 -> blocks 15-17).
    rename_dict = {
        "decoder.conv_in.bias": "conv_in.bias",
        "decoder.conv_in.weight": "conv_in.weight",
        "decoder.conv_out.bias": "conv_out.bias",
        "decoder.conv_out.weight": "conv_out.weight",
        "decoder.mid.attn_1.k.bias": "blocks.1.transformer_blocks.0.to_k.bias",
        "decoder.mid.attn_1.k.weight": "blocks.1.transformer_blocks.0.to_k.weight",
        "decoder.mid.attn_1.norm.bias": "blocks.1.norm.bias",
        "decoder.mid.attn_1.norm.weight": "blocks.1.norm.weight",
        "decoder.mid.attn_1.proj_out.bias": "blocks.1.transformer_blocks.0.to_out.bias",
        "decoder.mid.attn_1.proj_out.weight": "blocks.1.transformer_blocks.0.to_out.weight",
        "decoder.mid.attn_1.q.bias": "blocks.1.transformer_blocks.0.to_q.bias",
        "decoder.mid.attn_1.q.weight": "blocks.1.transformer_blocks.0.to_q.weight",
        "decoder.mid.attn_1.v.bias": "blocks.1.transformer_blocks.0.to_v.bias",
        "decoder.mid.attn_1.v.weight": "blocks.1.transformer_blocks.0.to_v.weight",
        "decoder.mid.block_1.conv1.bias": "blocks.0.conv1.bias",
        "decoder.mid.block_1.conv1.weight": "blocks.0.conv1.weight",
        "decoder.mid.block_1.conv2.bias": "blocks.0.conv2.bias",
        "decoder.mid.block_1.conv2.weight": "blocks.0.conv2.weight",
        "decoder.mid.block_1.norm1.bias": "blocks.0.norm1.bias",
        "decoder.mid.block_1.norm1.weight": "blocks.0.norm1.weight",
        "decoder.mid.block_1.norm2.bias": "blocks.0.norm2.bias",
        "decoder.mid.block_1.norm2.weight": "blocks.0.norm2.weight",
        "decoder.mid.block_2.conv1.bias": "blocks.2.conv1.bias",
        "decoder.mid.block_2.conv1.weight": "blocks.2.conv1.weight",
        "decoder.mid.block_2.conv2.bias": "blocks.2.conv2.bias",
        "decoder.mid.block_2.conv2.weight": "blocks.2.conv2.weight",
        "decoder.mid.block_2.norm1.bias": "blocks.2.norm1.bias",
        "decoder.mid.block_2.norm1.weight": "blocks.2.norm1.weight",
        "decoder.mid.block_2.norm2.bias": "blocks.2.norm2.bias",
        "decoder.mid.block_2.norm2.weight": "blocks.2.norm2.weight",
        "decoder.norm_out.bias": "conv_norm_out.bias",
        "decoder.norm_out.weight": "conv_norm_out.weight",
        "decoder.up.0.block.0.conv1.bias": "blocks.15.conv1.bias",
        "decoder.up.0.block.0.conv1.weight": "blocks.15.conv1.weight",
        "decoder.up.0.block.0.conv2.bias": "blocks.15.conv2.bias",
        "decoder.up.0.block.0.conv2.weight": "blocks.15.conv2.weight",
        "decoder.up.0.block.0.nin_shortcut.bias": "blocks.15.conv_shortcut.bias",
        "decoder.up.0.block.0.nin_shortcut.weight": "blocks.15.conv_shortcut.weight",
        "decoder.up.0.block.0.norm1.bias": "blocks.15.norm1.bias",
        "decoder.up.0.block.0.norm1.weight": "blocks.15.norm1.weight",
        "decoder.up.0.block.0.norm2.bias": "blocks.15.norm2.bias",
        "decoder.up.0.block.0.norm2.weight": "blocks.15.norm2.weight",
        "decoder.up.0.block.1.conv1.bias": "blocks.16.conv1.bias",
        "decoder.up.0.block.1.conv1.weight": "blocks.16.conv1.weight",
        "decoder.up.0.block.1.conv2.bias": "blocks.16.conv2.bias",
        "decoder.up.0.block.1.conv2.weight": "blocks.16.conv2.weight",
        "decoder.up.0.block.1.norm1.bias": "blocks.16.norm1.bias",
        "decoder.up.0.block.1.norm1.weight": "blocks.16.norm1.weight",
        "decoder.up.0.block.1.norm2.bias": "blocks.16.norm2.bias",
        "decoder.up.0.block.1.norm2.weight": "blocks.16.norm2.weight",
        "decoder.up.0.block.2.conv1.bias": "blocks.17.conv1.bias",
        "decoder.up.0.block.2.conv1.weight": "blocks.17.conv1.weight",
        "decoder.up.0.block.2.conv2.bias": "blocks.17.conv2.bias",
        "decoder.up.0.block.2.conv2.weight": "blocks.17.conv2.weight",
        "decoder.up.0.block.2.norm1.bias": "blocks.17.norm1.bias",
        "decoder.up.0.block.2.norm1.weight": "blocks.17.norm1.weight",
        "decoder.up.0.block.2.norm2.bias": "blocks.17.norm2.bias",
        "decoder.up.0.block.2.norm2.weight": "blocks.17.norm2.weight",
        "decoder.up.1.block.0.conv1.bias": "blocks.11.conv1.bias",
        "decoder.up.1.block.0.conv1.weight": "blocks.11.conv1.weight",
        "decoder.up.1.block.0.conv2.bias": "blocks.11.conv2.bias",
        "decoder.up.1.block.0.conv2.weight": "blocks.11.conv2.weight",
        "decoder.up.1.block.0.nin_shortcut.bias": "blocks.11.conv_shortcut.bias",
        "decoder.up.1.block.0.nin_shortcut.weight": "blocks.11.conv_shortcut.weight",
        "decoder.up.1.block.0.norm1.bias": "blocks.11.norm1.bias",
        "decoder.up.1.block.0.norm1.weight": "blocks.11.norm1.weight",
        "decoder.up.1.block.0.norm2.bias": "blocks.11.norm2.bias",
        "decoder.up.1.block.0.norm2.weight": "blocks.11.norm2.weight",
        "decoder.up.1.block.1.conv1.bias": "blocks.12.conv1.bias",
        "decoder.up.1.block.1.conv1.weight": "blocks.12.conv1.weight",
        "decoder.up.1.block.1.conv2.bias": "blocks.12.conv2.bias",
        "decoder.up.1.block.1.conv2.weight": "blocks.12.conv2.weight",
        "decoder.up.1.block.1.norm1.bias": "blocks.12.norm1.bias",
        "decoder.up.1.block.1.norm1.weight": "blocks.12.norm1.weight",
        "decoder.up.1.block.1.norm2.bias": "blocks.12.norm2.bias",
        "decoder.up.1.block.1.norm2.weight": "blocks.12.norm2.weight",
        "decoder.up.1.block.2.conv1.bias": "blocks.13.conv1.bias",
        "decoder.up.1.block.2.conv1.weight": "blocks.13.conv1.weight",
        "decoder.up.1.block.2.conv2.bias": "blocks.13.conv2.bias",
        "decoder.up.1.block.2.conv2.weight": "blocks.13.conv2.weight",
        "decoder.up.1.block.2.norm1.bias": "blocks.13.norm1.bias",
        "decoder.up.1.block.2.norm1.weight": "blocks.13.norm1.weight",
        "decoder.up.1.block.2.norm2.bias": "blocks.13.norm2.bias",
        "decoder.up.1.block.2.norm2.weight": "blocks.13.norm2.weight",
        "decoder.up.1.upsample.conv.bias": "blocks.14.conv.bias",
        "decoder.up.1.upsample.conv.weight": "blocks.14.conv.weight",
        "decoder.up.2.block.0.conv1.bias": "blocks.7.conv1.bias",
        "decoder.up.2.block.0.conv1.weight": "blocks.7.conv1.weight",
        "decoder.up.2.block.0.conv2.bias": "blocks.7.conv2.bias",
        "decoder.up.2.block.0.conv2.weight": "blocks.7.conv2.weight",
        "decoder.up.2.block.0.norm1.bias": "blocks.7.norm1.bias",
        "decoder.up.2.block.0.norm1.weight": "blocks.7.norm1.weight",
        "decoder.up.2.block.0.norm2.bias": "blocks.7.norm2.bias",
        "decoder.up.2.block.0.norm2.weight": "blocks.7.norm2.weight",
        "decoder.up.2.block.1.conv1.bias": "blocks.8.conv1.bias",
        "decoder.up.2.block.1.conv1.weight": "blocks.8.conv1.weight",
        "decoder.up.2.block.1.conv2.bias": "blocks.8.conv2.bias",
        "decoder.up.2.block.1.conv2.weight": "blocks.8.conv2.weight",
        "decoder.up.2.block.1.norm1.bias": "blocks.8.norm1.bias",
        "decoder.up.2.block.1.norm1.weight": "blocks.8.norm1.weight",
        "decoder.up.2.block.1.norm2.bias": "blocks.8.norm2.bias",
        "decoder.up.2.block.1.norm2.weight": "blocks.8.norm2.weight",
        "decoder.up.2.block.2.conv1.bias": "blocks.9.conv1.bias",
        "decoder.up.2.block.2.conv1.weight": "blocks.9.conv1.weight",
        "decoder.up.2.block.2.conv2.bias": "blocks.9.conv2.bias",
        "decoder.up.2.block.2.conv2.weight": "blocks.9.conv2.weight",
        "decoder.up.2.block.2.norm1.bias": "blocks.9.norm1.bias",
        "decoder.up.2.block.2.norm1.weight": "blocks.9.norm1.weight",
        "decoder.up.2.block.2.norm2.bias": "blocks.9.norm2.bias",
        "decoder.up.2.block.2.norm2.weight": "blocks.9.norm2.weight",
        "decoder.up.2.upsample.conv.bias": "blocks.10.conv.bias",
        "decoder.up.2.upsample.conv.weight": "blocks.10.conv.weight",
        "decoder.up.3.block.0.conv1.bias": "blocks.3.conv1.bias",
        "decoder.up.3.block.0.conv1.weight": "blocks.3.conv1.weight",
        "decoder.up.3.block.0.conv2.bias": "blocks.3.conv2.bias",
        "decoder.up.3.block.0.conv2.weight": "blocks.3.conv2.weight",
        "decoder.up.3.block.0.norm1.bias": "blocks.3.norm1.bias",
        "decoder.up.3.block.0.norm1.weight": "blocks.3.norm1.weight",
        "decoder.up.3.block.0.norm2.bias": "blocks.3.norm2.bias",
        "decoder.up.3.block.0.norm2.weight": "blocks.3.norm2.weight",
        "decoder.up.3.block.1.conv1.bias": "blocks.4.conv1.bias",
        "decoder.up.3.block.1.conv1.weight": "blocks.4.conv1.weight",
        "decoder.up.3.block.1.conv2.bias": "blocks.4.conv2.bias",
        "decoder.up.3.block.1.conv2.weight": "blocks.4.conv2.weight",
        "decoder.up.3.block.1.norm1.bias": "blocks.4.norm1.bias",
        "decoder.up.3.block.1.norm1.weight": "blocks.4.norm1.weight",
        "decoder.up.3.block.1.norm2.bias": "blocks.4.norm2.bias",
        "decoder.up.3.block.1.norm2.weight": "blocks.4.norm2.weight",
        "decoder.up.3.block.2.conv1.bias": "blocks.5.conv1.bias",
        "decoder.up.3.block.2.conv1.weight": "blocks.5.conv1.weight",
        "decoder.up.3.block.2.conv2.bias": "blocks.5.conv2.bias",
        "decoder.up.3.block.2.conv2.weight": "blocks.5.conv2.weight",
        "decoder.up.3.block.2.norm1.bias": "blocks.5.norm1.bias",
        "decoder.up.3.block.2.norm1.weight": "blocks.5.norm1.weight",
        "decoder.up.3.block.2.norm2.bias": "blocks.5.norm2.bias",
        "decoder.up.3.block.2.norm2.weight": "blocks.5.norm2.weight",
        "decoder.up.3.upsample.conv.bias": "blocks.6.conv.bias",
        "decoder.up.3.upsample.conv.weight": "blocks.6.conv.weight",
    }
    state_dict_ = {}
    for name in state_dict:
        # Keys absent from the table are intentionally dropped.
        if name in rename_dict:
            param = state_dict[name]
            state_dict_[rename_dict[name]] = param
    return state_dict_
|
||||
|
||||
|
||||
def FluxVAEEncoderStateDictConverterDiffusers(state_dict):
    """Convert a diffusers-format FLUX VAE encoder state dict to this repo's layout.

    Mid-block/stem parameters are renamed through a fixed lookup table, while
    ``encoder.down_blocks`` parameters are renumbered into the flat ``blocks.N``
    sequence described by ``block_types``. Keys matching neither scheme are dropped.
    """
    # Flattened target architecture: the i-th entry is the class of blocks[i].
    block_types = [
        'ResnetBlock', 'ResnetBlock', 'DownSampler',
        'ResnetBlock', 'ResnetBlock', 'DownSampler',
        'ResnetBlock', 'ResnetBlock', 'DownSampler',
        'ResnetBlock', 'ResnetBlock',
        'ResnetBlock', 'VAEAttentionBlock', 'ResnetBlock'
    ]
    # Fixed renames for everything outside encoder.down_blocks (keyed by the
    # parameter name minus its final ".weight"/".bias" component).
    fixed_renames = {
        "quant_conv": "quant_conv",
        "encoder.conv_in": "conv_in",
        "encoder.mid_block.attentions.0.group_norm": "blocks.12.norm",
        "encoder.mid_block.attentions.0.to_q": "blocks.12.transformer_blocks.0.to_q",
        "encoder.mid_block.attentions.0.to_k": "blocks.12.transformer_blocks.0.to_k",
        "encoder.mid_block.attentions.0.to_v": "blocks.12.transformer_blocks.0.to_v",
        "encoder.mid_block.attentions.0.to_out.0": "blocks.12.transformer_blocks.0.to_out",
        "encoder.mid_block.resnets.0.norm1": "blocks.11.norm1",
        "encoder.mid_block.resnets.0.conv1": "blocks.11.conv1",
        "encoder.mid_block.resnets.0.norm2": "blocks.11.norm2",
        "encoder.mid_block.resnets.0.conv2": "blocks.11.conv2",
        "encoder.mid_block.resnets.1.norm1": "blocks.13.norm1",
        "encoder.mid_block.resnets.1.conv1": "blocks.13.conv1",
        "encoder.mid_block.resnets.1.norm2": "blocks.13.norm2",
        "encoder.mid_block.resnets.1.conv2": "blocks.13.conv2",
        "encoder.conv_norm_out": "conv_norm_out",
        "encoder.conv_out": "conv_out",
    }
    source_kinds = {"resnets": "ResnetBlock", "downsamplers": "DownSampler", "upsamplers": "UpSampler"}
    # Per-kind cursor into block_types and the diffusers sub-module it currently maps.
    cursor = {"ResnetBlock": -1, "DownSampler": -1, "UpSampler": -1}
    current_group = {"ResnetBlock": "", "DownSampler": "", "UpSampler": ""}
    key_map = {}
    for key in sorted(state_dict):
        parts = key.split(".")
        prefix = ".".join(parts[:-1])
        if prefix in fixed_renames:
            key_map[key] = fixed_renames[prefix] + "." + parts[-1]
        elif key.startswith("encoder.down_blocks"):
            kind = source_kinds[parts[3]]
            group = ".".join(parts[:5])
            if group != current_group[kind]:
                current_group[kind] = group
                # Advance this kind's cursor to the next slot of the same class.
                cursor[kind] += 1
                while cursor[kind] < len(block_types) and block_types[cursor[kind]] != kind:
                    cursor[kind] += 1
            key_map[key] = ".".join(["blocks", str(cursor[kind])] + parts[5:])
    # Emit in the original state_dict order, keeping only mapped parameters.
    return {key_map[key]: state_dict[key] for key in state_dict if key in key_map}
|
||||
|
||||
|
||||
def FluxVAEDecoderStateDictConverterDiffusers(state_dict):
    """Convert a diffusers-format FLUX VAE decoder state dict to this repo's layout.

    Mid-block/stem parameters are renamed through a fixed lookup table, while
    ``decoder.up_blocks`` parameters are renumbered into the flat ``blocks.N``
    sequence described by ``block_types``. Keys matching neither scheme are dropped.
    """
    # Flattened target architecture: the i-th entry is the class of blocks[i].
    block_types = [
        'ResnetBlock', 'VAEAttentionBlock', 'ResnetBlock',
        'ResnetBlock', 'ResnetBlock', 'ResnetBlock', 'UpSampler',
        'ResnetBlock', 'ResnetBlock', 'ResnetBlock', 'UpSampler',
        'ResnetBlock', 'ResnetBlock', 'ResnetBlock', 'UpSampler',
        'ResnetBlock', 'ResnetBlock', 'ResnetBlock'
    ]
    # Fixed renames for everything outside decoder.up_blocks (keyed by the
    # parameter name minus its final ".weight"/".bias" component).
    fixed_renames = {
        "post_quant_conv": "post_quant_conv",
        "decoder.conv_in": "conv_in",
        "decoder.mid_block.attentions.0.group_norm": "blocks.1.norm",
        "decoder.mid_block.attentions.0.to_q": "blocks.1.transformer_blocks.0.to_q",
        "decoder.mid_block.attentions.0.to_k": "blocks.1.transformer_blocks.0.to_k",
        "decoder.mid_block.attentions.0.to_v": "blocks.1.transformer_blocks.0.to_v",
        "decoder.mid_block.attentions.0.to_out.0": "blocks.1.transformer_blocks.0.to_out",
        "decoder.mid_block.resnets.0.norm1": "blocks.0.norm1",
        "decoder.mid_block.resnets.0.conv1": "blocks.0.conv1",
        "decoder.mid_block.resnets.0.norm2": "blocks.0.norm2",
        "decoder.mid_block.resnets.0.conv2": "blocks.0.conv2",
        "decoder.mid_block.resnets.1.norm1": "blocks.2.norm1",
        "decoder.mid_block.resnets.1.conv1": "blocks.2.conv1",
        "decoder.mid_block.resnets.1.norm2": "blocks.2.norm2",
        "decoder.mid_block.resnets.1.conv2": "blocks.2.conv2",
        "decoder.conv_norm_out": "conv_norm_out",
        "decoder.conv_out": "conv_out",
    }
    source_kinds = {"resnets": "ResnetBlock", "downsamplers": "DownSampler", "upsamplers": "UpSampler"}
    # Cursors start at 2 because blocks 0-2 (the mid block) are covered by
    # fixed_renames; up_blocks fill slots 3 onward.
    cursor = {"ResnetBlock": 2, "DownSampler": 2, "UpSampler": 2}
    current_group = {"ResnetBlock": "", "DownSampler": "", "UpSampler": ""}
    key_map = {}
    for key in sorted(state_dict):
        parts = key.split(".")
        prefix = ".".join(parts[:-1])
        if prefix in fixed_renames:
            key_map[key] = fixed_renames[prefix] + "." + parts[-1]
        elif key.startswith("decoder.up_blocks"):
            kind = source_kinds[parts[3]]
            group = ".".join(parts[:5])
            if group != current_group[kind]:
                current_group[kind] = group
                # Advance this kind's cursor to the next slot of the same class.
                cursor[kind] += 1
                while cursor[kind] < len(block_types) and block_types[cursor[kind]] != kind:
                    cursor[kind] += 1
            key_map[key] = ".".join(["blocks", str(cursor[kind])] + parts[5:])
    # Emit in the original state_dict order, keeping only mapped parameters.
    return {key_map[key]: state_dict[key] for key in state_dict if key in key_map}
|
||||
6
diffsynth/utils/state_dict_converters/nexus_gen.py
Normal file
6
diffsynth/utils/state_dict_converters/nexus_gen.py
Normal file
@@ -0,0 +1,6 @@
|
||||
def NexusGenAutoregressiveModelStateDictConverter(state_dict):
    """Prefix every parameter name with "model." for the NexusGen autoregressive model."""
    return {"model." + key: value for key, value in state_dict.items()}
|
||||
15
diffsynth/utils/state_dict_converters/nexus_gen_projector.py
Normal file
15
diffsynth/utils/state_dict_converters/nexus_gen_projector.py
Normal file
@@ -0,0 +1,15 @@
|
||||
def NexusGenMergerStateDictConverter(state_dict):
    """Extract embedding-merger weights, stripping the "embedding_merger." prefix."""
    prefix = 'embedding_merger.'
    return {
        key.replace(prefix, ""): value
        for key, value in state_dict.items()
        if key.startswith(prefix)
    }
|
||||
|
||||
def NexusGenAdapterStateDictConverter(state_dict):
    """Keep only the adapter weights (keys starting with "adapter."), names unchanged."""
    return {key: value for key, value in state_dict.items() if key.startswith('adapter.')}
|
||||
@@ -0,0 +1,10 @@
|
||||
def QwenImageTextEncoderStateDictConverter(state_dict):
    """Rename Qwen-Image text-encoder weights to the wrapped-module layout.

    "visual.*" keys gain a "model." prefix; "model.*" keys are moved under
    "model.language_model."; everything else (e.g. "lm_head.weight") is
    passed through unchanged.
    """
    state_dict_ = {}
    for k in state_dict:
        v = state_dict[k]
        if k.startswith("visual."):
            k = "model." + k
        elif k.startswith("model."):
            # Rewrite only the leading prefix. The previous
            # k.replace("model.", ...) would also rewrite any later
            # "model." substring inside the key, corrupting such names.
            k = "model.language_model." + k[len("model."):]
        state_dict_[k] = v
    return state_dict_
|
||||
@@ -0,0 +1,7 @@
|
||||
def Qwen2ConnectorStateDictConverter(state_dict):
    """Extract connector weights, stripping the leading "connector." prefix."""
    prefix = "connector."
    return {
        key[len(prefix):]: state_dict[key]
        for key in state_dict
        if key.startswith(prefix)
    }
|
||||
@@ -0,0 +1,6 @@
|
||||
def WanAnimateAdapterStateDictConverter(state_dict):
    """Keep only the Wan-Animate adapter sub-modules, names unchanged."""
    kept_prefixes = ("pose_patch_embedding.", "face_adapter", "face_encoder", "motion_encoder")
    return {key: value for key, value in state_dict.items() if key.startswith(kept_prefixes)}
|
||||
83
diffsynth/utils/state_dict_converters/wan_video_dit.py
Normal file
83
diffsynth/utils/state_dict_converters/wan_video_dit.py
Normal file
@@ -0,0 +1,83 @@
|
||||
def WanVideoDiTFromDiffusers(state_dict):
    """Convert a diffusers-format Wan video DiT state dict to this repo's naming.

    ``rename_dict`` spells out the mapping for block 0 and all non-block
    parameters; other blocks reuse the block-0 entries with their own index
    spliced back in. Keys with no mapping are dropped.
    """
    rename_dict = {
        "blocks.0.attn1.norm_k.weight": "blocks.0.self_attn.norm_k.weight",
        "blocks.0.attn1.norm_q.weight": "blocks.0.self_attn.norm_q.weight",
        "blocks.0.attn1.to_k.bias": "blocks.0.self_attn.k.bias",
        "blocks.0.attn1.to_k.weight": "blocks.0.self_attn.k.weight",
        "blocks.0.attn1.to_out.0.bias": "blocks.0.self_attn.o.bias",
        "blocks.0.attn1.to_out.0.weight": "blocks.0.self_attn.o.weight",
        "blocks.0.attn1.to_q.bias": "blocks.0.self_attn.q.bias",
        "blocks.0.attn1.to_q.weight": "blocks.0.self_attn.q.weight",
        "blocks.0.attn1.to_v.bias": "blocks.0.self_attn.v.bias",
        "blocks.0.attn1.to_v.weight": "blocks.0.self_attn.v.weight",
        "blocks.0.attn2.norm_k.weight": "blocks.0.cross_attn.norm_k.weight",
        "blocks.0.attn2.norm_q.weight": "blocks.0.cross_attn.norm_q.weight",
        "blocks.0.attn2.to_k.bias": "blocks.0.cross_attn.k.bias",
        "blocks.0.attn2.to_k.weight": "blocks.0.cross_attn.k.weight",
        "blocks.0.attn2.to_out.0.bias": "blocks.0.cross_attn.o.bias",
        "blocks.0.attn2.to_out.0.weight": "blocks.0.cross_attn.o.weight",
        "blocks.0.attn2.to_q.bias": "blocks.0.cross_attn.q.bias",
        "blocks.0.attn2.to_q.weight": "blocks.0.cross_attn.q.weight",
        "blocks.0.attn2.to_v.bias": "blocks.0.cross_attn.v.bias",
        "blocks.0.attn2.to_v.weight": "blocks.0.cross_attn.v.weight",
        "blocks.0.attn2.add_k_proj.bias": "blocks.0.cross_attn.k_img.bias",
        "blocks.0.attn2.add_k_proj.weight": "blocks.0.cross_attn.k_img.weight",
        "blocks.0.attn2.add_v_proj.bias": "blocks.0.cross_attn.v_img.bias",
        "blocks.0.attn2.add_v_proj.weight": "blocks.0.cross_attn.v_img.weight",
        "blocks.0.attn2.norm_added_k.weight": "blocks.0.cross_attn.norm_k_img.weight",
        "blocks.0.ffn.net.0.proj.bias": "blocks.0.ffn.0.bias",
        "blocks.0.ffn.net.0.proj.weight": "blocks.0.ffn.0.weight",
        "blocks.0.ffn.net.2.bias": "blocks.0.ffn.2.bias",
        "blocks.0.ffn.net.2.weight": "blocks.0.ffn.2.weight",
        "blocks.0.norm2.bias": "blocks.0.norm3.bias",
        "blocks.0.norm2.weight": "blocks.0.norm3.weight",
        "blocks.0.scale_shift_table": "blocks.0.modulation",
        "condition_embedder.text_embedder.linear_1.bias": "text_embedding.0.bias",
        "condition_embedder.text_embedder.linear_1.weight": "text_embedding.0.weight",
        "condition_embedder.text_embedder.linear_2.bias": "text_embedding.2.bias",
        "condition_embedder.text_embedder.linear_2.weight": "text_embedding.2.weight",
        "condition_embedder.time_embedder.linear_1.bias": "time_embedding.0.bias",
        "condition_embedder.time_embedder.linear_1.weight": "time_embedding.0.weight",
        "condition_embedder.time_embedder.linear_2.bias": "time_embedding.2.bias",
        "condition_embedder.time_embedder.linear_2.weight": "time_embedding.2.weight",
        "condition_embedder.time_proj.bias": "time_projection.1.bias",
        "condition_embedder.time_proj.weight": "time_projection.1.weight",
        "condition_embedder.image_embedder.ff.net.0.proj.bias": "img_emb.proj.1.bias",
        "condition_embedder.image_embedder.ff.net.0.proj.weight": "img_emb.proj.1.weight",
        "condition_embedder.image_embedder.ff.net.2.bias": "img_emb.proj.3.bias",
        "condition_embedder.image_embedder.ff.net.2.weight": "img_emb.proj.3.weight",
        "condition_embedder.image_embedder.norm1.bias": "img_emb.proj.0.bias",
        "condition_embedder.image_embedder.norm1.weight": "img_emb.proj.0.weight",
        "condition_embedder.image_embedder.norm2.bias": "img_emb.proj.4.bias",
        "condition_embedder.image_embedder.norm2.weight": "img_emb.proj.4.weight",
        "patch_embedding.bias": "patch_embedding.bias",
        "patch_embedding.weight": "patch_embedding.weight",
        "scale_shift_table": "head.modulation",
        "proj_out.bias": "head.head.bias",
        "proj_out.weight": "head.head.weight",
    }
    converted = {}
    for key, value in state_dict.items():
        if key in rename_dict:
            converted[rename_dict[key]] = value
            continue
        # Look the key up as if it belonged to block 0, then splice the real
        # block index back into the renamed result.
        parts = key.split(".")
        template = ".".join(parts[:1] + ["0"] + parts[2:])
        if template in rename_dict:
            mapped = rename_dict[template].split(".")
            converted[".".join(mapped[:1] + [parts[1]] + mapped[2:])] = value
    return converted
|
||||
|
||||
|
||||
def WanVideoDiTStateDictConverter(state_dict):
    """Filter a Wan video DiT state dict: drop VACE and adapter weights, strip "model."."""
    adapter_modules = ["pose_patch_embedding", "face_adapter", "face_encoder", "motion_encoder"]
    converted = {}
    for key, value in state_dict.items():
        # VACE branches and animate-adapter modules are loaded separately.
        if key.startswith("vace") or key.split(".")[0] in adapter_modules:
            continue
        new_key = key[len("model."):] if key.startswith("model.") else key
        converted[new_key] = value
    return converted
|
||||
@@ -0,0 +1,8 @@
|
||||
def WanImageEncoderStateDictConverter(state_dict):
    """Keep the visual tower (drop "textual.*") and prefix names with "model."."""
    return {
        "model." + key: value
        for key, value in state_dict.items()
        if not key.startswith("textual.")
    }
|
||||
78
diffsynth/utils/state_dict_converters/wan_video_mot.py
Normal file
78
diffsynth/utils/state_dict_converters/wan_video_mot.py
Normal file
@@ -0,0 +1,78 @@
|
||||
def WanVideoMotStateDictConverter(state_dict):
    """Extract Wan motion-reference ("_mot_ref") weights and rename them.

    Keys containing "_mot_ref" are kept (tag removed), renamed via the block-0
    mapping table, and their sparse DiT layer indices (0, 4, 8, ..., 36) are
    renumbered densely to 0..9. All other keys are dropped.
    """
    rename_dict = {
        "blocks.0.attn1.norm_k.weight": "blocks.0.self_attn.norm_k.weight",
        "blocks.0.attn1.norm_q.weight": "blocks.0.self_attn.norm_q.weight",
        "blocks.0.attn1.to_k.bias": "blocks.0.self_attn.k.bias",
        "blocks.0.attn1.to_k.weight": "blocks.0.self_attn.k.weight",
        "blocks.0.attn1.to_out.0.bias": "blocks.0.self_attn.o.bias",
        "blocks.0.attn1.to_out.0.weight": "blocks.0.self_attn.o.weight",
        "blocks.0.attn1.to_q.bias": "blocks.0.self_attn.q.bias",
        "blocks.0.attn1.to_q.weight": "blocks.0.self_attn.q.weight",
        "blocks.0.attn1.to_v.bias": "blocks.0.self_attn.v.bias",
        "blocks.0.attn1.to_v.weight": "blocks.0.self_attn.v.weight",
        "blocks.0.attn2.norm_k.weight": "blocks.0.cross_attn.norm_k.weight",
        "blocks.0.attn2.norm_q.weight": "blocks.0.cross_attn.norm_q.weight",
        "blocks.0.attn2.to_k.bias": "blocks.0.cross_attn.k.bias",
        "blocks.0.attn2.to_k.weight": "blocks.0.cross_attn.k.weight",
        "blocks.0.attn2.to_out.0.bias": "blocks.0.cross_attn.o.bias",
        "blocks.0.attn2.to_out.0.weight": "blocks.0.cross_attn.o.weight",
        "blocks.0.attn2.to_q.bias": "blocks.0.cross_attn.q.bias",
        "blocks.0.attn2.to_q.weight": "blocks.0.cross_attn.q.weight",
        "blocks.0.attn2.to_v.bias": "blocks.0.cross_attn.v.bias",
        "blocks.0.attn2.to_v.weight": "blocks.0.cross_attn.v.weight",
        "blocks.0.attn2.add_k_proj.bias": "blocks.0.cross_attn.k_img.bias",
        "blocks.0.attn2.add_k_proj.weight": "blocks.0.cross_attn.k_img.weight",
        "blocks.0.attn2.add_v_proj.bias": "blocks.0.cross_attn.v_img.bias",
        "blocks.0.attn2.add_v_proj.weight": "blocks.0.cross_attn.v_img.weight",
        "blocks.0.attn2.norm_added_k.weight": "blocks.0.cross_attn.norm_k_img.weight",
        "blocks.0.ffn.net.0.proj.bias": "blocks.0.ffn.0.bias",
        "blocks.0.ffn.net.0.proj.weight": "blocks.0.ffn.0.weight",
        "blocks.0.ffn.net.2.bias": "blocks.0.ffn.2.bias",
        "blocks.0.ffn.net.2.weight": "blocks.0.ffn.2.weight",
        "blocks.0.norm2.bias": "blocks.0.norm3.bias",
        "blocks.0.norm2.weight": "blocks.0.norm3.weight",
        "blocks.0.scale_shift_table": "blocks.0.modulation",
        "condition_embedder.text_embedder.linear_1.bias": "text_embedding.0.bias",
        "condition_embedder.text_embedder.linear_1.weight": "text_embedding.0.weight",
        "condition_embedder.text_embedder.linear_2.bias": "text_embedding.2.bias",
        "condition_embedder.text_embedder.linear_2.weight": "text_embedding.2.weight",
        "condition_embedder.time_embedder.linear_1.bias": "time_embedding.0.bias",
        "condition_embedder.time_embedder.linear_1.weight": "time_embedding.0.weight",
        "condition_embedder.time_embedder.linear_2.bias": "time_embedding.2.bias",
        "condition_embedder.time_embedder.linear_2.weight": "time_embedding.2.weight",
        "condition_embedder.time_proj.bias": "time_projection.1.bias",
        "condition_embedder.time_proj.weight": "time_projection.1.weight",
        "condition_embedder.image_embedder.ff.net.0.proj.bias": "img_emb.proj.1.bias",
        "condition_embedder.image_embedder.ff.net.0.proj.weight": "img_emb.proj.1.weight",
        "condition_embedder.image_embedder.ff.net.2.bias": "img_emb.proj.3.bias",
        "condition_embedder.image_embedder.ff.net.2.weight": "img_emb.proj.3.weight",
        "condition_embedder.image_embedder.norm1.bias": "img_emb.proj.0.bias",
        "condition_embedder.image_embedder.norm1.weight": "img_emb.proj.0.weight",
        "condition_embedder.image_embedder.norm2.bias": "img_emb.proj.4.bias",
        "condition_embedder.image_embedder.norm2.weight": "img_emb.proj.4.weight",
        "patch_embedding.bias": "patch_embedding.bias",
        "patch_embedding.weight": "patch_embedding.weight",
        "scale_shift_table": "head.modulation",
        "proj_out.bias": "head.head.bias",
        "proj_out.weight": "head.head.weight",
    }
    # DiT layers that carry motion-reference weights; renumber them densely.
    mot_layers = (0, 4, 8, 12, 16, 20, 24, 28, 32, 36)
    mot_layers_mapping = {i: n for n, i in enumerate(mot_layers)}
    state_dict_ = {}
    for name in state_dict:
        if "_mot_ref" not in name:
            continue
        param = state_dict[name]
        name = name.replace("_mot_ref", "")
        if name in rename_dict:
            state_dict_[rename_dict[name]] = param
        else:
            parts = name.split(".")
            # Remap the block-index component directly. The previous
            # str.replace(str(block_id), ...) could also rewrite matching
            # digits appearing elsewhere in the key, and a single-component
            # key would have raised IndexError on parts[1].
            if len(parts) > 1 and parts[1].isdigit():
                parts[1] = str(mot_layers_mapping[int(parts[1])])
                name = ".".join(parts)
            # Look the key up as if it belonged to block 0, then splice the
            # remapped block index back into the renamed result.
            name_ = ".".join(name.split(".")[:1] + ["0"] + name.split(".")[2:])
            if name_ in rename_dict:
                name_ = rename_dict[name_]
                name_ = ".".join(name_.split(".")[:1] + [name.split(".")[1]] + name_.split(".")[2:])
                state_dict_[name_] = param
    return state_dict_
|
||||
3
diffsynth/utils/state_dict_converters/wan_video_vace.py
Normal file
3
diffsynth/utils/state_dict_converters/wan_video_vace.py
Normal file
@@ -0,0 +1,3 @@
|
||||
def VaceWanModelDictConverter(state_dict):
    """Keep only the VACE weights (keys starting with "vace"), names unchanged."""
    converted = {}
    for key, value in state_dict.items():
        if key.startswith("vace"):
            converted[key] = value
    return converted
|
||||
7
diffsynth/utils/state_dict_converters/wan_video_vae.py
Normal file
7
diffsynth/utils/state_dict_converters/wan_video_vae.py
Normal file
@@ -0,0 +1,7 @@
|
||||
def WanVideoVAEStateDictConverter(state_dict):
    """Prefix Wan VAE weights with "model.", unwrapping a 'model_state' checkpoint wrapper if present."""
    weights = state_dict['model_state'] if 'model_state' in state_dict else state_dict
    return {'model.' + key: weights[key] for key in weights}
|
||||
@@ -0,0 +1,12 @@
|
||||
def WanS2VAudioEncoderStateDictConverter(state_dict):
    """Prefix audio-encoder weights with "model." and update weight-norm parameter names."""
    # Old-style weight-norm names -> new torch parametrization names.
    rename_dict = {
        "model.wav2vec2.encoder.pos_conv_embed.conv.weight_g": "model.wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0",
        "model.wav2vec2.encoder.pos_conv_embed.conv.weight_v": "model.wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1",
    }
    converted = {}
    for key, value in state_dict.items():
        prefixed = "model." + key
        converted[rename_dict.get(prefixed, prefixed)] = value
    return converted
|
||||
Reference in New Issue
Block a user