wan-series

2026-03-20 15:48:20 +00:00 · 2025-11-14 19:05:26 +08:00
parent 5be5c32fe4
commit e3356556ee
215 changed files with 5504 additions and 482 deletions
--- a/diffsynth/models/model_loader.py
+++ b/diffsynth/models/model_loader.py
@@ -59,7 +59,7 @@ class ModelPool:
        }
        return vram_config
    
-    def auto_load_model(self, path, vram_config=None, vram_limit=None):
+    def auto_load_model(self, path, vram_config=None, vram_limit=None, clear_parameters=False):
        print(f"Loading models from: {json.dumps(path, indent=4)}")
        if vram_config is None:
            vram_config = self.default_vram_config()
@@ -68,6 +68,7 @@ class ModelPool:
        for config in MODEL_CONFIGS:
            if config["model_hash"] == model_hash:
                model = self.load_model_file(config, path, vram_config, vram_limit=vram_limit)
+                if clear_parameters: self.clear_parameters(model)
                self.model.append(model)
                model_name = config["model_name"]
                self.model_name.append(model_name)
@@ -102,3 +103,9 @@ class ModelPool:
                model = fetched_models
                print(f"More than one {model_name} models are loaded: {fetched_model_paths}. Using {model_name} from {json.dumps(fetched_model_paths, indent=4)}.")
        return model
+
+    def clear_parameters(self, model: torch.nn.Module):
+        for name, module in model.named_children():
+            self.clear_parameters(module)
+        for name, param in model.named_parameters(recurse=False):
+            setattr(model, name, None)
--- a/diffsynth/models/wan_video_animate_adapter.py
+++ b/diffsynth/models/wan_video_animate_adapter.py
@@ -375,7 +375,7 @@ class Blur(nn.Module):
 		if upsample_factor > 1:
 			kernel = kernel * (upsample_factor ** 2)

-		self.register_buffer('kernel', kernel)
+		self.kernel = torch.nn.Parameter(kernel)

 		self.pad = pad

@@ -648,23 +648,3 @@ class WanAnimateAdapter(torch.nn.Module):
            residual_out = self.face_adapter.fuser_blocks[block_idx // 5](*adapter_args)
            x = residual_out + x
        return x
-    
-    @staticmethod
-    def state_dict_converter():
-        return WanAnimateAdapterStateDictConverter()
-
-
-class WanAnimateAdapterStateDictConverter:
-    def __init__(self):
-        pass
-
-    def from_diffusers(self, state_dict):
-        return state_dict
-    
-    def from_civitai(self, state_dict):
-        state_dict_ = {}
-        for name, param in state_dict.items():
-            if name.startswith("pose_patch_embedding.") or name.startswith("face_adapter") or name.startswith("face_encoder") or name.startswith("motion_encoder"):
-                state_dict_[name] = param
-        return state_dict_
-
--- a/diffsynth/models/wan_video_dit.py
+++ b/diffsynth/models/wan_video_dit.py
@@ -404,369 +404,3 @@ class WanModel(torch.nn.Module):
        x = self.head(x, t)
        x = self.unpatchify(x, (f, h, w))
        return x
-
-    @staticmethod
-    def state_dict_converter():
-        return WanModelStateDictConverter()
-    
-    
-class WanModelStateDictConverter:
-    def __init__(self):
-        pass
-
-    def from_diffusers(self, state_dict):
-        rename_dict = {
-            "blocks.0.attn1.norm_k.weight": "blocks.0.self_attn.norm_k.weight",
-            "blocks.0.attn1.norm_q.weight": "blocks.0.self_attn.norm_q.weight",
-            "blocks.0.attn1.to_k.bias": "blocks.0.self_attn.k.bias",
-            "blocks.0.attn1.to_k.weight": "blocks.0.self_attn.k.weight",
-            "blocks.0.attn1.to_out.0.bias": "blocks.0.self_attn.o.bias",
-            "blocks.0.attn1.to_out.0.weight": "blocks.0.self_attn.o.weight",
-            "blocks.0.attn1.to_q.bias": "blocks.0.self_attn.q.bias",
-            "blocks.0.attn1.to_q.weight": "blocks.0.self_attn.q.weight",
-            "blocks.0.attn1.to_v.bias": "blocks.0.self_attn.v.bias",
-            "blocks.0.attn1.to_v.weight": "blocks.0.self_attn.v.weight",
-            "blocks.0.attn2.norm_k.weight": "blocks.0.cross_attn.norm_k.weight",
-            "blocks.0.attn2.norm_q.weight": "blocks.0.cross_attn.norm_q.weight",
-            "blocks.0.attn2.to_k.bias": "blocks.0.cross_attn.k.bias",
-            "blocks.0.attn2.to_k.weight": "blocks.0.cross_attn.k.weight",
-            "blocks.0.attn2.to_out.0.bias": "blocks.0.cross_attn.o.bias",
-            "blocks.0.attn2.to_out.0.weight": "blocks.0.cross_attn.o.weight",
-            "blocks.0.attn2.to_q.bias": "blocks.0.cross_attn.q.bias",
-            "blocks.0.attn2.to_q.weight": "blocks.0.cross_attn.q.weight",
-            "blocks.0.attn2.to_v.bias": "blocks.0.cross_attn.v.bias",
-            "blocks.0.attn2.to_v.weight": "blocks.0.cross_attn.v.weight",
-            "blocks.0.attn2.add_k_proj.bias":"blocks.0.cross_attn.k_img.bias",
-            "blocks.0.attn2.add_k_proj.weight":"blocks.0.cross_attn.k_img.weight",
-            "blocks.0.attn2.add_v_proj.bias":"blocks.0.cross_attn.v_img.bias",
-            "blocks.0.attn2.add_v_proj.weight":"blocks.0.cross_attn.v_img.weight",
-            "blocks.0.attn2.norm_added_k.weight":"blocks.0.cross_attn.norm_k_img.weight",
-            "blocks.0.ffn.net.0.proj.bias": "blocks.0.ffn.0.bias",
-            "blocks.0.ffn.net.0.proj.weight": "blocks.0.ffn.0.weight",
-            "blocks.0.ffn.net.2.bias": "blocks.0.ffn.2.bias",
-            "blocks.0.ffn.net.2.weight": "blocks.0.ffn.2.weight",
-            "blocks.0.norm2.bias": "blocks.0.norm3.bias",
-            "blocks.0.norm2.weight": "blocks.0.norm3.weight",
-            "blocks.0.scale_shift_table": "blocks.0.modulation",
-            "condition_embedder.text_embedder.linear_1.bias": "text_embedding.0.bias",
-            "condition_embedder.text_embedder.linear_1.weight": "text_embedding.0.weight",
-            "condition_embedder.text_embedder.linear_2.bias": "text_embedding.2.bias",
-            "condition_embedder.text_embedder.linear_2.weight": "text_embedding.2.weight",
-            "condition_embedder.time_embedder.linear_1.bias": "time_embedding.0.bias",
-            "condition_embedder.time_embedder.linear_1.weight": "time_embedding.0.weight",
-            "condition_embedder.time_embedder.linear_2.bias": "time_embedding.2.bias",
-            "condition_embedder.time_embedder.linear_2.weight": "time_embedding.2.weight",
-            "condition_embedder.time_proj.bias": "time_projection.1.bias",
-            "condition_embedder.time_proj.weight": "time_projection.1.weight",
-            "condition_embedder.image_embedder.ff.net.0.proj.bias":"img_emb.proj.1.bias",
-            "condition_embedder.image_embedder.ff.net.0.proj.weight":"img_emb.proj.1.weight",
-            "condition_embedder.image_embedder.ff.net.2.bias":"img_emb.proj.3.bias",
-            "condition_embedder.image_embedder.ff.net.2.weight":"img_emb.proj.3.weight",
-            "condition_embedder.image_embedder.norm1.bias":"img_emb.proj.0.bias",
-            "condition_embedder.image_embedder.norm1.weight":"img_emb.proj.0.weight",
-            "condition_embedder.image_embedder.norm2.bias":"img_emb.proj.4.bias",
-            "condition_embedder.image_embedder.norm2.weight":"img_emb.proj.4.weight",
-            "patch_embedding.bias": "patch_embedding.bias",
-            "patch_embedding.weight": "patch_embedding.weight",
-            "scale_shift_table": "head.modulation",
-            "proj_out.bias": "head.head.bias",
-            "proj_out.weight": "head.head.weight",
-        }
-        state_dict_ = {}
-        for name, param in state_dict.items():
-            if name in rename_dict:
-                state_dict_[rename_dict[name]] = param
-            else:
-                name_ = ".".join(name.split(".")[:1] + ["0"] + name.split(".")[2:])
-                if name_ in rename_dict:
-                    name_ = rename_dict[name_]
-                    name_ = ".".join(name_.split(".")[:1] + [name.split(".")[1]] + name_.split(".")[2:])
-                    state_dict_[name_] = param
-        if hash_state_dict_keys(state_dict_) == "cb104773c6c2cb6df4f9529ad5c60d0b":
-            config = {
-                "model_type": "t2v",
-                "patch_size": (1, 2, 2),
-                "text_len": 512,
-                "in_dim": 16,
-                "dim": 5120,
-                "ffn_dim": 13824,
-                "freq_dim": 256,
-                "text_dim": 4096,
-                "out_dim": 16,
-                "num_heads": 40,
-                "num_layers": 40,
-                "window_size": (-1, -1),
-                "qk_norm": True,
-                "cross_attn_norm": True,
-                "eps": 1e-6,
-            }
-        elif hash_state_dict_keys(state_dict_) == "6bfcfb3b342cb286ce886889d519a77e":
-            config = {
-                "has_image_input": True,
-                "patch_size": [1, 2, 2],
-                "in_dim": 36,
-                "dim": 5120,
-                "ffn_dim": 13824,
-                "freq_dim": 256,
-                "text_dim": 4096,
-                "out_dim": 16,
-                "num_heads": 40,
-                "num_layers": 40,
-                "eps": 1e-6
-            }
-        else:
-            config = {}
-        return state_dict_, config
-    
-    def from_civitai(self, state_dict):
-        state_dict = {name: param for name, param in state_dict.items() if not name.startswith("vace")}
-        state_dict = {name: param for name, param in state_dict.items() if name.split(".")[0] not in ["pose_patch_embedding", "face_adapter", "face_encoder", "motion_encoder"]}
-        state_dict_ = {}
-        for name, param in state_dict.items():
-            if name.startswith("model."):
-                name = name[len("model."):]
-            state_dict_[name] = param
-        state_dict = state_dict_
-        if hash_state_dict_keys(state_dict) == "9269f8db9040a9d860eaca435be61814":
-            config = {
-                "has_image_input": False,
-                "patch_size": [1, 2, 2],
-                "in_dim": 16,
-                "dim": 1536,
-                "ffn_dim": 8960,
-                "freq_dim": 256,
-                "text_dim": 4096,
-                "out_dim": 16,
-                "num_heads": 12,
-                "num_layers": 30,
-                "eps": 1e-6
-            }
-        elif hash_state_dict_keys(state_dict) == "aafcfd9672c3a2456dc46e1cb6e52c70":
-            config = {
-                "has_image_input": False,
-                "patch_size": [1, 2, 2],
-                "in_dim": 16,
-                "dim": 5120,
-                "ffn_dim": 13824,
-                "freq_dim": 256,
-                "text_dim": 4096,
-                "out_dim": 16,
-                "num_heads": 40,
-                "num_layers": 40,
-                "eps": 1e-6
-            }
-        elif hash_state_dict_keys(state_dict) == "6bfcfb3b342cb286ce886889d519a77e":
-            config = {
-                "has_image_input": True,
-                "patch_size": [1, 2, 2],
-                "in_dim": 36,
-                "dim": 5120,
-                "ffn_dim": 13824,
-                "freq_dim": 256,
-                "text_dim": 4096,
-                "out_dim": 16,
-                "num_heads": 40,
-                "num_layers": 40,
-                "eps": 1e-6
-            }
-        elif hash_state_dict_keys(state_dict) == "6d6ccde6845b95ad9114ab993d917893":
-            config = {
-                "has_image_input": True,
-                "patch_size": [1, 2, 2],
-                "in_dim": 36,
-                "dim": 1536,
-                "ffn_dim": 8960,
-                "freq_dim": 256,
-                "text_dim": 4096,
-                "out_dim": 16,
-                "num_heads": 12,
-                "num_layers": 30,
-                "eps": 1e-6
-            }
-        elif hash_state_dict_keys(state_dict) == "349723183fc063b2bfc10bb2835cf677":
-            # 1.3B PAI control
-            config = {
-                "has_image_input": True,
-                "patch_size": [1, 2, 2],
-                "in_dim": 48,
-                "dim": 1536,
-                "ffn_dim": 8960,
-                "freq_dim": 256,
-                "text_dim": 4096,
-                "out_dim": 16,
-                "num_heads": 12,
-                "num_layers": 30,
-                "eps": 1e-6
-            }
-        elif hash_state_dict_keys(state_dict) == "efa44cddf936c70abd0ea28b6cbe946c":
-            # 14B PAI control
-            config = {
-                "has_image_input": True,
-                "patch_size": [1, 2, 2],
-                "in_dim": 48,
-                "dim": 5120,
-                "ffn_dim": 13824,
-                "freq_dim": 256,
-                "text_dim": 4096,
-                "out_dim": 16,
-                "num_heads": 40,
-                "num_layers": 40,
-                "eps": 1e-6
-            }
-        elif hash_state_dict_keys(state_dict) == "3ef3b1f8e1dab83d5b71fd7b617f859f":
-            config = {
-                "has_image_input": True,
-                "patch_size": [1, 2, 2],
-                "in_dim": 36,
-                "dim": 5120,
-                "ffn_dim": 13824,
-                "freq_dim": 256,
-                "text_dim": 4096,
-                "out_dim": 16,
-                "num_heads": 40,
-                "num_layers": 40,
-                "eps": 1e-6,
-                "has_image_pos_emb": True
-            }
-        elif hash_state_dict_keys(state_dict) == "70ddad9d3a133785da5ea371aae09504":
-            # 1.3B PAI control v1.1
-            config = {
-                "has_image_input": True,
-                "patch_size": [1, 2, 2],
-                "in_dim": 48,
-                "dim": 1536,
-                "ffn_dim": 8960,
-                "freq_dim": 256,
-                "text_dim": 4096,
-                "out_dim": 16,
-                "num_heads": 12,
-                "num_layers": 30,
-                "eps": 1e-6,
-                "has_ref_conv": True
-            }
-        elif hash_state_dict_keys(state_dict) == "26bde73488a92e64cc20b0a7485b9e5b":
-            # 14B PAI control v1.1
-            config = {
-                "has_image_input": True,
-                "patch_size": [1, 2, 2],
-                "in_dim": 48,
-                "dim": 5120,
-                "ffn_dim": 13824,
-                "freq_dim": 256,
-                "text_dim": 4096,
-                "out_dim": 16,
-                "num_heads": 40,
-                "num_layers": 40,
-                "eps": 1e-6,
-                "has_ref_conv": True
-            }
-        elif hash_state_dict_keys(state_dict) == "ac6a5aa74f4a0aab6f64eb9a72f19901":
-            # 1.3B PAI control-camera v1.1
-            config = {
-                "has_image_input": True,
-                "patch_size": [1, 2, 2],
-                "in_dim": 32,
-                "dim": 1536,
-                "ffn_dim": 8960,
-                "freq_dim": 256,
-                "text_dim": 4096,
-                "out_dim": 16,
-                "num_heads": 12,
-                "num_layers": 30,
-                "eps": 1e-6,
-                "has_ref_conv": False,
-                "add_control_adapter": True,
-                "in_dim_control_adapter": 24,
-            }
-        elif hash_state_dict_keys(state_dict) == "b61c605c2adbd23124d152ed28e049ae":
-            # 14B PAI control-camera v1.1
-            config = {
-                "has_image_input": True,
-                "patch_size": [1, 2, 2],
-                "in_dim": 32,
-                "dim": 5120,
-                "ffn_dim": 13824,
-                "freq_dim": 256,
-                "text_dim": 4096,
-                "out_dim": 16,
-                "num_heads": 40,
-                "num_layers": 40,
-                "eps": 1e-6,
-                "has_ref_conv": False,
-                "add_control_adapter": True,
-                "in_dim_control_adapter": 24,
-            }
-        elif hash_state_dict_keys(state_dict) == "1f5ab7703c6fc803fdded85ff040c316":
-            # Wan-AI/Wan2.2-TI2V-5B
-            config = {
-                "has_image_input": False,
-                "patch_size": [1, 2, 2],
-                "in_dim": 48,
-                "dim": 3072,
-                "ffn_dim": 14336,
-                "freq_dim": 256,
-                "text_dim": 4096,
-                "out_dim": 48,
-                "num_heads": 24,
-                "num_layers": 30,
-                "eps": 1e-6,
-                "seperated_timestep": True,
-                "require_clip_embedding": False,
-                "require_vae_embedding": False,
-                "fuse_vae_embedding_in_latents": True,
-            }
-        elif hash_state_dict_keys(state_dict) == "5b013604280dd715f8457c6ed6d6a626":
-            # Wan-AI/Wan2.2-I2V-A14B
-            config = {
-                "has_image_input": False,
-                "patch_size": [1, 2, 2],
-                "in_dim": 36,
-                "dim": 5120,
-                "ffn_dim": 13824,
-                "freq_dim": 256,
-                "text_dim": 4096,
-                "out_dim": 16,
-                "num_heads": 40,
-                "num_layers": 40,
-                "eps": 1e-6,
-                "require_clip_embedding": False,
-            }
-        elif hash_state_dict_keys(state_dict) == "2267d489f0ceb9f21836532952852ee5":
-            # Wan2.2-Fun-A14B-Control
-            config = {
-                "has_image_input": False,
-                "patch_size": [1, 2, 2],
-                "in_dim": 52,
-                "dim": 5120,
-                "ffn_dim": 13824,
-                "freq_dim": 256,
-                "text_dim": 4096,
-                "out_dim": 16,
-                "num_heads": 40,
-                "num_layers": 40,
-                "eps": 1e-6,
-                "has_ref_conv": True,
-                "require_clip_embedding": False,
-            }
-        elif hash_state_dict_keys(state_dict) == "47dbeab5e560db3180adf51dc0232fb1":
-            # Wan2.2-Fun-A14B-Control-Camera
-            config = {
-                "has_image_input": False,
-                "patch_size": [1, 2, 2],
-                "in_dim": 36,
-                "dim": 5120,
-                "ffn_dim": 13824,
-                "freq_dim": 256,
-                "text_dim": 4096,
-                "out_dim": 16,
-                "num_heads": 40,
-                "num_layers": 40,
-                "eps": 1e-6,
-                "has_ref_conv": False,
-                "add_control_adapter": True,
-                "in_dim_control_adapter": 24,
-                "require_clip_embedding": False,
-            }
-        else:
-            config = {}
-        return state_dict, config
--- a/diffsynth/models/wan_video_image_encoder.py
+++ b/diffsynth/models/wan_video_image_encoder.py
@@ -874,29 +874,5 @@ class WanImageEncoder(torch.nn.Module):
        videos = self.transforms.transforms[-1](videos.mul_(0.5).add_(0.5))

        # forward
-        dtype = next(iter(self.model.visual.parameters())).dtype
-        videos = videos.to(dtype)
        out = self.model.visual(videos, use_31_block=True)
        return out
-        
-    @staticmethod
-    def state_dict_converter():
-        return WanImageEncoderStateDictConverter()
-    
-    
-class WanImageEncoderStateDictConverter:
-    def __init__(self):
-        pass
-
-    def from_diffusers(self, state_dict):
-        return state_dict
-    
-    def from_civitai(self, state_dict):
-        state_dict_ = {}
-        for name, param in state_dict.items():
-            if name.startswith("textual."):
-                continue
-            name = "model." + name
-            state_dict_[name] = param
-        return state_dict_
-
--- a/diffsynth/models/wan_video_motion_controller.py
+++ b/diffsynth/models/wan_video_motion_controller.py
@@ -25,20 +25,3 @@ class WanMotionControllerModel(torch.nn.Module):
        state_dict = self.linear[-1].state_dict()
        state_dict = {i: state_dict[i] * 0 for i in state_dict}
        self.linear[-1].load_state_dict(state_dict)
-
-    @staticmethod
-    def state_dict_converter():
-        return WanMotionControllerModelDictConverter()
-    
-    
-
-class WanMotionControllerModelDictConverter:
-    def __init__(self):
-        pass
-
-    def from_diffusers(self, state_dict):
-        return state_dict
-    
-    def from_civitai(self, state_dict):
-        return state_dict
-
--- a/diffsynth/models/wav2vec.py
+++ b/diffsynth/models/wav2vec.py
@@ -189,16 +189,3 @@ class WanS2VAudioEncoder(torch.nn.Module):
        audio_embed_bucket = audio_embed_bucket.unsqueeze(0).permute(0, 2, 3, 1).to(device, dtype)
        audio_embeds = [audio_embed_bucket[..., i * batch_frames:(i + 1) * batch_frames] for i in range(min_batch_num)]
        return audio_embeds
-
-    @staticmethod
-    def state_dict_converter():
-        return WanS2VAudioEncoderStateDictConverter()
-
-
-class WanS2VAudioEncoderStateDictConverter():
-    def __init__(self):
-        pass
-
-    def from_civitai(self, state_dict):
-        state_dict = {'model.' + k: v for k, v in state_dict.items()}
-        return state_dict