wan-series

Artiprocher
2025-11-14 19:05:26 +08:00
parent 5be5c32fe4
commit e3356556ee
215 changed files with 5504 additions and 482 deletions

View File

@@ -21,4 +21,94 @@ VRAM_MANAGEMENT_MODULE_MAPS = {
"diffsynth.models.qwen_image_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
},
"diffsynth.models.wan_video_animate_adapter.WanAnimateAdapter": {
"diffsynth.models.wan_video_animate_adapter.FaceEncoder": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.wan_video_animate_adapter.EqualLinear": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.wan_video_animate_adapter.ConvLayer": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.wan_video_animate_adapter.FusedLeakyReLU": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.wan_video_animate_adapter.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Conv1d": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.wan_video_dit_s2v.WanS2VModel": {
"diffsynth.models.wan_video_dit.Head": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.wan_video_dit_s2v.WanS2VDiTBlock": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.wan_video_dit_s2v.CausalAudioEncoder": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.wan_video_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.wan_video_dit.WanModel": {
"diffsynth.models.wan_video_dit.MLP": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.wan_video_dit.DiTBlock": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.wan_video_dit.Head": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.wan_video_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.wan_video_image_encoder.WanImageEncoder": {
"diffsynth.models.wan_video_image_encoder.VisionTransformer": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.wan_video_mot.MotWanModel": {
"diffsynth.models.wan_video_mot.MotWanAttentionBlock": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.wan_video_motion_controller.WanMotionControllerModel": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
},
"diffsynth.models.wan_video_text_encoder.WanTextEncoder": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.wan_video_text_encoder.T5RelativeEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.wan_video_text_encoder.T5LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.wan_video_vace.VaceWanModel": {
"diffsynth.models.wan_video_dit.DiTBlock": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.wan_video_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.wan_video_vae.WanVideoVAE": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.wan_video_vae.RMS_norm": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.wan_video_vae.CausalConv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.wan_video_vae.Upsample": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.SiLU": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Dropout": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.wan_video_vae.WanVideoVAE38": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.wan_video_vae.RMS_norm": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.wan_video_vae.CausalConv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.wan_video_vae.Upsample": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.SiLU": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Dropout": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.wav2vec.WanS2VAudioEncoder": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Conv1d": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.longcat_video_dit.LongCatVideoTransformer3DModel": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.longcat_video_dit.RMSNorm_FP32": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.longcat_video_dit.LayerNorm_FP32": "diffsynth.core.vram.layers.AutoWrappedModule",
},
}
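
For orientation, the nested map above keys a model class path to the submodule class paths that should be swapped for VRAM-managed wrappers. A minimal sketch of how such a map could be applied, assuming a wrapper class that accepts just the original module (the actual AutoWrappedModule/AutoWrappedLinear constructors in diffsynth.core.vram.layers are not shown here and take additional VRAM-config arguments):

import importlib
import torch

def _resolve(path: str):
    # "pkg.mod.ClassName" -> class object
    module_name, attr = path.rsplit(".", 1)
    return getattr(importlib.import_module(module_name), attr)

def wrap_submodules(model: torch.nn.Module, module_map: dict):
    # Replace children whose class matches a map key with the mapped wrapper class.
    targets = {_resolve(src): _resolve(dst) for src, dst in module_map.items()}
    for name, child in list(model.named_children()):
        wrap_submodules(child, module_map)
        wrapper_cls = targets.get(type(child))
        if wrapper_cls is not None:
            setattr(model, name, wrapper_cls(child))  # assumed constructor signature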

View File

@@ -21,6 +21,7 @@ class ModelConfig:
preparing_dtype: Optional[torch.dtype] = None
computation_device: Optional[Union[str, torch.device]] = None
computation_dtype: Optional[torch.dtype] = None
clear_parameters: bool = False
def check_input(self):
if self.path is None and self.model_id is None:

View File

@@ -274,6 +274,7 @@ class BasePipeline(torch.nn.Module):
model_config.path,
vram_config=vram_config,
vram_limit=vram_limit,
clear_parameters=model_config.clear_parameters,
)
return model_pool

View File

@@ -3,7 +3,10 @@ import torch
def FlowMatchSFTLoss(pipe: BasePipeline, **inputs):
timestep_id = torch.randint(0, pipe.scheduler.num_train_timesteps, (1,))
max_timestep_boundary = int(inputs.get("max_timestep_boundary", 1) * pipe.scheduler.num_train_timesteps)
min_timestep_boundary = int(inputs.get("min_timestep_boundary", 0) * pipe.scheduler.num_train_timesteps)
timestep_id = torch.randint(min_timestep_boundary, max_timestep_boundary, (1,))
timestep = pipe.scheduler.timesteps[timestep_id].to(dtype=pipe.torch_dtype, device=pipe.device)
noise = torch.randn_like(inputs["input_latents"])
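
As a quick sanity check of the new boundary logic (the 0.5 value below is hypothetical): with num_train_timesteps = 1000, min_timestep_boundary = 0.5 and max_timestep_boundary = 1.0 restrict sampling to timestep ids in [500, 1000).

import torch

num_train_timesteps = 1000
max_timestep_boundary = int(1.0 * num_train_timesteps)   # 1000
min_timestep_boundary = int(0.5 * num_train_timesteps)   # 500
timestep_id = torch.randint(min_timestep_boundary, max_timestep_boundary, (1,))
assert 500 <= int(timestep_id) < 1000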

View File

@@ -6,7 +6,7 @@ def add_dataset_base_config(parser: argparse.ArgumentParser):
parser.add_argument("--dataset_metadata_path", type=str, default=None, help="Path to the metadata file of the dataset.")
parser.add_argument("--dataset_repeat", type=int, default=1, help="Number of times to repeat the dataset per epoch.")
parser.add_argument("--dataset_num_workers", type=int, default=0, help="Number of workers for data loading.")
parser.add_argument("--data_file_keys", type=str, default="image", help="Data file keys in the metadata. Comma-separated.")
parser.add_argument("--data_file_keys", type=str, default="image,video", help="Data file keys in the metadata. Comma-separated.")
return parser
def add_image_size_config(parser: argparse.ArgumentParser):
@@ -15,11 +15,19 @@ def add_image_size_config(parser: argparse.ArgumentParser):
parser.add_argument("--max_pixels", type=int, default=1024*1024, help="Maximum number of pixels per frame, used for dynamic resolution.")
return parser
def add_video_size_config(parser: argparse.ArgumentParser):
parser.add_argument("--height", type=int, default=None, help="Height of images. Leave `height` and `width` empty to enable dynamic resolution.")
parser.add_argument("--width", type=int, default=None, help="Width of images. Leave `height` and `width` empty to enable dynamic resolution.")
parser.add_argument("--max_pixels", type=int, default=1024*1024, help="Maximum number of pixels per frame, used for dynamic resolution.")
parser.add_argument("--num_frames", type=int, default=81, help="Number of frames per video. Frames are sampled from the video prefix.")
return parser
def add_model_config(parser: argparse.ArgumentParser):
parser.add_argument("--model_paths", type=str, default=None, help="Paths to load models. In JSON format.")
parser.add_argument("--model_id_with_origin_paths", type=str, default=None, help="Model ID with origin paths, e.g., Wan-AI/Wan2.1-T2V-1.3B:diffusion_pytorch_model*.safetensors. Comma-separated.")
parser.add_argument("--extra_inputs", default=None, help="Additional model inputs, comma-separated.")
parser.add_argument("--fp8_models", default=None, help="Models with FP8 precision, comma-separated.")
parser.add_argument("--offload_models", default=None, help="Models with offload, comma-separated. Only used in splited training.")
return parser
def add_training_config(parser: argparse.ArgumentParser):

View File

@@ -82,32 +82,55 @@ class DiffusionTrainingModule(torch.nn.Module):
else:
return data
def parse_vram_config(self, fp8=False, offload=False, device="cpu"):
if fp8:
return {
"offload_dtype": torch.float8_e4m3fn,
"offload_device": device,
"onload_dtype": torch.float8_e4m3fn,
"onload_device": device,
"preparing_dtype": torch.float8_e4m3fn,
"preparing_device": device,
"computation_dtype": torch.bfloat16,
"computation_device": device,
}
elif offload:
return {
"offload_dtype": "disk",
"offload_device": "disk",
"onload_dtype": "disk",
"onload_device": "disk",
"preparing_dtype": torch.bfloat16,
"preparing_device": device,
"computation_dtype": torch.bfloat16,
"computation_device": device,
"clear_parameters": True,
}
else:
return {}
def parse_model_configs(self, model_paths, model_id_with_origin_paths, fp8_models=None, device="cpu"):
def parse_model_configs(self, model_paths, model_id_with_origin_paths, fp8_models=None, offload_models=None, device="cpu"):
fp8_models = [] if fp8_models is None else fp8_models.split(",")
fp8_config = {
# To accommodate multi-GPU training,
# the model will be temporarily stored in CPU memory.
"offload_dtype": torch.float8_e4m3fn,
"offload_device": device,
"onload_dtype": torch.float8_e4m3fn,
"onload_device": device,
"preparing_dtype": torch.float8_e4m3fn,
"preparing_device": device,
"computation_dtype": torch.bfloat16,
"computation_device": device,
}
offload_models = [] if offload_models is None else offload_models.split(",")
model_configs = []
if model_paths is not None:
model_paths = json.loads(model_paths)
for path in model_paths:
vram_config = fp8_config if path in fp8_models else {}
vram_config = self.parse_vram_config(
fp8=path in fp8_models,
offload=path in offload_models,
device=device
)
model_configs.append(ModelConfig(path=path, **vram_config))
if model_id_with_origin_paths is not None:
model_id_with_origin_paths = model_id_with_origin_paths.split(",")
for model_id_with_origin_path in model_id_with_origin_paths:
model_id, origin_file_pattern = model_id_with_origin_path.split(":")
vram_config = fp8_config if model_id_with_origin_path in fp8_models else {}
vram_config = self.parse_vram_config(
fp8=model_id_with_origin_path in fp8_models,
offload=model_id_with_origin_path in offload_models,
device=device
)
model_configs.append(ModelConfig(model_id=model_id, origin_file_pattern=origin_file_pattern, **vram_config))
return model_configs
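
A hedged usage sketch of the updated signature (the file paths below are placeholders and `module` stands for any DiffusionTrainingModule instance): entries listed in fp8_models receive the FP8 VRAM config, entries listed in offload_models receive the disk-offload config with clear_parameters=True, and everything else gets an empty config.

model_configs = module.parse_model_configs(
    model_paths='["models/dit.safetensors", "models/vae.safetensors"]',
    model_id_with_origin_paths="Wan-AI/Wan2.1-T2V-1.3B:diffusion_pytorch_model*.safetensors",
    fp8_models="models/dit.safetensors",
    offload_models="models/vae.safetensors",
    device="cpu",
)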
@@ -118,6 +141,7 @@ class DiffusionTrainingModule(torch.nn.Module):
trainable_models=None,
lora_base_model=None, lora_target_modules="", lora_rank=32, lora_checkpoint=None,
preset_lora_path=None, preset_lora_model=None,
task="sft",
):
# Scheduler
pipe.scheduler.set_timesteps(1000, training=True)
@@ -134,7 +158,7 @@ class DiffusionTrainingModule(torch.nn.Module):
# It is delegated to the subclass.
# Add LoRA to the base models
if lora_base_model is not None:
if lora_base_model is not None and not task.endswith(":data_process"):
if (not hasattr(pipe, lora_base_model)) or getattr(pipe, lora_base_model) is None:
print(f"No {lora_base_model} models in the pipeline. We cannot patch LoRA on the model. If this occurs during the data processing stage, it is normal.")
return

View File

@@ -59,7 +59,7 @@ class ModelPool:
}
return vram_config
def auto_load_model(self, path, vram_config=None, vram_limit=None):
def auto_load_model(self, path, vram_config=None, vram_limit=None, clear_parameters=False):
print(f"Loading models from: {json.dumps(path, indent=4)}")
if vram_config is None:
vram_config = self.default_vram_config()
@@ -68,6 +68,7 @@ class ModelPool:
for config in MODEL_CONFIGS:
if config["model_hash"] == model_hash:
model = self.load_model_file(config, path, vram_config, vram_limit=vram_limit)
if clear_parameters: self.clear_parameters(model)
self.model.append(model)
model_name = config["model_name"]
self.model_name.append(model_name)
@@ -102,3 +103,9 @@ class ModelPool:
model = fetched_models
print(f"More than one {model_name} models are loaded: {fetched_model_paths}. Using {model_name} from {json.dumps(fetched_model_paths, indent=4)}.")
return model
def clear_parameters(self, model: torch.nn.Module):
for name, module in model.named_children():
self.clear_parameters(module)
for name, param in model.named_parameters(recurse=False):
setattr(model, name, None)
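
The new clear_parameters helper walks the module tree and nulls out every parameter while keeping the module structure, which the disk-offload config enables via clear_parameters=True. A rough standalone illustration of the per-module step:

import torch

layer = torch.nn.Linear(4, 4)
for name, _ in layer.named_parameters(recurse=False):
    setattr(layer, name, None)   # same operation clear_parameters applies per module
assert layer.weight is None and layer.bias is None
assert list(layer.parameters()) == []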

View File

@@ -375,7 +375,7 @@ class Blur(nn.Module):
if upsample_factor > 1:
kernel = kernel * (upsample_factor ** 2)
self.register_buffer('kernel', kernel)
self.kernel = torch.nn.Parameter(kernel)
self.pad = pad
@@ -648,23 +648,3 @@ class WanAnimateAdapter(torch.nn.Module):
residual_out = self.face_adapter.fuser_blocks[block_idx // 5](*adapter_args)
x = residual_out + x
return x
@staticmethod
def state_dict_converter():
return WanAnimateAdapterStateDictConverter()
class WanAnimateAdapterStateDictConverter:
def __init__(self):
pass
def from_diffusers(self, state_dict):
return state_dict
def from_civitai(self, state_dict):
state_dict_ = {}
for name, param in state_dict.items():
if name.startswith("pose_patch_embedding.") or name.startswith("face_adapter") or name.startswith("face_encoder") or name.startswith("motion_encoder"):
state_dict_[name] = param
return state_dict_

View File

@@ -404,369 +404,3 @@ class WanModel(torch.nn.Module):
x = self.head(x, t)
x = self.unpatchify(x, (f, h, w))
return x
@staticmethod
def state_dict_converter():
return WanModelStateDictConverter()
class WanModelStateDictConverter:
def __init__(self):
pass
def from_diffusers(self, state_dict):
rename_dict = {
"blocks.0.attn1.norm_k.weight": "blocks.0.self_attn.norm_k.weight",
"blocks.0.attn1.norm_q.weight": "blocks.0.self_attn.norm_q.weight",
"blocks.0.attn1.to_k.bias": "blocks.0.self_attn.k.bias",
"blocks.0.attn1.to_k.weight": "blocks.0.self_attn.k.weight",
"blocks.0.attn1.to_out.0.bias": "blocks.0.self_attn.o.bias",
"blocks.0.attn1.to_out.0.weight": "blocks.0.self_attn.o.weight",
"blocks.0.attn1.to_q.bias": "blocks.0.self_attn.q.bias",
"blocks.0.attn1.to_q.weight": "blocks.0.self_attn.q.weight",
"blocks.0.attn1.to_v.bias": "blocks.0.self_attn.v.bias",
"blocks.0.attn1.to_v.weight": "blocks.0.self_attn.v.weight",
"blocks.0.attn2.norm_k.weight": "blocks.0.cross_attn.norm_k.weight",
"blocks.0.attn2.norm_q.weight": "blocks.0.cross_attn.norm_q.weight",
"blocks.0.attn2.to_k.bias": "blocks.0.cross_attn.k.bias",
"blocks.0.attn2.to_k.weight": "blocks.0.cross_attn.k.weight",
"blocks.0.attn2.to_out.0.bias": "blocks.0.cross_attn.o.bias",
"blocks.0.attn2.to_out.0.weight": "blocks.0.cross_attn.o.weight",
"blocks.0.attn2.to_q.bias": "blocks.0.cross_attn.q.bias",
"blocks.0.attn2.to_q.weight": "blocks.0.cross_attn.q.weight",
"blocks.0.attn2.to_v.bias": "blocks.0.cross_attn.v.bias",
"blocks.0.attn2.to_v.weight": "blocks.0.cross_attn.v.weight",
"blocks.0.attn2.add_k_proj.bias":"blocks.0.cross_attn.k_img.bias",
"blocks.0.attn2.add_k_proj.weight":"blocks.0.cross_attn.k_img.weight",
"blocks.0.attn2.add_v_proj.bias":"blocks.0.cross_attn.v_img.bias",
"blocks.0.attn2.add_v_proj.weight":"blocks.0.cross_attn.v_img.weight",
"blocks.0.attn2.norm_added_k.weight":"blocks.0.cross_attn.norm_k_img.weight",
"blocks.0.ffn.net.0.proj.bias": "blocks.0.ffn.0.bias",
"blocks.0.ffn.net.0.proj.weight": "blocks.0.ffn.0.weight",
"blocks.0.ffn.net.2.bias": "blocks.0.ffn.2.bias",
"blocks.0.ffn.net.2.weight": "blocks.0.ffn.2.weight",
"blocks.0.norm2.bias": "blocks.0.norm3.bias",
"blocks.0.norm2.weight": "blocks.0.norm3.weight",
"blocks.0.scale_shift_table": "blocks.0.modulation",
"condition_embedder.text_embedder.linear_1.bias": "text_embedding.0.bias",
"condition_embedder.text_embedder.linear_1.weight": "text_embedding.0.weight",
"condition_embedder.text_embedder.linear_2.bias": "text_embedding.2.bias",
"condition_embedder.text_embedder.linear_2.weight": "text_embedding.2.weight",
"condition_embedder.time_embedder.linear_1.bias": "time_embedding.0.bias",
"condition_embedder.time_embedder.linear_1.weight": "time_embedding.0.weight",
"condition_embedder.time_embedder.linear_2.bias": "time_embedding.2.bias",
"condition_embedder.time_embedder.linear_2.weight": "time_embedding.2.weight",
"condition_embedder.time_proj.bias": "time_projection.1.bias",
"condition_embedder.time_proj.weight": "time_projection.1.weight",
"condition_embedder.image_embedder.ff.net.0.proj.bias":"img_emb.proj.1.bias",
"condition_embedder.image_embedder.ff.net.0.proj.weight":"img_emb.proj.1.weight",
"condition_embedder.image_embedder.ff.net.2.bias":"img_emb.proj.3.bias",
"condition_embedder.image_embedder.ff.net.2.weight":"img_emb.proj.3.weight",
"condition_embedder.image_embedder.norm1.bias":"img_emb.proj.0.bias",
"condition_embedder.image_embedder.norm1.weight":"img_emb.proj.0.weight",
"condition_embedder.image_embedder.norm2.bias":"img_emb.proj.4.bias",
"condition_embedder.image_embedder.norm2.weight":"img_emb.proj.4.weight",
"patch_embedding.bias": "patch_embedding.bias",
"patch_embedding.weight": "patch_embedding.weight",
"scale_shift_table": "head.modulation",
"proj_out.bias": "head.head.bias",
"proj_out.weight": "head.head.weight",
}
state_dict_ = {}
for name, param in state_dict.items():
if name in rename_dict:
state_dict_[rename_dict[name]] = param
else:
name_ = ".".join(name.split(".")[:1] + ["0"] + name.split(".")[2:])
if name_ in rename_dict:
name_ = rename_dict[name_]
name_ = ".".join(name_.split(".")[:1] + [name.split(".")[1]] + name_.split(".")[2:])
state_dict_[name_] = param
if hash_state_dict_keys(state_dict_) == "cb104773c6c2cb6df4f9529ad5c60d0b":
config = {
"model_type": "t2v",
"patch_size": (1, 2, 2),
"text_len": 512,
"in_dim": 16,
"dim": 5120,
"ffn_dim": 13824,
"freq_dim": 256,
"text_dim": 4096,
"out_dim": 16,
"num_heads": 40,
"num_layers": 40,
"window_size": (-1, -1),
"qk_norm": True,
"cross_attn_norm": True,
"eps": 1e-6,
}
elif hash_state_dict_keys(state_dict_) == "6bfcfb3b342cb286ce886889d519a77e":
config = {
"has_image_input": True,
"patch_size": [1, 2, 2],
"in_dim": 36,
"dim": 5120,
"ffn_dim": 13824,
"freq_dim": 256,
"text_dim": 4096,
"out_dim": 16,
"num_heads": 40,
"num_layers": 40,
"eps": 1e-6
}
else:
config = {}
return state_dict_, config
def from_civitai(self, state_dict):
state_dict = {name: param for name, param in state_dict.items() if not name.startswith("vace")}
state_dict = {name: param for name, param in state_dict.items() if name.split(".")[0] not in ["pose_patch_embedding", "face_adapter", "face_encoder", "motion_encoder"]}
state_dict_ = {}
for name, param in state_dict.items():
if name.startswith("model."):
name = name[len("model."):]
state_dict_[name] = param
state_dict = state_dict_
if hash_state_dict_keys(state_dict) == "9269f8db9040a9d860eaca435be61814":
config = {
"has_image_input": False,
"patch_size": [1, 2, 2],
"in_dim": 16,
"dim": 1536,
"ffn_dim": 8960,
"freq_dim": 256,
"text_dim": 4096,
"out_dim": 16,
"num_heads": 12,
"num_layers": 30,
"eps": 1e-6
}
elif hash_state_dict_keys(state_dict) == "aafcfd9672c3a2456dc46e1cb6e52c70":
config = {
"has_image_input": False,
"patch_size": [1, 2, 2],
"in_dim": 16,
"dim": 5120,
"ffn_dim": 13824,
"freq_dim": 256,
"text_dim": 4096,
"out_dim": 16,
"num_heads": 40,
"num_layers": 40,
"eps": 1e-6
}
elif hash_state_dict_keys(state_dict) == "6bfcfb3b342cb286ce886889d519a77e":
config = {
"has_image_input": True,
"patch_size": [1, 2, 2],
"in_dim": 36,
"dim": 5120,
"ffn_dim": 13824,
"freq_dim": 256,
"text_dim": 4096,
"out_dim": 16,
"num_heads": 40,
"num_layers": 40,
"eps": 1e-6
}
elif hash_state_dict_keys(state_dict) == "6d6ccde6845b95ad9114ab993d917893":
config = {
"has_image_input": True,
"patch_size": [1, 2, 2],
"in_dim": 36,
"dim": 1536,
"ffn_dim": 8960,
"freq_dim": 256,
"text_dim": 4096,
"out_dim": 16,
"num_heads": 12,
"num_layers": 30,
"eps": 1e-6
}
elif hash_state_dict_keys(state_dict) == "349723183fc063b2bfc10bb2835cf677":
# 1.3B PAI control
config = {
"has_image_input": True,
"patch_size": [1, 2, 2],
"in_dim": 48,
"dim": 1536,
"ffn_dim": 8960,
"freq_dim": 256,
"text_dim": 4096,
"out_dim": 16,
"num_heads": 12,
"num_layers": 30,
"eps": 1e-6
}
elif hash_state_dict_keys(state_dict) == "efa44cddf936c70abd0ea28b6cbe946c":
# 14B PAI control
config = {
"has_image_input": True,
"patch_size": [1, 2, 2],
"in_dim": 48,
"dim": 5120,
"ffn_dim": 13824,
"freq_dim": 256,
"text_dim": 4096,
"out_dim": 16,
"num_heads": 40,
"num_layers": 40,
"eps": 1e-6
}
elif hash_state_dict_keys(state_dict) == "3ef3b1f8e1dab83d5b71fd7b617f859f":
config = {
"has_image_input": True,
"patch_size": [1, 2, 2],
"in_dim": 36,
"dim": 5120,
"ffn_dim": 13824,
"freq_dim": 256,
"text_dim": 4096,
"out_dim": 16,
"num_heads": 40,
"num_layers": 40,
"eps": 1e-6,
"has_image_pos_emb": True
}
elif hash_state_dict_keys(state_dict) == "70ddad9d3a133785da5ea371aae09504":
# 1.3B PAI control v1.1
config = {
"has_image_input": True,
"patch_size": [1, 2, 2],
"in_dim": 48,
"dim": 1536,
"ffn_dim": 8960,
"freq_dim": 256,
"text_dim": 4096,
"out_dim": 16,
"num_heads": 12,
"num_layers": 30,
"eps": 1e-6,
"has_ref_conv": True
}
elif hash_state_dict_keys(state_dict) == "26bde73488a92e64cc20b0a7485b9e5b":
# 14B PAI control v1.1
config = {
"has_image_input": True,
"patch_size": [1, 2, 2],
"in_dim": 48,
"dim": 5120,
"ffn_dim": 13824,
"freq_dim": 256,
"text_dim": 4096,
"out_dim": 16,
"num_heads": 40,
"num_layers": 40,
"eps": 1e-6,
"has_ref_conv": True
}
elif hash_state_dict_keys(state_dict) == "ac6a5aa74f4a0aab6f64eb9a72f19901":
# 1.3B PAI control-camera v1.1
config = {
"has_image_input": True,
"patch_size": [1, 2, 2],
"in_dim": 32,
"dim": 1536,
"ffn_dim": 8960,
"freq_dim": 256,
"text_dim": 4096,
"out_dim": 16,
"num_heads": 12,
"num_layers": 30,
"eps": 1e-6,
"has_ref_conv": False,
"add_control_adapter": True,
"in_dim_control_adapter": 24,
}
elif hash_state_dict_keys(state_dict) == "b61c605c2adbd23124d152ed28e049ae":
# 14B PAI control-camera v1.1
config = {
"has_image_input": True,
"patch_size": [1, 2, 2],
"in_dim": 32,
"dim": 5120,
"ffn_dim": 13824,
"freq_dim": 256,
"text_dim": 4096,
"out_dim": 16,
"num_heads": 40,
"num_layers": 40,
"eps": 1e-6,
"has_ref_conv": False,
"add_control_adapter": True,
"in_dim_control_adapter": 24,
}
elif hash_state_dict_keys(state_dict) == "1f5ab7703c6fc803fdded85ff040c316":
# Wan-AI/Wan2.2-TI2V-5B
config = {
"has_image_input": False,
"patch_size": [1, 2, 2],
"in_dim": 48,
"dim": 3072,
"ffn_dim": 14336,
"freq_dim": 256,
"text_dim": 4096,
"out_dim": 48,
"num_heads": 24,
"num_layers": 30,
"eps": 1e-6,
"seperated_timestep": True,
"require_clip_embedding": False,
"require_vae_embedding": False,
"fuse_vae_embedding_in_latents": True,
}
elif hash_state_dict_keys(state_dict) == "5b013604280dd715f8457c6ed6d6a626":
# Wan-AI/Wan2.2-I2V-A14B
config = {
"has_image_input": False,
"patch_size": [1, 2, 2],
"in_dim": 36,
"dim": 5120,
"ffn_dim": 13824,
"freq_dim": 256,
"text_dim": 4096,
"out_dim": 16,
"num_heads": 40,
"num_layers": 40,
"eps": 1e-6,
"require_clip_embedding": False,
}
elif hash_state_dict_keys(state_dict) == "2267d489f0ceb9f21836532952852ee5":
# Wan2.2-Fun-A14B-Control
config = {
"has_image_input": False,
"patch_size": [1, 2, 2],
"in_dim": 52,
"dim": 5120,
"ffn_dim": 13824,
"freq_dim": 256,
"text_dim": 4096,
"out_dim": 16,
"num_heads": 40,
"num_layers": 40,
"eps": 1e-6,
"has_ref_conv": True,
"require_clip_embedding": False,
}
elif hash_state_dict_keys(state_dict) == "47dbeab5e560db3180adf51dc0232fb1":
# Wan2.2-Fun-A14B-Control-Camera
config = {
"has_image_input": False,
"patch_size": [1, 2, 2],
"in_dim": 36,
"dim": 5120,
"ffn_dim": 13824,
"freq_dim": 256,
"text_dim": 4096,
"out_dim": 16,
"num_heads": 40,
"num_layers": 40,
"eps": 1e-6,
"has_ref_conv": False,
"add_control_adapter": True,
"in_dim_control_adapter": 24,
"require_clip_embedding": False,
}
else:
config = {}
return state_dict, config

View File

@@ -874,29 +874,5 @@ class WanImageEncoder(torch.nn.Module):
videos = self.transforms.transforms[-1](videos.mul_(0.5).add_(0.5))
# forward
dtype = next(iter(self.model.visual.parameters())).dtype
videos = videos.to(dtype)
out = self.model.visual(videos, use_31_block=True)
return out
@staticmethod
def state_dict_converter():
return WanImageEncoderStateDictConverter()
class WanImageEncoderStateDictConverter:
def __init__(self):
pass
def from_diffusers(self, state_dict):
return state_dict
def from_civitai(self, state_dict):
state_dict_ = {}
for name, param in state_dict.items():
if name.startswith("textual."):
continue
name = "model." + name
state_dict_[name] = param
return state_dict_

View File

@@ -25,20 +25,3 @@ class WanMotionControllerModel(torch.nn.Module):
state_dict = self.linear[-1].state_dict()
state_dict = {i: state_dict[i] * 0 for i in state_dict}
self.linear[-1].load_state_dict(state_dict)
@staticmethod
def state_dict_converter():
return WanMotionControllerModelDictConverter()
class WanMotionControllerModelDictConverter:
def __init__(self):
pass
def from_diffusers(self, state_dict):
return state_dict
def from_civitai(self, state_dict):
return state_dict

View File

@@ -189,16 +189,3 @@ class WanS2VAudioEncoder(torch.nn.Module):
audio_embed_bucket = audio_embed_bucket.unsqueeze(0).permute(0, 2, 3, 1).to(device, dtype)
audio_embeds = [audio_embed_bucket[..., i * batch_frames:(i + 1) * batch_frames] for i in range(min_batch_num)]
return audio_embeds
@staticmethod
def state_dict_converter():
return WanS2VAudioEncoderStateDictConverter()
class WanS2VAudioEncoderStateDictConverter():
def __init__(self):
pass
def from_civitai(self, state_dict):
state_dict = {'model.' + k: v for k, v in state_dict.items()}
return state_dict

View File

@@ -9,6 +9,7 @@ from PIL import Image
from tqdm import tqdm
from typing import Optional
from typing_extensions import Literal
from transformers import Wav2Vec2Processor
from ..diffusion import FlowMatchScheduler
from ..core import ModelConfig, gradient_checkpoint_forward
@@ -23,6 +24,7 @@ from ..models.wan_video_vace import VaceWanModel
from ..models.wan_video_motion_controller import WanMotionControllerModel
from ..models.wan_video_animate_adapter import WanAnimateAdapter
from ..models.wan_video_mot import MotWanModel
from ..models.wav2vec import WanS2VAudioEncoder
from ..models.longcat_video_dit import LongCatVideoTransformer3DModel
@@ -35,6 +37,7 @@ class WanVideoPipeline(BasePipeline):
)
self.scheduler = FlowMatchScheduler(shift=5, sigma_min=0.0, extra_one_step=True)
self.tokenizer: HuggingfaceTokenizer = None
self.audio_processor: Wav2Vec2Processor = None
self.text_encoder: WanTextEncoder = None
self.image_encoder: WanImageEncoder = None
self.dit: WanModel = None
@@ -45,6 +48,7 @@ class WanVideoPipeline(BasePipeline):
self.vace2: VaceWanModel = None
self.vap: MotWanModel = None
self.animate_adapter: WanAnimateAdapter = None
self.audio_encoder: WanS2VAudioEncoder = None
self.in_iteration_models = ("dit", "motion_controller", "vace", "animate_adapter", "vap")
self.in_iteration_models_2 = ("dit2", "motion_controller", "vace2", "animate_adapter", "vap")
self.units = [
@@ -96,7 +100,7 @@ class WanVideoPipeline(BasePipeline):
torch_dtype: torch.dtype = torch.bfloat16,
device: Union[str, torch.device] = "cuda",
model_configs: list[ModelConfig] = [],
tokenizer_config: ModelConfig = ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="google/*"),
tokenizer_config: ModelConfig = ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="google/umt5-xxl/"),
audio_processor_config: ModelConfig = None,
redirect_common_files: bool = True,
use_usp: bool = False,
@@ -105,16 +109,18 @@ class WanVideoPipeline(BasePipeline):
# Redirect model path
if redirect_common_files:
redirect_dict = {
"models_t5_umt5-xxl-enc-bf16.pth": "Wan-AI/Wan2.1-T2V-1.3B",
"Wan2.1_VAE.pth": "Wan-AI/Wan2.1-T2V-1.3B",
"models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth": "Wan-AI/Wan2.1-I2V-14B-480P",
"models_t5_umt5-xxl-enc-bf16.pth": ("DiffSynth-Studio/Wan-Series-Converted-Safetensors", "models_t5_umt5-xxl-enc-bf16.safetensors"),
"models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth": ("DiffSynth-Studio/Wan-Series-Converted-Safetensors", "models_clip_open-clip-xlm-roberta-large-vit-huge-14.safetensors"),
"Wan2.1_VAE.pth": ("DiffSynth-Studio/Wan-Series-Converted-Safetensors", "Wan2.1_VAE.safetensors"),
"Wan2.2_VAE.pth": ("DiffSynth-Studio/Wan-Series-Converted-Safetensors", "Wan2.2_VAE.safetensors"),
}
for model_config in model_configs:
if model_config.origin_file_pattern is None or model_config.model_id is None:
continue
if model_config.origin_file_pattern in redirect_dict and model_config.model_id != redirect_dict[model_config.origin_file_pattern]:
print(f"To avoid repeatedly downloading model files, ({model_config.model_id}, {model_config.origin_file_pattern}) is redirected to ({redirect_dict[model_config.origin_file_pattern]}, {model_config.origin_file_pattern}). You can use `redirect_common_files=False` to disable file redirection.")
model_config.model_id = redirect_dict[model_config.origin_file_pattern]
if model_config.origin_file_pattern in redirect_dict and model_config.model_id != redirect_dict[model_config.origin_file_pattern][0]:
print(f"To avoid repeatedly downloading model files, ({model_config.model_id}, {model_config.origin_file_pattern}) is redirected to {redirect_dict[model_config.origin_file_pattern]}. You can use `redirect_common_files=False` to disable file redirection.")
model_config.model_id = redirect_dict[model_config.origin_file_pattern][0]
model_config.origin_file_pattern = redirect_dict[model_config.origin_file_pattern][1]
# Initialize pipeline
pipe = WanVideoPipeline(device=device, torch_dtype=torch_dtype)
@@ -153,11 +159,13 @@ class WanVideoPipeline(BasePipeline):
pipe.tokenizer = HuggingfaceTokenizer(name=tokenizer_config.path, seq_len=512, clean='whitespace')
if audio_processor_config is not None:
audio_processor_config.download_if_necessary()
from transformers import Wav2Vec2Processor
pipe.audio_processor = Wav2Vec2Processor.from_pretrained(audio_processor_config.path)
# Unified Sequence Parallel
if use_usp: pipe.enable_usp()
# VRAM Management
pipe.vram_management_enabled = pipe.check_vram_management_state()
return pipe
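
A hedged end-to-end sketch of the new audio processor hook (the loader entry point, import path, and the audio processor model id below are assumptions, not taken from this diff): when audio_processor_config is provided, the pipeline downloads it if necessary and builds a Wav2Vec2Processor from its path.

import torch
from diffsynth.core import ModelConfig
from diffsynth.pipelines.wan_video_new import WanVideoPipeline  # import path assumed; adjust to the package layout

pipe = WanVideoPipeline.from_pretrained(  # from_pretrained assumed to be the loader entry point
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors"),
    ],
    audio_processor_config=ModelConfig(model_id="facebook/wav2vec2-base-960h", origin_file_pattern="*"),  # placeholder id/pattern
)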

View File

@@ -62,9 +62,10 @@ def WanVideoMotStateDictConverter(state_dict):
for name in state_dict:
if "_mot_ref" not in name:
continue
param = state_dict[name]
name = name.replace("_mot_ref", "")
if name in rename_dict:
state_dict_[rename_dict[name]] = state_dict[name]
state_dict_[rename_dict[name]] = param
else:
if name.split(".")[1].isdigit():
block_id = int(name.split(".")[1])
@@ -73,5 +74,5 @@ def WanVideoMotStateDictConverter(state_dict):
if name_ in rename_dict:
name_ = rename_dict[name_]
name_ = ".".join(name_.split(".")[:1] + [name.split(".")[1]] + name_.split(".")[2:])
state_dict_[name_] = state_dict[name]
state_dict_[name_] = param
return state_dict_

View File

@@ -1,3 +1,12 @@
def WanS2VAudioEncoderStateDictConverter(state_dict):
state_dict = {'model.' + k: state_dict[k] for k in state_dict}
return state_dict
rename_dict = {
"model.wav2vec2.encoder.pos_conv_embed.conv.weight_g": "model.wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0",
"model.wav2vec2.encoder.pos_conv_embed.conv.weight_v": "model.wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1",
}
state_dict_ = {}
for name in state_dict:
name_ = "model." + name
if name_ in rename_dict:
name_ = rename_dict[name_]
state_dict_[name_] = state_dict[name]
return state_dict_
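
The rename table above maps the legacy torch weight-norm parameter names (weight_g / weight_v) to the parametrization-based names (parametrizations.weight.original0 / original1) used by newer torch releases, so older Wav2Vec checkpoints load into the updated module layout. A quick check of the converter on a hypothetical entry:

import torch

old = {"wav2vec2.encoder.pos_conv_embed.conv.weight_g": torch.zeros(1)}
new = WanS2VAudioEncoderStateDictConverter(old)
assert "model.wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0" in new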