update wan training

support torch<2.6.0
update torch version
2026-04-08 17:18:21 +00:00 · 2025-06-16 15:46:20 +08:00 · 2025-06-16 13:05:54 +08:00 · 2025-06-16 12:35:21 +08:00 · 2025-06-16 10:57:33 +08:00 · 2025-06-13 14:13:38 +08:00
135 changed files with 6808 additions and 2129 deletions
--- a/diffsynth/configs/model_config.py
+++ b/diffsynth/configs/model_config.py
@@ -60,6 +60,9 @@ from ..models.wan_video_text_encoder import WanTextEncoder
 from ..models.wan_video_image_encoder import WanImageEncoder
 from ..models.wan_video_vae import WanVideoVAE
 from ..models.wan_video_motion_controller import WanMotionControllerModel
+from ..models.wan_video_vace import VaceWanModel
+
+from ..models.step1x_connector import Qwen2Connector


 model_loader_configs = [
@@ -97,6 +100,7 @@ model_loader_configs = [
    (None, "57b02550baab820169365b3ee3afa2c9", ["flux_dit"], [FluxDiT], "civitai"),
    (None, "3394f306c4cbf04334b712bf5aaed95f", ["flux_dit"], [FluxDiT], "civitai"),
    (None, "023f054d918a84ccf503481fd1e3379e", ["flux_dit"], [FluxDiT], "civitai"),
+    (None, "d02f41c13549fa5093d3521f62a5570a", ["flux_dit"], [FluxDiT], "civitai"),
    (None, "605c56eab23e9e2af863ad8f0813a25d", ["flux_dit"], [FluxDiT], "diffusers"),
    (None, "280189ee084bca10f70907bf6ce1649d", ["cog_vae_encoder", "cog_vae_decoder"], [CogVAEEncoder, CogVAEDecoder], "diffusers"),
    (None, "9b9313d104ac4df27991352fec013fd4", ["rife"], [IFNet], "civitai"),
@@ -107,6 +111,7 @@ model_loader_configs = [
    (None, "52357cb26250681367488a8954c271e8", ["flux_controlnet"], [FluxControlNet], "diffusers"),
    (None, "0cfd1740758423a2a854d67c136d1e8c", ["flux_controlnet"], [FluxControlNet], "diffusers"),
    (None, "7f9583eb8ba86642abb9a21a4b2c9e16", ["flux_controlnet"], [FluxControlNet], "diffusers"),
+    (None, "43ad5aaa27dd4ee01b832ed16773fa52", ["flux_controlnet"], [FluxControlNet], "diffusers"),
    (None, "c07c0f04f5ff55e86b4e937c7a40d481", ["infiniteyou_image_projector"], [InfiniteYouImageProjector], "diffusers"),
    (None, "4daaa66cc656a8fe369908693dad0a35", ["flux_ipadapter"], [FluxIpAdapter], "diffusers"),
    (None, "51aed3d27d482fceb5e0739b03060e8f", ["sd3_dit", "sd3_vae_encoder", "sd3_vae_decoder"], [SD3DiT, SD3VAEEncoder, SD3VAEDecoder], "civitai"),
@@ -125,12 +130,20 @@ model_loader_configs = [
    (None, "6bfcfb3b342cb286ce886889d519a77e", ["wan_video_dit"], [WanModel], "civitai"),
    (None, "349723183fc063b2bfc10bb2835cf677", ["wan_video_dit"], [WanModel], "civitai"),
    (None, "efa44cddf936c70abd0ea28b6cbe946c", ["wan_video_dit"], [WanModel], "civitai"),
+    (None, "3ef3b1f8e1dab83d5b71fd7b617f859f", ["wan_video_dit"], [WanModel], "civitai"),
+    (None, "70ddad9d3a133785da5ea371aae09504", ["wan_video_dit"], [WanModel], "civitai"),
+    (None, "26bde73488a92e64cc20b0a7485b9e5b", ["wan_video_dit"], [WanModel], "civitai"),
+    (None, "ac6a5aa74f4a0aab6f64eb9a72f19901", ["wan_video_dit"], [WanModel], "civitai"), 
+    (None, "b61c605c2adbd23124d152ed28e049ae", ["wan_video_dit"], [WanModel], "civitai"), 
+    (None, "a61453409b67cd3246cf0c3bebad47ba", ["wan_video_dit", "wan_video_vace"], [WanModel, VaceWanModel], "civitai"),
+    (None, "7a513e1f257a861512b1afd387a8ecd9", ["wan_video_dit", "wan_video_vace"], [WanModel, VaceWanModel], "civitai"),
    (None, "cb104773c6c2cb6df4f9529ad5c60d0b", ["wan_video_dit"], [WanModel], "diffusers"),
    (None, "9c8818c2cbea55eca56c7b447df170da", ["wan_video_text_encoder"], [WanTextEncoder], "civitai"),
    (None, "5941c53e207d62f20f9025686193c40b", ["wan_video_image_encoder"], [WanImageEncoder], "civitai"),
    (None, "1378ea763357eea97acdef78e65d6d96", ["wan_video_vae"], [WanVideoVAE], "civitai"),
    (None, "ccc42284ea13e1ad04693284c7a09be6", ["wan_video_vae"], [WanVideoVAE], "civitai"),
    (None, "dbd5ec76bbf977983f972c151d545389", ["wan_video_motion_controller"], [WanMotionControllerModel], "civitai"),
+    (None, "d30fb9e02b1dbf4e509142f05cf7dd50", ["flux_dit", "step1x_connector"], [FluxDiT, Qwen2Connector], "civitai"),
 ]
 huggingface_model_loader_configs = [
    # These configs are provided for detecting model type automatically.
@@ -146,6 +159,7 @@ huggingface_model_loader_configs = [
    ("LlamaForCausalLM", "diffsynth.models.hunyuan_video_text_encoder", "hunyuan_video_text_encoder_2", "HunyuanVideoLLMEncoder"),
    ("LlavaForConditionalGeneration", "diffsynth.models.hunyuan_video_text_encoder", "hunyuan_video_text_encoder_2", "HunyuanVideoMLLMEncoder"),
    ("Step1Model", "diffsynth.models.stepvideo_text_encoder", "stepvideo_text_encoder_2", "STEP1TextEncoder"),
+    ("Qwen2_5_VLForConditionalGeneration", "diffsynth.models.qwenvl", "qwenvl", "Qwen25VL_7b_Embedder"),
 ]
 patch_model_loader_configs = [
    # These configs are provided for detecting model type automatically.
--- a/diffsynth/data/image_pulse.py
+++ b/diffsynth/data/image_pulse.py
@@ -1,125 +0,0 @@
-import torch, os, json, torchvision
-from PIL import Image
-from torchvision.transforms import v2
-
-
-
-class SingleTaskDataset(torch.utils.data.Dataset):
-    def __init__(self, base_path, keys=(("image_1", "image_2", "editing_instruction"), ("image_2", "image_1", "reverse_editing_instruction")), height=1024, width=1024, random=True, steps_per_epoch=1000, metadata_path=None):
-        self.base_path = base_path
-        self.keys = keys
-        self.metadata = []
-        self.bad_data = []
-        self.height = height
-        self.width = width
-        self.random = random
-        self.steps_per_epoch = steps_per_epoch
-        self.image_process = v2.Compose([
-            v2.CenterCrop(size=(height, width)),
-            v2.ToImage(),
-            v2.ToDtype(torch.float32, scale=True),
-            v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
-        ])
-        if metadata_path is None:
-            self.search_for_data("", report_data_log=True)
-            self.report_data_log()
-        else:
-            with open(metadata_path, "r", encoding="utf-8-sig") as f:
-                self.metadata = json.load(f)
-
-
-    def report_data_log(self):
-        print(f"{len(self.metadata)} valid data, {len(self.bad_data)} invalid data.")
-
-
-    def dump_metadata(self, path):
-        with open(path, "w", encoding="utf-8") as f:
-            json.dump(self.metadata, f, ensure_ascii=False, indent=4)
-        
-        
-    def parse_json_file(self, absolute_path, relative_path):
-        data_list = []
-        with open(absolute_path, "r") as f:
-            metadata = json.load(f)
-            for image_1, image_2, instruction in self.keys:
-                image_1 = os.path.join(relative_path, metadata[image_1])
-                image_2 = os.path.join(relative_path, metadata[image_2])
-                instruction = metadata[instruction]
-                data_list.append((image_1, image_2, instruction))
-        return data_list
-    
-        
-    def search_for_data(self, path, report_data_log=False):
-        now_path = os.path.join(self.base_path, path)
-        if os.path.isfile(now_path) and path.endswith(".json"):
-            try:
-                data_list = self.parse_json_file(now_path, os.path.dirname(path))
-                self.metadata.extend(data_list)
-            except:
-                self.bad_data.append(now_path)
-        elif os.path.isdir(now_path):
-            for sub_path in os.listdir(now_path):
-                self.search_for_data(os.path.join(path, sub_path))
-                if report_data_log and os.path.isdir(os.path.join(self.base_path, path, sub_path)):
-                    self.report_data_log()
-                
-                
-    def load_image(self, image_path):
-        image_path = os.path.join(self.base_path, image_path)
-        image = Image.open(image_path).convert("RGB")
-        width, height = image.size
-        scale = max(self.width / width, self.height / height)
-        image = torchvision.transforms.functional.resize(
-            image,
-            (round(height*scale), round(width*scale)),
-            interpolation=torchvision.transforms.InterpolationMode.BILINEAR
-        )
-        image = self.image_process(image)
-        return image
-                
-                
-    def load_data(self, data_id):
-        image_1, image_2, instruction = self.metadata[data_id]
-        image_1 = self.load_image(image_1)
-        image_2 = self.load_image(image_2)
-        return {"image_1": image_1, "image_2": image_2, "instruction": instruction}
-        
-        
-    def __getitem__(self, data_id):
-        if self.random:
-            while True:
-                try:
-                    data_id = (torch.randint(0, len(self.metadata), (1,))[0] + data_id) % len(self.metadata)
-                    data = self.load_data(data_id)
-                    return data
-                except:
-                    continue
-        else:
-            return self.load_data(data_id)
-
-
-    def __len__(self):
-        return self.steps_per_epoch if self.random else len(self.metadata)
-    
-    
-    
-class MultiTaskDataset(torch.utils.data.Dataset):
-    def __init__(self, dataset_list, dataset_weight, steps_per_epoch=1000):
-        self.dataset_list = dataset_list
-        self.dataset_weight = torch.tensor(dataset_weight, dtype=torch.float)
-        self.steps_per_epoch = steps_per_epoch
-
-        
-    def __getitem__(self, data_id):
-        while True:
-            try:
-                dataset_id = torch.multinomial(self.dataset_weight, 1).tolist()[0]
-                data_id = torch.randint(0, len(self.dataset_list[dataset_id]), (1,))[0]
-                data = self.dataset_list[dataset_id][data_id]
-                return data
-            except:
-                continue
-
-
-    def __len__(self):
-        return self.steps_per_epoch
--- a/diffsynth/lora/init.py
+++ b/diffsynth/lora/init.py
@@ -0,0 +1,45 @@
+import torch
+
+
+
+class GeneralLoRALoader:
+    def __init__(self, device="cpu", torch_dtype=torch.float32):
+        self.device = device
+        self.torch_dtype = torch_dtype
+    
+    
+    def get_name_dict(self, lora_state_dict):
+        lora_name_dict = {}
+        for key in lora_state_dict:
+            if ".lora_B." not in key:
+                continue
+            keys = key.split(".")
+            if len(keys) > keys.index("lora_B") + 2:
+                keys.pop(keys.index("lora_B") + 1)
+            keys.pop(keys.index("lora_B"))
+            if keys[0] == "diffusion_model":
+                keys.pop(0)
+            keys.pop(-1)
+            target_name = ".".join(keys)
+            lora_name_dict[target_name] = (key, key.replace(".lora_B.", ".lora_A."))
+        return lora_name_dict
+
+
+    def load(self, model: torch.nn.Module, state_dict_lora, alpha=1.0):
+        updated_num = 0
+        lora_name_dict = self.get_name_dict(state_dict_lora)
+        for name, module in model.named_modules():
+            if name in lora_name_dict:
+                weight_up = state_dict_lora[lora_name_dict[name][0]].to(device=self.device, dtype=self.torch_dtype)
+                weight_down = state_dict_lora[lora_name_dict[name][1]].to(device=self.device, dtype=self.torch_dtype)
+                if len(weight_up.shape) == 4:
+                    weight_up = weight_up.squeeze(3).squeeze(2)
+                    weight_down = weight_down.squeeze(3).squeeze(2)
+                    weight_lora = alpha * torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3)
+                else:
+                    weight_lora = alpha * torch.mm(weight_up, weight_down)
+                state_dict = module.state_dict()
+                state_dict["weight"] = state_dict["weight"].to(device=self.device, dtype=self.torch_dtype) + weight_lora
+                module.load_state_dict(state_dict)
+                updated_num += 1
+        print(f"{updated_num} tensors are updated by LoRA.")
--- a/diffsynth/models/flux_controlnet.py
+++ b/diffsynth/models/flux_controlnet.py
@@ -320,6 +320,8 @@ class FluxControlNetStateDictConverter:
            extra_kwargs = {"num_joint_blocks": 4, "num_single_blocks": 1}
        elif hash_value == "7f9583eb8ba86642abb9a21a4b2c9e16":
            extra_kwargs = {"num_joint_blocks": 4, "num_single_blocks": 10}
+        elif hash_value == "43ad5aaa27dd4ee01b832ed16773fa52":
+            extra_kwargs = {"num_joint_blocks": 6, "num_single_blocks": 0}
        else:
            extra_kwargs = {}
        return state_dict_, extra_kwargs
--- a/diffsynth/models/flux_dit.py
+++ b/diffsynth/models/flux_dit.py
@@ -20,11 +20,10 @@ class RoPEEmbedding(torch.nn.Module):
        self.axes_dim = axes_dim


-    def rope(self, pos: torch.Tensor, dim: int, theta: int, device="cpu") -> torch.Tensor:
+    def rope(self, pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
        assert dim % 2 == 0, "The dimension must be even."

        scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
-        scale = scale.to(device)
        omega = 1.0 / (theta**scale)

        batch_size, seq_length = pos.shape
@@ -37,9 +36,9 @@ class RoPEEmbedding(torch.nn.Module):
        return out.float()


-    def forward(self, ids, device="cpu"):
+    def forward(self, ids):
        n_axes = ids.shape[-1]
-        emb = torch.cat([self.rope(ids[..., i], self.axes_dim[i], self.theta, device) for i in range(n_axes)], dim=-3)
+        emb = torch.cat([self.rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)], dim=-3)
        return emb.unsqueeze(1)


@@ -277,20 +276,22 @@ class AdaLayerNormContinuous(torch.nn.Module):


 class FluxDiT(torch.nn.Module):
-    def __init__(self, disable_guidance_embedder=False):
+    def __init__(self, disable_guidance_embedder=False, input_dim=64, num_blocks=19):
        super().__init__()
        self.pos_embedder = RoPEEmbedding(3072, 10000, [16, 56, 56])
        self.time_embedder = TimestepEmbeddings(256, 3072)
        self.guidance_embedder = None if disable_guidance_embedder else TimestepEmbeddings(256, 3072)
        self.pooled_text_embedder = torch.nn.Sequential(torch.nn.Linear(768, 3072), torch.nn.SiLU(), torch.nn.Linear(3072, 3072))
        self.context_embedder = torch.nn.Linear(4096, 3072)
-        self.x_embedder = torch.nn.Linear(64, 3072)
+        self.x_embedder = torch.nn.Linear(input_dim, 3072)

-        self.blocks = torch.nn.ModuleList([FluxJointTransformerBlock(3072, 24) for _ in range(19)])
+        self.blocks = torch.nn.ModuleList([FluxJointTransformerBlock(3072, 24) for _ in range(num_blocks)])
        self.single_blocks = torch.nn.ModuleList([FluxSingleTransformerBlock(3072, 24) for _ in range(38)])

        self.final_norm_out = AdaLayerNormContinuous(3072)
        self.final_proj_out = torch.nn.Linear(3072, 64)
+        
+        self.input_dim = input_dim


    def patchify(self, hidden_states):
@@ -739,5 +740,7 @@ class FluxDiTStateDictConverter:
                pass
        if "guidance_embedder.timestep_embedder.0.weight" not in state_dict_:
            return state_dict_, {"disable_guidance_embedder": True}
+        elif "blocks.8.attn.norm_k_a.weight" not in state_dict_:
+            return state_dict_, {"input_dim": 196, "num_blocks": 8}
        else:
            return state_dict_
--- a/diffsynth/models/flux_reference_embedder.py
+++ b/diffsynth/models/flux_reference_embedder.py
@@ -1,31 +0,0 @@
-from .sd3_dit import TimestepEmbeddings
-from .flux_dit import RoPEEmbedding
-import torch
-from einops import repeat
-
-
-class FluxReferenceEmbedder(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.pos_embedder = RoPEEmbedding(3072, 10000, [16, 56, 56])
-        self.idx_embedder = TimestepEmbeddings(256, 256)
-        self.proj = torch.nn.Linear(3072, 3072)
-        
-    def forward(self, image_ids, idx, dtype, device):
-        pos_emb = self.pos_embedder(image_ids, device=device)
-        idx_emb = self.idx_embedder(idx, dtype=dtype).to(device)
-        length = pos_emb.shape[2]
-        pos_emb = repeat(pos_emb, "B N L C H W -> 1 N (B L) C H W")
-        idx_emb = repeat(idx_emb, "B (C H W) -> 1 1 (B L) C H W", C=64, H=2, W=2, L=length)
-        image_rotary_emb = pos_emb + idx_emb
-        return image_rotary_emb
-    
-    def init(self):
-        self.idx_embedder.timestep_embedder[-1].load_state_dict({
-            "weight": torch.zeros((256, 256)),
-            "bias": torch.zeros((256,))
-        }),
-        self.proj.load_state_dict({
-            "weight": torch.eye(3072),
-            "bias": torch.zeros((3072,))
-        })
--- a/diffsynth/models/qwenvl.py
+++ b/diffsynth/models/qwenvl.py
@@ -0,0 +1,168 @@
+import torch
+
+
+class Qwen25VL_7b_Embedder(torch.nn.Module):
+    def __init__(self, model_path, max_length=640, dtype=torch.bfloat16, device="cuda"):
+        super(Qwen25VL_7b_Embedder, self).__init__()
+        self.max_length = max_length
+        self.dtype = dtype
+        self.device = device
+        
+        from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+
+        self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_path,
+            torch_dtype=dtype,
+        ).to(torch.cuda.current_device())
+
+        self.model.requires_grad_(False)
+        self.processor = AutoProcessor.from_pretrained(
+            model_path, min_pixels=256 * 28 * 28, max_pixels=324 * 28 * 28
+        )
+        
+        Qwen25VL_7b_PREFIX = '''Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:
+- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.
+- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.\n
+Here are examples of how to transform or refine prompts:
+- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.
+- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.\n
+Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:
+User Prompt:'''
+
+        self.prefix = Qwen25VL_7b_PREFIX
+        
+    @staticmethod
+    def from_pretrained(path, torch_dtype=torch.bfloat16, device="cuda"):
+        return Qwen25VL_7b_Embedder(path, dtype=torch_dtype, device=device)
+
+    def forward(self, caption, ref_images):
+        text_list = caption
+        embs = torch.zeros(
+            len(text_list),
+            self.max_length,
+            self.model.config.hidden_size,
+            dtype=torch.bfloat16,
+            device=torch.cuda.current_device(),
+        )
+        hidden_states = torch.zeros(
+            len(text_list),
+            self.max_length,
+            self.model.config.hidden_size,
+            dtype=torch.bfloat16,
+            device=torch.cuda.current_device(),
+        )
+        masks = torch.zeros(
+            len(text_list),
+            self.max_length,
+            dtype=torch.long,
+            device=torch.cuda.current_device(),
+        )
+        input_ids_list = []
+        attention_mask_list = []
+        emb_list = []
+
+        def split_string(s):
+            s = s.replace("“", '"').replace("”", '"').replace("'", '''"''')  # use english quotes
+            result = []
+            in_quotes = False
+            temp = ""
+
+            for idx,char in enumerate(s):
+                if char == '"' and idx>155:
+                    temp += char
+                    if not in_quotes:
+                        result.append(temp)
+                        temp = ""
+
+                    in_quotes = not in_quotes
+                    continue
+                if in_quotes:
+                    if char.isspace():
+                        pass  # have space token
+
+                    result.append("“" + char + "”")
+                else:
+                    temp += char
+
+            if temp:
+                result.append(temp)
+
+            return result
+
+        for idx, (txt, imgs) in enumerate(zip(text_list, ref_images)):
+
+            messages = [{"role": "user", "content": []}]
+
+            messages[0]["content"].append({"type": "text", "text": f"{self.prefix}"})
+
+            messages[0]["content"].append({"type": "image", "image": imgs})
+
+            # 再添加 text
+            messages[0]["content"].append({"type": "text", "text": f"{txt}"})
+
+            # Preparation for inference
+            text = self.processor.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=True, add_vision_id=True
+            )
+
+            image_inputs = [imgs]
+
+            inputs = self.processor(
+                text=[text],
+                images=image_inputs,
+                padding=True,
+                return_tensors="pt",
+            )
+
+            old_inputs_ids = inputs.input_ids
+            text_split_list = split_string(text)
+
+            token_list = []
+            for text_each in text_split_list:
+                txt_inputs = self.processor(
+                    text=text_each,
+                    images=None,
+                    videos=None,
+                    padding=True,
+                    return_tensors="pt",
+                )
+                token_each = txt_inputs.input_ids
+                if token_each[0][0] == 2073 and token_each[0][-1] == 854:
+                    token_each = token_each[:, 1:-1]
+                    token_list.append(token_each)
+                else:
+                    token_list.append(token_each)
+
+            new_txt_ids = torch.cat(token_list, dim=1).to("cuda")
+
+            new_txt_ids = new_txt_ids.to(old_inputs_ids.device)
+
+            idx1 = (old_inputs_ids == 151653).nonzero(as_tuple=True)[1][0]
+            idx2 = (new_txt_ids == 151653).nonzero(as_tuple=True)[1][0]
+            inputs.input_ids = (
+                torch.cat([old_inputs_ids[0, :idx1], new_txt_ids[0, idx2:]], dim=0)
+                .unsqueeze(0)
+                .to("cuda")
+            )
+            inputs.attention_mask = (inputs.input_ids > 0).long().to("cuda")
+            outputs = self.model(
+                input_ids=inputs.input_ids,
+                attention_mask=inputs.attention_mask,
+                pixel_values=inputs.pixel_values.to("cuda"),
+                image_grid_thw=inputs.image_grid_thw.to("cuda"),
+                output_hidden_states=True,
+            )
+
+            emb = outputs["hidden_states"][-1]
+
+            embs[idx, : min(self.max_length, emb.shape[1] - 217)] = emb[0, 217:][
+                : self.max_length
+            ]
+
+            masks[idx, : min(self.max_length, emb.shape[1] - 217)] = torch.ones(
+                (min(self.max_length, emb.shape[1] - 217)),
+                dtype=torch.long,
+                device=torch.cuda.current_device(),
+            )
+
+        return embs, masks
--- a/diffsynth/models/step1x_connector.py
+++ b/diffsynth/models/step1x_connector.py
@@ -0,0 +1,683 @@
+from typing import Optional
+
+import torch, math
+import torch.nn
+from einops import rearrange
+from torch import nn
+from functools import partial
+from einops import rearrange
+
+
+
+def attention(q, k, v, attn_mask, mode="torch"):
+    q = q.transpose(1, 2)
+    k = k.transpose(1, 2)
+    v = v.transpose(1, 2)
+    x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
+    x = rearrange(x, "b n s d -> b s (n d)")
+    return x
+    
+
+
+class MLP(nn.Module):
+    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
+
+    def __init__(
+        self,
+        in_channels,
+        hidden_channels=None,
+        out_features=None,
+        act_layer=nn.GELU,
+        norm_layer=None,
+        bias=True,
+        drop=0.0,
+        use_conv=False,
+        device=None,
+        dtype=None,
+    ):
+        super().__init__()
+        out_features = out_features or in_channels
+        hidden_channels = hidden_channels or in_channels
+        bias = (bias, bias)
+        drop_probs = (drop, drop)
+        linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear
+
+        self.fc1 = linear_layer(
+            in_channels, hidden_channels, bias=bias[0], device=device, dtype=dtype
+        )
+        self.act = act_layer()
+        self.drop1 = nn.Dropout(drop_probs[0])
+        self.norm = (
+            norm_layer(hidden_channels, device=device, dtype=dtype)
+            if norm_layer is not None
+            else nn.Identity()
+        )
+        self.fc2 = linear_layer(
+            hidden_channels, out_features, bias=bias[1], device=device, dtype=dtype
+        )
+        self.drop2 = nn.Dropout(drop_probs[1])
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop1(x)
+        x = self.norm(x)
+        x = self.fc2(x)
+        x = self.drop2(x)
+        return x
+    
+    
+class TextProjection(nn.Module):
+    """
+    Projects text embeddings. Also handles dropout for classifier-free guidance.
+
+    Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
+    """
+
+    def __init__(self, in_channels, hidden_size, act_layer, dtype=None, device=None):
+        factory_kwargs = {"dtype": dtype, "device": device}
+        super().__init__()
+        self.linear_1 = nn.Linear(
+            in_features=in_channels,
+            out_features=hidden_size,
+            bias=True,
+            **factory_kwargs,
+        )
+        self.act_1 = act_layer()
+        self.linear_2 = nn.Linear(
+            in_features=hidden_size,
+            out_features=hidden_size,
+            bias=True,
+            **factory_kwargs,
+        )
+
+    def forward(self, caption):
+        hidden_states = self.linear_1(caption)
+        hidden_states = self.act_1(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+    
+    
+class TimestepEmbedder(nn.Module):
+    """
+    Embeds scalar timesteps into vector representations.
+    """
+
+    def __init__(
+        self,
+        hidden_size,
+        act_layer,
+        frequency_embedding_size=256,
+        max_period=10000,
+        out_size=None,
+        dtype=None,
+        device=None,
+    ):
+        factory_kwargs = {"dtype": dtype, "device": device}
+        super().__init__()
+        self.frequency_embedding_size = frequency_embedding_size
+        self.max_period = max_period
+        if out_size is None:
+            out_size = hidden_size
+
+        self.mlp = nn.Sequential(
+            nn.Linear(
+                frequency_embedding_size, hidden_size, bias=True, **factory_kwargs
+            ),
+            act_layer(),
+            nn.Linear(hidden_size, out_size, bias=True, **factory_kwargs),
+        )
+        nn.init.normal_(self.mlp[0].weight, std=0.02)  # type: ignore
+        nn.init.normal_(self.mlp[2].weight, std=0.02)  # type: ignore
+
+    @staticmethod
+    def timestep_embedding(t, dim, max_period=10000):
+        """
+        Create sinusoidal timestep embeddings.
+
+        Args:
+            t (torch.Tensor): a 1-D Tensor of N indices, one per batch element. These may be fractional.
+            dim (int): the dimension of the output.
+            max_period (int): controls the minimum frequency of the embeddings.
+
+        Returns:
+            embedding (torch.Tensor): An (N, D) Tensor of positional embeddings.
+
+        .. ref_link: https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
+        """
+        half = dim // 2
+        freqs = torch.exp(
+            -math.log(max_period)
+            * torch.arange(start=0, end=half, dtype=torch.float32)
+            / half
+        ).to(device=t.device)
+        args = t[:, None].float() * freqs[None]
+        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+        if dim % 2:
+            embedding = torch.cat(
+                [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
+            )
+        return embedding
+
+    def forward(self, t):
+        t_freq = self.timestep_embedding(
+            t, self.frequency_embedding_size, self.max_period
+        ).type(self.mlp[0].weight.dtype)  # type: ignore
+        t_emb = self.mlp(t_freq)
+        return t_emb
+    
+    
+def apply_gate(x, gate=None, tanh=False):
+    """AI is creating summary for apply_gate
+
+    Args:
+        x (torch.Tensor): input tensor.
+        gate (torch.Tensor, optional): gate tensor. Defaults to None.
+        tanh (bool, optional): whether to use tanh function. Defaults to False.
+
+    Returns:
+        torch.Tensor: the output tensor after apply gate.
+    """
+    if gate is None:
+        return x
+    if tanh:
+        return x * gate.unsqueeze(1).tanh()
+    else:
+        return x * gate.unsqueeze(1)
+
+
+class RMSNorm(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        elementwise_affine=True,
+        eps: float = 1e-6,
+        device=None,
+        dtype=None,
+    ):
+        """
+        Initialize the RMSNorm normalization layer.
+
+        Args:
+            dim (int): The dimension of the input tensor.
+            eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
+
+        Attributes:
+            eps (float): A small value added to the denominator for numerical stability.
+            weight (nn.Parameter): Learnable scaling parameter.
+
+        """
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.eps = eps
+        if elementwise_affine:
+            self.weight = nn.Parameter(torch.ones(dim, **factory_kwargs))
+
+    def _norm(self, x):
+        """
+        Apply the RMSNorm normalization to the input tensor.
+
+        Args:
+            x (torch.Tensor): The input tensor.
+
+        Returns:
+            torch.Tensor: The normalized tensor.
+
+        """
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+    def forward(self, x):
+        """
+        Forward pass through the RMSNorm layer.
+
+        Args:
+            x (torch.Tensor): The input tensor.
+
+        Returns:
+            torch.Tensor: The output tensor after applying RMSNorm.
+
+        """
+        output = self._norm(x.float()).type_as(x)
+        if hasattr(self, "weight"):
+            output = output * self.weight
+        return output
+
+
+def get_norm_layer(norm_layer):
+    """
+    Get the normalization layer.
+
+    Args:
+        norm_layer (str): The type of normalization layer.
+
+    Returns:
+        norm_layer (nn.Module): The normalization layer.
+    """
+    if norm_layer == "layer":
+        return nn.LayerNorm
+    elif norm_layer == "rms":
+        return RMSNorm
+    else:
+        raise NotImplementedError(f"Norm layer {norm_layer} is not implemented")
+
+
+def get_activation_layer(act_type):
+    """get activation layer
+
+    Args:
+        act_type (str): the activation type
+
+    Returns:
+        torch.nn.functional: the activation layer
+    """
+    if act_type == "gelu":
+        return lambda: nn.GELU()
+    elif act_type == "gelu_tanh":
+        return lambda: nn.GELU(approximate="tanh")
+    elif act_type == "relu":
+        return nn.ReLU
+    elif act_type == "silu":
+        return nn.SiLU
+    else:
+        raise ValueError(f"Unknown activation type: {act_type}")
+
+class IndividualTokenRefinerBlock(torch.nn.Module):
+    def __init__(
+        self,
+        hidden_size,
+        heads_num,
+        mlp_width_ratio: str = 4.0,
+        mlp_drop_rate: float = 0.0,
+        act_type: str = "silu",
+        qk_norm: bool = False,
+        qk_norm_type: str = "layer",
+        qkv_bias: bool = True,
+        need_CA: bool = False,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.need_CA = need_CA
+        self.heads_num = heads_num
+        head_dim = hidden_size // heads_num
+        mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
+
+        self.norm1 = nn.LayerNorm(
+            hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs
+        )
+        self.self_attn_qkv = nn.Linear(
+            hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs
+        )
+        qk_norm_layer = get_norm_layer(qk_norm_type)
+        self.self_attn_q_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+            if qk_norm
+            else nn.Identity()
+        )
+        self.self_attn_k_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+            if qk_norm
+            else nn.Identity()
+        )
+        self.self_attn_proj = nn.Linear(
+            hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs
+        )
+
+        self.norm2 = nn.LayerNorm(
+            hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs
+        )
+        act_layer = get_activation_layer(act_type)
+        self.mlp = MLP(
+            in_channels=hidden_size,
+            hidden_channels=mlp_hidden_dim,
+            act_layer=act_layer,
+            drop=mlp_drop_rate,
+            **factory_kwargs,
+        )
+
+        self.adaLN_modulation = nn.Sequential(
+            act_layer(),
+            nn.Linear(hidden_size, 2 * hidden_size, bias=True, **factory_kwargs),
+        )
+
+        if self.need_CA:
+            self.cross_attnblock=CrossAttnBlock(hidden_size=hidden_size,
+                        heads_num=heads_num,
+                        mlp_width_ratio=mlp_width_ratio,
+                        mlp_drop_rate=mlp_drop_rate,
+                        act_type=act_type,
+                        qk_norm=qk_norm,
+                        qk_norm_type=qk_norm_type,
+                        qkv_bias=qkv_bias,
+                        **factory_kwargs,)
+        # Zero-initialize the modulation
+        nn.init.zeros_(self.adaLN_modulation[1].weight)
+        nn.init.zeros_(self.adaLN_modulation[1].bias)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        c: torch.Tensor,  # timestep_aware_representations + context_aware_representations
+        attn_mask: torch.Tensor = None,
+        y: torch.Tensor = None,
+    ):
+        gate_msa, gate_mlp = self.adaLN_modulation(c).chunk(2, dim=1)
+
+        norm_x = self.norm1(x)
+        qkv = self.self_attn_qkv(norm_x)
+        q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
+        # Apply QK-Norm if needed
+        q = self.self_attn_q_norm(q).to(v)
+        k = self.self_attn_k_norm(k).to(v)
+
+        # Self-Attention
+        attn = attention(q, k, v, mode="torch", attn_mask=attn_mask)
+
+        x = x + apply_gate(self.self_attn_proj(attn), gate_msa)
+        
+        if self.need_CA:
+            x = self.cross_attnblock(x, c, attn_mask, y)
+
+        # FFN Layer
+        x = x + apply_gate(self.mlp(self.norm2(x)), gate_mlp)
+
+        return x
+
+
+
+
+class CrossAttnBlock(torch.nn.Module):
+    def __init__(
+        self,
+        hidden_size,
+        heads_num,
+        mlp_width_ratio: str = 4.0,
+        mlp_drop_rate: float = 0.0,
+        act_type: str = "silu",
+        qk_norm: bool = False,
+        qk_norm_type: str = "layer",
+        qkv_bias: bool = True,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.heads_num = heads_num
+        head_dim = hidden_size // heads_num
+
+        self.norm1 = nn.LayerNorm(
+            hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs
+        )
+        self.norm1_2 = nn.LayerNorm(
+            hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs
+        )
+        self.self_attn_q = nn.Linear(
+            hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs
+        )
+        self.self_attn_kv = nn.Linear(
+            hidden_size, hidden_size*2, bias=qkv_bias, **factory_kwargs
+        )
+        qk_norm_layer = get_norm_layer(qk_norm_type)
+        self.self_attn_q_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+            if qk_norm
+            else nn.Identity()
+        )
+        self.self_attn_k_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+            if qk_norm
+            else nn.Identity()
+        )
+        self.self_attn_proj = nn.Linear(
+            hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs
+        )
+
+        self.norm2 = nn.LayerNorm(
+            hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs
+        )
+        act_layer = get_activation_layer(act_type)
+
+        self.adaLN_modulation = nn.Sequential(
+            act_layer(),
+            nn.Linear(hidden_size, 2 * hidden_size, bias=True, **factory_kwargs),
+        )
+        # Zero-initialize the modulation
+        nn.init.zeros_(self.adaLN_modulation[1].weight)
+        nn.init.zeros_(self.adaLN_modulation[1].bias)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        c: torch.Tensor,  # timestep_aware_representations + context_aware_representations
+        attn_mask: torch.Tensor = None,
+        y: torch.Tensor=None,
+        
+    ):
+        gate_msa, gate_mlp = self.adaLN_modulation(c).chunk(2, dim=1)
+
+        norm_x = self.norm1(x)
+        norm_y = self.norm1_2(y)
+        q = self.self_attn_q(norm_x)
+        q = rearrange(q, "B L (H D) -> B L H D",  H=self.heads_num)
+        kv = self.self_attn_kv(norm_y)
+        k, v = rearrange(kv, "B L (K H D) -> K B L H D", K=2, H=self.heads_num)
+        # Apply QK-Norm if needed
+        q = self.self_attn_q_norm(q).to(v)
+        k = self.self_attn_k_norm(k).to(v)
+
+        # Self-Attention
+        attn = attention(q, k, v, mode="torch", attn_mask=attn_mask)
+
+        x = x + apply_gate(self.self_attn_proj(attn), gate_msa)
+
+        return x
+
+
+
+class IndividualTokenRefiner(torch.nn.Module):
+    def __init__(
+        self,
+        hidden_size,
+        heads_num,
+        depth,
+        mlp_width_ratio: float = 4.0,
+        mlp_drop_rate: float = 0.0,
+        act_type: str = "silu",
+        qk_norm: bool = False,
+        qk_norm_type: str = "layer",
+        qkv_bias: bool = True,
+        need_CA:bool=False,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+    ):  
+        
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.need_CA = need_CA
+        self.blocks = nn.ModuleList(
+            [
+                IndividualTokenRefinerBlock(
+                    hidden_size=hidden_size,
+                    heads_num=heads_num,
+                    mlp_width_ratio=mlp_width_ratio,
+                    mlp_drop_rate=mlp_drop_rate,
+                    act_type=act_type,
+                    qk_norm=qk_norm,
+                    qk_norm_type=qk_norm_type,
+                    qkv_bias=qkv_bias,
+                    need_CA=self.need_CA,
+                    **factory_kwargs,
+                )
+                for _ in range(depth)
+            ]
+        )
+
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        c: torch.LongTensor,
+        mask: Optional[torch.Tensor] = None,
+        y:torch.Tensor=None,
+    ):
+        self_attn_mask = None
+        if mask is not None:
+            batch_size = mask.shape[0]
+            seq_len = mask.shape[1]
+            mask = mask.to(x.device)
+            # batch_size x 1 x seq_len x seq_len
+            self_attn_mask_1 = mask.view(batch_size, 1, 1, seq_len).repeat(
+                1, 1, seq_len, 1
+            )
+            # batch_size x 1 x seq_len x seq_len
+            self_attn_mask_2 = self_attn_mask_1.transpose(2, 3)
+            # batch_size x 1 x seq_len x seq_len, 1 for broadcasting of heads_num
+            self_attn_mask = (self_attn_mask_1 & self_attn_mask_2).bool()
+            # avoids self-attention weight being NaN for padding tokens
+            self_attn_mask[:, :, :, 0] = True
+        
+        
+        for block in self.blocks:
+            x = block(x, c, self_attn_mask,y)
+
+        return x
+
+
+class SingleTokenRefiner(torch.nn.Module):
+    """
+    A single token refiner block for llm text embedding refine.
+    """
+    def __init__(
+        self,
+        in_channels,
+        hidden_size,
+        heads_num,
+        depth,
+        mlp_width_ratio: float = 4.0,
+        mlp_drop_rate: float = 0.0,
+        act_type: str = "silu",
+        qk_norm: bool = False,
+        qk_norm_type: str = "layer",
+        qkv_bias: bool = True,
+        need_CA:bool=False,
+        attn_mode: str = "torch",
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.attn_mode = attn_mode
+        self.need_CA = need_CA
+        assert self.attn_mode == "torch", "Only support 'torch' mode for token refiner."
+
+        self.input_embedder = nn.Linear(
+            in_channels, hidden_size, bias=True, **factory_kwargs
+        )
+        if self.need_CA:
+            self.input_embedder_CA = nn.Linear(
+            in_channels, hidden_size, bias=True, **factory_kwargs
+        )
+
+        act_layer = get_activation_layer(act_type)
+        # Build timestep embedding layer
+        self.t_embedder = TimestepEmbedder(hidden_size, act_layer, **factory_kwargs)
+        # Build context embedding layer
+        self.c_embedder = TextProjection(
+            in_channels, hidden_size, act_layer, **factory_kwargs
+        )
+
+        self.individual_token_refiner = IndividualTokenRefiner(
+            hidden_size=hidden_size,
+            heads_num=heads_num,
+            depth=depth,
+            mlp_width_ratio=mlp_width_ratio,
+            mlp_drop_rate=mlp_drop_rate,
+            act_type=act_type,
+            qk_norm=qk_norm,
+            qk_norm_type=qk_norm_type,
+            qkv_bias=qkv_bias,
+            need_CA=need_CA,
+            **factory_kwargs,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        t: torch.LongTensor,
+        mask: Optional[torch.LongTensor] = None,
+        y: torch.LongTensor=None,
+    ):
+        timestep_aware_representations = self.t_embedder(t)
+
+        if mask is None:
+            context_aware_representations = x.mean(dim=1)
+        else:
+            mask_float = mask.unsqueeze(-1)  # [b, s1, 1]
+            context_aware_representations = (x * mask_float).sum(
+                dim=1
+            ) / mask_float.sum(dim=1)
+        context_aware_representations = self.c_embedder(context_aware_representations)
+        c = timestep_aware_representations + context_aware_representations
+
+        x = self.input_embedder(x)
+        if self.need_CA:
+            y = self.input_embedder_CA(y)
+            x = self.individual_token_refiner(x, c, mask, y)
+        else:
+            x = self.individual_token_refiner(x, c, mask)
+
+        return x
+
+
+class Qwen2Connector(torch.nn.Module):
+    def __init__(
+        self,
+        # biclip_dim=1024,
+        in_channels=3584,
+        hidden_size=4096,
+        heads_num=32,
+        depth=2,
+        need_CA=False,
+        device=None,
+        dtype=torch.bfloat16,
+    ):
+        super().__init__()
+        factory_kwargs = {"device": device, "dtype":dtype}
+
+        self.S =SingleTokenRefiner(in_channels=in_channels,hidden_size=hidden_size,heads_num=heads_num,depth=depth,need_CA=need_CA,**factory_kwargs)
+        self.global_proj_out=nn.Linear(in_channels,768)
+
+        self.scale_factor = nn.Parameter(torch.zeros(1))
+        with torch.no_grad():
+            self.scale_factor.data += -(1 - 0.09)
+
+    def forward(self, x,t,mask):
+        mask_float = mask.unsqueeze(-1)  # [b, s1, 1]
+        x_mean = (x * mask_float).sum(
+                dim=1
+            ) / mask_float.sum(dim=1) * (1 + self.scale_factor)
+
+        global_out=self.global_proj_out(x_mean)
+        encoder_hidden_states = self.S(x,t,mask)
+        return encoder_hidden_states,global_out
+    
+    @staticmethod
+    def state_dict_converter():
+        return Qwen2ConnectorStateDictConverter()
+    
+    
+class Qwen2ConnectorStateDictConverter:
+    def __init__(self):
+        pass
+
+    def from_diffusers(self, state_dict):
+        return state_dict
+    
+    def from_civitai(self, state_dict):
+        state_dict_ = {}
+        for name, param in state_dict.items():
+            if name.startswith("connector."):
+                name_ = name[len("connector."):]
+                state_dict_[name_] = param
+        return state_dict_
--- a/diffsynth/models/utils.py
+++ b/diffsynth/models/utils.py
@@ -62,16 +62,16 @@ def load_state_dict_from_folder(file_path, torch_dtype=None):
    return state_dict


-def load_state_dict(file_path, torch_dtype=None):
+def load_state_dict(file_path, torch_dtype=None, device="cpu"):
    if file_path.endswith(".safetensors"):
-        return load_state_dict_from_safetensors(file_path, torch_dtype=torch_dtype)
+        return load_state_dict_from_safetensors(file_path, torch_dtype=torch_dtype, device=device)
    else:
-        return load_state_dict_from_bin(file_path, torch_dtype=torch_dtype)
+        return load_state_dict_from_bin(file_path, torch_dtype=torch_dtype, device=device)


-def load_state_dict_from_safetensors(file_path, torch_dtype=None):
+def load_state_dict_from_safetensors(file_path, torch_dtype=None, device="cpu"):
    state_dict = {}
-    with safe_open(file_path, framework="pt", device="cpu") as f:
+    with safe_open(file_path, framework="pt", device=device) as f:
        for k in f.keys():
            state_dict[k] = f.get_tensor(k)
            if torch_dtype is not None:
@@ -79,8 +79,8 @@ def load_state_dict_from_safetensors(file_path, torch_dtype=None):
    return state_dict


-def load_state_dict_from_bin(file_path, torch_dtype=None):
-    state_dict = torch.load(file_path, map_location="cpu", weights_only=True)
+def load_state_dict_from_bin(file_path, torch_dtype=None, device="cpu"):
+    state_dict = torch.load(file_path, map_location=device, weights_only=True)
    if torch_dtype is not None:
        for i in state_dict:
            if isinstance(state_dict[i], torch.Tensor):
--- a/diffsynth/models/wan_video_camera_controller.py
+++ b/diffsynth/models/wan_video_camera_controller.py
@@ -0,0 +1,202 @@
+import torch
+import torch.nn as nn
+import numpy as np
+from einops import rearrange
+import os
+from typing_extensions import Literal
+
+class SimpleAdapter(nn.Module):
+    def __init__(self, in_dim, out_dim, kernel_size, stride, num_residual_blocks=1):
+        super(SimpleAdapter, self).__init__()
+
+        # Pixel Unshuffle: reduce spatial dimensions by a factor of 8
+        self.pixel_unshuffle = nn.PixelUnshuffle(downscale_factor=8)
+
+        # Convolution: reduce spatial dimensions by a factor
+        #  of 2 (without overlap)
+        self.conv = nn.Conv2d(in_dim * 64, out_dim, kernel_size=kernel_size, stride=stride, padding=0)
+
+        # Residual blocks for feature extraction
+        self.residual_blocks = nn.Sequential(
+            *[ResidualBlock(out_dim) for _ in range(num_residual_blocks)]
+        )
+
+    def forward(self, x):
+        # Reshape to merge the frame dimension into batch
+        bs, c, f, h, w = x.size()
+        x = x.permute(0, 2, 1, 3, 4).contiguous().view(bs * f, c, h, w)
+
+        # Pixel Unshuffle operation
+        x_unshuffled = self.pixel_unshuffle(x)
+
+        # Convolution operation
+        x_conv = self.conv(x_unshuffled)
+
+        # Feature extraction with residual blocks
+        out = self.residual_blocks(x_conv)
+
+        # Reshape to restore original bf dimension
+        out = out.view(bs, f, out.size(1), out.size(2), out.size(3))
+
+        # Permute dimensions to reorder (if needed), e.g., swap channels and feature frames
+        out = out.permute(0, 2, 1, 3, 4)
+
+        return out
+    
+    def process_camera_coordinates(
+        self,
+        direction: Literal["Left", "Right", "Up", "Down", "LeftUp", "LeftDown", "RightUp", "RightDown"],
+        length: int,
+        height: int,
+        width: int,
+        speed: float = 1/54,
+        origin=(0, 0.532139961, 0.946026558, 0.5, 0.5, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0)
+    ):
+        if origin is None:
+            origin = (0, 0.532139961, 0.946026558, 0.5, 0.5, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0)
+        coordinates = generate_camera_coordinates(direction, length, speed, origin)
+        plucker_embedding = process_pose_file(coordinates, width, height)
+        return plucker_embedding
+        
+    
+
+class ResidualBlock(nn.Module):
+    def __init__(self, dim):
+        super(ResidualBlock, self).__init__()
+        self.conv1 = nn.Conv2d(dim, dim, kernel_size=3, padding=1)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = nn.Conv2d(dim, dim, kernel_size=3, padding=1)
+
+    def forward(self, x):
+        residual = x
+        out = self.relu(self.conv1(x))
+        out = self.conv2(out)
+        out += residual
+        return out
+    
+class Camera(object):
+    """Copied from https://github.com/hehao13/CameraCtrl/blob/main/inference.py
+    """
+    def __init__(self, entry):
+        fx, fy, cx, cy = entry[1:5]
+        self.fx = fx
+        self.fy = fy
+        self.cx = cx
+        self.cy = cy
+        w2c_mat = np.array(entry[7:]).reshape(3, 4)
+        w2c_mat_4x4 = np.eye(4)
+        w2c_mat_4x4[:3, :] = w2c_mat
+        self.w2c_mat = w2c_mat_4x4
+        self.c2w_mat = np.linalg.inv(w2c_mat_4x4)
+
+def get_relative_pose(cam_params):
+    """Copied from https://github.com/hehao13/CameraCtrl/blob/main/inference.py
+    """
+    abs_w2cs = [cam_param.w2c_mat for cam_param in cam_params]
+    abs_c2ws = [cam_param.c2w_mat for cam_param in cam_params]
+    cam_to_origin = 0
+    target_cam_c2w = np.array([
+        [1, 0, 0, 0],
+        [0, 1, 0, -cam_to_origin],
+        [0, 0, 1, 0],
+        [0, 0, 0, 1]
+    ])
+    abs2rel = target_cam_c2w @ abs_w2cs[0]
+    ret_poses = [target_cam_c2w, ] + [abs2rel @ abs_c2w for abs_c2w in abs_c2ws[1:]]
+    ret_poses = np.array(ret_poses, dtype=np.float32)
+    return ret_poses
+
+def custom_meshgrid(*args):
+    # torch>=2.0.0 only
+    return torch.meshgrid(*args, indexing='ij')
+
+
+def ray_condition(K, c2w, H, W, device):
+    """Copied from https://github.com/hehao13/CameraCtrl/blob/main/inference.py
+    """
+    # c2w: B, V, 4, 4
+    # K: B, V, 4
+
+    B = K.shape[0]
+
+    j, i = custom_meshgrid(
+        torch.linspace(0, H - 1, H, device=device, dtype=c2w.dtype),
+        torch.linspace(0, W - 1, W, device=device, dtype=c2w.dtype),
+    )
+    i = i.reshape([1, 1, H * W]).expand([B, 1, H * W]) + 0.5  # [B, HxW]
+    j = j.reshape([1, 1, H * W]).expand([B, 1, H * W]) + 0.5  # [B, HxW]
+
+    fx, fy, cx, cy = K.chunk(4, dim=-1)  # B,V, 1
+
+    zs = torch.ones_like(i)  # [B, HxW]
+    xs = (i - cx) / fx * zs
+    ys = (j - cy) / fy * zs
+    zs = zs.expand_as(ys)
+
+    directions = torch.stack((xs, ys, zs), dim=-1)  # B, V, HW, 3
+    directions = directions / directions.norm(dim=-1, keepdim=True)  # B, V, HW, 3
+
+    rays_d = directions @ c2w[..., :3, :3].transpose(-1, -2)  # B, V, 3, HW
+    rays_o = c2w[..., :3, 3]  # B, V, 3
+    rays_o = rays_o[:, :, None].expand_as(rays_d)  # B, V, 3, HW
+    # c2w @ dirctions
+    rays_dxo = torch.linalg.cross(rays_o, rays_d)
+    plucker = torch.cat([rays_dxo, rays_d], dim=-1)
+    plucker = plucker.reshape(B, c2w.shape[1], H, W, 6)  # B, V, H, W, 6
+    # plucker = plucker.permute(0, 1, 4, 2, 3)
+    return plucker
+
+
+def process_pose_file(cam_params, width=672, height=384, original_pose_width=1280, original_pose_height=720, device='cpu', return_poses=False):
+    if return_poses:
+        return cam_params
+    else:
+        cam_params = [Camera(cam_param) for cam_param in cam_params]
+
+        sample_wh_ratio = width / height
+        pose_wh_ratio = original_pose_width / original_pose_height  # Assuming placeholder ratios, change as needed
+
+        if pose_wh_ratio > sample_wh_ratio:
+            resized_ori_w = height * pose_wh_ratio
+            for cam_param in cam_params:
+                cam_param.fx = resized_ori_w * cam_param.fx / width
+        else:
+            resized_ori_h = width / pose_wh_ratio
+            for cam_param in cam_params:
+                cam_param.fy = resized_ori_h * cam_param.fy / height
+
+        intrinsic = np.asarray([[cam_param.fx * width,
+                                cam_param.fy * height,
+                                cam_param.cx * width,
+                                cam_param.cy * height]
+                                for cam_param in cam_params], dtype=np.float32)
+
+        K = torch.as_tensor(intrinsic)[None]  # [1, 1, 4]
+        c2ws = get_relative_pose(cam_params)  # Assuming this function is defined elsewhere
+        c2ws = torch.as_tensor(c2ws)[None]  # [1, n_frame, 4, 4]
+        plucker_embedding = ray_condition(K, c2ws, height, width, device=device)[0].permute(0, 3, 1, 2).contiguous()  # V, 6, H, W
+        plucker_embedding = plucker_embedding[None]
+        plucker_embedding = rearrange(plucker_embedding, "b f c h w -> b f h w c")[0]
+        return plucker_embedding
+
+
+
+def generate_camera_coordinates(
+    direction: Literal["Left", "Right", "Up", "Down", "LeftUp", "LeftDown", "RightUp", "RightDown"],
+    length: int,
+    speed: float = 1/54,
+    origin=(0, 0.532139961, 0.946026558, 0.5, 0.5, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0)
+):
+    coordinates = [list(origin)]
+    while len(coordinates) < length:
+        coor = coordinates[-1].copy()
+        if "Left" in direction:
+            coor[9] += speed
+        if "Right" in direction:
+            coor[9] -= speed
+        if "Up" in direction:
+            coor[13] += speed
+        if "Down" in direction:
+            coor[13] -= speed
+        coordinates.append(coor)
+    return coordinates
--- a/diffsynth/models/wan_video_dit.py
+++ b/diffsynth/models/wan_video_dit.py
@@ -5,6 +5,7 @@ import math
 from typing import Tuple, Optional
 from einops import rearrange
 from .utils import hash_state_dict_keys
+from .wan_video_camera_controller import SimpleAdapter
 try:
    import flash_attn_interface
    FLASH_ATTN_3_AVAILABLE = True
@@ -36,6 +37,8 @@ def flash_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, num_heads
        k = rearrange(k, "b s (n d) -> b s n d", n=num_heads)
        v = rearrange(v, "b s (n d) -> b s n d", n=num_heads)
        x = flash_attn_interface.flash_attn_func(q, k, v)
+        if isinstance(x,tuple):
+            x = x[0]
        x = rearrange(x, "b s n d -> b s (n d)", n=num_heads)
    elif FLASH_ATTN_2_AVAILABLE:
        q = rearrange(q, "b s (n d) -> b s n d", n=num_heads)
@@ -221,7 +224,7 @@ class DiTBlock(nn.Module):


 class MLP(torch.nn.Module):
-    def __init__(self, in_dim, out_dim):
+    def __init__(self, in_dim, out_dim, has_pos_emb=False):
        super().__init__()
        self.proj = torch.nn.Sequential(
            nn.LayerNorm(in_dim),
@@ -230,8 +233,13 @@ class MLP(torch.nn.Module):
            nn.Linear(in_dim, out_dim),
            nn.LayerNorm(out_dim)
        )
+        self.has_pos_emb = has_pos_emb
+        if has_pos_emb:
+            self.emb_pos = torch.nn.Parameter(torch.zeros((1, 514, 1280)))

    def forward(self, x):
+        if self.has_pos_emb:
+            x = x + self.emb_pos.to(dtype=x.dtype, device=x.device)
        return self.proj(x)


@@ -264,6 +272,10 @@ class WanModel(torch.nn.Module):
        num_heads: int,
        num_layers: int,
        has_image_input: bool,
+        has_image_pos_emb: bool = False,
+        has_ref_conv: bool = False,
+        add_control_adapter: bool = False,
+        in_dim_control_adapter: int = 24,
    ):
        super().__init__()
        self.dim = dim
@@ -294,10 +306,22 @@ class WanModel(torch.nn.Module):
        self.freqs = precompute_freqs_cis_3d(head_dim)

        if has_image_input:
-            self.img_emb = MLP(1280, dim)  # clip_feature_dim = 1280
+            self.img_emb = MLP(1280, dim, has_pos_emb=has_image_pos_emb)  # clip_feature_dim = 1280
+        if has_ref_conv:
+            self.ref_conv = nn.Conv2d(16, dim, kernel_size=(2, 2), stride=(2, 2))
+        self.has_image_pos_emb = has_image_pos_emb
+        self.has_ref_conv = has_ref_conv
+        if add_control_adapter:
+            self.control_adapter = SimpleAdapter(in_dim_control_adapter, dim, kernel_size=patch_size[1:], stride=patch_size[1:])
+        else:
+            self.control_adapter = None

-    def patchify(self, x: torch.Tensor):
+    def patchify(self, x: torch.Tensor,control_camera_latents_input: torch.Tensor = None):
        x = self.patch_embedding(x)
+        if self.control_adapter is not None and control_camera_latents_input is not None:
+            y_camera = self.control_adapter(control_camera_latents_input)
+            x = [u + v for u, v in zip(x, y_camera)]
+            x = x[0].unsqueeze(0)
        grid_size = x.shape[2:]
        x = rearrange(x, 'b c f h w -> b (f h w) c').contiguous()
        return x, grid_size  # x, grid_size: (f, h, w)
@@ -451,6 +475,7 @@ class WanModelStateDictConverter:
        return state_dict_, config
    
    def from_civitai(self, state_dict):
+        state_dict = {name: param for name, param in state_dict.items() if not name.startswith("vace")}
        if hash_state_dict_keys(state_dict) == "9269f8db9040a9d860eaca435be61814":
            config = {
                "has_image_input": False,
@@ -522,6 +547,7 @@ class WanModelStateDictConverter:
                "eps": 1e-6
            }
        elif hash_state_dict_keys(state_dict) == "349723183fc063b2bfc10bb2835cf677":
+            # 1.3B PAI control
            config = {
                "has_image_input": True,
                "patch_size": [1, 2, 2],
@@ -536,6 +562,7 @@ class WanModelStateDictConverter:
                "eps": 1e-6
            }
        elif hash_state_dict_keys(state_dict) == "efa44cddf936c70abd0ea28b6cbe946c":
+            # 14B PAI control
            config = {
                "has_image_input": True,
                "patch_size": [1, 2, 2],
@@ -549,6 +576,89 @@ class WanModelStateDictConverter:
                "num_layers": 40,
                "eps": 1e-6
            }
+        elif hash_state_dict_keys(state_dict) == "3ef3b1f8e1dab83d5b71fd7b617f859f":
+            config = {
+                "has_image_input": True,
+                "patch_size": [1, 2, 2],
+                "in_dim": 36,
+                "dim": 5120,
+                "ffn_dim": 13824,
+                "freq_dim": 256,
+                "text_dim": 4096,
+                "out_dim": 16,
+                "num_heads": 40,
+                "num_layers": 40,
+                "eps": 1e-6,
+                "has_image_pos_emb": True
+            }
+        elif hash_state_dict_keys(state_dict) == "70ddad9d3a133785da5ea371aae09504":
+            # 1.3B PAI control v1.1
+            config = {
+                "has_image_input": True,
+                "patch_size": [1, 2, 2],
+                "in_dim": 48,
+                "dim": 1536,
+                "ffn_dim": 8960,
+                "freq_dim": 256,
+                "text_dim": 4096,
+                "out_dim": 16,
+                "num_heads": 12,
+                "num_layers": 30,
+                "eps": 1e-6,
+                "has_ref_conv": True
+            }
+        elif hash_state_dict_keys(state_dict) == "26bde73488a92e64cc20b0a7485b9e5b":
+            # 14B PAI control v1.1
+            config = {
+                "has_image_input": True,
+                "patch_size": [1, 2, 2],
+                "in_dim": 48,
+                "dim": 5120,
+                "ffn_dim": 13824,
+                "freq_dim": 256,
+                "text_dim": 4096,
+                "out_dim": 16,
+                "num_heads": 40,
+                "num_layers": 40,
+                "eps": 1e-6,
+                "has_ref_conv": True
+            }
+        elif hash_state_dict_keys(state_dict) == "ac6a5aa74f4a0aab6f64eb9a72f19901":
+            # 1.3B PAI control-camera v1.1
+            config = {
+                "has_image_input": True,
+                "patch_size": [1, 2, 2],
+                "in_dim": 32,
+                "dim": 1536,
+                "ffn_dim": 8960,
+                "freq_dim": 256,
+                "text_dim": 4096,
+                "out_dim": 16,
+                "num_heads": 12,
+                "num_layers": 30,
+                "eps": 1e-6,
+                "has_ref_conv": False,
+                "add_control_adapter": True,
+                "in_dim_control_adapter": 24,
+            }
+        elif hash_state_dict_keys(state_dict) == "b61c605c2adbd23124d152ed28e049ae":
+            # 14B PAI control-camera v1.1
+            config = {
+                "has_image_input": True,
+                "patch_size": [1, 2, 2],
+                "in_dim": 32,
+                "dim": 5120,
+                "ffn_dim": 13824,
+                "freq_dim": 256,
+                "text_dim": 4096,
+                "out_dim": 16,
+                "num_heads": 40,
+                "num_layers": 40,
+                "eps": 1e-6,
+                "has_ref_conv": False,
+                "add_control_adapter": True,
+                "in_dim_control_adapter": 24,
+            }
        else:
            config = {}
        return state_dict, config
--- a/diffsynth/models/wan_video_vace.py
+++ b/diffsynth/models/wan_video_vace.py
@@ -0,0 +1,113 @@
+import torch
+from .wan_video_dit import DiTBlock
+from .utils import hash_state_dict_keys
+
+class VaceWanAttentionBlock(DiTBlock):
+    def __init__(self, has_image_input, dim, num_heads, ffn_dim, eps=1e-6, block_id=0):
+        super().__init__(has_image_input, dim, num_heads, ffn_dim, eps=eps)
+        self.block_id = block_id
+        if block_id == 0:
+            self.before_proj = torch.nn.Linear(self.dim, self.dim)
+        self.after_proj = torch.nn.Linear(self.dim, self.dim)
+
+    def forward(self, c, x, context, t_mod, freqs):
+        if self.block_id == 0:
+            c = self.before_proj(c) + x
+            all_c = []
+        else:
+            all_c = list(torch.unbind(c))
+            c = all_c.pop(-1)
+        c = super().forward(c, context, t_mod, freqs)
+        c_skip = self.after_proj(c)
+        all_c += [c_skip, c]
+        c = torch.stack(all_c)
+        return c
+
+
+class VaceWanModel(torch.nn.Module):
+    def __init__(
+        self,
+        vace_layers=(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28),
+        vace_in_dim=96,
+        patch_size=(1, 2, 2),
+        has_image_input=False,
+        dim=1536,
+        num_heads=12,
+        ffn_dim=8960,
+        eps=1e-6,
+    ):
+        super().__init__()
+        self.vace_layers = vace_layers
+        self.vace_in_dim = vace_in_dim
+        self.vace_layers_mapping = {i: n for n, i in enumerate(self.vace_layers)}
+
+        # vace blocks
+        self.vace_blocks = torch.nn.ModuleList([
+            VaceWanAttentionBlock(has_image_input, dim, num_heads, ffn_dim, eps, block_id=i)
+            for i in self.vace_layers
+        ])
+
+        # vace patch embeddings
+        self.vace_patch_embedding = torch.nn.Conv3d(vace_in_dim, dim, kernel_size=patch_size, stride=patch_size)
+
+    def forward(
+        self, x, vace_context, context, t_mod, freqs,
+        use_gradient_checkpointing: bool = False,
+        use_gradient_checkpointing_offload: bool = False,
+    ):
+        c = [self.vace_patch_embedding(u.unsqueeze(0)) for u in vace_context]
+        c = [u.flatten(2).transpose(1, 2) for u in c]
+        c = torch.cat([
+            torch.cat([u, u.new_zeros(1, x.shape[1] - u.size(1), u.size(2))],
+                      dim=1) for u in c
+        ])
+        
+        def create_custom_forward(module):
+            def custom_forward(*inputs):
+                return module(*inputs)
+            return custom_forward
+        
+        for block in self.vace_blocks:
+            if use_gradient_checkpointing_offload:
+                with torch.autograd.graph.save_on_cpu():
+                    c = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(block),
+                        c, x, context, t_mod, freqs,
+                        use_reentrant=False,
+                    )
+            elif use_gradient_checkpointing:
+                c = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    c, x, context, t_mod, freqs,
+                    use_reentrant=False,
+                )
+            else:
+                c = block(c, x, context, t_mod, freqs)
+        hints = torch.unbind(c)[:-1]
+        return hints
+    
+    @staticmethod
+    def state_dict_converter():
+        return VaceWanModelDictConverter()
+    
+    
+class VaceWanModelDictConverter:
+    def __init__(self):
+        pass
+    
+    def from_civitai(self, state_dict):
+        state_dict_ = {name: param for name, param in state_dict.items() if name.startswith("vace")}
+        if hash_state_dict_keys(state_dict_) == '3b2726384e4f64837bdf216eea3f310d': # vace 14B
+            config = {
+                "vace_layers": (0, 5, 10, 15, 20, 25, 30, 35),
+                "vace_in_dim": 96,
+                "patch_size": (1, 2, 2),
+                "has_image_input": False,
+                "dim": 5120,
+                "num_heads": 40,
+                "ffn_dim": 13824,
+                "eps": 1e-06,                
+            }
+        else:
+            config = {}
+        return state_dict_, config
--- a/diffsynth/models/wan_video_vae.py
+++ b/diffsynth/models/wan_video_vae.py
@@ -774,18 +774,11 @@ class WanVideoVAE(nn.Module):


    def decode(self, hidden_states, device, tiled=False, tile_size=(34, 34), tile_stride=(18, 16)):
-        hidden_states = [hidden_state.to("cpu") for hidden_state in hidden_states]
-        videos = []
-        for hidden_state in hidden_states:
-            hidden_state = hidden_state.unsqueeze(0)
-            if tiled:
-                video = self.tiled_decode(hidden_state, device, tile_size, tile_stride)
-            else:
-                video = self.single_decode(hidden_state, device)
-            video = video.squeeze(0)
-            videos.append(video)
-        videos = torch.stack(videos)
-        return videos
+        if tiled:
+            video = self.tiled_decode(hidden_states, device, tile_size, tile_stride)
+        else:
+            video = self.single_decode(hidden_states, device)
+        return video


    @staticmethod
--- a/diffsynth/pipelines/flux_image.py
+++ b/diffsynth/pipelines/flux_image.py
@@ -1,12 +1,11 @@
 from ..models import ModelManager, FluxDiT, SD3TextEncoder1, FluxTextEncoder2, FluxVAEDecoder, FluxVAEEncoder, FluxIpAdapter
+from ..models.step1x_connector import Qwen2Connector
 from ..controlnets import FluxMultiControlNetManager, ControlNetUnit, ControlNetConfigUnit, Annotator
-from ..models.flux_reference_embedder import FluxReferenceEmbedder
 from ..prompters import FluxPrompter
 from ..schedulers import FlowMatchScheduler
 from .base import BasePipeline
 from typing import List
 import torch
-from einops import rearrange
 from tqdm import tqdm
 import numpy as np
 from PIL import Image
@@ -34,106 +33,112 @@ class FluxImagePipeline(BasePipeline):
        self.ipadapter: FluxIpAdapter = None
        self.ipadapter_image_encoder: SiglipVisionModel = None
        self.infinityou_processor: InfinitYou = None
-        self.reference_embedder: FluxReferenceEmbedder = None
-        self.model_names = ['text_encoder_1', 'text_encoder_2', 'dit', 'vae_decoder', 'vae_encoder', 'controlnet', 'ipadapter', 'ipadapter_image_encoder']
+        self.qwenvl = None
+        self.step1x_connector: Qwen2Connector = None
+        self.model_names = ['text_encoder_1', 'text_encoder_2', 'dit', 'vae_decoder', 'vae_encoder', 'controlnet', 'ipadapter', 'ipadapter_image_encoder', 'qwenvl', 'step1x_connector']


    def enable_vram_management(self, num_persistent_param_in_dit=None):
-        dtype = next(iter(self.text_encoder_1.parameters())).dtype
-        enable_vram_management(
-            self.text_encoder_1,
-            module_map = {
-                torch.nn.Linear: AutoWrappedLinear,
-                torch.nn.Embedding: AutoWrappedModule,
-                torch.nn.LayerNorm: AutoWrappedModule,
-            },
-            module_config = dict(
-                offload_dtype=dtype,
-                offload_device="cpu",
-                onload_dtype=dtype,
-                onload_device="cpu",
-                computation_dtype=self.torch_dtype,
-                computation_device=self.device,
-            ),
-        )
-        dtype = next(iter(self.text_encoder_2.parameters())).dtype
-        enable_vram_management(
-            self.text_encoder_2,
-            module_map = {
-                torch.nn.Linear: AutoWrappedLinear,
-                torch.nn.Embedding: AutoWrappedModule,
-                T5LayerNorm: AutoWrappedModule,
-                T5DenseActDense: AutoWrappedModule,
-                T5DenseGatedActDense: AutoWrappedModule,
-            },
-            module_config = dict(
-                offload_dtype=dtype,
-                offload_device="cpu",
-                onload_dtype=dtype,
-                onload_device="cpu",
-                computation_dtype=self.torch_dtype,
-                computation_device=self.device,
-            ),
-        )
-        dtype = next(iter(self.dit.parameters())).dtype
-        enable_vram_management(
-            self.dit,
-            module_map = {
-                RMSNorm: AutoWrappedModule,
-                torch.nn.Linear: AutoWrappedLinear,
-            },
-            module_config = dict(
-                offload_dtype=dtype,
-                offload_device="cpu",
-                onload_dtype=dtype,
-                onload_device="cuda",
-                computation_dtype=self.torch_dtype,
-                computation_device=self.device,
-            ),
-            max_num_param=num_persistent_param_in_dit,
-            overflow_module_config = dict(
-                offload_dtype=dtype,
-                offload_device="cpu",
-                onload_dtype=dtype,
-                onload_device="cpu",
-                computation_dtype=self.torch_dtype,
-                computation_device=self.device,
-            ),
-        )
-        dtype = next(iter(self.vae_decoder.parameters())).dtype
-        enable_vram_management(
-            self.vae_decoder,
-            module_map = {
-                torch.nn.Linear: AutoWrappedLinear,
-                torch.nn.Conv2d: AutoWrappedModule,
-                torch.nn.GroupNorm: AutoWrappedModule,
-            },
-            module_config = dict(
-                offload_dtype=dtype,
-                offload_device="cpu",
-                onload_dtype=dtype,
-                onload_device="cpu",
-                computation_dtype=self.torch_dtype,
-                computation_device=self.device,
-            ),
-        )
-        dtype = next(iter(self.vae_encoder.parameters())).dtype
-        enable_vram_management(
-            self.vae_encoder,
-            module_map = {
-                torch.nn.Linear: AutoWrappedLinear,
-                torch.nn.Conv2d: AutoWrappedModule,
-                torch.nn.GroupNorm: AutoWrappedModule,
-            },
-            module_config = dict(
-                offload_dtype=dtype,
-                offload_device="cpu",
-                onload_dtype=dtype,
-                onload_device="cpu",
-                computation_dtype=self.torch_dtype,
-                computation_device=self.device,
-            ),
-        )
+        if self.text_encoder_1 is not None:
+            dtype = next(iter(self.text_encoder_1.parameters())).dtype
+            enable_vram_management(
+                self.text_encoder_1,
+                module_map = {
+                    torch.nn.Linear: AutoWrappedLinear,
+                    torch.nn.Embedding: AutoWrappedModule,
+                    torch.nn.LayerNorm: AutoWrappedModule,
+                },
+                module_config = dict(
+                    offload_dtype=dtype,
+                    offload_device="cpu",
+                    onload_dtype=dtype,
+                    onload_device="cpu",
+                    computation_dtype=self.torch_dtype,
+                    computation_device=self.device,
+                ),
+            )
+        if self.text_encoder_2 is not None:
+            dtype = next(iter(self.text_encoder_2.parameters())).dtype
+            enable_vram_management(
+                self.text_encoder_2,
+                module_map = {
+                    torch.nn.Linear: AutoWrappedLinear,
+                    torch.nn.Embedding: AutoWrappedModule,
+                    T5LayerNorm: AutoWrappedModule,
+                    T5DenseActDense: AutoWrappedModule,
+                    T5DenseGatedActDense: AutoWrappedModule,
+                },
+                module_config = dict(
+                    offload_dtype=dtype,
+                    offload_device="cpu",
+                    onload_dtype=dtype,
+                    onload_device="cpu",
+                    computation_dtype=self.torch_dtype,
+                    computation_device=self.device,
+                ),
+            )
+        if self.dit is not None:
+            dtype = next(iter(self.dit.parameters())).dtype
+            enable_vram_management(
+                self.dit,
+                module_map = {
+                    RMSNorm: AutoWrappedModule,
+                    torch.nn.Linear: AutoWrappedLinear,
+                },
+                module_config = dict(
+                    offload_dtype=dtype,
+                    offload_device="cpu",
+                    onload_dtype=dtype,
+                    onload_device="cuda",
+                    computation_dtype=self.torch_dtype,
+                    computation_device=self.device,
+                ),
+                max_num_param=num_persistent_param_in_dit,
+                overflow_module_config = dict(
+                    offload_dtype=dtype,
+                    offload_device="cpu",
+                    onload_dtype=dtype,
+                    onload_device="cpu",
+                    computation_dtype=self.torch_dtype,
+                    computation_device=self.device,
+                ),
+            )
+        if self.vae_decoder is not None:
+            dtype = next(iter(self.vae_decoder.parameters())).dtype
+            enable_vram_management(
+                self.vae_decoder,
+                module_map = {
+                    torch.nn.Linear: AutoWrappedLinear,
+                    torch.nn.Conv2d: AutoWrappedModule,
+                    torch.nn.GroupNorm: AutoWrappedModule,
+                },
+                module_config = dict(
+                    offload_dtype=dtype,
+                    offload_device="cpu",
+                    onload_dtype=dtype,
+                    onload_device="cpu",
+                    computation_dtype=self.torch_dtype,
+                    computation_device=self.device,
+                ),
+            )
+        if self.vae_encoder is not None:
+            dtype = next(iter(self.vae_encoder.parameters())).dtype
+            enable_vram_management(
+                self.vae_encoder,
+                module_map = {
+                    torch.nn.Linear: AutoWrappedLinear,
+                    torch.nn.Conv2d: AutoWrappedModule,
+                    torch.nn.GroupNorm: AutoWrappedModule,
+                },
+                module_config = dict(
+                    offload_dtype=dtype,
+                    offload_device="cpu",
+                    onload_dtype=dtype,
+                    onload_device="cpu",
+                    computation_dtype=self.torch_dtype,
+                    computation_device=self.device,
+                ),
+            )
        self.enable_cpu_offload()


@@ -170,6 +175,10 @@ class FluxImagePipeline(BasePipeline):
        self.image_proj_model = model_manager.fetch_model("infiniteyou_image_projector")
        if self.image_proj_model is not None:
            self.infinityou_processor = InfinitYou(device=self.device)
+            
+        # Step1x
+        self.qwenvl = model_manager.fetch_model("qwenvl")
+        self.step1x_connector = model_manager.fetch_model("step1x_connector")


    @staticmethod
@@ -194,10 +203,13 @@ class FluxImagePipeline(BasePipeline):
    

    def encode_prompt(self, prompt, positive=True, t5_sequence_length=512):
-        prompt_emb, pooled_prompt_emb, text_ids = self.prompter.encode_prompt(
-            prompt, device=self.device, positive=positive, t5_sequence_length=t5_sequence_length
-        )
-        return {"prompt_emb": prompt_emb, "pooled_prompt_emb": pooled_prompt_emb, "text_ids": text_ids}
+        if self.text_encoder_1 is not None and self.text_encoder_2 is not None:
+            prompt_emb, pooled_prompt_emb, text_ids = self.prompter.encode_prompt(
+                prompt, device=self.device, positive=positive, t5_sequence_length=t5_sequence_length
+            )
+            return {"prompt_emb": prompt_emb, "pooled_prompt_emb": pooled_prompt_emb, "text_ids": text_ids}
+        else:
+            return {}
    

    def prepare_extra_input(self, latents=None, guidance=1.0):
@@ -365,18 +377,44 @@ class FluxImagePipeline(BasePipeline):
            return {}, controlnet_image
        
        
-    def prepare_reference_images(self, reference_images, tiled=False, tile_size=64, tile_stride=32):
-        if reference_images is not None:
-            hidden_states_ref = []
-            for reference_image in reference_images:
-                self.load_models_to_device(['vae_encoder'])
-                reference_image = self.preprocess_image(reference_image).to(device=self.device, dtype=self.torch_dtype)
-                latents = self.encode_image(reference_image, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride)
-                hidden_states_ref.append(latents)
-            hidden_states_ref = torch.concat(hidden_states_ref, dim=0)
-            return {"hidden_states_ref": hidden_states_ref}
+    def prepare_flex_kwargs(self, latents, flex_inpaint_image=None, flex_inpaint_mask=None, flex_control_image=None, flex_control_strength=0.5, flex_control_stop=0.5, tiled=False, tile_size=64, tile_stride=32):
+        if self.dit.input_dim == 196:
+            if flex_inpaint_image is None:
+                flex_inpaint_image = torch.zeros_like(latents)
+            else:
+                flex_inpaint_image = self.preprocess_image(flex_inpaint_image).to(device=self.device, dtype=self.torch_dtype)
+                flex_inpaint_image = self.encode_image(flex_inpaint_image, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride)
+            if flex_inpaint_mask is None:
+                flex_inpaint_mask = torch.ones_like(latents)[:, 0:1, :, :]
+            else:
+                flex_inpaint_mask = flex_inpaint_mask.resize((latents.shape[3], latents.shape[2]))
+                flex_inpaint_mask = self.preprocess_image(flex_inpaint_mask).to(device=self.device, dtype=self.torch_dtype)
+                flex_inpaint_mask = (flex_inpaint_mask[:, 0:1, :, :] + 1) / 2
+            flex_inpaint_image = flex_inpaint_image * (1 - flex_inpaint_mask)
+            if flex_control_image is None:
+                flex_control_image = torch.zeros_like(latents)
+            else:
+                flex_control_image = self.preprocess_image(flex_control_image).to(device=self.device, dtype=self.torch_dtype)
+                flex_control_image = self.encode_image(flex_control_image, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride) * flex_control_strength
+            flex_condition = torch.concat([flex_inpaint_image, flex_inpaint_mask, flex_control_image], dim=1)
+            flex_uncondition = torch.concat([flex_inpaint_image, flex_inpaint_mask, torch.zeros_like(flex_control_image)], dim=1)
+            flex_control_stop_timestep = self.scheduler.timesteps[int(flex_control_stop * (len(self.scheduler.timesteps) - 1))]
+            flex_kwargs = {"flex_condition": flex_condition, "flex_uncondition": flex_uncondition, "flex_control_stop_timestep": flex_control_stop_timestep}
        else:
-            return {"hidden_states_ref": None}
+            flex_kwargs = {}
+        return flex_kwargs
+    
+    
+    def prepare_step1x_kwargs(self, prompt, negative_prompt, image):
+        if image is None:
+            return {}, {}
+        self.load_models_to_device(["qwenvl", "vae_encoder"])
+        captions = [prompt, negative_prompt]
+        ref_images = [image, image]
+        embs, masks = self.qwenvl(captions, ref_images)
+        image = self.preprocess_image(image).to(device=self.device, dtype=self.torch_dtype)
+        image = self.encode_image(image)
+        return {"step1x_llm_embedding": embs[0:1], "step1x_mask": masks[0:1], "step1x_reference_latents": image}, {"step1x_llm_embedding": embs[1:2], "step1x_mask": masks[1:2], "step1x_reference_latents": image}


    @torch.no_grad()
@@ -415,8 +453,14 @@ class FluxImagePipeline(BasePipeline):
        # InfiniteYou
        infinityou_id_image=None,
        infinityou_guidance=1.0,
-        # Reference images
-        reference_images=None,
+        # Flex
+        flex_inpaint_image=None,
+        flex_inpaint_mask=None,
+        flex_control_image=None,
+        flex_control_strength=0.5,
+        flex_control_stop=0.5,
+        # Step1x
+        step1x_reference_image=None,
        # TeaCache
        tea_cache_l1_thresh=None,
        # Tile
@@ -456,22 +500,25 @@ class FluxImagePipeline(BasePipeline):
        # ControlNets
        controlnet_kwargs_posi, controlnet_kwargs_nega, local_controlnet_kwargs = self.prepare_controlnet(controlnet_image, masks, controlnet_inpaint_mask, tiler_kwargs, enable_controlnet_on_negative)
        
-        # Reference images
-        reference_kwargs = self.prepare_reference_images(reference_images, **tiler_kwargs)
+        # Flex
+        flex_kwargs = self.prepare_flex_kwargs(latents, flex_inpaint_image, flex_inpaint_mask, flex_control_image, flex_control_strength=flex_control_strength, flex_control_stop=flex_control_stop, **tiler_kwargs)
+        
+        # Step1x
+        step1x_kwargs_posi, step1x_kwargs_nega = self.prepare_step1x_kwargs(prompt, negative_prompt, image=step1x_reference_image)

        # TeaCache
        tea_cache_kwargs = {"tea_cache": TeaCache(num_inference_steps, rel_l1_thresh=tea_cache_l1_thresh) if tea_cache_l1_thresh is not None else None}

        # Denoise
-        self.load_models_to_device(['dit', 'controlnet'])
+        self.load_models_to_device(['dit', 'controlnet', 'step1x_connector'])
        for progress_id, timestep in enumerate(progress_bar_cmd(self.scheduler.timesteps)):
            timestep = timestep.unsqueeze(0).to(self.device)

            # Positive side
            inference_callback = lambda prompt_emb_posi, controlnet_kwargs: lets_dance_flux(
-                dit=self.dit, controlnet=self.controlnet, reference_embedder=self.reference_embedder,
+                dit=self.dit, controlnet=self.controlnet, step1x_connector=self.step1x_connector,
                hidden_states=latents, timestep=timestep,
-                **prompt_emb_posi, **tiler_kwargs, **extra_input, **controlnet_kwargs, **ipadapter_kwargs_list_posi, **eligen_kwargs_posi, **tea_cache_kwargs, **infiniteyou_kwargs, **reference_kwargs,
+                **prompt_emb_posi, **tiler_kwargs, **extra_input, **controlnet_kwargs, **ipadapter_kwargs_list_posi, **eligen_kwargs_posi, **tea_cache_kwargs, **infiniteyou_kwargs, **flex_kwargs, **step1x_kwargs_posi,
            )
            noise_pred_posi = self.control_noise_via_local_prompts(
                prompt_emb_posi, prompt_emb_locals, masks, mask_scales, inference_callback,
@@ -486,9 +533,9 @@ class FluxImagePipeline(BasePipeline):
            if cfg_scale != 1.0:
                # Negative side
                noise_pred_nega = lets_dance_flux(
-                    dit=self.dit, controlnet=self.controlnet, reference_embedder=self.reference_embedder,
+                    dit=self.dit, controlnet=self.controlnet, step1x_connector=self.step1x_connector,
                    hidden_states=latents, timestep=timestep,
-                    **prompt_emb_nega, **tiler_kwargs, **extra_input, **controlnet_kwargs_nega, **ipadapter_kwargs_list_nega, **eligen_kwargs_nega, **infiniteyou_kwargs, **reference_kwargs,
+                    **prompt_emb_nega, **tiler_kwargs, **extra_input, **controlnet_kwargs_nega, **ipadapter_kwargs_list_nega, **eligen_kwargs_nega, **infiniteyou_kwargs, **flex_kwargs, **step1x_kwargs_nega,
                )
                noise_pred = noise_pred_nega + cfg_scale * (noise_pred_posi - noise_pred_nega)
            else:
@@ -608,7 +655,7 @@ class TeaCache:
 def lets_dance_flux(
    dit: FluxDiT,
    controlnet: FluxMultiControlNetManager = None,
-    reference_embedder: FluxReferenceEmbedder = None,
+    step1x_connector: Qwen2Connector = None,
    hidden_states=None,
    timestep=None,
    prompt_emb=None,
@@ -617,7 +664,6 @@ def lets_dance_flux(
    text_ids=None,
    image_ids=None,
    controlnet_frames=None,
-    hidden_states_ref=None,
    tiled=False,
    tile_size=128,
    tile_stride=64,
@@ -626,8 +672,13 @@ def lets_dance_flux(
    ipadapter_kwargs_list={},
    id_emb=None,
    infinityou_guidance=None,
+    flex_condition=None,
+    flex_uncondition=None,
+    flex_control_stop_timestep=None,
+    step1x_llm_embedding=None,
+    step1x_mask=None,
+    step1x_reference_latents=None,
    tea_cache: TeaCache = None,
-    use_gradient_checkpointing=False,
    **kwargs
 ):
    if tiled:
@@ -677,6 +728,18 @@ def lets_dance_flux(
        controlnet_res_stack, controlnet_single_res_stack = controlnet(
            controlnet_frames, **controlnet_extra_kwargs
        )
+        
+    # Flex
+    if flex_condition is not None:
+        if timestep.tolist()[0] >= flex_control_stop_timestep:
+            hidden_states = torch.concat([hidden_states, flex_condition], dim=1)
+        else:
+            hidden_states = torch.concat([hidden_states, flex_uncondition], dim=1)
+            
+    # Step1x
+    if step1x_llm_embedding is not None:
+        prompt_emb, pooled_prompt_emb = step1x_connector(step1x_llm_embedding, timestep / 1000, step1x_mask)
+        text_ids = torch.zeros((1, prompt_emb.shape[1], 3), dtype=prompt_emb.dtype, device=prompt_emb.device)

    if image_ids is None:
        image_ids = dit.prepare_image_ids(hidden_states)
@@ -688,61 +751,42 @@ def lets_dance_flux(

    height, width = hidden_states.shape[-2:]
    hidden_states = dit.patchify(hidden_states)
+    
+    # Step1x
+    if step1x_reference_latents is not None:
+        step1x_reference_image_ids = dit.prepare_image_ids(step1x_reference_latents)
+        step1x_reference_latents = dit.patchify(step1x_reference_latents)
+        image_ids = torch.concat([image_ids, step1x_reference_image_ids], dim=-2)
+        hidden_states = torch.concat([hidden_states, step1x_reference_latents], dim=1)
+        
    hidden_states = dit.x_embedder(hidden_states)

    if entity_prompt_emb is not None and entity_masks is not None:
        prompt_emb, image_rotary_emb, attention_mask = dit.process_entity_masks(hidden_states, prompt_emb, entity_prompt_emb, entity_masks, text_ids, image_ids)
    else:
        prompt_emb = dit.context_embedder(prompt_emb)
-        image_rotary_emb = dit.pos_embedder(torch.cat((text_ids, image_ids), dim=1), device=hidden_states.device)
+        image_rotary_emb = dit.pos_embedder(torch.cat((text_ids, image_ids), dim=1))
        attention_mask = None
-        
-    # Reference images
-    if hidden_states_ref is not None:
-        # RoPE
-        image_ids_ref = dit.prepare_image_ids(hidden_states_ref)
-        idx = torch.arange(0, image_ids_ref.shape[0]).to(dtype=hidden_states.dtype, device=hidden_states.device) * 100
-        image_rotary_emb_ref = reference_embedder(image_ids_ref, idx, dtype=hidden_states.dtype, device=hidden_states.device)
-        image_rotary_emb = torch.cat((image_rotary_emb, image_rotary_emb_ref), dim=2)
-        # hidden_states
-        original_hidden_states_length = hidden_states.shape[1]
-        hidden_states_ref = dit.patchify(hidden_states_ref)
-        hidden_states_ref = dit.x_embedder(hidden_states_ref)
-        hidden_states_ref = rearrange(hidden_states_ref, "B L C -> 1 (B L) C")
-        hidden_states_ref = reference_embedder.proj(hidden_states_ref)
-        hidden_states = torch.cat((hidden_states, hidden_states_ref), dim=1)

    # TeaCache
    if tea_cache is not None:
        tea_cache_update = tea_cache.check(dit, hidden_states, conditioning)
    else:
        tea_cache_update = False
-        
-    def create_custom_forward(module):
-        def custom_forward(*inputs):
-            return module(*inputs)
-        return custom_forward

    if tea_cache_update:
        hidden_states = tea_cache.update(hidden_states)
    else:
        # Joint Blocks
        for block_id, block in enumerate(dit.blocks):
-            if use_gradient_checkpointing:
-                hidden_states, prompt_emb = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(block),
-                    hidden_states, prompt_emb, conditioning, image_rotary_emb, attention_mask, ipadapter_kwargs_list.get(block_id, None),
-                    use_reentrant=False,
-                )
-            else:
-                hidden_states, prompt_emb = block(
-                    hidden_states,
-                    prompt_emb,
-                    conditioning,
-                    image_rotary_emb,
-                    attention_mask,
-                    ipadapter_kwargs_list=ipadapter_kwargs_list.get(block_id, None)
-                )
+            hidden_states, prompt_emb = block(
+                hidden_states,
+                prompt_emb,
+                conditioning,
+                image_rotary_emb,
+                attention_mask,
+                ipadapter_kwargs_list=ipadapter_kwargs_list.get(block_id, None)
+            )
            # ControlNet
            if controlnet is not None and controlnet_frames is not None:
                hidden_states = hidden_states + controlnet_res_stack[block_id]
@@ -751,21 +795,14 @@ def lets_dance_flux(
        hidden_states = torch.cat([prompt_emb, hidden_states], dim=1)
        num_joint_blocks = len(dit.blocks)
        for block_id, block in enumerate(dit.single_blocks):
-            if use_gradient_checkpointing:
-                hidden_states, prompt_emb = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(block),
-                    hidden_states, prompt_emb, conditioning, image_rotary_emb, attention_mask, ipadapter_kwargs_list.get(block_id + num_joint_blocks, None),
-                    use_reentrant=False,
-                )
-            else:
-                hidden_states, prompt_emb = block(
-                    hidden_states,
-                    prompt_emb,
-                    conditioning,
-                    image_rotary_emb,
-                    attention_mask,
-                    ipadapter_kwargs_list=ipadapter_kwargs_list.get(block_id + num_joint_blocks, None)
-                )
+            hidden_states, prompt_emb = block(
+                hidden_states,
+                prompt_emb,
+                conditioning,
+                image_rotary_emb,
+                attention_mask,
+                ipadapter_kwargs_list=ipadapter_kwargs_list.get(block_id + num_joint_blocks, None)
+            )
            # ControlNet
            if controlnet is not None and controlnet_frames is not None:
                hidden_states[:, prompt_emb.shape[1]:] = hidden_states[:, prompt_emb.shape[1]:] + controlnet_single_res_stack[block_id]
@@ -774,10 +811,13 @@ def lets_dance_flux(
        if tea_cache is not None:
            tea_cache.store(hidden_states)

-    if hidden_states_ref is not None:
-        hidden_states = hidden_states[:, :original_hidden_states_length]
    hidden_states = dit.final_norm_out(hidden_states, conditioning)
    hidden_states = dit.final_proj_out(hidden_states)
+    
+    # Step1x
+    if step1x_reference_latents is not None:
+        hidden_states = hidden_states[:, :hidden_states.shape[1] // 2]
+
    hidden_states = dit.unpatchify(hidden_states, height, width)

    return hidden_states
--- a/diffsynth/pipelines/wan_video.py
+++ b/diffsynth/pipelines/wan_video.py
@@ -4,6 +4,7 @@ from ..models.wan_video_dit import WanModel
 from ..models.wan_video_text_encoder import WanTextEncoder
 from ..models.wan_video_vae import WanVideoVAE
 from ..models.wan_video_image_encoder import WanImageEncoder
+from ..models.wan_video_vace import VaceWanModel
 from ..schedulers.flow_match import FlowMatchScheduler
 from .base import BasePipeline
 from ..prompters import WanPrompter
@@ -33,7 +34,8 @@ class WanVideoPipeline(BasePipeline):
        self.dit: WanModel = None
        self.vae: WanVideoVAE = None
        self.motion_controller: WanMotionControllerModel = None
-        self.model_names = ['text_encoder', 'dit', 'vae', 'image_encoder', 'motion_controller']
+        self.vace: VaceWanModel = None
+        self.model_names = ['text_encoder', 'dit', 'vae', 'image_encoder', 'motion_controller', 'vace']
        self.height_division_factor = 16
        self.width_division_factor = 16
        self.use_unified_sequence_parallel = False
@@ -66,6 +68,7 @@ class WanVideoPipeline(BasePipeline):
                torch.nn.Conv3d: AutoWrappedModule,
                torch.nn.LayerNorm: AutoWrappedModule,
                RMSNorm: AutoWrappedModule,
+                torch.nn.Conv2d: AutoWrappedModule,
            },
            module_config = dict(
                offload_dtype=dtype,
@@ -140,6 +143,24 @@ class WanVideoPipeline(BasePipeline):
                    computation_device=self.device,
                ),
            )
+        if self.vace is not None:
+            enable_vram_management(
+                self.vace,
+                module_map = {
+                    torch.nn.Linear: AutoWrappedLinear,
+                    torch.nn.Conv3d: AutoWrappedModule,
+                    torch.nn.LayerNorm: AutoWrappedModule,
+                    RMSNorm: AutoWrappedModule,
+                },
+                module_config = dict(
+                    offload_dtype=dtype,
+                    offload_device="cpu",
+                    onload_dtype=dtype,
+                    onload_device=self.device,
+                    computation_dtype=self.torch_dtype,
+                    computation_device=self.device,
+                ),
+            )
        self.enable_cpu_offload()


@@ -153,6 +174,7 @@ class WanVideoPipeline(BasePipeline):
        self.vae = model_manager.fetch_model("wan_video_vae")
        self.image_encoder = model_manager.fetch_model("wan_video_image_encoder")
        self.motion_controller = model_manager.fetch_model("wan_video_motion_controller")
+        self.vace = model_manager.fetch_model("wan_video_vace")


    @staticmethod
@@ -182,7 +204,7 @@ class WanVideoPipeline(BasePipeline):
        return {"context": prompt_emb}
    
    
-    def encode_image(self, image, end_image, num_frames, height, width):
+    def encode_image(self, image, end_image, num_frames, height, width, tiled=False, tile_size=(34, 34), tile_stride=(18, 16)):
        image = self.preprocess_image(image.resize((width, height))).to(self.device)
        clip_context = self.image_encoder.encode_image([image])
        msk = torch.ones(1, num_frames, height//8, width//8, device=self.device)
@@ -190,6 +212,8 @@ class WanVideoPipeline(BasePipeline):
        if end_image is not None:
            end_image = self.preprocess_image(end_image.resize((width, height))).to(self.device)
            vae_input = torch.concat([image.transpose(0,1), torch.zeros(3, num_frames-2, height, width).to(image.device), end_image.transpose(0,1)],dim=1)
+            if self.dit.has_image_pos_emb:
+                clip_context = torch.concat([clip_context, self.image_encoder.encode_image([end_image])], dim=1)
            msk[:, -1:] = 1
        else:
            vae_input = torch.concat([image.transpose(0, 1), torch.zeros(3, num_frames-1, height, width).to(image.device)], dim=1)
@@ -198,7 +222,8 @@ class WanVideoPipeline(BasePipeline):
        msk = msk.view(1, msk.shape[1] // 4, 4, height//8, width//8)
        msk = msk.transpose(1, 2)[0]
        
-        y = self.vae.encode([vae_input.to(dtype=self.torch_dtype, device=self.device)], device=self.device)[0]
+        y = self.vae.encode([vae_input.to(dtype=self.torch_dtype, device=self.device)], device=self.device, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride)[0]
+        y = y.to(dtype=self.torch_dtype, device=self.device)
        y = torch.concat([msk, y])
        y = y.unsqueeze(0)
        clip_context = clip_context.to(dtype=self.torch_dtype, device=self.device)
@@ -213,6 +238,18 @@ class WanVideoPipeline(BasePipeline):
        return latents
    
    
+    def prepare_reference_image(self, reference_image, height, width):
+        if reference_image is not None:
+            self.load_models_to_device(["vae"])
+            reference_image = reference_image.resize((width, height))
+            reference_image = self.preprocess_images([reference_image])
+            reference_image = torch.stack(reference_image, dim=2).to(dtype=self.torch_dtype, device=self.device)
+            reference_latents = self.vae.encode(reference_image, device=self.device)
+            return {"reference_latents": reference_latents}
+        else:
+            return {}
+    
+    
    def prepare_controlnet_kwargs(self, control_video, num_frames, height, width, clip_feature=None, y=None, tiled=True, tile_size=(34, 34), tile_stride=(18, 16)):
        if control_video is not None:
            control_latents = self.encode_control_video(control_video, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride)
@@ -253,6 +290,57 @@ class WanVideoPipeline(BasePipeline):
    def prepare_motion_bucket_id(self, motion_bucket_id):
        motion_bucket_id = torch.Tensor((motion_bucket_id,)).to(dtype=self.torch_dtype, device=self.device)
        return {"motion_bucket_id": motion_bucket_id}
+    
+    
+    def prepare_vace_kwargs(
+        self,
+        latents,
+        vace_video=None, vace_mask=None, vace_reference_image=None, vace_scale=1.0,
+        height=480, width=832, num_frames=81,
+        seed=None, rand_device="cpu",
+        tiled=True, tile_size=(34, 34), tile_stride=(18, 16)
+    ):
+        if vace_video is not None or vace_mask is not None or vace_reference_image is not None:
+            self.load_models_to_device(["vae"])
+            if vace_video is None:
+                vace_video = torch.zeros((1, 3, num_frames, height, width), dtype=self.torch_dtype, device=self.device)
+            else:
+                vace_video = self.preprocess_images(vace_video)
+                vace_video = torch.stack(vace_video, dim=2).to(dtype=self.torch_dtype, device=self.device)
+            
+            if vace_mask is None:
+                vace_mask = torch.ones_like(vace_video)
+            else:
+                vace_mask = self.preprocess_images(vace_mask)
+                vace_mask = torch.stack(vace_mask, dim=2).to(dtype=self.torch_dtype, device=self.device)
+            
+            inactive = vace_video * (1 - vace_mask) + 0 * vace_mask
+            reactive = vace_video * vace_mask + 0 * (1 - vace_mask)
+            inactive = self.encode_video(inactive, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride).to(dtype=self.torch_dtype, device=self.device)
+            reactive = self.encode_video(reactive, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride).to(dtype=self.torch_dtype, device=self.device)
+            vace_video_latents = torch.concat((inactive, reactive), dim=1)
+            
+            vace_mask_latents = rearrange(vace_mask[0,0], "T (H P) (W Q) -> 1 (P Q) T H W", P=8, Q=8)
+            vace_mask_latents = torch.nn.functional.interpolate(vace_mask_latents, size=((vace_mask_latents.shape[2] + 3) // 4, vace_mask_latents.shape[3], vace_mask_latents.shape[4]), mode='nearest-exact')
+            
+            if vace_reference_image is None:
+                pass
+            else:
+                vace_reference_image = self.preprocess_images([vace_reference_image])
+                vace_reference_image = torch.stack(vace_reference_image, dim=2).to(dtype=self.torch_dtype, device=self.device)
+                vace_reference_latents = self.encode_video(vace_reference_image, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride).to(dtype=self.torch_dtype, device=self.device)
+                vace_reference_latents = torch.concat((vace_reference_latents, torch.zeros_like(vace_reference_latents)), dim=1)
+                vace_video_latents = torch.concat((vace_reference_latents, vace_video_latents), dim=2)
+                vace_mask_latents = torch.concat((torch.zeros_like(vace_mask_latents[:, :, :1]), vace_mask_latents), dim=2)
+                
+                noise = self.generate_noise((1, 16, 1, latents.shape[3], latents.shape[4]), seed=seed, device=rand_device, dtype=torch.float32)
+                noise = noise.to(dtype=self.torch_dtype, device=self.device)
+                latents = torch.concat((noise, latents), dim=2)
+            
+            vace_context = torch.concat((vace_video_latents, vace_mask_latents), dim=1)
+            return latents, {"vace_context": vace_context, "vace_scale": vace_scale}
+        else:
+            return latents, {"vace_context": None, "vace_scale": vace_scale}


    @torch.no_grad()
@@ -264,6 +352,11 @@ class WanVideoPipeline(BasePipeline):
        end_image=None,
        input_video=None,
        control_video=None,
+        reference_image=None,
+        vace_video=None,
+        vace_video_mask=None,
+        vace_reference_image=None,
+        vace_scale=1.0,
        denoising_strength=1.0,
        seed=None,
        rand_device="cpu",
@@ -315,10 +408,13 @@ class WanVideoPipeline(BasePipeline):
        # Encode image
        if input_image is not None and self.image_encoder is not None:
            self.load_models_to_device(["image_encoder", "vae"])
-            image_emb = self.encode_image(input_image, end_image, num_frames, height, width)
+            image_emb = self.encode_image(input_image, end_image, num_frames, height, width, **tiler_kwargs)
        else:
            image_emb = {}
            
+        # Reference image
+        reference_image_kwargs = self.prepare_reference_image(reference_image, height, width)
+            
        # ControlNet
        if control_video is not None:
            self.load_models_to_device(["image_encoder", "vae"])
@@ -333,6 +429,12 @@ class WanVideoPipeline(BasePipeline):
        # Extra input
        extra_input = self.prepare_extra_input(latents)
        
+        # VACE
+        latents, vace_kwargs = self.prepare_vace_kwargs(
+            latents, vace_video, vace_video_mask, vace_reference_image, vace_scale,
+            height=height, width=width, num_frames=num_frames, seed=seed, rand_device=rand_device, **tiler_kwargs
+        )
+        
        # TeaCache
        tea_cache_posi = {"tea_cache": TeaCache(num_inference_steps, rel_l1_thresh=tea_cache_l1_thresh, model_id=tea_cache_model_id) if tea_cache_l1_thresh is not None else None}
        tea_cache_nega = {"tea_cache": TeaCache(num_inference_steps, rel_l1_thresh=tea_cache_l1_thresh, model_id=tea_cache_model_id) if tea_cache_l1_thresh is not None else None}
@@ -341,23 +443,23 @@ class WanVideoPipeline(BasePipeline):
        usp_kwargs = self.prepare_unified_sequence_parallel()

        # Denoise
-        self.load_models_to_device(["dit", "motion_controller"])
+        self.load_models_to_device(["dit", "motion_controller", "vace"])
        for progress_id, timestep in enumerate(progress_bar_cmd(self.scheduler.timesteps)):
            timestep = timestep.unsqueeze(0).to(dtype=self.torch_dtype, device=self.device)

            # Inference
            noise_pred_posi = model_fn_wan_video(
-                self.dit, motion_controller=self.motion_controller,
+                self.dit, motion_controller=self.motion_controller, vace=self.vace,
                x=latents, timestep=timestep,
                **prompt_emb_posi, **image_emb, **extra_input,
-                **tea_cache_posi, **usp_kwargs, **motion_kwargs
+                **tea_cache_posi, **usp_kwargs, **motion_kwargs, **vace_kwargs, **reference_image_kwargs,
            )
            if cfg_scale != 1.0:
                noise_pred_nega = model_fn_wan_video(
-                    self.dit, motion_controller=self.motion_controller,
+                    self.dit, motion_controller=self.motion_controller, vace=self.vace,
                    x=latents, timestep=timestep,
                    **prompt_emb_nega, **image_emb, **extra_input,
-                    **tea_cache_nega, **usp_kwargs, **motion_kwargs
+                    **tea_cache_nega, **usp_kwargs, **motion_kwargs, **vace_kwargs, **reference_image_kwargs,
                )
                noise_pred = noise_pred_nega + cfg_scale * (noise_pred_posi - noise_pred_nega)
            else:
@@ -365,6 +467,9 @@ class WanVideoPipeline(BasePipeline):

            # Scheduler
            latents = self.scheduler.step(noise_pred, self.scheduler.timesteps[progress_id], latents)
+            
+        if vace_reference_image is not None:
+            latents = latents[:, :, 1:]

        # Decode
        self.load_models_to_device(['vae'])
@@ -432,11 +537,15 @@ class TeaCache:
 def model_fn_wan_video(
    dit: WanModel,
    motion_controller: WanMotionControllerModel = None,
+    vace: VaceWanModel = None,
    x: torch.Tensor = None,
    timestep: torch.Tensor = None,
    context: torch.Tensor = None,
    clip_feature: Optional[torch.Tensor] = None,
    y: Optional[torch.Tensor] = None,
+    reference_latents = None,
+    vace_context = None,
+    vace_scale = 1.0,
    tea_cache: TeaCache = None,
    use_unified_sequence_parallel: bool = False,
    motion_bucket_id: Optional[torch.Tensor] = None,
@@ -461,6 +570,12 @@ def model_fn_wan_video(
    
    x, (f, h, w) = dit.patchify(x)
    
+    # Reference image
+    if reference_latents is not None:
+        reference_latents = dit.ref_conv(reference_latents[:, :, 0]).flatten(2).transpose(1, 2)
+        x = torch.concat([reference_latents, x], dim=1)
+        f += 1
+    
    freqs = torch.cat([
        dit.freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
        dit.freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
@@ -472,6 +587,9 @@ def model_fn_wan_video(
        tea_cache_update = tea_cache.check(dit, x, t_mod)
    else:
        tea_cache_update = False
+        
+    if vace_context is not None:
+        vace_hints = vace(x, vace_context, context, t_mod, freqs)
    
    # blocks
    if use_unified_sequence_parallel:
@@ -480,10 +598,16 @@ def model_fn_wan_video(
    if tea_cache_update:
        x = tea_cache.update(x)
    else:
-        for block in dit.blocks:
+        for block_id, block in enumerate(dit.blocks):
            x = block(x, context, t_mod, freqs)
+            if vace_context is not None and block_id in vace.vace_layers_mapping:
+                x = x + vace_hints[vace.vace_layers_mapping[block_id]] * vace_scale
        if tea_cache is not None:
            tea_cache.store(x)
+            
+    if reference_latents is not None:
+        x = x[:, reference_latents.shape[1]:]
+        f -= 1

    x = dit.head(x, t)
    if use_unified_sequence_parallel:
--- a/diffsynth/pipelines/wan_video_new.py
+++ b/diffsynth/pipelines/wan_video_new.py
--- a/diffsynth/schedulers/flow_match.py
+++ b/diffsynth/schedulers/flow_match.py
@@ -35,6 +35,9 @@ class FlowMatchScheduler():
            y_shifted = y - y.min()
            bsmntw_weighing = y_shifted * (num_inference_steps / y_shifted.sum())
            self.linear_timesteps_weights = bsmntw_weighing
+            self.training = True
+        else:
+            self.training = False


    def step(self, model_output, timestep, sample, to_final=False, **kwargs):
--- a/diffsynth/trainers/utils.py
+++ b/diffsynth/trainers/utils.py
@@ -0,0 +1,304 @@
+import imageio, os, torch, warnings, torchvision, argparse
+from peft import LoraConfig, inject_adapter_in_model
+from PIL import Image
+import pandas as pd
+from tqdm import tqdm
+from accelerate import Accelerator
+
+
+
+class VideoDataset(torch.utils.data.Dataset):
+    def __init__(
+        self,
+        base_path=None, metadata_path=None,
+        num_frames=81,
+        time_division_factor=4, time_division_remainder=1,
+        max_pixels=1920*1080, height=None, width=None,
+        height_division_factor=16, width_division_factor=16,
+        data_file_keys=("video",),
+        image_file_extension=("jpg", "jpeg", "png", "webp"),
+        video_file_extension=("mp4", "avi", "mov", "wmv", "mkv", "flv", "webm"),
+        repeat=1,
+        args=None,
+    ):
+        if args is not None:
+            base_path = args.dataset_base_path
+            metadata_path = args.dataset_metadata_path
+            height = args.height
+            width = args.width
+            max_pixels = args.max_pixels
+            num_frames = args.num_frames
+            data_file_keys = args.data_file_keys.split(",")
+            repeat = args.dataset_repeat
+        
+        self.base_path = base_path
+        self.num_frames = num_frames
+        self.time_division_factor = time_division_factor
+        self.time_division_remainder = time_division_remainder
+        self.max_pixels = max_pixels
+        self.height = height
+        self.width = width
+        self.height_division_factor = height_division_factor
+        self.width_division_factor = width_division_factor
+        self.data_file_keys = data_file_keys
+        self.image_file_extension = image_file_extension
+        self.video_file_extension = video_file_extension
+        self.repeat = repeat
+        
+        if height is not None and width is not None:
+            print("Height and width are fixed. Setting `dynamic_resolution` to False.")
+            self.dynamic_resolution = False
+        elif height is None and width is None:
+            print("Height and width are none. Setting `dynamic_resolution` to True.")
+            self.dynamic_resolution = True
+            
+        if metadata_path is None:
+            print("No metadata. Trying to generate it.")
+            metadata = self.generate_metadata(base_path)
+            print(f"{len(metadata)} lines in metadata.")
+        else:
+            metadata = pd.read_csv(metadata_path)
+        self.data = [metadata.iloc[i].to_dict() for i in range(len(metadata))]
+            
+    
+    def generate_metadata(self, folder):
+        video_list, prompt_list = [], []
+        file_set = set(os.listdir(folder))
+        for file_name in file_set:
+            if "." not in file_name:
+                continue
+            file_ext_name = file_name.split(".")[-1].lower()
+            file_base_name = file_name[:-len(file_ext_name)-1]
+            if file_ext_name not in self.image_file_extension and file_ext_name not in self.video_file_extension:
+                continue
+            prompt_file_name = file_base_name + ".txt"
+            if prompt_file_name not in file_set:
+                continue
+            with open(os.path.join(folder, prompt_file_name), "r", encoding="utf-8") as f:
+                prompt = f.read().strip()
+            video_list.append(file_name)
+            prompt_list.append(prompt)
+        metadata = pd.DataFrame()
+        metadata["video"] = video_list
+        metadata["prompt"] = prompt_list
+        return metadata
+        
+        
+    def crop_and_resize(self, image, target_height, target_width):
+        width, height = image.size
+        scale = max(target_width / width, target_height / height)
+        image = torchvision.transforms.functional.resize(
+            image,
+            (round(height*scale), round(width*scale)),
+            interpolation=torchvision.transforms.InterpolationMode.BILINEAR
+        )
+        image = torchvision.transforms.functional.center_crop(image, (target_height, target_width))
+        return image
+    
+    
+    def get_height_width(self, image):
+        if self.dynamic_resolution:
+            width, height = image.size
+            if width * height > self.max_pixels:
+                scale = (width * height / self.max_pixels) ** 0.5
+                height, width = int(height / scale), int(width / scale)
+            height = height // self.height_division_factor * self.height_division_factor
+            width = width // self.width_division_factor * self.width_division_factor
+        else:
+            height, width = self.height, self.width
+        return height, width
+    
+    
+    def get_num_frames(self, reader):
+        num_frames = self.num_frames
+        if int(reader.count_frames()) < num_frames:
+            num_frames = int(reader.count_frames())
+            while num_frames > 1 and num_frames % self.time_division_factor != self.time_division_remainder:
+                num_frames -= 1
+        return num_frames
+    
+
+    def load_video(self, file_path):
+        reader = imageio.get_reader(file_path)
+        num_frames = self.get_num_frames(reader)
+        frames = []
+        for frame_id in range(num_frames):
+            frame = reader.get_data(frame_id)
+            frame = Image.fromarray(frame)
+            frame = self.crop_and_resize(frame, *self.get_height_width(frame))
+            frames.append(frame)
+        reader.close()
+        return frames
+    
+    
+    def load_image(self, file_path):
+        image = Image.open(file_path).convert("RGB")
+        image = self.crop_and_resize(image, *self.get_height_width(image))
+        return image
+    
+    
+    def is_image(self, file_path):
+        file_ext_name = file_path.split(".")[-1]
+        return file_ext_name.lower() in self.image_file_extension
+    
+    
+    def is_video(self, file_path):
+        file_ext_name = file_path.split(".")[-1]
+        return file_ext_name.lower() in self.video_file_extension
+    
+    
+    def load_data(self, file_path):
+        if self.is_image(file_path):
+            return self.load_image(file_path)
+        elif self.is_video(file_path):
+            return self.load_video(file_path)
+        else:
+            return None
+
+
+    def __getitem__(self, data_id):
+        data = self.data[data_id % len(self.data)].copy()
+        for key in self.data_file_keys:
+            if key in data:
+                path = os.path.join(self.base_path, data[key])
+                data[key] = self.load_data(path)
+                if data[key] is None:
+                    warnings.warn(f"cannot load file {data[key]}.")
+                    return None
+        return data
+    
+
+    def __len__(self):
+        return len(self.data) * self.repeat
+
+
+
+class DiffusionTrainingModule(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        
+        
+    def to(self, *args, **kwargs):
+        for name, model in self.named_children():
+            model.to(*args, **kwargs)
+        return self
+        
+        
+    def trainable_modules(self):
+        trainable_modules = filter(lambda p: p.requires_grad, self.parameters())
+        return trainable_modules
+    
+    
+    def trainable_param_names(self):
+        trainable_param_names = list(filter(lambda named_param: named_param[1].requires_grad, self.named_parameters()))
+        trainable_param_names = set([named_param[0] for named_param in trainable_param_names])
+        return trainable_param_names
+    
+    
+    def add_lora_to_model(self, model, target_modules, lora_rank, lora_alpha=None):
+        if lora_alpha is None:
+            lora_alpha = lora_rank
+        lora_config = LoraConfig(r=lora_rank, lora_alpha=lora_alpha, target_modules=target_modules)
+        model = inject_adapter_in_model(lora_config, model)
+        return model
+    
+    
+    def export_trainable_state_dict(self, state_dict, remove_prefix=None):
+        trainable_param_names = self.trainable_param_names()
+        state_dict = {name: param for name, param in state_dict.items() if name in trainable_param_names}
+        if remove_prefix is not None:
+            state_dict_ = {}
+            for name, param in state_dict.items():
+                if name.startswith(remove_prefix):
+                    name = name[len(remove_prefix):]
+                state_dict_[name] = param
+            state_dict = state_dict_
+        return state_dict
+
+
+
+class ModelLogger:
+    def __init__(self, output_path, remove_prefix_in_ckpt=None):
+        self.output_path = output_path
+        self.remove_prefix_in_ckpt = remove_prefix_in_ckpt
+        
+    
+    def on_step_end(self, loss):
+        pass
+    
+    
+    def on_epoch_end(self, accelerator, model, epoch_id):
+        accelerator.wait_for_everyone()
+        if accelerator.is_main_process:
+            state_dict = accelerator.get_state_dict(model)
+            state_dict = accelerator.unwrap_model(model).export_trainable_state_dict(state_dict, remove_prefix=self.remove_prefix_in_ckpt)
+            os.makedirs(self.output_path, exist_ok=True)
+            path = os.path.join(self.output_path, f"epoch-{epoch_id}.safetensors")
+            accelerator.save(state_dict, path, safe_serialization=True)
+
+
+
+def launch_training_task(
+    dataset: torch.utils.data.Dataset,
+    model: DiffusionTrainingModule,
+    model_logger: ModelLogger,
+    optimizer: torch.optim.Optimizer,
+    scheduler: torch.optim.lr_scheduler.LRScheduler,
+    num_epochs: int = 1,
+    gradient_accumulation_steps: int = 1,
+):
+    dataloader = torch.utils.data.DataLoader(dataset, shuffle=True, collate_fn=lambda x: x[0])
+    accelerator = Accelerator(gradient_accumulation_steps=gradient_accumulation_steps)
+    model, optimizer, dataloader, scheduler = accelerator.prepare(model, optimizer, dataloader, scheduler)
+    
+    for epoch_id in range(num_epochs):
+        for data in tqdm(dataloader):
+            with accelerator.accumulate(model):
+                optimizer.zero_grad()
+                loss = model(data)
+                accelerator.backward(loss)
+                optimizer.step()
+                model_logger.on_step_end(loss)
+                scheduler.step()
+        model_logger.on_epoch_end(accelerator, model, epoch_id)
+
+
+
+def launch_data_process_task(model: DiffusionTrainingModule, dataset, output_path="./models"):
+    dataloader = torch.utils.data.DataLoader(dataset, shuffle=False, collate_fn=lambda x: x[0])
+    accelerator = Accelerator()
+    model, dataloader = accelerator.prepare(model, dataloader)
+    os.makedirs(os.path.join(output_path, "data_cache"), exist_ok=True)
+    for data_id, data in enumerate(tqdm(dataloader)):
+        with torch.no_grad():
+            inputs = model.forward_preprocess(data)
+            inputs = {key: inputs[key] for key in model.model_input_keys if key in inputs}
+            torch.save(inputs, os.path.join(output_path, "data_cache", f"{data_id}.pth"))
+
+
+
+def wan_parser():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument("--dataset_base_path", type=str, default="", required=True, help="Base path of the dataset.")
+    parser.add_argument("--dataset_metadata_path", type=str, default=None, help="Path to the metadata file of the dataset.")
+    parser.add_argument("--max_pixels", type=int, default=1280*720, help="Maximum number of pixels per frame, used for dynamic resolution..")
+    parser.add_argument("--height", type=int, default=None, help="Height of images or videos. Leave `height` and `width` empty to enable dynamic resolution.")
+    parser.add_argument("--width", type=int, default=None, help="Width of images or videos. Leave `height` and `width` empty to enable dynamic resolution.")
+    parser.add_argument("--num_frames", type=int, default=81, help="Number of frames per video. Frames are sampled from the video prefix.")
+    parser.add_argument("--data_file_keys", type=str, default="image,video", help="Data file keys in the metadata. Comma-separated.")
+    parser.add_argument("--dataset_repeat", type=int, default=1, help="Number of times to repeat the dataset per epoch.")
+    parser.add_argument("--model_paths", type=str, default=None, help="Paths to load models. In JSON format.")
+    parser.add_argument("--model_id_with_origin_paths", type=str, default=None, help="Model ID with origin paths, e.g., Wan-AI/Wan2.1-T2V-1.3B:diffusion_pytorch_model*.safetensors. Comma-separated.")
+    parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate.")
+    parser.add_argument("--num_epochs", type=int, default=1, help="Number of epochs.")
+    parser.add_argument("--output_path", type=str, default="./models", help="Output save path.")
+    parser.add_argument("--remove_prefix_in_ckpt", type=str, default="pipe.dit.", help="Remove prefix in ckpt.")
+    parser.add_argument("--trainable_models", type=str, default=None, help="Models to train, e.g., dit, vae, text_encoder.")
+    parser.add_argument("--lora_base_model", type=str, default=None, help="Which model LoRA is added to.")
+    parser.add_argument("--lora_target_modules", type=str, default="q,k,v,o,ffn.0,ffn.2", help="Which layers LoRA is added to.")
+    parser.add_argument("--lora_rank", type=int, default=32, help="Rank of LoRA.")
+    parser.add_argument("--extra_inputs", default=None, help="Additional model inputs, comma-separated.")
+    parser.add_argument("--use_gradient_checkpointing_offload", default=False, action="store_true", help="Whether to offload gradient checkpointing to CPU memory.")
+    parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Gradient accumulation steps.")
+    return parser
+
--- a/diffsynth/vram_management/layers.py
+++ b/diffsynth/vram_management/layers.py
@@ -8,8 +8,33 @@ def cast_to(weight, dtype, device):
    return r


-class AutoWrappedModule(torch.nn.Module):
-    def __init__(self, module: torch.nn.Module, offload_dtype, offload_device, onload_dtype, onload_device, computation_dtype, computation_device):
+class AutoTorchModule(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        
+    def check_free_vram(self):
+        gpu_mem_state = torch.cuda.mem_get_info(self.computation_device)
+        used_memory = (gpu_mem_state[1] - gpu_mem_state[0]) / (1024 ** 3)
+        return used_memory < self.vram_limit
+
+    def offload(self):
+        if self.state != 0:
+            self.to(dtype=self.offload_dtype, device=self.offload_device)
+            self.state = 0
+
+    def onload(self):
+        if self.state != 1:
+            self.to(dtype=self.onload_dtype, device=self.onload_device)
+            self.state = 1
+            
+    def keep(self):
+        if self.state != 2:
+            self.to(dtype=self.computation_dtype, device=self.computation_device)
+            self.state = 2
+
+
+class AutoWrappedModule(AutoTorchModule):
+    def __init__(self, module: torch.nn.Module, offload_dtype, offload_device, onload_dtype, onload_device, computation_dtype, computation_device, vram_limit, **kwargs):
        super().__init__()
        self.module = module.to(dtype=offload_dtype, device=offload_device)
        self.offload_dtype = offload_dtype
@@ -18,28 +43,57 @@ class AutoWrappedModule(torch.nn.Module):
        self.onload_device = onload_device
        self.computation_dtype = computation_dtype
        self.computation_device = computation_device
+        self.vram_limit = vram_limit
        self.state = 0

-    def offload(self):
-        if self.state == 1 and (self.offload_dtype != self.onload_dtype or self.offload_device != self.onload_device):
-            self.module.to(dtype=self.offload_dtype, device=self.offload_device)
-            self.state = 0
-
-    def onload(self):
-        if self.state == 0 and (self.offload_dtype != self.onload_dtype or self.offload_device != self.onload_device):
-            self.module.to(dtype=self.onload_dtype, device=self.onload_device)
-            self.state = 1
-
    def forward(self, *args, **kwargs):
-        if self.onload_dtype == self.computation_dtype and self.onload_device == self.computation_device:
+        if self.state == 2:
            module = self.module
        else:
-            module = copy.deepcopy(self.module).to(dtype=self.computation_dtype, device=self.computation_device)
+            if self.onload_dtype == self.computation_dtype and self.onload_device == self.computation_device:
+                module = self.module
+            elif self.vram_limit is not None and self.check_free_vram():
+                self.keep()
+                module = self.module
+            else:
+                module = copy.deepcopy(self.module).to(dtype=self.computation_dtype, device=self.computation_device)
        return module(*args, **kwargs)
    

-class AutoWrappedLinear(torch.nn.Linear):
-    def __init__(self, module: torch.nn.Linear, offload_dtype, offload_device, onload_dtype, onload_device, computation_dtype, computation_device):
+class WanAutoCastLayerNorm(torch.nn.LayerNorm, AutoTorchModule):
+    def __init__(self, module: torch.nn.LayerNorm, offload_dtype, offload_device, onload_dtype, onload_device, computation_dtype, computation_device, vram_limit, **kwargs):
+        with init_weights_on_device(device=torch.device("meta")):
+            super().__init__(module.normalized_shape, eps=module.eps, elementwise_affine=module.elementwise_affine, bias=module.bias is not None, dtype=offload_dtype, device=offload_device)
+        self.weight = module.weight
+        self.bias = module.bias
+        self.offload_dtype = offload_dtype
+        self.offload_device = offload_device
+        self.onload_dtype = onload_dtype
+        self.onload_device = onload_device
+        self.computation_dtype = computation_dtype
+        self.computation_device = computation_device
+        self.vram_limit = vram_limit
+        self.state = 0
+
+    def forward(self, x, *args, **kwargs):
+        if self.state == 2:
+            weight, bias = self.weight, self.bias
+        else:
+            if self.onload_dtype == self.computation_dtype and self.onload_device == self.computation_device:
+                weight, bias = self.weight, self.bias
+            elif self.vram_limit is not None and self.check_free_vram():
+                self.keep()
+                weight, bias = self.weight, self.bias
+            else:
+                weight = None if self.weight is None else cast_to(self.weight, self.computation_dtype, self.computation_device)
+                bias = None if self.bias is None else cast_to(self.bias, self.computation_dtype, self.computation_device)
+        with torch.amp.autocast(device_type=x.device.type):
+            x = torch.nn.functional.layer_norm(x.float(), self.normalized_shape, weight, bias, self.eps).type_as(x)
+        return x
+    
+
+class AutoWrappedLinear(torch.nn.Linear, AutoTorchModule):
+    def __init__(self, module: torch.nn.Linear, offload_dtype, offload_device, onload_dtype, onload_device, computation_dtype, computation_device, vram_limit, name="", **kwargs):
        with init_weights_on_device(device=torch.device("meta")):
            super().__init__(in_features=module.in_features, out_features=module.out_features, bias=module.bias is not None, dtype=offload_dtype, device=offload_device)
        self.weight = module.weight
@@ -50,29 +104,28 @@ class AutoWrappedLinear(torch.nn.Linear):
        self.onload_device = onload_device
        self.computation_dtype = computation_dtype
        self.computation_device = computation_device
+        self.vram_limit = vram_limit
        self.state = 0
-
-    def offload(self):
-        if self.state == 1 and (self.offload_dtype != self.onload_dtype or self.offload_device != self.onload_device):
-            self.to(dtype=self.offload_dtype, device=self.offload_device)
-            self.state = 0
-
-    def onload(self):
-        if self.state == 0 and (self.offload_dtype != self.onload_dtype or self.offload_device != self.onload_device):
-            self.to(dtype=self.onload_dtype, device=self.onload_device)
-            self.state = 1
+        self.name = name

    def forward(self, x, *args, **kwargs):
-        if self.onload_dtype == self.computation_dtype and self.onload_device == self.computation_device:
+        if self.state == 2:
            weight, bias = self.weight, self.bias
        else:
-            weight = cast_to(self.weight, self.computation_dtype, self.computation_device)
-            bias = None if self.bias is None else cast_to(self.bias, self.computation_dtype, self.computation_device)
+            if self.onload_dtype == self.computation_dtype and self.onload_device == self.computation_device:
+                weight, bias = self.weight, self.bias
+            elif self.vram_limit is not None and self.check_free_vram():
+                self.keep()
+                weight, bias = self.weight, self.bias
+            else:
+                weight = cast_to(self.weight, self.computation_dtype, self.computation_device)
+                bias = None if self.bias is None else cast_to(self.bias, self.computation_dtype, self.computation_device)
        return torch.nn.functional.linear(x, weight, bias)


-def enable_vram_management_recursively(model: torch.nn.Module, module_map: dict, module_config: dict, max_num_param=None, overflow_module_config: dict = None, total_num_param=0):
+def enable_vram_management_recursively(model: torch.nn.Module, module_map: dict, module_config: dict, max_num_param=None, overflow_module_config: dict = None, total_num_param=0, vram_limit=None, name_prefix=""):
    for name, module in model.named_children():
+        layer_name = name if name_prefix == "" else name_prefix + "." + name
        for source_module, target_module in module_map.items():
            if isinstance(module, source_module):
                num_param = sum(p.numel() for p in module.parameters())
@@ -80,16 +133,16 @@ def enable_vram_management_recursively(model: torch.nn.Module, module_map: dict,
                    module_config_ = overflow_module_config
                else:
                    module_config_ = module_config
-                module_ = target_module(module, **module_config_)
+                module_ = target_module(module, **module_config_, vram_limit=vram_limit, name=layer_name)
                setattr(model, name, module_)
                total_num_param += num_param
                break
        else:
-            total_num_param = enable_vram_management_recursively(module, module_map, module_config, max_num_param, overflow_module_config, total_num_param)
+            total_num_param = enable_vram_management_recursively(module, module_map, module_config, max_num_param, overflow_module_config, total_num_param, vram_limit=vram_limit, name_prefix=layer_name)
    return total_num_param


-def enable_vram_management(model: torch.nn.Module, module_map: dict, module_config: dict, max_num_param=None, overflow_module_config: dict = None):
-    enable_vram_management_recursively(model, module_map, module_config, max_num_param, overflow_module_config, total_num_param=0)
+def enable_vram_management(model: torch.nn.Module, module_map: dict, module_config: dict, max_num_param=None, overflow_module_config: dict = None, vram_limit=None):
+    enable_vram_management_recursively(model, module_map, module_config, max_num_param, overflow_module_config, total_num_param=0, vram_limit=vram_limit)
    model.vram_management_enabled = True

--- a/examples/image_synthesis/flex_text_to_image.py
+++ b/examples/image_synthesis/flex_text_to_image.py
@@ -0,0 +1,49 @@
+import torch
+from diffsynth import ModelManager, FluxImagePipeline, download_models
+from diffsynth.controlnets.processors import Annotator
+import numpy as np
+from PIL import Image
+
+
+download_models(["FLUX.1-dev"])
+model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cuda")
+model_manager.load_models([
+    "models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
+    "models/FLUX/FLUX.1-dev/text_encoder_2",
+    "models/FLUX/FLUX.1-dev/ae.safetensors",
+    "models/ostris/Flex.2-preview/Flex.2-preview.safetensors"
+])
+pipe = FluxImagePipeline.from_model_manager(model_manager)
+
+image = pipe(
+    prompt="portrait of a beautiful Asian girl, long hair, red t-shirt, sunshine, beach",
+    num_inference_steps=50, embedded_guidance=3.5,
+    seed=0
+)
+image.save("image_1.jpg")
+
+mask = np.zeros((1024, 1024, 3), dtype=np.uint8)
+mask[200:400, 400:700] = 255
+mask = Image.fromarray(mask)
+mask.save("image_mask.jpg")
+
+inpaint_image = image
+
+image = pipe(
+    prompt="portrait of a beautiful Asian girl with sunglasses, long hair, red t-shirt, sunshine, beach",
+    num_inference_steps=50, embedded_guidance=3.5,
+    flex_inpaint_image=inpaint_image, flex_inpaint_mask=mask,
+    seed=4
+)
+image.save("image_2.jpg")
+
+control_image = Annotator("canny")(image)
+control_image.save("image_control.jpg")
+
+image = pipe(
+    prompt="portrait of a beautiful Asian girl with sunglasses, long hair, yellow t-shirt, sunshine, beach",
+    num_inference_steps=50, embedded_guidance=3.5,
+    flex_control_image=control_image,
+    seed=4
+)
+image.save("image_3.jpg")
--- a/examples/step1x/step1x.py
+++ b/examples/step1x/step1x.py
@@ -0,0 +1,35 @@
+import torch
+from diffsynth import FluxImagePipeline, ModelManager
+from modelscope import snapshot_download
+from PIL import Image
+import numpy as np
+
+
+snapshot_download("Qwen/Qwen2.5-VL-7B-Instruct", cache_dir="./models")
+snapshot_download("stepfun-ai/Step1X-Edit", cache_dir="./models")
+
+model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cuda")
+model_manager.load_models([
+    "models/Qwen/Qwen2.5-VL-7B-Instruct",
+    "models/stepfun-ai/Step1X-Edit/step1x-edit-i1258.safetensors",
+    "models/stepfun-ai/Step1X-Edit/vae.safetensors",
+])
+pipe = FluxImagePipeline.from_model_manager(model_manager)
+pipe.enable_vram_management()
+
+image = Image.fromarray(np.zeros((1248, 832, 3), dtype=np.uint8) + 255)
+image = pipe(
+    prompt="draw red flowers in Chinese ink painting style",
+    step1x_reference_image=image,
+    width=832, height=1248, cfg_scale=6,
+    seed=1,
+)
+image.save("image_1.jpg")
+
+image = pipe(
+    prompt="add more flowers in Chinese ink painting style",
+    step1x_reference_image=image,
+    width=832, height=1248, cfg_scale=6,
+    seed=2,
+)
+image.save("image_2.jpg")
--- a/examples/wanvideo/README.md
+++ b/examples/wanvideo/README.md
@@ -1,6 +1,12 @@
-# Wan-Video
+# Wan 2.1

-Wan-Video is a collection of video synthesis models open-sourced by Alibaba.
+[切换到中文](./README_zh.md)
+
+Wan 2.1 is a collection of video synthesis models open-sourced by Alibaba.
+
+**DiffSynth-Studio has adopted a new inference and training framework. To use the previous version, please click [here](https://github.com/modelscope/DiffSynth-Studio/tree/3edf3583b1f08944cee837b94d9f84d669c2729c).**
+
+## Installation

 Before using this model, please install DiffSynth-Studio from **source code**.

@@ -10,257 +16,402 @@ cd DiffSynth-Studio
 pip install -e .
 ```

-## Model Zoo
+## Overview

-|Developer|Name|Link|Scripts|
-|-|-|-|-|
-|Wan Team|1.3B text-to-video|[Link](https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-1.3B)|[wan_1.3b_text_to_video.py](./wan_1.3b_text_to_video.py)|
-|Wan Team|14B text-to-video|[Link](https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-14B)|[wan_14b_text_to_video.py](./wan_14b_text_to_video.py)|
-|Wan Team|14B image-to-video 480P|[Link](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-480P)|[wan_14b_image_to_video.py](./wan_14b_image_to_video.py)|
-|Wan Team|14B image-to-video 720P|[Link](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-720P)|[wan_14b_image_to_video.py](./wan_14b_image_to_video.py)|
-|DiffSynth-Studio Team|1.3B aesthetics LoRA|[Link](https://modelscope.cn/models/DiffSynth-Studio/Wan2.1-1.3b-lora-aesthetics-v1)|Please see the [model card](https://modelscope.cn/models/DiffSynth-Studio/Wan2.1-1.3b-lora-aesthetics-v1).|
-|DiffSynth-Studio Team|1.3B Highres-fix LoRA|[Link](https://modelscope.cn/models/DiffSynth-Studio/Wan2.1-1.3b-lora-highresfix-v1)|Please see the [model card](https://modelscope.cn/models/DiffSynth-Studio/Wan2.1-1.3b-lora-highresfix-v1).|
-|DiffSynth-Studio Team|1.3B ExVideo LoRA|[Link](https://modelscope.cn/models/DiffSynth-Studio/Wan2.1-1.3b-lora-exvideo-v1)|Please see the [model card](https://modelscope.cn/models/DiffSynth-Studio/Wan2.1-1.3b-lora-exvideo-v1).|
-|DiffSynth-Studio Team|1.3B Speed Control adapter|[Link](https://modelscope.cn/models/DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1)|[wan_1.3b_motion_controller.py](./wan_1.3b_motion_controller.py)|
-|PAI Team|1.3B InP|[Link](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-InP)|[wan_fun_InP.py](./wan_fun_InP.py)|
-|PAI Team|14B InP|[Link](https://modelscope.cn/models/PAI/Wan2.1-Fun-14B-InP)|[wan_fun_InP.py](./wan_fun_InP.py)|
-|PAI Team|1.3B Control|[Link](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-Control)|[wan_fun_control.py](./wan_fun_control.py)|
-|PAI Team|14B Control|[Link](https://modelscope.cn/models/PAI/Wan2.1-Fun-14B-Control)|[wan_fun_control.py](./wan_fun_control.py)|
+| Model ID | Extra Parameters | Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
+|-|-|-|-|-|-|-|
+|[Wan-AI/Wan2.1-T2V-1.3B](https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-1.3B)||[code](./model_inference/Wan2.1-T2V-1.3B.py)|[code](./model_training/full/Wan2.1-T2V-1.3B.sh)|[code](./model_training/validate_full/Wan2.1-T2V-1.3B.py)|[code](./model_training/lora/Wan2.1-T2V-1.3B.sh)|[code](./model_training/validate_lora/Wan2.1-T2V-1.3B.py)|
+|[Wan-AI/Wan2.1-T2V-14B](https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-14B)||[code](./model_inference/Wan2.1-T2V-14B.py)|[code](./model_training/full/Wan2.1-T2V-14B.sh)|[code](./model_training/validate_full/Wan2.1-T2V-14B.py)|[code](./model_training/lora/Wan2.1-T2V-14B.sh)|[code](./model_training/validate_lora/Wan2.1-T2V-14B.py)|
+|[Wan-AI/Wan2.1-I2V-14B-480P](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-480P)|`input_image`|[code](./model_inference/Wan2.1-I2V-14B-480P.py)|[code](./model_training/full/Wan2.1-I2V-14B-480P.sh)|[code](./model_training/validate_full/Wan2.1-I2V-14B-480P.py)|[code](./model_training/lora/Wan2.1-I2V-14B-480P.sh)|[code](./model_training/validate_lora/Wan2.1-I2V-14B-480P.py)|
+|[Wan-AI/Wan2.1-I2V-14B-720P](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-720P)|`input_image`|[code](./model_inference/Wan2.1-I2V-14B-720P.py)|[code](./model_training/full/Wan2.1-I2V-14B-720P.sh)|[code](./model_training/validate_full/Wan2.1-I2V-14B-720P.py)|[code](./model_training/lora/Wan2.1-I2V-14B-720P.sh)|[code](./model_training/validate_lora/Wan2.1-I2V-14B-720P.py)|
+|[Wan-AI/Wan2.1-FLF2V-14B-720P](https://modelscope.cn/models/Wan-AI/Wan2.1-FLF2V-14B-720P)|`input_image`, `end_image`|[code](./model_inference/Wan2.1-FLF2V-14B-720P.py)|[code](./model_training/full/Wan2.1-FLF2V-14B-720P.sh)|[code](./model_training/validate_full/Wan2.1-FLF2V-14B-720P.py)|[code](./model_training/lora/Wan2.1-FLF2V-14B-720P.sh)|[code](./model_training/validate_lora/Wan2.1-FLF2V-14B-720P.py)|
+|[PAI/Wan2.1-Fun-1.3B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-InP)|`input_image`, `end_image`|[code](./model_inference/Wan2.1-Fun-1.3B-InP.py)|[code](./model_training/full/Wan2.1-Fun-1.3B-InP.sh)|[code](./model_training/validate_full/Wan2.1-Fun-1.3B-InP.py)|[code](./model_training/lora/Wan2.1-Fun-1.3B-InP.sh)|[code](./model_training/validate_lora/Wan2.1-Fun-1.3B-InP.py)|
+|[PAI/Wan2.1-Fun-1.3B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-Control)|`control_video`|[code](./model_inference/Wan2.1-Fun-1.3B-Control.py)|[code](./model_training/full/Wan2.1-Fun-1.3B-Control.sh)|[code](./model_training/validate_full/Wan2.1-Fun-1.3B-Control.py)|[code](./model_training/lora/Wan2.1-Fun-1.3B-Control.sh)|[code](./model_training/validate_lora/Wan2.1-Fun-1.3B-Control.py)|
+|[PAI/Wan2.1-Fun-14B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-14B-InP)|`input_image`, `end_image`|[code](./model_inference/Wan2.1-Fun-14B-InP.py)|[code](./model_training/full/Wan2.1-Fun-14B-InP.sh)|[code](./model_training/validate_full/Wan2.1-Fun-14B-InP.py)|[code](./model_training/lora/Wan2.1-Fun-14B-InP.sh)|[code](./model_training/validate_lora/Wan2.1-Fun-14B-InP.py)|
+|[PAI/Wan2.1-Fun-14B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-14B-Control)|`control_video`|[code](./model_inference/Wan2.1-Fun-14B-Control.py)|[code](./model_training/full/Wan2.1-Fun-14B-Control.sh)|[code](./model_training/validate_full/Wan2.1-Fun-14B-Control.py)|[code](./model_training/lora/Wan2.1-Fun-14B-Control.sh)|[code](./model_training/validate_lora/Wan2.1-Fun-14B-Control.py)|
+|[PAI/Wan2.1-Fun-V1.1-1.3B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-Control)|`control_video`, `reference_image`|[code](./model_inference/Wan2.1-Fun-V1.1-1.3B-Control.py)|[code](./model_training/full/Wan2.1-Fun-V1.1-1.3B-Control.sh)|[code](./model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control.py)|[code](./model_training/lora/Wan2.1-Fun-V1.1-1.3B-Control.sh)|[code](./model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control.py)|
+|[PAI/Wan2.1-Fun-V1.1-14B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-Control)|`control_video`, `reference_image`|[code](./model_inference/Wan2.1-Fun-V1.1-14B-Control.py)|[code](./model_training/full/Wan2.1-Fun-V1.1-14B-Control.sh)|[code](./model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control.py)|[code](./model_training/lora/Wan2.1-Fun-V1.1-14B-Control.sh)|[code](./model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control.py)|
+|[PAI/Wan2.1-Fun-V1.1-1.3B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-InP)|`input_image`, `end_image`|[code](./model_inference/Wan2.1-Fun-V1.1-1.3B-InP.py)|[code](./model_training/full/Wan2.1-Fun-V1.1-1.3B-InP.sh)|[code](./model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-InP.py)|[code](./model_training/lora/Wan2.1-Fun-V1.1-1.3B-InP.sh)|[code](./model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-InP.py)|
+|[PAI/Wan2.1-Fun-V1.1-14B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-InP)|`input_image`, `end_image`|[code](./model_inference/Wan2.1-Fun-V1.1-14B-InP.py)|[code](./model_training/full/Wan2.1-Fun-V1.1-14B-InP.sh)|[code](./model_training/validate_full/Wan2.1-Fun-V1.1-14B-InP.py)|[code](./model_training/lora/Wan2.1-Fun-V1.1-14B-InP.sh)|[code](./model_training/validate_lora/Wan2.1-Fun-V1.1-14B-InP.py)|
+|[PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera)|`control_camera_video`, `input_image`|[code](./model_inference/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py)|[code](./model_training/full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh)|[code](./model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py)|[code](./model_training/lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh)|[code](./model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py)|
+|[PAI/Wan2.1-Fun-V1.1-14B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-Control-Camera)|`control_camera_video`, `input_image`|[code](./model_inference/Wan2.1-Fun-V1.1-14B-Control-Camera.py)|[code](./model_training/full/Wan2.1-Fun-V1.1-14B-Control-Camera.sh)|[code](./model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control-Camera.py)|[code](./model_training/lora/Wan2.1-Fun-V1.1-14B-Control-Camera.sh)|[code](./model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control-Camera.py)|
+|[iic/VACE-Wan2.1-1.3B-Preview](https://modelscope.cn/models/iic/VACE-Wan2.1-1.3B-Preview)|`vace_control_video`, `vace_reference_image`|[code](./model_inference/Wan2.1-VACE-1.3B-Preview.py)|[code](./model_training/full/Wan2.1-VACE-1.3B-Preview.sh)|[code](./model_training/validate_full/Wan2.1-VACE-1.3B-Preview.py)|[code](./model_training/lora/Wan2.1-VACE-1.3B-Preview.sh)|[code](./model_training/validate_lora/Wan2.1-VACE-1.3B-Preview.py)|
+|[Wan-AI/Wan2.1-VACE-1.3B](https://modelscope.cn/models/Wan-AI/Wan2.1-VACE-1.3B)|`vace_control_video`, `vace_reference_image`|[code](./model_inference/Wan2.1-VACE-1.3B.py)|[code](./model_training/full/Wan2.1-VACE-1.3B.sh)|[code](./model_training/validate_full/Wan2.1-VACE-1.3B.py)|[code](./model_training/lora/Wan2.1-VACE-1.3B.sh)|[code](./model_training/validate_lora/Wan2.1-VACE-1.3B.py)|
+|[Wan-AI/Wan2.1-VACE-14B](https://modelscope.cn/models/Wan-AI/Wan2.1-VACE-14B)|`vace_control_video`, `vace_reference_image`|[code](./model_inference/Wan2.1-VACE-14B.py)|[code](./model_training/full/Wan2.1-VACE-14B.sh)|[code](./model_training/validate_full/Wan2.1-VACE-14B.py)|[code](./model_training/lora/Wan2.1-VACE-14B.sh)|[code](./model_training/validate_lora/Wan2.1-VACE-14B.py)|
+|[DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1](https://modelscope.cn/models/DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1)|`motion_bucket_id`|[code](./model_inference/Wan2.1-1.3b-speedcontrol-v1.py)|[code](./model_training/full/Wan2.1-1.3b-speedcontrol-v1.sh)|[code](./model_training/validate_full/Wan2.1-1.3b-speedcontrol-v1.py)|[code](./model_training/lora/Wan2.1-1.3b-speedcontrol-v1.sh)|[code](./model_training/validate_lora/Wan2.1-1.3b-speedcontrol-v1.py)|

-Base model features

-||Text-to-video|Image-to-video|End frame|Control|
-|-|-|-|-|-|
-|1.3B text-to-video|✅||||
-|14B text-to-video|✅||||
-|14B image-to-video 480P||✅|||
-|14B image-to-video 720P||✅|||
-|1.3B InP||✅|✅||
-|14B InP||✅|✅||
-|1.3B Control||||✅|
-|14B Control||||✅|
+## Model Inference

-Adapter model compatibility
+The following sections will help you understand our functionalities and write inference code.

-||1.3B text-to-video|1.3B InP|
-|-|-|-|
-|1.3B aesthetics LoRA|✅||
-|1.3B Highres-fix LoRA|✅||
-|1.3B ExVideo LoRA|✅||
-|1.3B Speed Control adapter|✅|✅|
+<details>

-## VRAM Usage
+<summary>Loading the Model</summary>

-* Fine-grained offload: We recommend that users adjust the `num_persistent_param_in_dit` settings to find an optimal balance between speed and VRAM requirements. See [`./wan_14b_text_to_video.py`](./wan_14b_text_to_video.py).
+The model is loaded using `from_pretrained`:

-* FP8 Quantization: You only need to adjust the `torch_dtype` in the `ModelManager` (not the pipeline!).
-
-We present a detailed table here. The model (14B text-to-video) is tested on a single A100.
-
-|`torch_dtype`|`num_persistent_param_in_dit`|Speed|Required VRAM|Default Setting|
-|-|-|-|-|-|
-|torch.bfloat16|None (unlimited)|18.5s/it|40G||
-|torch.bfloat16|7*10**9 (7B)|20.8s/it|24G||
-|torch.bfloat16|0|23.4s/it|10G||
-|torch.float8_e4m3fn|None (unlimited)|18.3s/it|24G|yes|
-|torch.float8_e4m3fn|0|24.0s/it|10G||
-
-**We found that 14B image-to-video model is more sensitive to precision, so when the generated video content experiences issues such as artifacts, please switch to bfloat16 precision and use the `num_persistent_param_in_dit` parameter to control VRAM usage.**
-
-## Efficient Attention Implementation
-
-DiffSynth-Studio supports multiple Attention implementations. If you have installed any of the following Attention implementations, they will be enabled based on priority. However, we recommend to use the default torch SDPA.
-
-* [Flash Attention 3](https://github.com/Dao-AILab/flash-attention)
-* [Flash Attention 2](https://github.com/Dao-AILab/flash-attention)
-* [Sage Attention](https://github.com/thu-ml/SageAttention)
-* [torch SDPA](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) (default. `torch>=2.5.0` is recommended.)
-
-## Acceleration
-
-We support multiple acceleration solutions:
-* [TeaCache](https://github.com/ali-vilab/TeaCache): See [wan_1.3b_text_to_video_accelerate.py](./wan_1.3b_text_to_video_accelerate.py).
-
-* [Unified Sequence Parallel](https://github.com/xdit-project/xDiT): See [wan_14b_text_to_video_usp.py](./wan_14b_text_to_video_usp.py)
-
-```bash
-pip install xfuser>=0.4.3
-torchrun --standalone --nproc_per_node=8 examples/wanvideo/wan_14b_text_to_video_usp.py
+```python
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth"),
+    ],
+)
 ```

-* Tensor Parallel: See [wan_14b_text_to_video_tensor_parallel.py](./wan_14b_text_to_video_tensor_parallel.py).
+Here, `torch_dtype` and `device` specify the computation precision and device respectively. The `model_configs` can be used to configure model paths in various ways:
+
+* Downloading the model from [ModelScope](https://modelscope.cn/) and loading it. In this case, both `model_id` and `origin_file_pattern` need to be specified, for example:
+
+```python
+ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors")
+```
+
+* Loading the model from a local file path. In this case, the `path` parameter needs to be specified, for example:
+
+```python
+ModelConfig(path="models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors")
+```
+
+For models that are loaded from multiple files, simply use a list, for example:
+
+```python
+ModelConfig(path=[
+    "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors",
+    "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors",
+    "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors",
+    "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors",
+    "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors",
+    "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors",
+])
+```
+
+The `from_pretrained` function also provides additional parameters to control the behavior during model loading:
+
+* `tokenizer_config`: Path to the tokenizer of the Wan model. Default value is `ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="google/*")`.
+* `local_model_path`: Path where downloaded models are saved. Default value is `"./models"`.
+* `skip_download`: Whether to skip downloading models. Default value is `False`. When your network cannot access [ModelScope](https://modelscope.cn/), manually download the necessary files and set this to `True`.
+* `redirect_common_files`: Whether to redirect duplicate model files. Default value is `True`. Since the Wan series models include multiple base models, some modules like text encoder are shared across these models. To avoid redundant downloads, we redirect the model paths.
+* `use_usp`: Whether to enable Unified Sequence Parallel. Default value is `False`. Used for multi-GPU parallel inference.
+
+</details>
+
+<details>
+
+<summary>VRAM Management</summary>
+
+DiffSynth-Studio provides fine-grained VRAM management for the Wan model, allowing it to run on devices with limited VRAM. You can enable offloading functionality via the following code, which moves parts of the model to system memory on devices with limited VRAM:
+
+```python
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+    ],
+)
+pipe.enable_vram_management()
+```
+
+FP8 quantization is also supported:
+
+```python
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", offload_dtype=torch.float8_e4m3fn),
+    ],
+)
+pipe.enable_vram_management()
+```
+
+Both FP8 quantization and offloading can be enabled simultaneously:
+
+```python
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+    ],
+)
+pipe.enable_vram_management()
+```
+
+FP8 quantization significantly reduces VRAM usage but does not accelerate computations. Some models may experience issues such as blurry, torn, or distorted outputs due to insufficient precision when using FP8 quantization. Use FP8 quantization with caution.
+
+The `enable_vram_management` function provides the following parameters to control VRAM usage:
+
+* `vram_limit`: VRAM usage limit (in GB). By default, it uses all available VRAM on the device. Note that this is not an absolute limit; if the specified VRAM is insufficient but more VRAM is actually available, inference will proceed using the minimum required VRAM.
+* `vram_buffer`: Size of the VRAM buffer (in GB). Default is 0.5GB. Since certain large neural network layers may consume more VRAM unpredictably during their execution phase, a VRAM buffer is necessary. Ideally, this should match the maximum VRAM consumed by any single layer in the model.
+* `num_persistent_param_in_dit`: Number of persistent parameters in DiT models. By default, there is no limit. We plan to remove this parameter in the future, so please avoid relying on it.
+
+</details>
+
+<details>
+
+<summary>Inference Acceleration</summary>
+
+Wan supports multiple acceleration techniques, including:
+
+* **Efficient attention implementations**: If any of these attention implementations are installed in your Python environment, they will be automatically enabled in the following priority:
+    * [Flash Attention 3](https://github.com/Dao-AILab/flash-attention)  
+    * [Flash Attention 2](https://github.com/Dao-AILab/flash-attention)  
+    * [Sage Attention](https://github.com/thu-ml/SageAttention)  
+    * [torch SDPA](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)   (default setting; we recommend installing `torch>=2.5.0`)
+* **Unified Sequence Parallel**: Sequence parallelism based on [xDiT](https://github.com/xdit-project/xDiT). Please refer to [this example](./acceleration/unified_sequence_parallel.py), and run it using the command: 
+
+```shell
+pip install xfuser>=0.4.3
+torchrun --standalone --nproc_per_node=8 examples/wanvideo/acceleration/unified_sequence_parallel.py
+```
+
+* **TeaCache**: Acceleration technique [TeaCache](https://github.com/ali-vilab/TeaCache). Please refer to [this example](./acceleration/teacache.py).
+
+</details>
+
+
+<details>
+
+<summary>Input Parameters</summary>
+
+The pipeline accepts the following input parameters during inference:
+
+* `prompt`: Prompt describing the content to appear in the video.
+* `negative_prompt`: Negative prompt describing content that should not appear in the video. Default is `""`.
+* `input_image`: Input image, applicable for image-to-video models such as [`Wan-AI/Wan2.1-I2V-14B-480P`](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-480P) and [`PAI/Wan2.1-Fun-1.3B-InP`](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-InP), as well as first-and-last-frame models like [`Wan-AI/Wan2.1-FLF2V-14B-720P`](Wan-AI/Wan2.1-FLF2V-14B-720P).
+* `end_image`: End frame, applicable for first-and-last-frame models such as [`Wan-AI/Wan2.1-FLF2V-14B-720P`](Wan-AI/Wan2.1-FLF2V-14B-720P).
+* `input_video`: Input video used for video-to-video generation. Applicable to any Wan series model and must be used together with `denoising_strength`.
+* `denoising_strength`: Denoising strength in range [0, 1]. A smaller value results in a video closer to `input_video`.
+* `control_video`: Control video, applicable to Wan models with control capabilities such as [`PAI/Wan2.1-Fun-1.3B-Control`](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-Control).  
+* `reference_image`: Reference image, applicable to Wan models supporting reference images such as [`PAI/Wan2.1-Fun-V1.1-1.3B-Control`](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-Control).  
+* `camera_control_direction`: Camera control direction, optional values are "Left", "Right", "Up", "Down", "LeftUp", "LeftDown", "RightUp", "RightDown". Applicable to Camera-Control models, such as [PAI/Wan2.1-Fun-V1.1-14B-Control-Camera](https://www.modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-Control-Camera).  
+* `camera_control_speed`: Camera control speed. Applicable to Camera-Control models, such as [PAI/Wan2.1-Fun-V1.1-14B-Control-Camera](https://www.modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-Control-Camera).  
+* `camera_control_origin`: Origin coordinate of the camera control sequence. Please refer to the [original paper](https://arxiv.org/pdf/2404.02101) for proper configuration. Applicable to Camera-Control models, such as [PAI/Wan2.1-Fun-V1.1-14B-Control-Camera](https://www.modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-Control-Camera).
+* `vace_video`: Input video for VACE models, applicable to the VACE series such as [`iic/VACE-Wan2.1-1.3B-Preview`](https://modelscope.cn/models/iic/VACE-Wan2.1-1.3B-Preview).  
+* `vace_video_mask`: Mask video for VACE models, applicable to the VACE series such as [`iic/VACE-Wan2.1-1.3B-Preview`](https://modelscope.cn/models/iic/VACE-Wan2.1-1.3B-Preview).  
+* `vace_reference_image`: Reference image for VACE models, applicable to the VACE series such as [`iic/VACE-Wan2.1-1.3B-Preview`](https://modelscope.cn/models/iic/VACE-Wan2.1-1.3B-Preview).  
+* `vace_scale`: Influence of the VACE model on the base model, default is 1. Higher values increase control strength but may lead to visual artifacts or breakdowns.
+* `seed`: Random seed. Default is `None`, meaning fully random.
+* `rand_device`: Device used to generate random Gaussian noise matrix. Default is `"cpu"`. When set to `"cuda"`, different GPUs may produce different generation results.
+* `height`: Frame height, default is 480. Must be a multiple of 16; if not, it will be rounded up.
+* `width`: Frame width, default is 832. Must be a multiple of 16; if not, it will be rounded up.
+* `num_frames`: Number of frames, default is 81. Must be a multiple of 4 plus 1; if not, it will be rounded up, minimum is 1.
+* `cfg_scale`: Classifier-free guidance scale, default is 5. Higher values increase adherence to the prompt but may cause visual artifacts.
+* `cfg_merge`: Whether to merge both sides of classifier-free guidance for unified inference. Default is `False`. This parameter currently only works for basic text-to-video and image-to-video models.
+* `num_inference_steps`: Number of inference steps, default is 50.
+* `sigma_shift`: Parameter from Rectified Flow theory, default is 5. Higher values make the model stay longer at the initial denoising stage. Increasing this may improve video quality but may also cause inconsistency between generated videos and training data due to deviation from training behavior.
+* `motion_bucket_id`: Motion intensity, range [0, 100], applicable to motion control modules such as [`DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1`](https://modelscope.cn/models/DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1). Larger values indicate more intense motion.  
+* `tiled`: Whether to enable tiled VAE inference, default is `False`. Setting to `True` significantly reduces VRAM usage during VAE encoding/decoding but introduces small errors and slightly increases inference time.
+* `tile_size`: Tile size during VAE encoding/decoding, default is (30, 52), only effective when `tiled=True`.
+* `tile_stride`: Stride of tiles during VAE encoding/decoding, default is (15, 26), only effective when `tiled=True`. Must be less than or equal to `tile_size`.
+* `sliding_window_size`: Sliding window size for DiT part. Experimental feature, effects are unstable.
+* `sliding_window_stride`: Sliding window stride for DiT part. Experimental feature, effects are unstable.
+* `tea_cache_l1_thresh`: Threshold for TeaCache. Larger values result in faster speed but lower quality. Note that after enabling TeaCache, the inference speed is not uniform, so the remaining time shown on the progress bar becomes inaccurate.
+* `tea_cache_model_id`: TeaCache parameter template, options include `"Wan2.1-T2V-1.3B"`, `"Wan2.1-T2V-14B"`, `"Wan2.1-I2V-14B-480P"`, `"Wan2.1-I2V-14B-720P"`.
+* `progress_bar_cmd`: Progress bar implementation, default is `tqdm.tqdm`. You can set it to `lambda x:x` to disable the progress bar.
+
+</details>
+
+## Model Training
+
+Wan series models are trained using a unified script located at [`./model_training/train.py`](./model_training/train.py).
+
+<details>
+
+<summary>Script Parameters</summary>
+
+The script includes the following parameters:
+
+* Dataset
+  * `--dataset_base_path`: Base path of the dataset.
+  * `--dataset_metadata_path`: Path to the metadata file of the dataset.
+  * `--height`: Height of images or videos. Leave `height` and `width` empty to enable dynamic resolution.
+  * `--width`: Width of images or videos. Leave `height` and `width` empty to enable dynamic resolution.
+  * `--num_frames`: Number of frames per video. Frames are sampled from the video prefix.
+  * `--data_file_keys`: Data file keys in the metadata. Comma-separated.
+  * `--dataset_repeat`: Number of times to repeat the dataset per epoch.
+* Models
+  * `--model_paths`: Paths to load models. In JSON format.
+  * `--model_id_with_origin_paths`: Model ID with origin paths, e.g., Wan-AI/Wan2.1-T2V-1.3B:diffusion_pytorch_model*.safetensors. Comma-separated.
+* Training
+  * `--learning_rate`: Learning rate.
+  * `--num_epochs`: Number of epochs.
+  * `--output_path`: Output save path.
+  * `--remove_prefix_in_ckpt`: Remove prefix in ckpt.
+* Trainable Modules
+  * `--trainable_models`: Models to train, e.g., dit, vae, text_encoder.
+  * `--lora_base_model`: Which model LoRA is added to.
+  * `--lora_target_modules`: Which layers LoRA is added to.
+  * `--lora_rank`: Rank of LoRA.
+* Extra Inputs
+  * `--extra_inputs`: Additional model inputs, comma-separated.
+* VRAM Management
+  * `--use_gradient_checkpointing_offload`: Whether to offload gradient checkpointing to CPU memory.
+
+Additionally, the training framework is built upon [`accelerate`](https://huggingface.co/docs/accelerate/index). Before starting training, run `accelerate config` to configure GPU-related parameters. For certain training scripts (e.g., full fine-tuning of 14B models), we provide recommended `accelerate` configuration files, which can be found in the corresponding training scripts.
+
+</details>
+
+
+<details>
+
+<summary>Step 1: Prepare the Dataset</summary>
+
+The dataset consists of a series of files. We recommend organizing your dataset as follows:
+
+```
+data/example_video_dataset/
+├── metadata.csv
+├── video1.mp4
+└── video2.mp4
+```
+
+Here, `video1.mp4` and `video2.mp4` are training video files, and `metadata.csv` is the metadata list, for example:
+
+```
+video,prompt
+video1.mp4,"from sunset to night, a small town, light, house, river"
+video2.mp4,"a dog is running"
+```
+
+We have prepared a sample video dataset to help you test. You can download it using the following command:
+
+```shell
+modelscope download --dataset DiffSynth-Studio/example_video_dataset --local_dir ./data/example_video_dataset
+```
+
+The dataset supports mixed training of videos and images. Supported video formats include `"mp4", "avi", "mov", "wmv", "mkv", "flv", "webm"`, and supported image formats include `"jpg", "jpeg", "png", "webp"`.
+
+The resolution of videos can be controlled via script parameters `--height`, `--width`, and `--num_frames`. For each video, the first `num_frames` frames will be used for training; therefore, an error will occur if the video length is less than `num_frames`. Image files will be treated as single-frame videos. When both `--height` and `--width` are left empty, dynamic resolution will be enabled, meaning training will use the actual resolution of each video or image in the dataset.
+
+**We strongly recommend using fixed-resolution training and avoiding mixing images and videos in the same dataset due to load balancing issues in multi-GPU training.**
+
+When the model requires additional inputs, such as the `control_video` needed by control-capable models like [`PAI/Wan2.1-Fun-1.3B-Control`](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-Control), please add corresponding columns in the metadata file, for example:
+
+```
+video,prompt,control_video
+video1.mp4,"from sunset to night, a small town, light, house, river",video1_softedge.mp4
+```
+
+If additional inputs contain video or image files, their column names need to be specified in the `--data_file_keys` parameter. The default value of this parameter is `"image,video"`, meaning it parses columns named `image` and `video`. You can extend this list based on the additional input requirements, for example: `--data_file_keys "image,video,control_video"`, and also enable `--input_contains_control_video`.
+
+</details>
+
+
+<details>
+
+<summary>Step 2: Load the Model</summary>
+
+Similar to the model loading logic during inference, you can configure the model to be loaded directly via its model ID. For instance, during inference we load the model using:
+
+```python
+model_configs=[
+    ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors"),
+    ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"),
+    ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="Wan2.1_VAE.pth"),
+]
+```
+
+During training, simply use the following parameter to load the corresponding model:
+
+```shell
+--model_id_with_origin_paths "Wan-AI/Wan2.1-T2V-1.3B:diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.1-T2V-1.3B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.1-T2V-1.3B:Wan2.1_VAE.pth"
+```
+
+If you want to load the model from local files, for example during inference:
+
+```python
+model_configs=[
+    ModelConfig(path=[
+        "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors",
+        "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors",
+        "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors",
+        "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors",
+        "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors",
+        "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors",
+    ]),
+    ModelConfig(path="models/Wan-AI/Wan2.1-T2V-14B/models_t5_umt5-xxl-enc-bf16.pth"),
+    ModelConfig(path="models/Wan-AI/Wan2.1-T2V-14B/Wan2.1_VAE.pth"),
+]
+```
+
+Then during training, set the parameter as:
+
+```shell
+--model_paths '[
+    [
+        "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors",
+        "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors",
+        "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors",
+        "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors",
+        "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors",
+        "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors"
+    ],
+    "models/Wan-AI/Wan2.1-T2V-14B/models_t5_umt5-xxl-enc-bf16.pth",
+    "models/Wan-AI/Wan2.1-T2V-14B/Wan2.1_VAE.pth"
+]' \
+```
+
+</details>
+
+
+<details>
+
+<summary>Step 3: Configure Trainable Modules</summary>
+
+The training framework supports full fine-tuning of base models or LoRA-based training. Here are some examples:
+
+* Full fine-tuning of the DiT module: `--trainable_models dit`
+* Training a LoRA model for the DiT module: `--lora_base_model dit --lora_target_modules "q,k,v,o,ffn.0,ffn.2" --lora_rank 32`
+* Training both a LoRA model for DiT and the Motion Controller (yes, you can train such advanced structures): `--trainable_models motion_controller --lora_base_model dit --lora_target_modules "q,k,v,o,ffn.0,ffn.2" --lora_rank 32`
+
+Additionally, since multiple modules (text encoder, dit, vae) are loaded in the training script, you need to remove prefixes when saving model files. For example, when fully fine-tuning the DiT module or training a LoRA version of DiT, please set `--remove_prefix_in_ckpt pipe.dit.`
+
+</details>
+
+
+<details>
+
+<summary>Step 4: Launch the Training Process</summary>
+
+We have prepared training commands for each model. Please refer to the table at the beginning of this document.
+
+Note that full fine-tuning of the 14B model requires 8 GPUs, each with at least 80GB VRAM. During full fine-tuning of these 14B models, you must install `deepspeed` (`pip install deepspeed`). We have provided recommended [configuration files](./model_training/full/accelerate_config_14B.yaml), which will be loaded automatically in the corresponding training scripts. These scripts have been tested on 8*A100.
+
+The default video resolution in the training script is `480*832*81`. Increasing the resolution may cause out-of-memory errors. To reduce VRAM usage, add the parameter `--use_gradient_checkpointing_offload`.
+
+</details>

 ## Gallery

-1.3B text-to-video.
+1.3B text-to-video:

 https://github.com/user-attachments/assets/124397be-cd6a-4f29-a87c-e4c695aaabb8

-Put sunglasses on the dog.
+Put sunglasses on the dog (1.3B video-to-video):

 https://github.com/user-attachments/assets/272808d7-fbeb-4747-a6df-14a0860c75fb

-14B text-to-video.
+14B text-to-video:

 https://github.com/user-attachments/assets/3908bc64-d451-485a-8b61-28f6d32dd92f

-14B image-to-video.
+14B image-to-video:

 https://github.com/user-attachments/assets/c0bdd5ca-292f-45ed-b9bc-afe193156e75

-## Train
-
-We support Wan-Video LoRA training and full training. Here is a tutorial. This is an experimental feature. Below is a video sample generated from the character Keqing LoRA:
-
-https://github.com/user-attachments/assets/9bd8e30b-97e8-44f9-bb6f-da004ba376a9
-
-Step 1: Install additional packages
-
-```
-pip install peft lightning pandas
-```
-
-Step 2: Prepare your dataset
-
-You need to manage the training videos as follows:
-
-```
-data/example_dataset/
-├── metadata.csv
-└── train
-    ├── video_00001.mp4
-    └── image_00002.jpg
-```
-
-`metadata.csv`:
-
-```
-file_name,text
-video_00001.mp4,"video description"
-image_00002.jpg,"video description"
-```
-
-We support both images and videos. An image is treated as a single frame of video.
-
-Step 3: Data process
-
-```shell
-CUDA_VISIBLE_DEVICES="0" python examples/wanvideo/train_wan_t2v.py \
-  --task data_process \
-  --dataset_path data/example_dataset \
-  --output_path ./models \
-  --text_encoder_path "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth" \
-  --vae_path "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth" \
-  --tiled \
-  --num_frames 81 \
-  --height 480 \
-  --width 832
-```
-
-After that, some cached files will be stored in the dataset folder.
-
-```
-data/example_dataset/
-├── metadata.csv
-└── train
-    ├── video_00001.mp4
-    ├── video_00001.mp4.tensors.pth
-    ├── video_00002.mp4
-    └── video_00002.mp4.tensors.pth
-```
-
-Step 4: Train
-
 LoRA training:

-```shell
-CUDA_VISIBLE_DEVICES="0" python examples/wanvideo/train_wan_t2v.py \
-  --task train \
-  --train_architecture lora \
-  --dataset_path data/example_dataset \
-  --output_path ./models \
-  --dit_path "models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors" \
-  --steps_per_epoch 500 \
-  --max_epochs 10 \
-  --learning_rate 1e-4 \
-  --lora_rank 16 \
-  --lora_alpha 16 \
-  --lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
-  --accumulate_grad_batches 1 \
-  --use_gradient_checkpointing
-```
-
-Full training:
-
-```shell
-CUDA_VISIBLE_DEVICES="0" python examples/wanvideo/train_wan_t2v.py \
-  --task train \
-  --train_architecture full \
-  --dataset_path data/example_dataset \
-  --output_path ./models \
-  --dit_path "models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors" \
-  --steps_per_epoch 500 \
-  --max_epochs 10 \
-  --learning_rate 1e-4 \
-  --accumulate_grad_batches 1 \
-  --use_gradient_checkpointing
-```
-
-If you wish to train the 14B model, please separate the safetensor files with a comma. For example: `models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors`.
-
-If you wish to train the image-to-video model, please add an extra parameter `--image_encoder_path "models/Wan-AI/Wan2.1-I2V-14B-480P/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"`.
-
-For LoRA training, the Wan-1.3B-T2V model requires 16G of VRAM for processing 81 frames at 480P, while the Wan-14B-T2V model requires 60G of VRAM for the same configuration. To further reduce VRAM requirements by 20%-30%, you can include the parameter `--use_gradient_checkpointing_offload`.
-
-Step 5: Test
-
-Test LoRA:
-
-```python
-import torch
-from diffsynth import ModelManager, WanVideoPipeline, save_video, VideoData
-
-
-model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
-model_manager.load_models([
-    "models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors",
-    "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
-    "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
-])
-model_manager.load_lora("models/lightning_logs/version_1/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
-pipe = WanVideoPipeline.from_model_manager(model_manager, device="cuda")
-pipe.enable_vram_management(num_persistent_param_in_dit=None)
-
-video = pipe(
-    prompt="...",
-    negative_prompt="...",
-    num_inference_steps=50,
-    seed=0, tiled=True
-)
-save_video(video, "video.mp4", fps=30, quality=5)
-```
-
-Test fine-tuned base model:
-
-```python
-import torch
-from diffsynth import ModelManager, WanVideoPipeline, save_video, VideoData
-
-
-model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
-model_manager.load_models([
-    "models/lightning_logs/version_1/checkpoints/epoch=0-step=500.ckpt",
-    "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
-    "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
-])
-pipe = WanVideoPipeline.from_model_manager(model_manager, device="cuda")
-pipe.enable_vram_management(num_persistent_param_in_dit=None)
-
-video = pipe(
-    prompt="...",
-    negative_prompt="...",
-    num_inference_steps=50,
-    seed=0, tiled=True
-)
-save_video(video, "video.mp4", fps=30, quality=5)
-```
+https://github.com/user-attachments/assets/9bd8e30b-97e8-44f9-bb6f-da004ba376a9
--- a/examples/wanvideo/README_zh.md
+++ b/examples/wanvideo/README_zh.md
@@ -0,0 +1,420 @@
+# 通义万相 2.1（Wan 2.1）
+
+[Switch to English](./README.md)
+
+Wan 2.1 是由阿里巴巴通义实验室开源的一系列视频生成模型。
+
+**DiffSynth-Studio 启用了新的推理和训练框架，如需使用旧版本，请点击[这里](https://github.com/modelscope/DiffSynth-Studio/tree/3edf3583b1f08944cee837b94d9f84d669c2729c)。**
+
+## 安装
+
+在使用本系列模型之前，请通过源码安装 DiffSynth-Studio。
+
+```shell
+git clone https://github.com/modelscope/DiffSynth-Studio.git
+cd DiffSynth-Studio
+pip install -e .
+```
+
+## 模型总览
+
+|模型 ID|额外参数|推理|全量训练|全量训练后验证|LoRA 训练|LoRA 训练后验证|
+|-|-|-|-|-|-|-|
+|[Wan-AI/Wan2.1-T2V-1.3B](https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-1.3B)||[code](./model_inference/Wan2.1-T2V-1.3B.py)|[code](./model_training/full/Wan2.1-T2V-1.3B.sh)|[code](./model_training/validate_full/Wan2.1-T2V-1.3B.py)|[code](./model_training/lora/Wan2.1-T2V-1.3B.sh)|[code](./model_training/validate_lora/Wan2.1-T2V-1.3B.py)|
+|[Wan-AI/Wan2.1-T2V-14B](https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-14B)||[code](./model_inference/Wan2.1-T2V-14B.py)|[code](./model_training/full/Wan2.1-T2V-14B.sh)|[code](./model_training/validate_full/Wan2.1-T2V-14B.py)|[code](./model_training/lora/Wan2.1-T2V-14B.sh)|[code](./model_training/validate_lora/Wan2.1-T2V-14B.py)|
+|[Wan-AI/Wan2.1-I2V-14B-480P](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-480P)|`input_image`|[code](./model_inference/Wan2.1-I2V-14B-480P.py)|[code](./model_training/full/Wan2.1-I2V-14B-480P.sh)|[code](./model_training/validate_full/Wan2.1-I2V-14B-480P.py)|[code](./model_training/lora/Wan2.1-I2V-14B-480P.sh)|[code](./model_training/validate_lora/Wan2.1-I2V-14B-480P.py)|
+|[Wan-AI/Wan2.1-I2V-14B-720P](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-720P)|`input_image`|[code](./model_inference/Wan2.1-I2V-14B-720P.py)|[code](./model_training/full/Wan2.1-I2V-14B-720P.sh)|[code](./model_training/validate_full/Wan2.1-I2V-14B-720P.py)|[code](./model_training/lora/Wan2.1-I2V-14B-720P.sh)|[code](./model_training/validate_lora/Wan2.1-I2V-14B-720P.py)|
+|[Wan-AI/Wan2.1-FLF2V-14B-720P](https://modelscope.cn/models/Wan-AI/Wan2.1-FLF2V-14B-720P)|`input_image`, `end_image`|[code](./model_inference/Wan2.1-FLF2V-14B-720P.py)|[code](./model_training/full/Wan2.1-FLF2V-14B-720P.sh)|[code](./model_training/validate_full/Wan2.1-FLF2V-14B-720P.py)|[code](./model_training/lora/Wan2.1-FLF2V-14B-720P.sh)|[code](./model_training/validate_lora/Wan2.1-FLF2V-14B-720P.py)|
+|[PAI/Wan2.1-Fun-1.3B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-InP)|`input_image`, `end_image`|[code](./model_inference/Wan2.1-Fun-1.3B-InP.py)|[code](./model_training/full/Wan2.1-Fun-1.3B-InP.sh)|[code](./model_training/validate_full/Wan2.1-Fun-1.3B-InP.py)|[code](./model_training/lora/Wan2.1-Fun-1.3B-InP.sh)|[code](./model_training/validate_lora/Wan2.1-Fun-1.3B-InP.py)|
+|[PAI/Wan2.1-Fun-1.3B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-Control)|`control_video`|[code](./model_inference/Wan2.1-Fun-1.3B-Control.py)|[code](./model_training/full/Wan2.1-Fun-1.3B-Control.sh)|[code](./model_training/validate_full/Wan2.1-Fun-1.3B-Control.py)|[code](./model_training/lora/Wan2.1-Fun-1.3B-Control.sh)|[code](./model_training/validate_lora/Wan2.1-Fun-1.3B-Control.py)|
+|[PAI/Wan2.1-Fun-14B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-14B-InP)|`input_image`, `end_image`|[code](./model_inference/Wan2.1-Fun-14B-InP.py)|[code](./model_training/full/Wan2.1-Fun-14B-InP.sh)|[code](./model_training/validate_full/Wan2.1-Fun-14B-InP.py)|[code](./model_training/lora/Wan2.1-Fun-14B-InP.sh)|[code](./model_training/validate_lora/Wan2.1-Fun-14B-InP.py)|
+|[PAI/Wan2.1-Fun-14B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-14B-Control)|`control_video`|[code](./model_inference/Wan2.1-Fun-14B-Control.py)|[code](./model_training/full/Wan2.1-Fun-14B-Control.sh)|[code](./model_training/validate_full/Wan2.1-Fun-14B-Control.py)|[code](./model_training/lora/Wan2.1-Fun-14B-Control.sh)|[code](./model_training/validate_lora/Wan2.1-Fun-14B-Control.py)|
+|[PAI/Wan2.1-Fun-V1.1-1.3B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-Control)|`control_video`, `reference_image`|[code](./model_inference/Wan2.1-Fun-V1.1-1.3B-Control.py)|[code](./model_training/full/Wan2.1-Fun-V1.1-1.3B-Control.sh)|[code](./model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control.py)|[code](./model_training/lora/Wan2.1-Fun-V1.1-1.3B-Control.sh)|[code](./model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control.py)|
+|[PAI/Wan2.1-Fun-V1.1-14B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-Control)|`control_video`, `reference_image`|[code](./model_inference/Wan2.1-Fun-V1.1-14B-Control.py)|[code](./model_training/full/Wan2.1-Fun-V1.1-14B-Control.sh)|[code](./model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control.py)|[code](./model_training/lora/Wan2.1-Fun-V1.1-14B-Control.sh)|[code](./model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control.py)|
+|[PAI/Wan2.1-Fun-V1.1-1.3B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-InP)|`input_image`, `end_image`|[code](./model_inference/Wan2.1-Fun-V1.1-1.3B-InP.py)|[code](./model_training/full/Wan2.1-Fun-V1.1-1.3B-InP.sh)|[code](./model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-InP.py)|[code](./model_training/lora/Wan2.1-Fun-V1.1-1.3B-InP.sh)|[code](./model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-InP.py)|
+|[PAI/Wan2.1-Fun-V1.1-14B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-InP)|`input_image`, `end_image`|[code](./model_inference/Wan2.1-Fun-V1.1-14B-InP.py)|[code](./model_training/full/Wan2.1-Fun-V1.1-14B-InP.sh)|[code](./model_training/validate_full/Wan2.1-Fun-V1.1-14B-InP.py)|[code](./model_training/lora/Wan2.1-Fun-V1.1-14B-InP.sh)|[code](./model_training/validate_lora/Wan2.1-Fun-V1.1-14B-InP.py)|
+|[PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera)|`control_camera_video`, `input_image`|[code](./model_inference/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py)|[code](./model_training/full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh)|[code](./model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py)|[code](./model_training/lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh)|[code](./model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py)|
+|[PAI/Wan2.1-Fun-V1.1-14B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-Control-Camera)|`control_camera_video`, `input_image`|[code](./model_inference/Wan2.1-Fun-V1.1-14B-Control-Camera.py)|[code](./model_training/full/Wan2.1-Fun-V1.1-14B-Control-Camera.sh)|[code](./model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control-Camera.py)|[code](./model_training/lora/Wan2.1-Fun-V1.1-14B-Control-Camera.sh)|[code](./model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control-Camera.py)|
+|[iic/VACE-Wan2.1-1.3B-Preview](https://modelscope.cn/models/iic/VACE-Wan2.1-1.3B-Preview)|`vace_control_video`, `vace_reference_image`|[code](./model_inference/Wan2.1-VACE-1.3B-Preview.py)|[code](./model_training/full/Wan2.1-VACE-1.3B-Preview.sh)|[code](./model_training/validate_full/Wan2.1-VACE-1.3B-Preview.py)|[code](./model_training/lora/Wan2.1-VACE-1.3B-Preview.sh)|[code](./model_training/validate_lora/Wan2.1-VACE-1.3B-Preview.py)|
+|[Wan-AI/Wan2.1-VACE-1.3B](https://modelscope.cn/models/Wan-AI/Wan2.1-VACE-1.3B)|`vace_control_video`, `vace_reference_image`|[code](./model_inference/Wan2.1-VACE-1.3B.py)|[code](./model_training/full/Wan2.1-VACE-1.3B.sh)|[code](./model_training/validate_full/Wan2.1-VACE-1.3B.py)|[code](./model_training/lora/Wan2.1-VACE-1.3B.sh)|[code](./model_training/validate_lora/Wan2.1-VACE-1.3B.py)|
+|[Wan-AI/Wan2.1-VACE-14B](https://modelscope.cn/models/Wan-AI/Wan2.1-VACE-14B)|`vace_control_video`, `vace_reference_image`|[code](./model_inference/Wan2.1-VACE-14B.py)|[code](./model_training/full/Wan2.1-VACE-14B.sh)|[code](./model_training/validate_full/Wan2.1-VACE-14B.py)|[code](./model_training/lora/Wan2.1-VACE-14B.sh)|[code](./model_training/validate_lora/Wan2.1-VACE-14B.py)|
+|[DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1](https://modelscope.cn/models/DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1)|`motion_bucket_id`|[code](./model_inference/Wan2.1-1.3b-speedcontrol-v1.py)|[code](./model_training/full/Wan2.1-1.3b-speedcontrol-v1.sh)|[code](./model_training/validate_full/Wan2.1-1.3b-speedcontrol-v1.py)|[code](./model_training/lora/Wan2.1-1.3b-speedcontrol-v1.sh)|[code](./model_training/validate_lora/Wan2.1-1.3b-speedcontrol-v1.py)|
+
+## 模型推理
+
+以下部分将会帮助您理解我们的功能并编写推理代码。
+
+
+<details>
+
+<summary>加载模型</summary>
+
+模型通过 `from_pretrained` 加载：
+
+```python
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth"),
+    ],
+)
+```
+
+其中 `torch_dtype` 和 `device` 是计算精度和计算设备。`model_configs` 可通过多种方式配置模型路径：
+
+* 从[魔搭社区](https://modelscope.cn/)下载模型并加载。此时需要填写 `model_id` 和 `origin_file_pattern`，例如
+
+```python
+ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors")
+```
+
+* 从本地文件路径加载模型。此时需要填写 `path`，例如
+
+```python
+ModelConfig(path="models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors")
+```
+
+对于从多个文件加载的单一模型，使用列表即可，例如
+
+```python
+ModelConfig(path=[
+    "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors",
+    "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors",
+    "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors",
+    "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors",
+    "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors",
+    "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors",
+])
+```
+
+`from_pretrained` 还提供了额外的参数用于控制模型加载时的行为：
+
+* `tokenizer_config`: Wan 模型的 tokenizer 路径，默认值为 `ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="google/*")`。
+* `local_model_path`: 用于保存下载模型的路径，默认值为 `"./models"`。
+* `skip_download`: 是否跳过下载，默认值为 `False`。当您的网络无法访问[魔搭社区](https://modelscope.cn/)时，请手动下载必要的文件，并将其设置为 `True`。
+* `redirect_common_files`: 是否重定向重复模型文件，默认值为 `True`。由于 Wan 系列模型包括多个基础模型，每个基础模型的 text encoder 等模块都是相同的，为避免重复下载，我们会对模型路径进行重定向。
+* `use_usp`: 是否启用 Unified Sequence Parallel，默认值为 `False`。用于多 GPU 并行推理。
+
+</details>
+
+
+<details>
+
+<summary>显存管理</summary>
+
+DiffSynth-Studio 为 Wan 模型提供了细粒度的显存管理，让模型能够在低显存设备上进行推理，可通过以下代码开启 offload 功能，在显存有限的设备上将部分模块 offload 到内存中。
+
+```python
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+    ],
+)
+pipe.enable_vram_management()
+```
+
+FP8 量化功能也是支持的：
+
+```python
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", offload_dtype=torch.float8_e4m3fn),
+    ],
+)
+pipe.enable_vram_management()
+```
+
+FP8 量化和 offload 可同时开启：
+
+```python
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+    ],
+)
+pipe.enable_vram_management()
+```
+
+FP8 量化能够大幅度减少显存占用，但不会加速，部分模型在 FP8 量化下会出现精度不足导致的画面模糊、撕裂、失真问题，请谨慎使用 FP8 量化。
+
+`enable_vram_management` 函数提供了以下参数，用于控制显存使用情况：
+
+* `vram_limit`: 显存占用量（GB），默认占用设备上的剩余显存。注意这不是一个绝对限制，当设置的显存不足以支持模型进行推理，但实际可用显存足够时，将会以最小化显存占用的形式进行推理。
+* `vram_buffer`: 显存缓冲区大小（GB），默认为 0.5GB。由于部分较大的神经网络层在 onload 阶段会不可控地占用更多显存，因此一个显存缓冲区是必要的，理论上的最优值为模型中最大的层所占的显存。
+* `num_persistent_param_in_dit`: DiT 模型中常驻显存的参数数量（个），默认为无限制。我们将会在未来删除这个参数，请不要依赖这个参数。
+
+</details>
+
+
+<details>
+
+<summary>推理加速</summary>
+
+Wan 支持多种加速方案，包括
+
+* 高效注意力机制实现：当您的 Python 环境中安装过这些注意力机制实现方案时，我们将会按照以下优先级自动启用。
+    * [Flash Attention 3](https://github.com/Dao-AILab/flash-attention)
+    * [Flash Attention 2](https://github.com/Dao-AILab/flash-attention)
+    * [Sage Attention](https://github.com/thu-ml/SageAttention)
+    * [torch SDPA](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) (默认设置，建议安装 `torch>=2.5.0`)
+* 统一序列并行：基于 [xDiT](https://github.com/xdit-project/xDiT) 实现的序列并行，请参考[示例代码](./acceleration/unified_sequence_parallel.py)，使用以下命令运行：
+
+```shell
+pip install xfuser>=0.4.3
+torchrun --standalone --nproc_per_node=8 examples/wanvideo/acceleration/unified_sequence_parallel.py
+```
+
+* TeaCache：加速技术 [TeaCache](https://github.com/ali-vilab/TeaCache)，请参考[示例代码](./acceleration/teacache.py)。
+
+</details>
+
+
+<details>
+
+<summary>输入参数</summary>
+
+Pipeline 在推理阶段能够接收以下输入参数：
+
+* `prompt`: 提示词，描述画面中出现的内容。
+* `negative_prompt`: 负向提示词，描述画面中不应该出现的内容，默认值为 `""`。
+* `input_image`: 输入图片，适用于图生视频模型，例如 [`Wan-AI/Wan2.1-I2V-14B-480P`](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-480P)、[`PAI/Wan2.1-Fun-1.3B-InP`](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-InP)，以及首尾帧模型，例如 [`Wan-AI/Wan2.1-FLF2V-14B-720P`](Wan-AI/Wan2.1-FLF2V-14B-720P)。
+* `end_image`: 结尾帧，适用于首尾帧模型，例如 [`Wan-AI/Wan2.1-FLF2V-14B-720P`](Wan-AI/Wan2.1-FLF2V-14B-720P)。
+* `input_video`: 输入视频，用于视频生视频，适用于任意 Wan 系列模型，需与参数 `denoising_strength` 配合使用。
+* `denoising_strength`: 去噪强度，范围为 [0, 1]。数值越小，生成的视频越接近 `input_video`。
+* `control_video`: 控制视频，适用于带控制能力的 Wan 系列模型，例如 [`PAI/Wan2.1-Fun-1.3B-Control`](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-Control)。
+* `reference_image`: 参考图片，适用于带参考图能力的 Wan 系列模型，例如 [`PAI/Wan2.1-Fun-V1.1-1.3B-Control`](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-Control)。
+* `camera_control_direction`: 镜头控制方向，可选 "Left", "Right", "Up", "Down", "LeftUp", "LeftDown", "RightUp", "RightDown" 之一，适用于 Camera-Control 模型，例如 [PAI/Wan2.1-Fun-V1.1-14B-Control-Camera](https://www.modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-Control-Camera)。
+* `camera_control_speed`: 镜头控制速度，适用于 Camera-Control 模型，例如 [PAI/Wan2.1-Fun-V1.1-14B-Control-Camera](https://www.modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-Control-Camera)。
+* `camera_control_origin`: 镜头控制序列的原点坐标，请参考[原论文](https://arxiv.org/pdf/2404.02101)进行设置，适用于 Camera-Control 模型，例如 [PAI/Wan2.1-Fun-V1.1-14B-Control-Camera](https://www.modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-Control-Camera)。
+* `vace_video`: VACE 模型的输入视频，适用于 VACE 系列模型，例如 [`iic/VACE-Wan2.1-1.3B-Preview`](https://modelscope.cn/models/iic/VACE-Wan2.1-1.3B-Preview)。
+* `vace_video_mask`: VACE 模型的 mask 视频，适用于 VACE 系列模型，例如 [`iic/VACE-Wan2.1-1.3B-Preview`](https://modelscope.cn/models/iic/VACE-Wan2.1-1.3B-Preview)。
+* `vace_reference_image`: VACE 模型的参考图片，适用于 VACE 系列模型，例如 [`iic/VACE-Wan2.1-1.3B-Preview`](https://modelscope.cn/models/iic/VACE-Wan2.1-1.3B-Preview)。
+* `vace_scale`: VACE 模型对基础模型的影响程度，默认为1。数值越大，控制强度越高，但画面崩坏概率越大。
+* `seed`: 随机种子。默认为 `None`，即完全随机。
+* `rand_device`: 生成随机高斯噪声矩阵的计算设备，默认为 `"cpu"`。当设置为 `cuda` 时，在不同 GPU 上会导致不同的生成结果。
+* `height`: 帧高度，默认为 480。需设置为 16 的倍数，不满足时向上取整。
+* `width`: 帧宽度，默认为 832。需设置为 16 的倍数，不满足时向上取整。
+* `num_frames`: 帧数，默认为 81。需设置为 4 的倍数 + 1，不满足时向上取整，最小值为 1。
+* `cfg_scale`: Classifier-free guidance 机制的数值，默认为 5。数值越大，提示词的控制效果越强，但画面崩坏的概率越大。
+* `cfg_merge`: 是否合并 Classifier-free guidance 的两侧进行统一推理，默认为 `False`。该参数目前仅在基础的文生视频和图生视频模型上生效。
+* `num_inference_steps`: 推理次数，默认值为 50。
+* `sigma_shift`: Rectified Flow 理论中的参数，默认为 5。数值越大，模型在去噪的开始阶段停留的步骤数越多，可适当调大这个参数来提高画面质量，但会因生成过程与训练过程不一致导致生成的视频内容与训练数据存在差异。
+* `motion_bucket_id`: 运动幅度，范围为 [0, 100]。适用于速度控制模块，例如 [`DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1`](https://modelscope.cn/models/DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1)，数值越大，运动幅度越大。
+* `tiled`: 是否启用 VAE 分块推理，默认为 `False`。设置为 `True` 时可显著减少 VAE 编解码阶段的显存占用，会产生少许误差，以及少量推理时间延长。
+* `tile_size`: VAE 编解码阶段的分块大小，默认为 (30, 52)，仅在 `tiled=True` 时生效。
+* `tile_stride`: VAE 编解码阶段的分块步长，默认为 (15, 26)，仅在 `tiled=True` 时生效，需保证其数值小于或等于 `tile_size`。
+* `sliding_window_size`: DiT 部分的滑动窗口大小。实验性功能，效果不稳定。
+* `sliding_window_stride`: DiT 部分的滑动窗口步长。实验性功能，效果不稳定。
+* `tea_cache_l1_thresh`: TeaCache 的阈值，数值越大，速度越快，画面质量越差。请注意，开启 TeaCache 后推理速度并非均匀，因此进度条上显示的剩余时间将会变得不准确。
+* `tea_cache_model_id`: TeaCache 的参数模板，可选 `"Wan2.1-T2V-1.3B"`、`Wan2.1-T2V-14B`、`Wan2.1-I2V-14B-480P`、`Wan2.1-I2V-14B-720P` 之一。
+* `progress_bar_cmd`: 进度条，默认为 `tqdm.tqdm`。可通过设置为 `lambda x:x` 来屏蔽进度条。
+
+</details>
+
+
+## 模型训练
+
+Wan 系列模型训练通过统一的 [`./model_training/train.py`](./model_training/train.py) 脚本进行。
+
+<details>
+
+<summary>脚本参数</summary>
+
+脚本包含以下参数：
+
+* 数据集
+  * `--dataset_base_path`: 数据集的根路径。
+  * `--dataset_metadata_path`: 数据集的元数据文件路径。
+  * `--height`: 图像或视频的高度。将 `height` 和 `width` 留空以启用动态分辨率。
+  * `--width`: 图像或视频的宽度。将 `height` 和 `width` 留空以启用动态分辨率。
+  * `--num_frames`: 每个视频中的帧数。帧从视频前缀中采样。
+  * `--data_file_keys`: 元数据中的数据文件键。用逗号分隔。
+  * `--dataset_repeat`: 每个 epoch 中数据集重复的次数。
+* 模型
+  * `--model_paths`: 要加载的模型路径。JSON 格式。
+  * `--model_id_with_origin_paths`: 带原始路径的模型 ID，例如 Wan-AI/Wan2.1-T2V-1.3B:diffusion_pytorch_model*.safetensors。用逗号分隔。
+* 训练
+  * `--learning_rate`: 学习率。
+  * `--num_epochs`: 轮数（Epoch）数量。
+  * `--output_path`: 保存路径。
+  * `--remove_prefix_in_ckpt`: 在 ckpt 中移除前缀。
+* 可训练模块
+  * `--trainable_models`: 可训练的模型，例如 dit、vae、text_encoder。
+  * `--lora_base_model`: LoRA 添加到哪个模型上。
+  * `--lora_target_modules`: LoRA 添加到哪一层上。
+  * `--lora_rank`: LoRA 的秩（Rank）。
+* 额外模型输入
+  * `--extra_inputs`: 额外的模型输入，以逗号分隔。
+* 显存管理
+  * `--use_gradient_checkpointing_offload`: 是否将 gradient checkpointing 卸载到内存中。
+
+此外，训练框架基于 [`accelerate`](https://huggingface.co/docs/accelerate/index) 构建，在开始训练前运行 `accelerate config` 可配置 GPU 的相关参数。对于部分模型训练（例如 14B 模型的全量训练）脚本，我们提供了建议的 `accelerate` 配置文件，可在对应的训练脚本中查看。
+
+</details>
+
+
+<details>
+
+<summary>Step 1: 准备数据集</summary>
+
+数据集包含一系列文件，我们建议您这样组织数据集文件：
+
+```
+data/example_video_dataset/
+├── metadata.csv
+├── video1.mp4
+└── video2.mp4
+```
+
+其中 `video1.mp4`、`video2.mp4` 为训练用视频数据，`metadata.csv` 为元数据列表，例如
+
+```
+video,prompt
+video1.mp4,"from sunset to night, a small town, light, house, river"
+video2.mp4,"a dog is running"
+```
+
+我们构建了一个样例视频数据集，以方便您进行测试，通过以下命令可以下载这个数据集：
+
+```shell
+modelscope download --dataset DiffSynth-Studio/example_video_dataset --local_dir ./data/example_video_dataset
+```
+
+数据集支持视频和图片混合训练，支持的视频文件格式包括 `"mp4", "avi", "mov", "wmv", "mkv", "flv", "webm"`，支持的图片格式包括 `"jpg", "jpeg", "png", "webp"`。
+
+视频的尺寸可通过脚本参数 `--height`、`--width`、`--num_frames` 控制。在每个视频中，前 `num_frames` 帧会被用于训练，因此当视频长度不足 `num_frames` 帧时会报错，图片文件会被视为单帧视频。当 `--height` 和 `--width` 为空时将会开启动态分辨率，按照数据集中每个视频或图片的实际宽高训练。
+
+**我们强烈建议使用固定分辨率训练，并避免图像和视频混合训练，因为在多卡训练中存在负载均衡问题。**
+
+当模型需要额外输入时，例如具备控制能力的模型 [`PAI/Wan2.1-Fun-1.3B-Control`](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-Control) 所需的 `control_video`，请在数据集中补充相应的列，例如：
+
+```
+video,prompt,control_video
+video1.mp4,"from sunset to night, a small town, light, house, river",video1_softedge.mp4
+```
+
+额外输入若包含视频和图像文件，则需要在 `--data_file_keys` 参数中指定要解析的列名。该参数的默认值为 `"image,video"`，即解析列名为 `image` 和 `video` 的列。可根据额外输入增加相应的列名，例如 `--data_file_keys "image,video,control_video"`，同时启用 `--input_contains_control_video`。
+
+</details>
+
+
+<details>
+
+<summary>Step 2: 加载模型</summary>
+
+类似于推理时的模型加载逻辑，可直接通过模型 ID 配置要加载的模型。例如，推理时我们通过以下设置加载模型
+
+```python
+model_configs=[
+    ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors"),
+    ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"),
+    ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="Wan2.1_VAE.pth"),
+]
+```
+
+那么在训练时，填入以下参数即可加载对应的模型。
+
+```shell
+--model_id_with_origin_paths "Wan-AI/Wan2.1-T2V-1.3B:diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.1-T2V-1.3B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.1-T2V-1.3B:Wan2.1_VAE.pth"
+```
+
+如果您希望从本地文件加载模型，例如推理时
+
+```python
+model_configs=[
+    ModelConfig(path=[
+        "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors",
+        "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors",
+        "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors",
+        "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors",
+        "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors",
+        "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors",
+    ]),
+    ModelConfig(path="models/Wan-AI/Wan2.1-T2V-14B/models_t5_umt5-xxl-enc-bf16.pth"),
+    ModelConfig(path="models/Wan-AI/Wan2.1-T2V-14B/Wan2.1_VAE.pth"),
+]
+```
+
+那么训练时需设置为
+
+```shell
+--model_paths '[
+    [
+        "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors",
+        "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors",
+        "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors",
+        "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors",
+        "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors",
+        "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors"
+    ],
+    "models/Wan-AI/Wan2.1-T2V-14B/models_t5_umt5-xxl-enc-bf16.pth",
+    "models/Wan-AI/Wan2.1-T2V-14B/Wan2.1_VAE.pth"
+]' \
+```
+
+</details>
+
+
+<details>
+
+<summary>Step 3: 设置可训练模块</summary>
+
+训练框架支持训练基础模型，或 LoRA 模型。以下是几个例子：
+
+* 全量训练 DiT 部分：`--trainable_models dit`
+* 训练 DiT 部分的 LoRA 模型：`--lora_base_model dit --lora_target_modules "q,k,v,o,ffn.0,ffn.2" --lora_rank 32`
+* 训练 DiT 部分的 LoRA 和 Motion Controller 部分（是的，可以训练这种花里胡哨的结构）：`--trainable_models motion_controller --lora_base_model dit --lora_target_modules "q,k,v,o,ffn.0,ffn.2" --lora_rank 32`
+
+此外，由于训练脚本中加载了多个模块（text encoder、dit、vae），保存模型文件时需要移除前缀，例如在全量训练 DiT 部分或者训练 DiT 部分的 LoRA 模型时，请设置 `--remove_prefix_in_ckpt pipe.dit.`
+
+</details>
+
+
+<details>
+
+<summary>Step 4: 启动训练程序</summary>
+
+我们为每一个模型编写了训练命令，请参考本文档开头的表格。
+
+请注意，14B 模型全量训练需要8个GPU，每个GPU的显存至少为80G。全量训练这些14B模型时需要安装 `deepspeed`（`pip install deepspeed`），我们编写了建议的[配置文件](./model_training/full/accelerate_config_14B.yaml)，这个配置文件会在对应的训练脚本中被加载，这些脚本已在 8*A100 上测试过。
+
+训练脚本的默认视频尺寸为 `480*832*81`，提升分辨率将可能导致显存不足，请添加参数 `--use_gradient_checkpointing_offload` 降低显存占用。
+
+</details>
+
+## 案例展示
+
+1.3B 文生视频：
+
+https://github.com/user-attachments/assets/124397be-cd6a-4f29-a87c-e4c695aaabb8
+
+给狗狗戴上墨镜（1.3B 视频生视频）：
+
+https://github.com/user-attachments/assets/272808d7-fbeb-4747-a6df-14a0860c75fb
+
+14B 文生视频：
+
+https://github.com/user-attachments/assets/3908bc64-d451-485a-8b61-28f6d32dd92f
+
+14B 图生视频：
+
+https://github.com/user-attachments/assets/c0bdd5ca-292f-45ed-b9bc-afe193156e75
+
+LoRA 训练：
+
+https://github.com/user-attachments/assets/9bd8e30b-97e8-44f9-bb6f-da004ba376a9
--- a/examples/wanvideo/wan_1.3b_text_to_video_accelerate.py
+++ b/examples/wanvideo/wan_1.3b_text_to_video_accelerate.py
@@ -1,34 +1,27 @@
 import torch
-from diffsynth import ModelManager, WanVideoPipeline, save_video, VideoData
-from modelscope import snapshot_download
+from PIL import Image
+from diffsynth import save_video, VideoData
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig


-# Download models
-snapshot_download("Wan-AI/Wan2.1-T2V-1.3B", local_dir="models/Wan-AI/Wan2.1-T2V-1.3B")
-
-# Load models
-model_manager = ModelManager(device="cpu")
-model_manager.load_models(
-    [
-        "models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors",
-        "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
-        "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
    ],
-    torch_dtype=torch.bfloat16, # You can set `torch_dtype=torch.float8_e4m3fn` to enable FP8 quantization.
 )
-pipe = WanVideoPipeline.from_model_manager(model_manager, torch_dtype=torch.bfloat16, device="cuda")
-pipe.enable_vram_management(num_persistent_param_in_dit=None)
+pipe.enable_vram_management()
+

-# Text-to-video
 video = pipe(
    prompt="纪实摄影风格画面，一只活泼的小狗在绿茵茵的草地上迅速奔跑。小狗毛色棕黄，两只耳朵立起，神情专注而欢快。阳光洒在它身上，使得毛发看上去格外柔软而闪亮。背景是一片开阔的草地，偶尔点缀着几朵野花，远处隐约可见蓝天和几片白云。透视感鲜明，捕捉小狗奔跑时的动感和四周草地的生机。中景侧面移动视角。",
    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
-    num_inference_steps=50,
    seed=0, tiled=True,
    # TeaCache parameters
    tea_cache_l1_thresh=0.05, # The larger this value is, the faster the speed, but the worse the visual quality.
    tea_cache_model_id="Wan2.1-T2V-1.3B", # Choose one in (Wan2.1-T2V-1.3B, Wan2.1-T2V-14B, Wan2.1-I2V-14B-480P, Wan2.1-I2V-14B-720P).
 )
 save_video(video, "video1.mp4", fps=15, quality=5)
-
-# TeaCache doesn't support video-to-video
--- a/examples/wanvideo/acceleration/unified_sequence_parallel.py
+++ b/examples/wanvideo/acceleration/unified_sequence_parallel.py
@@ -0,0 +1,27 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+import torch.distributed as dist
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    use_usp=True,
+    model_configs=[
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+    ],
+)
+pipe.enable_vram_management()
+
+
+video = pipe(
+    prompt="一名宇航员身穿太空服，面朝镜头骑着一匹机械马在火星表面驰骋。红色的荒凉地表延伸至远方，点缀着巨大的陨石坑和奇特的岩石结构。机械马的步伐稳健，扬起微弱的尘埃，展现出未来科技与原始探索的完美结合。宇航员手持操控装置，目光坚定，仿佛正在开辟人类的新疆域。背景是深邃的宇宙和蔚蓝的地球，画面既科幻又充满希望，让人不禁畅想未来的星际生活。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    seed=0, tiled=True,
+)
+if dist.get_rank() == 0:
+    save_video(video, "video1.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_inference/Wan2.1-1.3b-speedcontrol-v1.py
+++ b/examples/wanvideo/model_inference/Wan2.1-1.3b-speedcontrol-v1.py
@@ -1,31 +1,25 @@
 import torch
-from diffsynth import ModelManager, WanVideoPipeline, save_video, VideoData
-from modelscope import snapshot_download
+from PIL import Image
+from diffsynth import save_video, VideoData
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig


-# Download models
-snapshot_download("Wan-AI/Wan2.1-T2V-1.3B", local_dir="models/Wan-AI/Wan2.1-T2V-1.3B")
-snapshot_download("DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1", local_dir="models/DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1")
-
-# Load models
-model_manager = ModelManager(device="cpu")
-model_manager.load_models(
-    [
-        "models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors",
-        "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
-        "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
-        "models/DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1/model.safetensors",
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1", origin_file_pattern="model.safetensors", offload_device="cpu"),
    ],
-    torch_dtype=torch.bfloat16, # You can set `torch_dtype=torch.float8_e4m3fn` to enable FP8 quantization.
 )
-pipe = WanVideoPipeline.from_model_manager(model_manager, torch_dtype=torch.bfloat16, device="cuda")
-pipe.enable_vram_management(num_persistent_param_in_dit=None)
+pipe.enable_vram_management()

 # Text-to-video
 video = pipe(
    prompt="纪实摄影风格画面，一只活泼的小狗在绿茵茵的草地上迅速奔跑。小狗毛色棕黄，两只耳朵立起，神情专注而欢快。阳光洒在它身上，使得毛发看上去格外柔软而闪亮。背景是一片开阔的草地，偶尔点缀着几朵野花，远处隐约可见蓝天和几片白云。透视感鲜明，捕捉小狗奔跑时的动感和四周草地的生机。中景侧面移动视角。",
    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
-    num_inference_steps=50,
    seed=1, tiled=True,
    motion_bucket_id=0
 )
@@ -34,8 +28,7 @@ save_video(video, "video_slow.mp4", fps=15, quality=5)
 video = pipe(
    prompt="纪实摄影风格画面，一只活泼的小狗在绿茵茵的草地上迅速奔跑。小狗毛色棕黄，两只耳朵立起，神情专注而欢快。阳光洒在它身上，使得毛发看上去格外柔软而闪亮。背景是一片开阔的草地，偶尔点缀着几朵野花，远处隐约可见蓝天和几片白云。透视感鲜明，捕捉小狗奔跑时的动感和四周草地的生机。中景侧面移动视角。",
    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
-    num_inference_steps=50,
    seed=1, tiled=True,
    motion_bucket_id=100
 )
-save_video(video, "video_fast.mp4", fps=15, quality=5)
+save_video(video, "video_fast.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_inference/Wan2.1-FLF2V-14B-720P.py
+++ b/examples/wanvideo/model_inference/Wan2.1-FLF2V-14B-720P.py
@@ -0,0 +1,36 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+pipe.enable_vram_management()
+
+dataset_snapshot_download(
+    dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+    local_dir="./",
+    allow_file_pattern=["data/examples/wan/first_frame.jpeg", "data/examples/wan/last_frame.jpeg"]
+)
+
+# First and last frame to video
+video = pipe(
+    prompt="写实风格，一个女生手持枯萎的花站在花园中，镜头逐渐拉远，记录下花园的全貌。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    input_image=Image.open("data/examples/wan/first_frame.jpeg").resize((960, 960)),
+    end_image=Image.open("data/examples/wan/last_frame.jpeg").resize((960, 960)),
+    seed=0, tiled=True,
+    height=960, width=960, num_frames=33,
+    sigma_shift=16,
+)
+save_video(video, "video.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_inference/Wan2.1-Fun-1.3B-Control.py
+++ b/examples/wanvideo/model_inference/Wan2.1-Fun-1.3B-Control.py
@@ -0,0 +1,34 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+pipe.enable_vram_management()
+
+dataset_snapshot_download(
+    dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+    local_dir="./",
+    allow_file_pattern=f"data/examples/wan/control_video.mp4"
+)
+
+# Control video
+control_video = VideoData("data/examples/wan/control_video.mp4", height=832, width=576)
+video = pipe(
+    prompt="扁平风格动漫，一位长发少女优雅起舞。她五官精致，大眼睛明亮有神，黑色长发柔顺光泽。身穿淡蓝色T恤和深蓝色牛仔短裤。背景是粉色。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    control_video=control_video, height=832, width=576, num_frames=49,
+    seed=1, tiled=True
+)
+save_video(video, "video.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_inference/Wan2.1-Fun-1.3B-InP.py
+++ b/examples/wanvideo/model_inference/Wan2.1-Fun-1.3B-InP.py
@@ -1,27 +1,22 @@
 import torch
-from diffsynth import ModelManager, WanVideoPipeline, save_video, VideoData
-from modelscope import snapshot_download, dataset_snapshot_download
 from PIL import Image
+from diffsynth import save_video, VideoData
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download


-# Download models
-snapshot_download("PAI/Wan2.1-Fun-1.3B-InP", local_dir="models/PAI/Wan2.1-Fun-1.3B-InP")
-
-# Load models
-model_manager = ModelManager(device="cpu")
-model_manager.load_models(
-    [
-        "models/PAI/Wan2.1-Fun-1.3B-InP/diffusion_pytorch_model.safetensors",
-        "models/PAI/Wan2.1-Fun-1.3B-InP/models_t5_umt5-xxl-enc-bf16.pth",
-        "models/PAI/Wan2.1-Fun-1.3B-InP/Wan2.1_VAE.pth",
-        "models/PAI/Wan2.1-Fun-1.3B-InP/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth",
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
    ],
-    torch_dtype=torch.bfloat16, # You can set `torch_dtype=torch.float8_e4m3fn` to enable FP8 quantization.
 )
-pipe = WanVideoPipeline.from_model_manager(model_manager, torch_dtype=torch.bfloat16, device="cuda")
-pipe.enable_vram_management(num_persistent_param_in_dit=None)
+pipe.enable_vram_management()

-# Download example image
 dataset_snapshot_download(
    dataset_id="DiffSynth-Studio/examples_in_diffsynth",
    local_dir="./",
@@ -29,14 +24,13 @@ dataset_snapshot_download(
 )
 image = Image.open("data/examples/wan/input_image.jpg")

-# Image-to-video
+# First and last frame to video
 video = pipe(
    prompt="一艘小船正勇敢地乘风破浪前行。蔚蓝的大海波涛汹涌，白色的浪花拍打着船身，但小船毫不畏惧，坚定地驶向远方。阳光洒在水面上，闪烁着金色的光芒，为这壮丽的场景增添了一抹温暖。镜头拉近，可以看到船上的旗帜迎风飘扬，象征着不屈的精神与冒险的勇气。这段画面充满力量，激励人心，展现了面对挑战时的无畏与执着。",
    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
-    num_inference_steps=50,
    input_image=image,
+    seed=0, tiled=True
    # You can input `end_image=xxx` to control the last frame of the video.
    # The model will automatically generate the dynamic content between `input_image` and `end_image`.
-    seed=1, tiled=True
 )
-save_video(video, "video1.mp4", fps=15, quality=5)
+save_video(video, "video.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_inference/Wan2.1-Fun-14B-Control.py
+++ b/examples/wanvideo/model_inference/Wan2.1-Fun-14B-Control.py
@@ -0,0 +1,34 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+pipe.enable_vram_management()
+
+dataset_snapshot_download(
+    dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+    local_dir="./",
+    allow_file_pattern=f"data/examples/wan/control_video.mp4"
+)
+
+# Control video
+control_video = VideoData("data/examples/wan/control_video.mp4", height=832, width=576)
+video = pipe(
+    prompt="扁平风格动漫，一位长发少女优雅起舞。她五官精致，大眼睛明亮有神，黑色长发柔顺光泽。身穿淡蓝色T恤和深蓝色牛仔短裤。背景是粉色。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    control_video=control_video, height=832, width=576, num_frames=49,
+    seed=1, tiled=True
+)
+save_video(video, "video.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_inference/Wan2.1-Fun-14B-InP.py
+++ b/examples/wanvideo/model_inference/Wan2.1-Fun-14B-InP.py
@@ -0,0 +1,36 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+pipe.enable_vram_management()
+
+dataset_snapshot_download(
+    dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+    local_dir="./",
+    allow_file_pattern=f"data/examples/wan/input_image.jpg"
+)
+image = Image.open("data/examples/wan/input_image.jpg")
+
+# First and last frame to video
+video = pipe(
+    prompt="一艘小船正勇敢地乘风破浪前行。蔚蓝的大海波涛汹涌，白色的浪花拍打着船身，但小船毫不畏惧，坚定地驶向远方。阳光洒在水面上，闪烁着金色的光芒，为这壮丽的场景增添了一抹温暖。镜头拉近，可以看到船上的旗帜迎风飘扬，象征着不屈的精神与冒险的勇气。这段画面充满力量，激励人心，展现了面对挑战时的无畏与执着。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    input_image=image,
+    seed=0, tiled=True
+    # You can input `end_image=xxx` to control the last frame of the video.
+    # The model will automatically generate the dynamic content between `input_image` and `end_image`.
+)
+save_video(video, "video.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py
+++ b/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py
@@ -0,0 +1,44 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+pipe.enable_vram_management()
+
+
+dataset_snapshot_download(
+    dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+    local_dir="./",
+    allow_file_pattern=f"data/examples/wan/input_image.jpg"
+)
+input_image = Image.open("data/examples/wan/input_image.jpg")
+
+video = pipe(
+    prompt="一艘小船正勇敢地乘风破浪前行。蔚蓝的大海波涛汹涌，白色的浪花拍打着船身，但小船毫不畏惧，坚定地驶向远方。阳光洒在水面上，闪烁着金色的光芒，为这壮丽的场景增添了一抹温暖。镜头拉近，可以看到船上的旗帜迎风飘扬，象征着不屈的精神与冒险的勇气。这段画面充满力量，激励人心，展现了面对挑战时的无畏与执着。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    seed=0, tiled=True,
+    input_image=input_image,
+    camera_control_direction="Left", camera_control_speed=0.01,
+)
+save_video(video, "video_left.mp4", fps=15, quality=5)
+
+video = pipe(
+    prompt="一艘小船正勇敢地乘风破浪前行。蔚蓝的大海波涛汹涌，白色的浪花拍打着船身，但小船毫不畏惧，坚定地驶向远方。阳光洒在水面上，闪烁着金色的光芒，为这壮丽的场景增添了一抹温暖。镜头拉近，可以看到船上的旗帜迎风飘扬，象征着不屈的精神与冒险的勇气。这段画面充满力量，激励人心，展现了面对挑战时的无畏与执着。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    seed=0, tiled=True,
+    input_image=input_image,
+    camera_control_direction="Up", camera_control_speed=0.01,
+)
+save_video(video, "video_up.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-Control.py
+++ b/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-Control.py
@@ -0,0 +1,36 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+pipe.enable_vram_management()
+
+dataset_snapshot_download(
+    dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+    local_dir="./",
+    allow_file_pattern=["data/examples/wan/control_video.mp4", "data/examples/wan/reference_image_girl.png"]
+)
+
+# Control video
+control_video = VideoData("data/examples/wan/control_video.mp4", height=832, width=576)
+reference_image = Image.open("data/examples/wan/reference_image_girl.png").resize((576, 832))
+video = pipe(
+    prompt="扁平风格动漫，一位长发少女优雅起舞。她五官精致，大眼睛明亮有神，黑色长发柔顺光泽。身穿淡蓝色T恤和深蓝色牛仔短裤。背景是粉色。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    control_video=control_video, reference_image=reference_image,
+    height=832, width=576, num_frames=49,
+    seed=1, tiled=True
+)
+save_video(video, "video.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-InP.py
+++ b/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-InP.py
@@ -0,0 +1,36 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-InP", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-InP", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+pipe.enable_vram_management()
+
+dataset_snapshot_download(
+    dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+    local_dir="./",
+    allow_file_pattern=f"data/examples/wan/input_image.jpg"
+)
+image = Image.open("data/examples/wan/input_image.jpg")
+
+# First and last frame to video
+video = pipe(
+    prompt="一艘小船正勇敢地乘风破浪前行。蔚蓝的大海波涛汹涌，白色的浪花拍打着船身，但小船毫不畏惧，坚定地驶向远方。阳光洒在水面上，闪烁着金色的光芒，为这壮丽的场景增添了一抹温暖。镜头拉近，可以看到船上的旗帜迎风飘扬，象征着不屈的精神与冒险的勇气。这段画面充满力量，激励人心，展现了面对挑战时的无畏与执着。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    input_image=image,
+    seed=0, tiled=True
+    # You can input `end_image=xxx` to control the last frame of the video.
+    # The model will automatically generate the dynamic content between `input_image` and `end_image`.
+)
+save_video(video, "video.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-Control-Camera.py
+++ b/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-Control-Camera.py
@@ -0,0 +1,44 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+pipe.enable_vram_management()
+
+
+dataset_snapshot_download(
+    dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+    local_dir="./",
+    allow_file_pattern=f"data/examples/wan/input_image.jpg"
+)
+input_image = Image.open("data/examples/wan/input_image.jpg")
+
+video = pipe(
+    prompt="一艘小船正勇敢地乘风破浪前行。蔚蓝的大海波涛汹涌，白色的浪花拍打着船身，但小船毫不畏惧，坚定地驶向远方。阳光洒在水面上，闪烁着金色的光芒，为这壮丽的场景增添了一抹温暖。镜头拉近，可以看到船上的旗帜迎风飘扬，象征着不屈的精神与冒险的勇气。这段画面充满力量，激励人心，展现了面对挑战时的无畏与执着。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    seed=0, tiled=True,
+    input_image=input_image,
+    camera_control_direction="Left", camera_control_speed=0.01,
+)
+save_video(video, "video_left.mp4", fps=15, quality=5)
+
+video = pipe(
+    prompt="一艘小船正勇敢地乘风破浪前行。蔚蓝的大海波涛汹涌，白色的浪花拍打着船身，但小船毫不畏惧，坚定地驶向远方。阳光洒在水面上，闪烁着金色的光芒，为这壮丽的场景增添了一抹温暖。镜头拉近，可以看到船上的旗帜迎风飘扬，象征着不屈的精神与冒险的勇气。这段画面充满力量，激励人心，展现了面对挑战时的无畏与执着。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    seed=0, tiled=True,
+    input_image=input_image,
+    camera_control_direction="Up", camera_control_speed=0.01,
+)
+save_video(video, "video_up.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-Control.py
+++ b/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-Control.py
@@ -0,0 +1,36 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+pipe.enable_vram_management()
+
+dataset_snapshot_download(
+    dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+    local_dir="./",
+    allow_file_pattern=["data/examples/wan/control_video.mp4", "data/examples/wan/reference_image_girl.png"]
+)
+
+# Control video
+control_video = VideoData("data/examples/wan/control_video.mp4", height=832, width=576)
+reference_image = Image.open("data/examples/wan/reference_image_girl.png").resize((576, 832))
+video = pipe(
+    prompt="扁平风格动漫，一位长发少女优雅起舞。她五官精致，大眼睛明亮有神，黑色长发柔顺光泽。身穿淡蓝色T恤和深蓝色牛仔短裤。背景是粉色。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    control_video=control_video, reference_image=reference_image,
+    height=832, width=576, num_frames=49,
+    seed=1, tiled=True
+)
+save_video(video, "video1.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-InP.py
+++ b/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-InP.py
@@ -0,0 +1,36 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-InP", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-InP", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+pipe.enable_vram_management()
+
+dataset_snapshot_download(
+    dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+    local_dir="./",
+    allow_file_pattern=f"data/examples/wan/input_image.jpg"
+)
+image = Image.open("data/examples/wan/input_image.jpg")
+
+# First and last frame to video
+video = pipe(
+    prompt="一艘小船正勇敢地乘风破浪前行。蔚蓝的大海波涛汹涌，白色的浪花拍打着船身，但小船毫不畏惧，坚定地驶向远方。阳光洒在水面上，闪烁着金色的光芒，为这壮丽的场景增添了一抹温暖。镜头拉近，可以看到船上的旗帜迎风飘扬，象征着不屈的精神与冒险的勇气。这段画面充满力量，激励人心，展现了面对挑战时的无畏与执着。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    input_image=image,
+    seed=0, tiled=True
+    # You can input `end_image=xxx` to control the last frame of the video.
+    # The model will automatically generate the dynamic content between `input_image` and `end_image`.
+)
+save_video(video, "video.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_inference/Wan2.1-I2V-14B-480P.py
+++ b/examples/wanvideo/model_inference/Wan2.1-I2V-14B-480P.py
@@ -0,0 +1,34 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+pipe.enable_vram_management()
+
+dataset_snapshot_download(
+    dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+    local_dir="./",
+    allow_file_pattern=f"data/examples/wan/input_image.jpg"
+)
+image = Image.open("data/examples/wan/input_image.jpg")
+
+# Image-to-video
+video = pipe(
+    prompt="一艘小船正勇敢地乘风破浪前行。蔚蓝的大海波涛汹涌，白色的浪花拍打着船身，但小船毫不畏惧，坚定地驶向远方。阳光洒在水面上，闪烁着金色的光芒，为这壮丽的场景增添了一抹温暖。镜头拉近，可以看到船上的旗帜迎风飘扬，象征着不屈的精神与冒险的勇气。这段画面充满力量，激励人心，展现了面对挑战时的无畏与执着。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    input_image=image,
+    seed=0, tiled=True
+)
+save_video(video, "video.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_inference/Wan2.1-I2V-14B-720P.py
+++ b/examples/wanvideo/model_inference/Wan2.1-I2V-14B-720P.py
@@ -0,0 +1,35 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+pipe.enable_vram_management()
+
+dataset_snapshot_download(
+    dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+    local_dir="./",
+    allow_file_pattern=f"data/examples/wan/input_image.jpg"
+)
+image = Image.open("data/examples/wan/input_image.jpg")
+
+# Image-to-video
+video = pipe(
+    prompt="一艘小船正勇敢地乘风破浪前行。蔚蓝的大海波涛汹涌，白色的浪花拍打着船身，但小船毫不畏惧，坚定地驶向远方。阳光洒在水面上，闪烁着金色的光芒，为这壮丽的场景增添了一抹温暖。镜头拉近，可以看到船上的旗帜迎风飘扬，象征着不屈的精神与冒险的勇气。这段画面充满力量，激励人心，展现了面对挑战时的无畏与执着。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    input_image=image,
+    seed=0, tiled=True,
+    height=720, width=1280,
+)
+save_video(video, "video.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_inference/Wan2.1-T2V-1.3B.py
+++ b/examples/wanvideo/model_inference/Wan2.1-T2V-1.3B.py
@@ -1,30 +1,25 @@
 import torch
-from diffsynth import ModelManager, WanVideoPipeline, save_video, VideoData
-from modelscope import snapshot_download
+from PIL import Image
+from diffsynth import save_video, VideoData
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig


-# Download models
-snapshot_download("Wan-AI/Wan2.1-T2V-1.3B", local_dir="models/Wan-AI/Wan2.1-T2V-1.3B")
-
-# Load models
-model_manager = ModelManager(device="cpu")
-model_manager.load_models(
-    [
-        "models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors",
-        "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
-        "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
    ],
-    torch_dtype=torch.bfloat16, # You can set `torch_dtype=torch.float8_e4m3fn` to enable FP8 quantization.
 )
-pipe = WanVideoPipeline.from_model_manager(model_manager, torch_dtype=torch.bfloat16, device="cuda")
-pipe.enable_vram_management(num_persistent_param_in_dit=None)
+pipe.enable_vram_management()

 # Text-to-video
 video = pipe(
    prompt="纪实摄影风格画面，一只活泼的小狗在绿茵茵的草地上迅速奔跑。小狗毛色棕黄，两只耳朵立起，神情专注而欢快。阳光洒在它身上，使得毛发看上去格外柔软而闪亮。背景是一片开阔的草地，偶尔点缀着几朵野花，远处隐约可见蓝天和几片白云。透视感鲜明，捕捉小狗奔跑时的动感和四周草地的生机。中景侧面移动视角。",
    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
-    num_inference_steps=50,
-    seed=0, tiled=True
+    seed=0, tiled=True,
 )
 save_video(video, "video1.mp4", fps=15, quality=5)

@@ -34,7 +29,6 @@ video = pipe(
    prompt="纪实摄影风格画面，一只活泼的小狗戴着黑色墨镜在绿茵茵的草地上迅速奔跑。小狗毛色棕黄，戴着黑色墨镜，两只耳朵立起，神情专注而欢快。阳光洒在它身上，使得毛发看上去格外柔软而闪亮。背景是一片开阔的草地，偶尔点缀着几朵野花，远处隐约可见蓝天和几片白云。透视感鲜明，捕捉小狗奔跑时的动感和四周草地的生机。中景侧面移动视角。",
    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
    input_video=video, denoising_strength=0.7,
-    num_inference_steps=50,
    seed=1, tiled=True
 )
 save_video(video, "video2.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_inference/Wan2.1-T2V-14B.py
+++ b/examples/wanvideo/model_inference/Wan2.1-T2V-14B.py
@@ -0,0 +1,24 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+    ],
+)
+pipe.enable_vram_management()
+
+# Text-to-video
+video = pipe(
+    prompt="一名宇航员身穿太空服，面朝镜头骑着一匹机械马在火星表面驰骋。红色的荒凉地表延伸至远方，点缀着巨大的陨石坑和奇特的岩石结构。机械马的步伐稳健，扬起微弱的尘埃，展现出未来科技与原始探索的完美结合。宇航员手持操控装置，目光坚定，仿佛正在开辟人类的新疆域。背景是深邃的宇宙和蔚蓝的地球，画面既科幻又充满希望，让人不禁畅想未来的星际生活。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    seed=0, tiled=True,
+)
+save_video(video, "video1.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_inference/Wan2.1-VACE-1.3B-Preview.py
+++ b/examples/wanvideo/model_inference/Wan2.1-VACE-1.3B-Preview.py
@@ -0,0 +1,52 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="iic/VACE-Wan2.1-1.3B-Preview", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="iic/VACE-Wan2.1-1.3B-Preview", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="iic/VACE-Wan2.1-1.3B-Preview", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+    ],
+)
+pipe.enable_vram_management()
+
+dataset_snapshot_download(
+    dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+    local_dir="./",
+    allow_file_pattern=["data/examples/wan/depth_video.mp4", "data/examples/wan/cat_fightning.jpg"]
+)
+
+# Depth video -> Video
+control_video = VideoData("data/examples/wan/depth_video.mp4", height=480, width=832)
+video = pipe(
+    prompt="两只可爱的橘猫戴上拳击手套，站在一个拳击台上搏斗。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    vace_video=control_video,
+    seed=1, tiled=True
+)
+save_video(video, "video1.mp4", fps=15, quality=5)
+
+# Reference image -> Video
+video = pipe(
+    prompt="两只可爱的橘猫戴上拳击手套，站在一个拳击台上搏斗。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    vace_reference_image=Image.open("data/examples/wan/cat_fightning.jpg").resize((832, 480)),
+    seed=1, tiled=True
+)
+save_video(video, "video2.mp4", fps=15, quality=5)
+
+# Depth video + Reference image -> Video
+video = pipe(
+    prompt="两只可爱的橘猫戴上拳击手套，站在一个拳击台上搏斗。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    vace_video=control_video,
+    vace_reference_image=Image.open("data/examples/wan/cat_fightning.jpg").resize((832, 480)),
+    seed=1, tiled=True
+)
+save_video(video, "video3.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_inference/Wan2.1-VACE-1.3B.py
+++ b/examples/wanvideo/model_inference/Wan2.1-VACE-1.3B.py
@@ -0,0 +1,53 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Wan-AI/Wan2.1-VACE-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-VACE-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-VACE-1.3B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+    ],
+)
+
+pipe.enable_vram_management()
+
+dataset_snapshot_download(
+    dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+    local_dir="./",
+    allow_file_pattern=["data/examples/wan/depth_video.mp4", "data/examples/wan/cat_fightning.jpg"]
+)
+
+# Depth video -> Video
+control_video = VideoData("data/examples/wan/depth_video.mp4", height=480, width=832)
+video = pipe(
+    prompt="两只可爱的橘猫戴上拳击手套，站在一个拳击台上搏斗。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    vace_video=control_video,
+    seed=1, tiled=True
+)
+save_video(video, "video1.mp4", fps=15, quality=5)
+
+# Reference image -> Video
+video = pipe(
+    prompt="两只可爱的橘猫戴上拳击手套，站在一个拳击台上搏斗。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    vace_reference_image=Image.open("data/examples/wan/cat_fightning.jpg").resize((832, 480)),
+    seed=1, tiled=True
+)
+save_video(video, "video2.mp4", fps=15, quality=5)
+
+# Depth video + Reference image -> Video
+video = pipe(
+    prompt="两只可爱的橘猫戴上拳击手套，站在一个拳击台上搏斗。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    vace_video=control_video,
+    vace_reference_image=Image.open("data/examples/wan/cat_fightning.jpg").resize((832, 480)),
+    seed=1, tiled=True
+)
+save_video(video, "video3.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_inference/Wan2.1-VACE-14B.py
+++ b/examples/wanvideo/model_inference/Wan2.1-VACE-14B.py
@@ -0,0 +1,54 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Wan-AI/Wan2.1-VACE-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-VACE-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-VACE-14B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+    ],
+)
+
+
+pipe.enable_vram_management()
+
+dataset_snapshot_download(
+    dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+    local_dir="./",
+    allow_file_pattern=["data/examples/wan/depth_video.mp4", "data/examples/wan/cat_fightning.jpg"]
+)
+
+# Depth video -> Video
+control_video = VideoData("data/examples/wan/depth_video.mp4", height=480, width=832)
+video = pipe(
+    prompt="两只可爱的橘猫戴上拳击手套，站在一个拳击台上搏斗。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    vace_video=control_video,
+    seed=1, tiled=True
+)
+save_video(video, "video1_14b.mp4", fps=15, quality=5)
+
+# Reference image -> Video
+video = pipe(
+    prompt="两只可爱的橘猫戴上拳击手套，站在一个拳击台上搏斗。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    vace_reference_image=Image.open("data/examples/wan/cat_fightning.jpg").resize((832, 480)),
+    seed=1, tiled=True
+)
+save_video(video, "video2_14b.mp4", fps=15, quality=5)
+
+# Depth video + Reference image -> Video
+video = pipe(
+    prompt="两只可爱的橘猫戴上拳击手套，站在一个拳击台上搏斗。",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    vace_video=control_video,
+    vace_reference_image=Image.open("data/examples/wan/cat_fightning.jpg").resize((832, 480)),
+    seed=1, tiled=True
+)
+save_video(video, "video3_14b.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_training/full/Wan2.1-1.3b-speedcontrol-v1.sh
+++ b/examples/wanvideo/model_training/full/Wan2.1-1.3b-speedcontrol-v1.sh
@@ -0,0 +1,13 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata_motion_bucket_id.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "Wan-AI/Wan2.1-T2V-1.3B:diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.1-T2V-1.3B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.1-T2V-1.3B:Wan2.1_VAE.pth,DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1:model.safetensors" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.motion_controller." \
+  --output_path "./models/train/Wan2.1-1.3b-speedcontrol-v1_full" \
+  --trainable_models "motion_controller" \
+  --extra_inputs "motion_bucket_id"
--- a/examples/wanvideo/model_training/full/Wan2.1-FLF2V-14B-720P.sh
+++ b/examples/wanvideo/model_training/full/Wan2.1-FLF2V-14B-720P.sh
@@ -0,0 +1,13 @@
+accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "Wan-AI/Wan2.1-FLF2V-14B-720P:diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.1-FLF2V-14B-720P:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.1-FLF2V-14B-720P:Wan2.1_VAE.pth,Wan-AI/Wan2.1-FLF2V-14B-720P:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-FLF2V-14B-720P_full" \
+  --trainable_models "dit" \
+  --extra_inputs "input_image,end_image"
--- a/examples/wanvideo/model_training/full/Wan2.1-Fun-1.3B-Control.sh
+++ b/examples/wanvideo/model_training/full/Wan2.1-Fun-1.3B-Control.sh
@@ -0,0 +1,14 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata_control.csv \
+  --data_file_keys "video,control_video" \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "PAI/Wan2.1-Fun-1.3B-Control:diffusion_pytorch_model*.safetensors,PAI/Wan2.1-Fun-1.3B-Control:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.1-Fun-1.3B-Control:Wan2.1_VAE.pth,PAI/Wan2.1-Fun-1.3B-Control:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-Fun-1.3B-Control_full" \
+  --trainable_models "dit" \
+  --extra_inputs "control_video"
--- a/examples/wanvideo/model_training/full/Wan2.1-Fun-1.3B-InP.sh
+++ b/examples/wanvideo/model_training/full/Wan2.1-Fun-1.3B-InP.sh
@@ -0,0 +1,13 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "PAI/Wan2.1-Fun-1.3B-InP:diffusion_pytorch_model*.safetensors,PAI/Wan2.1-Fun-1.3B-InP:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.1-Fun-1.3B-InP:Wan2.1_VAE.pth,PAI/Wan2.1-Fun-1.3B-InP:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-Fun-1.3B-InP_full" \
+  --trainable_models "dit" \
+  --extra_inputs "input_image,end_image"
--- a/examples/wanvideo/model_training/full/Wan2.1-Fun-14B-Control.sh
+++ b/examples/wanvideo/model_training/full/Wan2.1-Fun-14B-Control.sh
@@ -0,0 +1,14 @@
+accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata_control.csv \
+  --data_file_keys "video,control_video" \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "PAI/Wan2.1-Fun-14B-Control:diffusion_pytorch_model*.safetensors,PAI/Wan2.1-Fun-14B-Control:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.1-Fun-14B-Control:Wan2.1_VAE.pth,PAI/Wan2.1-Fun-14B-Control:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-Fun-14B-Control_full" \
+  --trainable_models "dit" \
+  --extra_inputs "control_video"
--- a/examples/wanvideo/model_training/full/Wan2.1-Fun-14B-InP.sh
+++ b/examples/wanvideo/model_training/full/Wan2.1-Fun-14B-InP.sh
@@ -0,0 +1,13 @@
+accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "PAI/Wan2.1-Fun-14B-InP:diffusion_pytorch_model*.safetensors,PAI/Wan2.1-Fun-14B-InP:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.1-Fun-14B-InP:Wan2.1_VAE.pth,PAI/Wan2.1-Fun-14B-InP:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-Fun-14B-InP_full" \
+  --trainable_models "dit" \
+  --extra_inputs "input_image,end_image"
--- a/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh
+++ b/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh
@@ -0,0 +1,13 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata_camera_control.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera:diffusion_pytorch_model*.safetensors,PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera:Wan2.1_VAE.pth,PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-Fun-V1.1-1.3B-Control-Camera_full" \
+  --trainable_models "dit" \
+  --extra_inputs "input_image,camera_control_direction,camera_control_speed"
--- a/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-Control.sh
+++ b/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-Control.sh
@@ -0,0 +1,14 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata_reference_control.csv \
+  --data_file_keys "video,control_video,reference_image" \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "PAI/Wan2.1-Fun-V1.1-1.3B-Control:diffusion_pytorch_model*.safetensors,PAI/Wan2.1-Fun-V1.1-1.3B-Control:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.1-Fun-V1.1-1.3B-Control:Wan2.1_VAE.pth,PAI/Wan2.1-Fun-V1.1-1.3B-Control:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-Fun-V1.1-1.3B-Control_full" \
+  --trainable_models "dit" \
+  --extra_inputs "control_video,reference_image"
--- a/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-InP.sh
+++ b/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-InP.sh
@@ -0,0 +1,13 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "PAI/Wan2.1-Fun-V1.1-1.3B-InP:diffusion_pytorch_model*.safetensors,PAI/Wan2.1-Fun-V1.1-1.3B-InP:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.1-Fun-V1.1-1.3B-InP:Wan2.1_VAE.pth,PAI/Wan2.1-Fun-V1.1-1.3B-InP:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-Fun-V1.1-1.3B-InP_full" \
+  --trainable_models "dit" \
+  --extra_inputs "input_image,end_image"
--- a/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-Control-Camera.sh
+++ b/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-Control-Camera.sh
@@ -0,0 +1,13 @@
+accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata_camera_control.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "PAI/Wan2.1-Fun-V1.1-14B-Control-Camera:diffusion_pytorch_model*.safetensors,PAI/Wan2.1-Fun-V1.1-14B-Control-Camera:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.1-Fun-V1.1-14B-Control-Camera:Wan2.1_VAE.pth,PAI/Wan2.1-Fun-V1.1-14B-Control-Camera:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-Fun-V1.1-14B-Control-Camera_full" \
+  --trainable_models "dit" \
+  --extra_inputs "input_image,camera_control_direction,camera_control_speed"
--- a/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-Control.sh
+++ b/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-Control.sh
@@ -0,0 +1,14 @@
+accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata_reference_control.csv \
+  --data_file_keys "video,control_video,reference_image" \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "PAI/Wan2.1-Fun-V1.1-14B-Control:diffusion_pytorch_model*.safetensors,PAI/Wan2.1-Fun-V1.1-14B-Control:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.1-Fun-V1.1-14B-Control:Wan2.1_VAE.pth,PAI/Wan2.1-Fun-V1.1-14B-Control:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-Fun-V1.1-14B-Control_full" \
+  --trainable_models "dit" \
+  --extra_inputs "control_video,reference_image"
--- a/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-InP.sh
+++ b/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-InP.sh
@@ -0,0 +1,13 @@
+accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "PAI/Wan2.1-Fun-V1.1-14B-InP:diffusion_pytorch_model*.safetensors,PAI/Wan2.1-Fun-V1.1-14B-InP:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.1-Fun-V1.1-14B-InP:Wan2.1_VAE.pth,PAI/Wan2.1-Fun-V1.1-14B-InP:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-Fun-V1.1-14B-InP_full" \
+  --trainable_models "dit" \
+  --extra_inputs "input_image,end_image"
--- a/examples/wanvideo/model_training/full/Wan2.1-I2V-14B-480P.sh
+++ b/examples/wanvideo/model_training/full/Wan2.1-I2V-14B-480P.sh
@@ -0,0 +1,13 @@
+accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "Wan-AI/Wan2.1-I2V-14B-480P:diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.1-I2V-14B-480P:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.1-I2V-14B-480P:Wan2.1_VAE.pth,Wan-AI/Wan2.1-I2V-14B-480P:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-I2V-14B-480P_full" \
+  --trainable_models "dit" \
+  --extra_inputs "input_image"
--- a/examples/wanvideo/model_training/full/Wan2.1-I2V-14B-720P.sh
+++ b/examples/wanvideo/model_training/full/Wan2.1-I2V-14B-720P.sh
@@ -0,0 +1,13 @@
+accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "Wan-AI/Wan2.1-I2V-14B-720P:diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.1-I2V-14B-720P:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.1-I2V-14B-720P:Wan2.1_VAE.pth,Wan-AI/Wan2.1-I2V-14B-720P:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-I2V-14B-720P_full" \
+  --trainable_models "dit" \
+  --extra_inputs "input_image"
--- a/examples/wanvideo/model_training/full/Wan2.1-T2V-1.3B.sh
+++ b/examples/wanvideo/model_training/full/Wan2.1-T2V-1.3B.sh
@@ -0,0 +1,12 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "Wan-AI/Wan2.1-T2V-1.3B:diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.1-T2V-1.3B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.1-T2V-1.3B:Wan2.1_VAE.pth" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-T2V-1.3B_full" \
+  --trainable_models "dit"
--- a/examples/wanvideo/model_training/full/Wan2.1-T2V-14B.sh
+++ b/examples/wanvideo/model_training/full/Wan2.1-T2V-14B.sh
@@ -0,0 +1,12 @@
+accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "Wan-AI/Wan2.1-T2V-14B:diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.1-T2V-14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.1-T2V-14B:Wan2.1_VAE.pth" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-T2V-14B_full" \
+  --trainable_models "dit"
--- a/examples/wanvideo/model_training/full/Wan2.1-VACE-1.3B-Preview.sh
+++ b/examples/wanvideo/model_training/full/Wan2.1-VACE-1.3B-Preview.sh
@@ -0,0 +1,16 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata_vace.csv \
+  --data_file_keys "video,vace_video,vace_reference_image" \
+  --height 480 \
+  --width 832 \
+  --num_frames 49 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "iic/VACE-Wan2.1-1.3B-Preview:diffusion_pytorch_model*.safetensors,iic/VACE-Wan2.1-1.3B-Preview:models_t5_umt5-xxl-enc-bf16.pth,iic/VACE-Wan2.1-1.3B-Preview:Wan2.1_VAE.pth" \
+  --learning_rate 1e-4 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.vace." \
+  --output_path "./models/train/Wan2.1-VACE-1.3B-Preview_full" \
+  --trainable_models "vace" \
+  --extra_inputs "vace_video,vace_reference_image" \
+  --use_gradient_checkpointing_offload
--- a/examples/wanvideo/model_training/full/Wan2.1-VACE-1.3B.sh
+++ b/examples/wanvideo/model_training/full/Wan2.1-VACE-1.3B.sh
@@ -0,0 +1,16 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata_vace.csv \
+  --data_file_keys "video,vace_video,vace_reference_image" \
+  --height 480 \
+  --width 832 \
+  --num_frames 49 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "Wan-AI/Wan2.1-VACE-1.3B:diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.1-VACE-1.3B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.1-VACE-1.3B:Wan2.1_VAE.pth" \
+  --learning_rate 1e-4 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.vace." \
+  --output_path "./models/train/Wan2.1-VACE-1.3B_full" \
+  --trainable_models "vace" \
+  --extra_inputs "vace_video,vace_reference_image" \
+  --use_gradient_checkpointing_offload
--- a/examples/wanvideo/model_training/full/Wan2.1-VACE-14B.sh
+++ b/examples/wanvideo/model_training/full/Wan2.1-VACE-14B.sh
@@ -0,0 +1,16 @@
+accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata_vace.csv \
+  --data_file_keys "video,vace_video,vace_reference_image" \
+  --height 480 \
+  --width 832 \
+  --num_frames 17 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "Wan-AI/Wan2.1-VACE-14B:diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.1-VACE-14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.1-VACE-14B:Wan2.1_VAE.pth" \
+  --learning_rate 1e-4 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.vace." \
+  --output_path "./models/train/Wan2.1-VACE-14B_full" \
+  --trainable_models "vace" \
+  --extra_inputs "vace_video,vace_reference_image" \
+  --use_gradient_checkpointing_offload
--- a/examples/wanvideo/model_training/full/accelerate_config_14B.yaml
+++ b/examples/wanvideo/model_training/full/accelerate_config_14B.yaml
@@ -0,0 +1,22 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  gradient_accumulation_steps: 1
+  offload_optimizer_device: cpu
+  offload_param_device: cpu
+  zero3_init_flag: false
+  zero_stage: 2
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+enable_cpu_affinity: false
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
--- a/examples/wanvideo/model_training/full/run_test.py
+++ b/examples/wanvideo/model_training/full/run_test.py
@@ -0,0 +1,38 @@
+import multiprocessing, os
+
+
+def run_task(scripts, thread_id, thread_num):
+    for script_id, script in enumerate(scripts):
+        if script_id % thread_num == thread_id:
+            log_file_name = script.replace("/", "_") + ".txt"
+            cmd = f"CUDA_VISIBLE_DEVICES={thread_id} bash {script} > data/log/{log_file_name} 2>&1"
+            os.makedirs("data/log", exist_ok=True)
+            print(cmd, flush=True)
+            os.system(cmd)
+    
+
+if __name__ == "__main__":
+    # 1.3B
+    scripts = []
+    for file_name in os.listdir("examples/wanvideo/model_training/full"):
+        if file_name != "run_test.py" and "14B" not in file_name:
+            scripts.append(os.path.join("examples/wanvideo/model_training/full", file_name))
+
+    processes = [multiprocessing.Process(target=run_task, args=(scripts, i, 8)) for i in range(8)]
+    for p in processes:
+        p.start()
+    for p in processes:
+        p.join()
+    
+    # 14B
+    scripts = []
+    for file_name in os.listdir("examples/wanvideo/model_training/full"):
+        if file_name != "run_test.py" and "14B" in file_name:
+            scripts.append(os.path.join("examples/wanvideo/model_training/full", file_name))
+    for script in scripts:
+        log_file_name = script.replace("/", "_") + ".txt"
+        cmd = f"bash {script} > data/log/{log_file_name} 2>&1"
+        print(cmd, flush=True)
+        os.system(cmd)
+    
+    print("Done!")
--- a/examples/wanvideo/model_training/lora/Wan2.1-1.3b-speedcontrol-v1.sh
+++ b/examples/wanvideo/model_training/lora/Wan2.1-1.3b-speedcontrol-v1.sh
@@ -0,0 +1,15 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata_motion_bucket_id.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "Wan-AI/Wan2.1-T2V-1.3B:diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.1-T2V-1.3B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.1-T2V-1.3B:Wan2.1_VAE.pth,DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1:model.safetensors" \
+  --learning_rate 1e-4 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-1.3b-speedcontrol-v1_lora" \
+  --lora_base_model "dit" \
+  --lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
+  --lora_rank 32 \
+  --extra_inputs "motion_bucket_id"
--- a/examples/wanvideo/model_training/lora/Wan2.1-FLF2V-14B-720P.sh
+++ b/examples/wanvideo/model_training/lora/Wan2.1-FLF2V-14B-720P.sh
@@ -0,0 +1,15 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "Wan-AI/Wan2.1-FLF2V-14B-720P:diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.1-FLF2V-14B-720P:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.1-FLF2V-14B-720P:Wan2.1_VAE.pth,Wan-AI/Wan2.1-FLF2V-14B-720P:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-4 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-FLF2V-14B-720P_lora" \
+  --lora_base_model "dit" \
+  --lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
+  --lora_rank 32 \
+  --extra_inputs "input_image,end_image"
--- a/examples/wanvideo/model_training/lora/Wan2.1-Fun-1.3B-Control.sh
+++ b/examples/wanvideo/model_training/lora/Wan2.1-Fun-1.3B-Control.sh
@@ -0,0 +1,16 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata_control.csv \
+  --data_file_keys "video,control_video" \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "PAI/Wan2.1-Fun-1.3B-Control:diffusion_pytorch_model*.safetensors,PAI/Wan2.1-Fun-1.3B-Control:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.1-Fun-1.3B-Control:Wan2.1_VAE.pth,PAI/Wan2.1-Fun-1.3B-Control:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-4 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-Fun-1.3B-Control_lora" \
+  --lora_base_model "dit" \
+  --lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
+  --lora_rank 32 \
+  --extra_inputs "control_video"
--- a/examples/wanvideo/model_training/lora/Wan2.1-Fun-1.3B-InP.sh
+++ b/examples/wanvideo/model_training/lora/Wan2.1-Fun-1.3B-InP.sh
@@ -0,0 +1,15 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "PAI/Wan2.1-Fun-1.3B-InP:diffusion_pytorch_model*.safetensors,PAI/Wan2.1-Fun-1.3B-InP:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.1-Fun-1.3B-InP:Wan2.1_VAE.pth,PAI/Wan2.1-Fun-1.3B-InP:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-4 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-Fun-1.3B-InP_lora" \
+  --lora_base_model "dit" \
+  --lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
+  --lora_rank 32 \
+  --extra_inputs "input_image,end_image"
--- a/examples/wanvideo/model_training/lora/Wan2.1-Fun-14B-Control.sh
+++ b/examples/wanvideo/model_training/lora/Wan2.1-Fun-14B-Control.sh
@@ -0,0 +1,16 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata_control.csv \
+  --data_file_keys "video,control_video" \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "PAI/Wan2.1-Fun-14B-Control:diffusion_pytorch_model*.safetensors,PAI/Wan2.1-Fun-14B-Control:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.1-Fun-14B-Control:Wan2.1_VAE.pth,PAI/Wan2.1-Fun-14B-Control:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-4 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-Fun-14B-Control_lora" \
+  --lora_base_model "dit" \
+  --lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
+  --lora_rank 32 \
+  --extra_inputs "control_video"
--- a/examples/wanvideo/model_training/lora/Wan2.1-Fun-14B-InP.sh
+++ b/examples/wanvideo/model_training/lora/Wan2.1-Fun-14B-InP.sh
@@ -0,0 +1,15 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "PAI/Wan2.1-Fun-14B-InP:diffusion_pytorch_model*.safetensors,PAI/Wan2.1-Fun-14B-InP:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.1-Fun-14B-InP:Wan2.1_VAE.pth,PAI/Wan2.1-Fun-14B-InP:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-4 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-Fun-14B-InP_lora" \
+  --lora_base_model "dit" \
+  --lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
+  --lora_rank 32 \
+  --extra_inputs "input_image,end_image"
--- a/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh
+++ b/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh
@@ -0,0 +1,15 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata_camera_control.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera:diffusion_pytorch_model*.safetensors,PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera:Wan2.1_VAE.pth,PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-5 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-Fun-V1.1-1.3B-Control-Camera_lora" \
+  --lora_base_model "dit" \
+  --lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
+  --lora_rank 32 \
+  --extra_inputs "input_image,camera_control_direction,camera_control_speed"
--- a/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-Control.sh
+++ b/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-Control.sh
@@ -0,0 +1,16 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata_reference_control.csv \
+  --data_file_keys "video,control_video,reference_image" \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "PAI/Wan2.1-Fun-V1.1-1.3B-Control:diffusion_pytorch_model*.safetensors,PAI/Wan2.1-Fun-V1.1-1.3B-Control:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.1-Fun-V1.1-1.3B-Control:Wan2.1_VAE.pth,PAI/Wan2.1-Fun-V1.1-1.3B-Control:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-4 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-Fun-V1.1-1.3B-Control_lora" \
+  --lora_base_model "dit" \
+  --lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
+  --lora_rank 32 \
+  --extra_inputs "control_video,reference_image"
--- a/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-InP.sh
+++ b/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-InP.sh
@@ -0,0 +1,15 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "PAI/Wan2.1-Fun-V1.1-1.3B-InP:diffusion_pytorch_model*.safetensors,PAI/Wan2.1-Fun-V1.1-1.3B-InP:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.1-Fun-V1.1-1.3B-InP:Wan2.1_VAE.pth,PAI/Wan2.1-Fun-V1.1-1.3B-InP:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-4 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-Fun-V1.1-1.3B-InP_lora" \
+  --lora_base_model "dit" \
+  --lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
+  --lora_rank 32 \
+  --extra_inputs "input_image,end_image"
--- a/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-Control-Camera.sh
+++ b/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-Control-Camera.sh
@@ -0,0 +1,15 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata_camera_control.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "PAI/Wan2.1-Fun-V1.1-14B-Control-Camera:diffusion_pytorch_model*.safetensors,PAI/Wan2.1-Fun-V1.1-14B-Control-Camera:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.1-Fun-V1.1-14B-Control-Camera:Wan2.1_VAE.pth,PAI/Wan2.1-Fun-V1.1-14B-Control-Camera:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-5 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-Fun-V1.1-14B-Control-Camera_lora" \
+  --lora_base_model "dit" \
+  --lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
+  --lora_rank 32 \
+  --extra_inputs "input_image,camera_control_direction,camera_control_speed"
--- a/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-Control.sh
+++ b/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-Control.sh
@@ -0,0 +1,16 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata_reference_control.csv \
+  --data_file_keys "video,control_video,reference_image" \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "PAI/Wan2.1-Fun-V1.1-14B-Control:diffusion_pytorch_model*.safetensors,PAI/Wan2.1-Fun-V1.1-14B-Control:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.1-Fun-V1.1-14B-Control:Wan2.1_VAE.pth,PAI/Wan2.1-Fun-V1.1-14B-Control:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-4 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-Fun-V1.1-14B-Control_lora" \
+  --lora_base_model "dit" \
+  --lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
+  --lora_rank 32 \
+  --extra_inputs "control_video,reference_image"
--- a/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-InP.sh
+++ b/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-InP.sh
@@ -0,0 +1,15 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "PAI/Wan2.1-Fun-V1.1-14B-InP:diffusion_pytorch_model*.safetensors,PAI/Wan2.1-Fun-V1.1-14B-InP:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.1-Fun-V1.1-14B-InP:Wan2.1_VAE.pth,PAI/Wan2.1-Fun-V1.1-14B-InP:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-4 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-Fun-V1.1-14B-InP_lora" \
+  --lora_base_model "dit" \
+  --lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
+  --lora_rank 32 \
+  --extra_inputs "input_image,end_image"
--- a/examples/wanvideo/model_training/lora/Wan2.1-I2V-14B-480P.sh
+++ b/examples/wanvideo/model_training/lora/Wan2.1-I2V-14B-480P.sh
@@ -0,0 +1,15 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "Wan-AI/Wan2.1-I2V-14B-480P:diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.1-I2V-14B-480P:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.1-I2V-14B-480P:Wan2.1_VAE.pth,Wan-AI/Wan2.1-I2V-14B-480P:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-4 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-I2V-14B-480P_lora" \
+  --lora_base_model "dit" \
+  --lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
+  --lora_rank 32 \
+  --extra_inputs "input_image"
--- a/examples/wanvideo/model_training/lora/Wan2.1-I2V-14B-720P.sh
+++ b/examples/wanvideo/model_training/lora/Wan2.1-I2V-14B-720P.sh
@@ -0,0 +1,15 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "Wan-AI/Wan2.1-I2V-14B-720P:diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.1-I2V-14B-720P:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.1-I2V-14B-720P:Wan2.1_VAE.pth,Wan-AI/Wan2.1-I2V-14B-720P:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
+  --learning_rate 1e-4 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-I2V-14B-720P_lora" \
+  --lora_base_model "dit" \
+  --lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
+  --lora_rank 32 \
+  --extra_inputs "input_image"
--- a/examples/wanvideo/model_training/lora/Wan2.1-T2V-1.3B.sh
+++ b/examples/wanvideo/model_training/lora/Wan2.1-T2V-1.3B.sh
@@ -0,0 +1,14 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "Wan-AI/Wan2.1-T2V-1.3B:diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.1-T2V-1.3B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.1-T2V-1.3B:Wan2.1_VAE.pth" \
+  --learning_rate 1e-4 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-T2V-1.3B_lora" \
+  --lora_base_model "dit" \
+  --lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
+  --lora_rank 32
--- a/examples/wanvideo/model_training/lora/Wan2.1-T2V-14B.sh
+++ b/examples/wanvideo/model_training/lora/Wan2.1-T2V-14B.sh
@@ -0,0 +1,14 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "Wan-AI/Wan2.1-T2V-14B:diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.1-T2V-14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.1-T2V-14B:Wan2.1_VAE.pth" \
+  --learning_rate 1e-4 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-T2V-14B_lora" \
+  --lora_base_model "dit" \
+  --lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
+  --lora_rank 32
--- a/examples/wanvideo/model_training/lora/Wan2.1-VACE-1.3B-Preview.sh
+++ b/examples/wanvideo/model_training/lora/Wan2.1-VACE-1.3B-Preview.sh
@@ -0,0 +1,17 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata_vace.csv \
+  --data_file_keys "video,vace_video,vace_reference_image" \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "iic/VACE-Wan2.1-1.3B-Preview:diffusion_pytorch_model*.safetensors,iic/VACE-Wan2.1-1.3B-Preview:models_t5_umt5-xxl-enc-bf16.pth,iic/VACE-Wan2.1-1.3B-Preview:Wan2.1_VAE.pth" \
+  --learning_rate 1e-4 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.vace." \
+  --output_path "./models/train/Wan2.1-VACE-1.3B-Preview_lora" \
+  --lora_base_model "vace" \
+  --lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
+  --lora_rank 32 \
+  --extra_inputs "vace_video,vace_reference_image" \
+  --use_gradient_checkpointing_offload
--- a/examples/wanvideo/model_training/lora/Wan2.1-VACE-1.3B.sh
+++ b/examples/wanvideo/model_training/lora/Wan2.1-VACE-1.3B.sh
@@ -0,0 +1,17 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata_vace.csv \
+  --data_file_keys "video,vace_video,vace_reference_image" \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "Wan-AI/Wan2.1-VACE-1.3B:diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.1-VACE-1.3B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.1-VACE-1.3B:Wan2.1_VAE.pth" \
+  --learning_rate 1e-4 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.vace." \
+  --output_path "./models/train/Wan2.1-VACE-1.3B_lora" \
+  --lora_base_model "vace" \
+  --lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
+  --lora_rank 32 \
+  --extra_inputs "vace_video,vace_reference_image" \
+  --use_gradient_checkpointing_offload
--- a/examples/wanvideo/model_training/lora/Wan2.1-VACE-14B.sh
+++ b/examples/wanvideo/model_training/lora/Wan2.1-VACE-14B.sh
@@ -0,0 +1,18 @@
+accelerate launch examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata_vace.csv \
+  --data_file_keys "video,vace_video,vace_reference_image" \
+  --height 480 \
+  --width 832 \
+  --num_frames 17 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "Wan-AI/Wan2.1-VACE-14B:diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.1-VACE-14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.1-VACE-14B:Wan2.1_VAE.pth" \
+  --learning_rate 1e-4 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.vace." \
+  --output_path "./models/train/Wan2.1-VACE-14B_lora" \
+  --lora_base_model "vace" \
+  --lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
+  --lora_rank 32 \
+  --extra_inputs "vace_video,vace_reference_image" \
+  --use_gradient_checkpointing_offload
--- a/examples/wanvideo/model_training/lora/run_test.py
+++ b/examples/wanvideo/model_training/lora/run_test.py
@@ -0,0 +1,25 @@
+import multiprocessing, os
+
+
+def run_task(scripts, thread_id, thread_num):
+    for script_id, script in enumerate(scripts):
+        if script_id % thread_num == thread_id:
+            log_file_name = script.replace("/", "_") + ".txt"
+            cmd = f"CUDA_VISIBLE_DEVICES={thread_id} bash {script} > data/log/{log_file_name} 2>&1"
+            os.makedirs("data/log", exist_ok=True)
+            print(cmd, flush=True)
+            os.system(cmd)
+    
+
+if __name__ == "__main__":
+    scripts = []
+    for file_name in os.listdir("examples/wanvideo/model_training/lora"):
+        if file_name != "run_test.py":
+            scripts.append(os.path.join("examples/wanvideo/model_training/lora", file_name))
+
+    processes = [multiprocessing.Process(target=run_task, args=(scripts, i, 8)) for i in range(8)]
+    for p in processes:
+        p.start()
+    for p in processes:
+        p.join()
+    print("Done!")
--- a/examples/wanvideo/model_training/train.py
+++ b/examples/wanvideo/model_training/train.py
@@ -0,0 +1,120 @@
+import torch, os, json
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from diffsynth.trainers.utils import DiffusionTrainingModule, VideoDataset, ModelLogger, launch_training_task, wan_parser
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+
+
+class WanTrainingModule(DiffusionTrainingModule):
+    def __init__(
+        self,
+        model_paths=None, model_id_with_origin_paths=None,
+        trainable_models=None,
+        lora_base_model=None, lora_target_modules="q,k,v,o,ffn.0,ffn.2", lora_rank=32,
+        use_gradient_checkpointing=True,
+        use_gradient_checkpointing_offload=False,
+        extra_inputs=None,
+    ):
+        super().__init__()
+        # Load models
+        model_configs = []
+        if model_paths is not None:
+            model_paths = json.loads(model_paths)
+            model_configs += [ModelConfig(path=path) for path in model_paths]
+        if model_id_with_origin_paths is not None:
+            model_id_with_origin_paths = model_id_with_origin_paths.split(",")
+            model_configs += [ModelConfig(model_id=i.split(":")[0], origin_file_pattern=i.split(":")[1]) for i in model_id_with_origin_paths]
+        self.pipe = WanVideoPipeline.from_pretrained(torch_dtype=torch.bfloat16, device="cpu", model_configs=model_configs)
+        
+        # Reset training scheduler
+        self.pipe.scheduler.set_timesteps(1000, training=True)
+        
+        # Freeze untrainable models
+        self.pipe.freeze_except([] if trainable_models is None else trainable_models.split(","))
+        
+        # Add LoRA to the base models
+        if lora_base_model is not None:
+            model = self.add_lora_to_model(
+                getattr(self.pipe, lora_base_model),
+                target_modules=lora_target_modules.split(","),
+                lora_rank=lora_rank
+            )
+            setattr(self.pipe, lora_base_model, model)
+            
+        # Store other configs
+        self.use_gradient_checkpointing = use_gradient_checkpointing
+        self.use_gradient_checkpointing_offload = use_gradient_checkpointing_offload
+        self.extra_inputs = extra_inputs.split(",") if extra_inputs is not None else []
+        
+        
+    def forward_preprocess(self, data):
+        # CFG-sensitive parameters
+        inputs_posi = {"prompt": data["prompt"]}
+        inputs_nega = {}
+        
+        # CFG-unsensitive parameters
+        inputs_shared = {
+            # Assume you are using this pipeline for inference,
+            # please fill in the input parameters.
+            "input_video": data["video"],
+            "height": data["video"][0].size[1],
+            "width": data["video"][0].size[0],
+            "num_frames": len(data["video"]),
+            # Please do not modify the following parameters
+            # unless you clearly know what this will cause.
+            "cfg_scale": 1,
+            "tiled": False,
+            "rand_device": self.pipe.device,
+            "use_gradient_checkpointing": self.use_gradient_checkpointing,
+            "use_gradient_checkpointing_offload": self.use_gradient_checkpointing_offload,
+            "cfg_merge": False,
+            "vace_scale": 1,
+        }
+        
+        # Extra inputs
+        for extra_input in self.extra_inputs:
+            if extra_input == "input_image":
+                inputs_shared["input_image"] = data["video"][0]
+            elif extra_input == "end_image":
+                inputs_shared["end_image"] = data["video"][-1]
+            else:
+                inputs_shared[extra_input] = data[extra_input]
+        
+        # Pipeline units will automatically process the input parameters.
+        for unit in self.pipe.units:
+            inputs_shared, inputs_posi, inputs_nega = self.pipe.unit_runner(unit, self.pipe, inputs_shared, inputs_posi, inputs_nega)
+        return {**inputs_shared, **inputs_posi}
+    
+    
+    def forward(self, data, inputs=None):
+        if inputs is None: inputs = self.forward_preprocess(data)
+        models = {name: getattr(self.pipe, name) for name in self.pipe.in_iteration_models}
+        loss = self.pipe.training_loss(**models, **inputs)
+        return loss
+
+
+if __name__ == "__main__":
+    parser = wan_parser()
+    args = parser.parse_args()
+    dataset = VideoDataset(args=args)
+    model = WanTrainingModule(
+        model_paths=args.model_paths,
+        model_id_with_origin_paths=args.model_id_with_origin_paths,
+        trainable_models=args.trainable_models,
+        lora_base_model=args.lora_base_model,
+        lora_target_modules=args.lora_target_modules,
+        lora_rank=args.lora_rank,
+        use_gradient_checkpointing_offload=args.use_gradient_checkpointing_offload,
+        extra_inputs=args.extra_inputs,
+    )
+    model_logger = ModelLogger(
+        args.output_path,
+        remove_prefix_in_ckpt=args.remove_prefix_in_ckpt
+    )
+    optimizer = torch.optim.AdamW(model.trainable_modules(), lr=args.learning_rate)
+    scheduler = torch.optim.lr_scheduler.ConstantLR(optimizer)
+    launch_training_task(
+        dataset, model, model_logger, optimizer, scheduler,
+        num_epochs=args.num_epochs,
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+    )
--- a/examples/wanvideo/model_training/validate_full/Wan2.1-1.3b-speedcontrol-v1.py
+++ b/examples/wanvideo/model_training/validate_full/Wan2.1-1.3b-speedcontrol-v1.py
@@ -0,0 +1,28 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData, load_state_dict
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1", origin_file_pattern="model.safetensors", offload_device="cpu"),
+    ],
+)
+state_dict = load_state_dict("models/train/Wan2.1-1.3b-speedcontrol-v1_full/epoch-1.safetensors")
+pipe.motion_controller.load_state_dict(state_dict)
+pipe.enable_vram_management()
+
+# Text-to-video
+video = pipe(
+    prompt="from sunset to night, a small town, light, house, river",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    seed=1, tiled=True,
+    motion_bucket_id=50
+)
+save_video(video, "video_Wan2.1-1.3b-speedcontrol-v1.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_training/validate_full/Wan2.1-FLF2V-14B-720P.py
+++ b/examples/wanvideo/model_training/validate_full/Wan2.1-FLF2V-14B-720P.py
@@ -0,0 +1,33 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData, load_state_dict
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+state_dict = load_state_dict("models/train/Wan2.1-FLF2V-14B-720P_full/epoch-1.safetensors")
+pipe.dit.load_state_dict(state_dict)
+pipe.enable_vram_management()
+
+video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832)
+
+# First and last frame to video
+video = pipe(
+    prompt="from sunset to night, a small town, light, house, river",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    input_image=video[0],
+    end_image=video[80],
+    seed=0, tiled=True,
+    sigma_shift=16,
+)
+save_video(video, "video_Wan2.1-FLF2V-14B-720P.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-Control.py
+++ b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-Control.py
@@ -0,0 +1,32 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData, load_state_dict
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+state_dict = load_state_dict("models/train/Wan2.1-Fun-1.3B-Control_full/epoch-1.safetensors")
+pipe.dit.load_state_dict(state_dict)
+pipe.enable_vram_management()
+
+video = VideoData("data/example_video_dataset/video1_softedge.mp4", height=480, width=832)
+video = [video[i] for i in range(81)]
+
+# Control video
+video = pipe(
+    prompt="from sunset to night, a small town, light, house, river",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    control_video=video,
+    seed=1, tiled=True
+)
+save_video(video, "video_Wan2.1-Fun-1.3B-Control.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-InP.py
+++ b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-InP.py
@@ -0,0 +1,31 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData, load_state_dict
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+state_dict = load_state_dict("models/train/Wan2.1-Fun-1.3B-InP_full/epoch-1.safetensors")
+pipe.dit.load_state_dict(state_dict)
+pipe.enable_vram_management()
+
+video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832)
+
+# First and last frame to video
+video = pipe(
+    prompt="from sunset to night, a small town, light, house, river",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    input_image=video[0], end_image=video[80],
+    seed=0, tiled=True
+)
+save_video(video, "video_Wan2.1-Fun-1.3B-InP.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-Control.py
+++ b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-Control.py
@@ -0,0 +1,32 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData, load_state_dict
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+state_dict = load_state_dict("models/train/Wan2.1-Fun-14B-Control_full/epoch-1.safetensors")
+pipe.dit.load_state_dict(state_dict)
+pipe.enable_vram_management()
+
+video = VideoData("data/example_video_dataset/video1_softedge.mp4", height=480, width=832)
+video = [video[i] for i in range(81)]
+
+# Control video
+video = pipe(
+    prompt="from sunset to night, a small town, light, house, river",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    control_video=video,
+    seed=1, tiled=True
+)
+save_video(video, "video_Wan2.1-Fun-14B-Control.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-InP.py
+++ b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-InP.py
@@ -0,0 +1,31 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData, load_state_dict
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+state_dict = load_state_dict("models/train/Wan2.1-Fun-14B-InP_full/epoch-1.safetensors")
+pipe.dit.load_state_dict(state_dict)
+pipe.enable_vram_management()
+
+video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832)
+
+# First and last frame to video
+video = pipe(
+    prompt="from sunset to night, a small town, light, house, river",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    input_image=video[0], end_image=video[80],
+    seed=0, tiled=True
+)
+save_video(video, "video_Wan2.1-Fun-14B-InP.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py
+++ b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py
@@ -0,0 +1,32 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData, load_state_dict
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+state_dict = load_state_dict("models/train/Wan2.1-Fun-V1.1-1.3B-Control-Camera_full/epoch-1.safetensors")
+pipe.dit.load_state_dict(state_dict)
+pipe.enable_vram_management()
+
+video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832)
+
+# First and last frame to video
+video = pipe(
+    prompt="from sunset to night, a small town, light, house, river",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    input_image=video[0],
+    camera_control_direction="Left", camera_control_speed=0.0,
+    seed=0, tiled=True
+)
+save_video(video, "video_Wan2.1-Fun-V1.1-1.3B-Control-Camera.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control.py
+++ b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control.py
@@ -0,0 +1,33 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData, load_state_dict
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+state_dict = load_state_dict("models/train/Wan2.1-Fun-V1.1-1.3B-Control_full/epoch-1.safetensors")
+pipe.dit.load_state_dict(state_dict)
+pipe.enable_vram_management()
+
+video = VideoData("data/example_video_dataset/video1_softedge.mp4", height=480, width=832)
+video = [video[i] for i in range(81)]
+reference_image = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832)[0]
+
+# Control video
+video = pipe(
+    prompt="from sunset to night, a small town, light, house, river",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    control_video=video, reference_image=reference_image,
+    seed=1, tiled=True
+)
+save_video(video, "video_Wan2.1-Fun-V1.1-1.3B-Control.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-InP.py
+++ b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-InP.py
@@ -0,0 +1,31 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData, load_state_dict
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-InP", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-InP", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+state_dict = load_state_dict("models/train/Wan2.1-Fun-V1.1-1.3B-InP_full/epoch-1.safetensors")
+pipe.dit.load_state_dict(state_dict)
+pipe.enable_vram_management()
+
+video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832)
+
+# First and last frame to video
+video = pipe(
+    prompt="from sunset to night, a small town, light, house, river",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    input_image=video[0], end_image=video[80],
+    seed=0, tiled=True
+)
+save_video(video, "video_Wan2.1-Fun-V1.1-1.3B-InP.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control-Camera.py
+++ b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control-Camera.py
@@ -0,0 +1,32 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData, load_state_dict
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+state_dict = load_state_dict("models/train/Wan2.1-Fun-V1.1-14B-Control-Camera_full/epoch-1.safetensors")
+pipe.dit.load_state_dict(state_dict)
+pipe.enable_vram_management()
+
+video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832)
+
+# First and last frame to video
+video = pipe(
+    prompt="from sunset to night, a small town, light, house, river",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    input_image=video[0],
+    camera_control_direction="Left", camera_control_speed=0.0,
+    seed=0, tiled=True
+)
+save_video(video, "video_Wan2.1-Fun-V1.1-14B-Control-Camera.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control.py
+++ b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control.py
@@ -0,0 +1,33 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData, load_state_dict
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+state_dict = load_state_dict("models/train/Wan2.1-Fun-V1.1-14B-Control_full/epoch-1.safetensors")
+pipe.dit.load_state_dict(state_dict)
+pipe.enable_vram_management()
+
+video = VideoData("data/example_video_dataset/video1_softedge.mp4", height=480, width=832)
+video = [video[i] for i in range(81)]
+reference_image = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832)[0]
+
+# Control video
+video = pipe(
+    prompt="from sunset to night, a small town, light, house, river",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    control_video=video, reference_image=reference_image,
+    seed=1, tiled=True
+)
+save_video(video, "video_Wan2.1-Fun-V1.1-14B-Control.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-InP.py
+++ b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-InP.py
@@ -0,0 +1,31 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData, load_state_dict
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-InP", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-InP", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+state_dict = load_state_dict("models/train/Wan2.1-Fun-V1.1-14B-InP_full/epoch-1.safetensors")
+pipe.dit.load_state_dict(state_dict)
+pipe.enable_vram_management()
+
+video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832)
+
+# First and last frame to video
+video = pipe(
+    prompt="from sunset to night, a small town, light, house, river",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    input_image=video[0], end_image=video[80],
+    seed=0, tiled=True
+)
+save_video(video, "video_Wan2.1-Fun-V1.1-14B-InP.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-480P.py
+++ b/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-480P.py
@@ -0,0 +1,30 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData, load_state_dict
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+state_dict = load_state_dict("models/train/Wan2.1-I2V-14B-480P_full/epoch-1.safetensors")
+pipe.dit.load_state_dict(state_dict)
+pipe.enable_vram_management()
+
+input_image = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832)[0]
+
+video = pipe(
+    prompt="from sunset to night, a small town, light, house, river",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    input_image=input_image,
+    seed=1, tiled=True
+)
+save_video(video, "video_Wan2.1-I2V-14B-480P.mp4", fps=15, quality=5)
--- a/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-720P.py
+++ b/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-720P.py
@@ -0,0 +1,30 @@
+import torch
+from PIL import Image
+from diffsynth import save_video, VideoData, load_state_dict
+from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+pipe = WanVideoPipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+    ],
+)
+state_dict = load_state_dict("models/train/Wan2.1-I2V-14B-720P_full/epoch-1.safetensors")
+pipe.dit.load_state_dict(state_dict)
+pipe.enable_vram_management()
+
+input_image = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832)[0]
+
+video = pipe(
+    prompt="from sunset to night, a small town, light, house, river",
+    negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+    input_image=input_image,
+    seed=1, tiled=True
+)
+save_video(video, "video_Wan2.1-I2V-14B-720P.mp4", fps=15, quality=5)
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Artiprocher	040e7fda49	update wan training	2025-06-16 15:46:20 +08:00
Artiprocher	c0706e3fbd	support torch<2.6.0	2025-06-16 13:05:54 +08:00
Artiprocher	3d2b51554a	update torch version	2025-06-16 12:35:21 +08:00
Artiprocher	9f3e02f167	fix model id	2025-06-16 10:57:33 +08:00
Artiprocher	b6a111d3a2	refine readme	2025-06-13 14:13:38 +08:00
Artiprocher	6eecc9d442	refine readme	2025-06-13 14:02:20 +08:00
Artiprocher	35269783d7	refine readme	2025-06-13 14:00:58 +08:00
Artiprocher	830b1b7202	wan-refactor	2025-06-13 13:46:17 +08:00
Zhongjie Duan	436a91e0c9	Merge pull request #602 from modelscope/revert-601-wan-refactor Revert "Wan refactor"	2025-06-11 17:30:06 +08:00
Zhongjie Duan	40760ab88b	Revert "Wan refactor"	2025-06-11 17:29:27 +08:00
CD22104	8badd63a2d	Merge pull request #601 from CD22104/wan-refactor Wan refactor	2025-06-11 17:26:58 +08:00
CD22104	b1afff1728	camera	2025-06-11 17:24:09 +08:00
Artiprocher	6e977e1181	refine wan doc	2025-06-06 15:19:09 +08:00
Artiprocher	62f6ca2b8a	new wan trainer	2025-06-06 14:58:41 +08:00
Artiprocher	8f10a9c353	training script	2025-05-19 19:02:52 +08:00
Artiprocher	675eefa07e	training framework	2025-05-12 17:48:28 +08:00
Artiprocher	dbef6122e9	...	2025-05-05 23:23:06 +08:00
Artiprocher	d150bcf622	...	2025-05-05 13:01:45 +08:00
Artiprocher	451aab0116	refactor	2025-05-04 15:42:11 +08:00
Artiprocher	3edf3583b1	wan-fun-v1.1 reference control	2025-04-30 11:38:17 +08:00
Zhongjie Duan	ef2a7abad4	Step1x vram (#556 ) * support step1x vram management	2025-04-28 10:13:20 +08:00
Zhongjie Duan	32f630ff5f	Merge pull request #555 from modelscope/step1x support step1x	2025-04-27 20:40:43 +08:00
Artiprocher	109a0a0d49	support step1x	2025-04-27 20:37:43 +08:00
Zhongjie Duan	4f01b37a2a	Merge pull request #553 from modelscope/flex Flex	2025-04-25 12:24:18 +08:00
Artiprocher	cc6306136c	flex full support	2025-04-25 12:23:29 +08:00
Artiprocher	419ace37f3	flex full support	2025-04-25 11:32:13 +08:00
Artiprocher	ccf24c363f	flex control	2025-04-24 19:18:54 +08:00
Artiprocher	b7a1ac6671	flex t2i	2025-04-24 14:51:40 +08:00
Zhongjie Duan	e54c0a8468	Merge pull request #548 from CD22104/main liblib-controlnet	2025-04-22 14:54:16 +08:00
xuyixuan.xyx	5f4cb32255	liblib-controlnet	2025-04-22 13:45:49 +08:00
Zhongjie Duan	7b6cf39618	Merge pull request #544 from modelscope/Artiprocher-patch-1 Update train_wan_t2v.py	2025-04-17 15:39:44 +08:00
Zhongjie Duan	bf81de0c88	Update train_wan_t2v.py	2025-04-17 15:37:30 +08:00
Zhongjie Duan	b36cad6929	Merge pull request #543 from modelscope/wan-flf2v bugfix	2025-04-17 15:24:36 +08:00
Zhongjie Duan	b161bd6dfd	bugfix	2025-04-17 15:23:46 +08:00
Zhongjie Duan	538cfcbb77	Merge pull request #541 from modelscope/wan-flf2v Wan flf2v	2025-04-17 14:51:08 +08:00
Artiprocher	a4105d2c0e	support wan-flf2v	2025-04-17 14:48:55 +08:00
Artiprocher	553b341f5f	support wan-flf2v	2025-04-17 14:47:55 +08:00
Zhongjie Duan	e9e24b8cf1	Merge pull request #537 from CD22104/main issue523	2025-04-16 15:53:39 +08:00
CD22104	1b693d0028	issue523	2025-04-16 15:49:52 +08:00
Zhongjie Duan	a4c3c07229	Merge pull request #536 from modelscope/wan-vace-quant support vace quant	2025-04-16 10:43:14 +08:00
Artiprocher	6b24748c80	support vace quant	2025-04-16 10:29:21 +08:00
Zhongjie Duan	8f2f8646eb	Merge pull request #526 from mohui37/main Update train_wan_t2v.py	2025-04-16 09:55:19 +08:00
Zhongjie Duan	e3ac438f5a	Merge pull request #533 from modelscope/wan-vace vace	2025-04-15 18:47:36 +08:00
Artiprocher	b731628112	vace	2025-04-15 17:52:25 +08:00
mohui37	0dc56d9dcc	Update train_wan_t2v.py 在应用itv的管道处理数据时有bug，提交修复	2025-04-11 17:05:40 +08:00
Zhongjie Duan	b925b402e2	Merge pull request #522 from modelscope/Artiprocher-patch-1 Update README.md	2025-04-10 11:42:32 +08:00
Zhongjie Duan	61d9653536	Update README.md	2025-04-10 11:42:18 +08:00