Merge branch 'main' into wanvideo_seq_usp

2026-03-18 22:08:13 +00:00 · 2025-07-30 16:44:44 +08:00
parent 00279a8375 db124fa6bc
commit aef982a53c
106 changed files with 6696 additions and 697 deletions
--- a/diffsynth/configs/model_config.py
+++ b/diffsynth/configs/model_config.py
@@ -58,14 +58,19 @@ from ..models.stepvideo_dit import StepVideoModel
 from ..models.wan_video_dit import WanModel
 from ..models.wan_video_text_encoder import WanTextEncoder
 from ..models.wan_video_image_encoder import WanImageEncoder
-from ..models.wan_video_vae import WanVideoVAE
+from ..models.wan_video_vae import WanVideoVAE, WanVideoVAE38
 from ..models.wan_video_motion_controller import WanMotionControllerModel
 from ..models.wan_video_vace import VaceWanModel

 from ..models.step1x_connector import Qwen2Connector

-from ..lora.flux_lora import FluxLoraPatcher
+from ..models.flux_value_control import SingleValueEncoder

+from ..lora.flux_lora import FluxLoraPatcher
+from ..models.flux_lora_encoder import FluxLoRAEncoder
+
+from ..models.nexus_gen_projector import NexusGenAdapter, NexusGenImageEmbeddingMerger
+from ..models.nexus_gen import NexusGenAutoregressiveModel

 model_loader_configs = [
    # These configs are provided for detecting model type automatically.
@@ -104,6 +109,7 @@ model_loader_configs = [
    (None, "023f054d918a84ccf503481fd1e3379e", ["flux_dit"], [FluxDiT], "civitai"),
    (None, "d02f41c13549fa5093d3521f62a5570a", ["flux_dit"], [FluxDiT], "civitai"),
    (None, "605c56eab23e9e2af863ad8f0813a25d", ["flux_dit"], [FluxDiT], "diffusers"),
+    (None, "0629116fce1472503a66992f96f3eb1a", ["flux_value_controller"], [SingleValueEncoder], "civitai"),
    (None, "280189ee084bca10f70907bf6ce1649d", ["cog_vae_encoder", "cog_vae_decoder"], [CogVAEEncoder, CogVAEDecoder], "diffusers"),
    (None, "9b9313d104ac4df27991352fec013fd4", ["rife"], [IFNet], "civitai"),
    (None, "6b7116078c4170bfbeaedc8fe71f6649", ["esrgan"], [RRDBNet], "civitai"),
@@ -137,6 +143,8 @@ model_loader_configs = [
    (None, "26bde73488a92e64cc20b0a7485b9e5b", ["wan_video_dit"], [WanModel], "civitai"),
    (None, "ac6a5aa74f4a0aab6f64eb9a72f19901", ["wan_video_dit"], [WanModel], "civitai"), 
    (None, "b61c605c2adbd23124d152ed28e049ae", ["wan_video_dit"], [WanModel], "civitai"), 
+    (None, "1f5ab7703c6fc803fdded85ff040c316", ["wan_video_dit"], [WanModel], "civitai"),
+    (None, "5b013604280dd715f8457c6ed6d6a626", ["wan_video_dit"], [WanModel], "civitai"),
    (None, "a61453409b67cd3246cf0c3bebad47ba", ["wan_video_dit", "wan_video_vace"], [WanModel, VaceWanModel], "civitai"),
    (None, "7a513e1f257a861512b1afd387a8ecd9", ["wan_video_dit", "wan_video_vace"], [WanModel, VaceWanModel], "civitai"),
    (None, "cb104773c6c2cb6df4f9529ad5c60d0b", ["wan_video_dit"], [WanModel], "diffusers"),
@@ -144,9 +152,14 @@ model_loader_configs = [
    (None, "5941c53e207d62f20f9025686193c40b", ["wan_video_image_encoder"], [WanImageEncoder], "civitai"),
    (None, "1378ea763357eea97acdef78e65d6d96", ["wan_video_vae"], [WanVideoVAE], "civitai"),
    (None, "ccc42284ea13e1ad04693284c7a09be6", ["wan_video_vae"], [WanVideoVAE], "civitai"),
+    (None, "e1de6c02cdac79f8b739f4d3698cd216", ["wan_video_vae"], [WanVideoVAE38], "civitai"),
    (None, "dbd5ec76bbf977983f972c151d545389", ["wan_video_motion_controller"], [WanMotionControllerModel], "civitai"),
    (None, "d30fb9e02b1dbf4e509142f05cf7dd50", ["flux_dit", "step1x_connector"], [FluxDiT, Qwen2Connector], "civitai"),
    (None, "30143afb2dea73d1ac580e0787628f8c", ["flux_lora_patcher"], [FluxLoraPatcher], "civitai"),
+    (None, "77c2e4dd2440269eb33bfaa0d004f6ab", ["flux_lora_encoder"], [FluxLoRAEncoder], "civitai"),
+    (None, "3e6c61b0f9471135fc9c6d6a98e98b6d", ["flux_dit", "nexus_gen_generation_adapter"], [FluxDiT, NexusGenAdapter], "civitai"),
+    (None, "63c969fd37cce769a90aa781fbff5f81", ["flux_dit", "nexus_gen_editing_adapter"], [FluxDiT, NexusGenImageEmbeddingMerger], "civitai"),
+    (None, "2bd19e845116e4f875a0a048e27fc219", ["nexus_gen_llm"], [NexusGenAutoregressiveModel], "civitai"),
 ]
 huggingface_model_loader_configs = [
    # These configs are provided for detecting model type automatically.
--- a/diffsynth/lora/flux_lora.py
+++ b/diffsynth/lora/flux_lora.py
@@ -1,18 +1,58 @@
 import torch, math
-from diffsynth.lora import GeneralLoRALoader
-from diffsynth.models.lora import FluxLoRAFromCivitai
+from . import GeneralLoRALoader
+from ..utils import ModelConfig
+from ..models.utils import load_state_dict
+from typing import Union


 class FluxLoRALoader(GeneralLoRALoader):
    def __init__(self, device="cpu", torch_dtype=torch.float32):
        super().__init__(device=device, torch_dtype=torch_dtype)
+    
+        self.diffusers_rename_dict = {
+            "transformer.single_transformer_blocks.blockid.attn.to_k.lora_A.weight":"single_blocks.blockid.a_to_k.lora_A.default.weight",
+            "transformer.single_transformer_blocks.blockid.attn.to_k.lora_B.weight":"single_blocks.blockid.a_to_k.lora_B.default.weight",
+            "transformer.single_transformer_blocks.blockid.attn.to_q.lora_A.weight":"single_blocks.blockid.a_to_q.lora_A.default.weight",
+            "transformer.single_transformer_blocks.blockid.attn.to_q.lora_B.weight":"single_blocks.blockid.a_to_q.lora_B.default.weight",
+            "transformer.single_transformer_blocks.blockid.attn.to_v.lora_A.weight":"single_blocks.blockid.a_to_v.lora_A.default.weight",
+            "transformer.single_transformer_blocks.blockid.attn.to_v.lora_B.weight":"single_blocks.blockid.a_to_v.lora_B.default.weight",
+            "transformer.single_transformer_blocks.blockid.norm.linear.lora_A.weight":"single_blocks.blockid.norm.linear.lora_A.default.weight",
+            "transformer.single_transformer_blocks.blockid.norm.linear.lora_B.weight":"single_blocks.blockid.norm.linear.lora_B.default.weight",
+            "transformer.single_transformer_blocks.blockid.proj_mlp.lora_A.weight":"single_blocks.blockid.proj_in_besides_attn.lora_A.default.weight",
+            "transformer.single_transformer_blocks.blockid.proj_mlp.lora_B.weight":"single_blocks.blockid.proj_in_besides_attn.lora_B.default.weight",
+            "transformer.single_transformer_blocks.blockid.proj_out.lora_A.weight":"single_blocks.blockid.proj_out.lora_A.default.weight",
+            "transformer.single_transformer_blocks.blockid.proj_out.lora_B.weight":"single_blocks.blockid.proj_out.lora_B.default.weight",
+            "transformer.transformer_blocks.blockid.attn.add_k_proj.lora_A.weight":"blocks.blockid.attn.b_to_k.lora_A.default.weight",
+            "transformer.transformer_blocks.blockid.attn.add_k_proj.lora_B.weight":"blocks.blockid.attn.b_to_k.lora_B.default.weight",
+            "transformer.transformer_blocks.blockid.attn.add_q_proj.lora_A.weight":"blocks.blockid.attn.b_to_q.lora_A.default.weight",
+            "transformer.transformer_blocks.blockid.attn.add_q_proj.lora_B.weight":"blocks.blockid.attn.b_to_q.lora_B.default.weight",
+            "transformer.transformer_blocks.blockid.attn.add_v_proj.lora_A.weight":"blocks.blockid.attn.b_to_v.lora_A.default.weight",
+            "transformer.transformer_blocks.blockid.attn.add_v_proj.lora_B.weight":"blocks.blockid.attn.b_to_v.lora_B.default.weight",
+            "transformer.transformer_blocks.blockid.attn.to_add_out.lora_A.weight":"blocks.blockid.attn.b_to_out.lora_A.default.weight",
+            "transformer.transformer_blocks.blockid.attn.to_add_out.lora_B.weight":"blocks.blockid.attn.b_to_out.lora_B.default.weight",
+            "transformer.transformer_blocks.blockid.attn.to_k.lora_A.weight":"blocks.blockid.attn.a_to_k.lora_A.default.weight",
+            "transformer.transformer_blocks.blockid.attn.to_k.lora_B.weight":"blocks.blockid.attn.a_to_k.lora_B.default.weight",
+            "transformer.transformer_blocks.blockid.attn.to_out.0.lora_A.weight":"blocks.blockid.attn.a_to_out.lora_A.default.weight",
+            "transformer.transformer_blocks.blockid.attn.to_out.0.lora_B.weight":"blocks.blockid.attn.a_to_out.lora_B.default.weight",
+            "transformer.transformer_blocks.blockid.attn.to_q.lora_A.weight":"blocks.blockid.attn.a_to_q.lora_A.default.weight",
+            "transformer.transformer_blocks.blockid.attn.to_q.lora_B.weight":"blocks.blockid.attn.a_to_q.lora_B.default.weight",
+            "transformer.transformer_blocks.blockid.attn.to_v.lora_A.weight":"blocks.blockid.attn.a_to_v.lora_A.default.weight",
+            "transformer.transformer_blocks.blockid.attn.to_v.lora_B.weight":"blocks.blockid.attn.a_to_v.lora_B.default.weight",
+            "transformer.transformer_blocks.blockid.ff.net.0.proj.lora_A.weight":"blocks.blockid.ff_a.0.lora_A.default.weight",
+            "transformer.transformer_blocks.blockid.ff.net.0.proj.lora_B.weight":"blocks.blockid.ff_a.0.lora_B.default.weight",
+            "transformer.transformer_blocks.blockid.ff.net.2.lora_A.weight":"blocks.blockid.ff_a.2.lora_A.default.weight",
+            "transformer.transformer_blocks.blockid.ff.net.2.lora_B.weight":"blocks.blockid.ff_a.2.lora_B.default.weight",
+            "transformer.transformer_blocks.blockid.ff_context.net.0.proj.lora_A.weight":"blocks.blockid.ff_b.0.lora_A.default.weight",
+            "transformer.transformer_blocks.blockid.ff_context.net.0.proj.lora_B.weight":"blocks.blockid.ff_b.0.lora_B.default.weight",
+            "transformer.transformer_blocks.blockid.ff_context.net.2.lora_A.weight":"blocks.blockid.ff_b.2.lora_A.default.weight",
+            "transformer.transformer_blocks.blockid.ff_context.net.2.lora_B.weight":"blocks.blockid.ff_b.2.lora_B.default.weight",
+            "transformer.transformer_blocks.blockid.norm1.linear.lora_A.weight":"blocks.blockid.norm1_a.linear.lora_A.default.weight",
+            "transformer.transformer_blocks.blockid.norm1.linear.lora_B.weight":"blocks.blockid.norm1_a.linear.lora_B.default.weight",
+            "transformer.transformer_blocks.blockid.norm1_context.linear.lora_A.weight":"blocks.blockid.norm1_b.linear.lora_A.default.weight",
+            "transformer.transformer_blocks.blockid.norm1_context.linear.lora_B.weight":"blocks.blockid.norm1_b.linear.lora_B.default.weight",
+        }

-    def load(self, model: torch.nn.Module, state_dict_lora, alpha=1.0):
-        super().load(model, state_dict_lora, alpha)
-        
-    def convert_state_dict(self, state_dict):
-        # TODO: support other lora format
-        rename_dict = {
+        self.civitai_rename_dict = {
            "lora_unet_double_blocks_blockid_img_mod_lin.lora_down.weight": "blocks.blockid.norm1_a.linear.lora_A.default.weight",
            "lora_unet_double_blocks_blockid_img_mod_lin.lora_up.weight": "blocks.blockid.norm1_a.linear.lora_B.default.weight",
            "lora_unet_double_blocks_blockid_txt_mod_lin.lora_down.weight": "blocks.blockid.norm1_b.linear.lora_A.default.weight",
@@ -40,25 +80,57 @@ class FluxLoRALoader(GeneralLoRALoader):
            "lora_unet_single_blocks_blockid_linear2.lora_down.weight": "single_blocks.blockid.proj_out.lora_A.default.weight",
            "lora_unet_single_blocks_blockid_linear2.lora_up.weight": "single_blocks.blockid.proj_out.lora_B.default.weight",
        }
-        def guess_block_id(name):
-            names = name.split("_")
-            for i in names:
-                if i.isdigit():
-                    return i, name.replace(f"_{i}_", "_blockid_")
+
+    def load(self, model: torch.nn.Module, state_dict_lora, alpha=1.0):
+        super().load(model, state_dict_lora, alpha)
+
+    
+    def convert_state_dict(self,state_dict):
+
+        def guess_block_id(name,model_resource):
+            if model_resource == 'civitai':
+                names = name.split("_")
+                for i in names:
+                    if i.isdigit():
+                        return i, name.replace(f"_{i}_", "_blockid_")
+            if model_resource == 'diffusers':
+                names = name.split(".")
+                for i in names:
+                    if i.isdigit():
+                        return i, name.replace(f"transformer_blocks.{i}.", "transformer_blocks.blockid.")
            return None, None
+
+        def guess_resource(state_dict):
+            """Guess the LoRA key layout ('civitai' or 'diffusers') from the key names; None if unknown."""
+            for k in state_dict:
+                if "lora_unet_" in k:
+                    return 'civitai'
+                if k.startswith("transformer."):
+                    return 'diffusers'
+            return None
+        
+        model_resource = guess_resource(state_dict)
+        if model_resource is None:
+            return state_dict
+
+        rename_dict = self.diffusers_rename_dict if model_resource == 'diffusers' else self.civitai_rename_dict
        def guess_alpha(state_dict):
-            for name, param in state_dict.items():
-                if ".alpha" in name:
-                    name_ = name.replace(".alpha", ".lora_down.weight")
-                    if name_ in state_dict:
-                        lora_alpha = param.item() / state_dict[name_].shape[0]
-                        lora_alpha = math.sqrt(lora_alpha)
-                        return lora_alpha
-            return 1
+                for name, param in state_dict.items():
+                    if ".alpha" in name:
+                        for suffix in [".lora_down.weight", ".lora_A.weight"]:
+                            name_ = name.replace(".alpha", suffix)
+                            if name_ in state_dict:
+                                lora_alpha = param.item() / state_dict[name_].shape[0]
+                                lora_alpha = math.sqrt(lora_alpha)
+                                return lora_alpha
+
+                return 1
+        
        alpha = guess_alpha(state_dict)
+        
        state_dict_ = {}
        for name, param in state_dict.items():
-            block_id, source_name = guess_block_id(name)
+            block_id, source_name = guess_block_id(name,model_resource)
            if alpha != 1:
                param *= alpha
            if source_name in rename_dict:
@@ -67,6 +139,72 @@ class FluxLoRALoader(GeneralLoRALoader):
                state_dict_[target_name] = param
            else:
                state_dict_[name] = param
+        
+        if model_resource == 'diffusers':
+            for name in list(state_dict_.keys()):
+                if "single_blocks." in name and ".a_to_q." in name:
+                    mlp = state_dict_.get(name.replace(".a_to_q.", ".proj_in_besides_attn."), None)
+                    if mlp is None:
+                        dim = 4
+                        if 'lora_A' in name:
+                            dim = 1
+                        mlp = torch.zeros(dim * state_dict_[name].shape[0],
+                                        *state_dict_[name].shape[1:],
+                                        dtype=state_dict_[name].dtype)
+                    else:
+                        state_dict_.pop(name.replace(".a_to_q.", ".proj_in_besides_attn."))
+                    if 'lora_A' in name:
+                        param = torch.concat([
+                            state_dict_.pop(name),
+                            state_dict_.pop(name.replace(".a_to_q.", ".a_to_k.")),
+                            state_dict_.pop(name.replace(".a_to_q.", ".a_to_v.")),
+                            mlp,
+                        ], dim=0)
+                    elif 'lora_B' in name:
+                        d, r = state_dict_[name].shape
+                        param = torch.zeros((3*d+mlp.shape[0], 3*r+mlp.shape[1]), dtype=state_dict_[name].dtype, device=state_dict_[name].device)
+                        param[:d, :r] = state_dict_.pop(name)
+                        param[d:2*d, r:2*r] = state_dict_.pop(name.replace(".a_to_q.", ".a_to_k."))
+                        param[2*d:3*d, 2*r:3*r] = state_dict_.pop(name.replace(".a_to_q.", ".a_to_v."))
+                        param[3*d:, 3*r:] = mlp
+                    else:
+                        param = torch.concat([
+                            state_dict_.pop(name),
+                            state_dict_.pop(name.replace(".a_to_q.", ".a_to_k.")),
+                            state_dict_.pop(name.replace(".a_to_q.", ".a_to_v.")),
+                            mlp,
+                        ], dim=0)
+                    name_ = name.replace(".a_to_q.", ".to_qkv_mlp.")
+                    state_dict_[name_] = param
+            for name in list(state_dict_.keys()):
+                for component in ["a", "b"]:
+                    if f".{component}_to_q." in name:
+                        name_ = name.replace(f".{component}_to_q.", f".{component}_to_qkv.")
+                        concat_dim = 0
+                        if 'lora_A' in name:
+                            param = torch.concat([
+                                state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_q.")],
+                                state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_k.")],
+                                state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_v.")],
+                            ], dim=0)
+                        elif 'lora_B' in name:
+                            origin = state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_q.")]
+                            d, r = origin.shape
+                            # print(d, r)
+                            param = torch.zeros((3*d, 3*r), dtype=origin.dtype, device=origin.device)
+                            param[:d, :r] = state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_q.")]
+                            param[d:2*d, r:2*r] = state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_k.")]
+                            param[2*d:3*d, 2*r:3*r] = state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_v.")]
+                        else:
+                            param = torch.concat([
+                                state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_q.")],
+                                state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_k.")],
+                                state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_v.")],
+                            ], dim=0)
+                        state_dict_[name_] = param
+                        state_dict_.pop(name.replace(f".{component}_to_q.", f".{component}_to_q."))
+                        state_dict_.pop(name.replace(f".{component}_to_q.", f".{component}_to_k."))
+                        state_dict_.pop(name.replace(f".{component}_to_q.", f".{component}_to_v."))  
        return state_dict_


@@ -140,3 +278,47 @@ class FluxLoraPatcherStateDictConverter:
    
    def from_civitai(self, state_dict):
        return state_dict
+
+
+class FluxLoRAFuser:
+    """Fuse several FLUX LoRAs into one by summing their B @ A products and re-factorizing at rank q."""
+    def __init__(self, device="cuda", torch_dtype=torch.bfloat16):
+        self.device = device
+        self.torch_dtype = torch_dtype
+
+    def Matrix_Decomposition_lowrank(self, A, k):
+        """Return (U @ diag(S), V^T) from torch.svd_lowrank(A, q=k), computed in float32."""
+        U, S, V = torch.svd_lowrank(A.float(), q=k)
+        return U @ torch.diag(S[:k]), V.t()
+
+    def LoRA_State_Dicts_Decomposition(self, lora_state_dicts=None, q=4):
+        """Sum B @ A per layer over all LoRAs (layers come from the first one) and refit a rank-q pair."""
+        if not lora_state_dicts:  # None or empty list: nothing to fuse
+            return {}
+        state_dict_ = {}
+        for k, v in lora_state_dicts[0].items():
+            if 'lora_A.' not in k:
+                continue
+            lora_B_name = k.replace('lora_A.', 'lora_B.')
+            weight = torch.mm(lora_state_dicts[0][lora_B_name], v)
+            for lora_dict in lora_state_dicts[1:]:
+                weight += torch.mm(lora_dict[lora_B_name], lora_dict[k])
+            new_B, new_A = self.Matrix_Decomposition_lowrank(weight, q)
+            # NOTE(review): the result is always cast to bfloat16, not self.torch_dtype - confirm intended.
+            state_dict_[lora_B_name] = new_B.to(dtype=torch.bfloat16)
+            state_dict_[k] = new_A.to(dtype=torch.bfloat16)
+        return state_dict_
+
+    def __call__(self, lora_configs: list[Union[ModelConfig, str]]):
+        """Load every LoRA (file path or ModelConfig), convert its keys, and return the fused state dict."""
+        loader = FluxLoRALoader(torch_dtype=self.torch_dtype, device=self.device)
+        loras = []
+        for lora_config in lora_configs:
+            if isinstance(lora_config, str):
+                path = lora_config
+            else:
+                lora_config.download_if_necessary()
+                path = lora_config.path
+            lora = load_state_dict(path, torch_dtype=self.torch_dtype, device=self.device)
+            loras.append(loader.convert_state_dict(lora))
+        return self.LoRA_State_Dicts_Decomposition(loras)
--- a/diffsynth/models/flux_dit.py
+++ b/diffsynth/models/flux_dit.py
@@ -2,7 +2,7 @@ import torch
 from .sd3_dit import TimestepEmbeddings, AdaLayerNorm, RMSNorm
 from einops import rearrange
 from .tiler import TileWorker
-from .utils import init_weights_on_device
+from .utils import init_weights_on_device, hash_state_dict_keys

 def interact_with_ipadapter(hidden_states, q, ip_k, ip_v, scale=1.0):
    batch_size, num_tokens = hidden_states.shape[0:2]
@@ -662,6 +662,9 @@ class FluxDiTStateDictConverter:
        return state_dict_

    def from_civitai(self, state_dict):
+        if hash_state_dict_keys(state_dict, with_shape=True) in ["3e6c61b0f9471135fc9c6d6a98e98b6d", "63c969fd37cce769a90aa781fbff5f81"]:
+            dit_state_dict = {key.replace("pipe.dit.", ""): value for key, value in state_dict.items() if key.startswith('pipe.dit.')}
+            return dit_state_dict
        rename_dict = {
            "time_in.in_layer.bias": "time_embedder.timestep_embedder.0.bias",
            "time_in.in_layer.weight": "time_embedder.timestep_embedder.0.weight",
--- a/diffsynth/models/flux_infiniteyou.py
+++ b/diffsynth/models/flux_infiniteyou.py
@@ -104,6 +104,7 @@ class InfiniteYouImageProjector(nn.Module):
    def forward(self, x):

        latents = self.latents.repeat(x.size(0), 1, 1)
+        latents = latents.to(dtype=x.dtype, device=x.device)

        x = self.proj_in(x)

--- a/diffsynth/models/flux_lora_encoder.py
+++ b/diffsynth/models/flux_lora_encoder.py
@@ -0,0 +1,111 @@
+import torch
+from .sd_text_encoder import CLIPEncoderLayer
+
+
+class LoRALayerBlock(torch.nn.Module):
+    """Learned (1, L, dim_in) probe pushed through one LoRA pair (A, then B) and layer-normalized."""
+    def __init__(self, L, dim_in, dim_out):
+        super().__init__()
+        self.x = torch.nn.Parameter(torch.randn(1, L, dim_in))
+        self.layer_norm = torch.nn.LayerNorm(dim_out)
+
+    def forward(self, lora_A, lora_B):
+        projected = self.x @ lora_A.T @ lora_B.T
+        return self.layer_norm(projected)
+    
+
+class LoRAEmbedder(torch.nn.Module):
+    """Map a LoRA state dict to a token sequence: one LoRALayerBlock per layer, one Linear per layer type."""
+    def __init__(self, lora_patterns=None, L=1, out_dim=2048):
+        super().__init__()
+        if lora_patterns is None:
+            lora_patterns = self.default_lora_patterns()
+
+        # Dots in names are stored as "___" because ModuleDict keys cannot contain ".".
+        model_dict = {}
+        for lora_pattern in lora_patterns:
+            name, dim = lora_pattern["name"], lora_pattern["dim"]
+            model_dict[name.replace(".", "___")] = LoRALayerBlock(L, dim[0], dim[1])
+        self.model_dict = torch.nn.ModuleDict(model_dict)
+
+        proj_dict = {}
+        for lora_pattern in lora_patterns:
+            proj_key, dim = lora_pattern["type"].replace(".", "___"), lora_pattern["dim"]
+            # Look up the escaped key (the one actually stored) so each layer type builds its
+            # Linear only once instead of being re-created for every block.
+            if proj_key not in proj_dict:
+                proj_dict[proj_key] = torch.nn.Linear(dim[1], out_dim)
+        self.proj_dict = torch.nn.ModuleDict(proj_dict)
+
+        self.lora_patterns = lora_patterns
+
+    def default_lora_patterns(self):
+        """Build {"name", "dim", "type"} entries for 19 `blocks.*` and 38 `single_blocks.*` layers."""
+        double_dims = {
+            "attn.a_to_qkv": (3072, 9216), "attn.a_to_out": (3072, 3072), "ff_a.0": (3072, 12288), "ff_a.2": (12288, 3072), "norm1_a.linear": (3072, 18432),
+            "attn.b_to_qkv": (3072, 9216), "attn.b_to_out": (3072, 3072), "ff_b.0": (3072, 12288), "ff_b.2": (12288, 3072), "norm1_b.linear": (3072, 18432),
+        }
+        single_dims = {"to_qkv_mlp": (3072, 21504), "proj_out": (15360, 3072), "norm.linear": (3072, 9216)}
+        # "dim" is (lora_A input width, lora_B output width), i.e. LoRALayerBlock's (dim_in, dim_out).
+        lora_patterns = []
+        for prefix, num_blocks, dims in (("blocks", 19, double_dims), ("single_blocks", 38, single_dims)):
+            for i in range(num_blocks):
+                for suffix, dim in dims.items():
+                    lora_patterns.append({
+                        "name": f"{prefix}.{i}.{suffix}",
+                        "dim": dim,
+                        "type": suffix,
+                    })
+        return lora_patterns
+
+    def forward(self, lora):
+        """Embed each pattern's lora_A/lora_B pair and concatenate all embeddings along dim 1."""
+        lora_emb = []
+        for lora_pattern in self.lora_patterns:
+            name, layer_type = lora_pattern["name"], lora_pattern["type"]
+            lora_A = lora[name + ".lora_A.default.weight"]
+            lora_B = lora[name + ".lora_B.default.weight"]
+            # The per-layer block output is projected to out_dim by the layer type's shared Linear.
+            lora_out = self.model_dict[name.replace(".", "___")](lora_A, lora_B)
+            lora_out = self.proj_dict[layer_type.replace(".", "___")](lora_out)
+            lora_emb.append(lora_out)
+        return torch.concat(lora_emb, dim=1)
+    
+    
+class FluxLoRAEncoder(torch.nn.Module):
+    """Encode a LoRA state dict into `num_special_embeds` output tokens of width `embed_dim`."""
+    def __init__(self, embed_dim=4096, encoder_intermediate_size=8192, num_encoder_layers=1, num_embeds_per_lora=16, num_special_embeds=1):
+        super().__init__()
+        self.num_embeds_per_lora = num_embeds_per_lora
+        self.num_special_embeds = num_special_embeds
+        # LoRA tokens: the embedder emits num_embeds_per_lora tokens per LoRA layer pattern.
+        self.embedder = LoRAEmbedder(L=num_embeds_per_lora, out_dim=embed_dim)
+
+        # Encoder layers run over the concatenated [special, LoRA] token sequence.
+        self.encoders = torch.nn.ModuleList([CLIPEncoderLayer(embed_dim, encoder_intermediate_size, num_heads=32, head_dim=128) for _ in range(num_encoder_layers)])
+
+        # Learned tokens placed at the front of the sequence; forward() returns only their positions.
+        self.special_embeds = torch.nn.Parameter(torch.randn(1, num_special_embeds, embed_dim))
+
+        # Output head.
+        self.final_layer_norm = torch.nn.LayerNorm(embed_dim)
+        self.final_linear = torch.nn.Linear(embed_dim, embed_dim)
+
+    def forward(self, lora):
+        """Return the final-layer outputs at the special-token positions for the given LoRA."""
+        lora_embeds = self.embedder(lora)
+        special_embeds = self.special_embeds.to(dtype=lora_embeds.dtype, device=lora_embeds.device)
+        embeds = torch.concat([special_embeds, lora_embeds], dim=1)
+        for encoder in self.encoders:
+            embeds = encoder(embeds)
+        special_out = embeds[:, :self.num_special_embeds]
+        return self.final_linear(self.final_layer_norm(special_out))
+
+    @staticmethod
+    def state_dict_converter():
+        return FluxLoRAEncoderStateDictConverter()
+
+
+class FluxLoRAEncoderStateDictConverter:  # converter returned by FluxLoRAEncoder.state_dict_converter()
+    def from_civitai(self, state_dict):
+        return state_dict  # pass-through: the state dict is returned unchanged
--- a/diffsynth/models/flux_value_control.py
+++ b/diffsynth/models/flux_value_control.py
@@ -0,0 +1,60 @@
+import torch
+from diffsynth.models.svd_unet import TemporalTimesteps
+
+
+class MultiValueEncoder(torch.nn.Module):
+    """Apply one encoder per value, skip values that are None, and concatenate the results along dim 0."""
+    def __init__(self, encoders=()):
+        super().__init__()
+        self.encoders = torch.nn.ModuleList(encoders)
+
+    # Defined as forward() (not __call__) so nn.Module.__call__ still runs registered hooks.
+    def forward(self, values, dtype):
+        emb = [
+            encoder(value.unsqueeze(0), dtype)
+            for encoder, value in zip(self.encoders, values) if value is not None
+        ]
+        return torch.concat(emb, dim=0)
+
+
+class SingleValueEncoder(torch.nn.Module):
+    """Embed a control value as `prefer_len` tokens of width `dim_out`."""
+    def __init__(self, dim_in=256, dim_out=4096, prefer_len=32, computation_device=None):
+        super().__init__()
+        self.prefer_len = prefer_len
+        self.prefer_proj = TemporalTimesteps(num_channels=dim_in, flip_sin_to_cos=True, downscale_freq_shift=0, computation_device=computation_device)
+        self.prefer_value_embedder = torch.nn.Sequential(
+            torch.nn.Linear(dim_in, dim_out), torch.nn.SiLU(), torch.nn.Linear(dim_out, dim_out)
+        )
+        # One learned offset per output token, added to the shared value embedding in forward().
+        self.positional_embedding = torch.nn.Parameter(torch.randn(self.prefer_len, dim_out))
+        self._initialize_weights()
+
+    def _initialize_weights(self):
+        """Zero the last Linear so the value embedding is all zeros at initialization."""
+        last_linear = self.prefer_value_embedder[-1]
+        for tensor in (last_linear.weight, last_linear.bias):
+            torch.nn.init.zeros_(tensor)
+
+    def forward(self, value, dtype):
+        """Return the value embedding broadcast over `prefer_len` rows plus the positional embedding."""
+        emb = self.prefer_proj(value * 1000).to(dtype)
+        base_embeddings = self.prefer_value_embedder(emb).squeeze(0).expand(self.prefer_len, -1)
+        positional_embedding = self.positional_embedding.to(dtype=base_embeddings.dtype, device=base_embeddings.device)
+        return base_embeddings + positional_embedding
+
+    @staticmethod
+    def state_dict_converter():
+        """Return the state-dict converter paired with this module."""
+        return SingleValueEncoderStateDictConverter()
+
+
+class SingleValueEncoderStateDictConverter:
+    """Pass-through converter: both source formats are returned unchanged."""
+
+    def from_diffusers(self, state_dict):
+        """Return `state_dict` as is."""
+        return state_dict
+
+    def from_civitai(self, state_dict):
+        return state_dict
--- a/diffsynth/models/lora.py
+++ b/diffsynth/models/lora.py
@@ -277,7 +277,7 @@ class FluxLoRAConverter:
        pass

    @staticmethod
-    def align_to_opensource_format(state_dict, alpha=1.0):
+    def align_to_opensource_format(state_dict, alpha=None):
        prefix_rename_dict = {
            "single_blocks": "lora_unet_single_blocks",
            "blocks": "lora_unet_double_blocks",
@@ -316,7 +316,8 @@ class FluxLoRAConverter:
            rename = prefix_rename_dict[prefix] + "_" + block_id + "_" + middle_rename_dict[middle] + "." + suffix_rename_dict[suffix]
            state_dict_[rename] = param
            if rename.endswith("lora_up.weight"):
-                state_dict_[rename.replace("lora_up.weight", "alpha")] = torch.tensor((alpha,))[0]
+                lora_alpha = alpha if alpha is not None else param.shape[-1]
+                state_dict_[rename.replace("lora_up.weight", "alpha")] = torch.tensor((lora_alpha,))[0]
        return state_dict_
    
    @staticmethod
--- a/diffsynth/models/model_manager.py
+++ b/diffsynth/models/model_manager.py
@@ -426,7 +426,7 @@ class ModelManager:
            self.load_model(file_path, model_names, device=device, torch_dtype=torch_dtype)

    
-    def fetch_model(self, model_name, file_path=None, require_model_path=False):
+    def fetch_model(self, model_name, file_path=None, require_model_path=False, index=None):
        fetched_models = []
        fetched_model_paths = []
        for model, model_path, model_name_ in zip(self.model, self.model_path, self.model_name):
@@ -440,12 +440,25 @@ class ModelManager:
            return None
        if len(fetched_models) == 1:
            print(f"Using {model_name} from {fetched_model_paths[0]}.")
+            model = fetched_models[0]
+            path = fetched_model_paths[0]
        else:
-            print(f"More than one {model_name} models are loaded in model manager: {fetched_model_paths}. Using {model_name} from {fetched_model_paths[0]}.")
+            if index is None:
+                model = fetched_models[0]
+                path = fetched_model_paths[0]
+                print(f"More than one {model_name} models are loaded in model manager: {fetched_model_paths}. Using {model_name} from {fetched_model_paths[0]}.")
+            elif isinstance(index, int):
+                model = fetched_models[:index]
+                path = fetched_model_paths[:index]
+                print(f"More than one {model_name} models are loaded in model manager: {fetched_model_paths}. Using {model_name} from {fetched_model_paths[:index]}.")
+            else:
+                model = fetched_models
+                path = fetched_model_paths
+                print(f"More than one {model_name} models are loaded in model manager: {fetched_model_paths}. Using {model_name} from {fetched_model_paths}.")
        if require_model_path:
-            return fetched_models[0], fetched_model_paths[0]
+            return model, path
        else:
-            return fetched_models[0]
+            return model
        

    def to(self, device):
--- a/diffsynth/models/nexus_gen.py
+++ b/diffsynth/models/nexus_gen.py
@@ -0,0 +1,161 @@
+import torch
+from PIL import Image
+
+
+class NexusGenAutoregressiveModel(torch.nn.Module):
+    """
+    Wraps a Qwen2.5-VL conditional-generation model that predicts the embeddings of a target image.
+
+    The processor is not created by the constructor; call `load_processor` before `forward`.
+    """
+
+    def __init__(self, max_length=1024, max_pixels=262640):
+        super(NexusGenAutoregressiveModel, self).__init__()
+        from .nexus_gen_ar_model import Qwen2_5_VLForConditionalGeneration
+        from transformers import Qwen2_5_VLConfig
+        self.max_length = max_length
+        # Pixel budget applied to the reference image in `forward`.
+        self.max_pixels = max_pixels
+        model_config = Qwen2_5_VLConfig(**{
+            "_name_or_path": "DiffSynth-Studio/Nexus-GenV2",
+            "architectures": [
+                "Qwen2_5_VLForConditionalGeneration"
+            ],
+            "attention_dropout": 0.0,
+            "auto_map": {
+                "AutoConfig": "configuration_qwen2_5_vl.Qwen2_5_VLConfig",
+                "AutoModel": "modeling_qwen2_5_vl.Qwen2_5_VLModel",
+                "AutoModelForCausalLM": "modeling_qwen2_5_vl.Qwen2_5_VLForConditionalGeneration"
+            },
+            "bos_token_id": 151643,
+            "eos_token_id": 151645,
+            "hidden_act": "silu",
+            "hidden_size": 3584,
+            "image_token_id": 151655,
+            "initializer_range": 0.02,
+            "intermediate_size": 18944,
+            "max_position_embeddings": 128000,
+            "max_window_layers": 28,
+            "model_type": "qwen2_5_vl",
+            "num_attention_heads": 28,
+            "num_hidden_layers": 28,
+            "num_key_value_heads": 4,
+            "pad_token_id": 151643,
+            "rms_norm_eps": 1e-06,
+            "rope_scaling": {
+                "mrope_section": [
+                16,
+                24,
+                24
+                ],
+                "rope_type": "default",
+                "type": "default"
+            },
+            "rope_theta": 1000000.0,
+            "sliding_window": 32768,
+            "tie_word_embeddings": False,
+            "torch_dtype": "bfloat16",
+            "transformers_version": "4.49.0",
+            "use_cache": False,
+            "use_sliding_window": False,
+            "video_token_id": 151656,
+            "vision_config": {
+                "hidden_size": 1280,
+                "in_chans": 3,
+                "model_type": "qwen2_5_vl",
+                "spatial_patch_size": 14,
+                "tokens_per_second": 2,
+                "torch_dtype": "bfloat16"
+            },
+            "vision_end_token_id": 151653,
+            "vision_start_token_id": 151652,
+            "vision_token_id": 151654,
+            "vocab_size": 152064
+        })
+        self.model = Qwen2_5_VLForConditionalGeneration(model_config)
+        self.processor = None
+
+
+    def load_processor(self, path):
+        """Load the Qwen2.5-VL processor from `path` and store it on `self.processor`."""
+        from .nexus_gen_ar_model import Qwen2_5_VLProcessor
+        self.processor = Qwen2_5_VLProcessor.from_pretrained(path)
+
+
+    @staticmethod
+    def state_dict_converter():
+        """Return the converter used to remap checkpoint keys for this wrapper."""
+        return NexusGenAutoregressiveModelStateDictConverter()
+
+    def bound_image(self, image, max_pixels=262640):
+        """Resize `image` to the size `smart_resize` computes for the given `max_pixels` budget."""
+        from qwen_vl_utils import smart_resize
+        resized_height, resized_width = smart_resize(
+            image.height,
+            image.width,
+            max_pixels=max_pixels,
+        )
+        return image.resize((resized_width, resized_height))
+
+    def get_editing_msg(self, instruction):
+        """Build the chat messages for image editing; an `<image>` tag is prepended when missing."""
+        if '<image>' not in instruction:
+            instruction = '<image> ' + instruction
+        messages = [{"role":"user", "content":instruction}, {"role":"assistant", "content":"Here is the image: <image>"}]
+        return messages
+
+    def get_generation_msg(self, instruction):
+        """Build the chat messages for text-to-image generation."""
+        instruction = "Generate an image according to the following description: {}".format(instruction)
+        messages = [{"role":"user", "content":instruction}, {"role":"assistant", "content":"Here is an image based on the description: <image>"}]
+        return messages
+
+    def forward(self, instruction, ref_image=None, num_img_tokens=81):
+        """
+        Generate target embeddings for the given instruction and reference image.
+
+        Returns the 3-tuple produced by `get_target_embeddings`:
+        (output_image_embeddings, input_image_embeds, image_grid_thw).
+        """
+        # The blank image only reserves token slots for the target image; its embeddings are
+        # replaced by the prefill embeddings inside `get_target_embeddings`.
+        # NOTE(review): a 252x252 placeholder presumably maps to 81 visual tokens - confirm
+        # before calling with a different `num_img_tokens`.
+        placeholder = Image.new(mode='RGB', size=(252, 252), color=(255, 255, 255))
+        if ref_image is not None:
+            messages = self.get_editing_msg(instruction)
+            # Honor the pixel budget configured on the instance instead of the method default.
+            images = [self.bound_image(ref_image, max_pixels=self.max_pixels), placeholder]
+        else:
+            messages = self.get_generation_msg(instruction)
+            images = [placeholder]
+        return self.get_target_embeddings(images, messages, self.processor, self.model, num_img_tokens)
+
+    def get_target_embeddings(self, images, messages, processor, model, num_img_tokens=81):
+        """
+        Run the model once and collect the predicted embeddings of the target image.
+
+        The last `num_img_tokens` visual embeddings belong to the target image: their slots in the
+        input sequence are filled with the model's prefill embeddings, while the remaining visual
+        embeddings (reference images) are scattered into their own slots.
+
+        Returns (output_image_embeddings, input_image_embeds, image_grid_thw).
+        """
+        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
+        text = text.replace('<image>', '<|vision_start|><|image_pad|><|vision_end|>')
+        inputs = processor(
+            text=[text],
+            images=images,
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to(model.device)
+
+        input_embeds = model.model.embed_tokens(inputs['input_ids'])
+        image_embeds = model.visual(inputs['pixel_values'], grid_thw=inputs['image_grid_thw'])
+        ground_truth_image_embeds = image_embeds[-num_img_tokens:]
+        input_image_embeds = image_embeds[:-num_img_tokens]
+
+        # Image tokens are numbered with a running count; those beyond the reference-image
+        # count are the target-image slots.
+        image_mask = inputs['input_ids'] == model.config.image_token_id
+        indices = image_mask.cumsum(dim=1)
+        input_image_mask = torch.logical_and(indices <= (image_embeds.shape[0] - ground_truth_image_embeds.shape[0]), image_mask)
+        gt_image_mask = torch.logical_and(image_mask, ~input_image_mask)
+        input_image_mask = input_image_mask.unsqueeze(-1).expand_as(input_embeds)
+        input_embeds = input_embeds.masked_scatter(input_image_mask, input_image_embeds)
+
+        # Use the requested token count rather than a hard-coded 81 so the prefill embeddings
+        # always match the number of target slots selected above.
+        image_prefill_embeds = model.image_prefill_embeds(
+            torch.arange(num_img_tokens, device=model.device).long()
+        )
+        input_embeds = input_embeds.masked_scatter(gt_image_mask.unsqueeze(-1).expand_as(input_embeds), image_prefill_embeds)
+
+        position_ids, _ = model.get_rope_index(
+            inputs['input_ids'],
+            inputs['image_grid_thw'],
+            attention_mask=inputs['attention_mask'])
+        position_ids = position_ids.contiguous()
+        outputs = model(inputs_embeds=input_embeds, position_ids=position_ids, attention_mask=inputs['attention_mask'], return_dict=True)
+        # Outputs are shifted by one position relative to the inputs, hence the [:-1] / [1:] pairing.
+        output_image_embeddings = outputs.image_embeddings[:, :-1, :]
+        output_image_embeddings = output_image_embeddings[gt_image_mask[:, 1:]]
+        return output_image_embeddings, input_image_embeds, inputs['image_grid_thw']
+
+
+class NexusGenAutoregressiveModelStateDictConverter:
+    """Remaps checkpoint keys so they match the `model` attribute of the wrapper module."""
+
+    def __init__(self):
+        pass
+
+    def from_civitai(self, state_dict):
+        """Return a new dict in which every key is prefixed with "model."."""
+        converted = {}
+        for name, tensor in state_dict.items():
+            converted["model." + name] = tensor
+        return converted
--- a/diffsynth/models/nexus_gen_ar_model.py
+++ b/diffsynth/models/nexus_gen_ar_model.py
--- a/diffsynth/models/nexus_gen_projector.py
+++ b/diffsynth/models/nexus_gen_projector.py
@@ -0,0 +1,417 @@
+import math
+import torch
+import torch.nn as nn
+from typing import Optional, Tuple
+
+
+
+def rotate_half(x):
+    """Swap the two halves of the last dimension, negating the half that moves to the front."""
+    split_at = x.shape[-1] // 2
+    front, back = x[..., :split_at], x[..., split_at:]
+    return torch.cat((-back, front), dim=-1)
+
+
+def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
+    """
+    Apply multimodal rotary position embeddings to the query and key tensors.
+
+    `cos` and `sin` carry three position streams on their leading axis. The last dimension is
+    split according to `mrope_section` (repeated twice) and chunk `i` is taken from stream `i % 3`.
+    """
+    doubled_sections = mrope_section * 2
+
+    def pick_streams(table):
+        # Chunk i of the last dimension comes from position stream i % 3.
+        pieces = table.split(doubled_sections, dim=-1)
+        chosen = [piece[idx % 3] for idx, piece in enumerate(pieces)]
+        return torch.cat(chosen, dim=-1).unsqueeze(unsqueeze_dim)
+
+    cos = pick_streams(cos)
+    sin = pick_streams(sin)
+
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+class Qwen2_5_VLRotaryEmbedding(nn.Module):
+    """
+    Computes the cos/sin rotary tables for three position streams.
+
+    The inverse frequencies always come from transformers' default rope initializer; `rope_type`
+    is only consulted to decide whether `_dynamic_frequency_update` runs.
+    """
+
+    def __init__(self, config, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        from transformers.modeling_rope_utils import _compute_default_rope_parameters
+        self.rope_init_fn = _compute_default_rope_parameters
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+
+    def _dynamic_frequency_update(self, position_ids, device):
+        """
+        dynamic RoPE layers should recompute `inv_freq` in the following situations:
+        1 - growing beyond the cached sequence length (allow scaling)
+        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
+        """
+        seq_len = torch.max(position_ids) + 1
+        if seq_len > self.max_seq_len_cached:  # growth
+            # This class never defines `rope_kwargs`, so forwarding it raised AttributeError;
+            # the initializer only needs the config, the device and the new length.
+            inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
+            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
+            self.max_seq_len_cached = seq_len
+
+        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
+            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+            self.max_seq_len_cached = self.original_max_seq_len
+
+
+    @torch.no_grad()
+    def forward(self, x, position_ids):
+        """
+        Return `(cos, sin)` for `position_ids`, cast to the dtype of `x`.
+
+        `x` is used only for its device and dtype. `position_ids` is indexed as
+        (3, batch, positions), and each output has shape (3, batch, positions, 2 * len(inv_freq)).
+        """
+        if "dynamic" in self.rope_type:
+            self._dynamic_frequency_update(position_ids, device=x.device)
+
+        # Core RoPE block. In contrast to other models, Qwen2_5_VL has different position ids for the grids
+        # So we expand the inv_freq to shape (3, ...)
+        inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1)
+        position_ids_expanded = position_ids[:, :, None, :].float()  # shape (3, bs, 1, positions)
+        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
+        device_type = x.device.type
+        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos()
+            sin = emb.sin()
+
+        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times along the head axis.
+
+    Equivalent to torch.repeat_interleave(x, dim=1, repeats=n_rep): the input goes from
+    (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim).
+    """
+    n_batch, n_kv_heads, seq_len, dim_per_head = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    # Insert a repeat axis right after the head axis, broadcast over it, then fold it into the heads.
+    tiled = hidden_states[:, :, None, :, :].expand(n_batch, n_kv_heads, n_rep, seq_len, dim_per_head)
+    return tiled.reshape(n_batch, n_kv_heads * n_rep, seq_len, dim_per_head)
+
+
+class Qwen2_5_VLAttention(nn.Module):
+    """Self-attention with grouped key/value heads and multimodal rotary position embeddings."""
+
+    def __init__(self, config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.is_causal = True
+        self.attention_dropout = config.attention_dropout
+        self.rope_scaling = config.rope_scaling
+
+        query_width = self.head_dim * self.num_heads
+        if query_width != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        kv_width = self.num_key_value_heads * self.head_dim
+        self.q_proj = nn.Linear(self.hidden_size, query_width, bias=True)
+        self.k_proj = nn.Linear(self.hidden_size, kv_width, bias=True)
+        self.v_proj = nn.Linear(self.hidden_size, kv_width, bias=True)
+        self.o_proj = nn.Linear(query_width, self.hidden_size, bias=False)
+
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """
+        Attend over `hidden_states` and return the projected attention output (a single tensor).
+
+        `position_embeddings` is the `(cos, sin)` pair for the rotary embedding and must be given.
+        No attention mask is applied in this implementation.
+        """
+        batch_size, seq_len, _ = hidden_states.size()
+
+        def to_heads(projected):
+            # (batch, seq, heads * head_dim) -> (batch, heads, seq, head_dim)
+            return projected.view(batch_size, seq_len, -1, self.head_dim).transpose(1, 2)
+
+        queries = to_heads(self.q_proj(hidden_states))
+        keys = to_heads(self.k_proj(hidden_states))
+        values = to_heads(self.v_proj(hidden_states))
+
+        cos, sin = position_embeddings
+        queries, keys = apply_multimodal_rotary_pos_emb(
+            queries, keys, cos, sin, self.rope_scaling["mrope_section"]
+        )
+
+        # Expand the key/value heads so each query head has a matching key/value head.
+        keys = repeat_kv(keys, self.num_key_value_groups)
+        values = repeat_kv(values, self.num_key_value_groups)
+
+        scores = torch.matmul(queries, keys.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+        # float16 can overflow to inf; zero those entries so NaNs do not propagate through softmax.
+        if queries.dtype == torch.float16:
+            scores = torch.where(torch.isinf(scores), torch.zeros_like(scores), scores)
+
+        # Softmax runs in fp32 and is cast back to the query dtype.
+        probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(queries.dtype)
+        probs = nn.functional.dropout(probs, p=self.attention_dropout, training=self.training)
+        attended = torch.matmul(probs, values)
+
+        expected_size = (batch_size, self.num_heads, seq_len, self.head_dim)
+        if attended.size() != expected_size:
+            raise ValueError(
+                f"`attn_output` should be of size {expected_size}, but is"
+                f" {attended.size()}"
+            )
+
+        merged = attended.transpose(1, 2).contiguous().reshape(batch_size, seq_len, -1)
+        return self.o_proj(merged)
+
+
+class Qwen2MLP(nn.Module):
+    """Gated feed-forward block: down_proj(act(gate_proj(x)) * up_proj(x))."""
+
+    def __init__(self, config):
+        super().__init__()
+        from transformers.activations import ACT2FN
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        hidden, inner = self.hidden_size, self.intermediate_size
+        self.gate_proj = nn.Linear(hidden, inner, bias=False)
+        self.up_proj = nn.Linear(hidden, inner, bias=False)
+        self.down_proj = nn.Linear(inner, hidden, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        """Apply the gated projection to `x` and map it back to the hidden size."""
+        gated = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
+        return self.down_proj(gated)
+
+
+class Qwen2RMSNorm(nn.Module):
+    """Root-mean-square layer norm with a learned per-channel scale (equivalent to T5LayerNorm)."""
+
+    def __init__(self, hidden_size, eps=1e-6):
+        """Create the scale vector (initialised to ones) and store the stabilising epsilon."""
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        """Normalise the last dimension in float32, cast back to the input dtype, then scale."""
+        original_dtype = hidden_states.dtype
+        as_fp32 = hidden_states.to(torch.float32)
+        mean_square = as_fp32.pow(2).mean(-1, keepdim=True)
+        normalised = as_fp32 * torch.rsqrt(mean_square + self.variance_epsilon)
+        return self.weight * normalised.to(original_dtype)
+
+    def extra_repr(self):
+        """Show the weight shape and epsilon in the module's repr."""
+        shape = tuple(self.weight.shape)
+        return f"{shape}, eps={self.variance_epsilon}"
+
+
+class Qwen2_5_VLDecoderLayer(nn.Module):
+    """Pre-norm transformer block: self-attention then MLP, each wrapped in a residual connection."""
+
+    def __init__(self, config, layer_idx):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = Qwen2_5_VLAttention(config, layer_idx)
+
+        self.mlp = Qwen2MLP(config)
+        norm_eps = config.rms_norm_eps
+        self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=norm_eps)
+        self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """Run one decoder layer and return the updated hidden states (a single tensor)."""
+        # Attention sub-layer: normalise, attend, add back the untouched input.
+        attn_out = self.self_attn(
+            hidden_states=self.input_layernorm(hidden_states),
+            position_embeddings=position_embeddings,
+        )
+        after_attn = hidden_states + attn_out
+
+        # Feed-forward sub-layer with its own residual connection.
+        mlp_out = self.mlp(self.post_attention_layernorm(after_attn))
+        return after_attn + mlp_out
+
+
+class NexusGenImageEmbeddingMerger(nn.Module):
+    """
+    Runs image embeddings (optionally concatenated with reference-image embeddings) through
+    `num_layers` Qwen2.5-VL decoder layers and projects the result to `out_channel` features.
+    """
+    def __init__(self, num_layers=1, out_channel=4096, expand_ratio=4, device='cpu'):
+        super().__init__()
+        from transformers import Qwen2_5_VLConfig
+        from transformers.activations import ACT2FN
+        # The configuration is built inline; only a subset of these fields is read by this module
+        # (hidden sizes, head counts, rope settings, rms_norm_eps, hidden_act, vision_config).
+        config = Qwen2_5_VLConfig(**{
+            "_name_or_path": "DiffSynth-Studio/Nexus-GenV2",
+            "architectures": [
+                "Qwen2_5_VLForConditionalGeneration"
+            ],
+            "attention_dropout": 0.0,
+            "auto_map": {
+                "AutoConfig": "configuration_qwen2_5_vl.Qwen2_5_VLConfig",
+                "AutoModel": "modeling_qwen2_5_vl.Qwen2_5_VLModel",
+                "AutoModelForCausalLM": "modeling_qwen2_5_vl.Qwen2_5_VLForConditionalGeneration"
+            },
+            "bos_token_id": 151643,
+            "eos_token_id": 151645,
+            "hidden_act": "silu",
+            "hidden_size": 3584,
+            "image_token_id": 151655,
+            "initializer_range": 0.02,
+            "intermediate_size": 18944,
+            "max_position_embeddings": 128000,
+            "max_window_layers": 28,
+            "model_type": "qwen2_5_vl",
+            "num_attention_heads": 28,
+            "num_hidden_layers": 28,
+            "num_key_value_heads": 4,
+            "pad_token_id": 151643,
+            "rms_norm_eps": 1e-06,
+            "rope_scaling": {
+                "mrope_section": [
+                16,
+                24,
+                24
+                ],
+                "rope_type": "default",
+                "type": "default"
+            },
+            "rope_theta": 1000000.0,
+            "sliding_window": 32768,
+            "tie_word_embeddings": False,
+            "torch_dtype": "bfloat16",
+            "transformers_version": "4.49.0",
+            "use_cache": False,
+            "use_sliding_window": False,
+            "video_token_id": 151656,
+            "vision_config": {
+                "hidden_size": 1280,
+                "in_chans": 3,
+                "model_type": "qwen2_5_vl",
+                "spatial_patch_size": 14,
+                "tokens_per_second": 2,
+                "torch_dtype": "bfloat16"
+            },
+            "vision_end_token_id": 151653,
+            "vision_start_token_id": 151652,
+            "vision_token_id": 151654,
+            "vocab_size": 152064
+        })
+        self.config = config
+        self.num_layers = num_layers
+        self.layers = nn.ModuleList([Qwen2_5_VLDecoderLayer(config, layer_idx) for layer_idx in range(num_layers)])
+        # Projection head: norm -> expand to out_channel * expand_ratio -> norm -> activation
+        # -> reduce to out_channel -> norm.
+        self.projector = nn.Sequential(Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps),
+                                       nn.Linear(config.hidden_size, out_channel * expand_ratio),
+                                       Qwen2RMSNorm(out_channel * expand_ratio, eps=config.rms_norm_eps),
+                                       ACT2FN[config.hidden_act], nn.Linear(out_channel * expand_ratio, out_channel),
+                                       Qwen2RMSNorm(out_channel, eps=config.rms_norm_eps))
+        # Reference (t, h, w) grid used to rescale row/column positions in `get_position_ids`.
+        # It is a plain tensor attribute (not a registered buffer) and is only read via `.item()`.
+        self.base_grid = torch.tensor([[1, 72, 72]], device=device)
+        self.rotary_emb = Qwen2_5_VLRotaryEmbedding(config=config, device=device)
+
+    def get_position_ids(self, image_grid_thw):
+        """
+        Generates position ids for the input embeddings grid.
+        modified from the qwen2_vl mrope.
+
+        Only the first row of `image_grid_thw` is read for (t, h, w); the resulting ids are
+        repeated `image_grid_thw.shape[0]` times. Row and column indices are multiplied by
+        `base_grid / grid` so differently sized grids share the base grid's coordinate range.
+        Returns a tensor laid out as (3, batch, length).
+        """
+        batch_size = image_grid_thw.shape[0]
+        spatial_merge_size = self.config.vision_config.spatial_merge_size
+        t, h, w = (
+            image_grid_thw[0][0],
+            image_grid_thw[0][1],
+            image_grid_thw[0][2],
+        )
+        # Grid size after spatial merging.
+        llm_grid_t, llm_grid_h, llm_grid_w = (
+            t.item(),
+            h.item() // spatial_merge_size,
+            w.item() // spatial_merge_size,
+        )
+        scale_h = self.base_grid[0][1].item() / h.item()
+        scale_w = self.base_grid[0][2].item() / w.item()
+
+        # Temporal index: the frame number scaled by tokens_per_second, shared by every cell of a frame.
+        range_tensor = torch.arange(llm_grid_t).view(-1, 1)
+        expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
+        time_tensor = expanded_range * self.config.vision_config.tokens_per_second
+        t_index = time_tensor.long().flatten().to(image_grid_thw.device)
+        # Spatial indices become floating point after scaling, so the stacked result is promoted too.
+        h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten().to(image_grid_thw.device) * scale_h
+        w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten().to(image_grid_thw.device) * scale_w
+        # 3, B, L
+        position_ids = torch.stack([t_index, h_index, w_index]).unsqueeze(0).repeat(batch_size, 1, 1).permute(1, 0, 2)
+        return position_ids
+
+    def forward(self, embeds, embeds_grid, ref_embeds=None, ref_embeds_grid=None):
+        """
+        Merge `embeds` (and, when given, `ref_embeds`) and project them.
+
+        When `ref_embeds` is provided, `ref_embeds_grid` must be provided as well: the reference
+        position ids are appended along the last axis and the embeddings along dim 1.
+        NOTE(review): embeds presumably have shape (batch, length, hidden_size) - confirm with callers.
+        """
+        position_ids = self.get_position_ids(embeds_grid)
+        hidden_states = embeds
+        if ref_embeds is not None:
+            position_ids_ref_embeds = self.get_position_ids(ref_embeds_grid)
+            position_ids = torch.cat((position_ids, position_ids_ref_embeds), dim=-1)
+            hidden_states = torch.cat((embeds, ref_embeds), dim=1)
+
+        # The same (cos, sin) tables are shared by every decoder layer.
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        for layer in self.layers:
+            hidden_states = layer(hidden_states, position_embeddings)
+
+        hidden_states = self.projector(hidden_states)
+        return hidden_states
+
+    @staticmethod
+    def state_dict_converter():
+        # Converter used to extract this module's weights from a checkpoint.
+        return NexusGenMergerStateDictConverter()
+
+
+class NexusGenMergerStateDictConverter:
+    """Extracts the embedding-merger weights from a checkpoint state dict."""
+
+    def __init__(self):
+        pass
+
+    def from_diffusers(self, state_dict):
+        """Return `state_dict` unchanged."""
+        return state_dict
+
+    def from_civitai(self, state_dict):
+        """
+        Keep only keys that start with "embedding_merger." and strip that leading prefix.
+
+        Only the leading prefix is removed; a later occurrence of the same text inside a key
+        is left intact (a plain `str.replace` would have removed every occurrence).
+        """
+        prefix = "embedding_merger."
+        merger_state_dict = {
+            key[len(prefix):]: value
+            for key, value in state_dict.items()
+            if key.startswith(prefix)
+        }
+        return merger_state_dict
+
+
+class NexusGenAdapter(nn.Module):
+    """
+    Two-layer projection (Linear -> LayerNorm -> ReLU -> Linear -> LayerNorm) used as the
+    adapter for the Nexus-Gen generation decoder.
+    """
+
+    def __init__(self, input_dim=3584, output_dim=4096):
+        super().__init__()
+        # Layer order fixes the parameter names (adapter.0, adapter.1, adapter.3, adapter.4).
+        stages = [
+            nn.Linear(input_dim, output_dim),
+            nn.LayerNorm(output_dim),
+            nn.ReLU(),
+            nn.Linear(output_dim, output_dim),
+            nn.LayerNorm(output_dim),
+        ]
+        self.adapter = nn.Sequential(*stages)
+
+    def forward(self, x):
+        """Project `x` from `input_dim` to `output_dim` features."""
+        projected = self.adapter(x)
+        return projected
+
+    @staticmethod
+    def state_dict_converter():
+        """Return the converter that selects this module's weights from a checkpoint."""
+        return NexusGenAdapterStateDictConverter()
+
+
+class NexusGenAdapterStateDictConverter:
+    """Selects the adapter weights from a checkpoint state dict."""
+
+    def __init__(self):
+        pass
+
+    def from_diffusers(self, state_dict):
+        """Return `state_dict` unchanged."""
+        return state_dict
+
+    def from_civitai(self, state_dict):
+        """Return only the entries whose key starts with "adapter."; keys are kept as they are."""
+        selected = {}
+        for name, tensor in state_dict.items():
+            if name.startswith('adapter.'):
+                selected[name] = tensor
+        return selected
--- a/diffsynth/models/step1x_connector.py
+++ b/diffsynth/models/step1x_connector.py
@@ -162,7 +162,7 @@ class TimestepEmbedder(nn.Module):
    def forward(self, t):
        t_freq = self.timestep_embedding(
            t, self.frequency_embedding_size, self.max_period
-        ).type(self.mlp[0].weight.dtype)  # type: ignore
+        ).type(t.dtype)  # type: ignore
        t_emb = self.mlp(t_freq)
        return t_emb
    
@@ -656,7 +656,7 @@ class Qwen2Connector(torch.nn.Module):
        mask_float = mask.unsqueeze(-1)  # [b, s1, 1]
        x_mean = (x * mask_float).sum(
                dim=1
-            ) / mask_float.sum(dim=1) * (1 + self.scale_factor)
+            ) / mask_float.sum(dim=1) * (1 + self.scale_factor.to(dtype=x.dtype, device=x.device))

        global_out=self.global_proj_out(x_mean)
        encoder_hidden_states = self.S(x,t,mask)
--- a/diffsynth/models/utils.py
+++ b/diffsynth/models/utils.py
@@ -71,7 +71,7 @@ def load_state_dict(file_path, torch_dtype=None, device="cpu"):

 def load_state_dict_from_safetensors(file_path, torch_dtype=None, device="cpu"):
    state_dict = {}
-    with safe_open(file_path, framework="pt", device=device) as f:
+    with safe_open(file_path, framework="pt", device=str(device)) as f:
        for k in f.keys():
            state_dict[k] = f.get_tensor(k)
            if torch_dtype is not None:
--- a/diffsynth/models/wan_video_dit.py
+++ b/diffsynth/models/wan_video_dit.py
@@ -212,9 +212,16 @@ class DiTBlock(nn.Module):
        self.gate = GateModule()

    def forward(self, x, context, t_mod, freqs):
+        has_seq = len(t_mod.shape) == 4
+        chunk_dim = 2 if has_seq else 1
        # msa: multi-head self-attention  mlp: multi-layer perceptron
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
-            self.modulation.to(dtype=t_mod.dtype, device=t_mod.device) + t_mod).chunk(6, dim=1)
+            self.modulation.to(dtype=t_mod.dtype, device=t_mod.device) + t_mod).chunk(6, dim=chunk_dim)
+        if has_seq:
+            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
+                shift_msa.squeeze(2), scale_msa.squeeze(2), gate_msa.squeeze(2),
+                shift_mlp.squeeze(2), scale_mlp.squeeze(2), gate_mlp.squeeze(2),
+            )
        input_x = modulate(self.norm1(x), shift_msa, scale_msa)
        x = self.gate(x, gate_msa, self.self_attn(input_x, freqs))
        x = x + self.cross_attn(self.norm3(x), context)
@@ -253,8 +260,12 @@ class Head(nn.Module):
        self.modulation = nn.Parameter(torch.randn(1, 2, dim) / dim**0.5)

    def forward(self, x, t_mod):
-        shift, scale = (self.modulation.to(dtype=t_mod.dtype, device=t_mod.device) + t_mod).chunk(2, dim=1)
-        x = (self.head(self.norm(x) * (1 + scale) + shift))
+        if len(t_mod.shape) == 3:
+            shift, scale = (self.modulation.unsqueeze(0).to(dtype=t_mod.dtype, device=t_mod.device) + t_mod.unsqueeze(2)).chunk(2, dim=2)
+            x = (self.head(self.norm(x) * (1 + scale.squeeze(2)) + shift.squeeze(2)))
+        else:
+            shift, scale = (self.modulation.to(dtype=t_mod.dtype, device=t_mod.device) + t_mod).chunk(2, dim=1)
+            x = (self.head(self.norm(x) * (1 + scale) + shift))
        return x


@@ -276,12 +287,20 @@ class WanModel(torch.nn.Module):
        has_ref_conv: bool = False,
        add_control_adapter: bool = False,
        in_dim_control_adapter: int = 24,
+        seperated_timestep: bool = False,
+        require_vae_embedding: bool = True,
+        require_clip_embedding: bool = True,
+        fuse_vae_embedding_in_latents: bool = False,
    ):
        super().__init__()
        self.dim = dim
        self.freq_dim = freq_dim
        self.has_image_input = has_image_input
        self.patch_size = patch_size
+        self.seperated_timestep = seperated_timestep
+        self.require_vae_embedding = require_vae_embedding
+        self.require_clip_embedding = require_clip_embedding
+        self.fuse_vae_embedding_in_latents = fuse_vae_embedding_in_latents

        self.patch_embedding = nn.Conv3d(
            in_dim, dim, kernel_size=patch_size, stride=patch_size)
@@ -659,6 +678,41 @@ class WanModelStateDictConverter:
                "add_control_adapter": True,
                "in_dim_control_adapter": 24,
            }
+        elif hash_state_dict_keys(state_dict) == "1f5ab7703c6fc803fdded85ff040c316":
+            # Wan-AI/Wan2.2-TI2V-5B
+            config = {
+                "has_image_input": False,
+                "patch_size": [1, 2, 2],
+                "in_dim": 48,
+                "dim": 3072,
+                "ffn_dim": 14336,
+                "freq_dim": 256,
+                "text_dim": 4096,
+                "out_dim": 48,
+                "num_heads": 24,
+                "num_layers": 30,
+                "eps": 1e-6,
+                "seperated_timestep": True,
+                "require_clip_embedding": False,
+                "require_vae_embedding": False,
+                "fuse_vae_embedding_in_latents": True,
+            }
+        elif hash_state_dict_keys(state_dict) == "5b013604280dd715f8457c6ed6d6a626":
+            # Wan-AI/Wan2.2-I2V-A14B
+            config = {
+                "has_image_input": False,
+                "patch_size": [1, 2, 2],
+                "in_dim": 36,
+                "dim": 5120,
+                "ffn_dim": 13824,
+                "freq_dim": 256,
+                "text_dim": 4096,
+                "out_dim": 16,
+                "num_heads": 40,
+                "num_layers": 40,
+                "eps": 1e-6,
+                "require_clip_embedding": False,
+            }
        else:
            config = {}
        return state_dict, config
--- a/diffsynth/models/wan_video_vae.py
+++ b/diffsynth/models/wan_video_vae.py
@@ -195,6 +195,75 @@ class Resample(nn.Module):
        nn.init.zeros_(conv.bias.data)


+
+def patchify(x, patch_size):  # fold each patch_size x patch_size spatial patch into the channel axis
+    if patch_size == 1:  # nothing to fold: return the input unchanged
+        return x
+    if x.dim() == 4:  # 4-D input laid out as "b c h w"
+        x = rearrange(x, "b c (h q) (w r) -> b (c r q) h w", q=patch_size, r=patch_size)
+    elif x.dim() == 5:  # 5-D input with an extra axis "f" between channels and height
+        x = rearrange(x,
+                      "b c f (h q) (w r) -> b (c r q) f h w",
+                      q=patch_size,
+                      r=patch_size)
+    else:
+        raise ValueError(f"Invalid input shape: {x.shape}")  # only 4-D and 5-D tensors are handled
+    return x
+
+
+def unpatchify(x, patch_size):
+    """Undo `patchify`: unfold the (c r q) channel groups back into spatial patches.
+    Raises ValueError for inputs that are neither 4-D nor 5-D (mirrors `patchify`)."""
+    if patch_size == 1:
+        return x
+    if x.dim() == 4:
+        return rearrange(x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size, r=patch_size)
+    if x.dim() == 5:
+        return rearrange(x, "b (c r q) f h w -> b c f (h q) (w r)", q=patch_size, r=patch_size)
+    # Reject unsupported ranks instead of silently handing the input back unchanged.
+    raise ValueError(f"Invalid input shape: {x.shape}")
+
+
+class Resample38(Resample):  # Resample subclass that builds its own layer set in __init__
+
+    def __init__(self, dim, mode):
+        assert mode in (
+            "none",
+            "upsample2d",
+            "upsample3d",
+            "downsample2d",
+            "downsample3d",
+        )
+        super(Resample, self).__init__()  # skips Resample.__init__; runs the next __init__ in the MRO
+        self.dim = dim
+        self.mode = mode
+
+        # layers
+        if mode == "upsample2d":
+            self.resample = nn.Sequential(
+                Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
+                nn.Conv2d(dim, dim, 3, padding=1),  # channel count stays at dim
+            )
+        elif mode == "upsample3d":
+            self.resample = nn.Sequential(
+                Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
+                nn.Conv2d(dim, dim, 3, padding=1),  # same spatial path as "upsample2d"
+            )
+            self.time_conv = CausalConv3d(dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))  # kernel 3 on the first axis only; doubles the channels
+        elif mode == "downsample2d":
+            self.resample = nn.Sequential(
+                nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2))  # pad right/bottom by one, then stride-2 conv
+            )
+        elif mode == "downsample3d":
+            self.resample = nn.Sequential(
+                nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2))  # same spatial path as "downsample2d"
+            )
+            self.time_conv = CausalConv3d(
+                dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0)  # stride 2 on the first kernel axis, no padding
+            )
+        else:
+            self.resample = nn.Identity()  # mode == "none"
+
 class ResidualBlock(nn.Module):

    def __init__(self, in_dim, out_dim, dropout=0.0):
@@ -273,6 +342,178 @@ class AttentionBlock(nn.Module):
        return x + identity


+class AvgDown3D(nn.Module):  # parameter-free downsampling: fold time/space blocks into channels, then average channel groups
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        factor_t,
+        factor_s=1,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.factor_t = factor_t
+        self.factor_s = factor_s
+        self.factor = self.factor_t * self.factor_s * self.factor_s  # number of input positions folded into one output position
+
+        assert in_channels * self.factor % out_channels == 0  # folded channels must split evenly into out_channels groups
+        self.group_size = in_channels * self.factor // out_channels
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        pad_t = (self.factor_t - x.shape[2] % self.factor_t) % self.factor_t  # amount needed to make dim 2 divisible by factor_t
+        pad = (0, 0, 0, 0, pad_t, 0)  # F.pad lists the last dim first: only the front of dim 2 is padded
+        x = F.pad(x, pad)
+        B, C, T, H, W = x.shape  # x must be 5-D; T already includes the padding
+        x = x.view(  # split dims 2, 3, 4 into (blocks, factor) pairs
+            B,
+            C,
+            T // self.factor_t,
+            self.factor_t,
+            H // self.factor_s,
+            self.factor_s,
+            W // self.factor_s,
+            self.factor_s,
+        )
+        x = x.permute(0, 1, 3, 5, 7, 2, 4, 6).contiguous()  # move the three factor axes right behind the channel axis
+        x = x.view(  # merge the factor axes into the channel axis
+            B,
+            C * self.factor,
+            T // self.factor_t,
+            H // self.factor_s,
+            W // self.factor_s,
+        )
+        x = x.view(  # regroup the channels as (out_channels, group_size)
+            B,
+            self.out_channels,
+            self.group_size,
+            T // self.factor_t,
+            H // self.factor_s,
+            W // self.factor_s,
+        )
+        x = x.mean(dim=2)  # average every group down to a single output channel
+        return x
+
+
+class DupUp3D(nn.Module):  # parameter-free upsampling: duplicate channels, then unfold them into time/space
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        factor_t,
+        factor_s=1,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        self.factor_t = factor_t
+        self.factor_s = factor_s
+        self.factor = self.factor_t * self.factor_s * self.factor_s  # number of output positions produced per input position
+
+        assert out_channels * self.factor % in_channels == 0  # each input channel must be repeated a whole number of times
+        self.repeats = out_channels * self.factor // in_channels
+
+    def forward(self, x: torch.Tensor, first_chunk=False) -> torch.Tensor:
+        x = x.repeat_interleave(self.repeats, dim=1)  # channel count becomes out_channels * factor
+        x = x.view(  # split channels into (out_channels, factor_t, factor_s, factor_s)
+            x.size(0),
+            self.out_channels,
+            self.factor_t,
+            self.factor_s,
+            self.factor_s,
+            x.size(2),
+            x.size(3),
+            x.size(4),
+        )
+        x = x.permute(0, 1, 5, 2, 6, 3, 7, 4).contiguous()  # put each factor axis right after the axis it expands
+        x = x.view(  # merge every (size, factor) pair into one enlarged axis
+            x.size(0),
+            self.out_channels,
+            x.size(2) * self.factor_t,
+            x.size(4) * self.factor_s,
+            x.size(6) * self.factor_s,
+        )
+        if first_chunk:
+            x = x[:, :, self.factor_t - 1 :, :, :]  # drop the first factor_t - 1 entries along dim 2
+        return x
+
+
+class Down_ResidualBlock(nn.Module):  # residual stack (+ optional Resample38) summed with an AvgDown3D shortcut
+    def __init__(
+        self, in_dim, out_dim, dropout, mult, temperal_downsample=False, down_flag=False
+    ):
+        super().__init__()
+
+        # Shortcut path with downsample
+        self.avg_shortcut = AvgDown3D(
+            in_dim,
+            out_dim,
+            factor_t=2 if temperal_downsample else 1,
+            factor_s=2 if down_flag else 1,
+        )
+
+        # Main path with residual blocks and downsample
+        downsamples = []
+        for _ in range(mult):
+            downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
+            in_dim = out_dim  # only the first residual block changes the channel count
+
+        # Add the final downsample block
+        if down_flag:
+            mode = "downsample3d" if temperal_downsample else "downsample2d"
+            downsamples.append(Resample38(out_dim, mode=mode))
+
+        self.downsamples = nn.Sequential(*downsamples)
+
+    def forward(self, x, feat_cache=None, feat_idx=[0]):  # NOTE(review): feat_idx default is a mutable list shared across calls
+        x_copy = x.clone()  # keep the unmodified input for the shortcut path
+        for module in self.downsamples:
+            x = module(x, feat_cache, feat_idx)  # iterate manually so each sub-module receives the cache arguments
+
+        return x + self.avg_shortcut(x_copy)
+
+
+class Up_ResidualBlock(nn.Module):  # residual stack (+ optional Resample38), with a DupUp3D shortcut when upsampling
+    def __init__(
+        self, in_dim, out_dim, dropout, mult, temperal_upsample=False, up_flag=False
+    ):
+        super().__init__()
+        # Shortcut path with upsample
+        if up_flag:
+            self.avg_shortcut = DupUp3D(
+                in_dim,
+                out_dim,
+                factor_t=2 if temperal_upsample else 1,
+                factor_s=2 if up_flag else 1,  # always 2 here: this branch only runs when up_flag is truthy
+            )
+        else:
+            self.avg_shortcut = None  # no shortcut at all when the block does not upsample
+
+        # Main path with residual blocks and upsample
+        upsamples = []
+        for _ in range(mult):
+            upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
+            in_dim = out_dim  # only the first residual block changes the channel count
+
+        # Add the final upsample block
+        if up_flag:
+            mode = "upsample3d" if temperal_upsample else "upsample2d"
+            upsamples.append(Resample38(out_dim, mode=mode))
+
+        self.upsamples = nn.Sequential(*upsamples)
+
+    def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):  # NOTE(review): feat_idx default is a mutable list
+        x_main = x.clone()  # x itself stays untouched for the shortcut path
+        for module in self.upsamples:
+            x_main = module(x_main, feat_cache, feat_idx)  # iterate manually so each sub-module receives the cache arguments
+        if self.avg_shortcut is not None:
+            x_shortcut = self.avg_shortcut(x, first_chunk)  # first_chunk is only consumed by the DupUp3D shortcut
+            return x_main + x_shortcut
+        else:
+            return x_main
+
+
 class Encoder3d(nn.Module):

    def __init__(self,
@@ -376,6 +617,122 @@ class Encoder3d(nn.Module):
        return x


+class Encoder3d_38(nn.Module):  # encoder: conv1 -> Down_ResidualBlock stages -> middle (res/attn/res) -> head
+
+    def __init__(self,
+                 dim=128,
+                 z_dim=4,
+                 dim_mult=[1, 2, 4, 4],
+                 num_res_blocks=2,
+                 attn_scales=[],
+                 temperal_downsample=[False, True, True],
+                 dropout=0.0):
+        super().__init__()
+        self.dim = dim
+        self.z_dim = z_dim
+        self.dim_mult = dim_mult
+        self.num_res_blocks = num_res_blocks
+        self.attn_scales = attn_scales  # stored only; not read anywhere else in this class
+        self.temperal_downsample = temperal_downsample
+
+        # dimensions
+        dims = [dim * u for u in [1] + dim_mult]  # channel width at the input of each stage plus the final width
+        scale = 1.0  # NOTE(review): scale is updated below but never read in this class
+
+        # init block
+        self.conv1 = CausalConv3d(12, dims[0], 3, padding=1)  # 12 input channels: presumably 3 image channels after patchify(patch_size=2) - TODO confirm
+
+        # downsample blocks
+        downsamples = []
+        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
+            t_down_flag = (
+                temperal_downsample[i] if i < len(temperal_downsample) else False  # stages beyond the list get no temporal downsample
+            )
+            downsamples.append(
+                Down_ResidualBlock(
+                    in_dim=in_dim,
+                    out_dim=out_dim,
+                    dropout=dropout,
+                    mult=num_res_blocks,
+                    temperal_downsample=t_down_flag,
+                    down_flag=i != len(dim_mult) - 1,  # every stage except the last one downsamples
+                )
+            )
+            scale /= 2.0
+        self.downsamples = nn.Sequential(*downsamples)
+
+        # middle blocks
+        self.middle = nn.Sequential(
+            ResidualBlock(out_dim, out_dim, dropout),  # out_dim is the last loop value, i.e. dims[-1]
+            AttentionBlock(out_dim),
+            ResidualBlock(out_dim, out_dim, dropout),
+        )
+
+        # output blocks
+        self.head = nn.Sequential(
+            RMS_norm(out_dim, images=False),
+            nn.SiLU(),
+            CausalConv3d(out_dim, z_dim, 3, padding=1),
+        )
+
+
+    def forward(self, x, feat_cache=None, feat_idx=[0]):  # NOTE(review): feat_idx default is a mutable list
+
+        if feat_cache is not None:
+            idx = feat_idx[0]  # feat_idx[0] is the running slot index into feat_cache
+            cache_x = x[:, :, -CACHE_T:, :, :].clone()  # last CACHE_T entries along dim 2, kept for the next call
+            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:  # fewer than 2 entries: prepend the last cached one
+                cache_x = torch.cat(
+                    [
+                        feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
+                        cache_x,
+                    ],
+                    dim=2,
+                )
+            x = self.conv1(x, feat_cache[idx])  # the conv receives the previous call's cache entry
+            feat_cache[idx] = cache_x
+            feat_idx[0] += 1  # advance to the next cache slot (mutates the caller's list)
+        else:
+            x = self.conv1(x)
+
+        ## downsamples
+        for layer in self.downsamples:
+            if feat_cache is not None:
+                x = layer(x, feat_cache, feat_idx)
+            else:
+                x = layer(x)
+
+        ## middle
+        for layer in self.middle:
+            if isinstance(layer, ResidualBlock) and feat_cache is not None:  # the attention block is always called without cache arguments
+                x = layer(x, feat_cache, feat_idx)
+            else:
+                x = layer(x)
+
+        ## head
+        for layer in self.head:
+            if isinstance(layer, CausalConv3d) and feat_cache is not None:  # same caching scheme as for conv1 above
+                idx = feat_idx[0]
+                cache_x = x[:, :, -CACHE_T:, :, :].clone()
+                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                    cache_x = torch.cat(
+                        [
+                            feat_cache[idx][:, :, -1, :, :]
+                            .unsqueeze(2)
+                            .to(cache_x.device),
+                            cache_x,
+                        ],
+                        dim=2,
+                    )
+                x = layer(x, feat_cache[idx])
+                feat_cache[idx] = cache_x
+                feat_idx[0] += 1
+            else:
+                x = layer(x)  # norm / activation, or the conv when no cache is in use
+
+        return x
+
+
 class Decoder3d(nn.Module):

    def __init__(self,
@@ -481,10 +838,112 @@ class Decoder3d(nn.Module):
        return x


+
+class Decoder3d_38(nn.Module):  # decoder: conv1 -> middle (res/attn/res) -> Up_ResidualBlock stages -> head
+
+    def __init__(self,
+                 dim=128,
+                 z_dim=4,
+                 dim_mult=[1, 2, 4, 4],
+                 num_res_blocks=2,
+                 attn_scales=[],
+                 temperal_upsample=[False, True, True],
+                 dropout=0.0):
+        super().__init__()
+        self.dim = dim
+        self.z_dim = z_dim
+        self.dim_mult = dim_mult
+        self.num_res_blocks = num_res_blocks
+        self.attn_scales = attn_scales  # stored only; not read anywhere else in this class
+        self.temperal_upsample = temperal_upsample
+
+        # dimensions
+        dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]  # widths in reverse order, widest first
+        scale = 1.0 / 2 ** (len(dim_mult) - 2)  # NOTE(review): scale is never read in this class
+        # init block
+        self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)
+
+        # middle blocks
+        self.middle = nn.Sequential(ResidualBlock(dims[0], dims[0], dropout),
+                                    AttentionBlock(dims[0]),
+                                    ResidualBlock(dims[0], dims[0], dropout))
+
+        # upsample blocks
+        upsamples = []
+        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
+            t_up_flag = temperal_upsample[i] if i < len(temperal_upsample) else False  # stages beyond the list get no temporal upsample
+            upsamples.append(
+                Up_ResidualBlock(in_dim=in_dim,
+                                 out_dim=out_dim,
+                                 dropout=dropout,
+                                 mult=num_res_blocks + 1,  # one more residual block per stage than num_res_blocks
+                                 temperal_upsample=t_up_flag,
+                                 up_flag=i != len(dim_mult) - 1))  # every stage except the last one upsamples
+        self.upsamples = nn.Sequential(*upsamples)
+
+        # output blocks
+        self.head = nn.Sequential(RMS_norm(out_dim, images=False), nn.SiLU(),
+                                  CausalConv3d(out_dim, 12, 3, padding=1))  # 12 output channels: presumably 3 image channels before unpatchify(patch_size=2) - TODO confirm
+
+
+    def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):  # NOTE(review): feat_idx default is a mutable list
+        if feat_cache is not None:
+            idx = feat_idx[0]  # feat_idx[0] is the running slot index into feat_cache
+            cache_x = x[:, :, -CACHE_T:, :, :].clone()  # last CACHE_T entries along dim 2, kept for the next call
+            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:  # fewer than 2 entries: prepend the last cached one
+                cache_x = torch.cat(
+                    [
+                        feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
+                        cache_x,
+                    ],
+                    dim=2,
+                )
+            x = self.conv1(x, feat_cache[idx])  # the conv receives the previous call's cache entry
+            feat_cache[idx] = cache_x
+            feat_idx[0] += 1  # advance to the next cache slot (mutates the caller's list)
+        else:
+            x = self.conv1(x)
+
+        for layer in self.middle:
+            if check_is_instance(layer, ResidualBlock) and feat_cache is not None:  # the attention block is always called without cache arguments
+                x = layer(x, feat_cache, feat_idx)
+            else:
+                x = layer(x)
+
+        ## upsamples
+        for layer in self.upsamples:
+            if feat_cache is not None:
+                x = layer(x, feat_cache, feat_idx, first_chunk)
+            else:
+                x = layer(x)  # NOTE(review): first_chunk is not forwarded when no cache is used
+
+        ## head
+        for layer in self.head:
+            if check_is_instance(layer, CausalConv3d) and feat_cache is not None:  # same caching scheme as for conv1 above
+                idx = feat_idx[0]
+                cache_x = x[:, :, -CACHE_T:, :, :].clone()
+                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                    cache_x = torch.cat(
+                        [
+                            feat_cache[idx][:, :, -1, :, :]
+                            .unsqueeze(2)
+                            .to(cache_x.device),
+                            cache_x,
+                        ],
+                        dim=2,
+                    )
+                x = layer(x, feat_cache[idx])
+                feat_cache[idx] = cache_x
+                feat_idx[0] += 1
+            else:
+                x = layer(x)  # norm / activation, or the conv when no cache is in use
+        return x
+
+
 def count_conv3d(model):
    count = 0
    for m in model.modules():
-        if check_is_instance(m, CausalConv3d):
+        if isinstance(m, CausalConv3d):
            count += 1
    return count

@@ -616,6 +1075,7 @@ class WanVideoVAE(nn.Module):
        # init model
        self.model = VideoVAE_(z_dim=z_dim).eval().requires_grad_(False)
        self.upsampling_factor = 8
+        self.z_dim = z_dim


    def build_1d_mask(self, length, left_bound, right_bound, border_width):
@@ -711,7 +1171,7 @@ class WanVideoVAE(nn.Module):

        out_T = (T + 3) // 4
        weight = torch.zeros((1, 1, out_T, H // self.upsampling_factor, W // self.upsampling_factor), dtype=video.dtype, device=data_device)
-        values = torch.zeros((1, 16, out_T, H // self.upsampling_factor, W // self.upsampling_factor), dtype=video.dtype, device=data_device)
+        values = torch.zeros((1, self.z_dim, out_T, H // self.upsampling_factor, W // self.upsampling_factor), dtype=video.dtype, device=data_device)

        for h, h_, w, w_ in tqdm(tasks, desc="VAE encoding"):
            hidden_states_batch = video[:, :, :, h:h_, w:w_].to(computation_device)
@@ -762,8 +1222,8 @@ class WanVideoVAE(nn.Module):
        for video in videos:
            video = video.unsqueeze(0)
            if tiled:
-                tile_size = (tile_size[0] * 8, tile_size[1] * 8)
-                tile_stride = (tile_stride[0] * 8, tile_stride[1] * 8)
+                tile_size = (tile_size[0] * self.upsampling_factor, tile_size[1] * self.upsampling_factor)
+                tile_stride = (tile_stride[0] * self.upsampling_factor, tile_stride[1] * self.upsampling_factor)
                hidden_state = self.tiled_encode(video, device, tile_size, tile_stride)
            else:
                hidden_state = self.single_encode(video, device)
@@ -798,3 +1258,119 @@ class WanVideoVAEStateDictConverter:
        for name in state_dict:
            state_dict_['model.' + name] = state_dict[name]
        return state_dict_
+
+
+class VideoVAE38_(VideoVAE_):  # VAE built from Encoder3d_38 / Decoder3d_38 with 2x2 patchify around them
+
+    def __init__(self,
+                 dim=160,
+                 z_dim=48,
+                 dec_dim=256,
+                 dim_mult=[1, 2, 4, 4],
+                 num_res_blocks=2,
+                 attn_scales=[],
+                 temperal_downsample=[False, True, True],
+                 dropout=0.0):
+        super(VideoVAE_, self).__init__()  # skips VideoVAE_.__init__; runs the next __init__ in the MRO
+        self.dim = dim
+        self.z_dim = z_dim
+        self.dim_mult = dim_mult
+        self.num_res_blocks = num_res_blocks
+        self.attn_scales = attn_scales
+        self.temperal_downsample = temperal_downsample
+        self.temperal_upsample = temperal_downsample[::-1]  # decoder uses the encoder's flags in reverse order
+
+        # modules
+        self.encoder = Encoder3d_38(dim, z_dim * 2, dim_mult, num_res_blocks,  # z_dim * 2 channels: split into mu / log_var in encode()
+                                    attn_scales, self.temperal_downsample, dropout)
+        self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
+        self.conv2 = CausalConv3d(z_dim, z_dim, 1)
+        self.decoder = Decoder3d_38(dec_dim, z_dim, dim_mult, num_res_blocks,  # decoder width (dec_dim) is independent of the encoder's dim
+                                    attn_scales, self.temperal_upsample, dropout)
+
+
+    def encode(self, x, scale):
+        self.clear_cache()  # not defined in this class; presumably inherited from VideoVAE_ - TODO confirm
+        x = patchify(x, patch_size=2)
+        t = x.shape[2]
+        iter_ = 1 + (t - 1) // 4  # first chunk holds 1 entry of dim 2, every later chunk holds 4
+        for i in range(iter_):
+            self._enc_conv_idx = [0]  # restart the cache slot counter for every chunk
+            if i == 0:
+                out = self.encoder(x[:, :, :1, :, :],
+                                   feat_cache=self._enc_feat_map,
+                                   feat_idx=self._enc_conv_idx)
+            else:
+                out_ = self.encoder(x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :],
+                                    feat_cache=self._enc_feat_map,
+                                    feat_idx=self._enc_conv_idx)
+                out = torch.cat([out, out_], 2)  # append this chunk's output along dim 2
+        mu, log_var = self.conv1(out).chunk(2, dim=1)  # split channels in two halves; log_var is not used below
+        if isinstance(scale[0], torch.Tensor):  # per-channel scale: scale[0] is subtracted, scale[1] multiplied
+            scale = [s.to(dtype=mu.dtype, device=mu.device) for s in scale]
+            mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(
+                1, self.z_dim, 1, 1, 1)
+        else:
+            scale = scale.to(dtype=mu.dtype, device=mu.device)  # this branch requires scale itself to provide .to()
+            mu = (mu - scale[0]) * scale[1]
+        self.clear_cache()
+        return mu
+
+
+    def decode(self, z, scale):
+        self.clear_cache()  # not defined in this class; presumably inherited from VideoVAE_ - TODO confirm
+        if isinstance(scale[0], torch.Tensor):  # inverse of the normalization applied in encode()
+            scale = [s.to(dtype=z.dtype, device=z.device) for s in scale]
+            z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
+                1, self.z_dim, 1, 1, 1)
+        else:
+            scale = scale.to(dtype=z.dtype, device=z.device)  # this branch requires scale itself to provide .to()
+            z = z / scale[1] + scale[0]
+        iter_ = z.shape[2]  # one decoder call per entry along dim 2
+        x = self.conv2(z)
+        for i in range(iter_):
+            self._conv_idx = [0]  # restart the cache slot counter for every chunk
+            if i == 0:
+                out = self.decoder(x[:, :, i:i + 1, :, :],
+                                   feat_cache=self._feat_map,
+                                   feat_idx=self._conv_idx,
+                                   first_chunk=True)  # only the very first chunk is flagged
+            else:
+                out_ = self.decoder(x[:, :, i:i + 1, :, :],
+                                    feat_cache=self._feat_map,
+                                    feat_idx=self._conv_idx)
+                out = torch.cat([out, out_], 2)  # append this chunk's output along dim 2
+        out = unpatchify(out, patch_size=2)  # undo the patchify applied in encode()
+        self.clear_cache()
+        return out
+
+
+class WanVideoVAE38(WanVideoVAE):  # WanVideoVAE variant wrapping VideoVAE38_ with its own latent statistics
+
+    def __init__(self, z_dim=48, dim=160):
+        super(WanVideoVAE, self).__init__()  # skips WanVideoVAE.__init__; runs the next __init__ in the MRO
+
+        mean = [  # 48 values, one per channel of the default z_dim
+            -0.2289, -0.0052, -0.1323, -0.2339, -0.2799,  0.0174,  0.1838,  0.1557,
+            -0.1382,  0.0542,  0.2813,  0.0891,  0.1570, -0.0098,  0.0375, -0.1825,
+            -0.2246, -0.1207, -0.0698,  0.5109,  0.2665, -0.2108, -0.2158,  0.2502,
+            -0.2055, -0.0322,  0.1109,  0.1567, -0.0729,  0.0899, -0.2799, -0.1230,
+            -0.0313, -0.1649,  0.0117,  0.0723, -0.2839, -0.2083, -0.0520,  0.3748,
+            0.0152,  0.1957,  0.1433, -0.2944,  0.3573, -0.0548, -0.1681, -0.0667
+        ]
+        std = [  # 48 values, same layout as mean
+            0.4765, 1.0364, 0.4514, 1.1677, 0.5313, 0.4990, 0.4818, 0.5013,
+            0.8158, 1.0344, 0.5894, 1.0901, 0.6885, 0.6165, 0.8454, 0.4978,
+            0.5759, 0.3523, 0.7135, 0.6804, 0.5833, 1.4146, 0.8986, 0.5659,
+            0.7069, 0.5338, 0.4889, 0.4917, 0.4069, 0.4999, 0.6866, 0.4093,
+            0.5709, 0.6065, 0.6415, 0.4944, 0.5726, 1.2042, 0.5458, 1.6887,
+            0.3971, 1.0600, 0.3943, 0.5537, 0.5444, 0.4089, 0.7468, 0.7744
+        ]
+        self.mean = torch.tensor(mean)
+        self.std = torch.tensor(std)
+        self.scale = [self.mean, 1.0 / self.std]  # [offset, multiplier] pair, stored as the reciprocal of std
+
+        # init model
+        self.model = VideoVAE38_(z_dim=z_dim, dim=dim).eval().requires_grad_(False)  # eval mode with gradients disabled
+        self.upsampling_factor = 16
+        self.z_dim = z_dim
--- a/diffsynth/pipelines/flux_image_new.py
+++ b/diffsynth/pipelines/flux_image_new.py
@@ -18,12 +18,15 @@ from ..models import ModelManager, load_state_dict, SD3TextEncoder1, FluxTextEnc
 from ..models.step1x_connector import Qwen2Connector
 from ..models.flux_controlnet import FluxControlNet
 from ..models.flux_ipadapter import FluxIpAdapter
+from ..models.flux_value_control import MultiValueEncoder
 from ..models.flux_infiniteyou import InfiniteYouImageProjector
+from ..models.flux_lora_encoder import FluxLoRAEncoder, LoRALayerBlock
 from ..models.tiler import FastTileWorker
-from .wan_video_new import BasePipeline, ModelConfig, PipelineUnitRunner, PipelineUnit
-from ..lora.flux_lora import FluxLoRALoader, FluxLoraPatcher
+from ..models.nexus_gen import NexusGenAutoregressiveModel
+from ..models.nexus_gen_projector import NexusGenAdapter, NexusGenImageEmbeddingMerger
+from ..utils import BasePipeline, ModelConfig, PipelineUnitRunner, PipelineUnit
+from ..lora.flux_lora import FluxLoRALoader, FluxLoraPatcher, FluxLoRAFuser

-from transformers.models.t5.modeling_t5 import T5LayerNorm, T5DenseActDense, T5DenseGatedActDense
 from ..models.flux_dit import RMSNorm
 from ..vram_management import gradient_checkpoint_forward, enable_vram_management, AutoWrappedModule, AutoWrappedLinear

@@ -93,9 +96,14 @@ class FluxImagePipeline(BasePipeline):
        self.ipadapter_image_encoder = None
        self.qwenvl = None
        self.step1x_connector: Qwen2Connector = None
+        self.nexus_gen: NexusGenAutoregressiveModel = None
+        self.nexus_gen_generation_adapter: NexusGenAdapter = None
+        self.nexus_gen_editing_adapter: NexusGenImageEmbeddingMerger = None
+        self.value_controller: MultiValueEncoder = None
        self.infinityou_processor: InfinitYou = None
        self.image_proj_model: InfiniteYouImageProjector = None
        self.lora_patcher: FluxLoraPatcher = None
+        self.lora_encoder: FluxLoRAEncoder = None
        self.unit_runner = PipelineUnitRunner()
        self.in_iteration_models = ("dit", "step1x_connector", "controlnet", "lora_patcher")
        self.units = [
@@ -110,9 +118,12 @@ class FluxImagePipeline(BasePipeline):
            FluxImageUnit_ControlNet(),
            FluxImageUnit_IPAdapter(),
            FluxImageUnit_EntityControl(),
+            FluxImageUnit_NexusGen(),
            FluxImageUnit_TeaCache(),
            FluxImageUnit_Flex(),
            FluxImageUnit_Step1x(),
+            FluxImageUnit_ValueControl(),
+            FluxImageUnit_LoRAEncode(),
        ]
        self.model_fn = model_fn_flux_image
        
@@ -120,18 +131,20 @@ class FluxImagePipeline(BasePipeline):
    def load_lora(
        self,
        module: torch.nn.Module,
-        lora_config: Union[ModelConfig, str],
+        lora_config: Union[ModelConfig, str] = None,
        alpha=1,
        hotload=False,
-        local_model_path="./models",
-        skip_download=False
+        state_dict=None,
    ):
-        if isinstance(lora_config, str):
-            lora_config = ModelConfig(path=lora_config)
+        if state_dict is None:
+            if isinstance(lora_config, str):
+                lora = load_state_dict(lora_config, torch_dtype=self.torch_dtype, device=self.device)
+            else:
+                lora_config.download_if_necessary()
+                lora = load_state_dict(lora_config.path, torch_dtype=self.torch_dtype, device=self.device)
        else:
-            lora_config.download_if_necessary(local_model_path, skip_download=skip_download)
+            lora = state_dict
        loader = FluxLoRALoader(torch_dtype=self.torch_dtype, device=self.device)
-        lora = load_state_dict(lora_config.path, torch_dtype=self.torch_dtype, device=self.device)
        lora = loader.convert_state_dict(lora)
        if hotload:
            for name, module in module.named_modules():
@@ -145,19 +158,21 @@ class FluxImagePipeline(BasePipeline):
            loader.load(module, lora, alpha=alpha)


-    def enable_lora_patcher(self):
-        if not (hasattr(self, "vram_management_enabled") and self.vram_management_enabled):
-            print("Please enable VRAM management using `enable_vram_management()` before `enable_lora_patcher()`.")
-            return
-        if self.lora_patcher is None:
-            print("Please load lora patcher models before `enable_lora_patcher()`.")
-            return
-        for name, module in self.dit.named_modules():
-            if isinstance(module, AutoWrappedLinear):
-                merger_name = name.replace(".", "___")
-                if merger_name in self.lora_patcher.model_dict:
-                    module.lora_merger = self.lora_patcher.model_dict[merger_name]
-    
+    def load_loras(
+        self, module: torch.nn.Module, lora_configs: list[Union[ModelConfig, str]],
+        alpha=1, hotload=False, extra_fused_lora=False,
+    ):
+        """Load every LoRA in `lora_configs` into `module` via `load_lora`.
+
+        With `extra_fused_lora`, a LoRA fused from all configs is loaded as well.
+        """
+        for lora_config in lora_configs:
+            self.load_lora(module, lora_config, hotload=hotload, alpha=alpha)
+        if extra_fused_lora:
+            # Fuse on the pipeline's own device/dtype (as `load_lora` does), not a hard-coded "cuda"/bfloat16.
+            fused_lora = FluxLoRAFuser(device=self.device, torch_dtype=self.torch_dtype)(lora_configs)
+            self.load_lora(module, state_dict=fused_lora, hotload=hotload, alpha=alpha)
+
    
    def clear_lora(self):
        for name, module in self.named_modules():
@@ -182,22 +197,19 @@ class FluxImagePipeline(BasePipeline):
        return loss
    
    
-    def enable_vram_management(self, num_persistent_param_in_dit=None, vram_limit=None, vram_buffer=0.5):
-        self.vram_management_enabled = True
-        if num_persistent_param_in_dit is not None:
-            vram_limit = None
-        else:
-            if vram_limit is None:
-                vram_limit = self.get_vram()
-            vram_limit = vram_limit - vram_buffer
-        if self.text_encoder_1 is not None:
-            dtype = next(iter(self.text_encoder_1.parameters())).dtype
+    def _enable_vram_management_with_default_config(self, model, vram_limit):
+        if model is not None:
+            dtype = next(iter(model.parameters())).dtype
            enable_vram_management(
-                self.text_encoder_1,
+                model,
                module_map = {
                    torch.nn.Linear: AutoWrappedLinear,
                    torch.nn.Embedding: AutoWrappedModule,
                    torch.nn.LayerNorm: AutoWrappedModule,
+                    torch.nn.Conv2d: AutoWrappedModule,
+                    torch.nn.GroupNorm: AutoWrappedModule,
+                    RMSNorm: AutoWrappedModule,
+                    LoRALayerBlock: AutoWrappedModule,
                },
                module_config = dict(
                    offload_dtype=dtype,
@@ -209,7 +221,52 @@ class FluxImagePipeline(BasePipeline):
                ),
                vram_limit=vram_limit,
            )
+            
+            
+    def enable_lora_magic(self):
+        """Wrap the DiT's Linear layers (unless already VRAM-managed) and attach `lora_patcher` mergers.
+
+        Does nothing when no DiT is loaded.
+        """
+        if self.dit is None:
+            # Guard: the merger loop below would otherwise fail on `None.named_modules()`.
+            return
+        if not getattr(self.dit, "vram_management_enabled", False):
+            dtype = next(iter(self.dit.parameters())).dtype
+            enable_vram_management(
+                self.dit,
+                module_map={torch.nn.Linear: AutoWrappedLinear},
+                module_config=dict(
+                    offload_dtype=dtype, offload_device=self.device,
+                    onload_dtype=dtype, onload_device=self.device,
+                    computation_dtype=self.torch_dtype, computation_device=self.device,
+                ),
+                vram_limit=None,
+            )
+        if self.lora_patcher is not None:
+            for name, module in self.dit.named_modules():
+                merger_name = name.replace(".", "___")
+                if isinstance(module, AutoWrappedLinear) and merger_name in self.lora_patcher.model_dict:
+                    module.lora_merger = self.lora_patcher.model_dict[merger_name]
+    
+    
+    def enable_vram_management(self, num_persistent_param_in_dit=None, vram_limit=None, vram_buffer=0.5):
+        self.vram_management_enabled = True
+        if num_persistent_param_in_dit is not None:
+            vram_limit = None
+        else:
+            if vram_limit is None:
+                vram_limit = self.get_vram()
+            vram_limit = vram_limit - vram_buffer
+
+        # Default config
+        default_vram_management_models = ["text_encoder_1", "vae_decoder", "vae_encoder", "controlnet", "image_proj_model", "ipadapter", "lora_patcher", "value_controller", "step1x_connector", "lora_encoder"]
+        for model_name in default_vram_management_models:
+            self._enable_vram_management_with_default_config(getattr(self, model_name), vram_limit)
+
+        # Special config
        if self.text_encoder_2 is not None:
+            from transformers.models.t5.modeling_t5 import T5LayerNorm, T5DenseActDense, T5DenseGatedActDense
            dtype = next(iter(self.text_encoder_2.parameters())).dtype
            enable_vram_management(
                self.text_encoder_2,
@@ -258,14 +315,18 @@ class FluxImagePipeline(BasePipeline):
                ),
                vram_limit=vram_limit,
            )
-        if self.vae_decoder is not None:
-            dtype = next(iter(self.vae_decoder.parameters())).dtype
+        if self.ipadapter_image_encoder is not None:
+            from transformers.models.siglip.modeling_siglip import SiglipVisionEmbeddings, SiglipEncoder, SiglipMultiheadAttentionPoolingHead
+            dtype = next(iter(self.ipadapter_image_encoder.parameters())).dtype
            enable_vram_management(
-                self.vae_decoder,
+                self.ipadapter_image_encoder,
                module_map = {
+                    SiglipVisionEmbeddings: AutoWrappedModule,
+                    SiglipEncoder: AutoWrappedModule,
+                    SiglipMultiheadAttentionPoolingHead: AutoWrappedModule,
+                    torch.nn.MultiheadAttention: AutoWrappedModule,
                    torch.nn.Linear: AutoWrappedLinear,
-                    torch.nn.Conv2d: AutoWrappedModule,
-                    torch.nn.GroupNorm: AutoWrappedModule,
+                    torch.nn.LayerNorm: AutoWrappedModule,
                },
                module_config = dict(
                    offload_dtype=dtype,
@@ -277,14 +338,25 @@ class FluxImagePipeline(BasePipeline):
                ),
                vram_limit=vram_limit,
            )
-        if self.vae_encoder is not None:
-            dtype = next(iter(self.vae_encoder.parameters())).dtype
+        if self.qwenvl is not None:
+            from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
+                Qwen2_5_VisionPatchEmbed, Qwen2_5_VLVisionBlock, Qwen2_5_VLPatchMerger,
+                Qwen2_5_VLDecoderLayer, Qwen2_5_VisionRotaryEmbedding, Qwen2_5_VLRotaryEmbedding, Qwen2RMSNorm
+            )
+            dtype = next(iter(self.qwenvl.parameters())).dtype
            enable_vram_management(
-                self.vae_encoder,
+                self.qwenvl,
                module_map = {
+                    Qwen2_5_VisionPatchEmbed: AutoWrappedModule,
+                    Qwen2_5_VLVisionBlock: AutoWrappedModule,
+                    Qwen2_5_VLPatchMerger: AutoWrappedModule,
+                    Qwen2_5_VLDecoderLayer: AutoWrappedModule,
+                    Qwen2_5_VisionRotaryEmbedding: AutoWrappedModule,
+                    Qwen2_5_VLRotaryEmbedding: AutoWrappedModule,
+                    Qwen2RMSNorm: AutoWrappedModule,
+                    torch.nn.Embedding: AutoWrappedModule,
                    torch.nn.Linear: AutoWrappedLinear,
-                    torch.nn.Conv2d: AutoWrappedModule,
-                    torch.nn.GroupNorm: AutoWrappedModule,
+                    torch.nn.LayerNorm: AutoWrappedModule,
                },
                module_config = dict(
                    offload_dtype=dtype,
@@ -303,16 +375,12 @@ class FluxImagePipeline(BasePipeline):
        torch_dtype: torch.dtype = torch.bfloat16,
        device: Union[str, torch.device] = "cuda",
        model_configs: list[ModelConfig] = [],
-        tokenizer_config: ModelConfig = ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="google/*"),
-        local_model_path: str = "./models",
-        skip_download: bool = False,
-        redirect_common_files: bool = True,
-        use_usp=False,
+        nexus_gen_processor_config: ModelConfig = ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="processor/"),
    ):
        # Download and load models
        model_manager = ModelManager()
        for model_config in model_configs:
-            model_config.download_if_necessary(local_model_path, skip_download=skip_download)
+            model_config.download_if_necessary()
            model_manager.load_model(
                model_config.path,
                device=model_config.offload_device or device,
@@ -335,13 +403,29 @@ class FluxImagePipeline(BasePipeline):
        if pipe.image_proj_model is not None:
            pipe.infinityou_processor = InfinitYou(device=device)
        pipe.lora_patcher = model_manager.fetch_model("flux_lora_patcher")
+        pipe.lora_encoder = model_manager.fetch_model("flux_lora_encoder")
+        pipe.nexus_gen = model_manager.fetch_model("nexus_gen_llm")
+        pipe.nexus_gen_generation_adapter = model_manager.fetch_model("nexus_gen_generation_adapter")
+        pipe.nexus_gen_editing_adapter = model_manager.fetch_model("nexus_gen_editing_adapter")
+        if nexus_gen_processor_config is not None and pipe.nexus_gen is not None:
+            nexus_gen_processor_config.download_if_necessary()
+            pipe.nexus_gen.load_processor(nexus_gen_processor_config.path)
        
        # ControlNet
        controlnets = []
        for model_name, model in zip(model_manager.model_name, model_manager.model):
            if model_name == "flux_controlnet":
                controlnets.append(model)
-        pipe.controlnet = MultiControlNet(controlnets)
+        if len(controlnets) > 0:
+            pipe.controlnet = MultiControlNet(controlnets)
+
+        # Value Controller
+        value_controllers = []
+        for model_name, model in zip(model_manager.model_name, model_manager.model):
+            if model_name == "flux_value_controller":
+                value_controllers.append(model)
+        if len(value_controllers) > 0:
+            pipe.value_controller = MultiValueEncoder(value_controllers)

        return pipe
    
@@ -393,8 +477,15 @@ class FluxImagePipeline(BasePipeline):
        flex_control_image: Image.Image = None,
        flex_control_strength: float = 0.5,
        flex_control_stop: float = 0.5,
+        # Value Controller
+        value_controller_inputs: Union[list[float], float] = None,
        # Step1x
        step1x_reference_image: Image.Image = None,
+        # NexusGen
+        nexus_gen_reference_image: Image.Image = None,
+        # LoRA Encoder
+        lora_encoder_inputs: Union[list[ModelConfig], ModelConfig, str] = None,
+        lora_encoder_scale: float = 1.0,
        # TeaCache
        tea_cache_l1_thresh: float = None,
        # Tile
@@ -426,7 +517,10 @@ class FluxImagePipeline(BasePipeline):
            "eligen_entity_prompts": eligen_entity_prompts, "eligen_entity_masks": eligen_entity_masks, "eligen_enable_on_negative": eligen_enable_on_negative, "eligen_enable_inpaint": eligen_enable_inpaint,
            "infinityou_id_image": infinityou_id_image, "infinityou_guidance": infinityou_guidance,
            "flex_inpaint_image": flex_inpaint_image, "flex_inpaint_mask": flex_inpaint_mask, "flex_control_image": flex_control_image, "flex_control_strength": flex_control_strength, "flex_control_stop": flex_control_stop,
+            "value_controller_inputs": value_controller_inputs,
            "step1x_reference_image": step1x_reference_image,
+            "nexus_gen_reference_image": nexus_gen_reference_image,
+            "lora_encoder_inputs": lora_encoder_inputs, "lora_encoder_scale": lora_encoder_scale,
            "tea_cache_l1_thresh": tea_cache_l1_thresh,
            "tiled": tiled, "tile_size": tile_size, "tile_stride": tile_stride,
            "progress_bar_cmd": progress_bar_cmd,
@@ -677,15 +771,70 @@ class FluxImageUnit_EntityControl(PipelineUnit):
        if eligen_entity_prompts is None or eligen_entity_masks is None:
            return inputs_shared, inputs_posi, inputs_nega
        pipe.load_models_to_device(self.onload_model_names)
+        eligen_enable_on_negative = inputs_shared.get("eligen_enable_on_negative", False)
        eligen_kwargs_posi, eligen_kwargs_nega = self.prepare_eligen(pipe, inputs_nega,
            eligen_entity_prompts, eligen_entity_masks, inputs_shared["width"], inputs_shared["height"], 
-            inputs_shared["t5_sequence_length"], inputs_shared["eligen_enable_on_negative"], inputs_shared["cfg_scale"])
+            inputs_shared["t5_sequence_length"], eligen_enable_on_negative, inputs_shared["cfg_scale"])
        inputs_posi.update(eligen_kwargs_posi)
        if inputs_shared.get("cfg_scale", 1.0) != 1.0:
            inputs_nega.update(eligen_kwargs_nega)
        return inputs_shared, inputs_posi, inputs_nega


+class FluxImageUnit_NexusGen(PipelineUnit):
+    """Replace the positive prompt embedding and text ids with NexusGen outputs (generation or editing)."""
+    def __init__(self):
+        super().__init__(
+            take_over=True,
+            onload_model_names=("nexus_gen", "nexus_gen_generation_adapter", "nexus_gen_editing_adapter"),
+        )
+
+    def process(self, pipe: FluxImagePipeline, inputs_shared, inputs_posi, inputs_nega):
+        # No-op when no NexusGen model is loaded; only `inputs_posi` is modified.
+        if pipe.nexus_gen is None:
+            return inputs_shared, inputs_posi, inputs_nega
+        pipe.load_models_to_device(self.onload_model_names)
+        reference_image = inputs_shared.get("nexus_gen_reference_image", None)
+        if reference_image is None:
+            assert pipe.nexus_gen_generation_adapter is not None, "NexusGen requires a generation adapter to be set."
+            embed = pipe.nexus_gen(inputs_posi["prompt"])[0].unsqueeze(0)
+            inputs_posi["prompt_emb"] = pipe.nexus_gen_generation_adapter(embed)
+            inputs_posi['text_ids'] = torch.zeros(embed.shape[0], embed.shape[1], 3).to(device=pipe.device, dtype=pipe.torch_dtype)
+            return inputs_shared, inputs_posi, inputs_nega
+        assert pipe.nexus_gen_editing_adapter is not None, "NexusGen requires an editing adapter to be set."
+        embed, ref_embed, grids = pipe.nexus_gen(inputs_posi["prompt"], reference_image)
+        # grids[0] is used as the reference grid, grids[1] as the target grid.
+        embeds_grid = grids[1:2].to(device=pipe.device, dtype=torch.long)
+        ref_embeds_grid = grids[0:1].to(device=pipe.device, dtype=torch.long)
+        inputs_posi["prompt_emb"] = pipe.nexus_gen_editing_adapter(embed.unsqueeze(0), embeds_grid, ref_embed.unsqueeze(0), ref_embeds_grid)
+        inputs_posi["text_ids"] = self.get_editing_text_ids(
+            inputs_shared["latents"],
+            embeds_grid[0][1].item(), embeds_grid[0][2].item(),
+            ref_embeds_grid[0][1].item(), ref_embeds_grid[0][2].item(),
+        )
+        return inputs_shared, inputs_posi, inputs_nega
+
+
+    def get_editing_text_ids(self, latents, target_embed_height, target_embed_width, ref_embed_height, ref_embed_width):
+        """Build (batch, tokens, 3) ids for the target grid followed by the reference grid.
+
+        Channel 0 is 0 for target tokens and 1 for reference tokens; channels 1 and 2 hold the
+        row/column of each (height // 2, width // 2) cell rescaled by latent size / grid size.
+        """
+        batch_size, text_ids = latents.shape[0], []
+        specs = ((0.0, target_embed_height, target_embed_width), (1.0, ref_embed_height, ref_embed_width))
+        for image_index, height, width in specs:
+            rows, cols = height // 2, width // 2
+            ids = torch.zeros(rows, cols, 3)
+            ids[..., 0] = image_index
+            ids[..., 1] = torch.arange(rows)[:, None] * (latents.shape[-2] / height)
+            ids[..., 2] = torch.arange(cols)[None, :] * (latents.shape[-1] / width)
+            # Fix: `height // 2 * width // 2` parsed as `((height // 2) * width) // 2`, which miscounts tokens for odd grids.
+            ids = ids[None, :].repeat(batch_size, 1, 1, 1).reshape(batch_size, rows * cols, 3)
+            text_ids.append(ids.to(device=latents.device, dtype=latents.dtype))
+        return torch.cat(text_ids, dim=1)
+
+
 class FluxImageUnit_Step1x(PipelineUnit):
    def __init__(self):
        super().__init__(take_over=True,onload_model_names=("qwenvl","vae_encoder"))
@@ -704,7 +853,8 @@ class FluxImageUnit_Step1x(PipelineUnit):
            image = pipe.preprocess_image(image).to(device=pipe.device, dtype=pipe.torch_dtype)
            image = pipe.vae_encoder(image)
            inputs_posi.update({"step1x_llm_embedding": embs[0:1], "step1x_mask": masks[0:1], "step1x_reference_latents": image})
-            inputs_nega.update({"step1x_llm_embedding": embs[1:2], "step1x_mask": masks[1:2], "step1x_reference_latents": image})
+            if inputs_shared.get("cfg_scale", 1) != 1:
+                inputs_nega.update({"step1x_llm_embedding": embs[1:2], "step1x_mask": masks[1:2], "step1x_reference_latents": image})
            return inputs_shared, inputs_posi, inputs_nega

            
@@ -723,10 +873,12 @@ class FluxImageUnit_Flex(PipelineUnit):
        super().__init__(
            input_params=("latents", "flex_inpaint_image", "flex_inpaint_mask", "flex_control_image", "flex_control_strength", "flex_control_stop", "tiled", "tile_size", "tile_stride"),
            onload_model_names=("vae_encoder",)
-            )
+        )

    def process(self, pipe: FluxImagePipeline, latents, flex_inpaint_image, flex_inpaint_mask, flex_control_image, flex_control_strength, flex_control_stop, tiled, tile_size, tile_stride):
        if pipe.dit.input_dim == 196:
+            if flex_control_stop is None:
+                flex_control_stop = 1
            pipe.load_models_to_device(self.onload_model_names)
            if flex_inpaint_image is None:
                flex_inpaint_image = torch.zeros_like(latents)
@@ -756,18 +908,53 @@ class FluxImageUnit_Flex(PipelineUnit):

 class FluxImageUnit_InfiniteYou(PipelineUnit):
    def __init__(self):
-        super().__init__(input_params=("infinityou_id_image", "infinityou_guidance"))
+        super().__init__(
+            input_params=("infinityou_id_image", "infinityou_guidance"),
+            onload_model_names=("infinityou_processor",)
+        )

    def process(self, pipe: FluxImagePipeline, infinityou_id_image, infinityou_guidance):
+        pipe.load_models_to_device("infinityou_processor")
        if infinityou_id_image is not None:
-            return pipe.infinityou_processor.prepare_infinite_you(pipe.image_proj_model, infinityou_id_image, infinityou_guidance)
+            return pipe.infinityou_processor.prepare_infinite_you(pipe.image_proj_model, infinityou_id_image, infinityou_guidance, pipe.device)
        else:
            return {}



-class InfinitYou:
+class FluxImageUnit_ValueControl(PipelineUnit):
+    """Append value-controller embeddings to the prompt embedding and its text ids."""
+    def __init__(self):
+        emb_params = {"prompt_emb": "prompt_emb", "text_ids": "text_ids"}
+        super().__init__(
+            seperate_cfg=True,
+            input_params_posi=dict(emb_params), input_params_nega=dict(emb_params),
+            input_params=("value_controller_inputs",),
+            onload_model_names=("value_controller",)
+        )
+
+    def add_to_text_embedding(self, prompt_emb, text_ids, value_emb):
+        # The appended tokens get all-zero ids created with value_emb's device and dtype.
+        merged_emb = torch.concat([prompt_emb, value_emb], dim=1)
+        zero_ids = torch.zeros((value_emb.shape[0], value_emb.shape[1], 3), device=value_emb.device, dtype=value_emb.dtype)
+        return merged_emb, torch.concat([text_ids, zero_ids], dim=1)
+
+    def process(self, pipe: FluxImagePipeline, prompt_emb, text_ids, value_controller_inputs):
+        if value_controller_inputs is None:
+            return {}
+        # Anything that is not already a list is wrapped into a one-element list.
+        values = value_controller_inputs if isinstance(value_controller_inputs, list) else [value_controller_inputs]
+        values = torch.tensor(values).to(dtype=pipe.torch_dtype, device=pipe.device)
+        pipe.load_models_to_device(["value_controller"])
+        value_emb = pipe.value_controller(values, pipe.torch_dtype).unsqueeze(0)
+        merged_emb, merged_ids = self.add_to_text_embedding(prompt_emb, text_ids, value_emb)
+        return {"prompt_emb": merged_emb, "text_ids": merged_ids}
+
+
+
+class InfinitYou(torch.nn.Module):
    def __init__(self, device="cuda", torch_dtype=torch.bfloat16):
+        super().__init__()
        from facexlib.recognition import init_recognition_model
        from insightface.app import FaceAnalysis
        self.device = device
@@ -779,7 +966,7 @@ class InfinitYou:
        self.app_320.prepare(ctx_id=0, det_size=(320, 320))
        self.app_160 = FaceAnalysis(name='antelopev2', root=insightface_root_path, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
        self.app_160.prepare(ctx_id=0, det_size=(160, 160))
-        self.arcface_model = init_recognition_model('arcface', device=self.device)
+        self.arcface_model = init_recognition_model('arcface', device=self.device).to(torch_dtype)

    def _detect_face(self, id_image_cv2):
        face_info = self.app_640.get(id_image_cv2)
@@ -791,16 +978,16 @@ class InfinitYou:
        face_info = self.app_160.get(id_image_cv2)
        return face_info

-    def extract_arcface_bgr_embedding(self, in_image, landmark):
+    def extract_arcface_bgr_embedding(self, in_image, landmark, device):
        from insightface.utils import face_align
        arc_face_image = face_align.norm_crop(in_image, landmark=np.array(landmark), image_size=112)
        arc_face_image = torch.from_numpy(arc_face_image).unsqueeze(0).permute(0, 3, 1, 2) / 255.
        arc_face_image = 2 * arc_face_image - 1
-        arc_face_image = arc_face_image.contiguous().to(self.device)
+        arc_face_image = arc_face_image.contiguous().to(device=device, dtype=self.torch_dtype)
        face_emb = self.arcface_model(arc_face_image)[0] # [512], normalized
        return face_emb

-    def prepare_infinite_you(self, model, id_image, infinityou_guidance):
+    def prepare_infinite_you(self, model, id_image, infinityou_guidance, device):
        import cv2
        if id_image is None:
            return {'id_emb': None}
@@ -809,12 +996,72 @@ class InfinitYou:
        if len(face_info) == 0:
            raise ValueError('No face detected in the input ID image')
        landmark = sorted(face_info, key=lambda x:(x['bbox'][2]-x['bbox'][0])*(x['bbox'][3]-x['bbox'][1]))[-1]['kps'] # only use the maximum face
-        id_emb = self.extract_arcface_bgr_embedding(id_image_cv2, landmark)
+        id_emb = self.extract_arcface_bgr_embedding(id_image_cv2, landmark, device)
        id_emb = model(id_emb.unsqueeze(0).reshape([1, -1, 512]).to(dtype=self.torch_dtype))
-        infinityou_guidance = torch.Tensor([infinityou_guidance]).to(device=self.device, dtype=self.torch_dtype)
+        infinityou_guidance = torch.Tensor([infinityou_guidance]).to(device=device, dtype=self.torch_dtype)
        return {'id_emb': id_emb, 'infinityou_guidance': infinityou_guidance}


+
+class FluxImageUnit_LoRAEncode(PipelineUnit):
+    """Encode LoRA weights with `pipe.lora_encoder` and append the result to the positive prompt embedding."""
+    def __init__(self):
+        super().__init__(
+            take_over=True, onload_model_names=("lora_encoder",)
+        )
+
+    def parse_lora_encoder_inputs(self, lora_encoder_inputs):
+        """Return a list of configs built from one item or a list, calling download_if_necessary() on each."""
+        items = lora_encoder_inputs if isinstance(lora_encoder_inputs, list) else [lora_encoder_inputs]
+        configs = []
+        for item in items:
+            # Plain strings are wrapped as ModelConfig(path=...); other items are used as given.
+            config = ModelConfig(path=item) if isinstance(item, str) else item
+            config.download_if_necessary()
+            configs.append(config)
+        return configs
+
+    def load_lora(self, lora_config, dtype, device):
+        """Load the state dict at `lora_config.path` and convert it with FluxLoRALoader."""
+        loader = FluxLoRALoader(torch_dtype=dtype, device=device)
+        state_dict = load_state_dict(lora_config.path, torch_dtype=dtype, device=device)
+        return loader.convert_state_dict(state_dict)
+
+    def lora_embedding(self, pipe, lora_encoder_inputs):
+        """Encode every parsed LoRA and concatenate the embeddings along dim 1."""
+        embeddings = []
+        for lora_config in self.parse_lora_encoder_inputs(lora_encoder_inputs):
+            lora = self.load_lora(lora_config, pipe.torch_dtype, pipe.device)
+            embeddings.append(pipe.lora_encoder(lora))
+        return torch.concat(embeddings, dim=1)
+
+    def add_to_text_embedding(self, prompt_emb, text_ids, lora_emb):
+        """Append `lora_emb` to `prompt_emb` and matching all-zero ids to `text_ids` (both along dim 1)."""
+        merged_emb = torch.concat([prompt_emb, lora_emb], dim=1)
+        zero_ids = torch.zeros((lora_emb.shape[0], lora_emb.shape[1], 3), device=lora_emb.device, dtype=lora_emb.dtype)
+        return merged_emb, torch.concat([text_ids, zero_ids], dim=1)
+
+    def process(self, pipe: FluxImagePipeline, inputs_shared, inputs_posi, inputs_nega):
+        """Extend the positive prompt embedding with LoRA embeddings; no-op without `lora_encoder_inputs`."""
+        lora_encoder_inputs = inputs_shared.get("lora_encoder_inputs", None)
+        if lora_encoder_inputs is None:
+            return inputs_shared, inputs_posi, inputs_nega
+
+        # Encode
+        pipe.load_models_to_device(["lora_encoder"])
+        lora_emb = self.lora_embedding(pipe, lora_encoder_inputs)
+
+        # Scale (skipped when the scale is None)
+        scale = inputs_shared.get("lora_encoder_scale", None)
+        lora_emb = lora_emb if scale is None else lora_emb * scale
+
+        # Add to prompt embedding (only the positive branch is modified)
+        merged_emb, merged_ids = self.add_to_text_embedding(inputs_posi["prompt_emb"], inputs_posi["text_ids"], lora_emb)
+        inputs_posi["prompt_emb"], inputs_posi["text_ids"] = merged_emb, merged_ids
+        return inputs_shared, inputs_posi, inputs_nega
+
+
+
 class TeaCache:
    def __init__(self, num_inference_steps, rel_l1_thresh):
        self.num_inference_steps = num_inference_steps
@@ -984,6 +1231,7 @@ def model_fn_flux_image(
        
    hidden_states = dit.x_embedder(hidden_states)

+    # EliGen
    if entity_prompt_emb is not None and entity_masks is not None:
        prompt_emb, image_rotary_emb, attention_mask = dit.process_entity_masks(hidden_states, prompt_emb, entity_prompt_emb, entity_masks, text_ids, image_ids)
    else:
--- a/diffsynth/pipelines/wan_video_new.py
+++ b/diffsynth/pipelines/wan_video_new.py
@@ -12,6 +12,7 @@ from tqdm import tqdm
 from typing import Optional
 from typing_extensions import Literal

+from ..utils import BasePipeline, ModelConfig, PipelineUnit, PipelineUnitRunner
 from ..models import ModelManager, load_state_dict
 from ..models.wan_video_dit import WanModel, RMSNorm, sinusoidal_embedding_1d
 from ..models.wan_video_text_encoder import WanTextEncoder, T5RelativeEmbedding, T5LayerNorm
@@ -26,194 +27,6 @@ from ..lora import GeneralLoRALoader



-class BasePipeline(torch.nn.Module):
-
-    def __init__(
-        self,
-        device="cuda", torch_dtype=torch.float16,
-        height_division_factor=64, width_division_factor=64,
-        time_division_factor=None, time_division_remainder=None,
-    ):
-        super().__init__()
-        # The device and torch_dtype is used for the storage of intermediate variables, not models.
-        self.device = device
-        self.torch_dtype = torch_dtype
-        # The following parameters are used for shape check.
-        self.height_division_factor = height_division_factor
-        self.width_division_factor = width_division_factor
-        self.time_division_factor = time_division_factor
-        self.time_division_remainder = time_division_remainder
-        self.vram_management_enabled = False
-        
-        
-    def to(self, *args, **kwargs):
-        device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
-        if device is not None:
-            self.device = device
-        if dtype is not None:
-            self.torch_dtype = dtype
-        super().to(*args, **kwargs)
-        return self
-
-
-    def check_resize_height_width(self, height, width, num_frames=None):
-        # Shape check
-        if height % self.height_division_factor != 0:
-            height = (height + self.height_division_factor - 1) // self.height_division_factor * self.height_division_factor
-            print(f"height % {self.height_division_factor} != 0. We round it up to {height}.")
-        if width % self.width_division_factor != 0:
-            width = (width + self.width_division_factor - 1) // self.width_division_factor * self.width_division_factor
-            print(f"width % {self.width_division_factor} != 0. We round it up to {width}.")
-        if num_frames is None:
-            return height, width
-        else:
-            if num_frames % self.time_division_factor != self.time_division_remainder:
-                num_frames = (num_frames + self.time_division_factor - 1) // self.time_division_factor * self.time_division_factor + self.time_division_remainder
-                print(f"num_frames % {self.time_division_factor} != {self.time_division_remainder}. We round it up to {num_frames}.")
-            return height, width, num_frames
-
-
-    def preprocess_image(self, image, torch_dtype=None, device=None, pattern="B C H W", min_value=-1, max_value=1):
-        # Transform a PIL.Image to torch.Tensor
-        image = torch.Tensor(np.array(image, dtype=np.float32))
-        image = image.to(dtype=torch_dtype or self.torch_dtype, device=device or self.device)
-        image = image * ((max_value - min_value) / 255) + min_value
-        image = repeat(image, f"H W C -> {pattern}", **({"B": 1} if "B" in pattern else {}))
-        return image
-
-
-    def preprocess_video(self, video, torch_dtype=None, device=None, pattern="B C T H W", min_value=-1, max_value=1):
-        # Transform a list of PIL.Image to torch.Tensor
-        video = [self.preprocess_image(image, torch_dtype=torch_dtype, device=device, min_value=min_value, max_value=max_value) for image in video]
-        video = torch.stack(video, dim=pattern.index("T") // 2)
-        return video
-
-
-    def vae_output_to_image(self, vae_output, pattern="B C H W", min_value=-1, max_value=1):
-        # Transform a torch.Tensor to PIL.Image
-        if pattern != "H W C":
-            vae_output = reduce(vae_output, f"{pattern} -> H W C", reduction="mean")
-        image = ((vae_output - min_value) * (255 / (max_value - min_value))).clip(0, 255)
-        image = image.to(device="cpu", dtype=torch.uint8)
-        image = Image.fromarray(image.numpy())
-        return image
-
-
-    def vae_output_to_video(self, vae_output, pattern="B C T H W", min_value=-1, max_value=1):
-        # Transform a torch.Tensor to list of PIL.Image
-        if pattern != "T H W C":
-            vae_output = reduce(vae_output, f"{pattern} -> T H W C", reduction="mean")
-        video = [self.vae_output_to_image(image, pattern="H W C", min_value=min_value, max_value=max_value) for image in vae_output]
-        return video
-
-
-    def load_models_to_device(self, model_names=[]):
-        if self.vram_management_enabled:
-            # offload models
-            for name, model in self.named_children():
-                if name not in model_names:
-                    if hasattr(model, "vram_management_enabled") and model.vram_management_enabled:
-                        for module in model.modules():
-                            if hasattr(module, "offload"):
-                                module.offload()
-                    else:
-                        model.cpu()
-            torch.cuda.empty_cache()
-            # onload models
-            for name, model in self.named_children():
-                if name in model_names:
-                    if hasattr(model, "vram_management_enabled") and model.vram_management_enabled:
-                        for module in model.modules():
-                            if hasattr(module, "onload"):
-                                module.onload()
-                    else:
-                        model.to(self.device)
-
-
-    def generate_noise(self, shape, seed=None, rand_device="cpu", rand_torch_dtype=torch.float32, device=None, torch_dtype=None):
-        # Initialize Gaussian noise
-        generator = None if seed is None else torch.Generator(rand_device).manual_seed(seed)
-        noise = torch.randn(shape, generator=generator, device=rand_device, dtype=rand_torch_dtype)
-        noise = noise.to(dtype=torch_dtype or self.torch_dtype, device=device or self.device)
-        return noise
-
-
-    def enable_cpu_offload(self):
-        warnings.warn("`enable_cpu_offload` will be deprecated. Please use `enable_vram_management`.")
-        self.vram_management_enabled = True
-        
-        
-    def get_vram(self):
-        return torch.cuda.mem_get_info(self.device)[1] / (1024 ** 3)
-    
-    
-    def freeze_except(self, model_names):
-        for name, model in self.named_children():
-            if name in model_names:
-                model.train()
-                model.requires_grad_(True)
-            else:
-                model.eval()
-                model.requires_grad_(False)
-
-
-@dataclass
-class ModelConfig:
-    path: Union[str, list[str]] = None
-    model_id: str = None
-    origin_file_pattern: Union[str, list[str]] = None
-    download_resource: str = "ModelScope"
-    offload_device: Optional[Union[str, torch.device]] = None
-    offload_dtype: Optional[torch.dtype] = None
-
-    def download_if_necessary(self, local_model_path="./models", skip_download=False, use_usp=False):
-        if self.path is None:
-            # Check model_id and origin_file_pattern
-            if self.model_id is None:
-                raise ValueError(f"""No valid model files. Please use `ModelConfig(path="xxx")` or `ModelConfig(model_id="xxx/yyy", origin_file_pattern="zzz")`.""")
-            
-            # Skip if not in rank 0
-            if use_usp:
-                import torch.distributed as dist
-                skip_download = dist.get_rank() != 0
-                
-            # Check whether the origin path is a folder
-            if self.origin_file_pattern is None or self.origin_file_pattern == "":
-                self.origin_file_pattern = ""
-                allow_file_pattern = None
-                is_folder = True
-            elif isinstance(self.origin_file_pattern, str) and self.origin_file_pattern.endswith("/"):
-                allow_file_pattern = self.origin_file_pattern + "*"
-                is_folder = True
-            else:
-                allow_file_pattern = self.origin_file_pattern
-                is_folder = False
-            
-            # Download
-            if not skip_download:
-                downloaded_files = glob.glob(self.origin_file_pattern, root_dir=os.path.join(local_model_path, self.model_id))
-                snapshot_download(
-                    self.model_id,
-                    local_dir=os.path.join(local_model_path, self.model_id),
-                    allow_file_pattern=allow_file_pattern,
-                    ignore_file_pattern=downloaded_files,
-                    local_files_only=False
-                )
-            
-            # Let rank 1, 2, ... wait for rank 0
-            if use_usp:
-                import torch.distributed as dist
-                dist.barrier(device_ids=[dist.get_rank()])
-                
-            # Return downloaded files
-            if is_folder:
-                self.path = os.path.join(local_model_path, self.model_id, self.origin_file_pattern)
-            else:
-                self.path = glob.glob(os.path.join(local_model_path, self.model_id, self.origin_file_pattern))
-            if isinstance(self.path, list) and len(self.path) == 1:
-                self.path = self.path[0]
-
-
 class WanVideoPipeline(BasePipeline):

    def __init__(self, device="cuda", torch_dtype=torch.bfloat16, tokenizer_path=None):
@@ -226,17 +39,21 @@ class WanVideoPipeline(BasePipeline):
        self.text_encoder: WanTextEncoder = None
        self.image_encoder: WanImageEncoder = None
        self.dit: WanModel = None
+        self.dit2: WanModel = None
        self.vae: WanVideoVAE = None
        self.motion_controller: WanMotionControllerModel = None
        self.vace: VaceWanModel = None
        self.in_iteration_models = ("dit", "motion_controller", "vace")
+        self.in_iteration_models_2 = ("dit2", "motion_controller", "vace")
        self.unit_runner = PipelineUnitRunner()
        self.units = [
            WanVideoUnit_ShapeChecker(),
            WanVideoUnit_NoiseInitializer(),
            WanVideoUnit_InputVideoEmbedder(),
            WanVideoUnit_PromptEmbedder(),
-            WanVideoUnit_ImageEmbedder(),
+            WanVideoUnit_ImageEmbedderVAE(),
+            WanVideoUnit_ImageEmbedderCLIP(),
+            WanVideoUnit_ImageEmbedderFused(),
            WanVideoUnit_FunControl(),
            WanVideoUnit_FunReference(),
            WanVideoUnit_FunCameraControl(),
@@ -256,7 +73,9 @@ class WanVideoPipeline(BasePipeline):

        
    def training_loss(self, **inputs):
-        timestep_id = torch.randint(0, self.scheduler.num_train_timesteps, (1,))
+        max_timestep_boundary = int(inputs.get("max_timestep_boundary", 1) * self.scheduler.num_train_timesteps)
+        min_timestep_boundary = int(inputs.get("min_timestep_boundary", 0) * self.scheduler.num_train_timesteps)
+        timestep_id = torch.randint(min_timestep_boundary, max_timestep_boundary, (1,))
        timestep = self.scheduler.timesteps[timestep_id].to(dtype=self.torch_dtype, device=self.device)
        
        inputs["latents"] = self.scheduler.add_noise(inputs["input_latents"], inputs["noise"], timestep)
@@ -328,6 +147,37 @@ class WanVideoPipeline(BasePipeline):
                ),
                vram_limit=vram_limit,
            )
+        if self.dit2 is not None:
+            dtype = next(iter(self.dit2.parameters())).dtype
+            device = "cpu" if vram_limit is not None else self.device
+            enable_vram_management(
+                self.dit2,
+                module_map = {
+                    torch.nn.Linear: AutoWrappedLinear,
+                    torch.nn.Conv3d: AutoWrappedModule,
+                    torch.nn.LayerNorm: WanAutoCastLayerNorm,
+                    RMSNorm: AutoWrappedModule,
+                    torch.nn.Conv2d: AutoWrappedModule,
+                },
+                module_config = dict(
+                    offload_dtype=dtype,
+                    offload_device="cpu",
+                    onload_dtype=dtype,
+                    onload_device=device,
+                    computation_dtype=self.torch_dtype,
+                    computation_device=self.device,
+                ),
+                max_num_param=num_persistent_param_in_dit,
+                overflow_module_config = dict(
+                    offload_dtype=dtype,
+                    offload_device="cpu",
+                    onload_dtype=dtype,
+                    onload_device="cpu",
+                    computation_dtype=self.torch_dtype,
+                    computation_device=self.device,
+                ),
+                vram_limit=vram_limit,
+            )
        if self.vae is not None:
            dtype = next(iter(self.vae.parameters())).dtype
            enable_vram_management(
@@ -426,6 +276,10 @@ class WanVideoPipeline(BasePipeline):
        for block in self.dit.blocks:
            block.self_attn.forward = types.MethodType(usp_attn_forward, block.self_attn)
        self.dit.forward = types.MethodType(usp_dit_forward, self.dit)
+        if self.dit2 is not None:
+            for block in self.dit2.blocks:
+                block.self_attn.forward = types.MethodType(usp_attn_forward, block.self_attn)
+            self.dit2.forward = types.MethodType(usp_dit_forward, self.dit2)
        self.sp_size = get_sequence_parallel_world_size()
        self.use_unified_sequence_parallel = True

@@ -436,8 +290,6 @@ class WanVideoPipeline(BasePipeline):
        device: Union[str, torch.device] = "cuda",
        model_configs: list[ModelConfig] = [],
        tokenizer_config: ModelConfig = ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="google/*"),
-        local_model_path: str = "./models",
-        skip_download: bool = False,
        redirect_common_files: bool = True,
        use_usp=False,
    ):
@@ -462,7 +314,7 @@ class WanVideoPipeline(BasePipeline):
        # Download and load models
        model_manager = ModelManager()
        for model_config in model_configs:
-            model_config.download_if_necessary(local_model_path, skip_download=skip_download, use_usp=use_usp)
+            model_config.download_if_necessary(use_usp=use_usp)
            model_manager.load_model(
                model_config.path,
                device=model_config.offload_device or device,
@@ -471,14 +323,23 @@ class WanVideoPipeline(BasePipeline):
        
        # Load models
        pipe.text_encoder = model_manager.fetch_model("wan_video_text_encoder")
-        pipe.dit = model_manager.fetch_model("wan_video_dit")
+        dit = model_manager.fetch_model("wan_video_dit", index=2)
+        if isinstance(dit, list):
+            pipe.dit, pipe.dit2 = dit
+        else:
+            pipe.dit = dit
        pipe.vae = model_manager.fetch_model("wan_video_vae")
        pipe.image_encoder = model_manager.fetch_model("wan_video_image_encoder")
        pipe.motion_controller = model_manager.fetch_model("wan_video_motion_controller")
        pipe.vace = model_manager.fetch_model("wan_video_vace")
+        
+        # Size division factor
+        if pipe.vae is not None:
+            pipe.height_division_factor = pipe.vae.upsampling_factor * 2
+            pipe.width_division_factor = pipe.vae.upsampling_factor * 2

        # Initialize tokenizer
-        tokenizer_config.download_if_necessary(local_model_path, skip_download=skip_download)
+        tokenizer_config.download_if_necessary(use_usp=use_usp)
        pipe.prompter.fetch_models(pipe.text_encoder)
        pipe.prompter.fetch_tokenizer(tokenizer_config.path)
        
@@ -522,6 +383,8 @@ class WanVideoPipeline(BasePipeline):
        # Classifier-free guidance
        cfg_scale: Optional[float] = 5.0,
        cfg_merge: Optional[bool] = False,
+        # Boundary
+        switch_DiT_boundary: Optional[float] = 0.875,
        # Scheduler
        num_inference_steps: Optional[int] = 50,
        sigma_shift: Optional[float] = 5.0,
@@ -574,8 +437,14 @@ class WanVideoPipeline(BasePipeline):
        self.load_models_to_device(self.in_iteration_models)
        models = {name: getattr(self, name) for name in self.in_iteration_models}
        for progress_id, timestep in enumerate(progress_bar_cmd(self.scheduler.timesteps)):
+            # Switch DiT if necessary
+            if timestep.item() < switch_DiT_boundary * self.scheduler.num_train_timesteps and self.dit2 is not None and not models["dit"] is self.dit2:
+                self.load_models_to_device(self.in_iteration_models_2)
+                models["dit"] = self.dit2
+                
+            # Timestep
            timestep = timestep.unsqueeze(0).to(dtype=self.torch_dtype, device=self.device)
-
+            
            # Inference
            noise_pred_posi = self.model_fn(**models, **inputs_shared, **inputs_posi, timestep=timestep)
            if cfg_scale != 1.0:
@@ -589,6 +458,8 @@ class WanVideoPipeline(BasePipeline):

            # Scheduler
            inputs_shared["latents"] = self.scheduler.step(noise_pred, self.scheduler.timesteps[progress_id], inputs_shared["latents"])
+            if "first_frame_latents" in inputs_shared:
+                inputs_shared["latents"][:, :, 0:1] = inputs_shared["first_frame_latents"]
        
        # VACE (TODO: remove it)
        if vace_reference_image is not None:
@@ -604,63 +475,6 @@ class WanVideoPipeline(BasePipeline):



-class PipelineUnit:
-    def __init__(
-        self,
-        seperate_cfg: bool = False,
-        take_over: bool = False,
-        input_params: tuple[str] = None,
-        input_params_posi: dict[str, str] = None,
-        input_params_nega: dict[str, str] = None,
-        onload_model_names: tuple[str] = None
-    ):
-        self.seperate_cfg = seperate_cfg
-        self.take_over = take_over
-        self.input_params = input_params
-        self.input_params_posi = input_params_posi
-        self.input_params_nega = input_params_nega
-        self.onload_model_names = onload_model_names
-
-
-    def process(self, pipe: WanVideoPipeline, inputs: dict, positive=True, **kwargs) -> dict:
-        raise NotImplementedError("`process` is not implemented.")
-
-
-
-class PipelineUnitRunner:
-    def __init__(self):
-        pass
-
-    def __call__(self, unit: PipelineUnit, pipe: WanVideoPipeline, inputs_shared: dict, inputs_posi: dict, inputs_nega: dict) -> tuple[dict, dict]:
-        if unit.take_over:
-            # Let the pipeline unit take over this function.
-            inputs_shared, inputs_posi, inputs_nega = unit.process(pipe, inputs_shared=inputs_shared, inputs_posi=inputs_posi, inputs_nega=inputs_nega)
-        elif unit.seperate_cfg:
-            # Positive side
-            processor_inputs = {name: inputs_posi.get(name_) for name, name_ in unit.input_params_posi.items()}
-            if unit.input_params is not None:
-                for name in unit.input_params:
-                    processor_inputs[name] = inputs_shared.get(name)
-            processor_outputs = unit.process(pipe, **processor_inputs)
-            inputs_posi.update(processor_outputs)
-            # Negative side
-            if inputs_shared["cfg_scale"] != 1:
-                processor_inputs = {name: inputs_nega.get(name_) for name, name_ in unit.input_params_nega.items()}
-                if unit.input_params is not None:
-                    for name in unit.input_params:
-                        processor_inputs[name] = inputs_shared.get(name)
-                processor_outputs = unit.process(pipe, **processor_inputs)
-                inputs_nega.update(processor_outputs)
-            else:
-                inputs_nega.update(processor_outputs)
-        else:
-            processor_inputs = {name: inputs_shared.get(name) for name in unit.input_params}
-            processor_outputs = unit.process(pipe, **processor_inputs)
-            inputs_shared.update(processor_outputs)
-        return inputs_shared, inputs_posi, inputs_nega
-
-
-
 class WanVideoUnit_ShapeChecker(PipelineUnit):
    def __init__(self):
        super().__init__(input_params=("height", "width", "num_frames"))
@@ -679,7 +493,8 @@ class WanVideoUnit_NoiseInitializer(PipelineUnit):
        length = (num_frames - 1) // 4 + 1
        if vace_reference_image is not None:
            length += 1
-        noise = pipe.generate_noise((1, 16, length, height//8, width//8), seed=seed, rand_device=rand_device)
+        shape = (1, pipe.vae.model.z_dim, length, height // pipe.vae.upsampling_factor, width // pipe.vae.upsampling_factor)
+        noise = pipe.generate_noise(shape, seed=seed, rand_device=rand_device)
        if vace_reference_image is not None:
            noise = torch.concat((noise[:, :, -1:], noise[:, :, :-1]), dim=2)
        return {"noise": noise}
@@ -728,6 +543,9 @@ class WanVideoUnit_PromptEmbedder(PipelineUnit):


 class WanVideoUnit_ImageEmbedder(PipelineUnit):
+    """
+    Deprecated
+    """
    def __init__(self):
        super().__init__(
            input_params=("input_image", "end_image", "num_frames", "height", "width", "tiled", "tile_size", "tile_stride"),
@@ -735,7 +553,7 @@ class WanVideoUnit_ImageEmbedder(PipelineUnit):
        )

    def process(self, pipe: WanVideoPipeline, input_image, end_image, num_frames, height, width, tiled, tile_size, tile_stride):
-        if input_image is None:
+        if input_image is None or pipe.image_encoder is None:
            return {}
        pipe.load_models_to_device(self.onload_model_names)
        image = pipe.preprocess_image(input_image.resize((width, height))).to(pipe.device)
@@ -763,13 +581,90 @@ class WanVideoUnit_ImageEmbedder(PipelineUnit):
        y = y.to(dtype=pipe.torch_dtype, device=pipe.device)
        return {"clip_feature": clip_context, "y": y}

+
+
+class WanVideoUnit_ImageEmbedderCLIP(PipelineUnit):
+    def __init__(self):
+        super().__init__(
+            input_params=("input_image", "end_image", "height", "width"),
+            onload_model_names=("image_encoder",)
+        )
+    # Produce "clip_feature" from the image encoder; returns {} when no image/encoder is given or the DiT does not need it.
+    def process(self, pipe: WanVideoPipeline, input_image, end_image, height, width):
+        if input_image is None or pipe.image_encoder is None or not pipe.dit.require_clip_embedding:
+            return {}
+        pipe.load_models_to_device(self.onload_model_names)
+        image = pipe.preprocess_image(input_image.resize((width, height))).to(pipe.device)
+        clip_context = pipe.image_encoder.encode_image([image])
+        if end_image is not None:
+            end_image = pipe.preprocess_image(end_image.resize((width, height))).to(pipe.device)
+            if pipe.dit.has_image_pos_emb:
+                clip_context = torch.concat([clip_context, pipe.image_encoder.encode_image([end_image])], dim=1)
+        clip_context = clip_context.to(dtype=pipe.torch_dtype, device=pipe.device)
+        return {"clip_feature": clip_context}
+    
+
+
+class WanVideoUnit_ImageEmbedderVAE(PipelineUnit):
+    def __init__(self):
+        super().__init__(
+            input_params=("input_image", "end_image", "num_frames", "height", "width", "tiled", "tile_size", "tile_stride"),
+            onload_model_names=("vae",)
+        )
+    # Build "y": a frame mask (1 for the given first/end frame, 0 elsewhere) stacked on the VAE encoding of those frames padded with zero frames.
+    def process(self, pipe: WanVideoPipeline, input_image, end_image, num_frames, height, width, tiled, tile_size, tile_stride):
+        if input_image is None or not pipe.dit.require_vae_embedding:
+            return {}
+        pipe.load_models_to_device(self.onload_model_names)
+        image = pipe.preprocess_image(input_image.resize((width, height))).to(pipe.device)
+        msk = torch.ones(1, num_frames, height//8, width//8, device=pipe.device)
+        msk[:, 1:] = 0
+        if end_image is not None:
+            end_image = pipe.preprocess_image(end_image.resize((width, height))).to(pipe.device)
+            vae_input = torch.concat([image.transpose(0,1), torch.zeros(3, num_frames-2, height, width).to(image.device), end_image.transpose(0,1)],dim=1)
+            msk[:, -1:] = 1
+        else:
+            vae_input = torch.concat([image.transpose(0, 1), torch.zeros(3, num_frames-1, height, width).to(image.device)], dim=1)
+        # Repeat the first-frame mask 4 times, then fold the frame axis into groups of 4 and move the group axis to the front.
+        msk = torch.concat([torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1)
+        msk = msk.view(1, msk.shape[1] // 4, 4, height//8, width//8)
+        msk = msk.transpose(1, 2)[0]
         
- 
+        y = pipe.vae.encode([vae_input.to(dtype=pipe.torch_dtype, device=pipe.device)], device=pipe.device, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride)[0]
+        y = y.to(dtype=pipe.torch_dtype, device=pipe.device)
+        y = torch.concat([msk, y])
+        y = y.unsqueeze(0)
+        y = y.to(dtype=pipe.torch_dtype, device=pipe.device)
+        return {"y": y}
+
+
+
+class WanVideoUnit_ImageEmbedderFused(PipelineUnit):
+    """
+    Encode the input image with the VAE and write the result into the first latent frame. This unit is for Wan-AI/Wan2.2-TI2V-5B.
+    """
+    def __init__(self):
+        super().__init__(
+            input_params=("input_image", "latents", "height", "width", "tiled", "tile_size", "tile_stride"),
+            onload_model_names=("vae",)
+        )
+    # Returns the latents (modified in place) plus "first_frame_latents"; returns {} when no image is given or the DiT does not fuse VAE embeddings.
+    def process(self, pipe: WanVideoPipeline, input_image, latents, height, width, tiled, tile_size, tile_stride):
+        if input_image is None or not pipe.dit.fuse_vae_embedding_in_latents:
+            return {}
+        pipe.load_models_to_device(self.onload_model_names)
+        image = pipe.preprocess_image(input_image.resize((width, height))).transpose(0, 1)
+        z = pipe.vae.encode([image], device=pipe.device, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride)
+        latents[:, :, 0: 1] = z
+        return {"latents": latents, "fuse_vae_embedding_in_latents": True, "first_frame_latents": z}
+
+
+
 class WanVideoUnit_FunControl(PipelineUnit):
    def __init__(self):
        super().__init__(
            input_params=("control_video", "num_frames", "height", "width", "tiled", "tile_size", "tile_stride", "clip_feature", "y"),
-            onload_model_names=("vae")
+            onload_model_names=("vae",)
        )

    def process(self, pipe: WanVideoPipeline, control_video, num_frames, height, width, tiled, tile_size, tile_stride, clip_feature, y):
@@ -793,7 +688,7 @@ class WanVideoUnit_FunReference(PipelineUnit):
    def __init__(self):
        super().__init__(
            input_params=("reference_image", "height", "width", "reference_image"),
-            onload_model_names=("vae")
+            onload_model_names=("vae",)
        )

    def process(self, pipe: WanVideoPipeline, reference_image, height, width):
@@ -812,7 +707,8 @@ class WanVideoUnit_FunReference(PipelineUnit):
 class WanVideoUnit_FunCameraControl(PipelineUnit):
    def __init__(self):
        super().__init__(
-            input_params=("height", "width", "num_frames", "camera_control_direction", "camera_control_speed", "camera_control_origin", "latents", "input_image")
+            input_params=("height", "width", "num_frames", "camera_control_direction", "camera_control_speed", "camera_control_origin", "latents", "input_image"),
+            onload_model_names=("vae",)
        )

    def process(self, pipe: WanVideoPipeline, height, width, num_frames, camera_control_direction, camera_control_speed, camera_control_origin, latents, input_image):
@@ -835,6 +731,7 @@ class WanVideoUnit_FunCameraControl(PipelineUnit):

        input_image = input_image.resize((width, height))
        input_latents = pipe.preprocess_video([input_image])
+        pipe.load_models_to_device(self.onload_model_names)
        input_latents = pipe.vae.encode(input_latents, device=pipe.device)
        y = torch.zeros_like(latents).to(pipe.device)
        y[:, :, :1] = input_latents
@@ -1014,10 +911,14 @@ class TemporalTiler_BCTHW:

    def build_1d_mask(self, length, left_bound, right_bound, border_width):
        x = torch.ones((length,))
+        if border_width == 0:
+            return x
+        
+        shift = 0.5
        if not left_bound:
-            x[:border_width] = (torch.arange(border_width) + 1) / border_width
+            x[:border_width] = (torch.arange(border_width) + shift) / border_width
        if not right_bound:
-            x[-border_width:] = torch.flip((torch.arange(border_width) + 1) / border_width, dims=(0,))
+            x[-border_width:] = torch.flip((torch.arange(border_width) + shift) / border_width, dims=(0,))
        return x

    def build_mask(self, data, is_bound, border_width):
@@ -1078,6 +979,7 @@ def model_fn_wan_video(
    use_gradient_checkpointing: bool = False,
    use_gradient_checkpointing_offload: bool = False,
    control_camera_latents_input = None,
+    fuse_vae_embedding_in_latents: bool = False,
    **kwargs,
 ):
    if sliding_window_size is not None and sliding_window_stride is not None:
@@ -1111,9 +1013,20 @@ def model_fn_wan_video(
        from xfuser.core.distributed import (get_sequence_parallel_rank,
                                            get_sequence_parallel_world_size,
                                            get_sp_group)
+
+    # Timestep
+    if dit.seperated_timestep and fuse_vae_embedding_in_latents:
+        timestep = torch.concat([
+            torch.zeros((1, latents.shape[3] * latents.shape[4] // 4), dtype=latents.dtype, device=latents.device),
+            torch.ones((latents.shape[2] - 1, latents.shape[3] * latents.shape[4] // 4), dtype=latents.dtype, device=latents.device) * timestep
+        ]).flatten()
+        t = dit.time_embedding(sinusoidal_embedding_1d(dit.freq_dim, timestep).unsqueeze(0))
+        t_mod = dit.time_projection(t).unflatten(2, (6, dit.dim))
+    else:
+        t = dit.time_embedding(sinusoidal_embedding_1d(dit.freq_dim, timestep))
+        t_mod = dit.time_projection(t).unflatten(1, (6, dit.dim))
    
-    t = dit.time_embedding(sinusoidal_embedding_1d(dit.freq_dim, timestep))
-    t_mod = dit.time_projection(t).unflatten(1, (6, dit.dim))
+    # Motion Controller
    if motion_bucket_id is not None and motion_controller is not None:
        t_mod = t_mod + motion_controller(motion_bucket_id).unflatten(1, (6, dit.dim))
    context = dit.text_embedding(context)
@@ -1124,15 +1037,16 @@ def model_fn_wan_video(
        x = torch.concat([x] * context.shape[0], dim=0)
    if timestep.shape[0] != context.shape[0]:
        timestep = torch.concat([timestep] * context.shape[0], dim=0)
-    
-    if dit.has_image_input:
-        x = torch.cat([x, y], dim=1)  # (b, c_x + c_y, f, h, w)
+
+    # Image Embedding
+    if y is not None and dit.require_vae_embedding:
+        x = torch.cat([x, y], dim=1)
+    if clip_feature is not None and dit.require_clip_embedding:
        clip_embdding = dit.img_emb(clip_feature)
        context = torch.cat([clip_embdding, context], dim=1)
    
    # Add camera control
    x, (f, h, w) = dit.patchify(x, control_camera_latents_input)
-
    
    # Reference image
    if reference_latents is not None:
--- a/diffsynth/trainers/utils.py
+++ b/diffsynth/trainers/utils.py
@@ -120,8 +120,12 @@ class ImageDataset(torch.utils.data.Dataset):
        data = self.data[data_id % len(self.data)].copy()
        for key in self.data_file_keys:
            if key in data:
-                path = os.path.join(self.base_path, data[key])
-                data[key] = self.load_data(path)
+                if isinstance(data[key], list):
+                    path = [os.path.join(self.base_path, p) for p in data[key]]
+                    data[key] = [self.load_data(p) for p in path]
+                else:
+                    path = os.path.join(self.base_path, data[key])
+                    data[key] = self.load_data(path)
                if data[key] is None:
                    warnings.warn(f"cannot load file {data[key]}.")
                    return None
@@ -434,6 +438,8 @@ def wan_parser():
    parser.add_argument("--extra_inputs", default=None, help="Additional model inputs, comma-separated.")
    parser.add_argument("--use_gradient_checkpointing_offload", default=False, action="store_true", help="Whether to offload gradient checkpointing to CPU memory.")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Gradient accumulation steps.")
+    parser.add_argument("--max_timestep_boundary", type=float, default=1.0, help="Max timestep boundary (for mixed models, e.g., Wan-AI/Wan2.2-I2V-A14B).")
+    parser.add_argument("--min_timestep_boundary", type=float, default=0.0, help="Min timestep boundary (for mixed models, e.g., Wan-AI/Wan2.2-I2V-A14B).")
    return parser


--- a/diffsynth/utils/init.py
+++ b/diffsynth/utils/init.py
@@ -0,0 +1,261 @@
+import torch, warnings, glob, os
+import numpy as np
+from PIL import Image
+from einops import repeat, reduce
+from typing import Optional, Union
+from dataclasses import dataclass
+from modelscope import snapshot_download
+import numpy as np
+from PIL import Image
+from typing import Optional
+
+
+class BasePipeline(torch.nn.Module):
+    """Shared pipeline base: stores device/dtype and shape-check factors, and provides image/video conversion, noise and model loading helpers."""
+    def __init__(
+        self,
+        device="cuda", torch_dtype=torch.float16,
+        height_division_factor=64, width_division_factor=64,
+        time_division_factor=None, time_division_remainder=None,
+    ):
+        super().__init__()
+        # The device and torch_dtype are used for the storage of intermediate variables, not models.
+        self.device = device
+        self.torch_dtype = torch_dtype
+        # The following parameters are used for shape check.
+        self.height_division_factor = height_division_factor
+        self.width_division_factor = width_division_factor
+        self.time_division_factor = time_division_factor
+        self.time_division_remainder = time_division_remainder
+        self.vram_management_enabled = False
+        
+    # Record the requested device/dtype on the pipeline itself, then forward the call to `torch.nn.Module.to`.
+    def to(self, *args, **kwargs):
+        device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
+        if device is not None:
+            self.device = device
+        if dtype is not None:
+            self.torch_dtype = dtype
+        super().to(*args, **kwargs)
+        return self
+
+    # Round height/width (and num_frames, if given) up to the configured division factors; prints a notice when a value changes.
+    def check_resize_height_width(self, height, width, num_frames=None):
+        # Shape check. NOTE: `time_division_factor` and `time_division_remainder` must be set before passing `num_frames`.
+        if height % self.height_division_factor != 0:
+            height = (height + self.height_division_factor - 1) // self.height_division_factor * self.height_division_factor
+            print(f"height % {self.height_division_factor} != 0. We round it up to {height}.")
+        if width % self.width_division_factor != 0:
+            width = (width + self.width_division_factor - 1) // self.width_division_factor * self.width_division_factor
+            print(f"width % {self.width_division_factor} != 0. We round it up to {width}.")
+        if num_frames is None:
+            return height, width
+        else:
+            if num_frames % self.time_division_factor != self.time_division_remainder:
+                num_frames = (num_frames + self.time_division_factor - 1) // self.time_division_factor * self.time_division_factor + self.time_division_remainder
+                print(f"num_frames % {self.time_division_factor} != {self.time_division_remainder}. We round it up to {num_frames}.")
+            return height, width, num_frames
+
+
+    def preprocess_image(self, image, torch_dtype=None, device=None, pattern="B C H W", min_value=-1, max_value=1):
+        # Transform a PIL.Image to torch.Tensor, mapping pixel values [0, 255] linearly to [min_value, max_value]
+        image = torch.Tensor(np.array(image, dtype=np.float32))
+        image = image.to(dtype=torch_dtype or self.torch_dtype, device=device or self.device)
+        image = image * ((max_value - min_value) / 255) + min_value
+        image = repeat(image, f"H W C -> {pattern}", **({"B": 1} if "B" in pattern else {}))
+        return image
+
+
+    def preprocess_video(self, video, torch_dtype=None, device=None, pattern="B C T H W", min_value=-1, max_value=1):
+        # Transform a list of PIL.Image to torch.Tensor; frames are stacked at the axis where `T` appears in `pattern`
+        video = [self.preprocess_image(image, torch_dtype=torch_dtype, device=device, min_value=min_value, max_value=max_value) for image in video]
+        video = torch.stack(video, dim=pattern.index("T") // 2)
+        return video
+
+
+    def vae_output_to_image(self, vae_output, pattern="B C H W", min_value=-1, max_value=1):
+        # Transform a torch.Tensor to PIL.Image, mapping [min_value, max_value] to [0, 255] (clipped); extra axes in `pattern` are averaged out
+        if pattern != "H W C":
+            vae_output = reduce(vae_output, f"{pattern} -> H W C", reduction="mean")
+        image = ((vae_output - min_value) * (255 / (max_value - min_value))).clip(0, 255)
+        image = image.to(device="cpu", dtype=torch.uint8)
+        image = Image.fromarray(image.numpy())
+        return image
+
+
+    def vae_output_to_video(self, vae_output, pattern="B C T H W", min_value=-1, max_value=1):
+        # Transform a torch.Tensor to list of PIL.Image
+        if pattern != "T H W C":
+            vae_output = reduce(vae_output, f"{pattern} -> T H W C", reduction="mean")
+        video = [self.vae_output_to_image(image, pattern="H W C", min_value=min_value, max_value=max_value) for image in vae_output]
+        return video
+
+    # No-op unless VRAM management is enabled; otherwise offload every child model not in `model_names`, then onload the named ones.
+    def load_models_to_device(self, model_names=[]):
+        if self.vram_management_enabled:
+            # offload models
+            for name, model in self.named_children():
+                if name not in model_names:
+                    if hasattr(model, "vram_management_enabled") and model.vram_management_enabled:
+                        for module in model.modules():
+                            if hasattr(module, "offload"):
+                                module.offload()
+                    else:
+                        model.cpu()
+            torch.cuda.empty_cache()
+            # onload models
+            for name, model in self.named_children():
+                if name in model_names:
+                    if hasattr(model, "vram_management_enabled") and model.vram_management_enabled:
+                        for module in model.modules():
+                            if hasattr(module, "onload"):
+                                module.onload()
+                    else:
+                        model.to(self.device)
+
+
+    def generate_noise(self, shape, seed=None, rand_device="cpu", rand_torch_dtype=torch.float32, device=None, torch_dtype=None):
+        # Initialize Gaussian noise on `rand_device` (seeded if `seed` is given), then move it to the pipeline device/dtype
+        generator = None if seed is None else torch.Generator(rand_device).manual_seed(seed)
+        noise = torch.randn(shape, generator=generator, device=rand_device, dtype=rand_torch_dtype)
+        noise = noise.to(dtype=torch_dtype or self.torch_dtype, device=device or self.device)
+        return noise
+
+    # Legacy switch: emits a deprecation warning and only sets `vram_management_enabled`.
+    def enable_cpu_offload(self):
+        warnings.warn("`enable_cpu_offload` will be deprecated. Please use `enable_vram_management`.")
+        self.vram_management_enabled = True
+        
+    # Total memory of `self.device` in GiB (second value returned by `torch.cuda.mem_get_info`).
+    def get_vram(self):
+        return torch.cuda.mem_get_info(self.device)[1] / (1024 ** 3)
+    
+    # Put the named child models in train mode with gradients enabled; set every other child to eval mode without gradients.
+    def freeze_except(self, model_names):
+        for name, model in self.named_children():
+            if name in model_names:
+                model.train()
+                model.requires_grad_(True)
+            else:
+                model.eval()
+                model.requires_grad_(False)
+
+
+@dataclass
+class ModelConfig:
+    path: Union[str, list[str]] = None
+    model_id: str = None
+    origin_file_pattern: Union[str, list[str]] = None
+    download_resource: str = "ModelScope"
+    offload_device: Optional[Union[str, torch.device]] = None
+    offload_dtype: Optional[torch.dtype] = None
+    local_model_path: str = None
+    skip_download: bool = False
+
+    def download_if_necessary(self, use_usp=False):
+        """Resolve `self.path`, downloading the files of `model_id` first unless downloading is skipped.
+        If `use_usp` is true, only rank 0 downloads and all ranks wait on a barrier before resolving."""
+        if self.path is None:
+            # Check model_id and origin_file_pattern
+            if self.model_id is None:
+                raise ValueError(f"""No valid model files. Please use `ModelConfig(path="xxx")` or `ModelConfig(model_id="xxx/yyy", origin_file_pattern="zzz")`.""")
+
+            # `self.skip_download` disables the download on every rank; with USP, ranks other than 0 never download.
+            skip_download = self.skip_download
+            if use_usp:
+                import torch.distributed as dist
+                skip_download = skip_download or dist.get_rank() != 0
+
+            # Check whether the origin path is a folder
+            if self.origin_file_pattern is None or self.origin_file_pattern == "":
+                self.origin_file_pattern = ""
+                allow_file_pattern = None
+                is_folder = True
+            elif isinstance(self.origin_file_pattern, str) and self.origin_file_pattern.endswith("/"):
+                allow_file_pattern = self.origin_file_pattern + "*"
+                is_folder = True
+            else:
+                allow_file_pattern = self.origin_file_pattern
+                is_folder = False
+
+            # Set the default root even when the download is skipped: the path resolution below joins on it.
+            if self.local_model_path is None:
+                self.local_model_path = "./models"
+            if not skip_download:
+                downloaded_files = glob.glob(self.origin_file_pattern, root_dir=os.path.join(self.local_model_path, self.model_id))
+                snapshot_download(
+                    self.model_id,
+                    local_dir=os.path.join(self.local_model_path, self.model_id),
+                    allow_file_pattern=allow_file_pattern,
+                    ignore_file_pattern=downloaded_files,
+                    local_files_only=False
+                )
+
+            # Let rank 1, 2, ... wait for rank 0
+            if use_usp:
+                dist.barrier(device_ids=[dist.get_rank()])
+
+            # Return downloaded files
+            if is_folder:
+                self.path = os.path.join(self.local_model_path, self.model_id, self.origin_file_pattern)
+            else:
+                self.path = glob.glob(os.path.join(self.local_model_path, self.model_id, self.origin_file_pattern))
+            if isinstance(self.path, list) and len(self.path) == 1:
+                self.path = self.path[0]
+
+
+
+class PipelineUnit:
+    """Base class describing one unit that `PipelineUnitRunner` can execute.
+
+    The constructor only records its arguments; subclasses provide the work
+    by overriding `process`.
+
+    Attributes (as read by `PipelineUnitRunner`):
+        take_over: If True, `process` receives the shared / positive /
+            negative input dicts directly and returns all three.
+        seperate_cfg: If True, `process` is called for the positive inputs
+            and, depending on `cfg_scale`, for the negative inputs.
+        input_params: Names looked up in the shared inputs and passed to
+            `process` as keyword arguments.
+        input_params_posi: Maps a `process` keyword name to the key read
+            from the positive inputs.
+        input_params_nega: Same mapping for the negative inputs.
+        onload_model_names: Stored only; not read in this part of the file
+            (presumably the models this unit needs loaded — TODO confirm).
+    """
+
+    def __init__(
+        self,
+        seperate_cfg: bool = False,
+        take_over: bool = False,
+        input_params: tuple[str] = None,
+        input_params_posi: dict[str, str] = None,
+        input_params_nega: dict[str, str] = None,
+        onload_model_names: tuple[str] = None
+    ):
+        # Dispatch flags.
+        self.take_over, self.seperate_cfg = take_over, seperate_cfg
+        # Input specifications: shared names, then positive / negative mappings.
+        self.input_params = input_params
+        self.input_params_posi, self.input_params_nega = input_params_posi, input_params_nega
+        # Model names associated with this unit.
+        self.onload_model_names = onload_model_names
+
+    def process(self, pipe: BasePipeline, inputs: dict, positive=True, **kwargs) -> dict:
+        """Run the unit; the base implementation always raises.
+
+        Raises:
+            NotImplementedError: Always, until a subclass overrides this method.
+        """
+        raise NotImplementedError("`process` is not implemented.")
+
+
+
+class PipelineUnitRunner:
+    """Executes a `PipelineUnit` against the shared, positive and negative input dicts."""
+
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def _gather_inputs(unit, inputs_shared: dict, inputs_side: dict = None, side_params: dict = None) -> dict:
+        """Build the keyword arguments passed to `unit.process`.
+
+        Args:
+            unit: The unit whose `input_params` name the shared inputs to read.
+            inputs_shared: Dict the `unit.input_params` names are looked up in.
+            inputs_side: Positive or negative input dict, used with `side_params`.
+            side_params: Mapping of `process` keyword name -> key in `inputs_side`.
+                None (the `PipelineUnit` default) is treated as "no side inputs".
+
+        Returns:
+            A new dict; missing keys yield None because `dict.get` is used.
+        """
+        gathered = {}
+        if side_params is not None:
+            gathered.update({name: inputs_side.get(key) for name, key in side_params.items()})
+        # Shared inputs are written last, so they win on a name collision.
+        if unit.input_params is not None:
+            gathered.update({name: inputs_shared.get(name) for name in unit.input_params})
+        return gathered
+
+    def __call__(self, unit: PipelineUnit, pipe: BasePipeline, inputs_shared: dict, inputs_posi: dict, inputs_nega: dict) -> tuple[dict, dict, dict]:
+        """Run `unit` and merge its outputs into the input dicts.
+
+        The three dicts are updated in place (except in take-over mode, where
+        whatever `unit.process` returns is used) and returned.
+
+        Args:
+            unit: The unit to run; its `take_over` / `seperate_cfg` flags pick the mode.
+            pipe: Passed through to `unit.process` as first argument.
+            inputs_shared: Inputs common to both sides. Must contain
+                "cfg_scale" when `unit.seperate_cfg` is True.
+            inputs_posi: Positive-side inputs.
+            inputs_nega: Negative-side inputs.
+
+        Returns:
+            `(inputs_shared, inputs_posi, inputs_nega)`.
+
+        Raises:
+            KeyError: If `unit.seperate_cfg` is True and "cfg_scale" is missing
+                from `inputs_shared`.
+        """
+        if unit.take_over:
+            # Let the pipeline unit take over this function.
+            inputs_shared, inputs_posi, inputs_nega = unit.process(pipe, inputs_shared=inputs_shared, inputs_posi=inputs_posi, inputs_nega=inputs_nega)
+        elif unit.seperate_cfg:
+            # Positive side
+            processor_inputs = self._gather_inputs(unit, inputs_shared, inputs_posi, unit.input_params_posi)
+            processor_outputs = unit.process(pipe, **processor_inputs)
+            inputs_posi.update(processor_outputs)
+            # Negative side: only computed when cfg_scale != 1; otherwise the
+            # positive outputs are reused for the negative inputs.
+            if inputs_shared["cfg_scale"] != 1:
+                processor_inputs = self._gather_inputs(unit, inputs_shared, inputs_nega, unit.input_params_nega)
+                processor_outputs = unit.process(pipe, **processor_inputs)
+            inputs_nega.update(processor_outputs)
+        else:
+            processor_inputs = self._gather_inputs(unit, inputs_shared)
+            processor_outputs = unit.process(pipe, **processor_inputs)
+            inputs_shared.update(processor_outputs)
+        return inputs_shared, inputs_posi, inputs_nega