support hunyuanvideo_i2v

2026-03-18 22:08:13 +00:00 · 2025-03-11 16:20:09 +08:00
parent 945b43492e
commit 4bec2983a9
9 changed files with 327 additions and 161 deletions
--- a/diffsynth/configs/model_config.py
+++ b/diffsynth/configs/model_config.py
@@ -112,7 +112,6 @@ model_loader_configs = [
    (None, "aeb82dce778a03dcb4d726cb03f3c43f", ["hunyuan_video_vae_decoder", "hunyuan_video_vae_encoder"], [HunyuanVideoVAEDecoder, HunyuanVideoVAEEncoder], "diffusers"),
    (None, "b9588f02e78f5ccafc9d7c0294e46308", ["hunyuan_video_dit"], [HunyuanVideoDiT], "civitai"),
    (None, "84ef4bd4757f60e906b54aa6a7815dc6", ["hunyuan_video_dit"], [HunyuanVideoDiT], "civitai"),
-    (None, "ae3c22aaa28bfae6f3688f796c9814ae", ["hunyuan_video_dit"], [HunyuanVideoDiT], "civitai"),
    (None, "68beaf8429b7c11aa8ca05b1bd0058bd", ["stepvideo_vae"], [StepVideoVAE], "civitai"),
    (None, "5c0216a2132b082c10cb7a0e0377e681", ["stepvideo_dit"], [StepVideoModel], "civitai"),
    (None, "9269f8db9040a9d860eaca435be61814", ["wan_video_dit"], [WanModel], "civitai"),
--- a/diffsynth/models/hunyuan_video_dit.py
+++ b/diffsynth/models/hunyuan_video_dit.py
@@ -237,7 +237,7 @@ class IndividualTokenRefinerBlock(torch.nn.Module):
        x = x + self.mlp(self.norm2(x)) * gate_mlp.unsqueeze(1)

        return x
-    
+

 class SingleTokenRefiner(torch.nn.Module):
    def __init__(self, in_channels=4096, hidden_size=3072, depth=2):
@@ -270,7 +270,7 @@ class SingleTokenRefiner(torch.nn.Module):
            x = block(x, c, mask)

        return x
-    
+

 class ModulateDiT(torch.nn.Module):
    def __init__(self, hidden_size, factor=6):
@@ -280,9 +280,14 @@ class ModulateDiT(torch.nn.Module):

    def forward(self, x):
        return self.linear(self.act(x))
-    

-def modulate(x, shift=None, scale=None):
+
+def modulate(x, shift=None, scale=None, tr_shift=None, tr_scale=None, tr_token=None):
+    if tr_shift is not None:
+        x_zero = x[:, :tr_token] * (1 + tr_scale.unsqueeze(1)) + tr_shift.unsqueeze(1)
+        x_orig = x[:, tr_token:] * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+        x = torch.concat((x_zero, x_orig), dim=1)
+        return x
    if scale is None and shift is None:
        return x
    elif shift is None:
@@ -291,7 +296,7 @@ def modulate(x, shift=None, scale=None):
        return x + shift.unsqueeze(1)
    else:
        return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
-    
+

 def reshape_for_broadcast(
    freqs_cis,
@@ -344,7 +349,7 @@ def rotate_half(x):
        x.float().reshape(*x.shape[:-1], -1, 2).unbind(-1)
    )  # [B, S, H, D//2]
    return torch.stack([-x_imag, x_real], dim=-1).flatten(3)
-    
+

 def apply_rotary_emb(
    xq: torch.Tensor,
@@ -386,6 +391,15 @@ def attention(q, k, v):
    return x


+def apply_gate(x, gate, tr_gate=None, tr_token=None):
+    """Scale ``x`` by a gate, optionally using a separate gate for leading tokens.
+
+    Args:
+        x: Tensor indexed as ``x[:, tokens]``; the gates are unsqueezed at
+            dim 1 so that they broadcast over that token dimension.
+        gate: Gate applied to every token, or only to the tokens from
+            ``tr_token`` onward when ``tr_gate`` is given.
+        tr_gate: Optional gate applied to the first ``tr_token`` tokens.
+        tr_token: Number of leading tokens that use ``tr_gate``.
+
+    Returns:
+        The gated tensor, with the tokens in the same order as in ``x``.
+
+    Raises:
+        ValueError: If ``tr_gate`` is given without ``tr_token``.
+    """
+    if tr_gate is None:
+        return x * gate.unsqueeze(1)
+    if tr_token is None:
+        # x[:, :None] and x[:, None:] both select every token, so the concat
+        # below would silently return a sequence twice as long as the input.
+        raise ValueError("tr_token must be provided when tr_gate is given")
+    x_zero = x[:, :tr_token] * tr_gate.unsqueeze(1)
+    x_orig = x[:, tr_token:] * gate.unsqueeze(1)
+    return torch.concat((x_zero, x_orig), dim=1)
+
+
 class MMDoubleStreamBlockComponent(torch.nn.Module):
    def __init__(self, hidden_size=3072, heads_num=24, mlp_width_ratio=4):
        super().__init__()
@@ -406,11 +420,17 @@ class MMDoubleStreamBlockComponent(torch.nn.Module):
            torch.nn.Linear(hidden_size * mlp_width_ratio, hidden_size)
        )

-    def forward(self, hidden_states, conditioning, freqs_cis=None):
+    def forward(self, hidden_states, conditioning, freqs_cis=None, token_replace_vec=None, tr_token=None):
        mod1_shift, mod1_scale, mod1_gate, mod2_shift, mod2_scale, mod2_gate = self.mod(conditioning).chunk(6, dim=-1)
+        if token_replace_vec is not None:
+            assert tr_token is not None
+            tr_mod1_shift, tr_mod1_scale, tr_mod1_gate, tr_mod2_shift, tr_mod2_scale, tr_mod2_gate = self.mod(token_replace_vec).chunk(6, dim=-1)
+        else:
+            tr_mod1_shift, tr_mod1_scale, tr_mod1_gate, tr_mod2_shift, tr_mod2_scale, tr_mod2_gate = None, None, None, None, None, None

        norm_hidden_states = self.norm1(hidden_states)
-        norm_hidden_states = modulate(norm_hidden_states, shift=mod1_shift, scale=mod1_scale)
+        norm_hidden_states = modulate(norm_hidden_states, shift=mod1_shift, scale=mod1_scale,
+                                      tr_shift=tr_mod1_shift, tr_scale=tr_mod1_scale, tr_token=tr_token)
        qkv = self.to_qkv(norm_hidden_states)
        q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)

@@ -419,15 +439,19 @@ class MMDoubleStreamBlockComponent(torch.nn.Module):

        if freqs_cis is not None:
            q, k = apply_rotary_emb(q, k, freqs_cis, head_first=False)
+        return (q, k, v), (mod1_gate, mod2_shift, mod2_scale, mod2_gate), (tr_mod1_gate, tr_mod2_shift, tr_mod2_scale, tr_mod2_gate)

-        return (q, k, v), (mod1_gate, mod2_shift, mod2_scale, mod2_gate)
-    
-    def process_ff(self, hidden_states, attn_output, mod):
+    def process_ff(self, hidden_states, attn_output, mod, mod_tr=None, tr_token=None):
        mod1_gate, mod2_shift, mod2_scale, mod2_gate = mod
-        hidden_states = hidden_states + self.to_out(attn_output) * mod1_gate.unsqueeze(1)
-        hidden_states = hidden_states + self.ff(modulate(self.norm2(hidden_states), shift=mod2_shift, scale=mod2_scale)) * mod2_gate.unsqueeze(1)
+        if mod_tr is not None:
+            tr_mod1_gate, tr_mod2_shift, tr_mod2_scale, tr_mod2_gate = mod_tr
+        else:
+            tr_mod1_gate, tr_mod2_shift, tr_mod2_scale, tr_mod2_gate = None, None, None, None
+        hidden_states = hidden_states + apply_gate(self.to_out(attn_output), mod1_gate, tr_mod1_gate, tr_token)
+        x = self.ff(modulate(self.norm2(hidden_states), shift=mod2_shift, scale=mod2_scale, tr_shift=tr_mod2_shift, tr_scale=tr_mod2_scale, tr_token=tr_token))
+        hidden_states = hidden_states + apply_gate(x, mod2_gate, tr_mod2_gate, tr_token)
        return hidden_states
-    
+

 class MMDoubleStreamBlock(torch.nn.Module):
    def __init__(self, hidden_size=3072, heads_num=24, mlp_width_ratio=4):
@@ -435,18 +459,18 @@ class MMDoubleStreamBlock(torch.nn.Module):
        self.component_a = MMDoubleStreamBlockComponent(hidden_size, heads_num, mlp_width_ratio)
        self.component_b = MMDoubleStreamBlockComponent(hidden_size, heads_num, mlp_width_ratio)

-    def forward(self, hidden_states_a, hidden_states_b, conditioning, freqs_cis):
-        (q_a, k_a, v_a), mod_a = self.component_a(hidden_states_a, conditioning, freqs_cis)
-        (q_b, k_b, v_b), mod_b = self.component_b(hidden_states_b, conditioning, freqs_cis=None)
+    def forward(self, hidden_states_a, hidden_states_b, conditioning, freqs_cis, token_replace_vec=None, tr_token=None, split_token=71):
+        (q_a, k_a, v_a), mod_a, mod_tr = self.component_a(hidden_states_a, conditioning, freqs_cis, token_replace_vec, tr_token)
+        (q_b, k_b, v_b), mod_b, _ = self.component_b(hidden_states_b, conditioning, freqs_cis=None)

-        q_a, q_b = torch.concat([q_a, q_b[:, :71]], dim=1), q_b[:, 71:].contiguous()
-        k_a, k_b = torch.concat([k_a, k_b[:, :71]], dim=1), k_b[:, 71:].contiguous()
-        v_a, v_b = torch.concat([v_a, v_b[:, :71]], dim=1), v_b[:, 71:].contiguous()
+        q_a, q_b = torch.concat([q_a, q_b[:, :split_token]], dim=1), q_b[:, split_token:].contiguous()
+        k_a, k_b = torch.concat([k_a, k_b[:, :split_token]], dim=1), k_b[:, split_token:].contiguous()
+        v_a, v_b = torch.concat([v_a, v_b[:, :split_token]], dim=1), v_b[:, split_token:].contiguous()
        attn_output_a = attention(q_a, k_a, v_a)
        attn_output_b = attention(q_b, k_b, v_b)
-        attn_output_a, attn_output_b = attn_output_a[:, :-71].contiguous(), torch.concat([attn_output_a[:, -71:], attn_output_b], dim=1)
+        attn_output_a, attn_output_b = attn_output_a[:, :-split_token].contiguous(), torch.concat([attn_output_a[:, -split_token:], attn_output_b], dim=1)

-        hidden_states_a = self.component_a.process_ff(hidden_states_a, attn_output_a, mod_a)
+        hidden_states_a = self.component_a.process_ff(hidden_states_a, attn_output_a, mod_a, mod_tr, tr_token)
        hidden_states_b = self.component_b.process_ff(hidden_states_b, attn_output_b, mod_b)
        return hidden_states_a, hidden_states_b

@@ -489,7 +513,7 @@ class MMSingleStreamBlockOriginal(torch.nn.Module):

        output = self.linear2(torch.cat((attn_output, self.mlp_act(mlp)), 2))
        return x + output * mod_gate.unsqueeze(1)
-    
+

 class MMSingleStreamBlock(torch.nn.Module):
    def __init__(self, hidden_size=3072, heads_num=24, mlp_width_ratio=4):
@@ -510,11 +534,17 @@ class MMSingleStreamBlock(torch.nn.Module):
            torch.nn.Linear(hidden_size * mlp_width_ratio, hidden_size, bias=False)
        )

-    def forward(self, hidden_states, conditioning, freqs_cis=None, txt_len=256):
+    def forward(self, hidden_states, conditioning, freqs_cis=None, txt_len=256, token_replace_vec=None, tr_token=None, split_token=71):
        mod_shift, mod_scale, mod_gate = self.mod(conditioning).chunk(3, dim=-1)
+        if token_replace_vec is not None:
+            assert tr_token is not None
+            tr_mod_shift, tr_mod_scale, tr_mod_gate = self.mod(token_replace_vec).chunk(3, dim=-1)
+        else:
+            tr_mod_shift, tr_mod_scale, tr_mod_gate = None, None, None

        norm_hidden_states = self.norm(hidden_states)
-        norm_hidden_states = modulate(norm_hidden_states, shift=mod_shift, scale=mod_scale)
+        norm_hidden_states = modulate(norm_hidden_states, shift=mod_shift, scale=mod_scale,
+                                      tr_shift=tr_mod_shift, tr_scale=tr_mod_scale, tr_token=tr_token)
        qkv = self.to_qkv(norm_hidden_states)

        q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
@@ -526,16 +556,17 @@ class MMSingleStreamBlock(torch.nn.Module):
        k_a, k_b = k[:, :-txt_len, :, :], k[:, -txt_len:, :, :]
        q_a, k_a = apply_rotary_emb(q_a, k_a, freqs_cis, head_first=False)

-        q_a, q_b = torch.concat([q_a, q_b[:, :71]], dim=1), q_b[:, 71:].contiguous()
-        k_a, k_b = torch.concat([k_a, k_b[:, :71]], dim=1), k_b[:, 71:].contiguous()
-        v_a, v_b = v[:, :-185].contiguous(), v[:, -185:].contiguous()
+        v_len = txt_len - split_token
+        q_a, q_b = torch.concat([q_a, q_b[:, :split_token]], dim=1), q_b[:, split_token:].contiguous()
+        k_a, k_b = torch.concat([k_a, k_b[:, :split_token]], dim=1), k_b[:, split_token:].contiguous()
+        v_a, v_b = v[:, :-v_len].contiguous(), v[:, -v_len:].contiguous()

        attn_output_a = attention(q_a, k_a, v_a)
        attn_output_b = attention(q_b, k_b, v_b)
        attn_output = torch.concat([attn_output_a, attn_output_b], dim=1)

-        hidden_states = hidden_states + self.to_out(attn_output) * mod_gate.unsqueeze(1)
-        hidden_states = hidden_states + self.ff(norm_hidden_states) * mod_gate.unsqueeze(1)
+        hidden_states = hidden_states + apply_gate(self.to_out(attn_output), mod_gate, tr_mod_gate, tr_token)
+        hidden_states = hidden_states + apply_gate(self.ff(norm_hidden_states), mod_gate, tr_mod_gate, tr_token)
        return hidden_states


@@ -581,7 +612,7 @@ class HunyuanVideoDiT(torch.nn.Module):
    def unpatchify(self, x, T, H, W):
        x = rearrange(x, "B (T H W) (C pT pH pW) -> B C (T pT) (H pH) (W pW)", H=H, W=W, pT=1, pH=2, pW=2)
        return x
-    
+
    def enable_block_wise_offload(self, warm_device="cuda", cold_device="cpu"):
        self.warm_device = warm_device
        self.cold_device = cold_device
@@ -616,7 +647,7 @@ class HunyuanVideoDiT(torch.nn.Module):
            vec += self.guidance_in(guidance * 1000, dtype=torch.float32)
        img = self.img_in(x)
        txt = self.txt_in(prompt_emb, t, text_mask)
-        
+
        for block in tqdm(self.double_blocks, desc="Double stream blocks"):
            img, txt = block(img, txt, vec, (freqs_cos, freqs_sin))

@@ -628,7 +659,7 @@ class HunyuanVideoDiT(torch.nn.Module):
        img = self.final_layer(img, vec)
        img = self.unpatchify(img, T=T//1, H=H//2, W=W//2)
        return img
-    
+

    def enable_auto_offload(self, dtype=torch.bfloat16, device="cuda"):
        def cast_to(weight, dtype=None, device=None, copy=False):
@@ -684,7 +715,7 @@ class HunyuanVideoDiT(torch.nn.Module):
                    del x_, weight_, bias_
                    torch.cuda.empty_cache()
                    return y_
-                
+
                def block_forward(self, x, **kwargs):
                    # This feature can only reduce 2GB VRAM, so we disable it.
                    y = torch.zeros(x.shape[:-1] + (self.out_features,), dtype=x.dtype, device=x.device)
@@ -692,19 +723,19 @@ class HunyuanVideoDiT(torch.nn.Module):
                        for j in range((self.out_features + self.block_size - 1) // self.block_size):
                            y[..., j * self.block_size: (j + 1) * self.block_size] += self.block_forward_(x, i, j, dtype=x.dtype, device=x.device)
                    return y
-                    
+
                def forward(self, x, **kwargs):
                    weight, bias = cast_bias_weight(self, x, dtype=self.dtype, device=self.device)
                    return torch.nn.functional.linear(x, weight, bias)

-            
+
            class RMSNorm(torch.nn.Module):
                def __init__(self, module, dtype=torch.bfloat16, device="cuda"):
                    super().__init__()
                    self.module = module
                    self.dtype = dtype
                    self.device = device
-                    
+
                def forward(self, hidden_states, **kwargs):
                    input_dtype = hidden_states.dtype
                    variance = hidden_states.to(torch.float32).square().mean(-1, keepdim=True)
@@ -714,30 +745,30 @@ class HunyuanVideoDiT(torch.nn.Module):
                        weight = cast_weight(self.module, hidden_states, dtype=torch.bfloat16, device="cuda")
                        hidden_states = hidden_states * weight
                    return hidden_states
-                
+
            class Conv3d(torch.nn.Conv3d):
                def __init__(self, *args, dtype=torch.bfloat16, device="cuda", **kwargs):
                    super().__init__(*args, **kwargs)
                    self.dtype = dtype
                    self.device = device
-                    
+
                def forward(self, x):
                    weight, bias = cast_bias_weight(self, x, dtype=self.dtype, device=self.device)
                    return torch.nn.functional.conv3d(x, weight, bias, self.stride, self.padding, self.dilation, self.groups)
-                
+
            class LayerNorm(torch.nn.LayerNorm):
                def __init__(self, *args, dtype=torch.bfloat16, device="cuda", **kwargs):
                    super().__init__(*args, **kwargs)
                    self.dtype = dtype
                    self.device = device
-                    
+
                def forward(self, x):
                    if self.weight is not None and self.bias is not None:
                        weight, bias = cast_bias_weight(self, x, dtype=self.dtype, device=self.device)
                        return torch.nn.functional.layer_norm(x, self.normalized_shape, weight, bias, self.eps)
                    else:
                        return torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
-            
+
        def replace_layer(model, dtype=torch.bfloat16, device="cuda"):
            for name, module in model.named_children():
                if isinstance(module, torch.nn.Linear):
@@ -780,7 +811,6 @@ class HunyuanVideoDiT(torch.nn.Module):
        return HunyuanVideoDiTStateDictConverter()


-
 class HunyuanVideoDiTStateDictConverter:
    def __init__(self):
        pass
@@ -886,6 +916,5 @@ class HunyuanVideoDiTStateDictConverter:
                state_dict_[name_] = param
            else:
                pass
-        if origin_hash_key == "ae3c22aaa28bfae6f3688f796c9814ae":
-            return state_dict_, {"in_channels": 33, "guidance_embed":False}
+
        return state_dict_
--- a/diffsynth/pipelines/hunyuan_video.py
+++ b/diffsynth/pipelines/hunyuan_video.py
@@ -5,13 +5,13 @@ from ..schedulers.flow_match import FlowMatchScheduler
 from .base import BasePipeline
 from ..prompters import HunyuanVideoPrompter
 import torch
+import torchvision.transforms as transforms
 from einops import rearrange
 import numpy as np
 from PIL import Image
 from tqdm import tqdm


-
 class HunyuanVideoPipeline(BasePipeline):

    def __init__(self, device="cuda", torch_dtype=torch.float16):
@@ -53,10 +53,58 @@ class HunyuanVideoPipeline(BasePipeline):
            pipe.enable_vram_management()
        return pipe

+    def generate_crop_size_list(self, base_size=256, patch_size=32, max_ratio=4.0):
+        """Enumerate candidate crop sizes built from a fixed patch budget.
+
+        The budget is ``round((base_size / patch_size) ** 2)`` patches. The walk
+        starts at ``(budget, 1)`` patches, grows the second count while the
+        product still fits in the budget, and otherwise shrinks the first
+        count. Every visited pair whose long/short ratio is at most
+        ``max_ratio`` is recorded.
+
+        Args:
+            base_size: Side length that defines the patch budget.
+            patch_size: Side length of one patch; each recorded size is a
+                multiple of it.
+            max_ratio: Largest allowed long-side / short-side ratio (>= 1.0).
+
+        Returns:
+            List of ``(first_count * patch_size, second_count * patch_size)``
+            tuples in visiting order.
+        """
+        patch_budget = round((base_size / patch_size) ** 2)
+        assert max_ratio >= 1.0
+        sizes = []
+        first_count, second_count = patch_budget, 1
+        while first_count > 0:
+            longer = max(first_count, second_count)
+            shorter = min(first_count, second_count)
+            if longer / shorter <= max_ratio:
+                sizes.append((first_count * patch_size, second_count * patch_size))
+            # Grow the second side while the budget allows, else shrink the first.
+            if (second_count + 1) * first_count > patch_budget:
+                first_count -= 1
+            else:
+                second_count += 1
+        return sizes

-    def encode_prompt(self, prompt, positive=True, clip_sequence_length=77, llm_sequence_length=256):
+
+    def get_closest_ratio(self, height: float, width: float, ratios: list, buckets: list):
+        """Pick the bucket whose aspect ratio is closest to ``height / width``.
+
+        Args:
+            height: Image height.
+            width: Image width.
+            ratios: Candidate aspect ratios (list or NumPy array), aligned
+                index-by-index with ``buckets``.
+            buckets: Candidate sizes; the entry at the winning index is returned.
+
+        Returns:
+            Tuple ``(bucket, ratio)`` where ``ratio`` is the entry of ``ratios``
+            nearest to ``height / width`` (as a Python float) and ``bucket`` is
+            the entry of ``buckets`` at the same index.
+        """
+        aspect_ratio = float(height) / float(width)
+        # np.asarray lets plain Python sequences work as well; a bare list does
+        # not support the element-wise subtraction below.
+        ratio_array = np.asarray(ratios)
+        # A single argmin guarantees the returned ratio belongs to the returned bucket.
+        closest_ratio_id = int(np.abs(ratio_array - aspect_ratio).argmin())
+        return buckets[closest_ratio_id], float(ratio_array[closest_ratio_id])
+
+
+    def prepare_vae_images_inputs(self, semantic_images, i2v_resolution="720p"):
+        """Resize, crop and normalize reference images for the VAE encoder.
+
+        The first image's size selects the closest-aspect-ratio bucket from
+        ``generate_crop_size_list``; every image is then resized and
+        center-cropped to that bucket and normalized with mean/std 0.5.
+
+        Args:
+            semantic_images: Sequence of images; assumes PIL images whose
+                ``.size`` is ``(width, height)`` — TODO confirm against callers.
+            i2v_resolution: One of ``"720p"``, ``"540p"`` or ``"360p"``.
+
+        Returns:
+            Tuple ``(pixel_values, target_height, target_width)``, where
+            ``pixel_values`` is the transformed images concatenated along dim 0,
+            given new axes at positions 0 and 2, and moved to ``self.device``.
+
+        Raises:
+            ValueError: If ``i2v_resolution`` is not a supported value.
+        """
+        base_size_by_resolution = {"720p": 960, "540p": 720, "360p": 480}
+        if i2v_resolution not in base_size_by_resolution:
+            raise ValueError(f"i2v_resolution: {i2v_resolution} must be in [360p, 540p, 720p]")
+        bucket_hw_base_size = base_size_by_resolution[i2v_resolution]
+
+        first_image_size = semantic_images[0].size
+        image_height, image_width = first_image_size[1], first_image_size[0]
+
+        buckets = self.generate_crop_size_list(bucket_hw_base_size, 32)
+        bucket_ratios = np.array([round(float(h) / float(w), 5) for h, w in buckets])
+        closest_size, _ = self.get_closest_ratio(image_height, image_width, bucket_ratios, buckets)
+
+        to_normalized_tensor = transforms.Compose([
+            transforms.Resize(closest_size),
+            transforms.CenterCrop(closest_size),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5])
+        ])
+        pixel_values = torch.cat([to_normalized_tensor(image) for image in semantic_images])
+        pixel_values = pixel_values.unsqueeze(0).unsqueeze(2).to(self.device)
+
+        target_height, target_width = closest_size
+        return pixel_values, target_height, target_width
+
+
+    def encode_prompt(self, prompt, positive=True, clip_sequence_length=77, llm_sequence_length=256, input_images=None):
        prompt_emb, pooled_prompt_emb, text_mask = self.prompter.encode_prompt(
-            prompt, device=self.device, positive=positive, clip_sequence_length=clip_sequence_length, llm_sequence_length=llm_sequence_length
+            prompt, device=self.device, positive=positive, clip_sequence_length=clip_sequence_length, llm_sequence_length=llm_sequence_length, images=input_images
        )
        return {"prompt_emb": prompt_emb, "pooled_prompt_emb": pooled_prompt_emb, "text_mask": text_mask}

@@ -87,6 +135,9 @@ class HunyuanVideoPipeline(BasePipeline):
        prompt,
        negative_prompt="",
        input_video=None,
+        input_images=None,
+        i2v_resolution="720p",
+        i2v_stability=True,
        denoising_strength=1.0,
        seed=None,
        rand_device=None,
@@ -105,10 +156,17 @@ class HunyuanVideoPipeline(BasePipeline):
    ):
        # Tiler parameters
        tiler_kwargs = {"tile_size": tile_size, "tile_stride": tile_stride}
-        
+
        # Scheduler
        self.scheduler.set_timesteps(num_inference_steps, denoising_strength)

+        # Encode the input images with the VAE encoder
+        if input_images is not None:
+            self.load_models_to_device(['vae_encoder'])
+            image_pixel_values, height, width = self.prepare_vae_images_inputs(input_images, i2v_resolution=i2v_resolution)
+            with torch.autocast(device_type=self.device, dtype=torch.float16, enabled=True):
+                image_latents = self.vae_encoder(image_pixel_values)
+
        # Initialize noise
        rand_device = self.device if rand_device is None else rand_device
        noise = self.generate_noise((1, 16, (num_frames - 1) // 4 + 1, height//8, width//8), seed=seed, device=rand_device, dtype=self.torch_dtype).to(self.device)
@@ -118,12 +176,18 @@ class HunyuanVideoPipeline(BasePipeline):
            input_video = torch.stack(input_video, dim=2)
            latents = self.encode_video(input_video, **tiler_kwargs).to(dtype=self.torch_dtype, device=self.device)
            latents = self.scheduler.add_noise(latents, noise, timestep=self.scheduler.timesteps[0])
+        elif input_images is not None and i2v_stability:
+            noise = self.generate_noise((1, 16, (num_frames - 1) // 4 + 1, height//8, width//8), seed=seed, device=rand_device, dtype=image_latents.dtype).to(self.device)
+            t = torch.tensor([0.999]).to(device=self.device)
+            latents = noise * t + image_latents.repeat(1, 1, (num_frames - 1) // 4 + 1, 1, 1) * (1 - t)
+            latents = latents.to(dtype=image_latents.dtype)
        else:
            latents = noise
-        
+
        # Encode prompts
-        self.load_models_to_device(["text_encoder_1"] if self.vram_management else ["text_encoder_1", "text_encoder_2"])
-        prompt_emb_posi = self.encode_prompt(prompt, positive=True)
+        # The current MLLM does not support vram_management, so both text encoders are loaded when input images are given
+        self.load_models_to_device(["text_encoder_1"] if self.vram_management and input_images is None else ["text_encoder_1", "text_encoder_2"])
+        prompt_emb_posi = self.encode_prompt(prompt, positive=True, input_images=input_images)
        if cfg_scale != 1.0:
            prompt_emb_nega = self.encode_prompt(negative_prompt, positive=False)

@@ -139,11 +203,16 @@ class HunyuanVideoPipeline(BasePipeline):
            timestep = timestep.unsqueeze(0).to(self.device)
            print(f"Step {progress_id + 1} / {len(self.scheduler.timesteps)}")

+            forward_func = lets_dance_hunyuan_video
+            if input_images is not None:
+                latents = torch.concat([image_latents, latents[:, :, 1:, :, :]], dim=2)
+                forward_func = lets_dance_hunyuan_video_i2v
+
            # Inference
            with torch.autocast(device_type=self.device, dtype=self.torch_dtype):
-                noise_pred_posi = lets_dance_hunyuan_video(self.dit, latents, timestep, **prompt_emb_posi, **extra_input, **tea_cache_kwargs)
+                noise_pred_posi = forward_func(self.dit, latents, timestep, **prompt_emb_posi, **extra_input, **tea_cache_kwargs)
                if cfg_scale != 1.0:
-                    noise_pred_nega = lets_dance_hunyuan_video(self.dit, latents, timestep, **prompt_emb_nega, **extra_input)
+                    noise_pred_nega = forward_func(self.dit, latents, timestep, **prompt_emb_nega, **extra_input)
                    noise_pred = noise_pred_nega + cfg_scale * (noise_pred_posi - noise_pred_nega)
                else:
                    noise_pred = noise_pred_posi
@@ -163,7 +232,11 @@ class HunyuanVideoPipeline(BasePipeline):
                self.load_models_to_device([] if self.vram_management else ["dit"])

            # Scheduler
-            latents = self.scheduler.step(noise_pred, self.scheduler.timesteps[progress_id], latents)
+            if input_images is not None:
+                latents = self.scheduler.step(noise_pred[:, :, 1:, :, :], self.scheduler.timesteps[progress_id], latents[:, :, 1:, :, :])
+                latents = torch.concat([image_latents, latents], dim=2)
+            else:
+                latents = self.scheduler.step(noise_pred, self.scheduler.timesteps[progress_id], latents)

        # Decode
        self.load_models_to_device(['vae_decoder'])
@@ -194,7 +267,7 @@ class TeaCache:
        if self.step == 0 or self.step == self.num_inference_steps - 1:
            should_calc = True
            self.accumulated_rel_l1_distance = 0
-        else: 
+        else:
            coefficients = [7.33226126e+02, -4.01131952e+02,  6.75869174e+01, -3.14987800e+00, 9.61237896e-02]
            rescale_func = np.poly1d(coefficients)
            self.accumulated_rel_l1_distance += rescale_func(((modulated_inp-self.previous_modulated_input).abs().mean() / self.previous_modulated_input.abs().mean()).cpu().item())
@@ -203,14 +276,14 @@ class TeaCache:
            else:
                should_calc = True
                self.accumulated_rel_l1_distance = 0
-        self.previous_modulated_input = modulated_inp 
+        self.previous_modulated_input = modulated_inp
        self.step += 1
        if self.step == self.num_inference_steps:
            self.step = 0
        if should_calc:
            self.previous_hidden_states = img.clone()
        return not should_calc
-    
+
    def store(self, hidden_states):
        self.previous_residual = hidden_states - self.previous_hidden_states
        self.previous_hidden_states = None
@@ -250,13 +323,70 @@ def lets_dance_hunyuan_video(
        print("TeaCache skip forward.")
        img = tea_cache.update(img)
    else:
+        split_token = int(text_mask.sum(dim=1))
+        txt_len = int(txt.shape[1])
        for block in tqdm(dit.double_blocks, desc="Double stream blocks"):
-            img, txt = block(img, txt, vec, (freqs_cos, freqs_sin))
-        
+            img, txt = block(img, txt, vec, (freqs_cos, freqs_sin), split_token=split_token)
+
        x = torch.concat([img, txt], dim=1)
        for block in tqdm(dit.single_blocks, desc="Single stream blocks"):
-            x = block(x, vec, (freqs_cos, freqs_sin))
-        img = x[:, :-256]
+            x = block(x, vec, (freqs_cos, freqs_sin), txt_len=txt_len, split_token=split_token)
+        img = x[:, :-txt_len]
+
+        if tea_cache is not None:
+            tea_cache.store(img)
+    img = dit.final_layer(img, vec)
+    img = dit.unpatchify(img, T=T//1, H=H//2, W=W//2)
+    return img
+
+
+def lets_dance_hunyuan_video_i2v(
+    dit: HunyuanVideoDiT,
+    x: torch.Tensor,
+    t: torch.Tensor,
+    prompt_emb: torch.Tensor = None,
+    text_mask: torch.Tensor = None,
+    pooled_prompt_emb: torch.Tensor = None,
+    freqs_cos: torch.Tensor = None,
+    freqs_sin: torch.Tensor = None,
+    guidance: torch.Tensor = None,
+    tea_cache: TeaCache = None,
+    **kwargs
+):
+    B, C, T, H, W = x.shape
+    # Uncomment below to keep same as official implementation
+    # guidance = guidance.to(dtype=torch.float32).to(torch.bfloat16)
+    vec = dit.time_in(t, dtype=torch.bfloat16)
+    vec_2 = dit.vector_in(pooled_prompt_emb)
+    vec = vec + vec_2
+    vec = vec + dit.guidance_in(guidance * 1000., dtype=torch.bfloat16)
+
+    token_replace_vec = dit.time_in(torch.zeros_like(t), dtype=torch.bfloat16)
+    tr_token = (H // 2) * (W // 2)
+    token_replace_vec = token_replace_vec + vec_2
+
+    img = dit.img_in(x)
+    txt = dit.txt_in(prompt_emb, t, text_mask)
+
+    # TeaCache
+    if tea_cache is not None:
+        tea_cache_update = tea_cache.check(dit, img, vec)
+    else:
+        tea_cache_update = False
+
+    if tea_cache_update:
+        print("TeaCache skip forward.")
+        img = tea_cache.update(img)
+    else:
+        split_token = int(text_mask.sum(dim=1))
+        txt_len = int(txt.shape[1])
+        for block in tqdm(dit.double_blocks, desc="Double stream blocks"):
+            img, txt = block(img, txt, vec, (freqs_cos, freqs_sin), token_replace_vec, tr_token, split_token)
+
+        x = torch.concat([img, txt], dim=1)
+        for block in tqdm(dit.single_blocks, desc="Single stream blocks"):
+            x = block(x, vec, (freqs_cos, freqs_sin), txt_len, token_replace_vec, tr_token, split_token)
+        img = x[:, :-txt_len]

        if tea_cache is not None:
            tea_cache.store(img)
--- a/diffsynth/prompters/hunyuan_video_prompter.py
+++ b/diffsynth/prompters/hunyuan_video_prompter.py
@@ -87,7 +87,6 @@ class HunyuanVideoPrompter(BasePrompter):
        self.tokenizer_2 = LlamaTokenizerFast.from_pretrained(tokenizer_2_path, padding_side='right')
        self.text_encoder_1: SD3TextEncoder1 = None
        self.text_encoder_2: HunyuanVideoLLMEncoder = None
-        self.i2v_mode = False

        self.prompt_template = PROMPT_TEMPLATE['dit-llm-encode']
        self.prompt_template_video = PROMPT_TEMPLATE['dit-llm-encode-video']
@@ -106,8 +105,6 @@ class HunyuanVideoPrompter(BasePrompter):
            # template
            self.prompt_template = PROMPT_TEMPLATE['dit-llm-encode-i2v']
            self.prompt_template_video = PROMPT_TEMPLATE['dit-llm-encode-video-i2v']
-            # mode setting
-            self.i2v_mode = True

    def apply_text_to_template(self, text, template):
        assert isinstance(template, str)
@@ -164,10 +161,8 @@ class HunyuanVideoPrompter(BasePrompter):
                                crop_start,
                                hidden_state_skip_layer=2,
                                use_attention_mask=True,
-                                image_embed_interleave=2):
-        image_outputs = self.processor(images, return_tensors="pt")[
-                "pixel_values"
-        ].to(device)
+                                image_embed_interleave=4):
+        image_outputs = self.processor(images, return_tensors="pt")["pixel_values"].to(device)
        max_length += crop_start
        inputs = self.tokenizer_2(prompt,
                                  return_tensors="pt",
@@ -248,7 +243,8 @@ class HunyuanVideoPrompter(BasePrompter):
                      data_type='video',
                      use_template=True,
                      hidden_state_skip_layer=2,
-                      use_attention_mask=True):
+                      use_attention_mask=True,
+                      image_embed_interleave=4):

        prompt = self.process_prompt(prompt, positive=positive)

@@ -273,6 +269,7 @@ class HunyuanVideoPrompter(BasePrompter):
                                                                      hidden_state_skip_layer, use_attention_mask)
        else:
            prompt_emb, attention_mask = self.encode_prompt_using_mllm(prompt_formated, images, llm_sequence_length, device,
-                                                                       crop_start, hidden_state_skip_layer, use_attention_mask)
+                                                                       crop_start, hidden_state_skip_layer, use_attention_mask,
+                                                                       image_embed_interleave)

        return prompt_emb, pooled_prompt_emb, attention_mask