support hunyuanvideo_i2v

2026-03-21 16:18:13 +00:00 · 2025-03-11 16:20:09 +08:00
parent 945b43492e
commit 4bec2983a9
9 changed files with 327 additions and 161 deletions
--- a/examples/HunyuanVideo/README.md
+++ b/examples/HunyuanVideo/README.md
@@ -8,6 +8,12 @@
 |24G|[hunyuanvideo_24G.py](hunyuanvideo_24G.py)|129|720*1280|The video is consistent with the original implementation, but it requires 5%~10% more time than [hunyuanvideo_80G.py](hunyuanvideo_80G.py)|
 |6G|[hunyuanvideo_6G.py](hunyuanvideo_6G.py)|129|512*384|The base model doesn't support low resolutions. We recommend users to use some LoRA ([example](https://civitai.com/models/1032126/walking-animation-hunyuan-video)) trained using low resolutions.|

+[HunyuanVideo-I2V](https://github.com/Tencent/HunyuanVideo-I2V) is the image-to-video generation version of HunyuanVideo. We also provide advanced VRAM management for this model.
+|VRAM required|Example script|Frames|Resolution|Note|
+|-|-|-|-|-|
+|80G|[hunyuanvideo_i2v_80G.py](hunyuanvideo_i2v_80G.py)|129|720p|No VRAM management.|
+|24G|[hunyuanvideo_i2v_24G.py](hunyuanvideo_i2v_24G.py)|129|720p|The video is consistent with the original implementation, but it requires 5%~10% more time than [hunyuanvideo_i2v_80G.py](hunyuanvideo_i2v_80G.py)|
+
 ## Gallery

 Video generated by [hunyuanvideo_80G.py](hunyuanvideo_80G.py) and [hunyuanvideo_24G.py](hunyuanvideo_24G.py):
@@ -21,3 +27,7 @@ https://github.com/user-attachments/assets/2997f107-d02d-4ecb-89bb-5ce1a7f93817
 Video to video generated by [hunyuanvideo_v2v_6G.py](./hunyuanvideo_v2v_6G.py) using [this LoRA](https://civitai.com/models/1032126/walking-animation-hunyuan-video):

 https://github.com/user-attachments/assets/4b89e52e-ce42-434e-aa57-08f09dfa2b10
+
+Video generated by [hunyuanvideo_i2v_80G.py](hunyuanvideo_i2v_80G.py) and [hunyuanvideo_i2v_24G.py](hunyuanvideo_i2v_24G.py):
+
+https://github.com/user-attachments/assets/494f252a-c9af-440d-84ba-a8ddcdcc538a
--- a/examples/HunyuanVideo/hunyuanvideo_i2v.py
+++ b/examples/HunyuanVideo/hunyuanvideo_i2v.py
@@ -1,88 +0,0 @@
-import torch
-from diffsynth import ModelManager, HunyuanVideoPipeline, download_models, save_video
-from diffsynth.prompters.hunyuan_video_prompter import HunyuanVideoPrompter
-from PIL import Image
-import numpy as np
-import torchvision.transforms as transforms
-
-
-def generate_crop_size_list(base_size=256, patch_size=32, max_ratio=4.0):
-    num_patches = round((base_size / patch_size)**2)
-    assert max_ratio >= 1.0
-    crop_size_list = []
-    wp, hp = num_patches, 1
-    while wp > 0:
-        if max(wp, hp) / min(wp, hp) <= max_ratio:
-            crop_size_list.append((wp * patch_size, hp * patch_size))
-        if (hp + 1) * wp <= num_patches:
-            hp += 1
-        else:
-            wp -= 1
-    return crop_size_list
-
-
-def get_closest_ratio(height: float, width: float, ratios: list, buckets: list):
-    aspect_ratio = float(height) / float(width)
-    closest_ratio_id = np.abs(ratios - aspect_ratio).argmin()
-    closest_ratio = min(ratios, key=lambda ratio: abs(float(ratio) - aspect_ratio))
-    return buckets[closest_ratio_id], float(closest_ratio)
-
-
-def prepare_vae_inputs(semantic_images, i2v_resolution="720p"):
-    if i2v_resolution == "720p":
-        bucket_hw_base_size = 960
-    elif i2v_resolution == "540p":
-        bucket_hw_base_size = 720
-    elif i2v_resolution == "360p":
-        bucket_hw_base_size = 480
-    else:
-        raise ValueError(f"i2v_resolution: {i2v_resolution} must be in [360p, 540p, 720p]")
-    origin_size = semantic_images[0].size
-
-    crop_size_list = generate_crop_size_list(bucket_hw_base_size, 32)
-    aspect_ratios = np.array([round(float(h) / float(w), 5) for h, w in crop_size_list])
-    closest_size, closest_ratio = get_closest_ratio(origin_size[1], origin_size[0], aspect_ratios, crop_size_list)
-    ref_image_transform = transforms.Compose([
-        transforms.Resize(closest_size),
-        transforms.CenterCrop(closest_size),
-        transforms.ToTensor(),
-        transforms.Normalize([0.5], [0.5])
-    ])
-
-    semantic_image_pixel_values = [ref_image_transform(semantic_image) for semantic_image in semantic_images]
-    semantic_image_pixel_values = torch.cat(semantic_image_pixel_values).unsqueeze(0).unsqueeze(2)
-    return semantic_image_pixel_values
-
-
-model_manager = ModelManager()
-
-# The other modules are loaded in float16.
-
-model_manager.load_models(
-    [
-        "models/HunyuanVideoI2V/transformers/mp_rank_00_model_states.pt"
-    ],
-    torch_dtype=torch.bfloat16, # you can use torch_dtype=torch.float8_e4m3fn to enable quantization.
-    device="cuda"
-)
-
-model_manager.load_models(
-    [
-        "models/HunyuanVideo/text_encoder/model.safetensors",
-        "models/HunyuanVideoI2V/text_encoder_2",
-        'models/HunyuanVideoI2V/vae/pytorch_model.pt'
-        
-    ],
-    torch_dtype=torch.float16,
-    device="cuda"
-)
-# The computation device is "cuda".
-pipe = HunyuanVideoPipeline.from_model_manager(
-    model_manager,
-    torch_dtype=torch.bfloat16,
-    device="cuda",
-    enable_vram_management=False
-)
-# Although you have enough VRAM, we still recommend you to enable offload.
-pipe.enable_cpu_offload()
-print()
--- a/examples/HunyuanVideo/hunyuanvideo_i2v_24G.py
+++ b/examples/HunyuanVideo/hunyuanvideo_i2v_24G.py
@@ -0,0 +1,43 @@
+import torch
+from diffsynth import ModelManager, HunyuanVideoPipeline, download_models, save_video
+from modelscope import dataset_snapshot_download
+from PIL import Image
+
+
+download_models(["HunyuanVideoI2V"])
+model_manager = ModelManager()
+
+# The DiT model is loaded in bfloat16.
+model_manager.load_models(
+    [
+        "models/HunyuanVideoI2V/transformers/mp_rank_00_model_states.pt"
+    ],
+    torch_dtype=torch.bfloat16,
+    device="cpu"
+)
+
+# The other modules are loaded in float16.
+model_manager.load_models(
+    [
+        "models/HunyuanVideoI2V/text_encoder/model.safetensors",
+        "models/HunyuanVideoI2V/text_encoder_2",
+        'models/HunyuanVideoI2V/vae/pytorch_model.pt'
+    ],
+    torch_dtype=torch.float16,
+    device="cpu"
+)
+# The computation device is "cuda".
+pipe = HunyuanVideoPipeline.from_model_manager(model_manager,
+                                               torch_dtype=torch.bfloat16,
+                                               device="cuda",
+                                               enable_vram_management=True)
+
+dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+                          local_dir="./",
+                          allow_file_pattern=f"data/examples/hunyuanvideo/*")
+
+i2v_resolution = "720p"
+prompt = "An Asian man with short hair in black tactical uniform and white clothes waves a firework stick."
+images = [Image.open("data/examples/hunyuanvideo/0.jpg").convert('RGB')]
+video = pipe(prompt, input_images=images, num_inference_steps=50, seed=0, i2v_resolution=i2v_resolution)
+save_video(video, f"video_{i2v_resolution}_low_vram.mp4", fps=30, quality=6)
--- a/examples/HunyuanVideo/hunyuanvideo_i2v_80G.py
+++ b/examples/HunyuanVideo/hunyuanvideo_i2v_80G.py
@@ -0,0 +1,45 @@
+import torch
+from diffsynth import ModelManager, HunyuanVideoPipeline, download_models, save_video
+from modelscope import dataset_snapshot_download
+from PIL import Image
+
+
+download_models(["HunyuanVideoI2V"])
+model_manager = ModelManager()
+
+# The DiT model is loaded in bfloat16.
+model_manager.load_models(
+    [
+        "models/HunyuanVideoI2V/transformers/mp_rank_00_model_states.pt"
+    ],
+    torch_dtype=torch.bfloat16,
+    device="cuda"
+)
+
+# The other modules are loaded in float16.
+model_manager.load_models(
+    [
+        "models/HunyuanVideoI2V/text_encoder/model.safetensors",
+        "models/HunyuanVideoI2V/text_encoder_2",
+        'models/HunyuanVideoI2V/vae/pytorch_model.pt'
+    ],
+    torch_dtype=torch.float16,
+    device="cuda"
+)
+# The computation device is "cuda".
+pipe = HunyuanVideoPipeline.from_model_manager(model_manager,
+                                               torch_dtype=torch.bfloat16,
+                                               device="cuda",
+                                               enable_vram_management=False)
+# Even when you have enough VRAM, we still recommend enabling offload.
+pipe.enable_cpu_offload()
+
+dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+                          local_dir="./",
+                          allow_file_pattern=f"data/examples/hunyuanvideo/*")
+
+i2v_resolution = "720p"
+prompt = "An Asian man with short hair in black tactical uniform and white clothes waves a firework stick."
+images = [Image.open("data/examples/hunyuanvideo/0.jpg").convert('RGB')]
+video = pipe(prompt, input_images=images, num_inference_steps=50, seed=0, i2v_resolution=i2v_resolution)
+save_video(video, f"video_{i2v_resolution}.mp4", fps=30, quality=6)