hunyuanvideo examples

2026-04-08 17:18:21 +00:00 · 2024-12-19 13:15:06 +08:00
parent 46f191ffe7
commit 3d48b287a3
6 changed files with 76 additions and 48 deletions
--- a/diffsynth/configs/model_config.py
+++ b/diffsynth/configs/model_config.py
@@ -100,6 +100,7 @@ model_loader_configs = [
    (None, "5da81baee73198a7c19e6d2fe8b5148e", ["sd3_text_encoder_1"], [SD3TextEncoder1], "diffusers"),
    (None, "aeb82dce778a03dcb4d726cb03f3c43f", ["hunyuan_video_vae_decoder", "hunyuan_video_vae_encoder"], [HunyuanVideoVAEDecoder, HunyuanVideoVAEEncoder], "diffusers"),
    (None, "b9588f02e78f5ccafc9d7c0294e46308", ["hunyuan_video_dit"], [HunyuanVideoDiT], "civitai"),
+    (None, "84ef4bd4757f60e906b54aa6a7815dc6", ["hunyuan_video_dit"], [HunyuanVideoDiT], "civitai"),
 ]
 huggingface_model_loader_configs = [
    # These configs are provided for detecting model type automatically.
--- a/examples/HunyuanVideo/README.md
+++ b/examples/HunyuanVideo/README.md
@@ -0,0 +1,19 @@
+# HunyuanVideo
+
+HunyuanVideo is a video generation model trained by Tencent. We provide advanced VRAM management for this model, with example scripts for three VRAM budgets:
+
+|VRAM required|Example script|Frames|Resolution|Note|
+|-|-|-|-|-|
+|80G|[hunyuanvideo_80G.py](hunyuanvideo_80G.py)|129|720*1280|No VRAM management.|
+|24G|[hunyuanvideo_24G.py](hunyuanvideo_24G.py)|129|720*1280|The video is consistent with the original implementation, but generation takes 5%-10% longer than with [hunyuanvideo_80G.py](hunyuanvideo_80G.py).|
+|6G|[hunyuanvideo_6G.py](hunyuanvideo_6G.py)|129|512*384|The base model doesn't support low resolutions. We recommend using a LoRA ([example](https://civitai.com/models/1032126/walking-animation-hunyuan-video)) trained at low resolution.|
+
+## Gallery
+
+Video generated by [hunyuanvideo_80G.py](hunyuanvideo_80G.py) and [hunyuanvideo_24G.py](hunyuanvideo_24G.py):
+
+https://github.com/user-attachments/assets/48dd24bb-0cc6-40d2-88c3-10feed3267e9
+
+Video generated by [hunyuanvideo_6G.py](hunyuanvideo_6G.py) using [this LoRA](https://civitai.com/models/1032126/walking-animation-hunyuan-video):
+
+https://github.com/user-attachments/assets/2997f107-d02d-4ecb-89bb-5ce1a7f93817
--- a/examples/HunyuanVideo/hunyuanvideo_24G.py
+++ b/examples/HunyuanVideo/hunyuanvideo_24G.py
@@ -39,4 +39,4 @@ pipe = HunyuanVideoPipeline.from_model_manager(
 # Enjoy!
 prompt = "CG, masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait. The girl's flowing silver hair shimmers with every color of the rainbow and cascades down, merging with the floating flora around her."
 video = pipe(prompt, seed=0)
-save_video(video, "video.mp4", fps=30, quality=5)
+save_video(video, "video_girl.mp4", fps=30, quality=6)
--- a/examples/HunyuanVideo/hunyuanvideo_6G.py
+++ b/examples/HunyuanVideo/hunyuanvideo_6G.py
@@ -0,0 +1,47 @@
+import torch
+torch.cuda.set_per_process_memory_fraction(1.0, 0)
+from diffsynth import ModelManager, HunyuanVideoPipeline, download_models, save_video, FlowMatchScheduler
+
+
+download_models(["HunyuanVideo"])
+model_manager = ModelManager()
+
+# The DiT model is loaded in bfloat16.
+model_manager.load_models(
+    [
+        "models/HunyuanVideo/transformers/mp_rank_00_model_states.pt"
+    ],
+    torch_dtype=torch.bfloat16,
+    device="cpu"
+)
+
+# The other modules are loaded in float16.
+model_manager.load_models(
+    [
+        "models/HunyuanVideo/text_encoder/model.safetensors",
+        "models/HunyuanVideo/text_encoder_2",
+        "models/HunyuanVideo/vae/pytorch_model.pt",
+    ],
+    torch_dtype=torch.float16,
+    device="cpu"
+)
+
+# We support LoRA inference. You can use the following code to load your LoRA model.
+# Example LoRA: https://civitai.com/models/1032126/walking-animation-hunyuan-video
+model_manager.load_lora("models/lora/kxsr_walking_anim_v1-5.safetensors", lora_alpha=1.0)
+
+# The computation device is "cuda".
+pipe = HunyuanVideoPipeline.from_model_manager(
+    model_manager,
+    torch_dtype=torch.bfloat16,
+    device="cuda"
+)
+# This LoRA requires shift=9.0.
+pipe.scheduler = FlowMatchScheduler(shift=9.0, sigma_min=0.0, extra_one_step=True)
+
+# Enjoy!
+for clothes_up in ["white t-shirt", "black t-shirt", "orange t-shirt"]:
+    for clothes_down in ["blue sports skirt", "red sports skirt", "white sports skirt"]:
+        prompt = f"kxsr, full body, no crop, A 3D-rendered CG animation video featuring a Gorgeous, mature, curvaceous, fair-skinned female girl with long silver hair and blue eyes. She wears a {clothes_up} and a {clothes_down}, walking offering a sense of fluid movement and vivid animation."
+        video = pipe(prompt, seed=0, height=512, width=384, num_frames=129, num_inference_steps=18, tile_size=(17, 16, 16), tile_stride=(12, 12, 12))
+        save_video(video, f"video-{clothes_up}-{clothes_down}.mp4", fps=30, quality=6)
--- a/examples/HunyuanVideo/hunyuanvideo_80G.py
+++ b/examples/HunyuanVideo/hunyuanvideo_80G.py
@@ -12,7 +12,7 @@ model_manager.load_models(
        "models/HunyuanVideo/transformers/mp_rank_00_model_states.pt"
    ],
    torch_dtype=torch.bfloat16,
-    device="cpu"
+    device="cuda"
 )

 # The other modules are loaded in float16.
@@ -23,7 +23,7 @@ model_manager.load_models(
        "models/HunyuanVideo/vae/pytorch_model.pt",
    ],
    torch_dtype=torch.float16,
-    device="cpu"
+    device="cuda"
 )

 # We support LoRA inference. You can use the following code to load your LoRA model.
@@ -33,10 +33,13 @@ model_manager.load_models(
 pipe = HunyuanVideoPipeline.from_model_manager(
    model_manager,
    torch_dtype=torch.bfloat16,
-    device="cuda"
+    device="cuda",
+    enable_vram_management=False
 )
+# Even if you have enough VRAM, we still recommend enabling CPU offload.
+pipe.enable_cpu_offload()

 # Enjoy!
 prompt = "CG, masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait. The girl's flowing silver hair shimmers with every color of the rainbow and cascades down, merging with the floating flora around her."
-video = pipe(prompt, seed=0, height=720, width=960)
-save_video(video, "video.mp4", fps=30, quality=5)
+video = pipe(prompt, seed=0)
+save_video(video, "video.mp4", fps=30, quality=6)
--- a/examples/HunyuanVideo/hunyuanvideo_8G.py
+++ b/examples/HunyuanVideo/hunyuanvideo_8G.py
@@ -1,42 +0,0 @@
-import torch
-torch.cuda.set_per_process_memory_fraction(1.0, 0)
-from diffsynth import ModelManager, HunyuanVideoPipeline, download_models, save_video
-
-
-download_models(["HunyuanVideo"])
-model_manager = ModelManager()
-
-# The DiT model is loaded in bfloat16.
-model_manager.load_models(
-    [
-        "models/HunyuanVideo/transformers/mp_rank_00_model_states.pt"
-    ],
-    torch_dtype=torch.bfloat16,
-    device="cpu"
-)
-
-# The other modules are loaded in float16.
-model_manager.load_models(
-    [
-        "models/HunyuanVideo/text_encoder/model.safetensors",
-        "models/HunyuanVideo/text_encoder_2",
-        "models/HunyuanVideo/vae/pytorch_model.pt",
-    ],
-    torch_dtype=torch.float16,
-    device="cpu"
-)
-
-# We support LoRA inference. You can use the following code to load your LoRA model.
-model_manager.load_lora("models/lora/Rem_hunyuan_video_v3.safetensors", lora_alpha=1.0)
-
-# The computation device is "cuda".
-pipe = HunyuanVideoPipeline.from_model_manager(
-    model_manager,
-    torch_dtype=torch.bfloat16,
-    device="cuda"
-)
-
-# Enjoy!
-prompt = "a woman with blue hair wearing a white and black dress, sitting on a bed with a white wall in the background. she is wearing a re:zero starting life in another world rem cosplay costume, complete with a black and white dress, black gloves, and a black bow tie."
-video = pipe(prompt, seed=0, height=512, width=512, tile_size=(17, 16, 16), tile_stride=(12, 12, 12))
-save_video(video, "video.mp4", fps=30, quality=5)