From 3d48b287a3dc73ed2d0004f5c2cb51c54faf55e9 Mon Sep 17 00:00:00 2001
From: Artiprocher <wangye87v5@hotmail.com>
Date: Thu, 19 Dec 2024 13:15:06 +0800
Subject: [PATCH] hunyuanvideo examples

---
 diffsynth/configs/model_config.py             |  1 +
 examples/HunyuanVideo/README.md               | 19 ++++++++
 examples/HunyuanVideo/hunyuanvideo_24G.py     |  2 +-
 examples/HunyuanVideo/hunyuanvideo_6G.py      | 47 +++++++++++++++++++
 ...unyuanvideo_16G.py => hunyuanvideo_80G.py} | 13 +++--
 examples/HunyuanVideo/hunyuanvideo_8G.py      | 42 -----------------
 6 files changed, 76 insertions(+), 48 deletions(-)
 create mode 100644 examples/HunyuanVideo/README.md
 create mode 100644 examples/HunyuanVideo/hunyuanvideo_6G.py
 rename examples/HunyuanVideo/{hunyuanvideo_16G.py => hunyuanvideo_80G.py} (84%)
 delete mode 100644 examples/HunyuanVideo/hunyuanvideo_8G.py

diff --git a/diffsynth/configs/model_config.py b/diffsynth/configs/model_config.py
index 86cc951..1a43700 100644
--- a/diffsynth/configs/model_config.py
+++ b/diffsynth/configs/model_config.py
@@ -100,6 +100,7 @@ model_loader_configs = [
     (None, "5da81baee73198a7c19e6d2fe8b5148e", ["sd3_text_encoder_1"], [SD3TextEncoder1], "diffusers"),
     (None, "aeb82dce778a03dcb4d726cb03f3c43f", ["hunyuan_video_vae_decoder", "hunyuan_video_vae_encoder"], [HunyuanVideoVAEDecoder, HunyuanVideoVAEEncoder], "diffusers"),
     (None, "b9588f02e78f5ccafc9d7c0294e46308", ["hunyuan_video_dit"], [HunyuanVideoDiT], "civitai"),
+    (None, "84ef4bd4757f60e906b54aa6a7815dc6", ["hunyuan_video_dit"], [HunyuanVideoDiT], "civitai"),
 ]
 huggingface_model_loader_configs = [
     # These configs are provided for detecting model type automatically.
diff --git a/examples/HunyuanVideo/README.md b/examples/HunyuanVideo/README.md
new file mode 100644
index 0000000..a0afa37
--- /dev/null
+++ b/examples/HunyuanVideo/README.md
@@ -0,0 +1,19 @@
+# HunyuanVideo
+
+HunyuanVideo is a video generation model trained by Tencent. We provide advanced VRAM management for this model, with example scripts for three VRAM tiers:
+
+|VRAM required|Example script|Frames|Resolution|Note|
+|-|-|-|-|-|
+|80G|[hunyuanvideo_80G.py](hunyuanvideo_80G.py)|129|720*1280|No VRAM management.|
+|24G|[hunyuanvideo_24G.py](hunyuanvideo_24G.py)|129|720*1280|The video is consistent with the original implementation, but generation takes 5% to 10% longer than with [hunyuanvideo_80G.py](hunyuanvideo_80G.py).|
+|6G|[hunyuanvideo_6G.py](hunyuanvideo_6G.py)|129|512*384|The base model doesn't support low resolutions. We recommend using a LoRA ([example](https://civitai.com/models/1032126/walking-animation-hunyuan-video)) trained at low resolution.|
+
+## Gallery
+
+Video generated by [hunyuanvideo_80G.py](hunyuanvideo_80G.py) and [hunyuanvideo_24G.py](hunyuanvideo_24G.py):
+
+https://github.com/user-attachments/assets/48dd24bb-0cc6-40d2-88c3-10feed3267e9
+
+Video generated by [hunyuanvideo_6G.py](hunyuanvideo_6G.py) using [this LoRA](https://civitai.com/models/1032126/walking-animation-hunyuan-video):
+
+https://github.com/user-attachments/assets/2997f107-d02d-4ecb-89bb-5ce1a7f93817
diff --git a/examples/HunyuanVideo/hunyuanvideo_24G.py b/examples/HunyuanVideo/hunyuanvideo_24G.py
index 0b4c8bb..87cb5f7 100644
--- a/examples/HunyuanVideo/hunyuanvideo_24G.py
+++ b/examples/HunyuanVideo/hunyuanvideo_24G.py
@@ -39,4 +39,4 @@ pipe = HunyuanVideoPipeline.from_model_manager(
 # Enjoy!
 prompt = "CG, masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait. The girl's flowing silver hair shimmers with every color of the rainbow and cascades down, merging with the floating flora around her."
 video = pipe(prompt, seed=0)
-save_video(video, "video.mp4", fps=30, quality=5)
+save_video(video, "video_girl.mp4", fps=30, quality=6)
diff --git a/examples/HunyuanVideo/hunyuanvideo_6G.py b/examples/HunyuanVideo/hunyuanvideo_6G.py
new file mode 100644
index 0000000..7d895fc
--- /dev/null
+++ b/examples/HunyuanVideo/hunyuanvideo_6G.py
@@ -0,0 +1,47 @@
+import torch
+torch.cuda.set_per_process_memory_fraction(1.0, 0)
+from diffsynth import ModelManager, HunyuanVideoPipeline, download_models, save_video, FlowMatchScheduler
+
+
+download_models(["HunyuanVideo"])
+model_manager = ModelManager()
+
+# The DiT model is loaded in bfloat16.
+model_manager.load_models(
+    [
+        "models/HunyuanVideo/transformers/mp_rank_00_model_states.pt"
+    ],
+    torch_dtype=torch.bfloat16,
+    device="cpu"
+)
+
+# The other modules are loaded in float16.
+model_manager.load_models(
+    [
+        "models/HunyuanVideo/text_encoder/model.safetensors",
+        "models/HunyuanVideo/text_encoder_2",
+        "models/HunyuanVideo/vae/pytorch_model.pt",
+    ],
+    torch_dtype=torch.float16,
+    device="cpu"
+)
+
+# We support LoRA inference. You can use the following code to load your LoRA model.
+# Example LoRA: https://civitai.com/models/1032126/walking-animation-hunyuan-video
+model_manager.load_lora("models/lora/kxsr_walking_anim_v1-5.safetensors", lora_alpha=1.0)
+
+# The computation device is "cuda".
+pipe = HunyuanVideoPipeline.from_model_manager(
+    model_manager,
+    torch_dtype=torch.bfloat16,
+    device="cuda"
+)
+# This LoRA requires shift=9.0.
+pipe.scheduler = FlowMatchScheduler(shift=9.0, sigma_min=0.0, extra_one_step=True)
+
+# Enjoy!
+for clothes_up in ["white t-shirt", "black t-shirt", "orange t-shirt"]:
+    for clothes_down in ["blue sports skirt", "red sports skirt", "white sports skirt"]:
+        prompt = f"kxsr, full body, no crop, A 3D-rendered CG animation video featuring a Gorgeous, mature, curvaceous, fair-skinned female girl with long silver hair and blue eyes. She wears a {clothes_up} and a {clothes_down}, walking offering a sense of fluid movement and vivid animation."
+        video = pipe(prompt, seed=0, height=512, width=384, num_frames=129, num_inference_steps=18, tile_size=(17, 16, 16), tile_stride=(12, 12, 12))
+        save_video(video, f"video-{clothes_up}-{clothes_down}.mp4", fps=30, quality=6)
diff --git a/examples/HunyuanVideo/hunyuanvideo_16G.py b/examples/HunyuanVideo/hunyuanvideo_80G.py
similarity index 84%
rename from examples/HunyuanVideo/hunyuanvideo_16G.py
rename to examples/HunyuanVideo/hunyuanvideo_80G.py
index 860d575..b498c94 100644
--- a/examples/HunyuanVideo/hunyuanvideo_16G.py
+++ b/examples/HunyuanVideo/hunyuanvideo_80G.py
@@ -12,7 +12,7 @@ model_manager.load_models(
         "models/HunyuanVideo/transformers/mp_rank_00_model_states.pt"
     ],
     torch_dtype=torch.bfloat16,
-    device="cpu"
+    device="cuda"
 )
 
 # The other modules are loaded in float16.
@@ -23,7 +23,7 @@ model_manager.load_models(
         "models/HunyuanVideo/vae/pytorch_model.pt",
     ],
     torch_dtype=torch.float16,
-    device="cpu"
+    device="cuda"
 )
 
 # We support LoRA inference. You can use the following code to load your LoRA model.
@@ -33,10 +33,13 @@ model_manager.load_models(
 pipe = HunyuanVideoPipeline.from_model_manager(
     model_manager,
     torch_dtype=torch.bfloat16,
-    device="cuda"
+    device="cuda",
+    enable_vram_management=False
 )
+# Even if you have enough VRAM, we still recommend enabling CPU offload.
+pipe.enable_cpu_offload()
 
 # Enjoy!
 prompt = "CG, masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait. The girl's flowing silver hair shimmers with every color of the rainbow and cascades down, merging with the floating flora around her."
-video = pipe(prompt, seed=0, height=720, width=960)
-save_video(video, "video.mp4", fps=30, quality=5)
+video = pipe(prompt, seed=0)
+save_video(video, "video.mp4", fps=30, quality=6)
diff --git a/examples/HunyuanVideo/hunyuanvideo_8G.py b/examples/HunyuanVideo/hunyuanvideo_8G.py
deleted file mode 100644
index 336034b..0000000
--- a/examples/HunyuanVideo/hunyuanvideo_8G.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import torch
-torch.cuda.set_per_process_memory_fraction(1.0, 0)
-from diffsynth import ModelManager, HunyuanVideoPipeline, download_models, save_video
-
-
-download_models(["HunyuanVideo"])
-model_manager = ModelManager()
-
-# The DiT model is loaded in bfloat16.
-model_manager.load_models(
-    [
-        "models/HunyuanVideo/transformers/mp_rank_00_model_states.pt"
-    ],
-    torch_dtype=torch.bfloat16,
-    device="cpu"
-)
-
-# The other modules are loaded in float16.
-model_manager.load_models(
-    [
-        "models/HunyuanVideo/text_encoder/model.safetensors",
-        "models/HunyuanVideo/text_encoder_2",
-        "models/HunyuanVideo/vae/pytorch_model.pt",
-    ],
-    torch_dtype=torch.float16,
-    device="cpu"
-)
-
-# We support LoRA inference. You can use the following code to load your LoRA model.
-model_manager.load_lora("models/lora/Rem_hunyuan_video_v3.safetensors", lora_alpha=1.0)
-
-# The computation device is "cuda".
-pipe = HunyuanVideoPipeline.from_model_manager(
-    model_manager,
-    torch_dtype=torch.bfloat16,
-    device="cuda"
-)
-
-# Enjoy!
-prompt = "a woman with blue hair wearing a white and black dress, sitting on a bed with a white wall in the background. she is wearing a re:zero starting life in another world rem cosplay costume, complete with a black and white dress, black gloves, and a black bow tie."
-video = pipe(prompt, seed=0, height=512, width=512, tile_size=(17, 16, 16), tile_stride=(12, 12, 12))
-save_video(video, "video.mp4", fps=30, quality=5)