optimize stepvideo vae

This commit is contained in:
Artiprocher
2025-02-18 17:28:05 +08:00
parent f191353cf4
commit 9cff769fbd
7 changed files with 197 additions and 28 deletions

View File

@@ -10,6 +10,8 @@ StepVideo is a state-of-the-art (SoTA) text-to-video pre-trained model with 30 b
For original BF16 version, please see [`./stepvideo_text_to_video.py`](./stepvideo_text_to_video.py). 80G VRAM required.
We also support auto-offload, which can reduce the VRAM requirement to **24GB**; however, it roughly doubles the inference time. Please see [`./stepvideo_text_to_video_low_vram.py`](./stepvideo_text_to_video_low_vram.py).
https://github.com/user-attachments/assets/5954fdaa-a3cf-45a3-bd35-886e3cc4581b
For FP8 quantized version, please see [`./stepvideo_text_to_video_quantized.py`](./stepvideo_text_to_video_quantized.py). 40G VRAM required.

View File

@@ -44,4 +44,7 @@ video = pipe(
negative_prompt="画面暗、低分辨率、不良手、文本、缺少手指、多余的手指、裁剪、低质量、颗粒状、签名、水印、用户名、模糊。",
num_inference_steps=30, cfg_scale=9, num_frames=51, seed=1
)
save_video(video, "video.mp4", fps=25, quality=5)
save_video(
video, "video.mp4", fps=25, quality=5,
ffmpeg_params=["-vf", "atadenoise=0a=0.1:0b=0.1:1a=0.1:1b=0.1"]
)

View File

@@ -0,0 +1,54 @@
from modelscope import snapshot_download
from diffsynth import ModelManager, StepVideoPipeline, save_video
import torch

# Fetch the StepVideo text-to-video checkpoints into ./models.
snapshot_download(model_id="stepfun-ai/stepvideo-t2v", cache_dir="models")

# Load the pre-compiled attention op for the LLM text encoder.
# NOTE: if this fails, pick another compiled file that matches your
# torch/CUDA/Python environment, or delete this line.
torch.ops.load_library("models/stepfun-ai/stepvideo-t2v/lib/liboptimus_ths-torch2.5-cu124.cpython-310-x86_64-linux-gnu.so")

# Stage all weights on CPU first; the pipeline moves them to GPU later.
manager = ModelManager()

# CLIP text encoder, kept in FP32.
manager.load_models(
    ["models/stepfun-ai/stepvideo-t2v/hunyuan_clip/clip_text_encoder/pytorch_model.bin"],
    torch_dtype=torch.float32, device="cpu",
)

# LLM text encoder + the six DiT transformer shards, quantized to FP8.
transformer_shards = [
    f"models/stepfun-ai/stepvideo-t2v/transformer/diffusion_pytorch_model-{index:05d}-of-00006.safetensors"
    for index in range(1, 7)
]
manager.load_models(
    [
        "models/stepfun-ai/stepvideo-t2v/step_llm",
        transformer_shards,
    ],
    torch_dtype=torch.float8_e4m3fn, device="cpu",
)

# VAE in BF16.
manager.load_models(
    ["models/stepfun-ai/stepvideo-t2v/vae/vae_v2.safetensors"],
    torch_dtype=torch.bfloat16, device="cpu",
)

pipe = StepVideoPipeline.from_model_manager(manager, torch_dtype=torch.bfloat16, device="cuda")

# Enable VRAM management.
# With num_persistent_param_in_dit=0 this model requires 24G VRAM.
# To speed it up, set `num_persistent_param_in_dit` to a large number
# or None (unlimited) at the cost of more VRAM.
pipe.enable_vram_management(num_persistent_param_in_dit=0)

# Run! Tiled VAE decoding (tile_size/tile_stride) bounds peak memory.
video = pipe(
    prompt="一名宇航员在月球上发现一块石碑上面印有“stepfun”字样闪闪发光。超高清、HDR 视频、环境光、杜比全景声、画面稳定、流畅动作、逼真的细节、专业级构图、超现实主义、自然、生动、超细节、清晰。",
    negative_prompt="画面暗、低分辨率、不良手、文本、缺少手指、多余的手指、裁剪、低质量、颗粒状、签名、水印、用户名、模糊。",
    num_inference_steps=30, cfg_scale=9, num_frames=51, seed=1,
    tiled=True, tile_size=(34, 34), tile_stride=(16, 16),
)

# Write the result with a light temporal denoise filter applied by ffmpeg.
save_video(
    video, "video.mp4", fps=25, quality=5,
    ffmpeg_params=["-vf", "atadenoise=0a=0.1:0b=0.1:1a=0.1:1b=0.1"],
)

View File

@@ -37,7 +37,7 @@ model_manager.load_models(
pipe = StepVideoPipeline.from_model_manager(model_manager, torch_dtype=torch.bfloat16, device="cuda")
# Enable VRAM management
# This model requires 80G VRAM.
# This model requires 40G VRAM.
# In order to reduce VRAM required, please set `num_persistent_param_in_dit` to a small number.
pipe.enable_vram_management(num_persistent_param_in_dit=None)
@@ -47,4 +47,7 @@ video = pipe(
negative_prompt="画面暗、低分辨率、不良手、文本、缺少手指、多余的手指、裁剪、低质量、颗粒状、签名、水印、用户名、模糊。",
num_inference_steps=30, cfg_scale=9, num_frames=51, seed=1
)
save_video(video, "video.mp4", fps=25, quality=5)
save_video(
video, "video.mp4", fps=25, quality=5,
ffmpeg_params=["-vf", "atadenoise=0a=0.1:0b=0.1:1a=0.1:1b=0.1"]
)