diff --git a/README.md b/README.md
index fc07c8d..0fd7212 100644
--- a/README.md
+++ b/README.md
@@ -17,6 +17,7 @@ DiffSynth Studio is a Diffusion engine. We have restructured architectures inclu
 
 Until now, DiffSynth Studio has supported the following models:
 
+* [StepVideo](https://github.com/stepfun-ai/Step-Video-T2V)
 * [HunyuanVideo](https://github.com/Tencent/HunyuanVideo)
 * [CogVideoX](https://huggingface.co/THUDM/CogVideoX-5b)
 * [FLUX](https://huggingface.co/black-forest-labs/FLUX.1-dev)
@@ -34,6 +35,9 @@ Until now, DiffSynth Studio has supported the following models:
 * [Stable Diffusion](https://huggingface.co/runwayml/stable-diffusion-v1-5)
 
 ## News
+
+- **February 17, 2025** We support [StepVideo](https://modelscope.cn/models/stepfun-ai/stepvideo-t2v/summary), a state-of-the-art video synthesis model! See [./examples/stepvideo](./examples/stepvideo/).
+
 - **December 31, 2024** We propose EliGen, a novel framework for precise entity-level controlled text-to-image generation, complemented by an inpainting fusion pipeline to extend its capabilities to image inpainting tasks. EliGen seamlessly integrates with existing community models, such as IP-Adapter and In-Context LoRA, enhancing its versatility. For more details, see [./examples/EntityControl](./examples/EntityControl/).
   - Paper: [EliGen: Entity-Level Controlled Image Generation with Regional Attention](https://arxiv.org/abs/2501.01097)
   - Model: [ModelScope](https://www.modelscope.cn/models/DiffSynth-Studio/Eligen)
diff --git a/diffsynth/models/stepvideo_dit.py b/diffsynth/models/stepvideo_dit.py
index 18403d6..ccfb8f1 100644
--- a/diffsynth/models/stepvideo_dit.py
+++ b/diffsynth/models/stepvideo_dit.py
@@ -238,7 +238,7 @@ class PixArtAlphaCombinedTimestepSizeEmbeddings(nn.Module):
         self.fps_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
 
     def forward(self, timestep, resolution=None, nframe=None, fps=None):
-        hidden_dtype = next(self.timestep_embedder.parameters()).dtype
+        hidden_dtype = timestep.dtype
         timesteps_proj = self.time_proj(timestep)
         timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype))  # (N, D)
 
diff --git a/diffsynth/pipelines/step_video.py b/diffsynth/pipelines/step_video.py
index f9e6072..c2dd463 100644
--- a/diffsynth/pipelines/step_video.py
+++ b/diffsynth/pipelines/step_video.py
@@ -181,7 +181,7 @@ class StepVideoPipeline(BasePipeline):
         # Denoise
         self.load_models_to_device(["dit"])
         for progress_id, timestep in enumerate(progress_bar_cmd(self.scheduler.timesteps)):
-            timestep = timestep.unsqueeze(0).to(self.device)
+            timestep = timestep.unsqueeze(0).to(dtype=self.torch_dtype, device=self.device)
             print(f"Step {progress_id + 1} / {len(self.scheduler.timesteps)}")
 
             # Inference
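
Review note on the two dtype changes above (illustration, not part of the diff): when the DiT weights are stored in `torch.float8_e4m3fn`, reading the embedding dtype from the module's own parameters would select FP8 for the timestep projection, while reading it from the incoming `timestep` tensor, which the pipeline now casts to `self.torch_dtype`, keeps the computation in BF16. A minimal sketch of the difference, assuming a PyTorch version with FP8 dtypes (≥ 2.1); the layer shape is illustrative, not DiffSynth's:

```python
import torch
import torch.nn as nn

# Stand-in for the DiT's timestep embedder; weights cast to FP8 for storage,
# the way the quantized example script loads the transformer shards.
embedder = nn.Linear(256, 1536).to(torch.float8_e4m3fn)

# The pipeline now casts the timestep to the compute dtype before the DiT call.
timestep = torch.tensor([999.0], dtype=torch.bfloat16)

old_choice = next(embedder.parameters()).dtype  # torch.float8_e4m3fn: not a usable compute dtype
new_choice = timestep.dtype                     # torch.bfloat16: matches the pipeline dtype
print(old_choice, new_choice)
```
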
diff --git a/examples/stepvideo/README.md b/examples/stepvideo/README.md
index 8b14ae8..261dcdb 100644
--- a/examples/stepvideo/README.md
+++ b/examples/stepvideo/README.md
@@ -8,4 +8,10 @@ StepVideo is a state-of-the-art (SoTA) text-to-video pre-trained model with 30 b
 
 ## Examples
 
-See [`./stepvideo_text_to_video.py`](./stepvideo_text_to_video.py).
+For the original BF16 version, see [`./stepvideo_text_to_video.py`](./stepvideo_text_to_video.py). 80 GB of VRAM is required.
+
+https://github.com/user-attachments/assets/5954fdaa-a3cf-45a3-bd35-886e3cc4581b
+
+For the FP8-quantized version, see [`./stepvideo_text_to_video_quantized.py`](./stepvideo_text_to_video_quantized.py). 40 GB of VRAM is required.
+
+https://github.com/user-attachments/assets/f3697f4e-bc08-47d2-b00a-32d7dfa272ad
diff --git a/examples/stepvideo/stepvideo_text_to_video.py b/examples/stepvideo/stepvideo_text_to_video.py
index aaa2b16..937f5d1 100644
--- a/examples/stepvideo/stepvideo_text_to_video.py
+++ b/examples/stepvideo/stepvideo_text_to_video.py
@@ -13,7 +13,7 @@ torch.ops.load_library("models/stepfun-ai/stepvideo-t2v/lib/liboptimus_ths-torch2.5-cu124.cpython-310-x86_64-linux-gnu.so")
 # Load models
 model_manager = ModelManager()
 model_manager.load_models(
-    ["models/stepvideo-t2v/hunyuan_clip/clip_text_encoder/pytorch_model.bin"],
+    ["models/stepfun-ai/stepvideo-t2v/hunyuan_clip/clip_text_encoder/pytorch_model.bin"],
     torch_dtype=torch.float32, device="cpu"
 )
 model_manager.load_models(
@@ -42,6 +42,6 @@ pipe.enable_vram_management(num_persistent_param_in_dit=None)
 video = pipe(
     prompt="一名宇航员在月球上发现一块石碑,上面印有“stepfun”字样,闪闪发光。超高清、HDR 视频、环境光、杜比全景声、画面稳定、流畅动作、逼真的细节、专业级构图、超现实主义、自然、生动、超细节、清晰。",
     negative_prompt="画面暗、低分辨率、不良手、文本、缺少手指、多余的手指、裁剪、低质量、颗粒状、签名、水印、用户名、模糊。",
-    num_inference_steps=30, cfg_scale=9, num_frames=204, seed=1
+    num_inference_steps=30, cfg_scale=9, num_frames=51, seed=1
 )
 save_video(video, "video.mp4", fps=25, quality=5)
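
Side note on the `torch.ops.load_library(...)` call used by both scripts (illustration, not part of the diff): the bundled binary's filename encodes the environment it was built against, in this case `torch2.5`, `cu124`, and `cpython-310`. A quick way to check what your environment actually provides before choosing a file, or deciding to delete the line:

```python
import sys
import torch

# liboptimus_ths-torch2.5-cu124.cpython-310-x86_64-linux-gnu.so expects
# PyTorch 2.5, CUDA 12.4, and CPython 3.10; compare with your environment.
print("torch :", torch.__version__)       # e.g. 2.5.1
print("cuda  :", torch.version.cuda)      # e.g. 12.4 (None on CPU-only builds)
print("python:", sys.version_info[:2])    # e.g. (3, 10)
```
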
diff --git a/examples/stepvideo/stepvideo_text_to_video_quantized.py b/examples/stepvideo/stepvideo_text_to_video_quantized.py
new file mode 100644
index 0000000..734b2fa
--- /dev/null
+++ b/examples/stepvideo/stepvideo_text_to_video_quantized.py
@@ -0,0 +1,52 @@
+from modelscope import snapshot_download
+from diffsynth import ModelManager, StepVideoPipeline, save_video
+import torch
+
+
+# Download models
+snapshot_download(model_id="stepfun-ai/stepvideo-t2v", cache_dir="models")
+
+# Load the compiled attention operator for the LLM text encoder.
+# If you encounter errors here, select another compiled file that matches your environment, or delete this line.
+torch.ops.load_library("models/stepfun-ai/stepvideo-t2v/lib/liboptimus_ths-torch2.5-cu124.cpython-310-x86_64-linux-gnu.so")
+
+# Load models
+model_manager = ModelManager()
+model_manager.load_models(
+    ["models/stepfun-ai/stepvideo-t2v/hunyuan_clip/clip_text_encoder/pytorch_model.bin"],
+    torch_dtype=torch.float32, device="cpu"
+)
+model_manager.load_models(
+    [
+        "models/stepfun-ai/stepvideo-t2v/step_llm",
+        [
+            "models/stepfun-ai/stepvideo-t2v/transformer/diffusion_pytorch_model-00001-of-00006.safetensors",
+            "models/stepfun-ai/stepvideo-t2v/transformer/diffusion_pytorch_model-00002-of-00006.safetensors",
+            "models/stepfun-ai/stepvideo-t2v/transformer/diffusion_pytorch_model-00003-of-00006.safetensors",
+            "models/stepfun-ai/stepvideo-t2v/transformer/diffusion_pytorch_model-00004-of-00006.safetensors",
+            "models/stepfun-ai/stepvideo-t2v/transformer/diffusion_pytorch_model-00005-of-00006.safetensors",
+            "models/stepfun-ai/stepvideo-t2v/transformer/diffusion_pytorch_model-00006-of-00006.safetensors",
+        ]
+    ],
+    torch_dtype=torch.float8_e4m3fn, device="cpu"
+)
+model_manager.load_models(
+    ["models/stepfun-ai/stepvideo-t2v/vae/vae_v2.safetensors"],
+    torch_dtype=torch.bfloat16, device="cpu"
+)
+pipe = StepVideoPipeline.from_model_manager(model_manager, torch_dtype=torch.bfloat16, device="cuda")
+
+# Enable VRAM management
+# This FP8-quantized version requires 40 GB of VRAM.
+# To reduce the required VRAM further, set `num_persistent_param_in_dit` to a small number.
+pipe.enable_vram_management(num_persistent_param_in_dit=None)
+
+# Run!
+# The prompt translates to: "An astronaut on the moon discovers a stone tablet engraved with the word 'stepfun', glittering. Ultra HD, HDR video, ambient light, Dolby Atmos, stable footage, smooth motion, realistic details, professional composition, surrealism, natural, vivid, ultra-detailed, sharp."
+# The negative prompt translates to: "Dark footage, low resolution, bad hands, text, missing fingers, extra fingers, cropping, low quality, grainy, signature, watermark, username, blurry."
+video = pipe(
+    prompt="一名宇航员在月球上发现一块石碑,上面印有“stepfun”字样,闪闪发光。超高清、HDR 视频、环境光、杜比全景声、画面稳定、流畅动作、逼真的细节、专业级构图、超现实主义、自然、生动、超细节、清晰。",
+    negative_prompt="画面暗、低分辨率、不良手、文本、缺少手指、多余的手指、裁剪、低质量、颗粒状、签名、水印、用户名、模糊。",
+    num_inference_steps=30, cfg_scale=9, num_frames=51, seed=1
+)
+save_video(video, "video.mp4", fps=25, quality=5)
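
Closing usage note on `enable_vram_management` (illustration, not part of the diff): `num_persistent_param_in_dit=None` keeps every DiT parameter resident in VRAM, while passing a parameter count keeps only that many persistent and offloads the rest, trading speed for memory. A hedged sketch, with the count chosen purely for illustration:

```python
# Keep roughly 7B of the 30B DiT parameters resident in VRAM and offload
# the rest; smaller values reduce VRAM usage further at the cost of speed.
pipe.enable_vram_management(num_persistent_param_in_dit=7 * 10**9)

# Most aggressive setting: persist nothing and stream all DiT parameters.
# pipe.enable_vram_management(num_persistent_param_in_dit=0)
```
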