diff --git a/examples/wanvideo/README.md b/examples/wanvideo/README.md
index b3f5ade..1b8ac6c 100644
--- a/examples/wanvideo/README.md
+++ b/examples/wanvideo/README.md
@@ -49,6 +49,20 @@ We present a detailed table here. The model is tested on a single A100.
 
 https://github.com/user-attachments/assets/3908bc64-d451-485a-8b61-28f6d32dd92f
 
+### Parallel Inference
+
+1. Unified Sequence Parallel (USP)
+
+```bash
+pip install "xfuser>=0.4.3"
+```
+
+```bash
+torchrun --standalone --nproc_per_node=8 ./wan_14b_text_to_video_usp.py
+```
+
+2. Tensor Parallel
+
 Tensor parallel module of Wan-Video-14B-T2V is still under development. An example script is provided in [`./wan_14b_text_to_video_tensor_parallel.py`](./wan_14b_text_to_video_tensor_parallel.py).
 
 ### Wan-Video-14B-I2V
diff --git a/examples/wanvideo/wan_14b_text_to_video.py b/examples/wanvideo/wan_14b_text_to_video.py
index dcb2f29..2c4f15b 100644
--- a/examples/wanvideo/wan_14b_text_to_video.py
+++ b/examples/wanvideo/wan_14b_text_to_video.py
@@ -1,7 +1,6 @@
 import torch
 from diffsynth import ModelManager, WanVideoPipeline, save_video, VideoData
 from modelscope import snapshot_download
-import torch.distributed as dist
 
 
 # Download models
@@ -24,27 +23,7 @@ model_manager.load_models(
     ],
     torch_dtype=torch.float8_e4m3fn, # You can set `torch_dtype=torch.bfloat16` to disable FP8 quantization.
 )
-
-dist.init_process_group(
-    backend="nccl",
-    init_method="env://",
-)
-from xfuser.core.distributed import (initialize_model_parallel,
-                                     init_distributed_environment)
-init_distributed_environment(
-    rank=dist.get_rank(), world_size=dist.get_world_size())
-
-initialize_model_parallel(
-    sequence_parallel_degree=dist.get_world_size(),
-    ring_degree=1,
-    ulysses_degree=dist.get_world_size(),
-)
-torch.cuda.set_device(dist.get_rank())
-
-pipe = WanVideoPipeline.from_model_manager(model_manager,
-                                           torch_dtype=torch.bfloat16,
-                                           device=f"cuda:{dist.get_rank()}",
-                                           use_usp=True if dist.get_world_size() > 1 else False)
+pipe = WanVideoPipeline.from_model_manager(model_manager, torch_dtype=torch.bfloat16, device="cuda")
 pipe.enable_vram_management(num_persistent_param_in_dit=None) # You can set `num_persistent_param_in_dit` to a small number to reduce VRAM required.
 
 # Text-to-video
@@ -54,4 +33,4 @@ video = pipe(
     num_inference_steps=50,
     seed=0, tiled=True
 )
-save_video(video, "video1.mp4", fps=25, quality=5)
+save_video(video, "video1.mp4", fps=25, quality=5)
\ No newline at end of file
diff --git a/examples/wanvideo/wan_14b_text_to_video_usp.py b/examples/wanvideo/wan_14b_text_to_video_usp.py
new file mode 100644
index 0000000..dcb2f29
--- /dev/null
+++ b/examples/wanvideo/wan_14b_text_to_video_usp.py
@@ -0,0 +1,57 @@
+import torch
+from diffsynth import ModelManager, WanVideoPipeline, save_video, VideoData
+from modelscope import snapshot_download
+import torch.distributed as dist
+
+
+# Download models
+snapshot_download("Wan-AI/Wan2.1-T2V-14B", local_dir="models/Wan-AI/Wan2.1-T2V-14B")
+
+# Load models
+model_manager = ModelManager(device="cpu")
+model_manager.load_models(
+    [
+        [
+            "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors",
+            "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors",
+            "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors",
+            "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors",
+            "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors",
+            "models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors",
+        ],
+        "models/Wan-AI/Wan2.1-T2V-14B/models_t5_umt5-xxl-enc-bf16.pth",
+        "models/Wan-AI/Wan2.1-T2V-14B/Wan2.1_VAE.pth",
+    ],
+    torch_dtype=torch.float8_e4m3fn, # You can set `torch_dtype=torch.bfloat16` to disable FP8 quantization.
+)
+
+dist.init_process_group(
+    backend="nccl",
+    init_method="env://",
+)
+from xfuser.core.distributed import (initialize_model_parallel,
+                                     init_distributed_environment)
+init_distributed_environment(
+    rank=dist.get_rank(), world_size=dist.get_world_size())
+
+initialize_model_parallel(
+    sequence_parallel_degree=dist.get_world_size(),
+    ring_degree=1,
+    ulysses_degree=dist.get_world_size(),
+)
+torch.cuda.set_device(dist.get_rank())
+
+pipe = WanVideoPipeline.from_model_manager(model_manager,
+                                           torch_dtype=torch.bfloat16,
+                                           device=f"cuda:{dist.get_rank()}",
+                                           use_usp=True if dist.get_world_size() > 1 else False)
+pipe.enable_vram_management(num_persistent_param_in_dit=None) # You can set `num_persistent_param_in_dit` to a small number to reduce VRAM required.
+
+# Text-to-video
+video = pipe(
+    prompt="一名宇航员身穿太空服,面朝镜头骑着一匹机械马在火星表面驰骋。红色的荒凉地表延伸至远方,点缀着巨大的陨石坑和奇特的岩石结构。机械马的步伐稳健,扬起微弱的尘埃,展现出未来科技与原始探索的完美结合。宇航员手持操控装置,目光坚定,仿佛正在开辟人类的新疆域。背景是深邃的宇宙和蔚蓝的地球,画面既科幻又充满希望,让人不禁畅想未来的星际生活。",
+    negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
+    num_inference_steps=50,
+    seed=0, tiled=True
+)
+save_video(video, "video1.mp4", fps=25, quality=5)
diff --git a/requirements.txt b/requirements.txt
index 7dc3846..63a871b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,4 +11,3 @@ sentencepiece
 protobuf
 modelscope
 ftfy
-xfuser>=0.4.2