Merge pull request #623 from modelscope/usp

Usp
2026-04-09 09:38:18 +00:00 · 2025-06-19 10:15:39 +08:00
parent c2d35a2157 24426e3a32
commit 3b3e1e4d44
3 changed files with 23 additions and 12 deletions
--- a/diffsynth/pipelines/wan_video_new.py
+++ b/diffsynth/pipelines/wan_video_new.py
@@ -166,10 +166,13 @@ class ModelConfig:
    offload_device: Optional[Union[str, torch.device]] = None
    offload_dtype: Optional[torch.dtype] = None

-    def download_if_necessary(self, local_model_path="./models", skip_download=False):
+    def download_if_necessary(self, local_model_path="./models", skip_download=False, use_usp=False):
        if self.path is None:
            if self.model_id is None or self.origin_file_pattern is None:
                raise ValueError(f"""No valid model files. Please use `ModelConfig(path="xxx")` or `ModelConfig(model_id="xxx/yyy", origin_file_pattern="zzz")`.""")
+            if use_usp:
+                import torch.distributed as dist
+                skip_download = dist.get_rank() != 0
            if not skip_download:
                downloaded_files = glob.glob(self.origin_file_pattern, root_dir=os.path.join(local_model_path, self.model_id))
                snapshot_download(
@@ -179,6 +182,9 @@ class ModelConfig:
                    ignore_file_pattern=downloaded_files,
                    local_files_only=False
                )
+            if use_usp:
+                import torch.distributed as dist
+                dist.barrier(device_ids=[dist.get_rank()])
            self.path = glob.glob(os.path.join(local_model_path, self.model_id, self.origin_file_pattern))
            if isinstance(self.path, list) and len(self.path) == 1:
                self.path = self.path[0]
@@ -425,19 +431,21 @@ class WanVideoPipeline(BasePipeline):
                    print(f"To avoid repeatedly downloading model files, ({model_config.model_id}, {model_config.origin_file_pattern}) is redirected to ({redirect_dict[model_config.origin_file_pattern]}, {model_config.origin_file_pattern}). You can use `redirect_common_files=False` to disable file redirection.")
                    model_config.model_id = redirect_dict[model_config.origin_file_pattern]
        
+        # Initialize pipeline
+        pipe = WanVideoPipeline(device=device, torch_dtype=torch_dtype)
+        if use_usp: pipe.initialize_usp()
+        
        # Download and load models
        model_manager = ModelManager()
        for model_config in model_configs:
-            model_config.download_if_necessary(local_model_path, skip_download=skip_download)
+            model_config.download_if_necessary(local_model_path, skip_download=skip_download, use_usp=use_usp)
            model_manager.load_model(
                model_config.path,
                device=model_config.offload_device or device,
                torch_dtype=model_config.offload_dtype or torch_dtype
            )
        
-        # Initialize pipeline
-        pipe = WanVideoPipeline(device=device, torch_dtype=torch_dtype)
-        if use_usp: pipe.initialize_usp()
+        # Load models
        pipe.text_encoder = model_manager.fetch_model("wan_video_text_encoder")
        pipe.dit = model_manager.fetch_model("wan_video_dit")
        pipe.vae = model_manager.fetch_model("wan_video_vae")
@@ -1148,17 +1156,20 @@ def model_fn_wan_video(
            else:
                x = block(x, context, t_mod, freqs)
            if vace_context is not None and block_id in vace.vace_layers_mapping:
-                x = x + vace_hints[vace.vace_layers_mapping[block_id]] * vace_scale
+                current_vace_hint = vace_hints[vace.vace_layers_mapping[block_id]]
+                if use_unified_sequence_parallel and dist.is_initialized() and dist.get_world_size() > 1:
+                    current_vace_hint = torch.chunk(current_vace_hint, get_sequence_parallel_world_size(), dim=1)[get_sequence_parallel_rank()]
+                x = x + current_vace_hint * vace_scale
        if tea_cache is not None:
            tea_cache.store(x)
            
-    if reference_latents is not None:
-        x = x[:, reference_latents.shape[1]:]
-        f -= 1
-
    x = dit.head(x, t)
    if use_unified_sequence_parallel:
        if dist.is_initialized() and dist.get_world_size() > 1:
            x = get_sp_group().all_gather(x, dim=1)
+    # Remove reference latents
+    if reference_latents is not None:
+        x = x[:, reference_latents.shape[1]:]
+        f -= 1
    x = dit.unpatchify(x, (f, h, w))
    return x
--- a/examples/wanvideo/README.md
+++ b/examples/wanvideo/README.md
@@ -173,7 +173,7 @@ Wan supports multiple acceleration techniques, including:
 * **Unified Sequence Parallel**: Sequence parallelism based on [xDiT](https://github.com/xdit-project/xDiT). Please refer to [this example](./acceleration/unified_sequence_parallel.py), and run it using the command: 

 ```shell
-pip install xfuser>=0.4.3
+pip install "xfuser[flash-attn]>=0.4.3"
 torchrun --standalone --nproc_per_node=8 examples/wanvideo/acceleration/unified_sequence_parallel.py
 ```

--- a/examples/wanvideo/README_zh.md
+++ b/examples/wanvideo/README_zh.md
@@ -175,7 +175,7 @@ Wan 支持多种加速方案，包括
 * 统一序列并行：基于 [xDiT](https://github.com/xdit-project/xDiT) 实现的序列并行，请参考[示例代码](./acceleration/unified_sequence_parallel.py)，使用以下命令运行：

 ```shell
-pip install xfuser>=0.4.3
+pip install "xfuser[flash-attn]>=0.4.3"
 torchrun --standalone --nproc_per_node=8 examples/wanvideo/acceleration/unified_sequence_parallel.py
 ```