Merge pull request #1272 from modelscope/zero3-fix

Support DeepSpeed ZeRO 3
2026-03-19 06:48:12 +00:00 · 2026-02-06 16:33:12 +08:00
parent abdf66d09e b0bf78e915
commit 1b47e1dc22
26 changed files with 353 additions and 188 deletions
--- a/diffsynth/utils/xfuser/xdit_context_parallel.py
+++ b/diffsynth/utils/xfuser/xdit_context_parallel.py
@@ -9,6 +9,7 @@ from xfuser.core.long_ctx_attention import xFuserLongContextAttention

 from ... import IS_NPU_AVAILABLE
 from ...core.device import parse_nccl_backend, parse_device_type
+from ...core.gradient import gradient_checkpoint_forward


 def initialize_usp(device_type):
@@ -87,11 +88,6 @@ def usp_dit_forward(self,
        self.freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
        self.freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
    ], dim=-1).reshape(f * h * w, 1, -1).to(x.device)
-    
-    def create_custom_forward(module):
-        def custom_forward(*inputs):
-            return module(*inputs)
-        return custom_forward

    # Context Parallel
    chunks = torch.chunk(x, get_sequence_parallel_world_size(), dim=1)
@@ -100,20 +96,13 @@ def usp_dit_forward(self,
    x = chunks[get_sequence_parallel_rank()]

    for block in self.blocks:
-        if self.training and use_gradient_checkpointing:
-            if use_gradient_checkpointing_offload:
-                with torch.autograd.graph.save_on_cpu():
-                    x = torch.utils.checkpoint.checkpoint(
-                        create_custom_forward(block),
-                        x, context, t_mod, freqs,
-                        use_reentrant=False,
-                    )
-            else:
-                x = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(block),
-                    x, context, t_mod, freqs,
-                    use_reentrant=False,
-                )
+        if self.training:
+            x = gradient_checkpoint_forward(
+                block,
+                use_gradient_checkpointing,
+                use_gradient_checkpointing_offload,
+                x, context, t_mod, freqs
+            )
        else:
            x = block(x, context, t_mod, freqs)