mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-03-18 22:08:13 +00:00
Merge pull request #1354 from mi804/low_vram_training_ds
low vram training with deepspeed zero3
This commit is contained in:
@@ -1,12 +1,32 @@
|
||||
import torch
|
||||
|
||||
|
||||
try:
|
||||
import deepspeed
|
||||
_HAS_DEEPSPEED = True
|
||||
except ModuleNotFoundError:
|
||||
_HAS_DEEPSPEED = False
|
||||
|
||||
|
||||
def create_custom_forward(module):
    """Wrap *module* in a plain function that forwards every argument.

    Args:
        module: Any callable; it is invoked with exactly the positional and
            keyword arguments given to the returned function.

    Returns:
        A function ``custom_forward(*args, **kwargs)`` whose result is
        whatever ``module(*args, **kwargs)`` returns.
    """

    def custom_forward(*call_args, **call_kwargs):
        # Pure pass-through: nothing is inspected or modified on the way in
        # or on the way out.
        result = module(*call_args, **call_kwargs)
        return result

    return custom_forward
|
||||
|
||||
|
||||
def create_custom_forward_use_reentrant(module):
    """Wrap *module* in a function that accepts positional arguments only.

    Unlike ``create_custom_forward``, the returned function takes no keyword
    arguments; calling it with one raises ``TypeError``.

    Args:
        module: Any callable; it is invoked with exactly the positional
            arguments given to the returned function.

    Returns:
        A function ``custom_forward(*args)`` whose result is whatever
        ``module(*args)`` returns.
    """

    def custom_forward(*positional):
        # Arguments are handed to the module in the order they were received.
        result = module(*positional)
        return result

    return custom_forward
|
||||
|
||||
|
||||
def judge_args_requires_grad(*args):
    """Report whether any argument is a tensor that requires gradients.

    Only the arguments themselves are examined: non-tensor values are
    ignored, and tensors nested inside lists, tuples or dicts are not
    searched.

    Args:
        *args: Arbitrary values, typically the inputs of a forward call.

    Returns:
        ``True`` if at least one argument is a ``torch.Tensor`` with
        ``requires_grad`` set, otherwise ``False`` (including when called
        with no arguments).
    """
    # any() stops at the first match, just like an early return from a loop.
    return any(
        isinstance(candidate, torch.Tensor) and candidate.requires_grad
        for candidate in args
    )
|
||||
|
||||
|
||||
def gradient_checkpoint_forward(
|
||||
model,
|
||||
use_gradient_checkpointing,
|
||||
@@ -14,6 +34,17 @@ def gradient_checkpoint_forward(
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
if use_gradient_checkpointing and _HAS_DEEPSPEED and deepspeed.checkpointing.is_configured():
|
||||
all_args = args + tuple(kwargs.values())
|
||||
if not judge_args_requires_grad(*all_args):
|
||||
# get the first grad_enabled tensor from un_checkpointed forward
|
||||
model_output = model(*args, **kwargs)
|
||||
else:
|
||||
model_output = deepspeed.checkpointing.checkpoint(
|
||||
create_custom_forward_use_reentrant(model),
|
||||
*all_args,
|
||||
)
|
||||
return model_output
|
||||
if use_gradient_checkpointing_offload:
|
||||
with torch.autograd.graph.save_on_cpu():
|
||||
model_output = torch.utils.checkpoint.checkpoint(
|
||||
|
||||
Reference in New Issue
Block a user