Mirror of https://github.com/modelscope/DiffSynth-Studio.git (synced 2026-03-18 22:08:13 +00:00)
Merge pull request #1354 from mi804/low_vram_training_ds
low vram training with deepspeed zero3
@@ -29,7 +29,7 @@ def launch_training_task(
     dataloader = torch.utils.data.DataLoader(dataset, shuffle=True, collate_fn=lambda x: x[0], num_workers=num_workers)
-    model.to(device=accelerator.device)
     model, optimizer, dataloader, scheduler = accelerator.prepare(model, optimizer, dataloader, scheduler)
 
+    initialize_deepspeed_gradient_checkpointing(accelerator)
     for epoch_id in range(num_epochs):
         for data in tqdm(dataloader):
             with accelerator.accumulate(model):
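The new initialize_deepspeed_gradient_checkpointing(accelerator) call is a no-op unless the Accelerator was created with a DeepSpeed plugin. A minimal sketch, not part of this PR, of how such an Accelerator could be set up with Hugging Face Accelerate; the file name "ds_config.json" is a hypothetical placeholder:

from accelerate import Accelerator, DeepSpeedPlugin

# Hand Accelerate a full DeepSpeed config (ZeRO-3, offload, activation checkpointing).
# The same config is later visible as accelerator.state.deepspeed_plugin.deepspeed_config,
# which is exactly what the new helper in the next hunk inspects.
deepspeed_plugin = DeepSpeedPlugin(hf_ds_config="ds_config.json")  # hypothetical path
accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin)

In practice the plugin is usually configured through accelerate launch and an accelerate config file rather than in code, but the resulting state is the same: accelerator.prepare(...) then lets ZeRO-3 shard parameters, gradients and optimizer states across ranks, which is what enables the low-VRAM training path.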
@@ -70,3 +70,19 @@ def launch_data_process_task(
         save_path = os.path.join(model_logger.output_path, str(accelerator.process_index), f"{data_id}.pth")
         data = model(data)
         torch.save(data, save_path)
+
+
+def initialize_deepspeed_gradient_checkpointing(accelerator: Accelerator):
+    if getattr(accelerator.state, "deepspeed_plugin", None) is not None:
+        ds_config = accelerator.state.deepspeed_plugin.deepspeed_config
+        if "activation_checkpointing" in ds_config:
+            import deepspeed
+            act_config = ds_config["activation_checkpointing"]
+            deepspeed.checkpointing.configure(
+                mpu_=None,
+                partition_activations=act_config.get("partition_activations", False),
+                checkpoint_in_cpu=act_config.get("cpu_checkpointing", False),
+                contiguous_checkpointing=act_config.get("contiguous_memory_optimization", False)
+            )
+        else:
+            print("activation_checkpointing not found in the deepspeed config, skipping deepspeed gradient checkpointing initialization.")
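For reference, a minimal sketch, not from this PR, of the kind of DeepSpeed config that exercises the new branch: ZeRO stage 3 for parameter, gradient and optimizer-state sharding, plus an activation_checkpointing section whose keys map onto the arguments passed to deepspeed.checkpointing.configure above. The values are illustrative, not tuned recommendations.

ds_config = {
    "zero_optimization": {
        "stage": 3,                               # ZeRO-3: shard parameters, gradients and optimizer states
        "offload_param": {"device": "cpu"},       # optional, trades speed for extra VRAM savings
        "offload_optimizer": {"device": "cpu"},
    },
    "activation_checkpointing": {
        "partition_activations": True,            # forwarded to partition_activations
        "cpu_checkpointing": True,                # forwarded to checkpoint_in_cpu
        "contiguous_memory_optimization": False,  # forwarded to contiguous_checkpointing
    },
    "train_micro_batch_size_per_gpu": 1,
    "gradient_accumulation_steps": 1,
}
# Saved as JSON (e.g. the hypothetical ds_config.json above), this is the config a
# DeepSpeedPlugin would load via hf_ds_config.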