support sd35-lora

2026-03-19 06:32:27 +00:00 · 2024-11-29 11:45:40 +08:00
parent 7f2a5424d4
commit 9d09121fbc
2 changed files with 90 additions and 6 deletions
--- a/examples/train/README.md
+++ b/examples/train/README.md
@@ -256,6 +256,72 @@ image = pipe(
 image.save("image_with_lora.jpg")
 ```

+### Stable Diffusion 3.5 Series
+
+
+Training requires the text encoders and the DiT model file. Use the following code to download them; the expected layout under `models/stable_diffusion_3` is shown below:
+
+```python
+from diffsynth import download_models
+
+download_models(["StableDiffusion3.5-large"])
+```
+
+```
+models/stable_diffusion_3
+├── Put Stable Diffusion 3 checkpoints here.txt
+├── sd3.5_large.safetensors
+└── text_encoders
+    ├── clip_g.safetensors
+    ├── clip_l.safetensors
+    └── t5xxl_fp16.safetensors
+```
+
+Launch the training task using the following command:
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_3/train_sd3_lora.py \
+  --pretrained_path models/stable_diffusion_3/text_encoders/clip_g.safetensors,models/stable_diffusion_3/text_encoders/clip_l.safetensors,models/stable_diffusion_3/text_encoders/t5xxl_fp16.safetensors,models/stable_diffusion_3/sd3.5_large.safetensors \
+  --dataset_path data/dog \
+  --output_path ./models \
+  --max_epochs 1 \
+  --steps_per_epoch 500 \
+  --height 1024 \
+  --width 1024 \
+  --center_crop \
+  --precision "16" \
+  --learning_rate 1e-4 \
+  --lora_rank 4 \
+  --lora_alpha 4 \
+  --use_gradient_checkpointing
+```
+
+For details on all available parameters, run `python examples/train/stable_diffusion_3/train_sd3_lora.py -h`.
+
+After training, use `model_manager.load_lora` to load the LoRA for inference.
+
+```python
+from diffsynth import ModelManager, SD3ImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+                             file_path_list=[
+                                 "models/stable_diffusion_3/text_encoders/clip_g.safetensors",
+                                 "models/stable_diffusion_3/text_encoders/clip_l.safetensors",
+                                 "models/stable_diffusion_3/text_encoders/t5xxl_fp16.safetensors",
+                                 "models/stable_diffusion_3/sd3.5_large.safetensors"
+                             ])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = SD3ImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+    prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
+    num_inference_steps=30, cfg_scale=7
+)
+image.save("image_with_lora.jpg")
+```
+
 ### Stable Diffusion 3

 Only one file is required in the training script. You can use [`sd3_medium_incl_clips.safetensors`](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips.safetensors) (without T5 encoder) or [`sd3_medium_incl_clips_t5xxlfp16.safetensors`](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips_t5xxlfp16.safetensors) (with T5 encoder). Please use the following code to download these files:
@@ -285,7 +351,7 @@ CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_3/train_sd3_lora
  --height 1024 \
  --width 1024 \
  --center_crop \
-  --precision "16-mixed" \
+  --precision "16" \
  --learning_rate 1e-4 \
  --lora_rank 4 \
  --lora_alpha 4 \
--- a/examples/train/stable_diffusion_3/train_sd3_lora.py
+++ b/examples/train/stable_diffusion_3/train_sd3_lora.py
@@ -7,7 +7,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "True"
 class LightningModel(LightningModelForT2ILoRA):
    def __init__(
        self,
-        torch_dtype=torch.float16, pretrained_weights=[],
+        torch_dtype=torch.float16, pretrained_weights=[], preset_lora_path=None,
        learning_rate=1e-4, use_gradient_checkpointing=True,
        lora_rank=4, lora_alpha=4, lora_target_modules="to_q,to_k,to_v,to_out", init_lora_weights="gaussian",
    ):
@@ -16,7 +16,12 @@ class LightningModel(LightningModelForT2ILoRA):
        model_manager = ModelManager(torch_dtype=torch_dtype, device=self.device)
        model_manager.load_models(pretrained_weights)
        self.pipe = SD3ImagePipeline.from_model_manager(model_manager)
-        self.pipe.scheduler.set_timesteps(1000)
+        self.pipe.scheduler.set_timesteps(1000, training=True)
+
+        if preset_lora_path is not None:
+            preset_lora_path = preset_lora_path.split(",")
+            for path in preset_lora_path:
+                model_manager.load_lora(path)

        self.freeze_parameters()
        self.add_lora_to_model(self.pipe.denoising_model(), lora_rank=lora_rank, lora_alpha=lora_alpha, lora_target_modules=lora_target_modules, init_lora_weights=init_lora_weights)
@@ -29,14 +34,26 @@ def parse_args():
        type=str,
        default=None,
        required=True,
-        help="Path to pretrained model. For example, `models/stable_diffusion_3/sd3_medium_incl_clips.safetensors` or `models/stable_diffusion_3/sd3_medium_incl_clips_t5xxlfp16.safetensors`.",
+        help="Path to pretrained models, seperated by comma. For example, SD3: `models/stable_diffusion_3/sd3_medium_incl_clips_t5xxlfp16.safetensors`, SD3.5-large: `models/stable_diffusion_3/text_encoders/clip_g.safetensors,models/stable_diffusion_3/text_encoders/clip_l.safetensors,models/stable_diffusion_3/text_encoders/t5xxl_fp16.safetensors,models/stable_diffusion_3/sd3.5_large.safetensors`",
    )
    parser.add_argument(
        "--lora_target_modules",
        type=str,
-        default="a_to_qkv,b_to_qkv",
+        default="a_to_qkv,b_to_qkv,norm_1_a.linear,norm_1_b.linear,a_to_out,b_to_out,ff_a.0,ff_a.2,ff_b.0,ff_b.2",
        help="Layers with LoRA modules.",
    )
+    parser.add_argument(
+        "--preset_lora_path",
+        type=str,
+        default=None,
+        help="Preset LoRA path.",
+    )
+    parser.add_argument(
+        "--num_timesteps",
+        type=int,
+        default=1000,
+        help="Number of total timesteps. For turbo models, please set this parameter to the number of expected number of inference steps.",
+    )
    parser = add_general_parsers(parser)
    args = parser.parse_args()
    return args
@@ -46,7 +63,8 @@ if __name__ == '__main__':
    args = parse_args()
    model = LightningModel(
        torch_dtype=torch.float32 if args.precision == "32" else torch.float16,
-        pretrained_weights=[args.pretrained_path],
+        pretrained_weights=args.pretrained_path.split(","),
+        preset_lora_path=args.preset_lora_path,
        learning_rate=args.learning_rate,
        use_gradient_checkpointing=args.use_gradient_checkpointing,
        lora_rank=args.lora_rank,