support SD3 LoRA

2026-03-23 00:58:11 +00:00 · 2024-07-10 10:07:02 +08:00
parent 8113f95278
commit 979a8814f1
13 changed files with 1030 additions and 32 deletions
--- a/examples/image_synthesis/README.md
+++ b/examples/image_synthesis/README.md
@@ -1,10 +1,10 @@
 # Image Synthesis

-Image synthesis is the base feature of DiffSynth Studio.
+Image synthesis is the core feature of DiffSynth Studio. It can generate images at very high resolutions.

 ### Example: Stable Diffusion

-We can generate images with very high resolution. Please see [`sd_text_to_image.py`](./sd_text_to_image.py) for more details.
+Example script: [`sd_text_to_image.py`](./sd_text_to_image.py)

 |512*512|1024*1024|2048*2048|4096*4096|
 |-|-|-|-|
@@ -12,7 +12,7 @@ We can generate images with very high resolution. Please see [`sd_text_to_image.

 ### Example: Stable Diffusion XL

-Generate images with Stable Diffusion XL. Please see [`sdxl_text_to_image.py`](./sdxl_text_to_image.py) for more details.
+Example script: [`sdxl_text_to_image.py`](./sdxl_text_to_image.py)

 |1024*1024|2048*2048|
 |-|-|
@@ -20,15 +20,29 @@ Generate images with Stable Diffusion XL. Please see [`sdxl_text_to_image.py`](.

 ### Example: Stable Diffusion 3

-Generate images with Stable Diffusion 3. High resolution is also supported in this model. See [`sd3_text_to_image.py`](./sd3_text_to_image.py).
+Example script: [`sd3_text_to_image.py`](./sd3_text_to_image.py)
+
+LoRA Training: [`../train/stable_diffusion_3/`](../train/stable_diffusion_3/)

 |1024*1024|2048*2048|
 |-|-|
 |![image_1024](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/4df346db-6f91-420a-b4c1-26e205376098)|![image_2048](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/1386c802-e580-4101-939d-f1596802df9d)|

+### Example: Hunyuan-DiT
+
+Example script: [`hunyuan_dit_text_to_image.py`](./hunyuan_dit_text_to_image.py)
+
+LoRA Training: [`../train/hunyuan_dit/`](../train/hunyuan_dit/)
+
+|1024*1024|2048*2048|
+|-|-|
+|![image_1024](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/60b022c8-df3f-4541-95ab-bf39f2fa8bb5)|![image_2048](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/87919ea8-d428-4963-8257-da05f3901bbb)|
+
 ### Example: Stable Diffusion XL Turbo

-Generate images with Stable Diffusion XL Turbo. You can see [`sdxl_turbo.py`](./sdxl_turbo.py) for more details, but we highly recommend you to use it in the WebUI.
+Example script: [`sdxl_turbo.py`](./sdxl_turbo.py)
+
+We highly recommend using this model in the WebUI.

 |"black car"|"red car"|
 |-|-|
--- a/examples/image_synthesis/hunyuan_dit_text_to_image.py
+++ b/examples/image_synthesis/hunyuan_dit_text_to_image.py
@@ -0,0 +1,42 @@
+from diffsynth import ModelManager, HunyuanDiTImagePipeline, download_models
+import torch
+
+
+# Download models (automatically)
+# `models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin`: [link](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/resolve/main/t2i/clip_text_encoder/pytorch_model.bin)
+# `models/HunyuanDiT/t2i/mt5/pytorch_model.bin`: [link](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/resolve/main/t2i/mt5/pytorch_model.bin)
+# `models/HunyuanDiT/t2i/model/pytorch_model_ema.pt`: [link](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/resolve/main/t2i/model/pytorch_model_ema.pt)
+# `models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin`: [link](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/resolve/main/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin)
+download_models(["HunyuanDiT"])
+
+# Load models
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
+model_manager.load_models([
+    "models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin",
+    "models/HunyuanDiT/t2i/mt5/pytorch_model.bin",
+    "models/HunyuanDiT/t2i/model/pytorch_model_ema.pt",
+    "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin"
+])
+pipe = HunyuanDiTImagePipeline.from_model_manager(model_manager)
+
+prompt = "一幅充满诗意美感的全身肖像画，画中一位银发、蓝色眼睛、身穿蓝色连衣裙的少女漂浮在水下，周围是光彩的气泡，和煦的阳光透过水面折射进水下"
+negative_prompt = "错误的眼睛，糟糕的人脸，毁容，糟糕的艺术，变形，多余的肢体，模糊的颜色，模糊，重复，病态，残缺，"
+
+# Enjoy!
+torch.manual_seed(0)
+image = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    num_inference_steps=50, height=1024, width=1024,
+)
+image.save("image_1024.png")
+
+# High-resolution fix: upscale the 1024x1024 image to 2048x2048 and refine it with tiled partial denoising
+image = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    input_image=image.resize((2048, 2048)),
+    num_inference_steps=50, height=2048, width=2048,
+    denoising_strength=0.4, tiled=True,
+)
+image.save("image_2048.png")
--- a/examples/image_synthesis/sd3_text_to_image.py
+++ b/examples/image_synthesis/sd3_text_to_image.py
@@ -6,7 +6,7 @@ import torch
 # `models/stable_diffusion_3/sd3_medium_incl_clips.safetensors`: [link](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips.safetensors)
 download_models(["StableDiffusion3"])
 model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
-                             file_path_list=["models/stable_diffusion_3/sd3_medium_incl_clips_t5xxlfp16.safetensors"])
+                             file_path_list=["models/stable_diffusion_3/sd3_medium_incl_clips.safetensors"])
 pipe = SD3ImagePipeline.from_model_manager(model_manager)