update SD3 examples

2026-04-08 08:58:20 +00:00 · 2024-07-05 16:35:41 +08:00
parent 466b37994e
commit 9ca6c646df
3 changed files with 39 additions and 12 deletions
--- a/diffsynth/pipelines/stable_diffusion_3.py
+++ b/diffsynth/pipelines/stable_diffusion_3.py
@@ -65,8 +65,8 @@ class SD3ImagePipeline(torch.nn.Module):
    def __call__(
        self,
        prompt,
-        negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi",
-        cfg_scale=4.5,
+        negative_prompt="",
+        cfg_scale=7.5,
        input_image=None,
        denoising_strength=1.0,
        height=1024,
--- a/examples/image_synthesis/README.md
+++ b/examples/image_synthesis/README.md
@@ -18,6 +18,14 @@ Generate images with Stable Diffusion XL. Please see [`sdxl_text_to_image.py`](.
 |-|-|
 |![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/67687748-e738-438c-aee5-96096f09ac90)|![2048](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/584186bc-9855-4140-878e-99541f9a757f)|

+### Example: Stable Diffusion 3
+
+Generate images with Stable Diffusion 3. This model also supports high-resolution generation. See [`sd3_text_to_image.py`](./sd3_text_to_image.py).
+
+|1024*1024|2048*2048|
+|-|-|
+|![image_1024](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/4df346db-6f91-420a-b4c1-26e205376098)|![image_2048](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/1386c802-e580-4101-939d-f1596802df9d)|
+
 ### Example: Stable Diffusion XL Turbo

 Generate images with Stable Diffusion XL Turbo. You can see [`sdxl_turbo.py`](./sdxl_turbo.py) for more details, but we highly recommend you to use it in the WebUI.
@@ -41,3 +49,19 @@ Prompt: "一个漂亮的女孩". The [translation model](https://huggingface.co/
 |seed=0|seed=1|seed=2|seed=3|
 |-|-|-|-|
 |![0](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/778b1bd9-44e0-46ac-a99c-712b3fc9aaa4)|![1](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/c03479b8-2082-4c6e-8e1c-3582b98686f6)|![2](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/edb33d21-3288-4a55-96ca-a4bfe1b50b00)|![3](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/7848cfc1-cad5-4848-8373-41d24e98e584)|
+
+### Example: Stable Diffusion 3 with Textual Inversions (Experimental)
+
+Since Stable Diffusion 3 utilizes the same text encoder as Stable Diffusion 1.x, it supports the textual inversions designed for Stable Diffusion 1.x. However, we found that textual inversions may have unpredictable effects on the model. We can only guarantee that these textual inversions can be loaded into the model. The example script is [`sd3_text_to_image_textual_inversion.py`](./sd3_text_to_image_textual_inversion.py).
+
+Prompt: "a girl, highly detailed, absurd res, perfect image". Without any textual inversions.
+
+|seed=0|seed=1|seed=2|seed=3|
+|-|-|-|-|
+|![image_without_textual_inversion_0](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/4e918bf8-6081-4f79-a043-87adc4047d92)|![image_without_textual_inversion_1](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/2e90a01f-6a83-46ba-99b6-ab085582a5b7)|![image_without_textual_inversion_2](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/83570a6f-cddd-4d0a-8b2f-f50388e2ca8a)|![image_without_textual_inversion_3](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/f4d0f2d4-80ee-4281-923e-77d87e3d37b1)|
+
+Prompt: "a girl, highly detailed, absurd res, perfect image". With [`verybadimagenegative_v1.3`](https://civitai.com/models/11772/verybadimagenegative) in the negative prompt.
+
+|seed=0|seed=1|seed=2|seed=3|
+|-|-|-|-|
+|![image_with_textual_inversion_0](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/1b3485ee-e7c1-4306-8f93-c9f32d1ac937)|![image_with_textual_inversion_1](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/5d7c6c4b-afdf-42b0-8e94-1959f1a44491)|![image_with_textual_inversion_2](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/92e93c4e-2781-41df-a246-2d2e9bde97c4)|![image_with_textual_inversion_3](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/070966a0-3d5c-48d8-8199-9d7c80408689)|
--- a/examples/image_synthesis/sd3_text_to_image.py
+++ b/examples/image_synthesis/sd3_text_to_image.py
@@ -4,26 +4,29 @@ import torch

 # Download models (automatically)
 # `models/stable_diffusion_3/sd3_medium_incl_clips.safetensors`: [link](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips.safetensors)
-download_models(["StableDiffusion3_without_T5"])
+download_models(["StableDiffusion3"])
 model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
-                             file_path_list=["models/stable_diffusion_3/sd3_medium_incl_clips.safetensors"])
+                             file_path_list=["models/stable_diffusion_3/sd3_medium_incl_clips_t5xxlfp16.safetensors"])
 pipe = SD3ImagePipeline.from_model_manager(model_manager)


-torch.manual_seed(0)
+prompt = "masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait,"
+negative_prompt = "worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,"
+
+torch.manual_seed(7)
 image = pipe(
-    prompt="a white cat, colorful ink painting, cyberpunk, unreal", 
-    negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi",
-    cfg_scale=4.5,
-    num_inference_steps=50, width=1024, height=1024,
+    prompt=prompt, 
+    negative_prompt=negative_prompt,
+    cfg_scale=7.5,
+    num_inference_steps=100, width=1024, height=1024,
 )
 image.save("image_1024.jpg")

 image = pipe(
-    prompt="a white cat, colorful ink painting, cyberpunk, unreal", 
-    negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi",
+    prompt=prompt, 
+    negative_prompt=negative_prompt,
+    cfg_scale=7.5,
    input_image=image.resize((2048, 2048)), denoising_strength=0.5,
-    cfg_scale=4.5,
    num_inference_steps=50, width=2048, height=2048,
    tiled=True
 )