update eligen examples and readme

2026-04-08 08:58:20 +00:00 · 2025-01-09 15:47:23 +08:00
parent a60bf3cd5f
commit c2478ff284
7 changed files with 101 additions and 53 deletions
--- a/README.md
+++ b/README.md
@@ -34,9 +34,9 @@ Until now, DiffSynth Studio has supported the following models:
 * [Stable Diffusion](https://huggingface.co/runwayml/stable-diffusion-v1-5)

 ## News
- **December 31, 2024** We propose EliGen, a novel framework for precise entity-level controlled text-to-image generation, complemented by an inpainting fusion pipeline to extend its capabilities to image inpainting tasks. EliGen seamlessly integrates with existing community models, such as IP-Adapter, enhancing its versatility. For more details, see [./examples/EntityControl](./examples/EntityControl/README.md).
-  * Paper: Comming soon
-  * Github: https://github.com/modelscope/DiffSynth-Studio
+- **December 31, 2024** We propose EliGen, a novel framework for precise entity-level controlled text-to-image generation, complemented by an inpainting fusion pipeline to extend its capabilities to image inpainting tasks. EliGen seamlessly integrates with existing community models, such as IP-Adapter and In-Context LoRA, enhancing its versatility. For more details, see [./examples/EntityControl](./examples/EntityControl/).
+  * Paper: [EliGen: Entity-Level Controlled Image Generation with Regional Attention](https://arxiv.org/abs/2501.01097)
+  * Github: [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio)
  * Model: [ModelScope](https://www.modelscope.cn/models/DiffSynth-Studio/Eligen)
  * Training dataset: Coming soon

--- a/apps/gradio/eligen_ui.py
+++ b/apps/gradio/eligen_ui.py
@@ -9,6 +9,7 @@ import json

 def save_mask_prompts(masks, mask_prompts, global_prompt, seed=0, random_dir='0000000'):
    save_dir = os.path.join('workdirs/tmp_mask', random_dir)
+    print(f'save to {save_dir}')
    os.makedirs(save_dir, exist_ok=True)
    for i, mask in enumerate(masks):
        save_path = os.path.join(save_dir, f'{i}.png')
@@ -205,8 +206,8 @@ with gr.Blocks() as app:
                        local_prompt = gr.Textbox(label="Local prompt", key=f"local_prompt_{painter_layer_id}")
                        mask_scale = gr.Slider(minimum=0.0, maximum=5.0, value=1.0, step=0.1, interactive=True, label="Mask scale", key=f"mask_scale_{painter_layer_id}")
                        canvas = gr.ImageEditor(canvas_size=(512, 1), sources=None, layers=False, interactive=True, image_mode="RGBA",
-                                                brush=gr.Brush(default_size=30, default_color="#000000", colors=["#000000"]),
-                                                label="Painter", key=f"canvas_{painter_layer_id}")
+                                                brush=gr.Brush(default_size=50, default_color="#000000", colors=["#000000"]),
+                                                label="Painter", key=f"canvas_{painter_layer_id}", width=width, height=height)
                        @gr.on(inputs=[height, width, canvas], outputs=canvas, triggers=[height.change, width.change, canvas.clear, enable_local_prompt.change], show_progress="hidden")
                        def resize_canvas(height, width, canvas):
                            h, w = canvas["background"].shape[:2]
--- a/examples/EntityControl/README.md
+++ b/examples/EntityControl/README.md
@@ -2,10 +2,10 @@

 ## Introduction

-We propose EliGen, a novel approach that leverages fine-grained entity-level information to enable precise and controllable text-to-image generation. EliGen excels in tasks such as entity-level controlled image generation and image inpainting, while its applicability is not limited to these areas. Additionally, it can be seamlessly integrated with existing community models, such as the IP-Adpater.
+We propose EliGen, a novel approach that leverages fine-grained entity-level information to enable precise and controllable text-to-image generation. EliGen excels in tasks such as entity-level controlled image generation and image inpainting, while its applicability is not limited to these areas. Additionally, it can be seamlessly integrated with existing community models, such as the IP-Adapter and In-Context LoRA.

-* Paper: Comming soon
-* Github: https://github.com/modelscope/DiffSynth-Studio
+* Paper: [EliGen: Entity-Level Controlled Image Generation with Regional Attention](https://arxiv.org/abs/2501.01097)
+* Github: [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio)
 * Model: [ModelScope](https://www.modelscope.cn/models/DiffSynth-Studio/Eligen)
 * Training dataset: Coming soon

@@ -13,23 +13,25 @@ We propose EliGen, a novel approach that leverages fine-grained entity-level inf

 ![regional-attention](https://github.com/user-attachments/assets/9a147201-15ab-421f-a6c5-701075754478)

-We introduce a regional attention mechanism within the DiT framework to effectively process the conditions of each entity. This mechanism enables the local prompt associated with each entity to semantically influence specific regions through regional attention. To further enhance the layout control capabilities of EliGen, we meticulously curate an entity-annotated dataset and fine-tune the model using the LoRA framework. 
+We introduce a regional attention mechanism within the DiT framework to effectively process the conditions of each entity. This mechanism enables the local prompt associated with each entity to semantically influence specific regions through regional attention. To further enhance the layout control capabilities of EliGen, we meticulously contribute an entity-annotated dataset and fine-tune the model using the LoRA framework. 

 1. **Regional Attention**: Regional attention, shown in the figure above, can be easily applied to other text-to-image models. Its core principle involves transforming the positional information of each entity into an attention mask, ensuring that the mechanism only affects the designated regions.
   
-2. **Dataset with Entity Annotation**: To curate a dedicated entity control dataset, we start by randomly selecting captions from DiffusionDB and generating the corresponding source image using Flux. Next, we employ Qwen2-VL 72B, recognized for its advanced grounding capabilities among MLLMs, to randomly identify entities within the image. These entities are annotated with local prompts and bounding boxes for precise localization, forming the foundation of our dataset for further training.
+2. **Dataset with Entity Annotation**: To construct a dedicated entity control dataset, we start by randomly selecting captions from DiffusionDB and generating the corresponding source image using Flux. Next, we employ Qwen2-VL 72B, recognized for its advanced grounding capabilities among MLLMs, to randomly identify entities within the image. These entities are annotated with local prompts and bounding boxes for precise localization, forming the foundation of our dataset for further training.

-3. **Training**: We apply LoRA and deepspeed to finetune regional attention with curated dataset, enabling our EliGen performing effective entity-level control.
+3. **Training**: We utilize LoRA (Low-Rank Adaptation) and DeepSpeed to fine-tune regional attention mechanisms using a curated dataset, enabling our EliGen model to achieve effective entity-level control.

 ## Usage
 1. **Entity-Level Controlled Image Generation**
-See [./entity_control.py](./entity_control.py) for usage.
+   EliGen achieves effective entity-level control results. See [./entity_control.py](./entity_control.py) for usage.
 2. **Image Inpainting**
   To apply EliGen to the image inpainting task, we propose an inpainting fusion pipeline to preserve the non-painting areas while enabling precise, entity-level modifications over inpainting regions.
   See [./entity_inpaint.py](./entity_inpaint.py) for usage.
 3. **Styled Entity Control**
   EliGen can be seamlessly integrated with existing community models. We have provided an example of how to integrate it with the IP-Adapter. See [./entity_control_ipadapter.py](./entity_control_ipadapter.py) for usage.
-4. **Play with EliGen using UI**
+4. **Entity Transfer**
+   We have provided an example of how to integrate EliGen with In-Context LoRA, which achieves interesting entity transfer results. See [./entity_transfer.py](./entity_transfer.py) for usage.
+5. **Play with EliGen using UI**
   Download the checkpoint of EliGen from [ModelScope](https://www.modelscope.cn/models/DiffSynth-Studio/Eligen) to `models/lora/entity_control` and run the following command to try interactive UI: 
   ```bash
   python apps/gradio/entity_level_control.py
@@ -41,20 +43,34 @@ See [./entity_control.py](./entity_control.py) for usage.

 https://github.com/user-attachments/assets/4fc76df1-b26a-46e8-a950-865cdf02a38d

-2. The image generation effect of complex Entity combinations, demonstrating the strong generalization of EliGen.
+2. The image generation effect of complex Entity combinations, demonstrating the strong generalization of EliGen. See [./entity_control.py](./entity_control.py) `example_1-6` for generation prompts.

-|![image_1_base](https://github.com/user-attachments/assets/b8564b28-19b5-424f-bf3c-6476f2923ff9)|![image_1_base](https://github.com/user-attachments/assets/20793715-42d3-46f7-8d62-0cb4cacef38d)|
+|![image_1_base](https://github.com/user-attachments/assets/4b9fb79f-cb3c-45a5-8d22-14e52865387c)|![image_1_base](https://github.com/user-attachments/assets/2e60e51b-f8d5-4b25-ae21-f64531e20b1b)|
 |-|-|
-|![image_1_base](https://github.com/user-attachments/assets/70ef12fe-d300-4b52-9d11-eabc9b5464a8)|![image_1_enhance](https://github.com/user-attachments/assets/7645ce0e-4aa7-4b1e-b7a7-bccfd9796461)|
-|![image_2_base](https://github.com/user-attachments/assets/2f1e44e1-8f1f-4c6e-ab7a-1b6861a33a69)|![image_2_enhance](https://github.com/user-attachments/assets/faf78498-57ba-41bd-b516-570c86984515)|
-|![image_3_base](https://github.com/user-attachments/assets/206d1cef-2e96-4469-aed5-cdeb06ab9e99)|![image_3_enhance](https://github.com/user-attachments/assets/75d784d6-d5a1-474f-a5d5-ef8074135f35)|
+|![image_1_base](https://github.com/user-attachments/assets/5bdfebc2-8c1e-4619-87f3-579883a3671e)|![image_1_enhance](https://github.com/user-attachments/assets/d38be37d-68ed-4123-9cb2-429069dbd870)|
+|![image_2_base](https://github.com/user-attachments/assets/e4b37440-fde0-4d7c-9658-98995d335097)|![image_2_enhance](https://github.com/user-attachments/assets/aa4ccae9-1074-4200-b890-5687f3409a70)|
+
+3. Demonstration of the robustness of EliGen. The following examples are generated using the same prompt but different seeds. Refer to [./entity_control.py](./entity_control.py) `example_7` for the prompts.
+
+|![image_1_base](https://github.com/user-attachments/assets/fb39ca42-074b-4d7c-85c8-55e4dec9a851)|![image_1_base](https://github.com/user-attachments/assets/34d7f17a-06b6-492a-8522-44aa1c75f233)|
+|-|-|
+|![image_1_base](https://github.com/user-attachments/assets/cecfee76-7e44-496b-8d02-ffa29f5142a3)|![image_1_enhance](https://github.com/user-attachments/assets/a51d3bba-52e6-483f-9c75-f7e87120b30c)|
+
 ### Image Inpainting
+Demonstration of the inpainting mode of EliGen, see [./entity_inpaint.py](./entity_inpaint.py) for generation prompts.
 |Inpainting Input|Inpainting Output|
 |-|-|
+|![image_2_base](https://github.com/user-attachments/assets/5ef499f3-3d8a-49cc-8ceb-86af7f5cb9f8)|![image_2_enhance](https://github.com/user-attachments/assets/88fc3bde-0984-4b3c-8ca9-d63de660855b)|
 |![image_1_base](https://github.com/user-attachments/assets/5f74c710-bf30-4db1-ae40-a1e1995ccef6)|![image_1_enhance](https://github.com/user-attachments/assets/1cd71177-e956-46d3-86ce-06f774c96efd)|
-|![image_2_base](https://github.com/user-attachments/assets/5ef499f3-3d8a-49cc-8ceb-86af7f5cb9f8)|![image_2_enhance](https://github.com/user-attachments/assets/fb967035-7b28-466c-a753-c00135559121)|
 ### Styled Entity Control
+Demonstration of the styled entity control results with EliGen and IP-Adapter, see [./entity_control_ipadapter.py](./entity_control_ipadapter.py) for generation prompts.
 |Style Reference|Entity Control Variance 1|Entity Control Variance 2|Entity Control Variance 3|
 |-|-|-|-|
 |![image_1_base](https://github.com/user-attachments/assets/5e2dd3ab-37d3-4f58-8e02-ee2f9b238604)|![image_1_enhance](https://github.com/user-attachments/assets/0f6711a2-572a-41b3-938a-95deff6d732d)|![image_1_enhance](https://github.com/user-attachments/assets/ce2e66e5-1fdf-44e8-bca7-555d805a50b1)|![image_1_enhance](https://github.com/user-attachments/assets/ad2da233-2f7c-4065-ab57-b2d84dc2c0e2)|
-|![image_2_base](https://github.com/user-attachments/assets/77cf7ceb-48e3-442d-8ffc-5fa4a10fe81a)|![image_2_enhance](https://github.com/user-attachments/assets/59a4f3c2-e59d-40c7-886c-0768f14fcc89)|![image_2_enhance](https://github.com/user-attachments/assets/a9187fb0-489a-49c9-a52f-56b1bd96faf7)|![image_2_enhance](https://github.com/user-attachments/assets/a62caee4-3863-4b56-96ff-e0785c6d93bb)|
+
+### Entity Transfer
+Demonstration of the entity transfer results with EliGen and In-Context LoRA, see [./entity_transfer.py](./entity_transfer.py) for generation prompts.
+
+|Entity to Transfer|Transfer Target Image|Transfer Example 1|Transfer Example 2|
+|-|-|-|-|
+|![image_1_base](https://github.com/user-attachments/assets/bb3d4a46-8d82-4d3c-bce8-8c01a9973b8d)|![image_1_enhance](https://github.com/user-attachments/assets/44c0f422-525e-42ca-991b-f407f8faafc3)|![image_1_enhance](https://github.com/user-attachments/assets/a042ff5b-2748-4d91-8321-cec8f9eb73e4)|![image_1_enhance](https://github.com/user-attachments/assets/98f2d1b1-16e1-4c8f-b521-5cd68b567293)|
--- a/examples/EntityControl/entity_control.py
+++ b/examples/EntityControl/entity_control.py
@@ -4,6 +4,26 @@ from examples.EntityControl.utils import visualize_masks
 from PIL import Image
 import torch

+def example(pipe, seeds, example_id, global_prompt, entity_prompts):
+    dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern=f"data/examples/eligen/entity_control/example_{example_id}/*.png")
+    masks = [Image.open(f"./data/examples/eligen/entity_control/example_{example_id}/{i}.png").convert('RGB') for i in range(len(entity_prompts))]
+    negative_prompt = "worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,"
+    for seed in seeds:
+        # generate image
+        image = pipe(
+            prompt=global_prompt,
+            cfg_scale=3.0,
+            negative_prompt=negative_prompt,
+            num_inference_steps=50,
+            embedded_guidance=3.5,
+            seed=seed,
+            height=1024,
+            width=1024,
+            eligen_entity_prompts=entity_prompts,
+            eligen_entity_masks=masks,
+        )
+        image.save(f"eligen_example_{example_id}_{seed}.png")
+        visualize_masks(image, masks, entity_prompts, f"eligen_example_{example_id}_mask_{seed}.png")

 # download and load model
 model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cuda", model_id_list=["FLUX.1-dev"])
@@ -17,27 +37,38 @@ model_manager.load_lora(
 )
 pipe = FluxImagePipeline.from_model_manager(model_manager)

-# download and load mask images
-dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern="data/examples/eligen/mask*")
-masks = [Image.open(f"./data/examples/eligen/mask{i}.png") for i in range(1, 8)]
+# example 1
+global_prompt = "A breathtaking beauty of Raja Ampat by the late-night moonlight , one beautiful woman from behind wearing a pale blue long dress with soft glow, sitting at the top of a cliff looking towards the beach,pastell light colors, a group of small distant birds flying in far sky, a boat sailing on the sea, best quality, realistic, whimsical, fantastic, splash art, intricate detailed, hyperdetailed, maximalist style, photorealistic, concept art, sharp focus, harmony, serenity, tranquility, soft pastell colors,ambient occlusion, cozy ambient lighting, masterpiece, liiv1, linquivera, metix, mentixis, masterpiece, award winning, view from above\n"
+entity_prompts = ["cliff", "sea", "moon", "sailing boat", "a seated beautiful woman", "pale blue long dress with soft glow"]
+example(pipe, [0], 1, global_prompt, entity_prompts)

-entity_prompts = ["A beautiful woman", "mirror", "necklace", "glasses", "earring", "white dress", "jewelry headpiece"]
+# example 2
+global_prompt = "samurai girl wearing a kimono, she's holding a sword  glowing with red flame, her long hair is flowing in the wind, she is looking at a small bird perched on the back of her hand. ultra realist style. maximum image detail. maximum realistic render."
+entity_prompts = ["flowing hair", "sword glowing with red flame", "A cute bird", "blue belt"]
+example(pipe, [0], 2, global_prompt, entity_prompts)
+
+# example 3
+global_prompt = "Image of a neverending staircase up to a mysterious palace in the sky, The ancient palace stood majestically atop a mist-shrouded mountain, sunrise, two traditional monk walk in the stair looking at the sunrise, fog,see-through, best quality, whimsical, fantastic, splash art, intricate detailed, hyperdetailed, photorealistic, concept art, harmony, serenity, tranquility, ambient occlusion, halation, cozy ambient lighting, dynamic lighting,masterpiece, liiv1, linquivera, metix, mentixis, masterpiece, award winning,"
+entity_prompts = ["ancient palace", "stone staircase with railings", "a traditional monk", "a traditional monk"]
+example(pipe, [27], 3, global_prompt, entity_prompts)
+
+# example 4
+global_prompt = "A beautiful girl wearing shirt and shorts in the street,  holding a sign 'Entity Control'"
+entity_prompts = ["A beautiful girl", "sign 'Entity Control'", "shorts", "shirt"]
+example(pipe, [21], 4, global_prompt, entity_prompts)
+
+# example 5
+global_prompt = "A captivating, dramatic scene in a painting that exudes mystery and foreboding. A white sky, swirling blue clouds, and a crescent yellow moon illuminate a solitary woman standing near the water's edge. Her long dress flows in the wind, silhouetted against the eerie glow. The water mirrors the fiery sky and moonlight, amplifying the uneasy atmosphere."
+entity_prompts = ["crescent yellow moon", "a solitary woman", "water", "swirling blue clouds"]
+example(pipe, [0], 5, global_prompt, entity_prompts)
+
+# example 6
+global_prompt = "Snow White and the 6 Dwarfs."
+entity_prompts = ["Dwarf 1", "Dwarf 2", "Dwarf 3", "Snow White", "Dwarf 4", "Dwarf 5", "Dwarf 6"]
+example(pipe, [8], 6, global_prompt, entity_prompts)
+
+# example 7, same prompt with different seeds
+seeds = range(5, 9)
 global_prompt = "A beautiful woman wearing white dress, holding a mirror, with a warm light background;"
-negative_prompt = "worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw"
-
-# generate image
-image = pipe(
-    prompt=global_prompt,
-    cfg_scale=3.0,
-    negative_prompt=negative_prompt,
-    num_inference_steps=50,
-    embedded_guidance=3.5,
-    seed=4,
-    height=1024,
-    width=1024,
-    eligen_entity_prompts=entity_prompts,
-    eligen_entity_masks=masks,
-    enable_eligen_on_negative=False,
-)
-image.save(f"entity_control.png")
-visualize_masks(image, masks, entity_prompts, f"entity_control_with_mask.png")
+entity_prompts = ["A beautiful woman", "mirror", "necklace", "glasses", "earring", "white dress", "jewelry headpiece"]
+example(pipe, seeds, 7, global_prompt, entity_prompts)
--- a/examples/EntityControl/entity_control_ipadapter.py
+++ b/examples/EntityControl/entity_control_ipadapter.py
@@ -18,13 +18,13 @@ model_manager.load_lora(
 pipe = FluxImagePipeline.from_model_manager(model_manager)

 # download and load mask images
-dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern="data/examples/eligen/ipadapter*")
-masks = [Image.open(f"./data/examples/eligen/ipadapter_mask_{i}.png") for i in range(1, 4)]
+dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern="data/examples/eligen/ipadapter/*")
+masks = [Image.open(f"./data/examples/eligen/ipadapter/ipadapter_mask_{i}.png") for i in range(1, 4)]

 entity_prompts = ['A girl', 'hat', 'sunset']
 global_prompt = "A girl wearing a hat, looking at the sunset"
 negative_prompt = "worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw"
-reference_img = Image.open("./data/examples/eligen/ipadapter_image.png")
+reference_img = Image.open("./data/examples/eligen/ipadapter/ipadapter_image.png")

 # generate image
 image = pipe(
--- a/examples/EntityControl/entity_inpaint.py
+++ b/examples/EntityControl/entity_inpaint.py
@@ -17,9 +17,9 @@ model_manager.load_lora(
 pipe = FluxImagePipeline.from_model_manager(model_manager)

 # download and load mask images
-dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern="data/examples/eligen/inpaint*")
-masks = [Image.open(f"./data/examples/eligen/inpaint_mask_{i}.png") for i in range(1, 3)]
-input_image = Image.open("./data/examples/eligen/inpaint_image.jpg")
+dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern="data/examples/eligen/inpaint/*")
+masks = [Image.open(f"./data/examples/eligen/inpaint/inpaint_mask_{i}.png") for i in range(1, 3)]
+input_image = Image.open("./data/examples/eligen/inpaint/inpaint_image.jpg")

 entity_prompts = ["A person wear red shirt", "Airplane"]
 global_prompt = "A person walking on the path in front of a house; An airplane in the sky"
--- a/examples/EntityControl/entity_transfer.py
+++ b/examples/EntityControl/entity_transfer.py
@@ -62,14 +62,14 @@ def generate(pipe: FluxImagePipeline, logo_image, target_image, mask, height, wi

 pipe = build_pipeline()

-dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern="data/examples/eligen/logo_transfer*")
-logo_image = Image.open("data/examples/eligen/logo_transfer_logo.png")
-target_image = Image.open("data/examples/eligen/logo_transfer_target_image.png")
+dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern="data/examples/eligen/logo_transfer/*")
+logo_image = Image.open("data/examples/eligen/logo_transfer/logo_transfer_logo.png")
+target_image = Image.open("data/examples/eligen/logo_transfer/logo_transfer_target_image.png")

 prompt="The two-panel image showcases the joyful identity, with the left panel showing a rabbit graphic; [LEFT] while the right panel translates the design onto a shopping tote with the rabbit logo in black, held by a person in a market setting, emphasizing the brand's approachable and eco-friendly vibe."
 logo_prompt="a rabbit logo"

-mask = Image.open("data/examples/eligen/logo_transfer_mask_1.png")
+mask = Image.open("data/examples/eligen/logo_transfer/logo_transfer_mask_1.png")
 generate(
    pipe, logo_image, target_image, mask, 
    height=1024, width=736,
@@ -78,7 +78,7 @@ generate(
    mask_save_path="entity_transfer_with_mask_1.png"
 )

-mask = Image.open("data/examples/eligen/logo_transfer_mask_2.png")
+mask = Image.open("data/examples/eligen/logo_transfer/logo_transfer_mask_2.png")
 generate(
    pipe, logo_image, target_image, mask, 
    height=1024, width=736,