Update example: entity-level control

This commit is contained in:
mi804
2024-12-31 14:04:28 +08:00
parent e3d89cec0c
commit b6620f3dde
7 changed files with 487 additions and 113 deletions

View File

@@ -0,0 +1 @@
# EliGen: Entity-Level Controlled Image Generation

View File

@@ -0,0 +1,57 @@
import torch
from diffsynth import ModelManager, FluxImagePipeline, download_customized_models
from examples.EntityControl.utils import visualize_masks
from PIL import Image
import requests
from io import BytesIO

# Download the EliGen entity-control LoRA and load it on top of FLUX.1-dev.
lora_path = download_customized_models(
    model_id="DiffSynth-Studio/Eligen",
    origin_file_path="model_bf16.safetensors",
    local_dir="models/lora/entity_control"
)[0]
model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cuda", model_id_list=["FLUX.1-dev"])
model_manager.load_lora(lora_path, lora_alpha=1.)
pipe = FluxImagePipeline.from_model_manager(model_manager)

# Prepare inputs.
image_shape = 1024  # output is an image_shape x image_shape image
seed = 4
# Set True to also apply regional attention when predicting the negative
# prompt: better results at the cost of extra inference time.
# (Variable name keeps the pipeline kwarg's spelling.)
use_seperated_negtive_prompt = False
mask_urls = [
    'https://github.com/user-attachments/assets/02905f6e-40c2-4482-9abe-b1ce50ccabbf',
    'https://github.com/user-attachments/assets/a4cf4361-abf7-4556-ba94-74683eda4cb7',
    'https://github.com/user-attachments/assets/b6595ff4-7269-4d8f-acf0-5df40bd6c59f',
    'https://github.com/user-attachments/assets/941d39a7-3aa1-437f-8b2a-4adb15d2fb3e',
    'https://github.com/user-attachments/assets/400c4086-5398-4291-b1b5-22d8483c08d9',
    'https://github.com/user-attachments/assets/ce324c77-fa1d-4aad-a5cb-698f0d5eca70',
    'https://github.com/user-attachments/assets/4e62325f-a60c-44f7-b53b-6da0869bb9db'
]
# Download one mask per entity; NEAREST resampling keeps the mask edges hard.
masks = []
for url in mask_urls:
    response = requests.get(url, timeout=60)  # bounded wait: requests has no default timeout
    response.raise_for_status()               # fail loudly on HTTP errors instead of decoding an error page
    mask = Image.open(BytesIO(response.content)).resize((image_shape, image_shape), resample=Image.NEAREST)
    masks.append(mask)
entity_prompts = ["A beautiful woman", "mirror", "necklace", "glasses", "earring", "white dress", "jewelry headpiece"]
global_prompt = "A beautiful woman wearing white dress, holding a mirror, with a warm light background;"
negative_prompt = "worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw"

# Generate the image with per-entity regional control.
torch.manual_seed(seed)
image = pipe(
    prompt=global_prompt,
    cfg_scale=3.0,
    negative_prompt=negative_prompt,
    num_inference_steps=50,
    embedded_guidance=3.5,
    height=image_shape,
    width=image_shape,
    entity_prompts=entity_prompts,
    entity_masks=masks,
    use_seperated_negtive_prompt=use_seperated_negtive_prompt,
)
image.save("entity_control.png")  # f-prefix removed: no placeholders
visualize_masks(image, masks, entity_prompts, "entity_control_with_mask.png")

View File

@@ -1,54 +0,0 @@
import torch
from diffsynth import ModelManager, FluxImagePipeline, download_customized_models, FluxImageLoraPipeline
from examples.EntityControl.utils import visualize_masks
import os
import json
from PIL import Image

# Development script: run EliGen entity control over locally prepared mask
# sets, sweeping several seeds per mask set.
# lora_path = download_customized_models(
#     model_id="DiffSynth-Studio/ArtAug-lora-FLUX.1dev-v1",
#     origin_file_path="merged_lora.safetensors",
#     local_dir="models/lora"
# )[0]
lora_path = '/root/model_bf16.safetensors'  # local checkpoint; public download commented out above
model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cuda")
model_manager.load_models([
    "t2i_models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
    "t2i_models/FLUX/FLUX.1-dev/text_encoder_2",
    "t2i_models/FLUX/FLUX.1-dev/ae.safetensors",
    "t2i_models/FLUX/FLUX.1-dev/flux1-dev.safetensors"
])
model_manager.load_lora(lora_path, lora_alpha=1.)
pipe = FluxImagePipeline.from_model_manager(model_manager)
mask_dir = '/mnt/nas1/zhanghong/DiffSynth-Studio/workdirs/tmp_mask'
image_shape = 1024
guidance = 3.5
cfg = 3.0
negative_prompt = "worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,"
names = ['row_2_1']
seeds = [0]
# use this to apply regional attention in negative prompt prediction for better results with more time
use_seperated_negtive_prompt = False
for name, _ in zip(names, seeds):  # zipped seed was shadowed by the inner range loop, so it is unused
    out_dir = f'workdirs/entity_control/{name}'
    os.makedirs(out_dir, exist_ok=True)
    cur_dir = os.path.join(mask_dir, name)
    with open(os.path.join(cur_dir, 'prompts.json')) as f:  # close the handle (was a bare open())
        metas = json.load(f)
    for seed in range(3, 10):
        prompt = metas['global_prompt']
        mask_prompts = metas['mask_prompts']
        # One binary mask image per entity prompt, indexed 0..len-1.
        masks = [Image.open(os.path.join(cur_dir, f"{mask_idx}.png")).resize((image_shape, image_shape), resample=Image.NEAREST) for mask_idx in range(len(mask_prompts))]
        torch.manual_seed(seed)
        image = pipe(
            prompt=prompt,
            cfg_scale=cfg,
            negative_prompt=negative_prompt,
            num_inference_steps=50, embedded_guidance=guidance, height=image_shape, width=image_shape,
            entity_prompts=mask_prompts, entity_masks=masks,
            use_seperated_negtive_prompt=use_seperated_negtive_prompt
        )
        use_sep = '_sepneg' if use_seperated_negtive_prompt else ''  # f-prefix removed: no placeholders
        visualize_masks(image, masks, mask_prompts, os.path.join(out_dir, f"{name}_{seed}{use_sep}.png"))

View File

@@ -0,0 +1,51 @@
import torch
from diffsynth import ModelManager, FluxImagePipeline, download_customized_models
from examples.EntityControl.utils import visualize_masks
from PIL import Image
import requests
from io import BytesIO

# Download the EliGen entity-control LoRA and load it together with the
# IP-Adapter so a reference image can steer the overall style.
lora_path = download_customized_models(
    model_id="DiffSynth-Studio/Eligen",
    origin_file_path="model_bf16.safetensors",
    local_dir="models/lora/entity_control"
)[0]
model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cuda", model_id_list=["FLUX.1-dev", "InstantX/FLUX.1-dev-IP-Adapter"])
model_manager.load_lora(lora_path, lora_alpha=1.)
pipe = FluxImagePipeline.from_model_manager(model_manager)

# Prepare inputs.
image_shape = 1024  # output is an image_shape x image_shape image
seed = 4
# Set True to also apply regional attention when predicting the negative
# prompt: better results at the cost of extra inference time.
use_seperated_negtive_prompt = False
mask_urls = [
    'https://github.com/user-attachments/assets/e6745b3f-ab2b-4612-9bb5-b7235474a9a4',
    'https://github.com/user-attachments/assets/5ddf9a89-32fa-4540-89ad-e956130942b3',
    'https://github.com/user-attachments/assets/9d8a0bb0-6817-497e-af85-44f2512afe79'
]
# Download one mask per entity; NEAREST resampling keeps the mask edges hard.
masks = []
for url in mask_urls:
    response = requests.get(url, timeout=60)  # bounded wait: requests has no default timeout
    response.raise_for_status()               # fail loudly on HTTP errors instead of decoding an error page
    mask = Image.open(BytesIO(response.content)).resize((image_shape, image_shape), resample=Image.NEAREST)
    masks.append(mask)
entity_prompts = ['A girl', 'hat', 'sunset']
global_prompt = "A girl wearing a hat, looking at the sunset"
negative_prompt = "worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw"
# Fetch the style-reference image consumed by the IP-Adapter.
response = requests.get('https://github.com/user-attachments/assets/019bbfaa-04b3-4de6-badb-32b67c29a1bc', timeout=60)
response.raise_for_status()
reference_img = Image.open(BytesIO(response.content)).convert('RGB').resize((image_shape, image_shape))
torch.manual_seed(seed)
image = pipe(
    prompt=global_prompt,
    cfg_scale=3.0,
    negative_prompt=negative_prompt,
    num_inference_steps=50, embedded_guidance=3.5, height=image_shape, width=image_shape,
    entity_prompts=entity_prompts, entity_masks=masks,
    use_seperated_negtive_prompt=use_seperated_negtive_prompt,
    ipadapter_images=[reference_img], ipadapter_scale=0.7
)
image.save("styled_entity_control.png")  # f-prefix removed: no placeholders
visualize_masks(image, masks, entity_prompts, "styled_entity_control_with_mask.png")

View File

@@ -0,0 +1,58 @@
import torch
from diffsynth import ModelManager, FluxImagePipeline, download_customized_models, FluxImageLoraPipeline
from examples.EntityControl.utils import visualize_masks
import os
import json
from PIL import Image
import requests
from io import BytesIO

# Download the EliGen entity-control LoRA and load it on top of FLUX.1-dev;
# this example inpaints masked regions of an input image.
lora_path = download_customized_models(
    model_id="DiffSynth-Studio/Eligen",
    origin_file_path="model_bf16.safetensors",
    local_dir="models/lora/entity_control"
)[0]
model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cuda", model_id_list=["FLUX.1-dev"])
model_manager.load_lora(lora_path, lora_alpha=1.)
pipe = FluxImagePipeline.from_model_manager(model_manager)

# Prepare inputs.
image_shape = 1024  # output is an image_shape x image_shape image
seed = 0
# Set True to also apply regional attention when predicting the negative
# prompt: better results at the cost of extra inference time.
use_seperated_negtive_prompt = False
mask_urls = [
    'https://github.com/user-attachments/assets/0cf78663-5314-4280-a065-31ded7a24a46',
    'https://github.com/user-attachments/assets/bd3938b8-72a8-4d56-814f-f6445971b91d'
]
# Download one mask per entity; NEAREST resampling keeps the mask edges hard.
masks = []
for url in mask_urls:
    response = requests.get(url, timeout=60)  # bounded wait: requests has no default timeout
    response.raise_for_status()               # fail loudly on HTTP errors instead of decoding an error page
    mask = Image.open(BytesIO(response.content)).resize((image_shape, image_shape), resample=Image.NEAREST)
    masks.append(mask)
entity_prompts = ["A person wear red shirt", "Airplane"]
global_prompt = "A person walking on the path in front of a house; An airplane in the sky"
negative_prompt = "worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw, blur"
# Fetch the image whose masked regions will be inpainted.
response = requests.get('https://github.com/user-attachments/assets/fa4d6ba5-08fd-4fc7-adbb-19898d839364', timeout=60)
response.raise_for_status()
inpaint_input = Image.open(BytesIO(response.content)).convert('RGB').resize((image_shape, image_shape))

# Generate the inpainted image.
torch.manual_seed(seed)
image = pipe(
    prompt=global_prompt,
    cfg_scale=3.0,
    negative_prompt=negative_prompt,
    num_inference_steps=50,
    embedded_guidance=3.5,
    height=image_shape,
    width=image_shape,
    entity_prompts=entity_prompts,
    entity_masks=masks,
    inpaint_input=inpaint_input,
    use_seperated_negtive_prompt=use_seperated_negtive_prompt,
)
image.save("entity_inpaint.png")  # f-prefix removed: no placeholders
visualize_masks(image, masks, entity_prompts, "entity_inpaint_with_mask.png")

View File

@@ -1,59 +0,0 @@
import torch
from diffsynth import ModelManager, FluxImagePipeline, download_customized_models, FluxImageLoraPipeline
from examples.EntityControl.utils import visualize_masks
import os
import json
from PIL import Image

# Development script: EliGen entity-controlled inpainting from a locally
# prepared mask set (input image + per-entity masks + prompts.json).
# lora_path = download_customized_models(
#     model_id="DiffSynth-Studio/ArtAug-lora-FLUX.1dev-v1",
#     origin_file_path="merged_lora.safetensors",
#     local_dir="models/lora"
# )[0]
lora_path = '/root/model_bf16.safetensors'  # local checkpoint; public download commented out above
model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cuda")
model_manager.load_models([
    "t2i_models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
    "t2i_models/FLUX/FLUX.1-dev/text_encoder_2",
    "t2i_models/FLUX/FLUX.1-dev/ae.safetensors",
    "t2i_models/FLUX/FLUX.1-dev/flux1-dev.safetensors"
])
model_manager.load_lora(lora_path, lora_alpha=1.)
pipe = FluxImagePipeline.from_model_manager(model_manager)
mask_dir = '/mnt/nas1/zhanghong/DiffSynth-Studio/workdirs/tmp_mask'
image_shape = 1024
guidance = 3.5
cfg = 3.0
negative_prompt = "worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,"
names = ['inpaint2']
seeds = [0]
# Set True to apply regional attention in negative prompt prediction for
# better results with more time.
use_seperated_negtive_prompt = False
for name, seed in zip(names, seeds):
    out_dir = f'workdirs/paper_app/inpaint/elc/{name}'
    os.makedirs(out_dir, exist_ok=True)
    cur_dir = os.path.join(mask_dir, name)
    with open(os.path.join(cur_dir, 'prompts.json')) as f:  # close the handle (was a bare open())
        metas = json.load(f)
    inpaint_input = Image.open(os.path.join(cur_dir, 'input.png')).convert('RGB')
    # The prompt from prompts.json was immediately overwritten by this
    # hard-coded experiment prompt; the dead read has been removed.
    prompt = 'A person with a dog walking on the cloud. A rocket in the sky'
    mask_prompts = metas['mask_prompts']
    # One binary mask image per entity prompt, indexed 0..len-1.
    masks = [Image.open(os.path.join(cur_dir, f"{mask_idx}.png")).resize((image_shape, image_shape), resample=Image.NEAREST) for mask_idx in range(len(mask_prompts))]
    torch.manual_seed(seed)
    image = pipe(
        prompt=prompt,
        cfg_scale=cfg,
        negative_prompt=negative_prompt,
        num_inference_steps=50,
        embedded_guidance=guidance,
        height=image_shape,
        width=image_shape,
        entity_prompts=mask_prompts,
        entity_masks=masks,
        inpaint_input=inpaint_input,
        use_seperated_negtive_prompt=use_seperated_negtive_prompt,
    )
    use_sep = '_sepneg' if use_seperated_negtive_prompt else ''  # f-prefix removed: no placeholders
    visualize_masks(image, masks, mask_prompts, os.path.join(out_dir, f"{name}_{seed}{use_sep}.png"))