From 6452edb738910f8c0d51e56a77a0d0dde1ed9f18 Mon Sep 17 00:00:00 2001 From: mi804 <1576993271@qq.com> Date: Tue, 5 Aug 2025 20:41:03 +0800 Subject: [PATCH 1/7] qwen_image eligen --- diffsynth/models/qwen_image_dit.py | 70 +++++++++++- diffsynth/pipelines/qwen_image.py | 100 +++++++++++++++++- .../model_inference/Qwen-Image-EliGen.py | 89 ++++++++++++++++ .../model_training/lora/Qwen-Image-EliGen.sh | 19 ++++ .../validate_lora/Qwen-Image-EliGen.py | 29 +++++ 5 files changed, 303 insertions(+), 4 deletions(-) create mode 100644 examples/qwen_image/model_inference/Qwen-Image-EliGen.py create mode 100644 examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh create mode 100644 examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py diff --git a/diffsynth/models/qwen_image_dit.py b/diffsynth/models/qwen_image_dit.py index 15f8747..b8f92bb 100644 --- a/diffsynth/models/qwen_image_dit.py +++ b/diffsynth/models/qwen_image_dit.py @@ -158,7 +158,8 @@ class QwenDoubleStreamAttention(nn.Module): self, image: torch.FloatTensor, text: torch.FloatTensor, - image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None + image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.FloatTensor] = None, ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: img_q, img_k, img_v = self.to_q(image), self.to_k(image), self.to_v(image) txt_q, txt_k, txt_v = self.add_q_proj(text), self.add_k_proj(text), self.add_v_proj(text) @@ -186,7 +187,7 @@ class QwenDoubleStreamAttention(nn.Module): joint_k = torch.cat([txt_k, img_k], dim=2) joint_v = torch.cat([txt_v, img_v], dim=2) - joint_attn_out = torch.nn.functional.scaled_dot_product_attention(joint_q, joint_k, joint_v) + joint_attn_out = torch.nn.functional.scaled_dot_product_attention(joint_q, joint_k, joint_v, attn_mask=attention_mask) joint_attn_out = rearrange(joint_attn_out, 'b h s d -> b s (h d)').to(joint_q.dtype) @@ -245,6 +246,7 @@ class QwenImageTransformerBlock(nn.Module): text: torch.Tensor, temb: torch.Tensor, image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: img_mod_attn, img_mod_mlp = self.img_mod(temb).chunk(2, dim=-1) # [B, 3*dim] each @@ -260,6 +262,7 @@ class QwenImageTransformerBlock(nn.Module): image=img_modulated, text=txt_modulated, image_rotary_emb=image_rotary_emb, + attention_mask=attention_mask, ) image = image + img_gate * img_attn_out @@ -309,6 +312,69 @@ class QwenImageDiT(torch.nn.Module): self.proj_out = nn.Linear(3072, 64) + def process_entity_masks(self, latents, prompt_emb, prompt_emb_mask, entity_prompt_emb, entity_prompt_emb_mask, entity_masks, height, width, image, img_shapes): + # prompt_emb + all_prompt_emb = entity_prompt_emb + [prompt_emb] + all_prompt_emb = [self.txt_in(self.txt_norm(local_prompt_emb)) for local_prompt_emb in all_prompt_emb] + all_prompt_emb = torch.cat(all_prompt_emb, dim=1) + + # image_rotary_emb + txt_seq_lens = prompt_emb_mask.sum(dim=1).tolist() + image_rotary_emb = self.pos_embed(img_shapes, txt_seq_lens, device=latents.device) + entity_seq_lens = [emb_mask.sum(dim=1).tolist() for emb_mask in entity_prompt_emb_mask] + entity_rotary_emb = [self.pos_embed(img_shapes, entity_seq_len, device=latents.device)[1] for entity_seq_len in entity_seq_lens] + txt_rotary_emb = torch.cat(entity_rotary_emb + [image_rotary_emb[1]], dim=0) + image_rotary_emb = (image_rotary_emb[0], txt_rotary_emb) + + # attention_mask + repeat_dim = latents.shape[1] + max_masks = entity_masks.shape[1] + entity_masks = entity_masks.repeat(1, 1, repeat_dim, 1, 1) + entity_masks = [entity_masks[:, i, None].squeeze(1) for i in range(max_masks)] + global_mask = torch.ones_like(entity_masks[0]).to(device=latents.device, dtype=latents.dtype) + entity_masks = entity_masks + [global_mask] + + N = len(entity_masks) + batch_size = entity_masks[0].shape[0] + seq_lens = [mask_.sum(dim=1).item() for mask_ in entity_prompt_emb_mask] + [prompt_emb_mask.sum(dim=1).item()] + total_seq_len = sum(seq_lens) + image.shape[1] + patched_masks = [] + for i in range(N): + patched_mask = rearrange(entity_masks[i], "B C (H P) (W Q) -> B (H W) (C P Q)", H=height//16, W=width//16, P=2, Q=2) + patched_masks.append(patched_mask) + attention_mask = torch.ones((batch_size, total_seq_len, total_seq_len), dtype=torch.bool).to(device=entity_masks[0].device) + + # prompt-image attention mask + image_start = sum(seq_lens) + image_end = total_seq_len + cumsum = [0] + for length in seq_lens: + cumsum.append(cumsum[-1] + length) + for i in range(N): + prompt_start = cumsum[i] + prompt_end = cumsum[i+1] + image_mask = torch.sum(patched_masks[i], dim=-1) > 0 + image_mask = image_mask.unsqueeze(1).repeat(1, seq_lens[i], 1) + # prompt update with image + attention_mask[:, prompt_start:prompt_end, image_start:image_end] = image_mask + # image update with prompt + attention_mask[:, image_start:image_end, prompt_start:prompt_end] = image_mask.transpose(1, 2) + # prompt-prompt attention mask, let the prompt tokens not attend to each other + for i in range(N): + for j in range(N): + if i == j: + continue + start_i, end_i = cumsum[i], cumsum[i+1] + start_j, end_j = cumsum[j], cumsum[j+1] + attention_mask[:, start_i:end_i, start_j:end_j] = False + + attention_mask = attention_mask.float() + attention_mask[attention_mask == 0] = float('-inf') + attention_mask[attention_mask == 1] = 0 + attention_mask = attention_mask.to(device=latents.device, dtype=latents.dtype).unsqueeze(1) + + return all_prompt_emb, image_rotary_emb, attention_mask + def forward( self, latents=None, diff --git a/diffsynth/pipelines/qwen_image.py b/diffsynth/pipelines/qwen_image.py index deccd62..0611a7d 100644 --- a/diffsynth/pipelines/qwen_image.py +++ b/diffsynth/pipelines/qwen_image.py @@ -38,6 +38,7 @@ class QwenImagePipeline(BasePipeline): QwenImageUnit_NoiseInitializer(), QwenImageUnit_InputImageEmbedder(), QwenImageUnit_PromptEmbedder(), + QwenImageUnit_EntityControl(), ] self.model_fn = model_fn_qwen_image @@ -190,6 +191,10 @@ class QwenImagePipeline(BasePipeline): rand_device: str = "cpu", # Steps num_inference_steps: int = 30, + # EliGen + eligen_entity_prompts: list[str] = None, + eligen_entity_masks: list[Image.Image] = None, + eligen_enable_on_negative: bool = False, # Tile tiled: bool = False, tile_size: int = 128, @@ -213,6 +218,7 @@ class QwenImagePipeline(BasePipeline): "height": height, "width": width, "seed": seed, "rand_device": rand_device, "tiled": tiled, "tile_size": tile_size, "tile_stride": tile_stride, + "eligen_entity_prompts": eligen_entity_prompts, "eligen_entity_masks": eligen_entity_masks, "eligen_enable_on_negative": eligen_enable_on_negative, } for unit in self.units: inputs_shared, inputs_posi, inputs_nega = self.unit_runner(unit, self, inputs_shared, inputs_posi, inputs_nega) @@ -322,6 +328,84 @@ class QwenImageUnit_PromptEmbedder(PipelineUnit): return {} +class QwenImageUnit_EntityControl(PipelineUnit): + def __init__(self): + super().__init__( + take_over=True, + onload_model_names=("text_encoder") + ) + + def extract_masked_hidden(self, hidden_states: torch.Tensor, mask: torch.Tensor): + bool_mask = mask.bool() + valid_lengths = bool_mask.sum(dim=1) + selected = hidden_states[bool_mask] + split_result = torch.split(selected, valid_lengths.tolist(), dim=0) + return split_result + + def get_prompt_emb(self, pipe: QwenImagePipeline, prompt) -> dict: + if pipe.text_encoder is not None: + prompt = [prompt] + template = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" + drop_idx = 34 + txt = [template.format(e) for e in prompt] + txt_tokens = pipe.tokenizer(txt, max_length=1024+drop_idx, padding=True, truncation=True, return_tensors="pt").to(pipe.device) + hidden_states = pipe.text_encoder(input_ids=txt_tokens.input_ids, attention_mask=txt_tokens.attention_mask, output_hidden_states=True,)[-1] + + split_hidden_states = self.extract_masked_hidden(hidden_states, txt_tokens.attention_mask) + split_hidden_states = [e[drop_idx:] for e in split_hidden_states] + attn_mask_list = [torch.ones(e.size(0), dtype=torch.long, device=e.device) for e in split_hidden_states] + max_seq_len = max([e.size(0) for e in split_hidden_states]) + prompt_embeds = torch.stack([torch.cat([u, u.new_zeros(max_seq_len - u.size(0), u.size(1))]) for u in split_hidden_states]) + encoder_attention_mask = torch.stack([torch.cat([u, u.new_zeros(max_seq_len - u.size(0))]) for u in attn_mask_list]) + prompt_embeds = prompt_embeds.to(dtype=pipe.torch_dtype, device=pipe.device) + return {"prompt_emb": prompt_embeds, "prompt_emb_mask": encoder_attention_mask} + else: + return {} + + def preprocess_masks(self, pipe, masks, height, width, dim): + out_masks = [] + for mask in masks: + mask = pipe.preprocess_image(mask.resize((width, height), resample=Image.NEAREST)).mean(dim=1, keepdim=True) > 0 + mask = mask.repeat(1, dim, 1, 1).to(device=pipe.device, dtype=pipe.torch_dtype) + out_masks.append(mask) + return out_masks + + def prepare_entity_inputs(self, pipe, entity_prompts, entity_masks, width, height): + entity_masks = self.preprocess_masks(pipe, entity_masks, height//8, width//8, 1) + entity_masks = torch.cat(entity_masks, dim=0).unsqueeze(0) # b, n_mask, c, h, w + prompt_embs, prompt_emb_masks = [], [] + for entity_prompt in entity_prompts: + prompt_emb_dict = self.get_prompt_emb(pipe, entity_prompt) + prompt_embs.append(prompt_emb_dict['prompt_emb']) + prompt_emb_masks.append(prompt_emb_dict['prompt_emb_mask']) + return prompt_embs, prompt_emb_masks, entity_masks + + def prepare_eligen(self, pipe, prompt_emb_nega, eligen_entity_prompts, eligen_entity_masks, width, height, enable_eligen_on_negative, cfg_scale): + entity_prompt_emb_posi, entity_prompt_emb_posi_mask, entity_masks_posi = self.prepare_entity_inputs(pipe, eligen_entity_prompts, eligen_entity_masks, width, height) + if enable_eligen_on_negative and cfg_scale != 1.0: + entity_prompt_emb_nega = [prompt_emb_nega['prompt_emb']] * len(entity_prompt_emb_posi) + entity_prompt_emb_nega_mask = [prompt_emb_nega['prompt_emb_mask']] * len(entity_prompt_emb_posi) + entity_masks_nega = entity_masks_posi + else: + entity_prompt_emb_nega, entity_prompt_emb_nega_mask, entity_masks_nega = None, None, None + eligen_kwargs_posi = {"entity_prompt_emb": entity_prompt_emb_posi, "entity_masks": entity_masks_posi, "entity_prompt_emb_mask": entity_prompt_emb_posi_mask} + eligen_kwargs_nega = {"entity_prompt_emb": entity_prompt_emb_nega, "entity_masks": entity_masks_nega, "entity_prompt_emb_mask": entity_prompt_emb_nega_mask} + return eligen_kwargs_posi, eligen_kwargs_nega + + def process(self, pipe: QwenImagePipeline, inputs_shared, inputs_posi, inputs_nega): + eligen_entity_prompts, eligen_entity_masks = inputs_shared.get("eligen_entity_prompts", None), inputs_shared.get("eligen_entity_masks", None) + if eligen_entity_prompts is None or eligen_entity_masks is None or len(eligen_entity_prompts) == 0 or len(eligen_entity_masks) == 0: + return inputs_shared, inputs_posi, inputs_nega + pipe.load_models_to_device(self.onload_model_names) + eligen_enable_on_negative = inputs_shared.get("eligen_enable_on_negative", False) + eligen_kwargs_posi, eligen_kwargs_nega = self.prepare_eligen(pipe, inputs_nega, + eligen_entity_prompts, eligen_entity_masks, inputs_shared["width"], inputs_shared["height"], + eligen_enable_on_negative, inputs_shared["cfg_scale"]) + inputs_posi.update(eligen_kwargs_posi) + if inputs_shared.get("cfg_scale", 1.0) != 1.0: + inputs_nega.update(eligen_kwargs_nega) + return inputs_shared, inputs_posi, inputs_nega + def model_fn_qwen_image( dit: QwenImageDiT = None, @@ -331,6 +415,9 @@ def model_fn_qwen_image( prompt_emb_mask=None, height=None, width=None, + entity_prompt_emb=None, + entity_prompt_emb_mask=None, + entity_masks=None, use_gradient_checkpointing=False, use_gradient_checkpointing_offload=False, **kwargs @@ -342,9 +429,17 @@ def model_fn_qwen_image( image = rearrange(latents, "B C (H P) (W Q) -> B (H W) (C P Q)", H=height//16, W=width//16, P=2, Q=2) image = dit.img_in(image) - text = dit.txt_in(dit.txt_norm(prompt_emb)) conditioning = dit.time_text_embed(timestep, image.dtype) - image_rotary_emb = dit.pos_embed(img_shapes, txt_seq_lens, device=latents.device) + + if entity_prompt_emb is not None: + text, image_rotary_emb, attention_mask = dit.process_entity_masks( + latents, prompt_emb, prompt_emb_mask, entity_prompt_emb, entity_prompt_emb_mask, + entity_masks, height, width, image, img_shapes, + ) + else: + text = dit.txt_in(dit.txt_norm(prompt_emb)) + image_rotary_emb = dit.pos_embed(img_shapes, txt_seq_lens, device=latents.device) + attention_mask = None for block in dit.transformer_blocks: text, image = gradient_checkpoint_forward( @@ -355,6 +450,7 @@ def model_fn_qwen_image( text=text, temb=conditioning, image_rotary_emb=image_rotary_emb, + attention_mask=attention_mask, ) image = dit.norm_out(image, conditioning) diff --git a/examples/qwen_image/model_inference/Qwen-Image-EliGen.py b/examples/qwen_image/model_inference/Qwen-Image-EliGen.py new file mode 100644 index 0000000..ef06eef --- /dev/null +++ b/examples/qwen_image/model_inference/Qwen-Image-EliGen.py @@ -0,0 +1,89 @@ +from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig +import torch +from PIL import Image, ImageDraw, ImageFont +from modelscope import dataset_snapshot_download +import random + + +def visualize_masks(image, masks, mask_prompts, output_path, font_size=35, use_random_colors=False): + # Create a blank image for overlays + overlay = Image.new('RGBA', image.size, (0, 0, 0, 0)) + + colors = [ + (165, 238, 173, 80), + (76, 102, 221, 80), + (221, 160, 77, 80), + (204, 93, 71, 80), + (145, 187, 149, 80), + (134, 141, 172, 80), + (157, 137, 109, 80), + (153, 104, 95, 80), + (165, 238, 173, 80), + (76, 102, 221, 80), + (221, 160, 77, 80), + (204, 93, 71, 80), + (145, 187, 149, 80), + (134, 141, 172, 80), + (157, 137, 109, 80), + (153, 104, 95, 80), + ] + # Generate random colors for each mask + if use_random_colors: + colors = [(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), 80) for _ in range(len(masks))] + + # Font settings + try: + font = ImageFont.truetype("arial", font_size) # Adjust as needed + except IOError: + font = ImageFont.load_default(font_size) + + # Overlay each mask onto the overlay image + for mask, mask_prompt, color in zip(masks, mask_prompts, colors): + # Convert mask to RGBA mode + mask_rgba = mask.convert('RGBA') + mask_data = mask_rgba.getdata() + new_data = [(color if item[:3] == (255, 255, 255) else (0, 0, 0, 0)) for item in mask_data] + mask_rgba.putdata(new_data) + + # Draw the mask prompt text on the mask + draw = ImageDraw.Draw(mask_rgba) + mask_bbox = mask.getbbox() # Get the bounding box of the mask + text_position = (mask_bbox[0] + 10, mask_bbox[1] + 10) # Adjust text position based on mask position + draw.text(text_position, mask_prompt, fill=(255, 255, 255, 255), font=font) + + # Alpha composite the overlay with this mask + overlay = Image.alpha_composite(overlay, mask_rgba) + + # Composite the overlay onto the original image + result = Image.alpha_composite(image.convert('RGBA'), overlay) + + # Save or display the resulting image + result.save(output_path) + + return result + + +pipe = QwenImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), + ], + tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), +) +example_id = 1 +global_prompt = "A breathtaking beauty of Raja Ampat by the late-night moonlight , one beautiful woman from behind wearing a long dress, sitting at the top of a cliff looking towards the beach,pastell light colors, a group of small distant birds flying in far sky, a boat sailing on the sea\n" +dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern=f"data/examples/eligen/entity_control/example_{example_id}/*.png") +entity_prompts = ["cliff", "sea", "red moon", "sailing boat", "a seated beautiful woman wearing red dress", "yellow long dress"] +masks = [Image.open(f"./data/examples/eligen/entity_control/example_{example_id}/{i}.png").convert('RGB') for i in range(len(entity_prompts))] + +for seed in range(20): + image = pipe(global_prompt, seed=seed, num_inference_steps=40, eligen_entity_prompts=entity_prompts, eligen_entity_masks=masks, cfg_scale=4.0, height=1024, width=1024) + image.save(f"workdirs/qwen_image/eligen_{seed}.jpg") + + visualize_masks(image, masks, entity_prompts, f"workdirs/qwen_image/eligen_{seed}_mask.png") + + image1 = pipe(global_prompt, seed=seed, num_inference_steps=40, height=1024, width=1024, cfg_scale=4.0) + image1.save(f"workdirs/qwen_image/qwenimage_{seed}.jpg") diff --git a/examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh b/examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh new file mode 100644 index 0000000..ea2e659 --- /dev/null +++ b/examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh @@ -0,0 +1,19 @@ +accelerate launch examples/qwen_image/model_training/train.py \ + --dataset_base_path data/example_image_dataset \ + --dataset_metadata_path data/example_image_dataset/metadata_eligen.json \ + --data_file_keys "image,eligen_entity_masks" \ + --max_pixels 1048576 \ + --dataset_repeat 50 \ + --model_id_with_origin_paths "Qwen/Qwen-Image:transformer/diffusion_pytorch_model*.safetensors,Qwen/Qwen-Image:text_encoder/model*.safetensors,Qwen/Qwen-Image:vae/diffusion_pytorch_model.safetensors" \ + --learning_rate 1e-4 \ + --num_epochs 5 \ + --remove_prefix_in_ckpt "pipe.dit." \ + --output_path "./models/train/Qwen-Image-EliGen_lora" \ + --lora_base_model "dit" \ + --lora_target_modules "to_q,to_k,to_v,add_q_proj,add_k_proj,add_v_proj,to_out.0,to_add_out,img_mlp.net.2,img_mod.1,txt_mlp.net.2,txt_mod.1" \ + --lora_rank 32 \ + --align_to_opensource_format \ + --extra_inputs "eligen_entity_masks,eligen_entity_prompts" \ + --use_gradient_checkpointing \ + --dataset_num_workers 8 \ + --find_unused_parameters diff --git a/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py b/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py new file mode 100644 index 0000000..90680d3 --- /dev/null +++ b/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py @@ -0,0 +1,29 @@ +from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig +import torch +from PIL import Image + + +pipe = QwenImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), + ], + tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), +) +pipe.load_lora(pipe.dit, "models/train/Qwen-Image_lora/epoch-4.safetensors") + + +entity_prompts = ["A beautiful girl", "sign 'Entity Control'", "shorts", "shirt"] +global_prompt = "A beautiful girl wearing shirt and shorts in the street, holding a sign 'Entity Control'" +masks = [Image.open(f"data/example_image_dataset/eligen/{i}.png").convert('RGB') for i in range(len(entity_prompts))] + +image = pipe(global_prompt, + seed=0, + height=1024, + width=1024, + eligen_entity_prompts=entity_prompts, + eligen_entity_masks=masks) +image.save("image.jpg") From a3b67436a6122f6584fa70478945a725b4554cf0 Mon Sep 17 00:00:00 2001 From: mi804 <1576993271@qq.com> Date: Wed, 6 Aug 2025 15:04:38 +0800 Subject: [PATCH 2/7] eligen ui --- apps/gradio/qwen_image_eligen.py | 382 ++++++++++++++++++ .../model_inference/Qwen-Image-EliGen.py | 66 ++- 2 files changed, 437 insertions(+), 11 deletions(-) create mode 100644 apps/gradio/qwen_image_eligen.py diff --git a/apps/gradio/qwen_image_eligen.py b/apps/gradio/qwen_image_eligen.py new file mode 100644 index 0000000..38dcf71 --- /dev/null +++ b/apps/gradio/qwen_image_eligen.py @@ -0,0 +1,382 @@ +import os +import torch +import numpy as np +from PIL import Image, ImageDraw, ImageFont +import random +import json +import gradio as gr +from diffsynth import ModelManager, FluxImagePipeline, download_customized_models +from modelscope import dataset_snapshot_download +from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig + +# pip install pydantic==2.10.6 +# pip install gradio==5.4.0 + + +dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern=f"data/examples/eligen/entity_control/*") +example_json = 'data/examples/eligen/entity_control/ui_examples.json' +with open(example_json, 'r') as f: + examples = json.load(f)['examples'] + +for idx in range(len(examples)): + example_id = examples[idx]['example_id'] + entity_prompts = examples[idx]['local_prompt_list'] + examples[idx]['mask_lists'] = [Image.open(f"data/examples/eligen/entity_control/example_{example_id}/{i}.png").convert('RGB') for i in range(len(entity_prompts))] + +def create_canvas_data(background, masks): + if background.shape[-1] == 3: + background = np.dstack([background, np.full(background.shape[:2], 255, dtype=np.uint8)]) + layers = [] + for mask in masks: + if mask is not None: + mask_single_channel = mask if mask.ndim == 2 else mask[..., 0] + layer = np.zeros((mask_single_channel.shape[0], mask_single_channel.shape[1], 4), dtype=np.uint8) + layer[..., -1] = mask_single_channel + layers.append(layer) + else: + layers.append(np.zeros_like(background)) + + composite = background.copy() + for layer in layers: + if layer.size > 0: + composite = np.where(layer[..., -1:] > 0, layer, composite) + return { + "background": background, + "layers": layers, + "composite": composite, + } + +def load_example(load_example_button): + example_idx = int(load_example_button.split()[-1]) - 1 + example = examples[example_idx] + result = [ + 50, + example["global_prompt"], + example["negative_prompt"], + example["seed"], + *example["local_prompt_list"], + ] + num_entities = len(example["local_prompt_list"]) + result += [""] * (config["max_num_painter_layers"] - num_entities) + masks = [] + for mask in example["mask_lists"]: + mask_single_channel = np.array(mask.convert("L")) + masks.append(mask_single_channel) + for _ in range(config["max_num_painter_layers"] - len(masks)): + blank_mask = np.zeros_like(masks[0]) if masks else np.zeros((512, 512), dtype=np.uint8) + masks.append(blank_mask) + background = np.ones((masks[0].shape[0], masks[0].shape[1], 4), dtype=np.uint8) * 255 + canvas_data_list = [] + for mask in masks: + canvas_data = create_canvas_data(background, [mask]) + canvas_data_list.append(canvas_data) + result.extend(canvas_data_list) + return result + +def save_mask_prompts(masks, mask_prompts, global_prompt, seed=0, random_dir='0000000'): + save_dir = os.path.join('workdirs/tmp_mask', random_dir) + print(f'save to {save_dir}') + os.makedirs(save_dir, exist_ok=True) + for i, mask in enumerate(masks): + save_path = os.path.join(save_dir, f'{i}.png') + mask.save(save_path) + sample = { + "global_prompt": global_prompt, + "mask_prompts": mask_prompts, + "seed": seed, + } + with open(os.path.join(save_dir, f"prompts.json"), 'w', encoding='utf-8') as f: + json.dump(sample, f, ensure_ascii=False, indent=4) + +def visualize_masks(image, masks, mask_prompts, font_size=35, use_random_colors=False): + # Create a blank image for overlays + overlay = Image.new('RGBA', image.size, (0, 0, 0, 0)) + colors = [ + (165, 238, 173, 80), + (76, 102, 221, 80), + (221, 160, 77, 80), + (204, 93, 71, 80), + (145, 187, 149, 80), + (134, 141, 172, 80), + (157, 137, 109, 80), + (153, 104, 95, 80), + (165, 238, 173, 80), + (76, 102, 221, 80), + (221, 160, 77, 80), + (204, 93, 71, 80), + (145, 187, 149, 80), + (134, 141, 172, 80), + (157, 137, 109, 80), + (153, 104, 95, 80), + ] + # Generate random colors for each mask + if use_random_colors: + colors = [(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), 80) for _ in range(len(masks))] + # Font settings + font = ImageFont.truetype("dinglieciweifont20250217-2.ttf", font_size) # Adjust as needed + # Overlay each mask onto the overlay image + for mask, mask_prompt, color in zip(masks, mask_prompts, colors): + if mask is None: + continue + # Convert mask to RGBA mode + mask_rgba = mask.convert('RGBA') + mask_data = mask_rgba.getdata() + new_data = [(color if item[:3] == (255, 255, 255) else (0, 0, 0, 0)) for item in mask_data] + mask_rgba.putdata(new_data) + # Draw the mask prompt text on the mask + draw = ImageDraw.Draw(mask_rgba) + mask_bbox = mask.getbbox() # Get the bounding box of the mask + if mask_bbox is None: + continue + text_position = (mask_bbox[0] + 10, mask_bbox[1] + 10) # Adjust text position based on mask position + draw.text(text_position, mask_prompt, fill=(255, 255, 255, 255), font=font) + # Alpha composite the overlay with this mask + overlay = Image.alpha_composite(overlay, mask_rgba) + # Composite the overlay onto the original image + result = Image.alpha_composite(image.convert('RGBA'), overlay) + return result + +config = { + "max_num_painter_layers": 8, + "max_num_model_cache": 1, +} + +model_dict = {} + +def load_model(model_type='qwen-image'): + global model_dict + model_key = f"{model_type}" + if model_key in model_dict: + return model_dict[model_key] + pipe = QwenImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), + ], + tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), + ) + pipe.load_lora(pipe.dit, "models/train/Qwen-Image-EliGen_lora/step-20000.safetensors") + model_dict[model_key] = pipe + return pipe + +load_model('qwen-image') + +with gr.Blocks() as app: + gr.Markdown( + """## EliGen: Entity-Level Controllable Text-to-Image Model + 1. On the left, input the **global prompt** for the overall image, such as "a person stands by the river." + 2. On the right, input the **local prompt** for each entity, such as "person," and draw the corresponding mask in the **Entity Mask Painter**. Generally, solid rectangular masks yield better results. + 3. Click the **Generate** button to create the image. By selecting different **random seeds**, you can generate diverse images. + 4. **You can directly click the "Load Example" button on any sample at the bottom to load example inputs.** + """ + ) + + loading_status = gr.Textbox(label="Loading Model...", value="Loading model... Please wait...", visible=True) + main_interface = gr.Column(visible=False) + + def initialize_model(): + try: + load_model('qwen-image') + return { + loading_status: gr.update(value="Model loaded successfully!", visible=False), + main_interface: gr.update(visible=True), + } + except Exception as e: + print(f'Failed to load model with error: {e}') + return { + loading_status: gr.update(value=f"Failed to load model: {str(e)}", visible=True), + main_interface: gr.update(visible=True), + } + + app.load(initialize_model, inputs=None, outputs=[loading_status, main_interface]) + + with main_interface: + with gr.Row(): + local_prompt_list = [] + canvas_list = [] + random_mask_dir = gr.State(f'{random.randint(0, 1000000):08d}') + with gr.Column(scale=382, min_width=100): + model_type = gr.State('qwen-image') + with gr.Accordion(label="Global prompt"): + prompt = gr.Textbox(label="Global Prompt", lines=3) + negative_prompt = gr.Textbox(label="Negative prompt", value="", lines=3) + with gr.Accordion(label="Inference Options", open=True): + seed = gr.Number(minimum=0, maximum=10**9, value=42, interactive=True, label="Random seed", show_label=True) + num_inference_steps = gr.Slider(minimum=1, maximum=100, value=30, step=1, interactive=True, label="Inference steps") + cfg_scale = gr.Slider(minimum=2.0, maximum=10.0, value=4.0, step=0.1, interactive=True, label="Classifier-free guidance scale") + embedded_guidance = gr.Slider(minimum=0.0, maximum=10.0, value=3.5, step=0.1, interactive=True, label="Embedded guidance scale") + height = gr.Slider(minimum=64, maximum=2048, value=1024, step=64, interactive=True, label="Height") + width = gr.Slider(minimum=64, maximum=2048, value=1024, step=64, interactive=True, label="Width") + with gr.Accordion(label="Inpaint Input Image", open=False): + input_image = gr.Image(sources=None, show_label=False, interactive=True, type="pil") + background_weight = gr.Slider(minimum=0.0, maximum=1000., value=0., step=1, interactive=False, label="background_weight", visible=False) + + with gr.Column(): + reset_input_button = gr.Button(value="Reset Inpaint Input") + send_input_to_painter = gr.Button(value="Set as painter's background") + @gr.on(inputs=[input_image], outputs=[input_image], triggers=reset_input_button.click) + def reset_input_image(input_image): + return None + + with gr.Column(scale=618, min_width=100): + with gr.Accordion(label="Entity Painter"): + for painter_layer_id in range(config["max_num_painter_layers"]): + with gr.Tab(label=f"Entity {painter_layer_id}"): + local_prompt = gr.Textbox(label="Local prompt", key=f"local_prompt_{painter_layer_id}") + canvas = gr.ImageEditor( + canvas_size=(1024, 1024), + sources=None, + layers=False, + interactive=True, + image_mode="RGBA", + brush=gr.Brush( + default_size=50, + default_color="#000000", + colors=["#000000"], + ), + label="Entity Mask Painter", + key=f"canvas_{painter_layer_id}", + width=width, + height=height, + ) + @gr.on(inputs=[height, width, canvas], outputs=canvas, triggers=[height.change, width.change, canvas.clear], show_progress="hidden") + def resize_canvas(height, width, canvas): + if canvas is None or canvas["background"] is None: + return np.ones((height, width, 3), dtype=np.uint8) * 255 + h, w = canvas["background"].shape[:2] + if h != height or width != w: + return np.ones((height, width, 3), dtype=np.uint8) * 255 + else: + return canvas + local_prompt_list.append(local_prompt) + canvas_list.append(canvas) + with gr.Accordion(label="Results"): + run_button = gr.Button(value="Generate", variant="primary") + output_image = gr.Image(sources=None, show_label=False, interactive=False, type="pil") + with gr.Row(): + with gr.Column(): + output_to_painter_button = gr.Button(value="Set as painter's background") + with gr.Column(): + return_with_mask = gr.Checkbox(value=False, interactive=True, label="show result with mask painting") + output_to_input_button = gr.Button(value="Set as input image", visible=False, interactive=False) + real_output = gr.State(None) + mask_out = gr.State(None) + + @gr.on( + inputs=[model_type, prompt, negative_prompt, cfg_scale, embedded_guidance, num_inference_steps, height, width, return_with_mask, seed, input_image, background_weight, random_mask_dir] + local_prompt_list + canvas_list, + outputs=[output_image, real_output, mask_out], + triggers=run_button.click + ) + def generate_image(model_type, prompt, negative_prompt, cfg_scale, embedded_guidance, num_inference_steps, height, width, return_with_mask, seed, input_image, background_weight, random_mask_dir, *args, progress=gr.Progress()): + pipe = load_model(model_type) + input_params = { + "prompt": prompt, + "negative_prompt": negative_prompt, + "cfg_scale": cfg_scale, + "num_inference_steps": num_inference_steps, + "height": height, + "width": width, + "progress_bar_cmd": progress.tqdm, + } + if isinstance(pipe, FluxImagePipeline): + input_params["embedded_guidance"] = embedded_guidance + if input_image is not None: + input_params["input_image"] = input_image.resize((width, height)).convert("RGB") + input_params["enable_eligen_inpaint"] = True + + local_prompt_list, canvas_list = ( + args[0 * config["max_num_painter_layers"]: 1 * config["max_num_painter_layers"]], + args[1 * config["max_num_painter_layers"]: 2 * config["max_num_painter_layers"]], + ) + local_prompts, masks = [], [] + for local_prompt, canvas in zip(local_prompt_list, canvas_list): + if isinstance(local_prompt, str) and len(local_prompt) > 0: + local_prompts.append(local_prompt) + masks.append(Image.fromarray(canvas["layers"][0][:, :, -1]).convert("RGB")) + entity_prompts = None if len(local_prompts) == 0 else local_prompts + entity_masks = None if len(masks) == 0 or entity_prompts is None else masks + input_params.update({ + "eligen_entity_prompts": entity_prompts, + "eligen_entity_masks": entity_masks, + }) + torch.manual_seed(seed) + save_mask_prompts(masks, local_prompts, prompt, seed, random_mask_dir) + image = pipe(**input_params) + masks = [mask.resize(image.size) for mask in masks] + image_with_mask = visualize_masks(image, masks, local_prompts) + + real_output = gr.State(image) + mask_out = gr.State(image_with_mask) + + if return_with_mask: + return image_with_mask, real_output, mask_out + return image, real_output, mask_out + + @gr.on(inputs=[input_image] + canvas_list, outputs=canvas_list, triggers=send_input_to_painter.click) + def send_input_to_painter_background(input_image, *canvas_list): + if input_image is None: + return tuple(canvas_list) + for canvas in canvas_list: + h, w = canvas["background"].shape[:2] + canvas["background"] = input_image.resize((w, h)) + return tuple(canvas_list) + @gr.on(inputs=[real_output] + canvas_list, outputs=canvas_list, triggers=output_to_painter_button.click) + def send_output_to_painter_background(real_output, *canvas_list): + if real_output is None: + return tuple(canvas_list) + for canvas in canvas_list: + h, w = canvas["background"].shape[:2] + canvas["background"] = real_output.value.resize((w, h)) + return tuple(canvas_list) + @gr.on(inputs=[return_with_mask, real_output, mask_out], outputs=[output_image], triggers=[return_with_mask.change], show_progress="hidden") + def show_output(return_with_mask, real_output, mask_out): + if return_with_mask: + return mask_out.value + else: + return real_output.value + @gr.on(inputs=[real_output], outputs=[input_image], triggers=output_to_input_button.click) + def send_output_to_pipe_input(real_output): + return real_output.value + + with gr.Column(): + gr.Markdown("## Examples") + for i in range(0, len(examples), 2): + with gr.Row(): + if i < len(examples): + example = examples[i] + with gr.Column(): + example_image = gr.Image( + value=f"data/examples/eligen/entity_control/example_{example['example_id']}/example_image.png", + label=example["description"], + interactive=False, + width=1024, + height=512 + ) + load_example_button = gr.Button(value=f"Load Example {example['example_id']}") + load_example_button.click( + load_example, + inputs=[load_example_button], + outputs=[num_inference_steps, prompt, negative_prompt, seed] + local_prompt_list + canvas_list + ) + + if i + 1 < len(examples): + example = examples[i + 1] + with gr.Column(): + example_image = gr.Image( + value=f"data/examples/eligen/entity_control/example_{example['example_id']}/example_image.png", + label=example["description"], + interactive=False, + width=1024, + height=512 + ) + load_example_button = gr.Button(value=f"Load Example {example['example_id']}") + load_example_button.click( + load_example, + inputs=[load_example_button], + outputs=[num_inference_steps, prompt, negative_prompt, seed] + local_prompt_list + canvas_list + ) +app.config["show_progress"] = "hidden" +app.launch(share=False) diff --git a/examples/qwen_image/model_inference/Qwen-Image-EliGen.py b/examples/qwen_image/model_inference/Qwen-Image-EliGen.py index ef06eef..76bee7a 100644 --- a/examples/qwen_image/model_inference/Qwen-Image-EliGen.py +++ b/examples/qwen_image/model_inference/Qwen-Image-EliGen.py @@ -62,6 +62,26 @@ def visualize_masks(image, masks, mask_prompts, output_path, font_size=35, use_r return result +def example(pipe, seeds, example_id, global_prompt, entity_prompts): + dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern=f"data/examples/eligen/entity_control/example_{example_id}/*.png") + masks = [Image.open(f"./data/examples/eligen/entity_control/example_{example_id}/{i}.png").convert('RGB') for i in range(len(entity_prompts))] + negative_prompt = "" + for seed in seeds: + # generate image + image = pipe( + prompt=global_prompt, + cfg_scale=4.0, + negative_prompt=negative_prompt, + num_inference_steps=30, + seed=seed, + height=1024, + width=1024, + eligen_entity_prompts=entity_prompts, + eligen_entity_masks=masks, + ) + image.save(f"eligen_example_{example_id}_{seed}.png") + visualize_masks(image, masks, entity_prompts, f"eligen_example_{example_id}_mask_{seed}.png") + pipe = QwenImagePipeline.from_pretrained( torch_dtype=torch.bfloat16, @@ -73,17 +93,41 @@ pipe = QwenImagePipeline.from_pretrained( ], tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), ) -example_id = 1 -global_prompt = "A breathtaking beauty of Raja Ampat by the late-night moonlight , one beautiful woman from behind wearing a long dress, sitting at the top of a cliff looking towards the beach,pastell light colors, a group of small distant birds flying in far sky, a boat sailing on the sea\n" -dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern=f"data/examples/eligen/entity_control/example_{example_id}/*.png") -entity_prompts = ["cliff", "sea", "red moon", "sailing boat", "a seated beautiful woman wearing red dress", "yellow long dress"] -masks = [Image.open(f"./data/examples/eligen/entity_control/example_{example_id}/{i}.png").convert('RGB') for i in range(len(entity_prompts))] +pipe.load_lora(pipe.dit, "models/train/Qwen-Image-EliGen_lora/step-20000.safetensors") -for seed in range(20): - image = pipe(global_prompt, seed=seed, num_inference_steps=40, eligen_entity_prompts=entity_prompts, eligen_entity_masks=masks, cfg_scale=4.0, height=1024, width=1024) - image.save(f"workdirs/qwen_image/eligen_{seed}.jpg") - visualize_masks(image, masks, entity_prompts, f"workdirs/qwen_image/eligen_{seed}_mask.png") +# example 1 +global_prompt = "A breathtaking beauty of Raja Ampat by the late-night moonlight , one beautiful woman from behind wearing a pale blue long dress with soft glow, sitting at the top of a cliff looking towards the beach,pastell light colors, a group of small distant birds flying in far sky, a boat sailing on the sea, best quality, realistic, whimsical, fantastic, splash art, intricate detailed, hyperdetailed, maximalist style, photorealistic, concept art, sharp focus, harmony, serenity, tranquility, soft pastell colors,ambient occlusion, cozy ambient lighting, masterpiece, liiv1, linquivera, metix, mentixis, masterpiece, award winning, view from above\n" +entity_prompts = ["cliff", "sea", "moon", "sailing boat", "a seated beautiful woman", "pale blue long dress with soft glow"] +example(pipe, [0], 1, global_prompt, entity_prompts) - image1 = pipe(global_prompt, seed=seed, num_inference_steps=40, height=1024, width=1024, cfg_scale=4.0) - image1.save(f"workdirs/qwen_image/qwenimage_{seed}.jpg") +# example 2 +global_prompt = "samurai girl wearing a kimono, she's holding a sword glowing with red flame, her long hair is flowing in the wind, she is looking at a small bird perched on the back of her hand. ultra realist style. maximum image detail. maximum realistic render." +entity_prompts = ["flowing hair", "sword glowing with red flame", "A cute bird", "blue belt"] +example(pipe, [0], 2, global_prompt, entity_prompts) + +# example 3 +global_prompt = "Image of a neverending staircase up to a mysterious palace in the sky, The ancient palace stood majestically atop a mist-shrouded mountain, sunrise, two traditional monk walk in the stair looking at the sunrise, fog,see-through, best quality, whimsical, fantastic, splash art, intricate detailed, hyperdetailed, photorealistic, concept art, harmony, serenity, tranquility, ambient occlusion, halation, cozy ambient lighting, dynamic lighting,masterpiece, liiv1, linquivera, metix, mentixis, masterpiece, award winning," +entity_prompts = ["ancient palace", "stone staircase with railings", "a traditional monk", "a traditional monk"] +example(pipe, [27], 3, global_prompt, entity_prompts) + +# example 4 +global_prompt = "A beautiful girl wearing shirt and shorts in the street, holding a sign 'Entity Control'" +entity_prompts = ["A beautiful girl", "sign 'Entity Control'", "shorts", "shirt"] +example(pipe, [21], 4, global_prompt, entity_prompts) + +# example 5 +global_prompt = "A captivating, dramatic scene in a painting that exudes mystery and foreboding. A white sky, swirling blue clouds, and a crescent yellow moon illuminate a solitary woman standing near the water's edge. Her long dress flows in the wind, silhouetted against the eerie glow. The water mirrors the fiery sky and moonlight, amplifying the uneasy atmosphere." +entity_prompts = ["crescent yellow moon", "a solitary woman", "water", "swirling blue clouds"] +example(pipe, [0], 5, global_prompt, entity_prompts) + +# example 6 +global_prompt = "Snow White and the 6 Dwarfs." +entity_prompts = ["Dwarf 1", "Dwarf 2", "Dwarf 3", "Snow White", "Dwarf 4", "Dwarf 5", "Dwarf 6"] +example(pipe, [8], 6, global_prompt, entity_prompts) + +# example 7, same prompt with different seeds +seeds = range(5, 9) +global_prompt = "A beautiful asia woman wearing white dress, holding a mirror, with a forest background;" +entity_prompts = ["A beautiful woman", "mirror", "necklace", "glasses", "earring", "white dress", "jewelry headpiece"] +example(pipe, seeds, 7, global_prompt, entity_prompts) From 3c2f85606fb395f52495af5a26e346f38c939407 Mon Sep 17 00:00:00 2001 From: mi804 <1576993271@qq.com> Date: Wed, 6 Aug 2025 17:23:05 +0800 Subject: [PATCH 3/7] update model --- apps/gradio/qwen_image_eligen.py | 38 +++++++++---------- .../model_inference/Qwen-Image-EliGen.py | 33 ++++++++-------- .../model_training/lora/Qwen-Image-EliGen.sh | 3 +- .../validate_lora/Qwen-Image-EliGen.py | 2 +- 4 files changed, 38 insertions(+), 38 deletions(-) diff --git a/apps/gradio/qwen_image_eligen.py b/apps/gradio/qwen_image_eligen.py index 38dcf71..c224f01 100644 --- a/apps/gradio/qwen_image_eligen.py +++ b/apps/gradio/qwen_image_eligen.py @@ -5,23 +5,23 @@ from PIL import Image, ImageDraw, ImageFont import random import json import gradio as gr -from diffsynth import ModelManager, FluxImagePipeline, download_customized_models -from modelscope import dataset_snapshot_download from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig +from modelscope import dataset_snapshot_download, snapshot_download # pip install pydantic==2.10.6 # pip install gradio==5.4.0 +snapshot_download("DiffSynth-Studio/Qwen-Image-EliGen", local_dir="models/DiffSynth-Studio/Qwen-Image-EliGen", allow_file_pattern="model.safetensors") -dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern=f"data/examples/eligen/entity_control/*") -example_json = 'data/examples/eligen/entity_control/ui_examples.json' +dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern=f"data/examples/eligen/qwen-image/*") +example_json = 'data/examples/eligen/qwen-image/ui_examples.json' with open(example_json, 'r') as f: examples = json.load(f)['examples'] for idx in range(len(examples)): example_id = examples[idx]['example_id'] entity_prompts = examples[idx]['local_prompt_list'] - examples[idx]['mask_lists'] = [Image.open(f"data/examples/eligen/entity_control/example_{example_id}/{i}.png").convert('RGB') for i in range(len(entity_prompts))] + examples[idx]['mask_lists'] = [Image.open(f"data/examples/eligen/qwen-image/example_{example_id}/{i}.png").convert('RGB') for i in range(len(entity_prompts))] def create_canvas_data(background, masks): if background.shape[-1] == 3: @@ -113,7 +113,10 @@ def visualize_masks(image, masks, mask_prompts, font_size=35, use_random_colors= if use_random_colors: colors = [(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), 80) for _ in range(len(masks))] # Font settings - font = ImageFont.truetype("dinglieciweifont20250217-2.ttf", font_size) # Adjust as needed + try: + font = ImageFont.truetype("wqy-zenhei.ttc", font_size) # Adjust as needed + except IOError: + font = ImageFont.load_default(font_size) # Overlay each mask onto the overlay image for mask, mask_prompt, color in zip(masks, mask_prompts, colors): if mask is None: @@ -158,7 +161,7 @@ def load_model(model_type='qwen-image'): ], tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), ) - pipe.load_lora(pipe.dit, "models/train/Qwen-Image-EliGen_lora/step-20000.safetensors") + pipe.load_lora(pipe.dit, "models/DiffSynth-Studio/Qwen-Image-EliGen/model.safetensors") model_dict[model_key] = pipe return pipe @@ -171,7 +174,7 @@ with gr.Blocks() as app: 2. On the right, input the **local prompt** for each entity, such as "person," and draw the corresponding mask in the **Entity Mask Painter**. Generally, solid rectangular masks yield better results. 3. Click the **Generate** button to create the image. By selecting different **random seeds**, you can generate diverse images. 4. **You can directly click the "Load Example" button on any sample at the bottom to load example inputs.** - """ + """ ) loading_status = gr.Textbox(label="Loading Model...", value="Loading model... Please wait...", visible=True) @@ -207,10 +210,9 @@ with gr.Blocks() as app: seed = gr.Number(minimum=0, maximum=10**9, value=42, interactive=True, label="Random seed", show_label=True) num_inference_steps = gr.Slider(minimum=1, maximum=100, value=30, step=1, interactive=True, label="Inference steps") cfg_scale = gr.Slider(minimum=2.0, maximum=10.0, value=4.0, step=0.1, interactive=True, label="Classifier-free guidance scale") - embedded_guidance = gr.Slider(minimum=0.0, maximum=10.0, value=3.5, step=0.1, interactive=True, label="Embedded guidance scale") height = gr.Slider(minimum=64, maximum=2048, value=1024, step=64, interactive=True, label="Height") width = gr.Slider(minimum=64, maximum=2048, value=1024, step=64, interactive=True, label="Width") - with gr.Accordion(label="Inpaint Input Image", open=False): + with gr.Accordion(label="Inpaint Input Image", open=False, visible=False): input_image = gr.Image(sources=None, show_label=False, interactive=True, type="pil") background_weight = gr.Slider(minimum=0.0, maximum=1000., value=0., step=1, interactive=False, label="background_weight", visible=False) @@ -266,11 +268,11 @@ with gr.Blocks() as app: mask_out = gr.State(None) @gr.on( - inputs=[model_type, prompt, negative_prompt, cfg_scale, embedded_guidance, num_inference_steps, height, width, return_with_mask, seed, input_image, background_weight, random_mask_dir] + local_prompt_list + canvas_list, + inputs=[model_type, prompt, negative_prompt, cfg_scale, num_inference_steps, height, width, return_with_mask, seed, input_image, background_weight, random_mask_dir] + local_prompt_list + canvas_list, outputs=[output_image, real_output, mask_out], triggers=run_button.click ) - def generate_image(model_type, prompt, negative_prompt, cfg_scale, embedded_guidance, num_inference_steps, height, width, return_with_mask, seed, input_image, background_weight, random_mask_dir, *args, progress=gr.Progress()): + def generate_image(model_type, prompt, negative_prompt, cfg_scale, num_inference_steps, height, width, return_with_mask, seed, input_image, background_weight, random_mask_dir, *args, progress=gr.Progress()): pipe = load_model(model_type) input_params = { "prompt": prompt, @@ -281,11 +283,9 @@ with gr.Blocks() as app: "width": width, "progress_bar_cmd": progress.tqdm, } - if isinstance(pipe, FluxImagePipeline): - input_params["embedded_guidance"] = embedded_guidance - if input_image is not None: - input_params["input_image"] = input_image.resize((width, height)).convert("RGB") - input_params["enable_eligen_inpaint"] = True + # if input_image is not None: + # input_params["input_image"] = input_image.resize((width, height)).convert("RGB") + # input_params["enable_eligen_inpaint"] = True local_prompt_list, canvas_list = ( args[0 * config["max_num_painter_layers"]: 1 * config["max_num_painter_layers"]], @@ -349,7 +349,7 @@ with gr.Blocks() as app: example = examples[i] with gr.Column(): example_image = gr.Image( - value=f"data/examples/eligen/entity_control/example_{example['example_id']}/example_image.png", + value=f"data/examples/eligen/qwen-image/example_{example['example_id']}/example_image.png", label=example["description"], interactive=False, width=1024, @@ -366,7 +366,7 @@ with gr.Blocks() as app: example = examples[i + 1] with gr.Column(): example_image = gr.Image( - value=f"data/examples/eligen/entity_control/example_{example['example_id']}/example_image.png", + value=f"data/examples/eligen/qwen-image/example_{example['example_id']}/example_image.png", label=example["description"], interactive=False, width=1024, diff --git a/examples/qwen_image/model_inference/Qwen-Image-EliGen.py b/examples/qwen_image/model_inference/Qwen-Image-EliGen.py index 76bee7a..afee321 100644 --- a/examples/qwen_image/model_inference/Qwen-Image-EliGen.py +++ b/examples/qwen_image/model_inference/Qwen-Image-EliGen.py @@ -1,14 +1,14 @@ from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig import torch from PIL import Image, ImageDraw, ImageFont -from modelscope import dataset_snapshot_download +from modelscope import dataset_snapshot_download, snapshot_download import random def visualize_masks(image, masks, mask_prompts, output_path, font_size=35, use_random_colors=False): # Create a blank image for overlays overlay = Image.new('RGBA', image.size, (0, 0, 0, 0)) - + colors = [ (165, 238, 173, 80), (76, 102, 221, 80), @@ -30,10 +30,10 @@ def visualize_masks(image, masks, mask_prompts, output_path, font_size=35, use_r # Generate random colors for each mask if use_random_colors: colors = [(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), 80) for _ in range(len(masks))] - + # Font settings try: - font = ImageFont.truetype("arial", font_size) # Adjust as needed + font = ImageFont.truetype("wqy-zenhei.ttc", font_size) # Adjust as needed except IOError: font = ImageFont.load_default(font_size) @@ -53,18 +53,18 @@ def visualize_masks(image, masks, mask_prompts, output_path, font_size=35, use_r # Alpha composite the overlay with this mask overlay = Image.alpha_composite(overlay, mask_rgba) - + # Composite the overlay onto the original image result = Image.alpha_composite(image.convert('RGBA'), overlay) - + # Save or display the resulting image result.save(output_path) return result def example(pipe, seeds, example_id, global_prompt, entity_prompts): - dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern=f"data/examples/eligen/entity_control/example_{example_id}/*.png") - masks = [Image.open(f"./data/examples/eligen/entity_control/example_{example_id}/{i}.png").convert('RGB') for i in range(len(entity_prompts))] + dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern=f"data/examples/eligen/qwen-image/example_{example_id}/*.png") + masks = [Image.open(f"./data/examples/eligen/qwen-image/example_{example_id}/{i}.png").convert('RGB') for i in range(len(entity_prompts))] negative_prompt = "" for seed in seeds: # generate image @@ -93,8 +93,8 @@ pipe = QwenImagePipeline.from_pretrained( ], tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), ) -pipe.load_lora(pipe.dit, "models/train/Qwen-Image-EliGen_lora/step-20000.safetensors") - +snapshot_download("DiffSynth-Studio/Qwen-Image-EliGen", local_dir="models/DiffSynth-Studio/Qwen-Image-EliGen", allow_file_pattern="model.safetensors") +pipe.load_lora(pipe.dit, "models/DiffSynth-Studio/Qwen-Image-EliGen/model.safetensors") # example 1 global_prompt = "A breathtaking beauty of Raja Ampat by the late-night moonlight , one beautiful woman from behind wearing a pale blue long dress with soft glow, sitting at the top of a cliff looking towards the beach,pastell light colors, a group of small distant birds flying in far sky, a boat sailing on the sea, best quality, realistic, whimsical, fantastic, splash art, intricate detailed, hyperdetailed, maximalist style, photorealistic, concept art, sharp focus, harmony, serenity, tranquility, soft pastell colors,ambient occlusion, cozy ambient lighting, masterpiece, liiv1, linquivera, metix, mentixis, masterpiece, award winning, view from above\n" @@ -103,7 +103,7 @@ example(pipe, [0], 1, global_prompt, entity_prompts) # example 2 global_prompt = "samurai girl wearing a kimono, she's holding a sword glowing with red flame, her long hair is flowing in the wind, she is looking at a small bird perched on the back of her hand. ultra realist style. maximum image detail. maximum realistic render." -entity_prompts = ["flowing hair", "sword glowing with red flame", "A cute bird", "blue belt"] +entity_prompts = ["flowing hair", "sword glowing with red flame", "A cute bird", "yellow belt"] example(pipe, [0], 2, global_prompt, entity_prompts) # example 3 @@ -121,13 +121,14 @@ global_prompt = "A captivating, dramatic scene in a painting that exudes mystery entity_prompts = ["crescent yellow moon", "a solitary woman", "water", "swirling blue clouds"] example(pipe, [0], 5, global_prompt, entity_prompts) -# example 6 -global_prompt = "Snow White and the 6 Dwarfs." -entity_prompts = ["Dwarf 1", "Dwarf 2", "Dwarf 3", "Snow White", "Dwarf 4", "Dwarf 5", "Dwarf 6"] -example(pipe, [8], 6, global_prompt, entity_prompts) +# example 6, poster +seeds = range(0, 1) +global_prompt = "瑞幸咖啡蓝莓奶背的宣传海报,主体是两杯浅绿色的瑞幸蓝莓奶昔杯装饮品,背景是浅蓝色水雾,海报写着“Luckin Coffee 蓝莓奶昔闪耀回归”,“新品上市” " +entity_prompts = ["杯装饮品", "杯装饮品", "字:“新品上市”", "字:“Luckin Coffee 蓝莓奶昔闪耀回归”"] +example(pipe, seeds, 6, global_prompt, entity_prompts) # example 7, same prompt with different seeds seeds = range(5, 9) -global_prompt = "A beautiful asia woman wearing white dress, holding a mirror, with a forest background;" +global_prompt = "A beautiful asia woman wearing white dress, holding a mirror, with a forest background." entity_prompts = ["A beautiful woman", "mirror", "necklace", "glasses", "earring", "white dress", "jewelry headpiece"] example(pipe, seeds, 7, global_prompt, entity_prompts) diff --git a/examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh b/examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh index ea2e659..99dbb7f 100644 --- a/examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh +++ b/examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh @@ -1,5 +1,5 @@ accelerate launch examples/qwen_image/model_training/train.py \ - --dataset_base_path data/example_image_dataset \ + --dataset_base_path "data/example_image_dataset" \ --dataset_metadata_path data/example_image_dataset/metadata_eligen.json \ --data_file_keys "image,eligen_entity_masks" \ --max_pixels 1048576 \ @@ -15,5 +15,4 @@ accelerate launch examples/qwen_image/model_training/train.py \ --align_to_opensource_format \ --extra_inputs "eligen_entity_masks,eligen_entity_prompts" \ --use_gradient_checkpointing \ - --dataset_num_workers 8 \ --find_unused_parameters diff --git a/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py b/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py index 90680d3..c65afcc 100644 --- a/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py +++ b/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py @@ -26,4 +26,4 @@ image = pipe(global_prompt, width=1024, eligen_entity_prompts=entity_prompts, eligen_entity_masks=masks) -image.save("image.jpg") +image.save("Qwen-Image_EliGen.jpg") From d3224e1fdc3a3cc75e50c24962bb9c5b4060e0b8 Mon Sep 17 00:00:00 2001 From: mi804 <1576993271@qq.com> Date: Wed, 6 Aug 2025 17:36:28 +0800 Subject: [PATCH 4/7] update qwen-image-eligen readme --- README.md | 3 ++- README_zh.md | 2 ++ examples/qwen_image/README.md | 2 +- examples/qwen_image/README_zh.md | 2 +- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 86fb792..c8c02f0 100644 --- a/README.md +++ b/README.md @@ -91,7 +91,7 @@ image.save("image.jpg") |-|-|-|-|-|-| |[Qwen/Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image)|[code](./examples/qwen_image/model_inference/Qwen-Image.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image.py)| |[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./examples/qwen_image/model_inference/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-Full.py)| - +|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)| ### FLUX Series @@ -363,6 +363,7 @@ https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/59fb2f7b-8de0-44 ## Update History +- **August 6, 2025** We open-sourced the entity control lora of Qwen-Image, [DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen). Qwen-Image-EliGen is able to achieve entity-level controlled text-to-image generation. See the [paper](https://arxiv.org/abs/2501.01097) for technical details. Training dataset: [EliGenTrainSet](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet). - **August 5, 2025** We open-sourced the distilled acceleration model of Qwen-Image, [DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full), achieving approximately 5x speedup. diff --git a/README_zh.md b/README_zh.md index 018f049..8bb0d9c 100644 --- a/README_zh.md +++ b/README_zh.md @@ -93,6 +93,7 @@ image.save("image.jpg") |-|-|-|-|-|-| |[Qwen/Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image)|[code](./examples/qwen_image/model_inference/Qwen-Image.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image.py)| |[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./examples/qwen_image/model_inference/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-Full.py)| +|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)| @@ -379,6 +380,7 @@ https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/59fb2f7b-8de0-44 ## 更新历史 +- **2025年8月6日** 我们开源了Qwen-Image的实体控制lora模型 [DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)。Qwen-Image-EliGen能够实现实体级受控的文本到图像生成。技术细节请参见[论文](https://arxiv.org/abs/2501.01097)。训练数据集:[EliGenTrainSet](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet)。 - **2025年8月5日** 我们开源了 Qwen-Image 的蒸馏加速模型 [DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full),实现了约 5 倍加速。 diff --git a/examples/qwen_image/README.md b/examples/qwen_image/README.md index c9fd4ae..48e7e00 100644 --- a/examples/qwen_image/README.md +++ b/examples/qwen_image/README.md @@ -44,7 +44,7 @@ image.save("image.jpg") |-|-|-|-|-|-| |[Qwen/Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image )|[code](./model_inference/Qwen-Image.py)|[code](./model_training/full/Qwen-Image.sh)|[code](./model_training/validate_full/Qwen-Image.py)|[code](./model_training/lora/Qwen-Image.sh)|[code](./model_training/validate_lora/Qwen-Image.py)| |[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./model_inference/Qwen-Image-Distill-Full.py)|[code](./model_training/full/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_lora/Qwen-Image-Distill-Full.py)| - +|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)| ## Model Inference diff --git a/examples/qwen_image/README_zh.md b/examples/qwen_image/README_zh.md index 0a311c1..a7385de 100644 --- a/examples/qwen_image/README_zh.md +++ b/examples/qwen_image/README_zh.md @@ -44,7 +44,7 @@ image.save("image.jpg") |-|-|-|-|-|-| |[Qwen/Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image)|[code](./model_inference/Qwen-Image.py)|[code](./model_training/full/Qwen-Image.sh)|[code](./model_training/validate_full/Qwen-Image.py)|[code](./model_training/lora/Qwen-Image.sh)|[code](./model_training/validate_lora/Qwen-Image.py)| |[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./model_inference/Qwen-Image-Distill-Full.py)|[code](./model_training/full/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_lora/Qwen-Image-Distill-Full.py)| - +|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)| ## 模型推理 From 2803ffcb384525a9b8f059b2faa9621556301f5f Mon Sep 17 00:00:00 2001 From: mi804 <1576993271@qq.com> Date: Wed, 6 Aug 2025 17:39:00 +0800 Subject: [PATCH 5/7] minor fix --- README.md | 2 +- README_zh.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c8c02f0..d93854d 100644 --- a/README.md +++ b/README.md @@ -363,7 +363,7 @@ https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/59fb2f7b-8de0-44 ## Update History -- **August 6, 2025** We open-sourced the entity control lora of Qwen-Image, [DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen). Qwen-Image-EliGen is able to achieve entity-level controlled text-to-image generation. See the [paper](https://arxiv.org/abs/2501.01097) for technical details. Training dataset: [EliGenTrainSet](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet). +- **August 6, 2025** We open-sourced the entity control LoRA of Qwen-Image, [DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen). Qwen-Image-EliGen is able to achieve entity-level controlled text-to-image generation. See the [paper](https://arxiv.org/abs/2501.01097) for technical details. Training dataset: [EliGenTrainSet](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet). - **August 5, 2025** We open-sourced the distilled acceleration model of Qwen-Image, [DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full), achieving approximately 5x speedup. diff --git a/README_zh.md b/README_zh.md index 8bb0d9c..d43ff2f 100644 --- a/README_zh.md +++ b/README_zh.md @@ -380,7 +380,7 @@ https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/59fb2f7b-8de0-44 ## 更新历史 -- **2025年8月6日** 我们开源了Qwen-Image的实体控制lora模型 [DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)。Qwen-Image-EliGen能够实现实体级受控的文本到图像生成。技术细节请参见[论文](https://arxiv.org/abs/2501.01097)。训练数据集:[EliGenTrainSet](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet)。 +- **2025年8月6日** 我们开源了 Qwen-Image 的实体控制 LoRA 模型 [DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)。Qwen-Image-EliGen 能够实现实体级受控的文本到图像生成。技术细节请参见[论文](https://arxiv.org/abs/2501.01097)。训练数据集:[EliGenTrainSet](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet)。 - **2025年8月5日** 我们开源了 Qwen-Image 的蒸馏加速模型 [DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full),实现了约 5 倍加速。 From bee2a969e59619aa2865099acf3dea25a17be03a Mon Sep 17 00:00:00 2001 From: mi804 <1576993271@qq.com> Date: Wed, 6 Aug 2025 17:48:44 +0800 Subject: [PATCH 6/7] minor fix readme and path --- README.md | 2 +- README_zh.md | 2 +- examples/qwen_image/README.md | 2 +- examples/qwen_image/README_zh.md | 2 +- .../model_training/validate_lora/Qwen-Image-EliGen.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index d93854d..7845523 100644 --- a/README.md +++ b/README.md @@ -91,7 +91,7 @@ image.save("image.jpg") |-|-|-|-|-|-| |[Qwen/Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image)|[code](./examples/qwen_image/model_inference/Qwen-Image.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image.py)| |[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./examples/qwen_image/model_inference/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-Full.py)| -|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)| +|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)| ### FLUX Series diff --git a/README_zh.md b/README_zh.md index d43ff2f..386cc3e 100644 --- a/README_zh.md +++ b/README_zh.md @@ -93,7 +93,7 @@ image.save("image.jpg") |-|-|-|-|-|-| |[Qwen/Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image)|[code](./examples/qwen_image/model_inference/Qwen-Image.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image.py)| |[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./examples/qwen_image/model_inference/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-Full.py)| -|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)| +|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)| diff --git a/examples/qwen_image/README.md b/examples/qwen_image/README.md index 48e7e00..eeb49c4 100644 --- a/examples/qwen_image/README.md +++ b/examples/qwen_image/README.md @@ -44,7 +44,7 @@ image.save("image.jpg") |-|-|-|-|-|-| |[Qwen/Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image )|[code](./model_inference/Qwen-Image.py)|[code](./model_training/full/Qwen-Image.sh)|[code](./model_training/validate_full/Qwen-Image.py)|[code](./model_training/lora/Qwen-Image.sh)|[code](./model_training/validate_lora/Qwen-Image.py)| |[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./model_inference/Qwen-Image-Distill-Full.py)|[code](./model_training/full/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_lora/Qwen-Image-Distill-Full.py)| -|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)| +|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)| ## Model Inference diff --git a/examples/qwen_image/README_zh.md b/examples/qwen_image/README_zh.md index a7385de..8e4563a 100644 --- a/examples/qwen_image/README_zh.md +++ b/examples/qwen_image/README_zh.md @@ -44,7 +44,7 @@ image.save("image.jpg") |-|-|-|-|-|-| |[Qwen/Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image)|[code](./model_inference/Qwen-Image.py)|[code](./model_training/full/Qwen-Image.sh)|[code](./model_training/validate_full/Qwen-Image.py)|[code](./model_training/lora/Qwen-Image.sh)|[code](./model_training/validate_lora/Qwen-Image.py)| |[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./model_inference/Qwen-Image-Distill-Full.py)|[code](./model_training/full/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_lora/Qwen-Image-Distill-Full.py)| -|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)| +|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)| ## 模型推理 diff --git a/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py b/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py index c65afcc..cd7904e 100644 --- a/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py +++ b/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py @@ -13,7 +13,7 @@ pipe = QwenImagePipeline.from_pretrained( ], tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), ) -pipe.load_lora(pipe.dit, "models/train/Qwen-Image_lora/epoch-4.safetensors") +pipe.load_lora(pipe.dit, "models/train/Qwen-Image-EliGen_lora/epoch-4.safetensors") entity_prompts = ["A beautiful girl", "sign 'Entity Control'", "shorts", "shirt"] From 57128dc89f8c8825c24eaa27000f1250a84b1ac3 Mon Sep 17 00:00:00 2001 From: mi804 <1576993271@qq.com> Date: Thu, 7 Aug 2025 13:42:47 +0800 Subject: [PATCH 7/7] update readme for qwen-image-eligen --- README.md | 2 +- README_zh.md | 2 +- examples/qwen_image/README.md | 2 +- examples/qwen_image/README_zh.md | 2 +- examples/qwen_image/model_inference/Qwen-Image-EliGen.py | 6 ------ 5 files changed, 4 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 7845523..181a645 100644 --- a/README.md +++ b/README.md @@ -363,7 +363,7 @@ https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/59fb2f7b-8de0-44 ## Update History -- **August 6, 2025** We open-sourced the entity control LoRA of Qwen-Image, [DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen). Qwen-Image-EliGen is able to achieve entity-level controlled text-to-image generation. See the [paper](https://arxiv.org/abs/2501.01097) for technical details. Training dataset: [EliGenTrainSet](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet). +- **August 7, 2025** We open-sourced the entity control LoRA of Qwen-Image, [DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen). Qwen-Image-EliGen is able to achieve entity-level controlled text-to-image generation. See the [paper](https://arxiv.org/abs/2501.01097) for technical details. Training dataset: [EliGenTrainSet](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet). - **August 5, 2025** We open-sourced the distilled acceleration model of Qwen-Image, [DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full), achieving approximately 5x speedup. diff --git a/README_zh.md b/README_zh.md index 386cc3e..7007a0e 100644 --- a/README_zh.md +++ b/README_zh.md @@ -380,7 +380,7 @@ https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/59fb2f7b-8de0-44 ## 更新历史 -- **2025年8月6日** 我们开源了 Qwen-Image 的实体控制 LoRA 模型 [DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)。Qwen-Image-EliGen 能够实现实体级受控的文本到图像生成。技术细节请参见[论文](https://arxiv.org/abs/2501.01097)。训练数据集:[EliGenTrainSet](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet)。 +- **2025年8月7日** 我们开源了 Qwen-Image 的实体控制 LoRA 模型 [DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)。Qwen-Image-EliGen 能够实现实体级可控的文生图。技术细节请参见[论文](https://arxiv.org/abs/2501.01097)。训练数据集:[EliGenTrainSet](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet)。 - **2025年8月5日** 我们开源了 Qwen-Image 的蒸馏加速模型 [DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full),实现了约 5 倍加速。 diff --git a/examples/qwen_image/README.md b/examples/qwen_image/README.md index eeb49c4..6eaa533 100644 --- a/examples/qwen_image/README.md +++ b/examples/qwen_image/README.md @@ -44,7 +44,7 @@ image.save("image.jpg") |-|-|-|-|-|-| |[Qwen/Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image )|[code](./model_inference/Qwen-Image.py)|[code](./model_training/full/Qwen-Image.sh)|[code](./model_training/validate_full/Qwen-Image.py)|[code](./model_training/lora/Qwen-Image.sh)|[code](./model_training/validate_lora/Qwen-Image.py)| |[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./model_inference/Qwen-Image-Distill-Full.py)|[code](./model_training/full/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_lora/Qwen-Image-Distill-Full.py)| -|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)| +|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./model_inference/Qwen-Image-EliGen.py)|-|-|[code](./model_training/lora/Qwen-Image-EliGen.sh)|[code](./model_training/validate_lora/Qwen-Image-EliGen.py)| ## Model Inference diff --git a/examples/qwen_image/README_zh.md b/examples/qwen_image/README_zh.md index 8e4563a..305e1ab 100644 --- a/examples/qwen_image/README_zh.md +++ b/examples/qwen_image/README_zh.md @@ -44,7 +44,7 @@ image.save("image.jpg") |-|-|-|-|-|-| |[Qwen/Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image)|[code](./model_inference/Qwen-Image.py)|[code](./model_training/full/Qwen-Image.sh)|[code](./model_training/validate_full/Qwen-Image.py)|[code](./model_training/lora/Qwen-Image.sh)|[code](./model_training/validate_lora/Qwen-Image.py)| |[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./model_inference/Qwen-Image-Distill-Full.py)|[code](./model_training/full/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_lora/Qwen-Image-Distill-Full.py)| -|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)| +|[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./model_inference/Qwen-Image-EliGen.py)|-|-|[code](./model_training/lora/Qwen-Image-EliGen.sh)|[code](./model_training/validate_lora/Qwen-Image-EliGen.py)| ## 模型推理 diff --git a/examples/qwen_image/model_inference/Qwen-Image-EliGen.py b/examples/qwen_image/model_inference/Qwen-Image-EliGen.py index afee321..ab0fd14 100644 --- a/examples/qwen_image/model_inference/Qwen-Image-EliGen.py +++ b/examples/qwen_image/model_inference/Qwen-Image-EliGen.py @@ -121,12 +121,6 @@ global_prompt = "A captivating, dramatic scene in a painting that exudes mystery entity_prompts = ["crescent yellow moon", "a solitary woman", "water", "swirling blue clouds"] example(pipe, [0], 5, global_prompt, entity_prompts) -# example 6, poster -seeds = range(0, 1) -global_prompt = "瑞幸咖啡蓝莓奶背的宣传海报,主体是两杯浅绿色的瑞幸蓝莓奶昔杯装饮品,背景是浅蓝色水雾,海报写着“Luckin Coffee 蓝莓奶昔闪耀回归”,“新品上市” " -entity_prompts = ["杯装饮品", "杯装饮品", "字:“新品上市”", "字:“Luckin Coffee 蓝莓奶昔闪耀回归”"] -example(pipe, seeds, 6, global_prompt, entity_prompts) - # example 7, same prompt with different seeds seeds = range(5, 9) global_prompt = "A beautiful asia woman wearing white dress, holding a mirror, with a forest background."