diff --git a/apps/gradio/qwen_image_eligen.py b/apps/gradio/qwen_image_eligen.py index 38dcf71..c224f01 100644 --- a/apps/gradio/qwen_image_eligen.py +++ b/apps/gradio/qwen_image_eligen.py @@ -5,23 +5,23 @@ from PIL import Image, ImageDraw, ImageFont import random import json import gradio as gr -from diffsynth import ModelManager, FluxImagePipeline, download_customized_models -from modelscope import dataset_snapshot_download from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig +from modelscope import dataset_snapshot_download, snapshot_download # pip install pydantic==2.10.6 # pip install gradio==5.4.0 +snapshot_download("DiffSynth-Studio/Qwen-Image-EliGen", local_dir="models/DiffSynth-Studio/Qwen-Image-EliGen", allow_file_pattern="model.safetensors") -dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern=f"data/examples/eligen/entity_control/*") -example_json = 'data/examples/eligen/entity_control/ui_examples.json' +dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern=f"data/examples/eligen/qwen-image/*") +example_json = 'data/examples/eligen/qwen-image/ui_examples.json' with open(example_json, 'r') as f: examples = json.load(f)['examples'] for idx in range(len(examples)): example_id = examples[idx]['example_id'] entity_prompts = examples[idx]['local_prompt_list'] - examples[idx]['mask_lists'] = [Image.open(f"data/examples/eligen/entity_control/example_{example_id}/{i}.png").convert('RGB') for i in range(len(entity_prompts))] + examples[idx]['mask_lists'] = [Image.open(f"data/examples/eligen/qwen-image/example_{example_id}/{i}.png").convert('RGB') for i in range(len(entity_prompts))] def create_canvas_data(background, masks): if background.shape[-1] == 3: @@ -113,7 +113,10 @@ def visualize_masks(image, masks, mask_prompts, font_size=35, use_random_colors= if use_random_colors: colors = [(random.randint(0, 255), 
random.randint(0, 255), random.randint(0, 255), 80) for _ in range(len(masks))] # Font settings - font = ImageFont.truetype("dinglieciweifont20250217-2.ttf", font_size) # Adjust as needed + try: + font = ImageFont.truetype("wqy-zenhei.ttc", font_size) # Adjust as needed + except IOError: + font = ImageFont.load_default(font_size) # Overlay each mask onto the overlay image for mask, mask_prompt, color in zip(masks, mask_prompts, colors): if mask is None: @@ -158,7 +161,7 @@ def load_model(model_type='qwen-image'): ], tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), ) - pipe.load_lora(pipe.dit, "models/train/Qwen-Image-EliGen_lora/step-20000.safetensors") + pipe.load_lora(pipe.dit, "models/DiffSynth-Studio/Qwen-Image-EliGen/model.safetensors") model_dict[model_key] = pipe return pipe @@ -171,7 +174,7 @@ with gr.Blocks() as app: 2. On the right, input the **local prompt** for each entity, such as "person," and draw the corresponding mask in the **Entity Mask Painter**. Generally, solid rectangular masks yield better results. 3. Click the **Generate** button to create the image. By selecting different **random seeds**, you can generate diverse images. 4. **You can directly click the "Load Example" button on any sample at the bottom to load example inputs.** - """ + """ ) loading_status = gr.Textbox(label="Loading Model...", value="Loading model... 
Please wait...", visible=True) @@ -207,10 +210,9 @@ with gr.Blocks() as app: seed = gr.Number(minimum=0, maximum=10**9, value=42, interactive=True, label="Random seed", show_label=True) num_inference_steps = gr.Slider(minimum=1, maximum=100, value=30, step=1, interactive=True, label="Inference steps") cfg_scale = gr.Slider(minimum=2.0, maximum=10.0, value=4.0, step=0.1, interactive=True, label="Classifier-free guidance scale") - embedded_guidance = gr.Slider(minimum=0.0, maximum=10.0, value=3.5, step=0.1, interactive=True, label="Embedded guidance scale") height = gr.Slider(minimum=64, maximum=2048, value=1024, step=64, interactive=True, label="Height") width = gr.Slider(minimum=64, maximum=2048, value=1024, step=64, interactive=True, label="Width") - with gr.Accordion(label="Inpaint Input Image", open=False): + with gr.Accordion(label="Inpaint Input Image", open=False, visible=False): input_image = gr.Image(sources=None, show_label=False, interactive=True, type="pil") background_weight = gr.Slider(minimum=0.0, maximum=1000., value=0., step=1, interactive=False, label="background_weight", visible=False) @@ -266,11 +268,11 @@ with gr.Blocks() as app: mask_out = gr.State(None) @gr.on( - inputs=[model_type, prompt, negative_prompt, cfg_scale, embedded_guidance, num_inference_steps, height, width, return_with_mask, seed, input_image, background_weight, random_mask_dir] + local_prompt_list + canvas_list, + inputs=[model_type, prompt, negative_prompt, cfg_scale, num_inference_steps, height, width, return_with_mask, seed, input_image, background_weight, random_mask_dir] + local_prompt_list + canvas_list, outputs=[output_image, real_output, mask_out], triggers=run_button.click ) - def generate_image(model_type, prompt, negative_prompt, cfg_scale, embedded_guidance, num_inference_steps, height, width, return_with_mask, seed, input_image, background_weight, random_mask_dir, *args, progress=gr.Progress()): + def generate_image(model_type, prompt, negative_prompt, cfg_scale, 
num_inference_steps, height, width, return_with_mask, seed, input_image, background_weight, random_mask_dir, *args, progress=gr.Progress()): pipe = load_model(model_type) input_params = { "prompt": prompt, @@ -281,11 +283,9 @@ with gr.Blocks() as app: "width": width, "progress_bar_cmd": progress.tqdm, } - if isinstance(pipe, FluxImagePipeline): - input_params["embedded_guidance"] = embedded_guidance - if input_image is not None: - input_params["input_image"] = input_image.resize((width, height)).convert("RGB") - input_params["enable_eligen_inpaint"] = True + # if input_image is not None: + # input_params["input_image"] = input_image.resize((width, height)).convert("RGB") + # input_params["enable_eligen_inpaint"] = True local_prompt_list, canvas_list = ( args[0 * config["max_num_painter_layers"]: 1 * config["max_num_painter_layers"]], @@ -349,7 +349,7 @@ with gr.Blocks() as app: example = examples[i] with gr.Column(): example_image = gr.Image( - value=f"data/examples/eligen/entity_control/example_{example['example_id']}/example_image.png", + value=f"data/examples/eligen/qwen-image/example_{example['example_id']}/example_image.png", label=example["description"], interactive=False, width=1024, @@ -366,7 +366,7 @@ with gr.Blocks() as app: example = examples[i + 1] with gr.Column(): example_image = gr.Image( - value=f"data/examples/eligen/entity_control/example_{example['example_id']}/example_image.png", + value=f"data/examples/eligen/qwen-image/example_{example['example_id']}/example_image.png", label=example["description"], interactive=False, width=1024, diff --git a/examples/qwen_image/model_inference/Qwen-Image-EliGen.py b/examples/qwen_image/model_inference/Qwen-Image-EliGen.py index 76bee7a..afee321 100644 --- a/examples/qwen_image/model_inference/Qwen-Image-EliGen.py +++ b/examples/qwen_image/model_inference/Qwen-Image-EliGen.py @@ -1,14 +1,14 @@ from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig import torch from PIL import Image, 
ImageDraw, ImageFont -from modelscope import dataset_snapshot_download +from modelscope import dataset_snapshot_download, snapshot_download import random def visualize_masks(image, masks, mask_prompts, output_path, font_size=35, use_random_colors=False): # Create a blank image for overlays overlay = Image.new('RGBA', image.size, (0, 0, 0, 0)) - + colors = [ (165, 238, 173, 80), (76, 102, 221, 80), @@ -30,10 +30,10 @@ def visualize_masks(image, masks, mask_prompts, output_path, font_size=35, use_r # Generate random colors for each mask if use_random_colors: colors = [(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), 80) for _ in range(len(masks))] - + # Font settings try: - font = ImageFont.truetype("arial", font_size) # Adjust as needed + font = ImageFont.truetype("wqy-zenhei.ttc", font_size) # Adjust as needed except IOError: font = ImageFont.load_default(font_size) @@ -53,18 +53,18 @@ def visualize_masks(image, masks, mask_prompts, output_path, font_size=35, use_r # Alpha composite the overlay with this mask overlay = Image.alpha_composite(overlay, mask_rgba) - + # Composite the overlay onto the original image result = Image.alpha_composite(image.convert('RGBA'), overlay) - + # Save or display the resulting image result.save(output_path) return result def example(pipe, seeds, example_id, global_prompt, entity_prompts): - dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern=f"data/examples/eligen/entity_control/example_{example_id}/*.png") - masks = [Image.open(f"./data/examples/eligen/entity_control/example_{example_id}/{i}.png").convert('RGB') for i in range(len(entity_prompts))] + dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern=f"data/examples/eligen/qwen-image/example_{example_id}/*.png") + masks = [Image.open(f"./data/examples/eligen/qwen-image/example_{example_id}/{i}.png").convert('RGB') for i in 
range(len(entity_prompts))] negative_prompt = "" for seed in seeds: # generate image @@ -93,8 +93,8 @@ pipe = QwenImagePipeline.from_pretrained( ], tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), ) -pipe.load_lora(pipe.dit, "models/train/Qwen-Image-EliGen_lora/step-20000.safetensors") - +snapshot_download("DiffSynth-Studio/Qwen-Image-EliGen", local_dir="models/DiffSynth-Studio/Qwen-Image-EliGen", allow_file_pattern="model.safetensors") +pipe.load_lora(pipe.dit, "models/DiffSynth-Studio/Qwen-Image-EliGen/model.safetensors") # example 1 global_prompt = "A breathtaking beauty of Raja Ampat by the late-night moonlight , one beautiful woman from behind wearing a pale blue long dress with soft glow, sitting at the top of a cliff looking towards the beach,pastell light colors, a group of small distant birds flying in far sky, a boat sailing on the sea, best quality, realistic, whimsical, fantastic, splash art, intricate detailed, hyperdetailed, maximalist style, photorealistic, concept art, sharp focus, harmony, serenity, tranquility, soft pastell colors,ambient occlusion, cozy ambient lighting, masterpiece, liiv1, linquivera, metix, mentixis, masterpiece, award winning, view from above\n" @@ -103,7 +103,7 @@ example(pipe, [0], 1, global_prompt, entity_prompts) # example 2 global_prompt = "samurai girl wearing a kimono, she's holding a sword glowing with red flame, her long hair is flowing in the wind, she is looking at a small bird perched on the back of her hand. ultra realist style. maximum image detail. maximum realistic render." 
-entity_prompts = ["flowing hair", "sword glowing with red flame", "A cute bird", "blue belt"] +entity_prompts = ["flowing hair", "sword glowing with red flame", "A cute bird", "yellow belt"] example(pipe, [0], 2, global_prompt, entity_prompts) # example 3 @@ -121,13 +121,14 @@ global_prompt = "A captivating, dramatic scene in a painting that exudes mystery entity_prompts = ["crescent yellow moon", "a solitary woman", "water", "swirling blue clouds"] example(pipe, [0], 5, global_prompt, entity_prompts) -# example 6 -global_prompt = "Snow White and the 6 Dwarfs." -entity_prompts = ["Dwarf 1", "Dwarf 2", "Dwarf 3", "Snow White", "Dwarf 4", "Dwarf 5", "Dwarf 6"] -example(pipe, [8], 6, global_prompt, entity_prompts) +# example 6, poster +seeds = range(0, 1) +global_prompt = "瑞幸咖啡蓝莓奶昔的宣传海报,主体是两杯浅绿色的瑞幸蓝莓奶昔杯装饮品,背景是浅蓝色水雾,海报写着“Luckin Coffee 蓝莓奶昔闪耀回归”,“新品上市” " +entity_prompts = ["杯装饮品", "杯装饮品", "字:“新品上市”", "字:“Luckin Coffee 蓝莓奶昔闪耀回归”"] +example(pipe, seeds, 6, global_prompt, entity_prompts) # example 7, same prompt with different seeds seeds = range(5, 9) -global_prompt = "A beautiful asia woman wearing white dress, holding a mirror, with a forest background;" +global_prompt = "A beautiful asia woman wearing white dress, holding a mirror, with a forest background." 
entity_prompts = ["A beautiful woman", "mirror", "necklace", "glasses", "earring", "white dress", "jewelry headpiece"] example(pipe, seeds, 7, global_prompt, entity_prompts) diff --git a/examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh b/examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh index ea2e659..99dbb7f 100644 --- a/examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh +++ b/examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh @@ -1,5 +1,5 @@ accelerate launch examples/qwen_image/model_training/train.py \ - --dataset_base_path data/example_image_dataset \ + --dataset_base_path "data/example_image_dataset" \ --dataset_metadata_path data/example_image_dataset/metadata_eligen.json \ --data_file_keys "image,eligen_entity_masks" \ --max_pixels 1048576 \ @@ -15,5 +15,4 @@ accelerate launch examples/qwen_image/model_training/train.py \ --align_to_opensource_format \ --extra_inputs "eligen_entity_masks,eligen_entity_prompts" \ --use_gradient_checkpointing \ - --dataset_num_workers 8 \ --find_unused_parameters diff --git a/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py b/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py index 90680d3..c65afcc 100644 --- a/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py +++ b/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py @@ -26,4 +26,4 @@ image = pipe(global_prompt, width=1024, eligen_entity_prompts=entity_prompts, eligen_entity_masks=masks) -image.save("image.jpg") +image.save("Qwen-Image_EliGen.jpg")