diff --git a/README.md b/README.md
index 1acdff9..3b5facb 100644
--- a/README.md
+++ b/README.md
@@ -95,6 +95,7 @@ image.save("image.jpg")
 |-|-|-|-|-|-|-|
 |[Qwen/Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image)|[code](./examples/qwen_image/model_inference/Qwen-Image.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image.py)|
 |[Qwen/Qwen-Image-Edit](https://www.modelscope.cn/models/Qwen/Qwen-Image-Edit)|[code](./examples/qwen_image/model_inference/Qwen-Image-Edit.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image-Edit.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image-Edit.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image-Edit.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-Edit.py)|
+|[Qwen/Qwen-Image-Edit-2509](https://www.modelscope.cn/models/Qwen/Qwen-Image-Edit-2509)|[code](./examples/qwen_image/model_inference/Qwen-Image-Edit-2509.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-2509.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image-Edit-2509.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image-Edit-2509.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image-Edit-2509.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-Edit-2509.py)|
 |[DiffSynth-Studio/Qwen-Image-EliGen-V2](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen-V2)|[code](./examples/qwen_image/model_inference/Qwen-Image-EliGen-V2.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen-V2.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)|
 |[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./examples/qwen_image/model_inference/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-Full.py)|
 |[DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA)|[code](./examples/qwen_image/model_inference/Qwen-Image-Distill-LoRA.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-Distill-LoRA.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-Distill-LoRA.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-LoRA.py)|
diff --git a/README_zh.md b/README_zh.md
index bba1659..2b40b50 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -97,6 +97,7 @@ image.save("image.jpg")
 |-|-|-|-|-|-|-|
 |[Qwen/Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image)|[code](./examples/qwen_image/model_inference/Qwen-Image.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image.py)|
 |[Qwen/Qwen-Image-Edit](https://www.modelscope.cn/models/Qwen/Qwen-Image-Edit)|[code](./examples/qwen_image/model_inference/Qwen-Image-Edit.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image-Edit.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image-Edit.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image-Edit.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-Edit.py)|
+|[Qwen/Qwen-Image-Edit-2509](https://www.modelscope.cn/models/Qwen/Qwen-Image-Edit-2509)|[code](./examples/qwen_image/model_inference/Qwen-Image-Edit-2509.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-2509.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image-Edit-2509.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image-Edit-2509.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image-Edit-2509.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-Edit-2509.py)|
 |[DiffSynth-Studio/Qwen-Image-EliGen-V2](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen-V2)|[code](./examples/qwen_image/model_inference/Qwen-Image-EliGen-V2.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen-V2.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)|
 |[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./examples/qwen_image/model_inference/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-Full.py)|
 |[DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA)|[code](./examples/qwen_image/model_inference/Qwen-Image-Distill-LoRA.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-Distill-LoRA.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-Distill-LoRA.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-LoRA.py)|
diff --git a/diffsynth/pipelines/qwen_image.py b/diffsynth/pipelines/qwen_image.py
index 032b7a3..e949b56 100644
--- a/diffsynth/pipelines/qwen_image.py
+++ b/diffsynth/pipelines/qwen_image.py
@@ -524,37 +524,63 @@ class QwenImageUnit_PromptEmbedder(PipelineUnit):
         selected = hidden_states[bool_mask]
         split_result = torch.split(selected, valid_lengths.tolist(), dim=0)
         return split_result
+
+    def calculate_dimensions(self, target_area, ratio):
+        import math
+        width = math.sqrt(target_area * ratio)
+        height = width / ratio
+        width = round(width / 32) * 32
+        height = round(height / 32) * 32
+        return width, height
+
+    def resize_image(self, image, target_area=384*384):
+        width, height = self.calculate_dimensions(target_area, image.size[0] / image.size[1])
+        return image.resize((width, height))
+
+    def encode_prompt(self, pipe: QwenImagePipeline, prompt):
+        template = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
+        drop_idx = 34
+        txt = [template.format(e) for e in prompt]
+        model_inputs = pipe.tokenizer(txt, max_length=4096+drop_idx, padding=True, truncation=True, return_tensors="pt").to(pipe.device)
+        if model_inputs.input_ids.shape[1] >= 1024:
+            print(f"Warning!!! QwenImage model was trained on prompts up to 512 tokens. Current prompt requires {model_inputs['input_ids'].shape[1] - drop_idx} tokens, which may lead to unpredictable behavior.")
+        hidden_states = pipe.text_encoder(input_ids=model_inputs.input_ids, attention_mask=model_inputs.attention_mask, output_hidden_states=True,)[-1]
+        split_hidden_states = self.extract_masked_hidden(hidden_states, model_inputs.attention_mask)
+        split_hidden_states = [e[drop_idx:] for e in split_hidden_states]
+        return split_hidden_states
+
+    def encode_prompt_edit(self, pipe: QwenImagePipeline, prompt, edit_image):
+        template = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"
+        drop_idx = 64
+        txt = [template.format(e) for e in prompt]
+        model_inputs = pipe.processor(text=txt, images=edit_image, padding=True, return_tensors="pt").to(pipe.device)
+        hidden_states = pipe.text_encoder(input_ids=model_inputs.input_ids, attention_mask=model_inputs.attention_mask, pixel_values=model_inputs.pixel_values, image_grid_thw=model_inputs.image_grid_thw, output_hidden_states=True,)[-1]
+        split_hidden_states = self.extract_masked_hidden(hidden_states, model_inputs.attention_mask)
+        split_hidden_states = [e[drop_idx:] for e in split_hidden_states]
+        return split_hidden_states
+
+    def encode_prompt_edit_multi(self, pipe: QwenImagePipeline, prompt, edit_image):
+        template = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
+        drop_idx = 64
+        img_prompt_template = "Picture {}: <|vision_start|><|image_pad|><|vision_end|>"
+        base_img_prompt = "".join([img_prompt_template.format(i + 1) for i in range(len(edit_image))])
+        txt = [template.format(base_img_prompt + e) for e in prompt]
+        edit_image = [self.resize_image(image) for image in edit_image]
+        model_inputs = pipe.processor(text=txt, images=edit_image, padding=True, return_tensors="pt").to(pipe.device)
+        hidden_states = pipe.text_encoder(input_ids=model_inputs.input_ids, attention_mask=model_inputs.attention_mask, pixel_values=model_inputs.pixel_values, image_grid_thw=model_inputs.image_grid_thw, output_hidden_states=True,)[-1]
+        split_hidden_states = self.extract_masked_hidden(hidden_states, model_inputs.attention_mask)
+        split_hidden_states = [e[drop_idx:] for e in split_hidden_states]
+        return split_hidden_states
 
     def process(self, pipe: QwenImagePipeline, prompt, edit_image=None) -> dict:
         if pipe.text_encoder is not None:
             prompt = [prompt]
-            # If edit_image is None, use the default template for Qwen-Image, otherwise use the template for Qwen-Image-Edit
             if edit_image is None:
-                template = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
-                drop_idx = 34
+                split_hidden_states = self.encode_prompt(pipe, prompt)
+            elif isinstance(edit_image, Image.Image):
+                split_hidden_states = self.encode_prompt_edit(pipe, prompt, edit_image)
             else:
-                template = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"
-                drop_idx = 64
-            txt = [template.format(e) for e in prompt]
-
-            # Qwen-Image-Edit model
-            if pipe.processor is not None:
-                model_inputs = pipe.processor(text=txt, images=edit_image, padding=True, return_tensors="pt").to(pipe.device)
-            # Qwen-Image model
-            elif pipe.tokenizer is not None:
-                model_inputs = pipe.tokenizer(txt, max_length=4096+drop_idx, padding=True, truncation=True, return_tensors="pt").to(pipe.device)
-                if model_inputs.input_ids.shape[1] >= 1024:
-                    print(f"Warning!!! QwenImage model was trained on prompts up to 512 tokens. Current prompt requires {model_inputs['input_ids'].shape[1] - drop_idx} tokens, which may lead to unpredictable behavior.")
-            else:
-                assert False, "QwenImagePipeline requires either tokenizer or processor to be loaded."
-
-            if 'pixel_values' in model_inputs:
-                hidden_states = pipe.text_encoder(input_ids=model_inputs.input_ids, attention_mask=model_inputs.attention_mask, pixel_values=model_inputs.pixel_values, image_grid_thw=model_inputs.image_grid_thw, output_hidden_states=True,)[-1]
-            else:
-                hidden_states = pipe.text_encoder(input_ids=model_inputs.input_ids, attention_mask=model_inputs.attention_mask, output_hidden_states=True,)[-1]
-
-            split_hidden_states = self.extract_masked_hidden(hidden_states, model_inputs.attention_mask)
-            split_hidden_states = [e[drop_idx:] for e in split_hidden_states]
+                split_hidden_states = self.encode_prompt_edit_multi(pipe, prompt, edit_image)
             attn_mask_list = [torch.ones(e.size(0), dtype=torch.long, device=e.device) for e in split_hidden_states]
             max_seq_len = max([e.size(0) for e in split_hidden_states])
             prompt_embeds = torch.stack([torch.cat([u, u.new_zeros(max_seq_len - u.size(0), u.size(1))]) for u in split_hidden_states])
@@ -712,12 +738,23 @@ class QwenImageUnit_EditImageEmbedder(PipelineUnit):
     def process(self, pipe: QwenImagePipeline, edit_image, tiled, tile_size, tile_stride, edit_image_auto_resize=False):
         if edit_image is None:
             return {}
-        resized_edit_image = self.edit_image_auto_resize(edit_image) if edit_image_auto_resize else edit_image
         pipe.load_models_to_device(['vae'])
-        edit_image = pipe.preprocess_image(resized_edit_image).to(device=pipe.device, dtype=pipe.torch_dtype)
-        edit_latents = pipe.vae.encode(edit_image, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride)
+        if isinstance(edit_image, Image.Image):
+            resized_edit_image = self.edit_image_auto_resize(edit_image) if edit_image_auto_resize else edit_image
+            edit_image = pipe.preprocess_image(resized_edit_image).to(device=pipe.device, dtype=pipe.torch_dtype)
+            edit_latents = pipe.vae.encode(edit_image, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride)
+        else:
+            resized_edit_image, edit_latents = [], []
+            for image in edit_image:
+                if edit_image_auto_resize:
+                    image = self.edit_image_auto_resize(image)
+                resized_edit_image.append(image)
+                image = pipe.preprocess_image(image).to(device=pipe.device, dtype=pipe.torch_dtype)
+                latents = pipe.vae.encode(image, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride)
+                edit_latents.append(latents)
         return {"edit_latents": edit_latents, "edit_image": resized_edit_image}
+
 
 class QwenImageUnit_ContextImageEmbedder(PipelineUnit):
     def __init__(self):
         super().__init__(
@@ -770,9 +807,10 @@ def model_fn_qwen_image(
         context_image = rearrange(context_latents, "B C (H P) (W Q) -> B (H W) (C P Q)", H=context_latents.shape[2]//2, W=context_latents.shape[3]//2, P=2, Q=2)
         image = torch.cat([image, context_image], dim=1)
     if edit_latents is not None:
-        img_shapes += [(edit_latents.shape[0], edit_latents.shape[2]//2, edit_latents.shape[3]//2)]
-        edit_image = rearrange(edit_latents, "B C (H P) (W Q) -> B (H W) (C P Q)", H=edit_latents.shape[2]//2, W=edit_latents.shape[3]//2, P=2, Q=2)
-        image = torch.cat([image, edit_image], dim=1)
+        edit_latents_list = edit_latents if isinstance(edit_latents, list) else [edit_latents]
+        img_shapes += [(e.shape[0], e.shape[2]//2, e.shape[3]//2) for e in edit_latents_list]
+        edit_image = [rearrange(e, "B C (H P) (W Q) -> B (H W) (C P Q)", H=e.shape[2]//2, W=e.shape[3]//2, P=2, Q=2) for e in edit_latents_list]
+        image = torch.cat([image] + edit_image, dim=1)
 
     image = dit.img_in(image)
     conditioning = dit.time_text_embed(timestep, image.dtype)
diff --git a/examples/qwen_image/README.md b/examples/qwen_image/README.md
index d01da52..4190ade 100644
--- a/examples/qwen_image/README.md
+++ b/examples/qwen_image/README.md
@@ -47,6 +47,7 @@ image.save("image.jpg")
 |-|-|-|-|-|-|-|
 |[Qwen/Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image)|[code](./model_inference/Qwen-Image.py)|[code](./model_inference_low_vram/Qwen-Image.py)|[code](./model_training/full/Qwen-Image.sh)|[code](./model_training/validate_full/Qwen-Image.py)|[code](./model_training/lora/Qwen-Image.sh)|[code](./model_training/validate_lora/Qwen-Image.py)|
 |[Qwen/Qwen-Image-Edit](https://www.modelscope.cn/models/Qwen/Qwen-Image-Edit)|[code](./model_inference/Qwen-Image-Edit.py)|[code](./model_inference_low_vram/Qwen-Image-Edit.py)|[code](./model_training/full/Qwen-Image-Edit.sh)|[code](./model_training/validate_full/Qwen-Image-Edit.py)|[code](./model_training/lora/Qwen-Image-Edit.sh)|[code](./model_training/validate_lora/Qwen-Image-Edit.py)|
+|[Qwen/Qwen-Image-Edit-2509](https://www.modelscope.cn/models/Qwen/Qwen-Image-Edit-2509)|[code](./model_inference/Qwen-Image-Edit-2509.py)|[code](./model_inference_low_vram/Qwen-Image-Edit-2509.py)|[code](./model_training/full/Qwen-Image-Edit-2509.sh)|[code](./model_training/validate_full/Qwen-Image-Edit-2509.py)|[code](./model_training/lora/Qwen-Image-Edit-2509.sh)|[code](./model_training/validate_lora/Qwen-Image-Edit-2509.py)|
 |[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./model_inference/Qwen-Image-Distill-Full.py)|[code](./model_inference_low_vram/Qwen-Image-Distill-Full.py)|[code](./model_training/full/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_lora/Qwen-Image-Distill-Full.py)|
 |[DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA)|[code](./model_inference/Qwen-Image-Distill-LoRA.py)|[code](./model_inference_low_vram/Qwen-Image-Distill-LoRA.py)|-|-|[code](./model_training/lora/Qwen-Image-Distill-LoRA.sh)|[code](./model_training/validate_lora/Qwen-Image-Distill-LoRA.py)|
 |[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./model_inference/Qwen-Image-EliGen.py)|[code](./model_inference_low_vram/Qwen-Image-EliGen.py)|-|-|[code](./model_training/lora/Qwen-Image-EliGen.sh)|[code](./model_training/validate_lora/Qwen-Image-EliGen.py)|
diff --git a/examples/qwen_image/README_zh.md b/examples/qwen_image/README_zh.md
index a7f75e4..fa7a3d4 100644
--- a/examples/qwen_image/README_zh.md
+++ b/examples/qwen_image/README_zh.md
@@ -47,6 +47,7 @@ image.save("image.jpg")
 |-|-|-|-|-|-|-|
 |[Qwen/Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image)|[code](./model_inference/Qwen-Image.py)|[code](./model_inference_low_vram/Qwen-Image.py)|[code](./model_training/full/Qwen-Image.sh)|[code](./model_training/validate_full/Qwen-Image.py)|[code](./model_training/lora/Qwen-Image.sh)|[code](./model_training/validate_lora/Qwen-Image.py)|
 |[Qwen/Qwen-Image-Edit](https://www.modelscope.cn/models/Qwen/Qwen-Image-Edit)|[code](./model_inference/Qwen-Image-Edit.py)|[code](./model_inference_low_vram/Qwen-Image-Edit.py)|[code](./model_training/full/Qwen-Image-Edit.sh)|[code](./model_training/validate_full/Qwen-Image-Edit.py)|[code](./model_training/lora/Qwen-Image-Edit.sh)|[code](./model_training/validate_lora/Qwen-Image-Edit.py)|
+|[Qwen/Qwen-Image-Edit-2509](https://www.modelscope.cn/models/Qwen/Qwen-Image-Edit-2509)|[code](./model_inference/Qwen-Image-Edit-2509.py)|[code](./model_inference_low_vram/Qwen-Image-Edit-2509.py)|[code](./model_training/full/Qwen-Image-Edit-2509.sh)|[code](./model_training/validate_full/Qwen-Image-Edit-2509.py)|[code](./model_training/lora/Qwen-Image-Edit-2509.sh)|[code](./model_training/validate_lora/Qwen-Image-Edit-2509.py)|
 |[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./model_inference/Qwen-Image-Distill-Full.py)|[code](./model_inference_low_vram/Qwen-Image-Distill-Full.py)|[code](./model_training/full/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_lora/Qwen-Image-Distill-Full.py)|
 |[DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA)|[code](./model_inference/Qwen-Image-Distill-LoRA.py)|[code](./model_inference_low_vram/Qwen-Image-Distill-LoRA.py)|-|-|[code](./model_training/lora/Qwen-Image-Distill-LoRA.sh)|[code](./model_training/validate_lora/Qwen-Image-Distill-LoRA.py)|
 |[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./model_inference/Qwen-Image-EliGen.py)|[code](./model_inference_low_vram/Qwen-Image-EliGen.py)|-|-|[code](./model_training/lora/Qwen-Image-EliGen.sh)|[code](./model_training/validate_lora/Qwen-Image-EliGen.py)|
diff --git a/examples/qwen_image/model_inference/Qwen-Image-Edit-2509.py b/examples/qwen_image/model_inference/Qwen-Image-Edit-2509.py
new file mode 100644
index 0000000..45489cd
--- /dev/null
+++ b/examples/qwen_image/model_inference/Qwen-Image-Edit-2509.py
@@ -0,0 +1,25 @@
+from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig
+from PIL import Image
+import torch
+
+pipe = QwenImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Qwen/Qwen-Image-Edit-2509", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"),
+        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"),
+        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
+    ],
+    processor_config=ModelConfig(model_id="Qwen/Qwen-Image-Edit", origin_file_pattern="processor/"),
+)
+
+image_1 = pipe(prompt="一位少女", seed=0, num_inference_steps=40, height=1328, width=1024)
+image_1.save("image1.jpg")
+
+image_2 = pipe(prompt="一位老人", seed=0, num_inference_steps=40, height=1328, width=1024)
+image_2.save("image2.jpg")
+
+prompt = "生成这两个人的合影"
+edit_image = [Image.open("image1.jpg"), Image.open("image2.jpg")]
+image_3 = pipe(prompt, edit_image=edit_image, seed=1, num_inference_steps=40, height=1328, width=1024, edit_image_auto_resize=True)
+image_3.save("image3.jpg")
diff --git a/examples/qwen_image/model_inference/Qwen-Image-Edit-Lowres-Fix.py b/examples/qwen_image/model_inference/Qwen-Image-Edit-Lowres-Fix.py
index bbe3b7e..c18eaa8 100644
--- a/examples/qwen_image/model_inference/Qwen-Image-Edit-Lowres-Fix.py
+++ b/examples/qwen_image/model_inference/Qwen-Image-Edit-Lowres-Fix.py
@@ -10,7 +10,6 @@ pipe = QwenImagePipeline.from_pretrained(
         ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"),
         ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
     ],
-    tokenizer_config=None,
     processor_config=ModelConfig(model_id="Qwen/Qwen-Image-Edit", origin_file_pattern="processor/"),
 )
 snapshot_download("DiffSynth-Studio/Qwen-Image-Edit-Lowres-Fix", local_dir="models/DiffSynth-Studio/Qwen-Image-Edit-Lowres-Fix", allow_file_pattern="model.safetensors")
diff --git a/examples/qwen_image/model_inference/Qwen-Image-Edit.py b/examples/qwen_image/model_inference/Qwen-Image-Edit.py
index 39d39a6..8a47756 100644
--- a/examples/qwen_image/model_inference/Qwen-Image-Edit.py
+++ b/examples/qwen_image/model_inference/Qwen-Image-Edit.py
@@ -9,7 +9,6 @@ pipe = QwenImagePipeline.from_pretrained(
         ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"),
         ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
     ],
-    tokenizer_config=None,
     processor_config=ModelConfig(model_id="Qwen/Qwen-Image-Edit", origin_file_pattern="processor/"),
 )
 prompt = "精致肖像,水下少女,蓝裙飘逸,发丝轻扬,光影透澈,气泡环绕,面容恬静,细节精致,梦幻唯美。"
diff --git a/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-2509.py b/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-2509.py
new file mode 100644
index 0000000..833cba5
--- /dev/null
+++ b/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-2509.py
@@ -0,0 +1,26 @@
+from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig
+from PIL import Image
+import torch
+
+pipe = QwenImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Qwen/Qwen-Image-Edit-2509", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+    ],
+    processor_config=ModelConfig(model_id="Qwen/Qwen-Image-Edit", origin_file_pattern="processor/"),
+)
+pipe.enable_vram_management()
+
+image_1 = pipe(prompt="一位少女", seed=0, num_inference_steps=40, height=1328, width=1024)
+image_1.save("image1.jpg")
+
+image_2 = pipe(prompt="一位老人", seed=0, num_inference_steps=40, height=1328, width=1024)
+image_2.save("image2.jpg")
+
+prompt = "生成这两个人的合影"
+edit_image = [Image.open("image1.jpg"), Image.open("image2.jpg")]
+image_3 = pipe(prompt, edit_image=edit_image, seed=1, num_inference_steps=40, height=1328, width=1024, edit_image_auto_resize=True)
+image_3.save("image3.jpg")
diff --git a/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-Lowres-Fix.py b/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-Lowres-Fix.py
index 55c771f..5573fb5 100644
--- a/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-Lowres-Fix.py
+++ b/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-Lowres-Fix.py
@@ -10,7 +10,6 @@ pipe = QwenImagePipeline.from_pretrained(
         ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
         ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
     ],
-    tokenizer_config=None,
     processor_config=ModelConfig(model_id="Qwen/Qwen-Image-Edit", origin_file_pattern="processor/"),
 )
 pipe.enable_vram_management()
diff --git a/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit.py b/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit.py
index 5cc79fd..b3b9a5a 100644
--- a/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit.py
+++ b/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit.py
@@ -9,7 +9,6 @@ pipe = QwenImagePipeline.from_pretrained(
         ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
         ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
     ],
-    tokenizer_config=None,
     processor_config=ModelConfig(model_id="Qwen/Qwen-Image-Edit", origin_file_pattern="processor/"),
 )
 pipe.enable_vram_management()
diff --git a/examples/qwen_image/model_training/full/Qwen-Image-Edit-2509.sh b/examples/qwen_image/model_training/full/Qwen-Image-Edit-2509.sh
new file mode 100644
index 0000000..a41f2da
--- /dev/null
+++ b/examples/qwen_image/model_training/full/Qwen-Image-Edit-2509.sh
@@ -0,0 +1,15 @@
+accelerate launch --config_file examples/qwen_image/model_training/full/accelerate_config_zero2offload.yaml examples/qwen_image/model_training/train.py \
+  --dataset_base_path data/example_image_dataset \
+  --dataset_metadata_path data/example_image_dataset/metadata_edit.csv \
+  --data_file_keys "image,edit_image" \
+  --extra_inputs "edit_image" \
+  --max_pixels 1048576 \
+  --dataset_repeat 50 \
+  --model_id_with_origin_paths "Qwen/Qwen-Image-Edit-2509:transformer/diffusion_pytorch_model*.safetensors,Qwen/Qwen-Image:text_encoder/model*.safetensors,Qwen/Qwen-Image:vae/diffusion_pytorch_model.safetensors" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Qwen-Image-Edit-2509_full" \
+  --trainable_models "dit" \
+  --use_gradient_checkpointing \
+  --find_unused_parameters
diff --git a/examples/qwen_image/model_training/lora/Qwen-Image-Edit-2509.sh b/examples/qwen_image/model_training/lora/Qwen-Image-Edit-2509.sh
new file mode 100644
index 0000000..3bbf742
--- /dev/null
+++ b/examples/qwen_image/model_training/lora/Qwen-Image-Edit-2509.sh
@@ -0,0 +1,18 @@
+accelerate launch examples/qwen_image/model_training/train.py \
+  --dataset_base_path data/example_image_dataset \
+  --dataset_metadata_path data/example_image_dataset/metadata_edit.csv \
+  --data_file_keys "image,edit_image" \
+  --extra_inputs "edit_image" \
+  --max_pixels 1048576 \
+  --dataset_repeat 50 \
+  --model_id_with_origin_paths "Qwen/Qwen-Image-Edit-2509:transformer/diffusion_pytorch_model*.safetensors,Qwen/Qwen-Image:text_encoder/model*.safetensors,Qwen/Qwen-Image:vae/diffusion_pytorch_model.safetensors" \
+  --learning_rate 1e-4 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Qwen-Image-Edit-2509_lora" \
+  --lora_base_model "dit" \
+  --lora_target_modules "to_q,to_k,to_v,add_q_proj,add_k_proj,add_v_proj,to_out.0,to_add_out,img_mlp.net.2,img_mod.1,txt_mlp.net.2,txt_mod.1" \
+  --lora_rank 32 \
+  --use_gradient_checkpointing \
+  --dataset_num_workers 8 \
+  --find_unused_parameters
diff --git a/examples/qwen_image/model_training/validate_full/Qwen-Image-Edit-2509.py b/examples/qwen_image/model_training/validate_full/Qwen-Image-Edit-2509.py
new file mode 100644
index 0000000..ff1fdbf
--- /dev/null
+++ b/examples/qwen_image/model_training/validate_full/Qwen-Image-Edit-2509.py
@@ -0,0 +1,23 @@
+import torch
+from PIL import Image
+from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig
+from diffsynth import load_state_dict
+
+pipe = QwenImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Qwen/Qwen-Image-Edit-2509", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"),
+        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"),
+        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
+    ],
+    tokenizer_config=None,
+    processor_config=ModelConfig(model_id="Qwen/Qwen-Image-Edit", origin_file_pattern="processor/"),
+)
+state_dict = load_state_dict("models/train/Qwen-Image-Edit-2509_full/epoch-1.safetensors")
+pipe.dit.load_state_dict(state_dict)
+
+prompt = "将裙子改为粉色"
+image = Image.open("data/example_image_dataset/edit/image1.jpg").resize((1024, 1024))
+image = pipe(prompt, edit_image=image, seed=0, num_inference_steps=40, height=1024, width=1024)
+image.save(f"image.jpg")
diff --git a/examples/qwen_image/model_training/validate_lora/Qwen-Image-Edit-2509.py b/examples/qwen_image/model_training/validate_lora/Qwen-Image-Edit-2509.py
new file mode 100644
index 0000000..4eaf428
--- /dev/null
+++ b/examples/qwen_image/model_training/validate_lora/Qwen-Image-Edit-2509.py
@@ -0,0 +1,21 @@
+import torch
+from PIL import Image
+from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig
+
+pipe = QwenImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="Qwen/Qwen-Image-Edit-2509", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"),
+        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"),
+        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
+    ],
+    tokenizer_config=None,
+    processor_config=ModelConfig(model_id="Qwen/Qwen-Image-Edit", origin_file_pattern="processor/"),
+)
+pipe.load_lora(pipe.dit, "models/train/Qwen-Image-Edit-2509_lora/epoch-4.safetensors")
+
+prompt = "将裙子改为粉色"
+image = Image.open("data/example_image_dataset/edit/image1.jpg").resize((1024, 1024))
+image = pipe(prompt, edit_image=image, seed=0, num_inference_steps=40, height=1024, width=1024)
+image.save(f"image.jpg")