diff --git a/README.md b/README.md index 78fce50..71329ca 100644 --- a/README.md +++ b/README.md @@ -93,8 +93,8 @@ image.save("image.jpg") |[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./examples/qwen_image/model_inference/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-Full.py)| |[DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA)|[code](./examples/qwen_image/model_inference/Qwen-Image-Distill-LoRA.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-Distill-LoRA.py)|-|-|-|-| |[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)| -|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny)|[code](./examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Canny.py)|-|-|-|-| -|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth)|[code](./examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Depth.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Depth.py)|-|-|-|-| +|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny)|[code](./examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Canny.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image-Blockwise-ControlNet-Canny.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Canny.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image-Blockwise-ControlNet-Canny.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Canny.py)| +|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth)|[code](./examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Depth.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Depth.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image-Blockwise-ControlNet-Depth.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Depth.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image-Blockwise-ControlNet-Depth.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Depth.py)| diff --git a/README_zh.md b/README_zh.md index a058538..1596482 100644 --- a/README_zh.md +++ b/README_zh.md @@ -95,8 +95,8 @@ image.save("image.jpg") |[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./examples/qwen_image/model_inference/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-Full.py)| |[DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA)|[code](./examples/qwen_image/model_inference/Qwen-Image-Distill-LoRA.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-Distill-LoRA.py)|-|-|-|-| |[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./examples/qwen_image/model_inference/Qwen-Image-EliGen.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen.py)|-|-|[code](./examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py)| -|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny)|[code](./examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Canny.py)|-|-|-|-| -|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth)|[code](./examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Depth.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Depth.py)|-|-|-|-| +|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny)|[code](./examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Canny.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image-Blockwise-ControlNet-Canny.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Canny.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image-Blockwise-ControlNet-Canny.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Canny.py)| +|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth)|[code](./examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Depth.py)|[code](./examples/qwen_image/model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Depth.py)|[code](./examples/qwen_image/model_training/full/Qwen-Image-Blockwise-ControlNet-Depth.sh)|[code](./examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Depth.py)|[code](./examples/qwen_image/model_training/lora/Qwen-Image-Blockwise-ControlNet-Depth.sh)|[code](./examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Depth.py)| diff --git a/examples/qwen_image/README.md b/examples/qwen_image/README.md index 49456ff..357ee7f 100644 --- a/examples/qwen_image/README.md +++ b/examples/qwen_image/README.md @@ -46,8 +46,8 @@ image.save("image.jpg") |[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./model_inference/Qwen-Image-Distill-Full.py)|[code](./model_inference_low_vram/Qwen-Image-Distill-Full.py)|[code](./model_training/full/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_lora/Qwen-Image-Distill-Full.py)| |[DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA)|[code](./model_inference/Qwen-Image-Distill-LoRA.py)|[code](./model_inference_low_vram/Qwen-Image-Distill-LoRA.py)|-|-|-|-| |[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./model_inference/Qwen-Image-EliGen.py)|[code](./model_inference_low_vram/Qwen-Image-EliGen.py)|-|-|[code](./model_training/lora/Qwen-Image-EliGen.sh)|[code](./model_training/validate_lora/Qwen-Image-EliGen.py)| -|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny)|[code](./model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py)|[code](./model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Canny.py)|-|-|-|-| -|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth)|[code](./model_inference/Qwen-Image-Blockwise-ControlNet-Depth.py)|[code](./model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Depth.py)|-|-|-|-| +|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny)|[code](./model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py)|[code](./model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Canny.py)|[code](./model_training/full/Qwen-Image-Blockwise-ControlNet-Canny.sh)|[code](./model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Canny.py)|[code](./model_training/lora/Qwen-Image-Blockwise-ControlNet-Canny.sh)|[code](./model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Canny.py)| +|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth)|[code](./model_inference/Qwen-Image-Blockwise-ControlNet-Depth.py)|[code](./model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Depth.py)|[code](./model_training/full/Qwen-Image-Blockwise-ControlNet-Depth.sh)|[code](./model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Depth.py)|[code](./model_training/lora/Qwen-Image-Blockwise-ControlNet-Depth.sh)|[code](./model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Depth.py)| ## Model Inference diff --git a/examples/qwen_image/README_zh.md b/examples/qwen_image/README_zh.md index e245ba1..418bf25 100644 --- a/examples/qwen_image/README_zh.md +++ b/examples/qwen_image/README_zh.md @@ -46,8 +46,8 @@ image.save("image.jpg") |[DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full)|[code](./model_inference/Qwen-Image-Distill-Full.py)|[code](./model_inference_low_vram/Qwen-Image-Distill-Full.py)|[code](./model_training/full/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_full/Qwen-Image-Distill-Full.py)|[code](./model_training/lora/Qwen-Image-Distill-Full.sh)|[code](./model_training/validate_lora/Qwen-Image-Distill-Full.py)| |[DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA)|[code](./model_inference/Qwen-Image-Distill-LoRA.py)|[code](./model_inference_low_vram/Qwen-Image-Distill-LoRA.py)|-|-|-|-| |[DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen)|[code](./model_inference/Qwen-Image-EliGen.py)|[code](./model_inference_low_vram/Qwen-Image-EliGen.py)|-|-|[code](./model_training/lora/Qwen-Image-EliGen.sh)|[code](./model_training/validate_lora/Qwen-Image-EliGen.py)| -|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny)|[code](./model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py)|[code](./model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Canny.py)|-|-|-|-| -|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth)|[code](./model_inference/Qwen-Image-Blockwise-ControlNet-Depth.py)|[code](./model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Depth.py)|-|-|-|-| +|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny)|[code](./model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py)|[code](./model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Canny.py)|[code](./model_training/full/Qwen-Image-Blockwise-ControlNet-Canny.sh)|[code](./model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Canny.py)|[code](./model_training/lora/Qwen-Image-Blockwise-ControlNet-Canny.sh)|[code](./model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Canny.py)| +|[DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth)|[code](./model_inference/Qwen-Image-Blockwise-ControlNet-Depth.py)|[code](./model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Depth.py)|[code](./model_training/full/Qwen-Image-Blockwise-ControlNet-Depth.sh)|[code](./model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Depth.py)|[code](./model_training/lora/Qwen-Image-Blockwise-ControlNet-Depth.sh)|[code](./model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Depth.py)| ## 模型推理 diff --git a/examples/qwen_image/model_training/full/Qwen-Image-Blockwise-ControlNet-Canny.sh b/examples/qwen_image/model_training/full/Qwen-Image-Blockwise-ControlNet-Canny.sh new file mode 100644 index 0000000..f563fd1 --- /dev/null +++ b/examples/qwen_image/model_training/full/Qwen-Image-Blockwise-ControlNet-Canny.sh @@ -0,0 +1,38 @@ +accelerate launch examples/qwen_image/model_training/train.py \ + --dataset_base_path data/example_image_dataset \ + --dataset_metadata_path data/example_image_dataset/metadata_blockwise_controlnet_canny.csv \ + --data_file_keys "image,blockwise_controlnet_image" \ + --max_pixels 1048576 \ + --dataset_repeat 50 \ + --model_id_with_origin_paths "Qwen/Qwen-Image:transformer/diffusion_pytorch_model*.safetensors,Qwen/Qwen-Image:text_encoder/model*.safetensors,Qwen/Qwen-Image:vae/diffusion_pytorch_model.safetensors,DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny:model.safetensors" \ + --learning_rate 1e-4 \ + --num_epochs 2 \ + --remove_prefix_in_ckpt "pipe.blockwise_controlnet.models.0." \ + --output_path "./models/train/Qwen-Image-Blockwise-ControlNet-Canny_full" \ + --trainable_models "blockwise_controlnet" \ + --extra_inputs "blockwise_controlnet_image" \ + --use_gradient_checkpointing \ + --find_unused_parameters + +# If you want to pre-train a Blockwise ControlNet from scratch, +# please run the following script to first generate the initialized model weights file, +# and then start training with a high learning rate (1e-3). + +# python examples/qwen_image/model_training/scripts/Qwen-Image-Blockwise-ControlNet-Initialize.py + +# accelerate launch examples/qwen_image/model_training/train.py \ +# --dataset_base_path data/example_image_dataset \ +# --dataset_metadata_path data/example_image_dataset/metadata_blockwise_controlnet_canny.csv \ +# --data_file_keys "image,blockwise_controlnet_image" \ +# --max_pixels 1048576 \ +# --dataset_repeat 50 \ +# --model_id_with_origin_paths "Qwen/Qwen-Image:transformer/diffusion_pytorch_model*.safetensors,Qwen/Qwen-Image:text_encoder/model*.safetensors,Qwen/Qwen-Image:vae/diffusion_pytorch_model.safetensors" \ +# --model_paths '["models/blockwise_controlnet.safetensors"]' \ +# --learning_rate 1e-3 \ +# --num_epochs 2 \ +# --remove_prefix_in_ckpt "pipe.blockwise_controlnet.models.0." \ +# --output_path "./models/train/Qwen-Image-Blockwise-ControlNet-Canny_full" \ +# --trainable_models "blockwise_controlnet" \ +# --extra_inputs "blockwise_controlnet_image" \ +# --use_gradient_checkpointing \ +# --find_unused_parameters \ No newline at end of file diff --git a/examples/qwen_image/model_training/full/Qwen-Image-Blockwise-ControlNet-Depth.sh b/examples/qwen_image/model_training/full/Qwen-Image-Blockwise-ControlNet-Depth.sh new file mode 100644 index 0000000..2bd2926 --- /dev/null +++ b/examples/qwen_image/model_training/full/Qwen-Image-Blockwise-ControlNet-Depth.sh @@ -0,0 +1,38 @@ +accelerate launch examples/qwen_image/model_training/train.py \ + --dataset_base_path data/example_image_dataset \ + --dataset_metadata_path data/example_image_dataset/metadata_blockwise_controlnet_depth.csv \ + --data_file_keys "image,blockwise_controlnet_image" \ + --max_pixels 1048576 \ + --dataset_repeat 50 \ + --model_id_with_origin_paths "Qwen/Qwen-Image:transformer/diffusion_pytorch_model*.safetensors,Qwen/Qwen-Image:text_encoder/model*.safetensors,Qwen/Qwen-Image:vae/diffusion_pytorch_model.safetensors,DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth:model.safetensors" \ + --learning_rate 1e-4 \ + --num_epochs 2 \ + --remove_prefix_in_ckpt "pipe.blockwise_controlnet.models.0." \ + --output_path "./models/train/Qwen-Image-Blockwise-ControlNet-Depth_full" \ + --trainable_models "blockwise_controlnet" \ + --extra_inputs "blockwise_controlnet_image" \ + --use_gradient_checkpointing \ + --find_unused_parameters + +# If you want to pre-train a Blockwise ControlNet from scratch, +# please run the following script to first generate the initialized model weights file, +# and then start training with a high learning rate (1e-3). + +# python examples/qwen_image/model_training/scripts/Qwen-Image-Blockwise-ControlNet-Initialize.py + +# accelerate launch examples/qwen_image/model_training/train.py \ +# --dataset_base_path data/example_image_dataset \ +# --dataset_metadata_path data/example_image_dataset/metadata_blockwise_controlnet_depth.csv \ +# --data_file_keys "image,blockwise_controlnet_image" \ +# --max_pixels 1048576 \ +# --dataset_repeat 50 \ +# --model_id_with_origin_paths "Qwen/Qwen-Image:transformer/diffusion_pytorch_model*.safetensors,Qwen/Qwen-Image:text_encoder/model*.safetensors,Qwen/Qwen-Image:vae/diffusion_pytorch_model.safetensors" \ +# --model_paths '["models/blockwise_controlnet.safetensors"]' \ +# --learning_rate 1e-3 \ +# --num_epochs 2 \ +# --remove_prefix_in_ckpt "pipe.blockwise_controlnet.models.0." \ +# --output_path "./models/train/Qwen-Image-Blockwise-ControlNet-Depth_full" \ +# --trainable_models "blockwise_controlnet" \ +# --extra_inputs "blockwise_controlnet_image" \ +# --use_gradient_checkpointing \ +# --find_unused_parameters \ No newline at end of file diff --git a/examples/qwen_image/model_training/full/Qwen-Image-Distill-Full.sh b/examples/qwen_image/model_training/full/Qwen-Image-Distill-Full.sh index 6343754..a56fe9d 100644 --- a/examples/qwen_image/model_training/full/Qwen-Image-Distill-Full.sh +++ b/examples/qwen_image/model_training/full/Qwen-Image-Distill-Full.sh @@ -9,4 +9,5 @@ accelerate launch --config_file examples/qwen_image/model_training/full/accelera --remove_prefix_in_ckpt "pipe.dit." \ --output_path "./models/train/Qwen-Image-Distill-Full_full" \ --trainable_models "dit" \ - --use_gradient_checkpointing + --use_gradient_checkpointing \ + --find_unused_parameters diff --git a/examples/qwen_image/model_training/full/Qwen-Image.sh b/examples/qwen_image/model_training/full/Qwen-Image.sh index 2f5eef0..979101e 100644 --- a/examples/qwen_image/model_training/full/Qwen-Image.sh +++ b/examples/qwen_image/model_training/full/Qwen-Image.sh @@ -9,4 +9,5 @@ accelerate launch --config_file examples/qwen_image/model_training/full/accelera --remove_prefix_in_ckpt "pipe.dit." \ --output_path "./models/train/Qwen-Image_full" \ --trainable_models "dit" \ - --use_gradient_checkpointing + --use_gradient_checkpointing \ + --find_unused_parameters diff --git a/examples/qwen_image/model_training/lora/Qwen-Image-Blockwise-ControlNet-Canny.sh b/examples/qwen_image/model_training/lora/Qwen-Image-Blockwise-ControlNet-Canny.sh new file mode 100644 index 0000000..2263134 --- /dev/null +++ b/examples/qwen_image/model_training/lora/Qwen-Image-Blockwise-ControlNet-Canny.sh @@ -0,0 +1,17 @@ +accelerate launch examples/qwen_image/model_training/train.py \ + --dataset_base_path data/example_image_dataset \ + --dataset_metadata_path data/example_image_dataset/metadata_blockwise_controlnet_canny.csv \ + --data_file_keys "image,blockwise_controlnet_image" \ + --max_pixels 1048576 \ + --dataset_repeat 50 \ + --model_id_with_origin_paths "Qwen/Qwen-Image:transformer/diffusion_pytorch_model*.safetensors,Qwen/Qwen-Image:text_encoder/model*.safetensors,Qwen/Qwen-Image:vae/diffusion_pytorch_model.safetensors,DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny:model.safetensors" \ + --learning_rate 1e-4 \ + --num_epochs 5 \ + --remove_prefix_in_ckpt "pipe.dit." \ + --output_path "./models/train/Qwen-Image-Blockwise-ControlNet-Canny_lora" \ + --lora_base_model "dit" \ + --lora_target_modules "to_q,to_k,to_v,add_q_proj,add_k_proj,add_v_proj,to_out.0,to_add_out,img_mlp.net.2,img_mod.1,txt_mlp.net.2,txt_mod.1" \ + --lora_rank 32 \ + --extra_inputs "blockwise_controlnet_image" \ + --use_gradient_checkpointing \ + --find_unused_parameters diff --git a/examples/qwen_image/model_training/lora/Qwen-Image-Blockwise-ControlNet-Depth.sh b/examples/qwen_image/model_training/lora/Qwen-Image-Blockwise-ControlNet-Depth.sh new file mode 100644 index 0000000..60d3ca3 --- /dev/null +++ b/examples/qwen_image/model_training/lora/Qwen-Image-Blockwise-ControlNet-Depth.sh @@ -0,0 +1,17 @@ +accelerate launch examples/qwen_image/model_training/train.py \ + --dataset_base_path data/example_image_dataset \ + --dataset_metadata_path data/example_image_dataset/metadata_blockwise_controlnet_depth.csv \ + --data_file_keys "image,blockwise_controlnet_image" \ + --max_pixels 1048576 \ + --dataset_repeat 50 \ + --model_id_with_origin_paths "Qwen/Qwen-Image:transformer/diffusion_pytorch_model*.safetensors,Qwen/Qwen-Image:text_encoder/model*.safetensors,Qwen/Qwen-Image:vae/diffusion_pytorch_model.safetensors,DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth:model.safetensors" \ + --learning_rate 1e-4 \ + --num_epochs 5 \ + --remove_prefix_in_ckpt "pipe.dit." \ + --output_path "./models/train/Qwen-Image-Blockwise-ControlNet-Depth_lora" \ + --lora_base_model "dit" \ + --lora_target_modules "to_q,to_k,to_v,add_q_proj,add_k_proj,add_v_proj,to_out.0,to_add_out,img_mlp.net.2,img_mod.1,txt_mlp.net.2,txt_mod.1" \ + --lora_rank 32 \ + --extra_inputs "blockwise_controlnet_image" \ + --use_gradient_checkpointing \ + --find_unused_parameters diff --git a/examples/qwen_image/model_training/lora/Qwen-Image-Distill-Full.sh b/examples/qwen_image/model_training/lora/Qwen-Image-Distill-Full.sh index cb487a2..79d7c37 100644 --- a/examples/qwen_image/model_training/lora/Qwen-Image-Distill-Full.sh +++ b/examples/qwen_image/model_training/lora/Qwen-Image-Distill-Full.sh @@ -11,4 +11,5 @@ accelerate launch examples/qwen_image/model_training/train.py \ --lora_base_model "dit" \ --lora_target_modules "to_q,to_k,to_v,add_q_proj,add_k_proj,add_v_proj,to_out.0,to_add_out,img_mlp.net.2,img_mod.1,txt_mlp.net.2,txt_mod.1" \ --lora_rank 32 \ - --use_gradient_checkpointing + --use_gradient_checkpointing \ + --find_unused_parameters diff --git a/examples/qwen_image/model_training/scripts/Qwen-Image-Blockwise-ControlNet-Initialize.py b/examples/qwen_image/model_training/scripts/Qwen-Image-Blockwise-ControlNet-Initialize.py new file mode 100644 index 0000000..5b0392f --- /dev/null +++ b/examples/qwen_image/model_training/scripts/Qwen-Image-Blockwise-ControlNet-Initialize.py @@ -0,0 +1,13 @@ +# This script is for initializing a Qwen-Image-Blockwise-ControlNet +from diffsynth import hash_state_dict_keys +from diffsynth.models.qwen_image_controlnet import QwenImageBlockWiseControlNet +import torch +from safetensors.torch import save_file + + +controlnet = QwenImageBlockWiseControlNet().to(dtype=torch.bfloat16, device="cuda") +controlnet.init_weight() +state_dict_controlnet = controlnet.state_dict() + +print(hash_state_dict_keys(state_dict_controlnet)) +save_file(state_dict_controlnet, "models/blockwise_controlnet.safetensors") diff --git a/examples/qwen_image/model_training/train.py b/examples/qwen_image/model_training/train.py index 56db9e2..5f26927 100644 --- a/examples/qwen_image/model_training/train.py +++ b/examples/qwen_image/model_training/train.py @@ -80,17 +80,18 @@ class QwenImageTrainingModule(DiffusionTrainingModule): } # Extra inputs - controlnet_input = {} + controlnet_input, blockwise_controlnet_input = {}, {} for extra_input in self.extra_inputs: if extra_input.startswith("blockwise_controlnet_"): - controlnet_input[extra_input.replace("blockwise_controlnet_", "")] = data[extra_input] + blockwise_controlnet_input[extra_input.replace("blockwise_controlnet_", "")] = data[extra_input] elif extra_input.startswith("controlnet_"): controlnet_input[extra_input.replace("controlnet_", "")] = data[extra_input] else: inputs_shared[extra_input] = data[extra_input] if len(controlnet_input) > 0: - controlnet_key = "blockwise_controlnet_inputs" if "blockwise_controlnet_image" in self.extra_inputs else "controlnet_inputs" - inputs_shared[controlnet_key] = [ControlNetInput(**controlnet_input)] + inputs_shared["controlnet_inputs"] = [ControlNetInput(**controlnet_input)] + if len(blockwise_controlnet_input) > 0: + inputs_shared["blockwise_controlnet_inputs"] = [ControlNetInput(**blockwise_controlnet_input)] # Pipeline units will automatically process the input parameters. for unit in self.pipe.units: diff --git a/examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Canny.py b/examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Canny.py new file mode 100644 index 0000000..6ae4d5b --- /dev/null +++ b/examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Canny.py @@ -0,0 +1,31 @@ +from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig, ControlNetInput +from PIL import Image +import torch +from modelscope import dataset_snapshot_download + + +pipe = QwenImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), + ModelConfig(path="models/train/Qwen-Image-Blockwise-ControlNet-Canny_full/epoch-1.safetensors"), + ], + tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), +) + +dataset_snapshot_download( + dataset_id="DiffSynth-Studio/example_image_dataset", + local_dir="./data/example_image_dataset", + allow_file_pattern="canny/image_1.jpg" +) +controlnet_image = Image.open("data/example_image_dataset/canny/image_1.jpg").resize((1328, 1328)) + +prompt = "一只小狗,毛发光洁柔顺,眼神灵动,背景是樱花纷飞的春日庭院,唯美温馨。" +image = pipe( + prompt, seed=0, + blockwise_controlnet_inputs=[ControlNetInput(image=controlnet_image)] +) +image.save("image.jpg") diff --git a/examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Depth.py b/examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Depth.py new file mode 100644 index 0000000..18b597e --- /dev/null +++ b/examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Depth.py @@ -0,0 +1,31 @@ +from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig, ControlNetInput +from PIL import Image +import torch +from modelscope import dataset_snapshot_download + + +pipe = QwenImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), + ModelConfig(path="models/train/Qwen-Image-Blockwise-ControlNet-Depth_full/epoch-1.safetensors"), + ], + tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), +) + +dataset_snapshot_download( + dataset_id="DiffSynth-Studio/example_image_dataset", + local_dir="./data/example_image_dataset", + allow_file_pattern="depth/image_1.jpg" +) +controlnet_image = Image.open("data/example_image_dataset/depth/image_1.jpg").resize((1328, 1328)) + +prompt = "精致肖像,水下少女,蓝裙飘逸,发丝轻扬,光影透澈,气泡环绕,面容恬静,细节精致,梦幻唯美。" +image = pipe( + prompt, seed=0, + blockwise_controlnet_inputs=[ControlNetInput(image=controlnet_image)] +) +image.save("image.jpg") diff --git a/examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Canny.py b/examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Canny.py new file mode 100644 index 0000000..4a54b5e --- /dev/null +++ b/examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Canny.py @@ -0,0 +1,32 @@ +from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig, ControlNetInput +from PIL import Image +import torch +from modelscope import dataset_snapshot_download + + +pipe = QwenImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), + ModelConfig(model_id="DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny", origin_file_pattern="model.safetensors"), + ], + tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), +) +pipe.load_lora(pipe.dit, "models/train/Qwen-Image-Blockwise-ControlNet-Canny_lora/epoch-4.safetensors") + +dataset_snapshot_download( + dataset_id="DiffSynth-Studio/example_image_dataset", + local_dir="./data/example_image_dataset", + allow_file_pattern="canny/image_1.jpg" +) +controlnet_image = Image.open("data/example_image_dataset/canny/image_1.jpg").resize((1328, 1328)) + +prompt = "一只小狗,毛发光洁柔顺,眼神灵动,背景是樱花纷飞的春日庭院,唯美温馨。" +image = pipe( + prompt, seed=0, + blockwise_controlnet_inputs=[ControlNetInput(image=controlnet_image)] +) +image.save("image.jpg") diff --git a/examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Depth.py b/examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Depth.py new file mode 100644 index 0000000..6266545 --- /dev/null +++ b/examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Depth.py @@ -0,0 +1,33 @@ +from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig, ControlNetInput +from PIL import Image +import torch +from modelscope import dataset_snapshot_download + + +pipe = QwenImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), + ModelConfig(model_id="DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth", origin_file_pattern="model.safetensors"), + ], + tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), +) +pipe.load_lora(pipe.dit, "models/train/Qwen-Image-Blockwise-ControlNet-Depth_lora/epoch-4.safetensors") + +dataset_snapshot_download( + dataset_id="DiffSynth-Studio/example_image_dataset", + local_dir="./data/example_image_dataset", + allow_file_pattern="depth/image_1.jpg" +) + +controlnet_image = Image.open("data/example_image_dataset/depth/image_1.jpg").resize((1328, 1328)) + +prompt = "精致肖像,水下少女,蓝裙飘逸,发丝轻扬,光影透澈,气泡环绕,面容恬静,细节精致,梦幻唯美。" +image = pipe( + prompt, seed=0, + blockwise_controlnet_inputs=[ControlNetInput(image=controlnet_image)] +) +image.save("image.jpg")