support training for eligen and nexusgen

2026-03-18 22:08:13 +00:00 · 2025-07-29 13:28:42 +08:00
parent 2861ec4d9f
commit 8ef91b3672
14 changed files with 218 additions and 17 deletions
--- a/examples/flux/model_inference/Nexus-Gen-Editing.py
+++ b/examples/flux/model_inference/Nexus-Gen-Editing.py
@@ -2,7 +2,7 @@ import importlib
 import torch
 from PIL import Image
 from diffsynth.pipelines.flux_image_new import FluxImagePipeline, ModelConfig
-from modelscope import snapshot_download
+from modelscope import snapshot_download, dataset_snapshot_download

 if importlib.util.find_spec("transformers") is None:
    raise ImportError("You are using Nexus-GenV2. It depends on transformers, which is not installed. Please install it with `pip install transformers==4.49.0`.")
@@ -23,12 +23,13 @@ pipe = FluxImagePipeline.from_pretrained(
    ],
 )

-prompt = "给猫加一副太阳镜"
-ref_image = Image.open("cat.png").convert("RGB")
+dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern=f"data/examples/nexusgen/cat.jpg")
+ref_image = Image.open("data/examples/nexusgen/cat.jpg").convert("RGB")
+prompt = "Add a crown."
 image = pipe(
    prompt=prompt, negative_prompt="",
-    seed=0, cfg_scale=1.0, num_inference_steps=50,
+    seed=42, cfg_scale=2.0, num_inference_steps=50,
    nexus_gen_reference_image=ref_image,
    height=512, width=512,
 )
-image.save("cat_glasses.jpg")
+image.save("cat_crown.jpg")
--- a/examples/flux/model_inference_low_vram/Nexus-Gen-Editing.py
+++ b/examples/flux/model_inference_low_vram/Nexus-Gen-Editing.py
@@ -0,0 +1,36 @@
+import importlib
+import torch
+from PIL import Image
+from diffsynth.pipelines.flux_image_new import FluxImagePipeline, ModelConfig
+from modelscope import snapshot_download, dataset_snapshot_download
+
+if importlib.util.find_spec("transformers") is None:
+    raise ImportError("You are using Nexus-GenV2. It depends on transformers, which is not installed. Please install it with `pip install transformers==4.49.0`.")
+else:
+    import transformers
+    assert transformers.__version__ == "4.49.0", "Nexus-GenV2 requires transformers==4.49.0, please install it with `pip install transformers==4.49.0`."
+
+snapshot_download("DiffSynth-Studio/Nexus-GenV2", local_dir="models/DiffSynth-Studio/Nexus-GenV2")
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="model*.safetensors"),
+        ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="edit_decoder.bin"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"),
+    ],
+)
+pipe.enable_vram_management()
+
+dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern=f"data/examples/nexusgen/cat.jpg")
+ref_image = Image.open("data/examples/nexusgen/cat.jpg").convert("RGB")
+prompt = "Add a crown."
+image = pipe(
+    prompt=prompt, negative_prompt="",
+    seed=42, cfg_scale=2.0, num_inference_steps=50,
+    nexus_gen_reference_image=ref_image,
+    height=512, width=512,
+)
+image.save("cat_crown.jpg")
--- a/examples/flux/model_training/full/FLUX.1-NexusGen-Edit.sh
+++ b/examples/flux/model_training/full/FLUX.1-NexusGen-Edit.sh
@@ -0,0 +1,14 @@
+accelerate launch --config_file examples/flux/model_training/full/accelerate_config_zero2offload.yaml examples/flux/model_training/train.py \
+  --dataset_base_path data/example_image_dataset \
+  --dataset_metadata_path data/example_image_dataset/metadata_nexusgen_edit.csv \
+  --data_file_keys "image,nexus_gen_reference_image" \
+  --max_pixels 262144 \
+  --dataset_repeat 400 \
+  --model_id_with_origin_paths "DiffSynth-Studio/Nexus-GenV2:model*.safetensors,DiffSynth-Studio/Nexus-GenV2:edit_decoder.bin,black-forest-labs/FLUX.1-dev:text_encoder/model.safetensors,black-forest-labs/FLUX.1-dev:text_encoder_2/,black-forest-labs/FLUX.1-dev:ae.safetensors" \
+  --learning_rate 1e-5 \
+  --num_epochs 1 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/FLUX.1-NexusGen-Edit_full" \
+  --trainable_models "dit" \
+  --extra_inputs "nexus_gen_reference_image" \
+  --use_gradient_checkpointing_offload
--- a/examples/flux/model_training/full/accelerate_config_zero2offload.yaml
+++ b/examples/flux/model_training/full/accelerate_config_zero2offload.yaml
@@ -0,0 +1,22 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  gradient_accumulation_steps: 1
+  offload_optimizer_device: 'cpu'
+  offload_param_device: 'cpu'
+  zero3_init_flag: false
+  zero_stage: 2
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+enable_cpu_affinity: false
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
--- a/examples/flux/model_training/lora/FLUX.1-NexusGen-Edit.sh
+++ b/examples/flux/model_training/lora/FLUX.1-NexusGen-Edit.sh
@@ -0,0 +1,17 @@
+accelerate launch examples/flux/model_training/train.py \
+  --dataset_base_path data/example_image_dataset \
+  --dataset_metadata_path data/example_image_dataset/metadata_nexusgen_edit.csv \
+  --data_file_keys "image,nexus_gen_reference_image" \
+  --max_pixels 1048576 \
+  --dataset_repeat 400 \
+  --model_id_with_origin_paths "DiffSynth-Studio/Nexus-GenV2:model*.safetensors,DiffSynth-Studio/Nexus-GenV2:edit_decoder.bin,black-forest-labs/FLUX.1-dev:text_encoder/model.safetensors,black-forest-labs/FLUX.1-dev:text_encoder_2/,black-forest-labs/FLUX.1-dev:ae.safetensors" \
+  --learning_rate 1e-4 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/FLUX.1-NexusGen-Edit_lora" \
+  --lora_base_model "dit" \
+  --lora_target_modules "a_to_qkv,b_to_qkv,ff_a.0,ff_a.2,ff_b.0,ff_b.2,a_to_out,b_to_out,proj_out,norm.linear,norm1_a.linear,norm1_b.linear,to_qkv_mlp" \
+  --lora_rank 32 \
+  --align_to_opensource_format \
+  --extra_inputs "nexus_gen_reference_image" \
+  --use_gradient_checkpointing
--- a/examples/flux/model_training/lora/FLUX.1-dev-EliGen.sh
+++ b/examples/flux/model_training/lora/FLUX.1-dev-EliGen.sh
@@ -0,0 +1,17 @@
+accelerate launch examples/flux/model_training/train.py \
+  --dataset_base_path data/example_image_dataset \
+  --dataset_metadata_path data/example_image_dataset/metadata_eligen.json \
+  --data_file_keys "image,eligen_entity_masks" \
+  --max_pixels 1048576 \
+  --dataset_repeat 50 \
+  --model_id_with_origin_paths "black-forest-labs/FLUX.1-dev:flux1-dev.safetensors,black-forest-labs/FLUX.1-dev:text_encoder/model.safetensors,black-forest-labs/FLUX.1-dev:text_encoder_2/,black-forest-labs/FLUX.1-dev:ae.safetensors" \
+  --learning_rate 1e-4 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/FLUX.1-dev-EliGen_lora" \
+  --lora_base_model "dit" \
+  --lora_target_modules "a_to_qkv,b_to_qkv,ff_a.0,ff_a.2,ff_b.0,ff_b.2,a_to_out,b_to_out,proj_out,norm.linear,norm1_a.linear,norm1_b.linear,to_qkv_mlp" \
+  --lora_rank 32 \
+  --align_to_opensource_format \
+  --extra_inputs "eligen_entity_masks,eligen_entity_prompts" \
+  --use_gradient_checkpointing
--- a/examples/flux/model_training/validate_full/Nexus-Gen-Editing.py
+++ b/examples/flux/model_training/validate_full/Nexus-Gen-Editing.py
@@ -0,0 +1,28 @@
+import torch
+from PIL import Image
+from diffsynth.pipelines.flux_image_new import FluxImagePipeline, ModelConfig
+from diffsynth import load_state_dict
+
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="model*.safetensors"),
+        ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="edit_decoder.bin"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"),
+    ],
+)
+state_dict = load_state_dict("models/train/FLUX.1-NexusGen-Edit_full/epoch-0.safetensors")
+pipe.dit.load_state_dict(state_dict)
+
+ref_image = Image.open("data/example_image_dataset/nexus_gen/image_1.png").convert("RGB")
+prompt = "Add a pair of sunglasses."
+image = pipe(
+    prompt=prompt, negative_prompt="",
+    seed=42, cfg_scale=2.0, num_inference_steps=50,
+    nexus_gen_reference_image=ref_image,
+    height=512, width=512,
+)
+image.save("NexusGen-Edit_full.jpg")
--- a/examples/flux/model_training/validate_lora/FLUX.1-dev-EliGen.py
+++ b/examples/flux/model_training/validate_lora/FLUX.1-dev-EliGen.py
@@ -0,0 +1,33 @@
+import torch
+from PIL import Image
+from diffsynth.pipelines.flux_image_new import FluxImagePipeline, ModelConfig
+
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"),
+    ],
+)
+
+pipe.load_lora(pipe.dit, "models/train/FLUX.1-dev-EliGen_lora/epoch-4.safetensors", alpha=1)
+
+entity_prompts = ["A beautiful girl", "sign 'Entity Control'", "shorts", "shirt"]
+global_prompt = "A beautiful girl wearing shirt and shorts in the street,  holding a sign 'Entity Control'"
+masks = [Image.open(f"data/example_image_dataset/eligen/{i}.png").convert('RGB') for i in range(len(entity_prompts))]
+# generate image
+image = pipe(
+    prompt=global_prompt,
+    cfg_scale=1.0,
+    num_inference_steps=50,
+    embedded_guidance=3.5,
+    seed=42,
+    height=1024,
+    width=1024,
+    eligen_entity_prompts=entity_prompts,
+    eligen_entity_masks=masks,
+)
+image.save(f"EliGen_lora.png")
--- a/examples/flux/model_training/validate_lora/Nexus-Gen-Editing.py
+++ b/examples/flux/model_training/validate_lora/Nexus-Gen-Editing.py
@@ -0,0 +1,26 @@
+import torch
+from PIL import Image
+from diffsynth.pipelines.flux_image_new import FluxImagePipeline, ModelConfig
+
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="model*.safetensors"),
+        ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="edit_decoder.bin"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/"),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"),
+    ],
+)
+pipe.load_lora(pipe.dit, "models/train/FLUX.1-NexusGen-Edit_lora/epoch-4.safetensors", alpha=1)
+
+ref_image = Image.open("data/example_image_dataset/nexus_gen/image_1.png").convert("RGB")
+prompt = "Add a pair of sunglasses."
+image = pipe(
+    prompt=prompt, negative_prompt="",
+    seed=42, cfg_scale=1.0, num_inference_steps=50,
+    nexus_gen_reference_image=ref_image,
+    height=512, width=512,
+)
+image.save("NexusGen-Edit_lora.jpg")