mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-03-18 22:08:13 +00:00
support training for eligen and nexusgen
This commit is contained in:
@@ -2,7 +2,7 @@ import importlib
|
||||
import torch
|
||||
from PIL import Image
|
||||
from diffsynth.pipelines.flux_image_new import FluxImagePipeline, ModelConfig
|
||||
from modelscope import snapshot_download
|
||||
from modelscope import snapshot_download, dataset_snapshot_download
|
||||
|
||||
if importlib.util.find_spec("transformers") is None:
|
||||
raise ImportError("You are using Nexus-GenV2. It depends on transformers, which is not installed. Please install it with `pip install transformers==4.49.0`.")
|
||||
@@ -23,12 +23,13 @@ pipe = FluxImagePipeline.from_pretrained(
|
||||
],
|
||||
)
|
||||
|
||||
prompt = "给猫加一副太阳镜"
|
||||
ref_image = Image.open("cat.png").convert("RGB")
|
||||
dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern=f"data/examples/nexusgen/cat.jpg")
|
||||
ref_image = Image.open("data/examples/nexusgen/cat.jpg").convert("RGB")
|
||||
prompt = "Add a crown."
|
||||
image = pipe(
|
||||
prompt=prompt, negative_prompt="",
|
||||
seed=0, cfg_scale=1.0, num_inference_steps=50,
|
||||
seed=42, cfg_scale=2.0, num_inference_steps=50,
|
||||
nexus_gen_reference_image=ref_image,
|
||||
height=512, width=512,
|
||||
)
|
||||
image.save("cat_glasses.jpg")
|
||||
image.save("cat_crown.jpg")
|
||||
|
||||
36
examples/flux/model_inference_low_vram/Nexus-Gen-Editing.py
Normal file
36
examples/flux/model_inference_low_vram/Nexus-Gen-Editing.py
Normal file
@@ -0,0 +1,36 @@
|
||||
import importlib
|
||||
import torch
|
||||
from PIL import Image
|
||||
from diffsynth.pipelines.flux_image_new import FluxImagePipeline, ModelConfig
|
||||
from modelscope import snapshot_download, dataset_snapshot_download
|
||||
|
||||
if importlib.util.find_spec("transformers") is None:
|
||||
raise ImportError("You are using Nexus-GenV2. It depends on transformers, which is not installed. Please install it with `pip install transformers==4.49.0`.")
|
||||
else:
|
||||
import transformers
|
||||
assert transformers.__version__ == "4.49.0", "Nexus-GenV2 requires transformers==4.49.0, please install it with `pip install transformers==4.49.0`."
|
||||
|
||||
snapshot_download("DiffSynth-Studio/Nexus-GenV2", local_dir="models/DiffSynth-Studio/Nexus-GenV2")
|
||||
pipe = FluxImagePipeline.from_pretrained(
|
||||
torch_dtype=torch.bfloat16,
|
||||
device="cuda",
|
||||
model_configs=[
|
||||
ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="model*.safetensors"),
|
||||
ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="edit_decoder.bin"),
|
||||
ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"),
|
||||
ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/"),
|
||||
ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"),
|
||||
],
|
||||
)
|
||||
pipe.enable_vram_management()
|
||||
|
||||
dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern=f"data/examples/nexusgen/cat.jpg")
|
||||
ref_image = Image.open("data/examples/nexusgen/cat.jpg").convert("RGB")
|
||||
prompt = "Add a crown."
|
||||
image = pipe(
|
||||
prompt=prompt, negative_prompt="",
|
||||
seed=42, cfg_scale=2.0, num_inference_steps=50,
|
||||
nexus_gen_reference_image=ref_image,
|
||||
height=512, width=512,
|
||||
)
|
||||
image.save("cat_crown.jpg")
|
||||
14
examples/flux/model_training/full/FLUX.1-NexusGen-Edit.sh
Normal file
14
examples/flux/model_training/full/FLUX.1-NexusGen-Edit.sh
Normal file
@@ -0,0 +1,14 @@
|
||||
accelerate launch --config_file examples/flux/model_training/full/accelerate_config_zero2offload.yaml examples/flux/model_training/train.py \
|
||||
--dataset_base_path data/example_image_dataset \
|
||||
--dataset_metadata_path data/example_image_dataset/metadata_nexusgen_edit.csv \
|
||||
--data_file_keys "image,nexus_gen_reference_image" \
|
||||
--max_pixels 262144 \
|
||||
--dataset_repeat 400 \
|
||||
--model_id_with_origin_paths "DiffSynth-Studio/Nexus-GenV2:model*.safetensors,DiffSynth-Studio/Nexus-GenV2:edit_decoder.bin,black-forest-labs/FLUX.1-dev:text_encoder/model.safetensors,black-forest-labs/FLUX.1-dev:text_encoder_2/,black-forest-labs/FLUX.1-dev:ae.safetensors" \
|
||||
--learning_rate 1e-5 \
|
||||
--num_epochs 1 \
|
||||
--remove_prefix_in_ckpt "pipe.dit." \
|
||||
--output_path "./models/train/FLUX.1-NexusGen-Edit_full" \
|
||||
--trainable_models "dit" \
|
||||
--extra_inputs "nexus_gen_reference_image" \
|
||||
--use_gradient_checkpointing_offload
|
||||
@@ -0,0 +1,22 @@
|
||||
compute_environment: LOCAL_MACHINE
|
||||
debug: false
|
||||
deepspeed_config:
|
||||
gradient_accumulation_steps: 1
|
||||
offload_optimizer_device: 'cpu'
|
||||
offload_param_device: 'cpu'
|
||||
zero3_init_flag: false
|
||||
zero_stage: 2
|
||||
distributed_type: DEEPSPEED
|
||||
downcast_bf16: 'no'
|
||||
enable_cpu_affinity: false
|
||||
machine_rank: 0
|
||||
main_training_function: main
|
||||
mixed_precision: bf16
|
||||
num_machines: 1
|
||||
num_processes: 8
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
17
examples/flux/model_training/lora/FLUX.1-NexusGen-Edit.sh
Normal file
17
examples/flux/model_training/lora/FLUX.1-NexusGen-Edit.sh
Normal file
@@ -0,0 +1,17 @@
|
||||
accelerate launch examples/flux/model_training/train.py \
|
||||
--dataset_base_path data/example_image_dataset \
|
||||
--dataset_metadata_path data/example_image_dataset/metadata_nexusgen_edit.csv \
|
||||
--data_file_keys "image,nexus_gen_reference_image" \
|
||||
--max_pixels 1048576 \
|
||||
--dataset_repeat 400 \
|
||||
--model_id_with_origin_paths "DiffSynth-Studio/Nexus-GenV2:model*.safetensors,DiffSynth-Studio/Nexus-GenV2:edit_decoder.bin,black-forest-labs/FLUX.1-dev:text_encoder/model.safetensors,black-forest-labs/FLUX.1-dev:text_encoder_2/,black-forest-labs/FLUX.1-dev:ae.safetensors" \
|
||||
--learning_rate 1e-4 \
|
||||
--num_epochs 5 \
|
||||
--remove_prefix_in_ckpt "pipe.dit." \
|
||||
--output_path "./models/train/FLUX.1-NexusGen-Edit_lora" \
|
||||
--lora_base_model "dit" \
|
||||
--lora_target_modules "a_to_qkv,b_to_qkv,ff_a.0,ff_a.2,ff_b.0,ff_b.2,a_to_out,b_to_out,proj_out,norm.linear,norm1_a.linear,norm1_b.linear,to_qkv_mlp" \
|
||||
--lora_rank 32 \
|
||||
--align_to_opensource_format \
|
||||
--extra_inputs "nexus_gen_reference_image" \
|
||||
--use_gradient_checkpointing
|
||||
17
examples/flux/model_training/lora/FLUX.1-dev-EliGen.sh
Normal file
17
examples/flux/model_training/lora/FLUX.1-dev-EliGen.sh
Normal file
@@ -0,0 +1,17 @@
|
||||
accelerate launch examples/flux/model_training/train.py \
|
||||
--dataset_base_path data/example_image_dataset \
|
||||
--dataset_metadata_path data/example_image_dataset/metadata_eligen.json \
|
||||
--data_file_keys "image,eligen_entity_masks" \
|
||||
--max_pixels 1048576 \
|
||||
--dataset_repeat 50 \
|
||||
--model_id_with_origin_paths "black-forest-labs/FLUX.1-dev:flux1-dev.safetensors,black-forest-labs/FLUX.1-dev:text_encoder/model.safetensors,black-forest-labs/FLUX.1-dev:text_encoder_2/,black-forest-labs/FLUX.1-dev:ae.safetensors" \
|
||||
--learning_rate 1e-4 \
|
||||
--num_epochs 5 \
|
||||
--remove_prefix_in_ckpt "pipe.dit." \
|
||||
--output_path "./models/train/FLUX.1-dev-EliGen_lora" \
|
||||
--lora_base_model "dit" \
|
||||
--lora_target_modules "a_to_qkv,b_to_qkv,ff_a.0,ff_a.2,ff_b.0,ff_b.2,a_to_out,b_to_out,proj_out,norm.linear,norm1_a.linear,norm1_b.linear,to_qkv_mlp" \
|
||||
--lora_rank 32 \
|
||||
--align_to_opensource_format \
|
||||
--extra_inputs "eligen_entity_masks,eligen_entity_prompts" \
|
||||
--use_gradient_checkpointing
|
||||
@@ -0,0 +1,28 @@
|
||||
import torch
|
||||
from PIL import Image
|
||||
from diffsynth.pipelines.flux_image_new import FluxImagePipeline, ModelConfig
|
||||
from diffsynth import load_state_dict
|
||||
|
||||
pipe = FluxImagePipeline.from_pretrained(
|
||||
torch_dtype=torch.bfloat16,
|
||||
device="cuda",
|
||||
model_configs=[
|
||||
ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="model*.safetensors"),
|
||||
ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="edit_decoder.bin"),
|
||||
ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"),
|
||||
ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/"),
|
||||
ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"),
|
||||
],
|
||||
)
|
||||
state_dict = load_state_dict("models/train/FLUX.1-NexusGen-Edit_full/epoch-0.safetensors")
|
||||
pipe.dit.load_state_dict(state_dict)
|
||||
|
||||
ref_image = Image.open("data/example_image_dataset/nexus_gen/image_1.png").convert("RGB")
|
||||
prompt = "Add a pair of sunglasses."
|
||||
image = pipe(
|
||||
prompt=prompt, negative_prompt="",
|
||||
seed=42, cfg_scale=2.0, num_inference_steps=50,
|
||||
nexus_gen_reference_image=ref_image,
|
||||
height=512, width=512,
|
||||
)
|
||||
image.save("NexusGen-Edit_full.jpg")
|
||||
@@ -0,0 +1,33 @@
|
||||
import torch
|
||||
from PIL import Image
|
||||
from diffsynth.pipelines.flux_image_new import FluxImagePipeline, ModelConfig
|
||||
|
||||
pipe = FluxImagePipeline.from_pretrained(
|
||||
torch_dtype=torch.bfloat16,
|
||||
device="cuda",
|
||||
model_configs=[
|
||||
ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors"),
|
||||
ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"),
|
||||
ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/"),
|
||||
ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"),
|
||||
],
|
||||
)
|
||||
|
||||
pipe.load_lora(pipe.dit, "models/train/FLUX.1-dev-EliGen_lora/epoch-4.safetensors", alpha=1)
|
||||
|
||||
entity_prompts = ["A beautiful girl", "sign 'Entity Control'", "shorts", "shirt"]
|
||||
global_prompt = "A beautiful girl wearing shirt and shorts in the street, holding a sign 'Entity Control'"
|
||||
masks = [Image.open(f"data/example_image_dataset/eligen/{i}.png").convert('RGB') for i in range(len(entity_prompts))]
|
||||
# generate image
|
||||
image = pipe(
|
||||
prompt=global_prompt,
|
||||
cfg_scale=1.0,
|
||||
num_inference_steps=50,
|
||||
embedded_guidance=3.5,
|
||||
seed=42,
|
||||
height=1024,
|
||||
width=1024,
|
||||
eligen_entity_prompts=entity_prompts,
|
||||
eligen_entity_masks=masks,
|
||||
)
|
||||
image.save(f"EliGen_lora.png")
|
||||
@@ -0,0 +1,26 @@
|
||||
import torch
|
||||
from PIL import Image
|
||||
from diffsynth.pipelines.flux_image_new import FluxImagePipeline, ModelConfig
|
||||
|
||||
pipe = FluxImagePipeline.from_pretrained(
|
||||
torch_dtype=torch.bfloat16,
|
||||
device="cuda",
|
||||
model_configs=[
|
||||
ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="model*.safetensors"),
|
||||
ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="edit_decoder.bin"),
|
||||
ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"),
|
||||
ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/"),
|
||||
ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"),
|
||||
],
|
||||
)
|
||||
pipe.load_lora(pipe.dit, "models/train/FLUX.1-NexusGen-Edit_lora/epoch-4.safetensors", alpha=1)
|
||||
|
||||
ref_image = Image.open("data/example_image_dataset/nexus_gen/image_1.png").convert("RGB")
|
||||
prompt = "Add a pair of sunglasses."
|
||||
image = pipe(
|
||||
prompt=prompt, negative_prompt="",
|
||||
seed=42, cfg_scale=1.0, num_inference_steps=50,
|
||||
nexus_gen_reference_image=ref_image,
|
||||
height=512, width=512,
|
||||
)
|
||||
image.save("NexusGen-Edit_lora.jpg")
|
||||
Reference in New Issue
Block a user