"""Render a FLUX.1-dev image, then re-render it with ControlNet guidance.

The script runs top to bottom:

1. Download the annotator checkpoint file into ``models/Annotators``.
2. Build a ``FluxImagePipeline`` from the FLUX.1-dev files plus the
   InstantX ControlNet-Union weights.
3. Generate ``image_1.jpg`` from a text prompt alone.
4. Derive canny and depth annotations from that image and use both as
   ControlNet inputs for a second prompt, saved as ``image_2.jpg``.
"""
import torch
from diffsynth.pipelines.flux_image import FluxImagePipeline, ModelConfig, ControlNetInput
from diffsynth.utils.controlnet import Annotator
from modelscope import snapshot_download

# One dtype/device policy shared by every model file loaded below:
# float8 (e4m3fn) for the offload / onload / preparing stages, bfloat16 on
# CUDA for computation.
# NOTE(review): the exact meaning of each stage is defined by diffsynth's
# VRAM management — confirm against its documentation before tuning.
vram_config = dict(
    offload_dtype=torch.float8_e4m3fn,
    offload_device="cpu",
    onload_dtype=torch.float8_e4m3fn,
    onload_device="cpu",
    preparing_dtype=torch.float8_e4m3fn,
    preparing_device="cuda",
    computation_dtype=torch.bfloat16,
    computation_device="cuda",
)

# Fetch only the single checkpoint file matched by the pattern.
# Presumably this is what Annotator("depth") loads — verify against diffsynth.
snapshot_download(
    "sd_lora/Annotators",
    allow_file_pattern="dpt_hybrid-midas-501f0c75.pt",
    local_dir="models/Annotators",
)

# (model_id, origin_file_pattern) for each file set the pipeline needs.
# Order is preserved when the ModelConfig list is built.
flux_repo = "black-forest-labs/FLUX.1-dev"
model_files = [
    (flux_repo, "flux1-dev.safetensors"),
    (flux_repo, "text_encoder/model.safetensors"),
    (flux_repo, "text_encoder_2/*.safetensors"),
    (flux_repo, "ae.safetensors"),
    ("InstantX/FLUX.1-dev-Controlnet-Union-alpha", "diffusion_pytorch_model.safetensors"),
]
model_configs = [
    ModelConfig(model_id=repo_id, origin_file_pattern=file_pattern, **vram_config)
    for repo_id, file_pattern in model_files
]

# mem_get_info returns (free, total) in bytes; take the total, convert to
# GiB, and keep 0.5 GiB of headroom below it.
_, total_vram_bytes = torch.cuda.mem_get_info("cuda")
vram_limit_gib = total_vram_bytes / (1024 ** 3) - 0.5

pipe = FluxImagePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=model_configs,
    vram_limit=vram_limit_gib,
)

# Settings common to both generations; prompt and seed differ per call.
shared_generation_kwargs = dict(height=1024, width=1024, rand_device="cuda")

# Pass 1: text-only generation that serves as the structural reference.
image_1 = pipe(
    prompt="a beautiful Asian girl, full body, red dress, summer",
    seed=6,
    **shared_generation_kwargs,
)
image_1.save("image_1.jpg")

# Extract the two control signals from the reference image.
canny_annotator = Annotator("canny")
image_canny = canny_annotator(image_1)
depth_annotator = Annotator("depth")
image_depth = depth_annotator(image_1)

# Pass 2: new prompt, guided by both annotations at scale 0.3 each.
controlnet_inputs = [
    ControlNetInput(image=image_canny, scale=0.3, processor_id="canny"),
    ControlNetInput(image=image_depth, scale=0.3, processor_id="depth"),
]
image_2 = pipe(
    prompt="a beautiful Asian girl, full body, red dress, winter",
    controlnet_inputs=controlnet_inputs,
    seed=7,
    **shared_generation_kwargs,
)
image_2.save("image_2.jpg")