From af6b1d4246a86b785e6c047c50c3a68f5aef7c10 Mon Sep 17 00:00:00 2001
From: Artiprocher <wangye87v5@hotmail.com>
Date: Tue, 15 Jul 2025 20:11:02 +0800
Subject: [PATCH] flux series vram management

---
 diffsynth/models/flux_infiniteyou.py          |   1 +
 diffsynth/models/flux_value_control.py        |   3 +-
 diffsynth/models/step1x_connector.py          |   4 +-
 diffsynth/pipelines/flux_image_new.py         |  80 +++++++---
 .../flux/model_inference_low_vram/EliGen.py   | 148 ++++++++++++++++++
 .../FLEX.2-preview.py                         |  51 ++++++
 .../FLUX.1-Kontext-dev.py                     |  55 +++++++
 .../FLUX.1-dev-Controlnet-Inpainting-Beta.py  |  38 +++++
 .../FLUX.1-dev-Controlnet-Union-alpha.py      |  41 +++++
 .../FLUX.1-dev-Controlnet-Upscaler.py         |  34 ++++
 .../FLUX.1-dev-IP-Adapter.py                  |  25 +++
 .../FLUX.1-dev-InfiniteYou.py                 |  60 +++++++
 .../FLUX.1-dev-LoRAFusion.py                  |  35 +++++
 .../FLUX.1-dev-ValueControl.py                |  21 +++
 .../model_inference_low_vram/FLUX.1-dev.py    |  27 ++++
 .../model_inference_low_vram/Step1X-Edit.py   |  33 ++++
 16 files changed, 629 insertions(+), 27 deletions(-)
 create mode 100644 examples/flux/model_inference_low_vram/EliGen.py
 create mode 100644 examples/flux/model_inference_low_vram/FLEX.2-preview.py
 create mode 100644 examples/flux/model_inference_low_vram/FLUX.1-Kontext-dev.py
 create mode 100644 examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Inpainting-Beta.py
 create mode 100644 examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Union-alpha.py
 create mode 100644 examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Upscaler.py
 create mode 100644 examples/flux/model_inference_low_vram/FLUX.1-dev-IP-Adapter.py
 create mode 100644 examples/flux/model_inference_low_vram/FLUX.1-dev-InfiniteYou.py
 create mode 100644 examples/flux/model_inference_low_vram/FLUX.1-dev-LoRAFusion.py
 create mode 100644 examples/flux/model_inference_low_vram/FLUX.1-dev-ValueControl.py
 create mode 100644 examples/flux/model_inference_low_vram/FLUX.1-dev.py
 create mode 100644 examples/flux/model_inference_low_vram/Step1X-Edit.py

diff --git a/diffsynth/models/flux_infiniteyou.py b/diffsynth/models/flux_infiniteyou.py
index 2015de4..861538a 100644
--- a/diffsynth/models/flux_infiniteyou.py
+++ b/diffsynth/models/flux_infiniteyou.py
@@ -104,6 +104,7 @@ class InfiniteYouImageProjector(nn.Module):
     def forward(self, x):
 
         latents = self.latents.repeat(x.size(0), 1, 1)
+        latents = latents.to(dtype=x.dtype, device=x.device)
 
         x = self.proj_in(x)
 
diff --git a/diffsynth/models/flux_value_control.py b/diffsynth/models/flux_value_control.py
index 54eaa07..0ff68d3 100644
--- a/diffsynth/models/flux_value_control.py
+++ b/diffsynth/models/flux_value_control.py
@@ -40,7 +40,8 @@ class SingleValueEncoder(torch.nn.Module):
         emb = self.prefer_proj(value).to(dtype)
         emb = self.prefer_value_embedder(emb).squeeze(0)
         base_embeddings = emb.expand(self.prefer_len, -1)
-        learned_embeddings = base_embeddings + self.positional_embedding
+        positional_embedding = self.positional_embedding.to(dtype=base_embeddings.dtype, device=base_embeddings.device)
+        learned_embeddings = base_embeddings + positional_embedding
         return learned_embeddings
 
     @staticmethod
diff --git a/diffsynth/models/step1x_connector.py b/diffsynth/models/step1x_connector.py
index b4abe40..9d5f0d9 100644
--- a/diffsynth/models/step1x_connector.py
+++ b/diffsynth/models/step1x_connector.py
@@ -162,7 +162,12 @@ class TimestepEmbedder(nn.Module):
     def forward(self, t):
+        """Embed the timesteps `t` and project the result through `self.mlp`."""
+        # The embedding is cast to the dtype of `t` rather than to
+        # `self.mlp[0].weight.dtype`.
+        # NOTE(review): presumably the stored weight dtype can differ from the
+        # compute dtype once VRAM management is enabled -- confirm.
         t_freq = self.timestep_embedding(
             t, self.frequency_embedding_size, self.max_period
-        ).type(self.mlp[0].weight.dtype)  # type: ignore
+        ).type(t.dtype)  # type: ignore
         t_emb = self.mlp(t_freq)
         return t_emb
     
@@ -656,7 +656,7 @@ class Qwen2Connector(torch.nn.Module):
         mask_float = mask.unsqueeze(-1)  # [b, s1, 1]
         x_mean = (x * mask_float).sum(
                 dim=1
-            ) / mask_float.sum(dim=1) * (1 + self.scale_factor)
+            ) / mask_float.sum(dim=1) * (1 + self.scale_factor.to(dtype=x.dtype, device=x.device))
 
         global_out=self.global_proj_out(x_mean)
         encoder_hidden_states = self.S(x,t,mask)
diff --git a/diffsynth/pipelines/flux_image_new.py b/diffsynth/pipelines/flux_image_new.py
index b7f13a1..3dbb9b8 100644
--- a/diffsynth/pipelines/flux_image_new.py
+++ b/diffsynth/pipelines/flux_image_new.py
@@ -24,7 +24,6 @@ from ..models.tiler import FastTileWorker
 from .wan_video_new import BasePipeline, ModelConfig, PipelineUnitRunner, PipelineUnit
 from ..lora.flux_lora import FluxLoRALoader, FluxLoraPatcher
 
-from transformers.models.t5.modeling_t5 import T5LayerNorm, T5DenseActDense, T5DenseGatedActDense
 from ..models.flux_dit import RMSNorm
 from ..vram_management import gradient_checkpoint_forward, enable_vram_management, AutoWrappedModule, AutoWrappedLinear
 
@@ -185,22 +184,18 @@ class FluxImagePipeline(BasePipeline):
         return loss
     
     
-    def enable_vram_management(self, num_persistent_param_in_dit=None, vram_limit=None, vram_buffer=0.5):
-        self.vram_management_enabled = True
-        if num_persistent_param_in_dit is not None:
-            vram_limit = None
-        else:
-            if vram_limit is None:
-                vram_limit = self.get_vram()
-            vram_limit = vram_limit - vram_buffer
-        if self.text_encoder_1 is not None:
-            dtype = next(iter(self.text_encoder_1.parameters())).dtype
+    def _enable_vram_management_with_default_config(self, model, vram_limit):
+        if model is not None:
+            dtype = next(iter(model.parameters())).dtype
             enable_vram_management(
-                self.text_encoder_1,
+                model,
                 module_map = {
                     torch.nn.Linear: AutoWrappedLinear,
                     torch.nn.Embedding: AutoWrappedModule,
                     torch.nn.LayerNorm: AutoWrappedModule,
+                    torch.nn.Conv2d: AutoWrappedModule,
+                    torch.nn.GroupNorm: AutoWrappedModule,
+                    RMSNorm: AutoWrappedModule,
                 },
                 module_config = dict(
                     offload_dtype=dtype,
@@ -212,7 +207,25 @@ class FluxImagePipeline(BasePipeline):
                 ),
                 vram_limit=vram_limit,
             )
+    
+    
+    def enable_vram_management(self, num_persistent_param_in_dit=None, vram_limit=None, vram_buffer=0.5):
+        self.vram_management_enabled = True
+        if num_persistent_param_in_dit is not None:
+            vram_limit = None
+        else:
+            if vram_limit is None:
+                vram_limit = self.get_vram()
+            vram_limit = vram_limit - vram_buffer
+
+        # Default config
+        default_vram_management_models = ["text_encoder_1", "vae_decoder", "vae_encoder", "controlnet", "image_proj_model", "ipadapter", "lora_patcher", "value_controller", "step1x_connector"]
+        for model_name in default_vram_management_models:
+            self._enable_vram_management_with_default_config(getattr(self, model_name), vram_limit)
+
+        # Special config
         if self.text_encoder_2 is not None:
+            from transformers.models.t5.modeling_t5 import T5LayerNorm, T5DenseActDense, T5DenseGatedActDense
             dtype = next(iter(self.text_encoder_2.parameters())).dtype
             enable_vram_management(
                 self.text_encoder_2,
@@ -261,14 +274,18 @@ class FluxImagePipeline(BasePipeline):
                 ),
                 vram_limit=vram_limit,
             )
-        if self.vae_decoder is not None:
-            dtype = next(iter(self.vae_decoder.parameters())).dtype
+        if self.ipadapter_image_encoder is not None:
+            from transformers.models.siglip.modeling_siglip import SiglipVisionEmbeddings, SiglipEncoder, SiglipMultiheadAttentionPoolingHead
+            dtype = next(iter(self.ipadapter_image_encoder.parameters())).dtype
             enable_vram_management(
-                self.vae_decoder,
+                self.ipadapter_image_encoder,
                 module_map = {
+                    SiglipVisionEmbeddings: AutoWrappedModule,
+                    SiglipEncoder: AutoWrappedModule,
+                    SiglipMultiheadAttentionPoolingHead: AutoWrappedModule,
+                    torch.nn.MultiheadAttention: AutoWrappedModule,
                     torch.nn.Linear: AutoWrappedLinear,
-                    torch.nn.Conv2d: AutoWrappedModule,
-                    torch.nn.GroupNorm: AutoWrappedModule,
+                    torch.nn.LayerNorm: AutoWrappedModule,
                 },
                 module_config = dict(
                     offload_dtype=dtype,
@@ -280,14 +297,25 @@ class FluxImagePipeline(BasePipeline):
                 ),
                 vram_limit=vram_limit,
             )
-        if self.vae_encoder is not None:
-            dtype = next(iter(self.vae_encoder.parameters())).dtype
+        if self.qwenvl is not None:
+            from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
+                Qwen2_5_VisionPatchEmbed, Qwen2_5_VLVisionBlock, Qwen2_5_VLPatchMerger,
+                Qwen2_5_VLDecoderLayer, Qwen2_5_VisionRotaryEmbedding, Qwen2_5_VLRotaryEmbedding, Qwen2RMSNorm
+            )
+            dtype = next(iter(self.qwenvl.parameters())).dtype
             enable_vram_management(
-                self.vae_encoder,
+                self.qwenvl,
                 module_map = {
+                    Qwen2_5_VisionPatchEmbed: AutoWrappedModule,
+                    Qwen2_5_VLVisionBlock: AutoWrappedModule,
+                    Qwen2_5_VLPatchMerger: AutoWrappedModule,
+                    Qwen2_5_VLDecoderLayer: AutoWrappedModule,
+                    Qwen2_5_VisionRotaryEmbedding: AutoWrappedModule,
+                    Qwen2_5_VLRotaryEmbedding: AutoWrappedModule,
+                    Qwen2RMSNorm: AutoWrappedModule,
+                    torch.nn.Embedding: AutoWrappedModule,
                     torch.nn.Linear: AutoWrappedLinear,
-                    torch.nn.Conv2d: AutoWrappedModule,
-                    torch.nn.GroupNorm: AutoWrappedModule,
+                    torch.nn.LayerNorm: AutoWrappedModule,
                 },
                 module_config = dict(
                     offload_dtype=dtype,
@@ -774,9 +802,13 @@ class FluxImageUnit_Flex(PipelineUnit):
 
 class FluxImageUnit_InfiniteYou(PipelineUnit):
     def __init__(self):
-        super().__init__(input_params=("infinityou_id_image", "infinityou_guidance"))
+        super().__init__(
+            input_params=("infinityou_id_image", "infinityou_guidance"),
+            onload_model_names=("infinityou_processor",)
+        )
 
     def process(self, pipe: FluxImagePipeline, infinityou_id_image, infinityou_guidance):
+        pipe.load_models_to_device(["infinityou_processor"])  # a list of model names, not a bare string
         if infinityou_id_image is not None:
             return pipe.infinityou_processor.prepare_infinite_you(pipe.image_proj_model, infinityou_id_image, infinityou_guidance, pipe.device)
         else:
@@ -816,7 +848,7 @@ class InfinitYou(torch.nn.Module):
         self.app_320.prepare(ctx_id=0, det_size=(320, 320))
         self.app_160 = FaceAnalysis(name='antelopev2', root=insightface_root_path, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
         self.app_160.prepare(ctx_id=0, det_size=(160, 160))
-        self.arcface_model = init_recognition_model('arcface', device=self.device)
+        self.arcface_model = init_recognition_model('arcface', device=self.device).to(torch_dtype)
 
     def _detect_face(self, id_image_cv2):
         face_info = self.app_640.get(id_image_cv2)
diff --git a/examples/flux/model_inference_low_vram/EliGen.py b/examples/flux/model_inference_low_vram/EliGen.py
new file mode 100644
index 0000000..55b4b27
--- /dev/null
+++ b/examples/flux/model_inference_low_vram/EliGen.py
@@ -0,0 +1,148 @@
+import random
+import torch
+from PIL import Image, ImageDraw, ImageFont
+from diffsynth import download_customized_models
+from diffsynth.pipelines.flux_image_new import FluxImagePipeline, ModelConfig
+from modelscope import dataset_snapshot_download
+
+
+def visualize_masks(image, masks, mask_prompts, output_path, font_size=35, use_random_colors=False):
+    """Overlay labelled, semi-transparent entity masks on `image` and save the result.
+
+    Pure-white mask pixels are tinted with a per-mask color and each prompt is drawn
+    near the top-left corner of its mask. The palette wraps around when there are more
+    masks than colors, and no label is drawn for a mask that has no bounding box.
+    The composited RGBA image is written to `output_path` and returned.
+    """
+    # Default palette: eight RGBA colors listed twice, all with alpha 80.
+    colors = [
+        (165, 238, 173, 80),
+        (76, 102, 221, 80),
+        (221, 160, 77, 80),
+        (204, 93, 71, 80),
+        (145, 187, 149, 80),
+        (134, 141, 172, 80),
+        (157, 137, 109, 80),
+        (153, 104, 95, 80),
+        (165, 238, 173, 80),
+        (76, 102, 221, 80),
+        (221, 160, 77, 80),
+        (204, 93, 71, 80),
+        (145, 187, 149, 80),
+        (134, 141, 172, 80),
+        (157, 137, 109, 80),
+        (153, 104, 95, 80),
+    ]
+    if use_random_colors:
+        colors = [(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), 80) for _ in range(len(masks))]
+
+    # Prefer Arial; fall back to Pillow's built-in font when it cannot be loaded.
+    try:
+        font = ImageFont.truetype("arial", font_size)
+    except IOError:
+        font = ImageFont.load_default(font_size)
+
+    # Accumulate every tinted mask on one transparent layer the size of the image.
+    overlay = Image.new('RGBA', image.size, (0, 0, 0, 0))
+    for index, (mask, mask_prompt) in enumerate(zip(masks, mask_prompts)):
+        color = colors[index % len(colors)]  # wrap instead of dropping extra masks
+        mask_rgba = mask.convert('RGBA')
+        tinted = [(color if pixel[:3] == (255, 255, 255) else (0, 0, 0, 0)) for pixel in mask_rgba.getdata()]
+        mask_rgba.putdata(tinted)
+
+        # getbbox() returns None for an all-black mask; skip the label in that case.
+        mask_bbox = mask.getbbox()
+        if mask_bbox is not None:
+            text_position = (mask_bbox[0] + 10, mask_bbox[1] + 10)
+            ImageDraw.Draw(mask_rgba).text(text_position, mask_prompt, fill=(255, 255, 255, 255), font=font)
+
+        overlay = Image.alpha_composite(overlay, mask_rgba)
+
+    # Composite the overlay onto the original image, save it and hand it back.
+    result = Image.alpha_composite(image.convert('RGBA'), overlay)
+    result.save(output_path)
+    return result
+
+def example(pipe, seeds, example_id, global_prompt, entity_prompts):
+    """Fetch the entity masks of example `example_id`, then render and save one image per seed."""
+    dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern=f"data/examples/eligen/entity_control/example_{example_id}/*.png")
+    # One mask file per entity prompt: 0.png, 1.png, ...
+    masks = []
+    for i in range(len(entity_prompts)):
+        masks.append(Image.open(f"./data/examples/eligen/entity_control/example_{example_id}/{i}.png").convert('RGB'))
+    # Settings shared by every seed of this example.
+    shared_kwargs = dict(
+        prompt=global_prompt,
+        negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
+        cfg_scale=3.0, embedded_guidance=3.5, num_inference_steps=50,
+        height=1024, width=1024,
+        eligen_entity_prompts=entity_prompts, eligen_entity_masks=masks,
+    )
+    for seed in seeds:
+        image = pipe(seed=seed, **shared_kwargs)
+        # Save the raw render, then a copy with the entity masks drawn on top.
+        image.save(f"eligen_example_{example_id}_{seed}.png")
+        visualize_masks(image, masks, entity_prompts, f"eligen_example_{example_id}_mask_{seed}.png")
+
+
+# Load the four FLUX.1-dev checkpoints; each ModelConfig requests CPU offload
+# with torch.float8_e4m3fn as the offload dtype.
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern=file_pattern, offload_device="cpu", offload_dtype=torch.float8_e4m3fn)
+        for file_pattern in ("flux1-dev.safetensors", "text_encoder/model.safetensors", "text_encoder_2/", "ae.safetensors")
+    ],
+)
+pipe.enable_vram_management()
+
+# Select the repository that hosts the EliGen LoRA and the hub to download it from.
+download_from_modelscope = True
+model_id, downloading_priority = (
+    ("DiffSynth-Studio/Eligen", ["ModelScope"])
+    if download_from_modelscope
+    else ("modelscope/EliGen", ["HuggingFace"])
+)
+EliGen_path = download_customized_models(
+    model_id=model_id, origin_file_path="model_bf16.safetensors",
+    local_dir="models/lora/entity_control", downloading_priority=downloading_priority,
+)[0]
+# Apply the downloaded weights to the DiT as a LoRA with alpha 1.
+pipe.load_lora(pipe.dit, EliGen_path, alpha=1)
+
+# Example 1: six entity prompts, rendered with seed 0.
+global_prompt = "A breathtaking beauty of Raja Ampat by the late-night moonlight , one beautiful woman from behind wearing a pale blue long dress with soft glow, sitting at the top of a cliff looking towards the beach,pastell light colors, a group of small distant birds flying in far sky, a boat sailing on the sea, best quality, realistic, whimsical, fantastic, splash art, intricate detailed, hyperdetailed, maximalist style, photorealistic, concept art, sharp focus, harmony, serenity, tranquility, soft pastell colors,ambient occlusion, cozy ambient lighting, masterpiece, liiv1, linquivera, metix, mentixis, masterpiece, award winning, view from above\n"
+entity_prompts = ["cliff", "sea", "moon", "sailing boat", "a seated beautiful woman", "pale blue long dress with soft glow"]
+example(pipe, [0], 1, global_prompt, entity_prompts)
+
+# Example 2: four entity prompts, rendered with seed 0.
+global_prompt = "samurai girl wearing a kimono, she's holding a sword  glowing with red flame, her long hair is flowing in the wind, she is looking at a small bird perched on the back of her hand. ultra realist style. maximum image detail. maximum realistic render."
+entity_prompts = ["flowing hair", "sword glowing with red flame", "A cute bird", "blue belt"]
+example(pipe, [0], 2, global_prompt, entity_prompts)
+
+# Example 3: four entity prompts (the monk prompt appears twice), rendered with seed 27.
+global_prompt = "Image of a neverending staircase up to a mysterious palace in the sky, The ancient palace stood majestically atop a mist-shrouded mountain, sunrise, two traditional monk walk in the stair looking at the sunrise, fog,see-through, best quality, whimsical, fantastic, splash art, intricate detailed, hyperdetailed, photorealistic, concept art, harmony, serenity, tranquility, ambient occlusion, halation, cozy ambient lighting, dynamic lighting,masterpiece, liiv1, linquivera, metix, mentixis, masterpiece, award winning,"
+entity_prompts = ["ancient palace", "stone staircase with railings", "a traditional monk", "a traditional monk"]
+example(pipe, [27], 3, global_prompt, entity_prompts)
+
+# Example 4: four entity prompts, rendered with seed 21.
+global_prompt = "A beautiful girl wearing shirt and shorts in the street,  holding a sign 'Entity Control'"
+entity_prompts = ["A beautiful girl", "sign 'Entity Control'", "shorts", "shirt"]
+example(pipe, [21], 4, global_prompt, entity_prompts)
+
+# Example 5: four entity prompts, rendered with seed 0.
+global_prompt = "A captivating, dramatic scene in a painting that exudes mystery and foreboding. A white sky, swirling blue clouds, and a crescent yellow moon illuminate a solitary woman standing near the water's edge. Her long dress flows in the wind, silhouetted against the eerie glow. The water mirrors the fiery sky and moonlight, amplifying the uneasy atmosphere."
+entity_prompts = ["crescent yellow moon", "a solitary woman", "water", "swirling blue clouds"]
+example(pipe, [0], 5, global_prompt, entity_prompts)
+
+# Example 6: seven entity prompts, rendered with seed 8.
+global_prompt = "Snow White and the 6 Dwarfs."
+entity_prompts = ["Dwarf 1", "Dwarf 2", "Dwarf 3", "Snow White", "Dwarf 4", "Dwarf 5", "Dwarf 6"]
+example(pipe, [8], 6, global_prompt, entity_prompts)
+
+# Example 7: one prompt with seven entity prompts, rendered once per seed in 5, 6, 7, 8.
+seeds = range(5, 9)
+global_prompt = "A beautiful woman wearing white dress, holding a mirror, with a warm light background;"
+entity_prompts = ["A beautiful woman", "mirror", "necklace", "glasses", "earring", "white dress", "jewelry headpiece"]
+example(pipe, seeds, 7, global_prompt, entity_prompts)
diff --git a/examples/flux/model_inference_low_vram/FLEX.2-preview.py b/examples/flux/model_inference_low_vram/FLEX.2-preview.py
new file mode 100644
index 0000000..e94cc98
--- /dev/null
+++ b/examples/flux/model_inference_low_vram/FLEX.2-preview.py
@@ -0,0 +1,58 @@
+import torch
+from diffsynth.pipelines.flux_image_new import FluxImagePipeline, ModelConfig
+from diffsynth.controlnets.processors import Annotator
+import numpy as np
+from PIL import Image
+
+
+# Build the pipeline from four checkpoints; each ModelConfig requests CPU offload
+# with torch.float8_e4m3fn as the offload dtype.
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="ostris/Flex.2-preview", origin_file_pattern="Flex.2-preview.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+    ],
+)
+pipe.enable_vram_management()
+
+# Step 1: generate an image from the prompt alone.
+image = pipe(
+    prompt="portrait of a beautiful Asian girl, long hair, red t-shirt, sunshine, beach",
+    num_inference_steps=50, embedded_guidance=3.5,
+    seed=0
+)
+image.save(f"image_1.jpg")
+
+# Step 2: draw a white rectangle (rows 200-399, columns 400-699) on a black
+# 1024x1024 canvas and pass it as `flex_inpaint_mask` for the first image.
+mask = np.zeros((1024, 1024, 3), dtype=np.uint8)
+mask[200:400, 400:700] = 255
+mask = Image.fromarray(mask)
+mask.save(f"image_mask.jpg")
+
+inpaint_image = image
+
+image = pipe(
+    prompt="portrait of a beautiful Asian girl with sunglasses, long hair, red t-shirt, sunshine, beach",
+    num_inference_steps=50, embedded_guidance=3.5,
+    flex_inpaint_image=inpaint_image, flex_inpaint_mask=mask,
+    seed=4
+)
+image.save(f"image_2_new.jpg")
+
+# Step 3: run the "canny" annotator on the second image and pass its output
+# as `flex_control_image` for a third generation.
+control_image = Annotator("canny")(image)
+control_image.save("image_control.jpg")
+
+image = pipe(
+    prompt="portrait of a beautiful Asian girl with sunglasses, long hair, yellow t-shirt, sunshine, beach",
+    num_inference_steps=50, embedded_guidance=3.5,
+    flex_control_image=control_image,
+    seed=4
+)
+image.save(f"image_3_new.jpg")
diff --git a/examples/flux/model_inference_low_vram/FLUX.1-Kontext-dev.py b/examples/flux/model_inference_low_vram/FLUX.1-Kontext-dev.py
new file mode 100644
index 0000000..c36c0dd
--- /dev/null
+++ b/examples/flux/model_inference_low_vram/FLUX.1-Kontext-dev.py
@@ -0,0 +1,60 @@
+import torch
+from diffsynth.pipelines.flux_image_new import FluxImagePipeline, ModelConfig
+from PIL import Image
+
+
+# Build the pipeline from four checkpoints; each ModelConfig requests CPU offload
+# with torch.float8_e4m3fn as the offload dtype.
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="black-forest-labs/FLUX.1-Kontext-dev", origin_file_pattern="flux1-kontext-dev.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+    ],
+)
+pipe.enable_vram_management()
+
+# Generate the base image from the prompt alone.
+image_1 = pipe(
+    prompt="a beautiful Asian long-haired female college student.",
+    embedded_guidance=2.5,
+    seed=1,
+)
+image_1.save("image_1.jpg")
+
+# Every call below passes image_1 as `kontext_images`, each with its own
+# prompt and seed, and saves its result to a separate file.
+image_2 = pipe(
+    prompt="transform the style to anime style.",
+    kontext_images=image_1,
+    embedded_guidance=2.5,
+    seed=2,
+)
+image_2.save("image_2.jpg")
+
+image_3 = pipe(
+    prompt="let her smile.",
+    kontext_images=image_1,
+    embedded_guidance=2.5,
+    seed=3,
+)
+image_3.save("image_3.jpg")
+
+image_4 = pipe(
+    prompt="let the girl play basketball.",
+    kontext_images=image_1,
+    embedded_guidance=2.5,
+    seed=4,
+)
+image_4.save("image_4.jpg")
+
+image_5 = pipe(
+    prompt="move the girl to a park, let her sit on a chair.",
+    kontext_images=image_1,
+    embedded_guidance=2.5,
+    seed=5,
+)
+image_5.save("image_5.jpg")
\ No newline at end of file
diff --git a/examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Inpainting-Beta.py b/examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Inpainting-Beta.py
new file mode 100644
index 0000000..2dcc190
--- /dev/null
+++ b/examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Inpainting-Beta.py
@@ -0,0 +1,45 @@
+import torch
+from diffsynth.pipelines.flux_image_new import FluxImagePipeline, ModelConfig, ControlNetInput
+import numpy as np
+from PIL import Image
+
+
+# Build the pipeline from five checkpoints (the last one is the inpainting ControlNet);
+# each ModelConfig requests CPU offload with torch.float8_e4m3fn as the offload dtype.
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta", origin_file_pattern="diffusion_pytorch_model.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+    ],
+)
+pipe.enable_vram_management()
+
+# Generate the image that is later passed to the ControlNet.
+image_1 = pipe(
+    prompt="a cat sitting on a chair",
+    height=1024, width=1024,
+    seed=8, rand_device="cuda",
+)
+image_1.save("image_1.jpg")
+
+# Draw a white rectangle (rows 100-349, columns 350-723) on a black
+# 1024x1024 canvas; it is used as the inpainting mask below.
+mask = np.zeros((1024, 1024, 3), dtype=np.uint8)
+mask[100:350, 350: -300] = 255
+mask = Image.fromarray(mask)
+mask.save("mask.jpg")
+
+# Generate again with a new prompt, giving the ControlNet image_1 together
+# with the mask at scale 0.9.
+image_2 = pipe(
+    prompt="a cat sitting on a chair, wearing sunglasses",
+    controlnet_inputs=[ControlNetInput(image=image_1, inpaint_mask=mask, scale=0.9)],
+    height=1024, width=1024,
+    seed=9, rand_device="cuda",
+)
+image_2.save("image_2.jpg")
\ No newline at end of file
diff --git a/examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Union-alpha.py b/examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Union-alpha.py
new file mode 100644
index 0000000..62eeee0
--- /dev/null
+++ b/examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Union-alpha.py
@@ -0,0 +1,47 @@
+import torch
+from diffsynth.pipelines.flux_image_new import FluxImagePipeline, ModelConfig, ControlNetInput
+from diffsynth.controlnets.processors import Annotator
+from diffsynth import download_models
+
+
+
+# NOTE(review): presumably fetches the weights used by Annotator("depth") below -- confirm.
+download_models(["Annotators:Depth"])
+# Build the pipeline from five checkpoints (the last one is the union ControlNet);
+# each ModelConfig requests CPU offload with torch.float8_e4m3fn as the offload dtype.
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="InstantX/FLUX.1-dev-Controlnet-Union-alpha", origin_file_pattern="diffusion_pytorch_model.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+    ],
+)
+pipe.enable_vram_management()
+
+# Generate the reference image from the prompt alone.
+image_1 = pipe(
+    prompt="a beautiful Asian girl, full body, red dress, summer",
+    height=1024, width=1024,
+    seed=6, rand_device="cuda",
+)
+image_1.save("image_1.jpg")
+
+# Run the "canny" and "depth" annotators on the reference image.
+image_canny = Annotator("canny")(image_1)
+image_depth = Annotator("depth")(image_1)
+
+# Generate with a new prompt, passing both annotations as ControlNet inputs at scale 0.3 each.
+image_2 = pipe(
+    prompt="a beautiful Asian girl, full body, red dress, winter",
+    controlnet_inputs=[
+        ControlNetInput(image=image_canny, scale=0.3, processor_id="canny"),
+        ControlNetInput(image=image_depth, scale=0.3, processor_id="depth"),
+    ],
+    height=1024, width=1024,
+    seed=7, rand_device="cuda",
+)
+image_2.save("image_2.jpg")
diff --git a/examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Upscaler.py b/examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Upscaler.py
new file mode 100644
index 0000000..58c3b9d
--- /dev/null
+++ b/examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Upscaler.py
@@ -0,0 +1,39 @@
+import torch
+from diffsynth.pipelines.flux_image_new import FluxImagePipeline, ModelConfig, ControlNetInput
+
+
+# Build the pipeline from five checkpoints (the last one is the upscaler ControlNet);
+# each ModelConfig requests CPU offload with torch.float8_e4m3fn as the offload dtype.
+pipe = FluxImagePipeline.from_pretrained(
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="jasperai/Flux.1-dev-Controlnet-Upscaler", origin_file_pattern="diffusion_pytorch_model.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+    ],
+)
+pipe.enable_vram_management()
+
+# Generate a 768x768 image from the prompt alone.
+image_1 = pipe(
+    prompt="a photo of a cat, highly detailed",
+    height=768, width=768,
+    seed=0, rand_device="cuda",
+)
+image_1.save("image_1.jpg")
+
+# Resize it to 2048x2048 and feed the resized image both as the ControlNet
+# input and as `input_image`, generating at 2048x2048 with tiled=True.
+image_1 = image_1.resize((2048, 2048))
+image_2 = pipe(
+    prompt="a photo of a cat, highly detailed",
+    controlnet_inputs=[ControlNetInput(image=image_1, scale=0.7)],
+    input_image=image_1,
+    denoising_strength=0.99,
+    height=2048, width=2048, tiled=True,
+    seed=1, rand_device="cuda",
+)
+image_2.save("image_2.jpg")
\ No newline at end of file
diff --git a/examples/flux/model_inference_low_vram/FLUX.1-dev-IP-Adapter.py b/examples/flux/model_inference_low_vram/FLUX.1-dev-IP-Adapter.py
new file mode 100644
index 0000000..83439a9
--- /dev/null
+++ b/examples/flux/model_inference_low_vram/FLUX.1-dev-IP-Adapter.py
@@ -0,0 +1,25 @@
+import torch
+from diffsynth.pipelines.flux_image_new import FluxImagePipeline, ModelConfig
+
+
+pipe = FluxImagePipeline.from_pretrained(  # low-VRAM example: FLUX.1-dev with the InstantX IP-Adapter and a SigLIP checkpoint
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[  # every entry requests offload_device="cpu" with offload_dtype=torch.float8_e4m3fn
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="InstantX/FLUX.1-dev-IP-Adapter", origin_file_pattern="ip-adapter.bin", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="google/siglip-so400m-patch14-384", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+    ],
+)
+pipe.enable_vram_management()  # NOTE(review): presumably activates the offload settings above -- confirm in flux_image_new.py
+
+origin_prompt = "a rabbit in a garden, colorful flowers"
+image = pipe(prompt=origin_prompt, height=1280, width=960, seed=42)  # prompt-only pass; its output becomes the IP-Adapter image below
+image.save("style image.jpg")
+
+image = pipe(prompt="A piggy", height=1280, width=960, seed=42,  # second pass: same size and seed, different prompt
+    ipadapter_images=[image], ipadapter_scale=0.7)  # the first result is passed in as the only IP-Adapter image
+image.save("A piggy.jpg")
diff --git a/examples/flux/model_inference_low_vram/FLUX.1-dev-InfiniteYou.py b/examples/flux/model_inference_low_vram/FLUX.1-dev-InfiniteYou.py
new file mode 100644
index 0000000..dbe719a
--- /dev/null
+++ b/examples/flux/model_inference_low_vram/FLUX.1-dev-InfiniteYou.py
@@ -0,0 +1,60 @@
+import torch
+from diffsynth.pipelines.flux_image_new import FluxImagePipeline, ModelConfig, ControlNetInput
+from modelscope import dataset_snapshot_download
+from modelscope import snapshot_download
+from PIL import Image
+import numpy as np
+
+
+snapshot_download(  # fetch only the insightface antelopev2 files of ByteDance/InfiniteYou into models/
+    "ByteDance/InfiniteYou",
+    allow_file_pattern="supports/insightface/models/antelopev2/*",
+    local_dir="models/ByteDance/InfiniteYou",
+)
+pipe = FluxImagePipeline.from_pretrained(  # low-VRAM example: FLUX.1-dev plus the InfiniteYou image projector and InfuseNet weights
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[  # every entry requests offload_device="cpu" with offload_dtype=torch.float8_e4m3fn
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="ByteDance/InfiniteYou", origin_file_pattern="infu_flux_v1.0/aes_stage2/image_proj_model.bin", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="ByteDance/InfiniteYou", origin_file_pattern="infu_flux_v1.0/aes_stage2/InfuseNetModel/*.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+    ],
+)
+pipe.enable_vram_management()  # NOTE(review): presumably activates the offload settings above -- confirm in flux_image_new.py
+
+dataset_snapshot_download(  # download the example identity images into ./data/examples/infiniteyou/
+    dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+    local_dir="./",
+    allow_file_pattern="data/examples/infiniteyou/*",
+)
+
+height, width = 1024, 1024
+controlnet_image = Image.fromarray(np.zeros([height, width, 3], dtype=np.uint8))  # all-zero (black) image, built directly as uint8
+controlnet_inputs = [ControlNetInput(image=controlnet_image, scale=1.0, processor_id="None")]  # shared by both runs below
+
+prompt = "A man, portrait, cinematic"
+id_image = "data/examples/infiniteyou/man.jpg"
+id_image = Image.open(id_image).convert('RGB')  # rebinds id_image from the path string to the loaded RGB image
+image = pipe(
+    prompt=prompt, seed=1,
+    infinityou_id_image=id_image, infinityou_guidance=1.0,
+    controlnet_inputs=controlnet_inputs,
+    num_inference_steps=50, embedded_guidance=3.5,
+    height=height, width=width,
+)
+image.save("man.jpg")
+
+prompt = "A woman, portrait, cinematic"  # second run: identical settings, different prompt and identity image
+id_image = "data/examples/infiniteyou/woman.jpg"
+id_image = Image.open(id_image).convert('RGB')
+image = pipe(
+    prompt=prompt, seed=1,
+    infinityou_id_image=id_image, infinityou_guidance=1.0,
+    controlnet_inputs=controlnet_inputs,
+    num_inference_steps=50, embedded_guidance=3.5,
+    height=height, width=width,
+)
+image.save("woman.jpg")
diff --git a/examples/flux/model_inference_low_vram/FLUX.1-dev-LoRAFusion.py b/examples/flux/model_inference_low_vram/FLUX.1-dev-LoRAFusion.py
new file mode 100644
index 0000000..44ad3a5
--- /dev/null
+++ b/examples/flux/model_inference_low_vram/FLUX.1-dev-LoRAFusion.py
@@ -0,0 +1,35 @@
+import torch
+from diffsynth.pipelines.flux_image_new import FluxImagePipeline, ModelConfig
+
+
+pipe = FluxImagePipeline.from_pretrained(  # low-VRAM example: FLUX.1-dev plus the DiffSynth-Studio LoRAFusion model
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[  # every entry requests offload_device="cpu" with offload_dtype=torch.float8_e4m3fn
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="DiffSynth-Studio/FLUX.1-dev-LoRAFusion", origin_file_pattern="model.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn)
+    ],
+)
+pipe.enable_vram_management()  # NOTE(review): presumably activates the offload settings above -- confirm in flux_image_new.py
+pipe.enable_lora_patcher()  # NOTE(review): presumably needed by the hotload=True loads below -- confirm
+pipe.load_lora(  # first of three LoRAs attached to pipe.dit, each with hotload=True
+    pipe.dit,
+    ModelConfig(model_id="yangyufeng/fgao", origin_file_pattern="30.safetensors"),
+    hotload=True
+)
+pipe.load_lora(  # second LoRA
+    pipe.dit,
+    ModelConfig(model_id="bobooblue/LoRA-bling-mai", origin_file_pattern="10.safetensors"),
+    hotload=True
+)
+pipe.load_lora(  # third LoRA
+    pipe.dit,
+    ModelConfig(model_id="JIETANGAB/E", origin_file_pattern="17.safetensors"),
+    hotload=True
+)
+
+image = pipe(prompt="This is a digital painting in a soft, ethereal style. a beautiful Asian girl Shine like a diamond. Everywhere is shining with bling bling luster.The background is a textured blue with visible brushstrokes, giving the image an impressionistic style reminiscent of Vincent van Gogh's work", seed=0)
+image.save("flux.jpg")
diff --git a/examples/flux/model_inference_low_vram/FLUX.1-dev-ValueControl.py b/examples/flux/model_inference_low_vram/FLUX.1-dev-ValueControl.py
new file mode 100644
index 0000000..bb6be21
--- /dev/null
+++ b/examples/flux/model_inference_low_vram/FLUX.1-dev-ValueControl.py
@@ -0,0 +1,21 @@
+import torch
+from diffsynth.pipelines.flux_image_new import FluxImagePipeline, ModelConfig
+
+
+pipe = FluxImagePipeline.from_pretrained(  # low-VRAM example: FLUX.1-dev plus the DiffSynth-Studio ValueController checkpoint
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[  # every entry requests offload_device="cpu" with offload_dtype=torch.float8_e4m3fn
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="DiffSynth-Studio/FLUX.1-dev-ValueController", origin_file_pattern="single/prefer_embed/value.ckpt", offload_device="cpu", offload_dtype=torch.float8_e4m3fn)
+    ],
+)
+pipe.load_lora(pipe.dit, ModelConfig(model_id="DiffSynth-Studio/FLUX.1-dev-ValueController", origin_file_pattern="single/dit_lora/dit_value.ckpt"))  # NOTE(review): loaded before enable_vram_management() and without hotload, unlike FLUX.1-dev-LoRAFusion.py -- confirm the order is intentional
+pipe.enable_vram_management()  # NOTE(review): presumably activates the offload settings above -- confirm in flux_image_new.py
+
+for i in range(10):  # sweep the controller input over 0.0, 0.1, ..., 0.9 with a fixed prompt and seed
+    image = pipe(prompt="a cat", seed=0, value_controller_inputs=[i/10])
+    image.save(f"value_control_{i}.jpg")
\ No newline at end of file
diff --git a/examples/flux/model_inference_low_vram/FLUX.1-dev.py b/examples/flux/model_inference_low_vram/FLUX.1-dev.py
new file mode 100644
index 0000000..41d05a4
--- /dev/null
+++ b/examples/flux/model_inference_low_vram/FLUX.1-dev.py
@@ -0,0 +1,27 @@
+import torch
+from diffsynth.pipelines.flux_image_new import FluxImagePipeline, ModelConfig
+
+
+pipe = FluxImagePipeline.from_pretrained(  # low-VRAM example: the four FLUX.1-dev checkpoint files only
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[  # every entry requests offload_device="cpu" with offload_dtype=torch.float8_e4m3fn
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+    ],
+)
+pipe.enable_vram_management()  # NOTE(review): presumably activates the offload settings above -- confirm in flux_image_new.py
+
+prompt = "CG, masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait. The girl's flowing silver hair shimmers with every color of the rainbow and cascades down, merging with the floating flora around her."
+negative_prompt = "worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,"
+
+image = pipe(prompt=prompt, seed=0)  # first run: prompt and seed only; negative_prompt is not passed
+image.save("flux.jpg")
+
+image = pipe(  # second run: same prompt and seed, now with negative_prompt, cfg_scale=2 and 50 inference steps
+    prompt=prompt, negative_prompt=negative_prompt,
+    seed=0, cfg_scale=2, num_inference_steps=50,
+)
+image.save("flux_cfg.jpg")
diff --git a/examples/flux/model_inference_low_vram/Step1X-Edit.py b/examples/flux/model_inference_low_vram/Step1X-Edit.py
new file mode 100644
index 0000000..aad034f
--- /dev/null
+++ b/examples/flux/model_inference_low_vram/Step1X-Edit.py
@@ -0,0 +1,33 @@
+import torch
+from diffsynth.pipelines.flux_image_new import FluxImagePipeline, ModelConfig
+from PIL import Image
+import numpy as np
+
+
+pipe = FluxImagePipeline.from_pretrained(  # low-VRAM example: Qwen2.5-VL-7B-Instruct plus the Step1X-Edit model and VAE files
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+    model_configs=[  # every entry requests offload_device="cpu" with offload_dtype=torch.float8_e4m3fn
+        ModelConfig(model_id="Qwen/Qwen2.5-VL-7B-Instruct", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="stepfun-ai/Step1X-Edit", origin_file_pattern="step1x-edit-i1258.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+        ModelConfig(model_id="stepfun-ai/Step1X-Edit", origin_file_pattern="vae.safetensors", offload_device="cpu", offload_dtype=torch.float8_e4m3fn),
+    ],
+)
+pipe.enable_vram_management()  # NOTE(review): presumably activates the offload settings above -- confirm in flux_image_new.py
+
+image = Image.fromarray(np.zeros((1248, 832, 3), dtype=np.uint8) + 255)  # uniform 255-valued (white) canvas: 1248 rows x 832 columns x 3 channels
+image = pipe(  # edit 1: the white canvas is the reference image
+    prompt="draw red flowers in Chinese ink painting style",
+    step1x_reference_image=image,
+    width=832, height=1248, cfg_scale=6,
+    seed=1, rand_device='cuda'
+)
+image.save("image_1.jpg")
+
+image = pipe(  # edit 2: the output of edit 1 is passed back in as the reference image
+    prompt="add more flowers in Chinese ink painting style",
+    step1x_reference_image=image,
+    width=832, height=1248, cfg_scale=6,
+    seed=2, rand_device='cuda'
+)
+image.save("image_2.jpg")