Compare commits

1084 Commits

Author SHA1 Message Date
mi804
3da625432e path 2026-04-23 18:09:16 +08:00
mi804
002e3cdb74 docs 2026-04-23 18:02:58 +08:00
mi804
29bf66cdc9 Merge branch 'main' of https://github.com/modelscope/DiffSynth-Studio 2026-04-23 17:39:10 +08:00
mi804
a80fb84220 style 2026-04-23 17:31:34 +08:00
mi804
394db06d86 codes 2026-04-23 16:52:59 +08:00
mi804
1186379139 noncover 2026-04-22 21:36:30 +08:00
mi804
f2e3427566 reference audio input 2026-04-22 19:16:04 +08:00
mi804
c53c813c12 ace-step train 2026-04-22 17:58:10 +08:00
mi804
b0680ef711 low_vram 2026-04-22 12:47:38 +08:00
mi804
f5a3201d42 t2m 2026-04-21 20:12:15 +08:00
mi804
95cfb77881 t2m 2026-04-21 19:42:57 +08:00
Qifan Zhang
5c89a15b9a Reorder optimizer and logger calls in training loop (#1404) 2026-04-21 13:45:09 +08:00
mi804
9d09e0431c acestep t2m 2026-04-21 13:16:15 +08:00
mi804
a604d76339 pipeline_t2m 2026-04-17 17:45:52 +08:00
mi804
36c203da57 model-code 2026-04-17 17:06:26 +08:00
Hong Zhang
079e51c9f3 Support JoyAI-Image-Edit (#1393)
* auto integrate joyimage model

* joyimage pipeline

* train

* ready

* styling

* joyai-image docs

* update readme

* pr review
2026-04-15 16:57:11 +08:00
Zhongjie Duan
8f18e24597 skip audio loading if no audio in video (#1397) 2026-04-15 13:52:10 +08:00
Zhongjie Duan
45d973e87d update to version 2.0.8 (#1394) 2026-04-14 16:58:17 +08:00
Hong Zhang
c80fec2a56 add acceleration inference docs (#1371)
* add acceleration inference docs

* minor fix
2026-04-14 13:29:29 +08:00
Hong Zhang
b5d04ceb30 support ernie-image-turbo (#1391)
* support ernie-image-turbo

* pr review fix

* fix modelname
2026-04-14 11:35:43 +08:00
Hong Zhang
960d8c62c0 Support ERNIE-Image (#1389)
* ernie-image pipeline

* ernie-image inference and training

* style fix

* ernie docs

* lowvram

* final style fix

* pr-review

* pr-fix round2

* set uniform training weight

* fix

* update lowvram docs
2026-04-13 14:57:10 +08:00
Zhongjie Duan
f77b6357c5 add Discord link in README (#1390)
* add discord link

* add discord link
2026-04-13 14:50:48 +08:00
Zhongjie Duan
166e6d2d38 update version to 2.0.7 (#1370) 2026-03-24 17:37:54 +08:00
lzws
5e7e3db0af update flux.2-dev editing examples (#1369)
* add FireRed-Image-Edit-1.1

* flux.2-dev-edit

* flux.2-dev-edit

* flux.2-dev-edit
2026-03-24 17:32:45 +08:00
Hong Zhang
ae8cb139e8 Support Torch Compile (#1368)
* support simple compile

* add support for compile

* minor fix

* minor fix

* minor fix
2026-03-24 11:19:43 +08:00
Zhongjie Duan
e2a3a987da update LTX-2.3 doc (#1365) 2026-03-23 17:14:50 +08:00
Zhongjie Duan
f7b9ae7d57 update LTX-2.3 doc (#1364) 2026-03-23 17:10:53 +08:00
Cao Yuan
5d198287f0 [feat] add VACE sequence parallel (#1345)
* add VACE sequence parallel

* resolve conflict

---------

Co-authored-by: yuan <yuan@yuandeMacBook-Pro.local>
Co-authored-by: Hong Zhang <41229682+mi804@users.noreply.github.com>
2026-03-23 15:46:27 +08:00
Zhongjie Duan
5bccd60c80 compatibility patch (#1363) 2026-03-23 11:24:49 +08:00
Zhongjie Duan
078fc551d9 update doc (#1362) 2026-03-20 17:16:47 +08:00
Zhongjie Duan
52ba5d414e Support WanToDance (#1361)
* support wantodance

* update docs

* bugfix
2026-03-20 16:40:35 +08:00
Zhongjie Duan
ba0626e38f add example_dataset in training scripts (#1358)
* add example_dataset in training scripts

* fix example datasets
2026-03-18 15:37:03 +08:00
Hong Zhang
4ec4d9c20a Merge pull request #1354 from mi804/low_vram_training_ds
low vram training with deepspeed zero3
2026-03-17 16:09:52 +08:00
Zhongjie Duan
7a80f10fa4 update to 2.0.6 (#1350) 2026-03-13 19:36:59 +08:00
Artiprocher
3bd5188b3e update to 2.0.6 2026-03-13 19:36:33 +08:00
Zhongjie Duan
7650e9381e Update audio.py (#1349) 2026-03-13 17:57:14 +08:00
Hong Zhang
8c9ddc9274 support loading ltx2.3 stage2lora by statedict (#1348)
* support ltx2.3 stage2lora by statedict

* bug fix

* bug fix
2026-03-13 17:19:18 +08:00
Hong Zhang
681df93a85 Mova (#1337)
* support mova inference

* mova media_io

* add unified audio_video api & fix bug of mono audio input for ltx

* support mova train

* mova docs

* fix bug
2026-03-13 13:06:07 +08:00
Hong Zhang
4741542523 Ltx2.3 a2v & retake video and audio (#1346)
* temp commit

* support ltx2 a2v

* support ltx2.3 retake video and audio

* add news

* minor fix
2026-03-12 14:16:01 +08:00
Hong Zhang
c927062546 Merge pull request #1343 from mi804/ltx2.3_multiref
Ltx2.3 multiref
2026-03-10 17:31:05 +08:00
Zhongjie Duan
f3ebd6f714 Merge pull request #1342 from modelscope/ltx2-default-prompt
add default negative prompt of ltx2
2026-03-10 15:10:51 +08:00
Artiprocher
959471f083 add default negative prompt of ltx2 2026-03-10 15:10:03 +08:00
Hong Zhang
d9228074bd refactor ltx2 stage2 pipeline (#1341)
* refactor ltx2 pipeline

* fix bug
2026-03-10 13:55:40 +08:00
Hong Zhang
b272253956 Ltx2.3 i2v training and sample frames with fixed fps (#1339)
* add 2.3 i2v training scripts

* add frame resampling by fixed fps

* LoadVideo: add compatibility for not fix_frame_rate

* refactor frame resampler

* minor fix
2026-03-09 20:32:02 +08:00
Hong Zhang
7bc5611fb8 ltx2.3 bugfix & ic lora (#1336)
* ltx2.3 ic lora inference&train

* temp commit

* fix first frame train-inference consistency

* minor fix
2026-03-09 16:33:19 +08:00
Zhongjie Duan
f7d23c6551 Merge pull request #1338 from modelscope/cache-remove
remove unnecessary params in cache
2026-03-09 14:11:59 +08:00
Artiprocher
13eff18e7d remove unnecessary params in cache 2026-03-09 14:09:30 +08:00
Zhongjie Duan
a38954b72c Merge pull request #1334 from mi804/ltx2.3
ltx2.3 train
2026-03-06 18:10:13 +08:00
mi804
d40efe897f ltx2.3 train 2026-03-06 18:08:42 +08:00
Zhongjie Duan
c9c2561791 Merge pull request #1333 from mi804/ltx2.3
ltx2.3 docs
2026-03-06 16:53:56 +08:00
mi804
0139b042e0 fix link 2026-03-06 16:48:55 +08:00
mi804
ed9e4374af ltx2.3 docs 2026-03-06 16:45:12 +08:00
Zhongjie Duan
2a0eb9c383 support ltx2.3 inference (#1332) 2026-03-06 16:24:53 +08:00
mi804
73b13f4c86 support ltx2.3 inference 2026-03-06 16:07:17 +08:00
lzws
75ebd797da add FireRed-Image-Edit-1.1 (#1331) 2026-03-06 15:08:02 +08:00
Zhongjie Duan
31ba103d8e Merge pull request #1330 from modelscope/ses-doc
Research Tutorial Sec 2
2026-03-06 14:25:45 +08:00
Zhongjie Duan
c5aaa1da41 Merge pull request #1306 from mi804/layercontrol_v2
qwen_image layercontrol v2
2026-03-03 21:06:25 +08:00
Zhongjie Duan
6bcb99fd2e Merge branch 'main' into layercontrol_v2 2026-03-03 21:04:04 +08:00
Zhongjie Duan
ab8f455c46 Merge pull request #1322 from modelscope/vram-bugfix
bugfix
2026-03-03 15:34:06 +08:00
Artiprocher
add6f88324 bugfix 2026-03-03 15:33:42 +08:00
Zhongjie Duan
430b495100 Merge pull request #1321 from mi804/bugfix
fix qwen_text_encoder bug in transformers>=5.2.0
2026-03-03 13:02:45 +08:00
mi804
62ba8a3f2e fix qwen_text_encoder bug in transformers>=5.2.0 2026-03-03 12:44:36 +08:00
Zhongjie Duan
237d178733 Fix LoRA compatibility issues. (#1320) 2026-03-03 11:08:31 +08:00
Zhongjie Duan
b3ef224042 support Anima gradient checkpointing (#1319) 2026-03-02 19:06:55 +08:00
Zhongjie Duan
f43b18ec21 Update docs (#1318)
* update docs
2026-03-02 18:59:13 +08:00
Zhongjie Duan
6d671db5d2 Support Anima (#1317)
* support Anima

Co-authored-by: mi804 <1576993271@qq.com>
2026-03-02 18:49:02 +08:00
mi804
07f5d88ac9 update modelid 2026-03-02 17:41:47 +08:00
Zhongjie Duan
880231b4be Merge pull request #1315 from modelscope/docs2.0
update ltx-2 docs
2026-03-02 11:02:20 +08:00
mi804
b3f6c3275f update ltx-2 2026-03-02 10:58:02 +08:00
Zhongjie Duan
29cd5c7612 Merge pull request #1275 from Mr-Neutr0n/fix-dit-none-check
Fix AttributeError when pipe.dit is None during split training
2026-03-02 10:25:11 +08:00
Zhongjie Duan
ff4be1c7c7 Merge pull request #1293 from Mr-Neutr0n/fix/trajectory-loss-div-by-zero
fix: prevent division by zero in TrajectoryImitationLoss at final denoising step
2026-03-02 10:21:39 +08:00
Zhongjie Duan
6b0fb1601f Merge pull request #1296 from Explorer-Dong/fix/wan_vae
fix: WanVAE2.2 encode and decode error
2026-03-02 10:19:36 +08:00
Zhongjie Duan
4b400c07eb Merge pull request #1297 from Feng0w0/npu_fused
[doc][NPU]Documentation on modifications, NPU environment installation, and additional parameter
2026-03-02 10:16:01 +08:00
Zhongjie Duan
6a6ae6d791 Merge pull request #1312 from mi804/ltx2-iclora
Ltx2 iclora
2026-02-28 12:45:16 +08:00
mi804
1a380a6b62 minor fix 2026-02-28 11:09:10 +08:00
mi804
5ca74923e8 add readme 2026-02-28 10:56:08 +08:00
mi804
8b9a094c1b ltx iclora train 2026-02-27 18:43:53 +08:00
mi804
5996c2b068 support inference 2026-02-27 16:48:16 +08:00
Zhongjie Duan
8fc7e005a6 Merge pull request #1309 from mi804/ltx2-train
support ltx2 gradient_checkpointing
2026-02-26 19:31:04 +08:00
mi804
a18966c300 support ltx2 gradient_checkpointing 2026-02-26 19:19:59 +08:00
Zhongjie Duan
a87910bc65 Merge pull request #1307 from mi804/ltx2-train
Support LTX-2 training.
2026-02-26 11:39:09 +08:00
mi804
f48662e863 update docs 2026-02-26 11:10:00 +08:00
mi804
8d8bfc7f54 minor fix 2026-02-25 19:04:10 +08:00
mi804
8e15dcd289 support ltx2 train -2 2026-02-25 18:06:02 +08:00
mi804
586ac9d8a6 support ltx-2 training 2026-02-25 17:19:57 +08:00
Hong Zhang
625b5ff16d Apply suggestion from @gemini-code-assist[bot]
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-02-24 15:26:49 +08:00
mi804
ee73a29885 qwen_image layercontrol v2 2026-02-24 15:19:16 +08:00
Zhongjie Duan
288bbc7128 Merge pull request #1299 from modelscope/firered
support FireRed
2026-02-15 14:18:13 +08:00
Zhongjie Duan
5002ac74dc Update Qwen-Image.md 2026-02-15 14:15:44 +08:00
Zhongjie Duan
863a6ba597 Merge branch 'main' into firered 2026-02-15 14:12:44 +08:00
Artiprocher
b08bc1470d support firered 2026-02-15 14:02:50 +08:00
feng0w0
96143aa26b Merge branch 'npu_fused' of https://github.com/Feng0w0/DiffSynth-Studio into npu_fused 2026-02-13 10:06:39 +08:00
feng0w0
71cea4371c [doc][NPU]Documentation on modifications, NPU environment installation, and additional parameter 2026-02-13 09:58:27 +08:00
Mr_Dwj
fc11fd4297 chore: remove invalid comment code 2026-02-13 09:38:14 +08:00
Mr_Dwj
bd3c5822a1 fix: WanVAE2.2 decode error 2026-02-13 01:13:08 +08:00
Mr_Dwj
96fb0f3afe fix: unpack Resample38 output 2026-02-12 23:51:56 +08:00
Mr-Neutr0n
b68663426f fix: preserve sign of denominator in clamp to avoid inverting gradient direction
The previous .clamp(min=1e-6) on (sigma_ - sigma) flips the sign when
the denominator is negative (which is the typical case since sigmas
decrease monotonically). This would invert the target and cause
training divergence.

Use torch.sign(denom) * torch.clamp(denom.abs(), min=1e-6) instead,
which prevents division by zero while preserving the correct sign.
2026-02-11 21:04:55 +05:30
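The sign-preserving clamp this commit describes can be sketched as follows (a minimal illustration of the idea, not the repository's actual code; the name `safe_divide` is hypothetical):

```python
import torch

def safe_divide(numerator: torch.Tensor, denom: torch.Tensor,
                eps: float = 1e-6) -> torch.Tensor:
    # A naive `denom.clamp(min=eps)` turns a negative denominator into
    # +eps, flipping the sign of the result. Clamping only the magnitude
    # and restoring the original sign avoids division by zero without
    # inverting the target.
    safe_denom = torch.sign(denom) * torch.clamp(denom.abs(), min=eps)
    return numerator / safe_denom

# Sigmas decrease monotonically, so (sigma_ - sigma) is typically negative;
# the result keeps its negative sign instead of exploding positive.
print(safe_divide(torch.tensor([1.0]), torch.tensor([-1e-9])))
```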
Mr-Neutr0n
0e6976a0ae fix: prevent division by zero in trajectory imitation loss at last step 2026-02-11 19:51:25 +05:30
Hong Zhang
94b57e9677 Fix readthedocs rendering (#1290)
* test latex

* test latex

* fix conf
2026-02-11 11:32:27 +08:00
Hong Zhang
3fb037d33a Correct hyperlinks for docs 2026-02-10 20:59:47 +08:00
Hong Zhang
b3b63fef3e Add readthedocs for diffsynth-studio
* add conf docs

* add conf docs

* add index

* add index

* update ref

* test root

* add en

* test relative

* redirect relative

* add document

* test_document

* test_document
2026-02-10 19:51:04 +08:00
Zhongjie Duan
f6d85f3c2e Merge pull request #1282 from mi804/ltx-2
add inference script for ltx-2 lora
2026-02-10 15:13:06 +08:00
mi804
2f22e598b7 fix load lora 2026-02-10 15:06:04 +08:00
Hong Zhang
888caf8b88 Update README_zh.md
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-02-10 14:48:19 +08:00
mi804
b6e39c97af add inference script for ltx-2 lora 2026-02-10 14:32:30 +08:00
Zhongjie Duan
02124c4034 Merge pull request #1280 from modelscope/issue-fix
fix mixed-precision issues in low-version torch
2026-02-10 11:14:12 +08:00
Artiprocher
fddc98ff16 fix mixed-precision issues in low-version torch 2026-02-10 11:12:50 +08:00
Zhongjie Duan
0dfcd25cf3 Merge pull request #1278 from modelscope/issue-fix
update lora loading in docs
2026-02-10 10:50:18 +08:00
Artiprocher
ff10fde47f update lora loading in docs 2026-02-10 10:48:44 +08:00
Zhongjie Duan
dc94614c80 Merge pull request #1256 from Feng0w0/npu_fused
[model][NPU]:Add NPU fusion operator patch to Zimage model to improve performance
2026-02-09 20:08:44 +08:00
feng0w0
e56a4d5730 [model][NPU]:Add NPU fusion operator patch to Zimage model to improve performance 2026-02-09 12:31:34 +08:00
feng0w0
3f8468893a [model][NPU]:Add NPU fusion operator patch to Zimage model to improve performance 2026-02-09 09:51:06 +08:00
Mr-Neutr0n
6383ec358c Fix AttributeError when pipe.dit is None
When using split training with 'sft:data_process' task, the DiT model
is not loaded but the attribute 'dit' exists with value None. The
existing hasattr check returns True but then accessing siglip_embedder
fails.

Add an explicit None check before accessing pipe.dit.siglip_embedder.

Fixes #1246
2026-02-07 05:23:11 +05:30
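The pattern behind this fix — `hasattr` alone does not guard against an attribute that exists but is `None` — can be illustrated with a minimal sketch (the `Pipe` and `DiT` classes here are stand-ins, not the project's real classes):

```python
class DiT:
    siglip_embedder = "embedder"

class Pipe:
    def __init__(self, load_dit: bool):
        # In split training ('sft:data_process'), the DiT model is not
        # loaded, but the attribute still exists with value None.
        self.dit = DiT() if load_dit else None

def get_embedder(pipe):
    # hasattr(pipe, "dit") is True even when pipe.dit is None, so an
    # explicit None check is needed before accessing siglip_embedder.
    if hasattr(pipe, "dit") and pipe.dit is not None:
        return pipe.dit.siglip_embedder
    return None

print(get_embedder(Pipe(load_dit=False)))  # None instead of AttributeError
```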
Zhongjie Duan
1b47e1dc22 Merge pull request #1272 from modelscope/zero3-fix
Support DeepSpeed ZeRO 3
2026-02-06 16:33:12 +08:00
Artiprocher
b0bf78e915 refine code & doc 2026-02-06 16:27:23 +08:00
Zhongjie Duan
abdf66d09e Merge pull request #1265 from lzws/main
fix wanS2V bug and update readme
2026-02-06 10:22:48 +08:00
lzws
27b1fe240b add examples 2026-02-05 17:17:10 +08:00
lzws
1635897516 update readme 2026-02-05 16:56:39 +08:00
lzws
8d172127cd fix wans2v bug and update readme 2026-02-05 16:52:38 +08:00
feng0w0
fccb1ecdd7 Initialize qwen-image on CPU 2026-02-05 11:54:36 +08:00
Zhongjie Duan
c0f7e1db7c Merge pull request #1261 from modelscope/examples-update
update examples
2026-02-05 11:11:35 +08:00
Artiprocher
53890bafa4 update examples 2026-02-05 11:10:55 +08:00
feng0w0
6886f7ba35 fix wan decoder bug 2026-02-05 10:31:41 +08:00
Zhongjie Duan
afd48cd706 Merge pull request #1259 from mi804/multi_controlnet
add example for multiple controlnet
2026-02-04 17:04:11 +08:00
mi804
24b68c2392 add example for multiple controlnet 2026-02-04 16:52:39 +08:00
Zhongjie Duan
280ff7cca6 Merge pull request #1229 from Feng0w0/wan_rope
[bugfix][NPU]:Fix bug that correctly obtains device type
2026-02-04 13:26:00 +08:00
Zhongjie Duan
b4b62e2f7c Merge pull request #1221 from Feng0w0/usp_npu
[NPU]:Support USP feature in NPU
2026-02-04 13:25:24 +08:00
feng0w0
051b957adb [model][NPU] Add NPU fusion operator patch to Zimage model to improve performance 2026-02-03 19:50:21 +08:00
feng0w0
ca9b5e64ea [feature]:Add adaptation of all models to zero3 2026-02-03 15:44:53 +08:00
Zhongjie Duan
6d1be405b9 Merge pull request #1242 from mi804/ltx-2
LTX-2
2026-02-03 13:07:41 +08:00
Zhongjie Duan
25c3a3d3e2 Merge branch 'main' into ltx-2 2026-02-03 13:06:44 +08:00
mi804
49bc84f78e add comment for tuple noise_pred 2026-02-03 10:43:25 +08:00
mi804
25a9e75030 final fix for ltx-2 2026-02-03 10:39:35 +08:00
mi804
2a7ac73eb5 minor fix 2026-02-02 20:07:08 +08:00
mi804
f4f991d409 support ltx-2 t2v and i2v 2026-02-02 19:53:07 +08:00
Zhongjie Duan
a781138413 Merge pull request #1245 from modelscope/docs-update
update docs
2026-02-02 17:00:04 +08:00
Artiprocher
91a5623976 update docs 2026-02-02 16:52:12 +08:00
Zhongjie Duan
28cd355aba Merge pull request #1232 from huarzone/main
fix wan i2v/ti2v train bug
2026-02-02 15:26:01 +08:00
Zhongjie Duan
005389fca7 Merge pull request #1244 from modelscope/qwen-image-edit-lightning
Qwen image edit lightning
2026-02-02 15:20:11 +08:00
Artiprocher
a6282056eb fix typo 2026-02-02 15:19:19 +08:00
Zhongjie Duan
21a6eb8e2f Merge pull request #1243 from modelscope/research_tutorial_1
add research tutorial sec 1
2026-02-02 14:29:39 +08:00
Artiprocher
98ab238340 add research tutorial sec 1 2026-02-02 14:28:26 +08:00
feng0w0
2070bbd925 [feature]:Add adaptation of all models to zero3 2026-01-31 16:50:18 +08:00
mi804
1c8a0f8317 refactor patchify 2026-01-31 13:55:52 +08:00
mi804
9f07d65ebb support ltx2 distilled pipeline 2026-01-30 17:40:30 +08:00
lzws
5f1d5adfce qwen-image-edit-2511-lightning 2026-01-30 17:26:26 +08:00
mi804
4f23caa55f support ltx2 two stage pipeline & vram 2026-01-30 16:55:40 +08:00
Zhongjie Duan
b4f6a4de6c Merge pull request #1240 from modelscope/loader-update
Loader update
2026-01-30 13:51:17 +08:00
Artiprocher
53fe42af1b update version 2026-01-30 13:49:27 +08:00
Artiprocher
ee9a3b4405 support loading models from state dict 2026-01-30 13:47:36 +08:00
mi804
b1a2782ad7 support ltx2 one-stage pipeline 2026-01-29 16:30:15 +08:00
mi804
8d303b47e9 add audio_vae, audio_vocoder, text_encoder, connector and upsampler for ltx2 2026-01-28 16:09:22 +08:00
mi804
00da4b6c4f add video_vae and dit for ltx-2 2026-01-27 19:34:09 +08:00
Zhongjie Duan
22695e9be0 Merge pull request #1233 from modelscope/z-image-release
Z-Image and Z-Image-i2L
2026-01-27 18:41:28 +08:00
feng0w0
3140199c96 [feature]:Add adaptation of all models to zero3 2026-01-27 15:33:42 +08:00
Artiprocher
98290190ec update z-image-i2L demo 2026-01-27 13:42:48 +08:00
Artiprocher
3f4de2cc7f update z-image-i2L examples 2026-01-27 12:16:48 +08:00
Kared
8d0df403ca fix wan i2v train bug 2026-01-27 03:55:36 +00:00
feng0w0
4e9db263b0 [feature]:Add adaptation of all models to zero3 2026-01-27 11:24:43 +08:00
Artiprocher
d12bf71bcc support z-image and z-image-i2L 2026-01-27 10:56:15 +08:00
feng0w0
35e0776022 [bugfix][NPU]:Fix bug that correctly obtains device type 2026-01-23 10:45:03 +08:00
Zhongjie Duan
ffb7a138f7 Merge pull request #1228 from modelscope/klein-bugfix
change klein image resize to crop
2026-01-22 10:34:17 +08:00
Artiprocher
548304667f change klein image resize to crop 2026-01-22 10:33:29 +08:00
Zhongjie Duan
273143136c Merge pull request #1227 from modelscope/modelscope-service-patch
update to 2.0.3
2026-01-21 20:23:13 +08:00
Artiprocher
030ebe649a update to 2.0.3 2026-01-21 20:22:43 +08:00
Zhongjie Duan
90921d2293 Merge pull request #1226 from modelscope/klein-train-fix
improve flux2 training performance
2026-01-21 15:44:52 +08:00
Artiprocher
b61131c693 improve flux2 training performance 2026-01-21 15:44:15 +08:00
Zhongjie Duan
37fbb3248a Merge pull request #1222 from modelscope/trainer-update
support auto detect lora target modules
2026-01-21 11:06:19 +08:00
Artiprocher
d13f533f42 support auto detect lora target modules 2026-01-21 11:05:05 +08:00
feng0w0
b3cc652dea [NPU]:Support USP feature in NPU 2026-01-21 10:38:27 +08:00
feng0w0
d879d66c62 [NPU]:Support USP feature in NPU 2026-01-21 10:34:09 +08:00
feng0w0
848bfd6993 [NPU]:Support USP feature in NPU 2026-01-21 10:25:31 +08:00
feng0w0
269da09f6e Merge branch 'main' of https://github.com/modelscope/DiffSynth-Studio into usp_npu 2026-01-21 10:00:08 +08:00
feng0w0
e30514a00c Merge branch 'main' of https://github.com/Feng0w0/DiffSynth-Studio into usp_npu 2026-01-21 09:59:18 +08:00
Zhongjie Duan
3743b1307c Merge pull request #1219 from modelscope/klein-edit
support klein edit
2026-01-20 12:59:12 +08:00
Artiprocher
a835df984c support klein edit 2026-01-20 12:58:18 +08:00
Zhongjie Duan
3e4b47e424 Merge pull request #1207 from Feng0w0/cuda_replace
[NPU]:Replace 'cuda' in the project with abstract interfaces
2026-01-20 10:13:04 +08:00
Zhongjie Duan
dd8d902624 Merge branch 'main' into cuda_replace 2026-01-20 10:12:31 +08:00
Zhongjie Duan
a8b340c098 Merge pull request #1191 from Feng0w0/wan_rope
[model][NPU]:Wan model rope use torch.complex64 in NPU
2026-01-20 10:05:22 +08:00
Zhongjie Duan
88497b5c13 Merge pull request #1217 from modelscope/klein-update
support klein base models
2026-01-19 21:14:47 +08:00
Artiprocher
1e90c72d94 support klein base models 2026-01-19 21:11:58 +08:00
Zhongjie Duan
3dd82a738e Merge pull request #1215 from lzws/main
update learning rate in wan-vace training scripts
2026-01-19 17:48:42 +08:00
Artiprocher
8ad2d9884b update lr in wan-vace training scripts 2026-01-19 17:43:07 +08:00
Artiprocher
70f531b724 update wan-vace training scripts 2026-01-19 17:37:30 +08:00
Zhongjie Duan
37c2868b61 Merge pull request #1214 from modelscope/klein
Support FLUX.2-klein
2026-01-19 17:36:39 +08:00
Artiprocher
a18e6233b5 update wan-vace training scripts 2026-01-19 17:35:08 +08:00
Artiprocher
2336d5f6b3 update doc 2026-01-19 17:27:32 +08:00
Artiprocher
b6ccb362b9 support flux.2 klein 2026-01-19 16:56:14 +08:00
Artiprocher
ae52d93694 support klein 4b models 2026-01-16 13:09:41 +08:00
feng0w0
ad91d41601 [NPU]:Replace 'cuda' in the project with abstract interfaces 2026-01-16 10:28:24 +08:00
feng0w0
dce77ec4d1 [NPU]:Replace 'cuda' in the project with abstract interfaces 2026-01-15 20:35:41 +08:00
feng0w0
5c0b07d939 [NPU]:Replace 'cuda' in the project with abstract interfaces 2026-01-15 20:34:52 +08:00
feng0w0
19e429d889 Merge remote-tracking branch 'origin/cuda_replace' into cuda_replace 2026-01-15 20:33:21 +08:00
feng0w0
209a350c0f [NPU]:Replace 'cuda' in the project with abstract interfaces 2026-01-15 20:33:01 +08:00
feng0w0
a3c2744a43 [NPU]:Replace 'cuda' in the project with abstract interfaces 2026-01-15 20:04:54 +08:00
Zhongjie Duan
55e8346da3 Blog link (#1202)
* update README
2026-01-15 12:31:55 +08:00
Zhongjie Duan
b7979b2633 Merge pull request #1200 from modelscope/flux-compatibility-fix
fix flux compatibility issues
2026-01-14 20:50:18 +08:00
Artiprocher
c90aaa2798 fix flux compatibility issues 2026-01-14 20:49:36 +08:00
Zhongjie Duan
0c617d5d9e Merge pull request #1194 from lzws/main
wan usp bug fix
2026-01-14 16:34:06 +08:00
lzws
fd87b72754 wan usp bug fix 2026-01-14 16:33:02 +08:00
Zhongjie Duan
db75508ba0 Merge pull request #1199 from modelscope/z-image-bugfix
fix RMSNorm precision
2026-01-14 16:32:33 +08:00
Artiprocher
acba342a63 fix RMSNorm precision 2026-01-14 16:29:43 +08:00
feng0w0
d16877e695 [model][NPU]:Wan model rope use torch.complex64 in NPU 2026-01-13 11:17:51 +08:00
lzws
e99cdcf3b8 wan usp bug fix 2026-01-12 22:08:48 +08:00
Zhongjie Duan
a236a17f17 Merge pull request #1193 from modelscope/qwen-image-layered-control
support qwen-image-layered-control
2026-01-12 17:24:06 +08:00
Artiprocher
03e530dc39 support qwen-image-layered-control 2026-01-12 17:20:01 +08:00
feng0w0
6be244233a [model][NPU]:Wan model rope use torch.complex64 in NPU 2026-01-12 11:34:41 +08:00
feng0w0
544c391936 [model][NPU]:Wan model rope use torch.complex64 in NPU 2026-01-12 11:24:11 +08:00
Feng
f4d06ce3fc Merge branch 'modelscope:main' into wan_rope 2026-01-12 11:21:09 +08:00
Zhongjie Duan
ffedb9eb52 Merge pull request #1187 from jiaqixuac/patch-1
Update package inclusion pattern in pyproject.toml
2026-01-12 10:12:20 +08:00
Zhongjie Duan
381067515c Merge pull request #1176 from Feng0w0/z-image-rope
[model][NPU]: Z-image model support NPU
2026-01-12 10:11:22 +08:00
Zhongjie Duan
00f2d1aa5d Merge pull request #1169 from Feng0w0/sample_add
Docs:Supplement NPU training script samples and documentation instruction
2026-01-12 10:08:38 +08:00
Zhongjie Duan
8cc3bece6d Merge pull request #1167 from Feng0w0/install_env
Docs:Supplement NPU environment installation document
2026-01-12 10:07:30 +08:00
Jiaqi Xu
f4bf592064 Update pyproject.toml
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-01-10 09:32:35 +08:00
Jiaqi Xu
3235393fb5 Update package inclusion pattern in pyproject.toml
Update to install all the sub-packages inside diffsynth; otherwise, the installed package contains only __init__.py
2026-01-10 09:28:45 +08:00
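A plausible shape of such a fix (hypothetical fragment, assuming setuptools package discovery — the repository's actual `pyproject.toml` may differ): listing only the bare package name matches the top level alone, so the built wheel ships little more than `__init__.py`, while a wildcard pattern also includes every nested sub-package.

```toml
[tool.setuptools.packages.find]
# "diffsynth" matches only the top-level package; "diffsynth.*" also
# pulls in nested sub-packages (e.g. diffsynth.models) so they are
# installed alongside it.
include = ["diffsynth", "diffsynth.*"]
```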
feng0w0
3b662da31e [model][NPU]:Wan model rope use torch.complex64 in NPU 2026-01-09 18:11:40 +08:00
feng0w0
19ce3048c1 [model][NPU]:Wan model rope use torch.complex64 in NPU 2026-01-09 18:06:41 +08:00
Zhongjie Duan
de0aa946f7 Merge pull request #1184 from modelscope/z-image-omni-base-dev
update package version
2026-01-08 17:27:33 +08:00
Artiprocher
f376202a49 update package version 2026-01-08 17:26:29 +08:00
Zhongjie Duan
a13ecfc46b Merge pull request #1183 from modelscope/z-image-omni-base-dev
fix unused parameters in z-image-omni-base
2026-01-08 17:03:20 +08:00
Artiprocher
10a1853eda fix unused parameters in z-image-omni-base 2026-01-08 17:02:41 +08:00
Zhongjie Duan
0efab85674 Support Z-Image-Omni-Base and its related models
Support Z-Image-Omni-Base and its related models.
2026-01-08 13:43:59 +08:00
Artiprocher
f45a0ffd02 support z-image-omni-base vram management 2026-01-08 13:41:00 +08:00
Artiprocher
8ba528a8f6 bugfix 2026-01-08 13:21:33 +08:00
Artiprocher
dd479e5bff support z-image-omni-base-i2L 2026-01-07 20:36:53 +08:00
Artiprocher
bac39b1cd2 support z-image controlnet 2026-01-07 15:56:53 +08:00
feng0w0
c1c9a4853b [model][NPU]:Z-image model support NPU 2026-01-07 11:42:19 +08:00
feng0w0
3ee5f53a36 [model][NPU]:Z-image model support NPU 2026-01-07 11:31:22 +08:00
Artiprocher
32449a6aa0 support z-image-omni-base training 2026-01-05 20:04:00 +08:00
Zhongjie Duan
a6884f6b3a Merge pull request #1171 from YZBPXX/main
Fix issue where LoRA loads on a device different from DiT
2026-01-05 16:39:02 +08:00
Zhongjie Duan
b078666640 Merge pull request #1173 from modelscope/flux-compatibility-patch
flux compatibility patch
2026-01-05 16:20:25 +08:00
Artiprocher
7604ca1e52 flux compatibility patch 2026-01-05 16:04:20 +08:00
feng0w0
62c3d406d9 Docs:Supplement NPU training script samples and documentation instruction 2026-01-05 15:42:55 +08:00
Artiprocher
5745c9f200 support z-image-omni-base 2026-01-05 14:45:01 +08:00
feng0w0
86829120c2 Docs:Supplement NPU training script samples and documentation instruction 2026-01-05 09:59:11 +08:00
yaozhengbing
60ac96525b Fix issue where LoRA loads on a device different from DiT 2025-12-31 21:31:01 +08:00
feng0w0
07b1f5702f Docs:Supplement NPU training script samples and documentation instruction 2025-12-31 10:01:21 +08:00
feng0w0
507e7e5d36 Docs:Supplement NPU training script samples and documentation instruction 2025-12-30 19:58:47 +08:00
Zhongjie Duan
ab8580f77e Merge pull request #1166 from modelscope/qwen-image-2512
support qwen-image-2512
2025-12-30 16:47:07 +08:00
Artiprocher
6454259853 support qwen-image-2512 2025-12-30 16:43:41 +08:00
feng0w0
9cc1697d4d Docs:Supplement NPU environment installation document 2025-12-30 15:57:13 +08:00
feng0w0
c758769a02 Training quick start 2025-12-29 09:25:46 +08:00
feng0w0
a5935e973a Training quick start 2025-12-29 09:23:59 +08:00
feng0w0
9834d72e4d Docs: environment installation quick start 2025-12-27 16:11:27 +08:00
feng0w0
01234e59c0 Docs: environment installation quick start 2025-12-27 15:01:10 +08:00
Zhongjie Duan
8f1d10fb43 Merge pull request #1150 from modelscope/qwen-image-layered
support qwen-image-layered
2025-12-20 14:05:38 +08:00
Artiprocher
20e1aaf908 bugfix 2025-12-20 14:00:22 +08:00
Artiprocher
c6722b3f56 support qwen-image-layered 2025-12-19 19:06:37 +08:00
Zhongjie Duan
11315d7a40 Merge pull request #1147 from modelscope/qwen-image-edit-2511
Qwen image edit 2511
2025-12-18 19:23:44 +08:00
Artiprocher
68d97a9844 update doc 2025-12-18 19:22:22 +08:00
Artiprocher
4629d4cf9e support qwen-image-edit-2511 2025-12-18 19:16:52 +08:00
Zhongjie Duan
3cb5cec906 Merge pull request #1143 from modelscope/readme-update
update README
2025-12-17 16:32:29 +08:00
Artiprocher
b7e16b9034 update README 2025-12-17 16:30:41 +08:00
Zhongjie Duan
83d1e7361f Merge pull request #1136 from modelscope/bugfix-device
bugfix
2025-12-16 16:12:05 +08:00
Artiprocher
1547c3f786 bugfix 2025-12-16 16:09:29 +08:00
Zhongjie Duan
bfaaf12bf4 Merge pull request #1129 from modelscope/ascend
Support Ascend NPU
2025-12-15 19:13:40 +08:00
Zhongjie Duan
47545e1aab Merge pull request #1126 from Leoooo333/main
Fixed: Wan S2V Long video severe quality downgrade
2025-12-15 19:09:39 +08:00
Artiprocher
7c6905a432 support ascend npu 2025-12-15 15:50:12 +08:00
Artiprocher
2883bc1b76 support ascend npu 2025-12-15 15:48:42 +08:00
Zhongjie Duan
78d8842ddf Merge pull request #1128 from modelscope/amd_install
update installation instructions for AMD
2025-12-15 14:35:50 +08:00
Artiprocher
5821a664a0 update AMD GPU support 2025-12-15 14:30:13 +08:00
Zhongjie Duan
ab9aa1a087 Merge pull request #1124 from lzws/main
add wan usp example
2025-12-15 12:57:58 +08:00
Junming Chen
a4d34d9f3d Append: set video compress quality as original version. 2025-12-14 20:53:26 +00:00
Junming Chen
127cc9007a Fixed: S2V Long video severe quality downgrade 2025-12-14 20:30:34 +00:00
lzws
e1f5db5f5c add wan usp example 2025-12-12 20:24:27 +08:00
Zhongjie Duan
e316fb717f Merge pull request #1122 from modelscope/flux-lora-revert
revert FluxLoRAConverter due to dependency issues
2025-12-12 17:19:48 +08:00
Artiprocher
64c5139502 revert FluxLoRAConverter due to dependency issues 2025-12-12 17:19:13 +08:00
Mahdi-CV
5da9611a74 Update README.md
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-12-11 09:57:15 -08:00
Mahdi-CV
733750d01b Update README.md
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-12-11 09:57:06 -08:00
Mahdi-CV
edc95359d0 Update README_zh.md
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-12-11 09:56:48 -08:00
lzws
f2d0241e26 Update Z-Image.md 2025-12-11 16:43:38 +08:00
lzws
7b5d7f4af5 Update Z-Image.md 2025-12-11 16:41:46 +08:00
Mahdi Ghodsi
1fa9a6c60c updated README both Eng and Ch to reflect the AMD installation 2025-12-10 16:14:56 -08:00
Mahdi Ghodsi
51efa128d3 adding amd requirements file 2025-12-10 14:40:38 -08:00
Zhongjie Duan
421c6a5fce Merge pull request #1109 from modelscope/bugfix1
fix typo
2025-12-09 23:30:15 +09:00
Artiprocher
864080d8f2 fix typo 2025-12-09 22:29:50 +08:00
Zhongjie Duan
ba372dd295 Merge pull request #1108 from modelscope/i2L
Qwen-Image-i2L (Image to LoRA)
2025-12-09 23:10:02 +09:00
Artiprocher
1ceb02f673 update README 2025-12-09 22:08:47 +08:00
Artiprocher
30f93161fb support i2L 2025-12-09 22:07:35 +08:00
Zhongjie Duan
3ee3cc3104 Merge pull request #1093 from modelscope/diffsynth-2.0-patch
DiffSynth-Studio 2.0 major update
2025-12-04 16:38:31 +08:00
root
c2218f5c73 DiffSynth-Studio 2.0 major update 2025-12-04 16:34:24 +08:00
root
72af7122b3 DiffSynth-Studio 2.0 major update 2025-12-04 16:33:07 +08:00
Zhongjie Duan
afd101f345 Merge pull request #1058 from modelscope/download
support downloading resource
2025-11-18 10:30:16 +08:00
Artiprocher
1313f4dd63 support downloading resource 2025-11-18 10:29:07 +08:00
Zhongjie Duan
8332ecebb7 Merge pull request #1034 from modelscope/video_as_prompt
Video as prompt
2025-11-04 17:32:50 +08:00
Zhongjie Duan
401d7d74a5 Merge pull request #1025 from krahets/patch-1
Fix sinusoidal_embedding calculation for bf16 precision.
2025-11-04 15:08:11 +08:00
Yudong Jin
b8d7d55568 Fix dtype issue in time embedding calculation 2025-11-01 03:11:03 +08:00
Zhongjie Duan
a30ed9093f Merge pull request #1018 from modelscope/longcat
support LongCat-Video
2025-10-30 13:45:03 +08:00
Artiprocher
b73e713028 support LongCat-Video 2025-10-30 13:38:14 +08:00
yjy415
e0eabaa426 Krea realtime video (#1011)
* krea-realtime-video

* Add Krea real-time video inference and training support

* Delete .gitignore

* update README

* update README

---------

Co-authored-by: Artiprocher <wangye87v5@hotmail.com>
Co-authored-by: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Co-authored-by: Zhongjie Duan <35051019+Artiprocher@users.noreply.github.com>
2025-10-27 19:09:28 +08:00
Zhongjie Duan
538017177a Merge pull request #1006 from lzws/main
add wan2.2-S2V-14B training
2025-10-22 09:55:21 +08:00
lzws
30292d9411 update wan2.2-S2V training 2025-10-21 19:59:44 +08:00
lzws
b168d7aa8b update wans2v training 2025-10-21 10:39:30 +08:00
lzws
8ea45b0daa update wans2v training 2025-10-21 10:34:48 +08:00
Zhongjie Duan
0a1c172a00 Merge pull request #984 from modelscope/animate-bugfix
bugfix
2025-10-10 15:42:20 +08:00
Artiprocher
77fac2a03f bugfix 2025-10-10 15:41:39 +08:00
Zhongjie Duan
084bc2fc78 Merge pull request #969 from modelscope/bugfix953
fix bug in issue 953
2025-09-30 13:00:15 +08:00
Artiprocher
c63d474b60 fix bug in issue 953 2025-09-30 12:59:44 +08:00
Zhongjie Duan
7540568156 support wan2.2-animate-14b (#968) 2025-09-30 12:57:16 +08:00
Zhongjie Duan
c5d426c254 Merge branch 'main' into wan-animate 2025-09-30 12:56:28 +08:00
Artiprocher
a36f2f6032 support wan2.2-animate-14b 2025-09-30 12:45:56 +08:00
lzws
ed256ef8be fix wan vace bug (#960)
* fix wan vace bug
2025-09-26 13:49:27 +08:00
Zhongjie Duan
15079a6cb8 Merge pull request #944 from baolef/dev
fix: fix the undefined vace typo
2025-09-25 15:58:24 +08:00
Zhongjie Duan
c084d6377b Merge pull request #952 from modelscope/bugfix-vace
Update wan_video_new.py
2025-09-25 15:34:22 +08:00
Zhongjie Duan
e9bc42f233 Update wan_video_new.py 2025-09-25 15:34:09 +08:00
Zhongjie Duan
0d6de58af9 Merge pull request #949 from modelscope/qwen-image-edit-multi
update qwen-image-edit training script
2025-09-25 11:07:38 +08:00
Artiprocher
acbf932974 update qwen-image-edit training script 2025-09-25 11:07:01 +08:00
Baole Fang
9d64ed7042 fix: fix the undefined vace typo 2025-09-24 16:55:47 +08:00
Zhongjie Duan
0b4b337e9a Merge pull request #933 from lzws/main
update wan2.2-VACE-Fun-A14B
2025-09-24 09:56:37 +08:00
Zhongjie Duan
99908d9a1c Merge pull request #940 from mi804/eligen_poster
support eligen-poster
2025-09-23 17:49:37 +08:00
mi804
73ced7a46d support eligen-poster 2025-09-23 17:41:48 +08:00
Zhongjie Duan
32b8b9b51e Merge pull request #910 from ldiex/main
Fix gradient checkpointing in WAN VACE blocks
2025-09-23 12:23:12 +08:00
Zhongjie Duan
f6534a5b63 Merge pull request #909 from huarzone/fix_bug
fix load gif
2025-09-23 12:22:00 +08:00
Zhongjie Duan
034c9b6c60 Qwen-Image-Edit-2509 (#937)
* qwen-image-edit-2509
2025-09-22 20:37:11 +08:00
lzws
76335e0fe5 update wan2.2-VACE-Fun 2025-09-22 02:14:20 +08:00
lzws
c0b589d934 add wan2.2-VACE-Fun inference and training 2025-09-22 01:57:05 +08:00
Zhongjie Duan
833ba1e1fa update vram management strategy (#929) 2025-09-18 16:53:13 +08:00
Artiprocher
7a5974d964 update vram management strategy 2025-09-18 16:51:53 +08:00
Zhongjie Duan
b0abdaffb4 Qwen image split training Bug Fix (#926)
* bugfix
2025-09-17 20:53:46 +08:00
Zhongjie Duan
e9f29bc402 Merge pull request #921 from modelscope/qwen-image-distill-dmd2-lora
support qwen-image-distill-dmd2-lora
2025-09-16 19:43:59 +08:00
Artiprocher
1a7f482fbd support qwen-image-distill-dmd2-lora 2025-09-16 19:43:07 +08:00
Tianlin Pan
3a0d51d100 Fix gradient checkpointing in WAN VACE blocks 2025-09-14 16:21:46 +08:00
Kared
bffdb901ed fix load gif 2025-09-13 21:01:44 +08:00
Zhongjie Duan
d93e8738cd Merge pull request #902 from xycdx/feature/improve-fastblend
add torch implementation for interpolation
2025-09-11 11:45:55 +08:00
xycdx
7e5ce5d5c9 Update diffsynth/extensions/FastBlend/patch_match.py
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-10 20:48:54 +08:00
xycdx
7aef554d83 add torch implementation for interpolation
- Implement bilinear interpolation kernel using Numba
- Benchmark shows 2x speedup compared to CPU version
- Closes #817
2025-09-10 20:39:35 +08:00
Zhongjie Duan
090074e395 Merge pull request #899 from modelscope/version_update_1.1.8
Update setup.py
2025-09-09 18:43:03 +08:00
Zhongjie Duan
2dcdeefca8 Update setup.py 2025-09-09 18:42:39 +08:00
Zhongjie Duan
452a6ca5cf Merge pull request #898 from modelscope/direct_distill
support direct distill
2025-09-09 16:16:32 +08:00
Artiprocher
d6cf20ef33 support direct distill 2025-09-09 16:12:31 +08:00
Zhongjie Duan
efdd6a59b6 Merge pull request #892 from modelscope/dev2-dzj
refine training framework
2025-09-04 15:53:52 +08:00
Artiprocher
42ec7b08eb bugfix 2025-09-04 15:45:39 +08:00
Artiprocher
d049fb6d1d bugfix 2025-09-04 15:44:37 +08:00
Artiprocher
144365b07d merge data process to training script 2025-09-04 15:18:56 +08:00
Artiprocher
cb8de6be1b move training code to base trainer 2025-09-03 12:03:49 +08:00
Zhongjie Duan
8c13362dcf Merge pull request #884 from modelscope/dev2-dzj
Unified Dataset & Splited Training
2025-09-03 09:50:23 +08:00
Zhongjie Duan
c13fd7e0ee Merge pull request #877 from mi804/wans2v_framepack
support s2v framepack
2025-09-02 16:54:37 +08:00
Artiprocher
958ebf1352 remove testing script 2025-09-02 16:44:36 +08:00
Artiprocher
b6da77e468 qwen-image splited training 2025-09-02 16:44:14 +08:00
Artiprocher
260e32217f unified dataset 2025-09-02 13:14:08 +08:00
mi804
5cee326f92 support s2v framepack 2025-09-01 16:48:46 +08:00
Zhongjie Duan
1d240994e7 Merge pull request #874 from mi804/wans2v_example
Wans2v example
2025-08-29 15:13:28 +08:00
mi804
a0bae07825 add wans2v example 2025-08-29 15:11:30 +08:00
ShunqiangBian
ff71720297 Create Wan2.2-S2V-14B.py
This commit introduces the core inference functionality for the Wan2.2-S2V-14B model.
2025-08-29 14:54:41 +08:00
Zhongjie Duan
dea85643e6 Merge pull request #872 from modelscope/dev2-dzj
remove some requirements & update Qwen-Image Quickstart
2025-08-29 14:22:35 +08:00
Artiprocher
6a46f32afe update Qwen-Image Quickstart 2025-08-29 14:09:49 +08:00
Artiprocher
4641d0f360 remove some requirements 2025-08-29 14:04:58 +08:00
Zhongjie Duan
826bab5962 Merge pull request #859 from krahets/main
Fix batch decoding for Wan-Video-VAE
2025-08-29 12:45:49 +08:00
Zhongjie Duan
5b6d112c15 Merge pull request #843 from wuutiing/main
add read gifs as video support
2025-08-29 12:36:24 +08:00
Zhongjie Duan
febdaf6067 Merge pull request #856 from lzws/main
add wan2.2-fun training scripts
2025-08-29 12:34:55 +08:00
Zhongjie Duan
0a78bb9d38 Merge pull request #864 from modelscope/wans2v
Support Wan-S2V
2025-08-28 10:21:12 +08:00
mi804
9cea10cc69 minor fix 2025-08-28 10:13:52 +08:00
mi804
caa17da5b9 wans2v readme 2025-08-27 20:05:44 +08:00
mi804
fdeb363fa2 wans2v usp 2025-08-27 19:50:33 +08:00
mi804
4147473c81 wans2v refactor 2025-08-27 16:18:22 +08:00
mi804
8a0bd7c377 wans2v lowvram 2025-08-27 13:05:53 +08:00
mi804
b541b9bed2 wans2v inference 2025-08-27 11:51:56 +08:00
Yudong Jin
419d47c195 Remove unnecessary newline in encode method 2025-08-27 02:24:29 +08:00
Yudong Jin
ac2e859960 Fix batch decoding for Wan VAE. 2025-08-27 02:24:00 +08:00
Zhongjie Duan
6663dca015 Merge pull request #857 from modelscope/Artiprocher-patch-1
bugfix
2025-08-26 17:23:32 +08:00
lzws
86e509ad31 update wan2.2-fun training scripts 2025-08-26 17:22:41 +08:00
Zhongjie Duan
8fcfa1dd2d bugfix 2025-08-26 17:22:25 +08:00
lzws
2b7a2548b4 update wan2.2-fun model overview in readme 2025-08-26 17:11:48 +08:00
lzws
f0916e6bae update wan2.2-fun training scripts 2025-08-26 16:37:47 +08:00
lzws
822e80ec2f Merge branch 'modelscope:main' into main 2025-08-26 15:08:43 +08:00
Zhongjie Duan
04e39f7de5 Merge pull request #853 from modelscope/qwen-image-fp8-lora
support qwen-image fp8 lora training
2025-08-25 20:33:36 +08:00
Artiprocher
ce0b948655 support qwen-image fp8 lora training 2025-08-25 20:32:36 +08:00
lzws
c795e35142 add wan2.2-fun-A14B inp, control and control-camera (#839)
* update wan2.2-fun

* update wan2.2-fun

* update wan2.2-fun

* add examples

* update wan2.2-fun

* update wan2.2-fun

* Rename Wan2.2-Fun-A14B-Inp.py to Wan2.2-Fun-A14B-InP.py

---------

Co-authored-by: lzw478614@alibaba-inc.com <lzw478614@alibaba-inc.com>
2025-08-22 14:20:31 +08:00
lzws
f7c01f1367 Merge branch 'modelscope:main' into main 2025-08-22 14:18:36 +08:00
lzws
cb49f0283f Rename Wan2.2-Fun-A14B-Inp.py to Wan2.2-Fun-A14B-InP.py 2025-08-22 14:18:16 +08:00
Zhongjie Duan
6a45815b23 Merge pull request #844 from mi804/blockwisecontrolnet_fix
fix blockwise controlnet training by avoid inplace
2025-08-22 11:47:21 +08:00
mi804
8dae8d7bc8 fix blockwise controlnet training by avoid inplace 2025-08-22 11:28:57 +08:00
twu
f6418004bb as num_frames limit is implemented in reader, add that 2025-08-22 03:00:35 +00:00
lzw478614@alibaba-inc.com
c4b97cd591 update wan2.2-fun 2025-08-22 09:38:19 +08:00
lzws
b6d1ff01e0 Merge branch 'modelscope:main' into main 2025-08-21 20:53:19 +08:00
lzw478614@alibaba-inc.com
0d81626fe7 update wan2.2-fun 2025-08-21 20:08:49 +08:00
twu
e3f47a799b make it more efficient to locate where to sample the frame 2025-08-21 09:13:45 +00:00
twu
e014cad820 add read gifs as video support 2025-08-21 09:01:48 +00:00
Zhongjie Duan
89bf3ce5cf Merge pull request #841 from modelscope/qwen-image-lora-hotload
support qwen-image lora hotload
2025-08-21 15:14:46 +08:00
Zhongjie Duan
3ebe118f23 Merge pull request #840 from modelscope/qwen-image-incontext
Qwen image incontext
2025-08-21 15:11:42 +08:00
Artiprocher
7f719cefe6 refine code 2025-08-21 14:25:17 +08:00
lzw478614@alibaba-inc.com
46bd05b54d add examples 2025-08-21 13:41:07 +08:00
Artiprocher
613dafbd09 rename model 2025-08-21 13:35:47 +08:00
lzw478614@alibaba-inc.com
952933eeb1 update wan2.2-fun 2025-08-21 13:34:09 +08:00
lzw478614@alibaba-inc.com
c0172e70b1 update wan2.2-fun 2025-08-21 12:59:41 +08:00
Artiprocher
6ab426e641 support qwen-image lora hotload 2025-08-21 10:12:52 +08:00
mi804
d0467a7e8d fix controlnet annotator 2025-08-20 23:28:40 +08:00
mi804
36838a05ee minor fix 2025-08-20 22:50:18 +08:00
mi804
5e6f9f89f1 support eligenv2 and context_control 2025-08-20 22:48:34 +08:00
lzw478614@alibaba-inc.com
2dad9a319c update wan2.2-fun 2025-08-20 20:17:41 +08:00
Zhongjie Duan
9ec0652339 Merge pull request #829 from mi804/qwen-image-edit-autoresize
support edit_image_auto_resize
2025-08-20 13:40:02 +08:00
mi804
7e348083ae minor fix 2025-08-20 12:42:11 +08:00
mi804
29b12b2f4e support edit_image_auto_resize 2025-08-20 12:36:26 +08:00
Zhongjie Duan
b3f57ed920 Merge pull request #826 from mi804/qwen-image-edit-lowvram
fix qwen-image-edit-lowvram
2025-08-20 11:39:56 +08:00
mi804
c9fea729d8 fix qwen-image-edit-lowvram 2025-08-20 10:31:43 +08:00
Hong Zhang
9d0683df25 Merge pull request #824 from mi804/low_res_fix
support qwen-image-edit lowres fix
2025-08-20 10:24:11 +08:00
mi804
838b8109b1 support qwen-image-edit lowres fix 2025-08-19 20:15:36 +08:00
Zhongjie Duan
3a9621f6da Merge pull request #815 from mi804/lora_checkpoint
fix bug
2025-08-19 12:43:04 +08:00
mi804
fff2c89360 fix bug 2025-08-19 12:38:33 +08:00
Zhongjie Duan
ce61bef2b0 Merge pull request #814 from mi804/qwen-image-edit
Qwen image edit
2025-08-19 09:33:39 +08:00
mi804
123f6dbadb update lora and full train 2025-08-18 19:09:19 +08:00
Hong Zhang
f9ce261a0e Merge branch 'main' into qwen-image-edit 2025-08-18 18:56:26 +08:00
mi804
d93de98a21 fix qwen_rope 2025-08-18 17:31:18 +08:00
mi804
ad1da43476 fix validate full 2025-08-18 16:17:40 +08:00
mi804
398b1dbd7a fix inference 2025-08-18 16:10:01 +08:00
mi804
9f6922bba9 support qwen-image-edit 2025-08-18 16:07:45 +08:00
Zhongjie Duan
f11a91e610 Merge pull request #813 from modelscope/qwen-image-inpaint
Qwen image inpaint
2025-08-18 15:26:06 +08:00
Artiprocher
7ed09bb78d add inpaint mask in qwen-image 2025-08-18 15:16:38 +08:00
mi804
ac931856d5 minor fix 2025-08-16 17:24:37 +08:00
mi804
2d09318236 support qwen-image inpaint controlnet 2025-08-16 17:12:29 +08:00
Zhongjie Duan
7dc49bd036 Merge pull request #806 from mi804/wan2.2_boundary
fix training boundary for wan2.2 A14B
2025-08-15 18:43:37 +08:00
Zhongjie Duan
4d16bdf853 Merge pull request #807 from modelscope/qwen-image-blockwise-controlnet-train
support qwen-image blockwise controlnet training
2025-08-15 18:42:29 +08:00
Artiprocher
01a1f48f70 support qwen-image blockwise controlnet training 2025-08-15 18:41:01 +08:00
mi804
6a9d875d65 fix training boundary for wan2.2 A14B 2025-08-15 17:54:52 +08:00
Zhongjie Duan
f1c96d31b4 Merge pull request #804 from mi804/qwen-image-dataset
qwen-image-dataset
2025-08-15 14:39:44 +08:00
mi804
aafcca8d77 add announcements 2025-08-15 14:38:03 +08:00
mi804
bf369cad4d qwen-image-dataset 2025-08-15 14:28:55 +08:00
Zhongjie Duan
024fdad76d Merge pull request #801 from modelscope/qwen-image-lowvram
add low vram examples
2025-08-15 11:34:24 +08:00
Artiprocher
e1c2eda5f5 add low vram examples 2025-08-15 11:31:57 +08:00
Zhongjie Duan
0b574cc0c2 Merge pull request #794 from mi804/training_optimize
lora_checkpoint & weight_decay
2025-08-14 14:20:03 +08:00
mi804
3212c83398 minor fix 2025-08-14 13:59:04 +08:00
mi804
49f9a11eb3 lora_checkpoint & weight_decay & qwen_image_controlnet_train 2025-08-14 13:50:04 +08:00
Zhongjie Duan
fa36739f01 Merge pull request #791 from mi804/qwen-image-longprompt
fix long prompt for qwen-image
2025-08-14 09:59:42 +08:00
Zhongjie Duan
42e9764b60 Merge pull request #790 from mi804/qwen-image-blockwise-controlnet
support qwen-image blockwise-controlnet depth
2025-08-13 20:35:10 +08:00
mi804
f7f5c07570 fix long prompt for qwen-image 2025-08-13 17:23:00 +08:00
mi804
ec1a936624 update date 2025-08-13 13:38:19 +08:00
mi804
6e6136586c support controlnet depth 2025-08-13 13:36:26 +08:00
Zhongjie Duan
34766863f8 Merge pull request #787 from modelscope/qwen-image-controlnet-update-1
support qwen-image controlnet
2025-08-12 20:37:05 +08:00
Artiprocher
1d76d5e828 support qwen-image controlnet 2025-08-12 17:17:08 +08:00
Zhongjie Duan
250540a398 Merge pull request #780 from modelscope/qwen-image-distill-lora
Qwen image distill lora
2025-08-11 15:05:19 +08:00
Artiprocher
46f3c38c37 Qwen-Image-Distill-LoRA 2025-08-11 15:04:21 +08:00
Artiprocher
9a8982efb1 Qwen-Image-Distill-LoRA 2025-08-11 15:01:21 +08:00
Zhongjie Duan
3c815cce4b Merge pull request #779 from modelscope/qwen-image-forward-fix
qwen-image dit original forward fix
2025-08-11 14:42:02 +08:00
Artiprocher
39d199c8bb qwen-image dit original forward fix 2025-08-11 14:41:32 +08:00
Zhongjie Duan
f5506d1e13 Merge pull request #769 from modelscope/qwen-image-lora-format
remove lora format alignment
2025-08-08 19:06:03 +08:00
Artiprocher
166a8734fe remove lora format alignment 2025-08-08 19:05:06 +08:00
Zhongjie Duan
b2273ec568 Merge pull request #768 from modelscope/lora-fix
lora-fix
2025-08-08 18:55:57 +08:00
Artiprocher
89c4e3bdb6 lora-fix 2025-08-08 18:55:13 +08:00
Zhongjie Duan
051ebf3439 fix wan2.2 5B usp (#763) 2025-08-08 16:26:04 +08:00
mi804
7cfadc2ca8 fix wan2.2 5B usp 2025-08-07 23:06:52 +08:00
Zhongjie Duan
32cf5d32ce Qwen-Image FP8 (#761)
* support qwen-image-fp8

* refine README

* bugfix

* bugfix
2025-08-07 16:56:02 +08:00
Zhongjie Duan
4f7c3b6a1e Merge pull request #755 from mi804/qwen-image-eligen
Qwen-Image-EliGen
2025-08-07 14:04:44 +08:00
mi804
57128dc89f update readme for qwen-image-eligen 2025-08-07 13:42:47 +08:00
Zhongjie Duan
d20680baae Merge pull request #756 from mi804/flux-eligen
fix flux-eligen bug
2025-08-06 20:09:00 +08:00
mi804
970403f78e fix flux-eligen bug 2025-08-06 20:07:21 +08:00
mi804
bee2a969e5 minor fix readme and path 2025-08-06 17:48:44 +08:00
mi804
2803ffcb38 minor fix 2025-08-06 17:39:00 +08:00
mi804
d3224e1fdc update qwen-image-eligen readme 2025-08-06 17:36:28 +08:00
mi804
3c2f85606f update model 2025-08-06 17:23:05 +08:00
mi804
1f25ad416b Merge branch 'main' into qwen-image-eligen 2025-08-06 15:57:13 +08:00
Zhongjie Duan
d0b9b25db7 Merge pull request #749 from mi804/training_args
support num_workers,save_steps,find_unused_parameters
2025-08-06 15:54:04 +08:00
mi804
ef09db69cd refactor model_logger 2025-08-06 15:47:35 +08:00
Zhongjie Duan
84ede171fd Merge pull request #752 from modelscope/qwen-image-lora-fromat
remove default in qwen-image lora
2025-08-06 15:42:03 +08:00
Artiprocher
6f4e38276e remove default in qwen-image lora 2025-08-06 15:41:22 +08:00
mi804
a3b67436a6 eligen ui 2025-08-06 15:04:38 +08:00
Zhongjie Duan
829ca3414b fmt fixes in wan_video_dit.py
fmt fixes in wan_video_dit.py
2025-08-06 14:39:25 +08:00
mi804
3915bc3ee6 minor fix 2025-08-06 10:58:53 +08:00
mi804
4299c999b5 restore readme 2025-08-06 10:56:46 +08:00
mi804
6bae70eee0 support num_workers,save_steps,find_unused_parameters 2025-08-06 10:52:59 +08:00
mi804
6452edb738 qwen_image eligen 2025-08-05 20:41:03 +08:00
Zhongjie Duan
bc739c78cd Merge pull request #746 from modelscope/qwen-image-distill
Qwen image distill
2025-08-05 19:21:37 +08:00
Artiprocher
2feaeb1a64 update readme 2025-08-05 19:20:37 +08:00
Artiprocher
09360cf4f5 qwen-image-distill 2025-08-05 19:18:43 +08:00
Yudong Jin
26461c1963 Update diffsynth/models/wan_video_dit.py
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-04 23:52:48 +08:00
Yudong Jin
0412fc7232 fmt fixes in wan_video_dit.py 2025-08-04 23:40:18 +08:00
Zhongjie Duan
8d2f6ad32e Merge pull request #735 from modelscope/qwen-image
qwen-image
2025-08-04 20:40:32 +08:00
Artiprocher
1625894694 bugfix 2025-08-04 20:35:44 +08:00
Artiprocher
c35f2d8bda qwen-image 2025-08-04 20:24:13 +08:00
Zhongjie Duan
a8ee7ec9ef Merge pull request #725 from mi804/imagedataset_jsonl
support jsonl dataset
2025-08-04 14:39:01 +08:00
Zhongjie Duan
46d390cf8a Merge pull request #727 from mi804/flux.1_kera_dev
support flux.1-kera-dev
2025-08-01 17:26:32 +08:00
mi804
6b8e3880ff fix lowvram inference 2025-08-01 17:25:50 +08:00
mi804
c1c3be2420 fix readmezh 2025-08-01 17:21:48 +08:00
mi804
b2554db100 fix krea typo 2025-08-01 17:13:45 +08:00
mi804
b63f81c6e3 support flux.1-kera-dev 2025-08-01 11:26:39 +08:00
mi804
cb2caa3a36 support jsonl 2025-07-31 16:24:58 +08:00
Zhongjie Duan
f0ea049faa Merge pull request #720 from mi804/wanvideo_seq_usp
Wanvideo seq usp
2025-07-31 10:04:57 +08:00
mi804
0954e8a017 fix vace usp 2025-07-30 19:40:08 +08:00
mi804
e4178e2501 fix usp dit_forward 2025-07-30 19:21:21 +08:00
mi804
0b860abf1b support arbitrary seq len 2025-07-30 19:07:16 +08:00
mi804
8c558b3526 fix modelconfig 2025-07-30 18:44:17 +08:00
mi804
aef982a53c Merge branch 'main' into wanvideo_seq_usp 2025-07-30 16:44:44 +08:00
Zhongjie Duan
db124fa6bc Merge pull request #715 from modelscope/nexusgen-eligen
NexusGen and EliGen
2025-07-29 20:28:07 +08:00
Artiprocher
2ed3860085 refine code 2025-07-29 20:10:08 +08:00
Artiprocher
87ab7d020b refine code 2025-07-29 20:02:34 +08:00
Artiprocher
03c8fd5e61 refine code 2025-07-29 18:49:18 +08:00
Artiprocher
9c51623fc2 refine code 2025-07-29 18:47:16 +08:00
Zhongjie Duan
8ec545d70c Merge pull request #713 from modelscope/bugfix3
update README
2025-07-29 17:06:28 +08:00
Artiprocher
79fa8607dc update README 2025-07-29 17:05:41 +08:00
mi804
7df48fc2b5 remove debug out 2025-07-29 13:33:14 +08:00
mi804
8ef91b3672 support training for eligen and nexusgen 2025-07-29 13:28:42 +08:00
Zhongjie Duan
2860470b4e Merge pull request #709 from modelscope/bugfix2
Bugfix2
2025-07-29 11:17:18 +08:00
Artiprocher
c125728ce0 bug fix 2025-07-29 11:16:50 +08:00
Zhongjie Duan
63eaa9e7ea Merge pull request #708 from modelscope/bug-fix
bug fix
2025-07-29 10:17:33 +08:00
Artiprocher
158567ca20 bug fix 2025-07-29 10:16:40 +08:00
Zhongjie Duan
de4e2703ca Merge pull request #706 from modelscope/wan2.2-patch
Wan2.2
2025-07-28 19:52:30 +08:00
Artiprocher
9e683bfe25 fix typo 2025-07-28 18:30:04 +08:00
Artiprocher
0befa05014 Merge branch 'wan2.2-patch' of https://github.com/modelscope/DiffSynth-Studio into wan2.2-patch 2025-07-28 18:27:20 +08:00
Artiprocher
283f35447a refine readme 2025-07-28 18:25:43 +08:00
Zhongjie Duan
c35414a652 Merge pull request #705 from modelscope/wan2.2
fix wan2.2 vae
2025-07-28 17:04:40 +08:00
Artiprocher
68aafab09e update readme 2025-07-28 17:02:30 +08:00
mi804
29663b25a6 fix wan2.2 vae 2025-07-28 16:49:28 +08:00
mi804
2861ec4d9f tmp commit for nexus-gen edit 2025-07-28 16:18:38 +08:00
Artiprocher
729c512c66 bugfix 2025-07-28 15:18:47 +08:00
Zhongjie Duan
2af3a6f6a2 Merge pull request #704 from modelscope/wan2.2
Wan2.2
2025-07-28 15:06:01 +08:00
mi804
05dba91f79 fix wan2.2 5B 2025-07-28 13:38:01 +08:00
mi804
b8f05bb342 tmp commit 2025-07-28 11:09:33 +08:00
Artiprocher
5f68727ad3 refine code 2025-07-28 11:00:54 +08:00
mi804
bba44173d2 minor fix 2025-07-25 17:24:42 +08:00
mi804
9015d08927 support wan2.2 A14B I2V&T2V 2025-07-25 17:09:53 +08:00
Zhongjie Duan
1dfa32f0ae Merge pull request #702 from modelscope/lora-rearrange
Lora rearrange
2025-07-24 19:12:09 +08:00
Artiprocher
c98e31fee3 update README 2025-07-24 19:10:06 +08:00
Artiprocher
f3d2470e84 update README 2025-07-24 19:02:08 +08:00
Artiprocher
4ad6bd4e23 rearrange lora loading modules 2025-07-24 18:56:25 +08:00
mi804
3aed244c6f update variable 2025-07-23 11:20:06 +08:00
Zhongjie Duan
783c435d88 Merge pull request #701 from modelscope/readme-refine
update readme
2025-07-23 11:14:25 +08:00
Artiprocher
cd1ba7281b update readme 2025-07-23 11:13:38 +08:00
Zhongjie Duan
970ff12ff5 Merge pull request #700 from modelscope/readme-refine
Readme refine
2025-07-22 20:48:47 +08:00
Artiprocher
2827b60330 update readme 2025-07-22 20:48:19 +08:00
Artiprocher
b3df7e5e21 update readme 2025-07-22 20:43:58 +08:00
Artiprocher
c18b5a0c71 update readme 2025-07-22 20:31:44 +08:00
Artiprocher
b9f7d08219 update readme 2025-07-22 20:30:34 +08:00
Artiprocher
11ea986e67 update readme 2025-07-22 20:28:29 +08:00
Artiprocher
b06066f25b update readme 2025-07-22 20:26:41 +08:00
Artiprocher
0b3400bca3 update readme 2025-07-22 20:22:48 +08:00
Artiprocher
0d509241c0 update readme 2025-07-22 20:20:56 +08:00
Artiprocher
ebeda32215 update readme 2025-07-22 20:02:21 +08:00
Artiprocher
ff95c56884 refine readme 2025-07-22 13:22:47 +08:00
Zhongjie Duan
2871535f3b Merge pull request #699 from modelscope/AttrCtrl
Support AttriCtrl
2025-07-21 19:18:18 +08:00
Artiprocher
e3c5d2540b support value controller training 2025-07-21 19:16:30 +08:00
Artiprocher
22705a44b4 update value controller 2025-07-21 16:30:06 +08:00
Zhongjie Duan
43a8d9768c Merge pull request #697 from mi804/nexus-genv2
add nexus-gen news
2025-07-21 15:09:05 +08:00
mi804
dbee3a1ae0 add nexus-gen news 2025-07-21 15:07:13 +08:00
mi804
f1f00c4255 support wan2.2 5B I2V 2025-07-21 14:47:58 +08:00
ziyannchen
c05b1a2fd0 fix a bug in sliding window inference 2025-07-20 11:13:20 +00:00
mi804
55951590f5 support wan2.2 5B T2V 2025-07-20 18:13:50 +08:00
Zhongjie Duan
1384de0353 Support LoRA encoder (#695)
* lora_encoder
2025-07-19 20:44:03 +08:00
ziyannchen
05c6b49b90 fix a bug in sliding_window inference 2025-07-16 10:30:33 +00:00
Zhongjie Duan
d19fcc8c04 Merge pull request #688 from modelscope/flux_vram_management
flux series vram management
2025-07-15 20:12:08 +08:00
Artiprocher
af6b1d4246 flux series vram management 2025-07-15 20:11:02 +08:00
Zhongjie Duan
cbd10fb27d Merge pull request #684 from modelscope/value_controller
support flux value controller
2025-07-15 10:11:08 +08:00
Zhongjie Duan
836fa5c957 Merge pull request #685 from lzws/main
update flux lora convert state dict
2025-07-14 14:58:07 +08:00
lzw478614@alibaba-inc.com
dc066aca2d update flux lora convert state dict 2025-07-14 14:08:22 +08:00
Zhongjie Duan
44f6ffbf56 Merge pull request #673 from lzws/main
support other lora format
2025-07-14 13:51:47 +08:00
Artiprocher
0a24d0819f support flux value controller 2025-07-14 13:37:55 +08:00
lzw478614@alibaba-inc.com
f0106cd48c support other lora format 2025-07-09 14:01:49 +08:00
lzw478614@alibaba-inc.com
dee4075380 support other lora format 2025-07-09 13:59:43 +08:00
Zhongjie Duan
a692389df0 Merge pull request #670 from modelscope/flux-any-training
support flux any training
2025-07-08 21:45:02 +08:00
Artiprocher
629e9be4ce support flux any training 2025-07-08 19:55:27 +08:00
Yingda Chen
3a3d9010b8 Update README.md 2025-07-08 17:24:39 +08:00
Yingda Chen
a25334b352 Add files via upload 2025-07-08 17:15:21 +08:00
handoku
00279a8375 fea : enable wan video usp for arbitrary seq len 2025-07-08 16:43:43 +08:00
Zhongjie Duan
89397c755a Merge pull request #667 from modelscope/lora_merge
Lora merge
2025-07-07 13:30:34 +08:00
lzws
77676b5cea Update FLUX.1-dev-LoRAFusion.py 2025-07-07 10:54:49 +08:00
Zhongjie Duan
0f4b08daa3 Merge pull request #661 from longredzhong/main
fix wan vace load mask video
2025-07-04 11:14:38 +08:00
longredzhong
63b2c51e11 fix wan vace load mask video 2025-07-04 10:22:34 +08:00
Artiprocher
8a9dbbd3ba support lora fusion 2025-07-03 18:49:46 +08:00
Zhongjie Duan
22d28665fe Merge pull request #657 from modelscope/dev-dzj
support json dataset
2025-07-02 20:08:13 +08:00
Artiprocher
1363a0559f support json dataset 2025-07-02 20:07:16 +08:00
lzw478614@alibaba-inc.com
9cb887015b lora hotload and merge 2025-07-02 13:32:24 +08:00
Zhongjie Duan
789dade026 Merge pull request #655 from modelscope/dev-dzj
refine wan readme
2025-07-02 11:37:18 +08:00
Artiprocher
9bb51fe879 refine wan readme 2025-07-02 11:36:41 +08:00
Zhongjie Duan
d9c812818d Merge pull request #653 from mi804/main
fix step1xedit
2025-07-01 17:16:41 +08:00
mi804
c8e9a96196 fix step1xedit 2025-07-01 17:12:53 +08:00
Zhongjie Duan
6143af4654 Merge pull request #651 from mi804/infiniteyou_controlnet_replace
infiniteyou_controlnet outof pipeline
2025-07-01 13:39:47 +08:00
Zhongjie Duan
9458e382b0 Merge pull request #652 from modelscope/flux-refactor
refine readme
2025-07-01 11:34:00 +08:00
Artiprocher
4f2d9226cf refine readme 2025-07-01 11:33:04 +08:00
mi804
f688a469b1 infiniteyou_controlnet outof pipeline 2025-07-01 11:10:46 +08:00
Zhongjie Duan
c8ea3b3356 Merge pull request #649 from modelscope/flux-refactor
refine readme
2025-06-30 11:46:16 +08:00
Artiprocher
6e9472b470 refine readme 2025-06-30 11:45:40 +08:00
Zhongjie Duan
a5c03c5272 Merge pull request #648 from modelscope/flux-refactor
refine readme
2025-06-30 11:44:47 +08:00
Artiprocher
8068ac2592 refine readme 2025-06-30 11:43:59 +08:00
Zhongjie Duan
5f80e7ac5e Merge pull request #647 from modelscope/flux-refactor
kontext training
2025-06-30 11:09:22 +08:00
Artiprocher
157e0be49d kontext training 2025-06-30 11:00:10 +08:00
Zhongjie Duan
3dbe271aab Merge pull request #646 from modelscope/flux-refactor
Flux refactor
2025-06-29 18:04:05 +08:00
Artiprocher
44e2eecdf1 flux-kontext 2025-06-29 15:59:04 +08:00
Artiprocher
8c226e83a6 flux-kontext 2025-06-29 15:51:45 +08:00
Artiprocher
009f26bb40 kontext 2025-06-27 18:38:40 +08:00
Artiprocher
fcf2fbc07f flux-refactor 2025-06-27 10:20:11 +08:00
Artiprocher
b603acd36a refine examples 2025-06-25 13:38:21 +08:00
Artiprocher
6c8bb6438b infiniteyou 2025-06-25 10:33:11 +08:00
Artiprocher
8072d3839d refine examples 2025-06-24 19:17:54 +08:00
Artiprocher
c8ad643374 refine examples 2025-06-24 19:17:43 +08:00
Zhongjie Duan
31f9df5e62 Merge pull request #567 from emmanuel-ferdman/main
Migrate to modern Python Logger API
2025-06-24 15:32:14 +08:00
Zhongjie Duan
e2f415524a Merge pull request #587 from ernestchu/patch-1
Fix typo
2025-06-24 15:23:19 +08:00
Zhongjie Duan
3eb7e7530e Merge pull request #632 from lzws/flux-refactor
step1x, teacache, flex refactor
2025-06-24 15:19:54 +08:00
Zhongjie Duan
916aa54595 Merge branch 'flux-refactor' into flux-refactor 2025-06-24 15:19:42 +08:00
Zhongjie Duan
6ddbd43f7b Merge pull request #634 from modelscope/bugfix
fix videodataset to load images
2025-06-24 11:42:14 +08:00
Artiprocher
a37a83ecc3 fix videodataset to load images 2025-06-24 11:38:43 +08:00
Zhongjie Duan
f2a0d0c85f Merge pull request #633 from modelscope/bugfix
fix i2v resolution
2025-06-24 10:59:31 +08:00
Artiprocher
93194f44e8 fix i2v resolution 2025-06-24 10:56:52 +08:00
Artiprocher
c4e5033532 flux controlnet 2025-06-23 21:01:53 +08:00
lzw478614@alibaba-inc.com
cc6cd26733 step1x, teacache, flex refactor 2025-06-23 17:06:00 +08:00
Zhongjie Duan
1113d305d1 Merge pull request #626 from mi804/flux-refactor
Flux refactor
2025-06-23 10:20:40 +08:00
mi804
6d5f8b7423 flux_eligen_refactor 2025-06-20 16:53:41 +08:00
mi804
1b3c204d20 flux_ipadapter_refactor 2025-06-20 14:49:09 +08:00
Artiprocher
1788d50f0a flux-refactor 2025-06-19 15:04:30 +08:00
Artiprocher
e7a21dbf0b flux-refactor 2025-06-19 14:53:11 +08:00
Zhongjie Duan
3b3e1e4d44 Merge pull request #623 from modelscope/usp
Usp
2025-06-19 10:15:39 +08:00
Artiprocher
24426e3a32 update README_zh 2025-06-19 10:06:55 +08:00
Artiprocher
31369bab15 update import 2025-06-19 10:04:24 +08:00
mi804
551721658b fix bug for usp with refimage 2025-06-16 19:38:45 +08:00
mi804
46f052375f fix vace usp 2025-06-16 18:54:29 +08:00
Zhongjie Duan
c2d35a2157 update wan training (#614)
update wan training
2025-06-16 15:48:35 +08:00
mi804
4c052e42bc fix usp download 2025-06-16 15:43:39 +08:00
Zhongjie Duan
a88613555d Merge pull request #612 from Yunnglin/update/eval_news
update readme for eval
2025-06-16 14:06:52 +08:00
Zhongjie Duan
c164519ef1 vram management support torch<2.6.0 (#613)
support torch<2.6.0
2025-06-16 13:08:29 +08:00
Yunnglin
afff5ffb21 update readme 2025-06-16 11:08:53 +08:00
Yunnglin
a8481fd5e1 update readme 2025-06-16 11:00:53 +08:00
Zhongjie Duan
8584e50309 Merge pull request #611 from modelscope/refactor
fix model id
2025-06-16 10:58:14 +08:00
Artiprocher
9f3e02f167 fix model id 2025-06-16 10:57:33 +08:00
Zhongjie Duan
7ad9b9aecc Merge pull request #609 from modelscope/refactor
refine readme
2025-06-13 14:14:14 +08:00
Artiprocher
b6a111d3a2 refine readme 2025-06-13 14:13:38 +08:00
Zhongjie Duan
bd6f2695a9 Merge pull request #608 from modelscope/refactor
Refactor
2025-06-13 14:02:49 +08:00
Artiprocher
6eecc9d442 refine readme 2025-06-13 14:02:20 +08:00
Artiprocher
35269783d7 refine readme 2025-06-13 14:00:58 +08:00
Zhongjie Duan
9534a78167 Merge pull request #607 from modelscope/refactor
wan-refactor
2025-06-13 13:49:00 +08:00
Artiprocher
830b1b7202 wan-refactor 2025-06-13 13:46:17 +08:00
Zhongjie Duan
436a91e0c9 Merge pull request #602 from modelscope/revert-601-wan-refactor
Revert "Wan refactor"
2025-06-11 17:30:06 +08:00
Zhongjie Duan
40760ab88b Revert "Wan refactor" 2025-06-11 17:29:27 +08:00
CD22104
8badd63a2d Merge pull request #601 from CD22104/wan-refactor
Wan refactor
2025-06-11 17:26:58 +08:00
CD22104
b1afff1728 camera 2025-06-11 17:24:09 +08:00
Artiprocher
6e977e1181 refine wan doc 2025-06-06 15:19:09 +08:00
Artiprocher
62f6ca2b8a new wan trainer 2025-06-06 14:58:41 +08:00
Ernie Chu
4e00c109e3 Fix typo
Change
Only `num_frames % 4 != 1` is acceptable
to
Only `num_frames % 4 == 1` is acceptable
2025-05-27 21:20:38 -04:00
Artiprocher
8f10a9c353 training script 2025-05-19 19:02:52 +08:00
Emmanuel Ferdman
a3a35acc7e Migrate to modern Python Logger API
Signed-off-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>
2025-05-12 14:09:26 -07:00
Artiprocher
675eefa07e training framework 2025-05-12 17:48:28 +08:00
Artiprocher
dbef6122e9 ... 2025-05-05 23:23:06 +08:00
Artiprocher
d150bcf622 ... 2025-05-05 13:01:45 +08:00
Artiprocher
451aab0116 refactor 2025-05-04 15:42:11 +08:00
Artiprocher
3edf3583b1 wan-fun-v1.1 reference control 2025-04-30 11:38:17 +08:00
Zhongjie Duan
ef2a7abad4 Step1x vram (#556)
* support step1x vram management
2025-04-28 10:13:20 +08:00
Zhongjie Duan
32f630ff5f Merge pull request #555 from modelscope/step1x
support step1x
2025-04-27 20:40:43 +08:00
Artiprocher
109a0a0d49 support step1x 2025-04-27 20:37:43 +08:00
Zhongjie Duan
4f01b37a2a Merge pull request #553 from modelscope/flex
Flex
2025-04-25 12:24:18 +08:00
Artiprocher
cc6306136c flex full support 2025-04-25 12:23:29 +08:00
Artiprocher
419ace37f3 flex full support 2025-04-25 11:32:13 +08:00
Artiprocher
ccf24c363f flex control 2025-04-24 19:18:54 +08:00
Artiprocher
b7a1ac6671 flex t2i 2025-04-24 14:51:40 +08:00
Zhongjie Duan
e54c0a8468 Merge pull request #548 from CD22104/main
liblib-controlnet
2025-04-22 14:54:16 +08:00
xuyixuan.xyx
5f4cb32255 liblib-controlnet 2025-04-22 13:45:49 +08:00
Zhongjie Duan
7b6cf39618 Merge pull request #544 from modelscope/Artiprocher-patch-1
Update train_wan_t2v.py
2025-04-17 15:39:44 +08:00
Zhongjie Duan
bf81de0c88 Update train_wan_t2v.py 2025-04-17 15:37:30 +08:00
Zhongjie Duan
b36cad6929 Merge pull request #543 from modelscope/wan-flf2v
bugfix
2025-04-17 15:24:36 +08:00
Zhongjie Duan
b161bd6dfd bugfix 2025-04-17 15:23:46 +08:00
Zhongjie Duan
538cfcbb77 Merge pull request #541 from modelscope/wan-flf2v
Wan flf2v
2025-04-17 14:51:08 +08:00
Artiprocher
a4105d2c0e support wan-flf2v 2025-04-17 14:48:55 +08:00
Artiprocher
553b341f5f support wan-flf2v 2025-04-17 14:47:55 +08:00
Zhongjie Duan
e9e24b8cf1 Merge pull request #537 from CD22104/main
issue523
2025-04-16 15:53:39 +08:00
CD22104
1b693d0028 issue523 2025-04-16 15:49:52 +08:00
Zhongjie Duan
a4c3c07229 Merge pull request #536 from modelscope/wan-vace-quant
support vace quant
2025-04-16 10:43:14 +08:00
Artiprocher
6b24748c80 support vace quant 2025-04-16 10:29:21 +08:00
Zhongjie Duan
8f2f8646eb Merge pull request #526 from mohui37/main
Update train_wan_t2v.py
2025-04-16 09:55:19 +08:00
Zhongjie Duan
e3ac438f5a Merge pull request #533 from modelscope/wan-vace
vace
2025-04-15 18:47:36 +08:00
Artiprocher
b731628112 vace 2025-04-15 17:52:25 +08:00
mohui37
0dc56d9dcc Update train_wan_t2v.py
Fix a bug when processing data with the itv pipeline
2025-04-11 17:05:40 +08:00
Zhongjie Duan
b925b402e2 Merge pull request #522 from modelscope/Artiprocher-patch-1
Update README.md
2025-04-10 11:42:32 +08:00
Zhongjie Duan
61d9653536 Update README.md 2025-04-10 11:42:18 +08:00
Zhongjie Duan
53f01e72e6 Update setup.py 2025-04-09 15:38:17 +08:00
Zhongjie Duan
55e5e373dd Update publish.yaml 2025-04-09 15:37:46 +08:00
Zhongjie Duan
4a0921ada1 Update requirements.txt 2025-04-09 15:37:16 +08:00
Zhongjie Duan
5129d3dc52 Update setup.py 2025-04-09 15:34:02 +08:00
Zhongjie Duan
ee9bab80f2 Update requirements.txt 2025-04-09 15:33:21 +08:00
Zhongjie Duan
cd8884c9ef Update setup.py 2025-04-09 15:27:36 +08:00
Zhongjie Duan
46744362de Update requirements.txt 2025-04-09 15:26:13 +08:00
Zhongjie Duan
0f0cdc3afc Update setup.py 2025-04-09 15:15:18 +08:00
Zhongjie Duan
a33c63af87 Merge pull request #518 from modelscope/wan-fun
Wan fun
2025-04-08 19:25:12 +08:00
Artiprocher
3cc9764bc9 support more wan models 2025-04-08 19:22:53 +08:00
Artiprocher
f6c6e3c640 support more wan models 2025-04-08 17:19:54 +08:00
Artiprocher
60a9db706e support more wan models 2025-04-08 17:07:10 +08:00
lzw478614@alibaba-inc.com
a98700feb2 support wan-fun-inp generating 2025-04-06 22:55:42 +08:00
lzw478614@alibaba-inc.com
5418ca781e support load wan2.1-fun-inp-1.3B and 14B model 2025-04-03 16:37:59 +08:00
Zhongjie Duan
71eee780fb Merge pull request #511 from modelscope/version-update
Update setup.py
2025-04-02 16:35:01 +08:00
Zhongjie Duan
4864453e0a Update setup.py 2025-04-02 16:34:50 +08:00
Zhongjie Duan
c5a32f76c2 Merge pull request #509 from modelscope/wan-lora-converter
Update lora.py
2025-04-02 13:08:48 +08:00
Zhongjie Duan
c4ed3d3e4b Update lora.py 2025-04-02 13:08:16 +08:00
Zhongjie Duan
803ddcccc7 Merge pull request #505 from modelscope/infinityou
Infinityou
2025-03-31 20:21:10 +08:00
Artiprocher
4cd51fecf2 refine infinityou 2025-03-31 20:19:32 +08:00
Zhongjie Duan
3b0211a547 Merge pull request #499 from calmhawk/hotfix/tc_bug_with_usp
Fix TeaCache bug and optimize memory usage of WAN with USP feature
2025-03-31 16:24:03 +08:00
mi804
e88328d152 support infiniteyou 2025-03-31 14:29:15 +08:00
calmhawk
52896fa8dd Fix TeaCache bug with usp support integration and optimize memory usage by clearing attn cache 2025-03-30 01:13:34 +08:00
Zhongjie Duan
c7035ad911 Merge pull request #493 from modelscope/lzws-patch-1
Update wan_video.py
2025-03-26 19:48:33 +08:00
lzws
070811e517 Update wan_video.py
prompter.encode_prompt uses pipe's device
2025-03-26 17:51:13 +08:00
Zhongjie Duan
7e010d88a5 Merge pull request #485 from modelscope/usp
support Unified Sequence Parallel
2025-03-25 19:28:42 +08:00
Artiprocher
4e43d4d461 fix usp dependency 2025-03-25 19:26:24 +08:00
Zhongjie Duan
d7efe7e539 Merge pull request #482 from modelscope/Artiprocher-patch-1
Update README.md
2025-03-25 16:44:48 +08:00
Zhongjie Duan
633f789c47 Update README.md 2025-03-25 16:44:05 +08:00
Zhongjie Duan
88607f404e Merge pull request #480 from mi804/wanx_tensor_parallel
update tensor parallel
2025-03-25 15:33:15 +08:00
mi804
6d405b669c update tensor parallel 2025-03-25 12:38:17 +08:00
ByteDance
d0fed6ba72 add usp for wanx 2025-03-25 11:51:37 +08:00
ByteDance
64eaa0d76a Merge branch 'usp' into xdit 2025-03-25 11:45:49 +08:00
Zhongjie Duan
3dc28f428f Merge pull request #465 from CD22104/main
cd0319-ImportError-libX11.so.6
2025-03-19 14:14:01 +08:00
xuyixuan.xyx
3c8a3fe2e1 cd0319 2025-03-19 14:00:42 +08:00
Zhongjie Duan
e28c246bcc Merge pull request #457 from modelscope/wan-tp
support wan tensor parallel (preview)
2025-03-17 19:53:17 +08:00
Artiprocher
04d03500ff support wan tensor parallel (preview) 2025-03-17 19:39:45 +08:00
Jinzhe Pan
54081bdcbb Merge pull request #1 from Eigensystem/fjr
fix some bugs
2025-03-17 17:07:07 +08:00
feifeibear
d8b250607a polish code 2025-03-17 09:04:51 +00:00
feifeibear
1e58e6ef82 fix some bugs 2025-03-17 09:00:52 +00:00
Jinzhe Pan
42cb7d96bb feat: sp for wan 2025-03-17 08:31:45 +00:00
Zhongjie Duan
39890f023f Merge pull request #448 from modelscope/wan-teacache
support teacache in wan
2025-03-14 18:21:20 +08:00
Artiprocher
e425753f79 support teacache in wan 2025-03-14 17:45:52 +08:00
Zhongjie Duan
ca40074d72 Merge pull request #447 from modelscope/lora
Lora
2025-03-14 15:34:22 +08:00
Artiprocher
1fd3d67379 improve lora loading efficiency 2025-03-14 15:15:37 +08:00
Artiprocher
3acd9c73be improve lora loading efficiency 2025-03-14 15:05:54 +08:00
Zhongjie Duan
32422b49ee Merge pull request #436 from mi804/hunyuanvideo_i2v
support hunyuanvideo-i2v
2025-03-13 19:38:11 +08:00
Furkan Gözükara
5c4d3185fb Merge branch 'modelscope:main' into main 2025-03-13 14:22:34 +03:00
Zhongjie Duan
762bcbee58 Merge pull request #444 from modelscope/wan-itv-train
Wan itv train
2025-03-13 15:40:51 +08:00
Zhongjie Duan
6b411ada16 Merge branch 'main' into wan-itv-train 2025-03-13 15:24:59 +08:00
Artiprocher
a25bd74d8b support wan i2v training 2025-03-13 15:14:10 +08:00
Furkan Gözükara
fb5fc09bad Made much much faster than before
enable debug to see every message
2025-03-13 02:30:42 +03:00
Furkan Gözükara
3fdba19e02 Fixes high RAM usage Wan 2.1
2025-03-12 15:49:57 +03:00
mi804
4bec2983a9 support hunyuanvideo_i2v 2025-03-11 16:20:09 +08:00
Zhongjie Duan
03ea27893f Merge pull request #431 from modelscope/wan-update
Wan update
2025-03-10 18:26:32 +08:00
Artiprocher
718b45f2af bugfix 2025-03-10 18:25:23 +08:00
Zhongjie Duan
63a79eeb2a Merge pull request #426 from Zeyi-Lin/main
Modify the swanlab `logdir` location
2025-03-10 17:59:17 +08:00
Artiprocher
e757013a14 vram optimization 2025-03-10 17:47:14 +08:00
Artiprocher
a05f647633 vram optimization 2025-03-10 17:11:11 +08:00
ZeYi Lin
7604be0301 output_path join swanlog 2025-03-08 13:57:08 +08:00
mi804
945b43492e load hunyuani2v model 2025-03-07 17:43:30 +08:00
Artiprocher
b548d7caf2 refactor wan dit 2025-03-07 16:35:26 +08:00
Zhongjie Duan
6e316fd825 Merge pull request #421 from modelscope/wan-update
support diffusers format wan and other lora
2025-03-06 17:41:36 +08:00
Artiprocher
84fb61aaaf support diffusers format wan and other lora 2025-03-06 17:40:21 +08:00
Zhongjie Duan
50a9946b57 Merge pull request #419 from modelscope/wan-update
wan image encoder to fp16
2025-03-06 16:28:55 +08:00
Artiprocher
384d1a8198 wan image encoder to fp16 2025-03-06 16:28:23 +08:00
Zhongjie Duan
a58c193d0c Merge pull request #412 from boopage/patch-1
Update train_wan_t2v.py - include .jpeg for image detection
2025-03-06 12:46:43 +08:00
boopage
34a5ef8c15 Update train_wan_t2v.py
Included the .jpeg extension in image type detection, preventing an error when trying to read the image as a video format
2025-03-05 11:13:11 +01:00
Zhongjie Duan
41e3e4e157 Merge pull request #410 from mi804/dreambooth_lora
support dreambooth lora
2025-03-05 11:48:00 +08:00
mi804
e576d71908 support dreambooth lora 2025-03-05 11:20:10 +08:00
Zhongjie Duan
906aadbf1b Merge pull request #404 from modelscope/wan-examples-update
update wan examples
2025-03-04 21:54:33 +08:00
Artiprocher
bf0bf2d5ba update wan examples 2025-03-04 21:54:04 +08:00
Zhongjie Duan
fe0fff1399 Merge pull request #401 from modelscope/flux-diffusers
support load flux from diffusers
2025-03-04 20:52:07 +08:00
Artiprocher
50fceb84d2 support load flux from diffusers 2025-03-04 20:38:25 +08:00
Zhongjie Duan
100da41034 Merge pull request #400 from mi804/eligen
update eligen model from huggingface
2025-03-04 20:11:18 +08:00
mi804
c382237833 update eligen from huggingface 2025-03-04 20:04:24 +08:00
Zhongjie Duan
98ac191750 Merge pull request #398 from modelscope/reduce_dependency
reduce dependency
2025-03-04 16:22:29 +08:00
Artiprocher
2f73dbe7a3 reduce dependency 2025-03-04 15:21:00 +08:00
Zhongjie Duan
a66203a391 Update setup.py 2025-03-04 10:08:16 +08:00
Zhongjie Duan
fab61f614b Merge pull request #394 from modelscope/wan-train-update
fix swanlab after test
2025-03-03 19:00:48 +08:00
Artiprocher
6b67a11ad6 fix swanlab after test 2025-03-03 18:59:34 +08:00
Zhongjie Duan
91f77d268c Merge pull request #393 from modelscope/wan-train-update
support resume training
2025-03-03 18:45:17 +08:00
Artiprocher
eb4d5187d8 support resume training 2025-03-03 18:31:31 +08:00
Zhongjie Duan
ee4b02247c Merge pull request #392 from modelscope/sage_attention
Sage attention
2025-03-03 14:28:36 +08:00
Artiprocher
da8e1fe7e4 support sage attention 2025-03-03 14:19:16 +08:00
Zhongjie Duan
3db824c281 Merge pull request #390 from YunhongLu-ZJU/main
revised image quality metric
2025-03-03 13:36:34 +08:00
YunhongLu-ZJU
df2ecafd3f revised 2025-03-03 12:30:26 +08:00
Zhongjie Duan
217652d28e Merge pull request #389 from modelscope/requirements
Requirements
2025-03-03 11:25:31 +08:00
Artiprocher
f64c766dcd update install guide in README 2025-03-03 11:24:48 +08:00
Artiprocher
076fd85556 update install guide in README 2025-03-03 11:10:51 +08:00
Zhongjie Duan
c7912ed827 Merge pull request #388 from modelscope/preference_model
Preference model
2025-03-02 19:56:00 +08:00
Artiprocher
e63f9d6993 update preference models 2025-03-02 19:52:27 +08:00
Raffaele Mancuso
d80ef3a677 Sentencepiece requires cmake 2025-03-02 10:58:42 +01:00
philipy1219
852c3d831f support sageattn 2025-03-02 15:09:21 +08:00
Zhongjie Duan
ceb92ee7aa Merge pull request #378 from modelscope/wan-video-params
update wan input params
2025-02-28 19:52:20 +08:00
Artiprocher
3a75026176 update wan input params 2025-02-28 19:43:18 +08:00
Zhongjie Duan
6a92b08244 Merge pull request #375 from modelscope/swanlab-dev
del swanlab because of bad cases
2025-02-28 16:16:56 +08:00
Zhongjie Duan
38bc785ea9 Merge branch 'main' into swanlab-dev 2025-02-28 16:16:15 +08:00
Artiprocher
a466fdca8f del swanlab 2025-02-28 16:13:06 +08:00
Zhongjie Duan
f9f49e3c78 Merge pull request #374 from modelscope/wan-tokenizer-bugfix
align wan tokenizer to official
2025-02-28 16:05:36 +08:00
Artiprocher
61a30673c2 align wan tokenizer to official 2025-02-28 15:50:07 +08:00
Yingda Chen
a48822ec00 Merge pull request #372 from Zeyi-Lin/main
fix: text-to-image swanlab_logger
2025-02-28 14:38:36 +08:00
ZeYi Lin
b6c3d2b74a fix: logger 2025-02-28 12:51:58 +08:00
Zhongjie Duan
5006c2176c Merge pull request #371 from modelscope/wan-video-readme
Update README.md
2025-02-28 10:10:03 +08:00
Zhongjie Duan
d3d3556ff6 Update README.md 2025-02-28 10:09:48 +08:00
Zhongjie Duan
6fa8dbe077 Merge pull request #366 from modelscope/swanlab
Swanlab
2025-02-27 19:32:23 +08:00
Artiprocher
a57749ef60 update swanlab log 2025-02-27 19:30:53 +08:00
Artiprocher
b5c1d33e58 update swanlab log 2025-02-27 19:21:51 +08:00
Zhongjie Duan
34a9f82865 Merge pull request #365 from modelscope/wan-train-dev
update wanx lora examples
2025-02-27 19:07:10 +08:00
Artiprocher
18dc6cb962 update wanx lora examples 2025-02-27 19:06:24 +08:00
wang96
490d420d82 fix bugs 2025-02-27 15:26:39 +08:00
wang96
0aca943a39 Merge remote-tracking branch 'upstream/main' 2025-02-27 15:23:55 +08:00
Zhongjie Duan
c760208614 Merge pull request #360 from modelscope/wan-train-dev
support wan image training
2025-02-27 12:58:32 +08:00
Artiprocher
fad7aea58a support wan image training 2025-02-27 12:56:55 +08:00
Zhongjie Duan
b42eb1444c Merge pull request #357 from modelscope/bugfix
bugfix
2025-02-27 11:06:24 +08:00
Zhongjie Duan
25a247dd3f bugfix 2025-02-27 11:06:10 +08:00
Zhongjie Duan
7792017a02 Update README.md 2025-02-27 10:52:47 +08:00
Zhongjie Duan
0219e8d2f3 Update README.md 2025-02-26 22:53:07 +08:00
Zhongjie Duan
1d309a14a3 Merge pull request #352 from modelscope/bugfix
Fix Wan VAE device
2025-02-26 20:03:53 +08:00
Zhongjie Duan
7df73ceaaf Fix Wan VAE device 2025-02-26 20:03:26 +08:00
wang96
0dbb3d333f feat: support I2V training 2025-02-26 19:50:59 +08:00
ZeYi Lin
1419bec53d feat: add swanlab logger 2025-02-26 17:12:54 +08:00
Zhongjie Duan
cf12723c89 Merge pull request #347 from co63oc/fix1
Fix typos
2025-02-26 15:50:36 +08:00
co63oc
4268f5466b Fix 2025-02-26 14:18:36 +08:00
Zhongjie Duan
b9f5a00d98 Merge pull request #345 from ghunkins/dev/ghunkins/allow-for-py39
🐍 Remove Python 3.10 Type Hint
2025-02-26 11:42:19 +08:00
Zhongjie Duan
7d44dc99fb support wan full training
support wan full train
2025-02-26 11:38:51 +08:00
Artiprocher
b20de1b44d support wan full train 2025-02-26 11:34:04 +08:00
Gregory D. Hunkins
366ee0f542 remove py310 type hint 2025-02-25 22:29:53 -05:00
Artiprocher
bed770248b update examples 2025-02-26 10:25:36 +08:00
Kohaku-Blueleaf
020560d2b5 Fix num_frames in i2v (#339)
* Fix num_frames in i2v

* Remove print in flash_attention
2025-02-26 10:05:51 +08:00
Zhongjie Duan
af7d305f00 Wan video (#338) 2025-02-25 19:00:43 +08:00
Zhongjie Duan
427232cbc0 Merge pull request #328 from modelscope/stepvideo
Stepvideo low VRAM support!
2025-02-18 18:01:40 +08:00
Zhongjie Duan
2899283c01 Update stepvideo examples 2025-02-18 18:00:08 +08:00
Artiprocher
9cff769fbd optimize stepvideo vae 2025-02-18 17:28:05 +08:00
Zhongjie Duan
23e33273f1 Merge pull request #327 from modelscope/stepvideo
support stepvideo quantized
2025-02-17 19:44:41 +08:00
Artiprocher
f191353cf4 support stepvideo quantized 2025-02-17 19:43:47 +08:00
Zhongjie Duan
66a094fc84 Merge pull request #326 from modelscope/stepvideo
support stepvideo
2025-02-17 17:35:26 +08:00
Artiprocher
3681adc5ac support stepvideo 2025-02-17 17:32:25 +08:00
YunhongLu-ZJU
4449faaa01 Merge branch 'modelscope:main' into main 2025-02-17 14:45:13 +08:00
YunhongLu-ZJU
991ba162bd add new quality metric 2025-02-17 14:42:20 +08:00
YunhongLu-ZJU
77d0f4d297 add image quality metric 2025-02-14 14:02:17 +08:00
YunhongLu-ZJU
a834371d50 add quality metric 2025-02-14 13:59:56 +08:00
hongzhang.hz
acda7d891a image quality metric 2025-02-14 12:39:06 +08:00
Zhongjie Duan
7434ec8fcd Merge pull request #324 from modelscope/vram_management
support vram management in flux
2025-02-14 10:54:55 +08:00
Artiprocher
0699212665 support vram management in flux 2025-02-13 15:11:39 +08:00
Zhongjie Duan
f47de78b59 Merge pull request #323 from mi804/eligen
update eligen dataset
2025-02-12 19:14:02 +08:00
mi804
5fdc8039ec update eligen dataset 2025-02-11 13:53:51 +08:00
Zhongjie Duan
46d4616e23 Update setup.py 2025-02-06 20:12:01 +08:00
Zhongjie Duan
2e597335be Merge pull request #320 from mi804/eligen
update eligen ui and readme
2025-01-24 16:40:45 +08:00
mi804
d346300162 update eligen ui and readme 2025-01-24 11:26:48 +08:00
Zhongjie Duan
1df7387f1b Merge pull request #318 from modelscope/hunyuanvideo-seed
fix rand device
2025-01-15 20:07:51 +08:00
Artiprocher
75d62a02d1 fix rand device 2025-01-15 19:30:38 +08:00
Zhongjie Duan
9db26879df Merge pull request #317 from mi804/eligen
update eligen logo_transfer
2025-01-14 17:49:03 +08:00
mi804
7beac7972e update eligen logo_transfer 2025-01-14 17:47:39 +08:00
Zhongjie Duan
72cac18d3e Merge pull request #316 from modelscope/teacache-hunyuanvideo
support teacache-hunyuanvideo
2025-01-14 14:48:04 +08:00
Artiprocher
9f8112ec34 support teacache-hunyuanvideo 2025-01-14 14:46:35 +08:00
Zhongjie Duan
d9fad821b2 Merge pull request #314 from modelscope/teacache
support teacache
2025-01-13 15:59:01 +08:00
Artiprocher
c0889c2564 support teacache 2025-01-13 15:56:33 +08:00
Zhongjie Duan
913591c13e Merge pull request #313 from modelscope/Artiprocher-patch-2
Update model_config.py
2025-01-12 11:15:18 +08:00
Zhongjie Duan
aaf13d6e4a Update model_config.py 2025-01-12 11:14:57 +08:00
Zhongjie Duan
90c07fec61 Merge pull request #312 from modelscope/HunyuanVideo-fp8
Update model_config.py
2025-01-11 20:41:20 +08:00
Zhongjie Duan
cc6c3c0807 Update model_config.py 2025-01-11 20:40:53 +08:00
Zhongjie Duan
ce2476ab9b Merge pull request #311 from mi804/eligen
update eligen readme's visualization
2025-01-09 16:26:54 +08:00
mi804
9e70c49317 update eligen readme 2025-01-09 16:22:39 +08:00
Zhongjie Duan
bf1c99645b Merge pull request #308 from mi804/eligen
fix bug for enable_eligen_on_negative
2025-01-09 15:52:03 +08:00
mi804
c2478ff284 update eligen examples and readme 2025-01-09 15:47:23 +08:00
mi804
a60bf3cd5f fix bug for enable_eligen_on_negative 2025-01-08 19:04:33 +08:00
Hong Zhang
34231907d0 Merge pull request #304 from modelscope/eligen-entity-transfer
add entity transfer example
2025-01-03 15:10:59 +08:00
Artiprocher
840dab58cd add entity transfer example 2025-01-03 14:40:37 +08:00
Zhongjie Duan
d5ceca0663 Merge pull request #303 from modelscope/eligen
Eligen
2025-01-03 10:47:26 +08:00
mi804
8cf3422688 update eligen ui 2025-01-03 10:37:34 +08:00
Artiprocher
6f743fc4b6 refine code 2025-01-02 19:54:09 +08:00
Zhongjie Duan
991b133bff Merge pull request #302 from modelscope/cache_latents
Update text_to_image.py
2025-01-02 14:23:33 +08:00
Zhongjie Duan
3b010043de Update text_to_image.py 2025-01-02 14:23:02 +08:00
Zhongjie Duan
088ea29e6e Merge pull request #301 from modelscope/Artiprocher-patch-1
Update model_config.py
2025-01-02 10:54:46 +08:00
Zhongjie Duan
b8b135ff73 Update model_config.py 2025-01-02 10:54:22 +08:00
mi804
2872fdaf48 update video of entity control 2024-12-31 18:09:29 +08:00
mi804
9853f83454 update readme video 2024-12-31 18:02:49 +08:00
mi804
fd6e661203 update readme 2024-12-31 17:50:20 +08:00
mi804
c087f68d74 update readme 2024-12-31 17:08:44 +08:00
mi804
b6620f3dde update_example entity control 2024-12-31 14:04:28 +08:00
Zhongjie Duan
3228c3e085 Support MERJIC's new model (#298)
* Update flux_dit.py
* Update model_config.py
2024-12-28 21:21:25 +08:00
Zhongjie Duan
6cc5fd6d1e Merge pull request #297 from modelscope/dev
Dev
2024-12-26 10:21:50 +08:00
Artiprocher
4f6d5e7074 hunyuanvideo step_processor 2024-12-26 10:20:59 +08:00
Artiprocher
6a999e1127 hunyuanvideo step_processor 2024-12-26 10:13:46 +08:00
mi804
e3d89cec0c temp commit for entity control 2024-12-25 17:19:31 +08:00
Zhongjie Duan
1b6e96a820 Merge pull request #296 from modelscope/dev
update hunyuanvideo examples
2024-12-24 10:48:11 +08:00
Artiprocher
e38ccf4c2f update hunyuanvideo examples 2024-12-24 10:47:26 +08:00
Zhongjie Duan
010c801081 Update hunyuanvideo_v2v_6G.py 2024-12-23 20:57:58 +08:00
Zhongjie Duan
edc9272e55 Merge pull request #295 from modelscope/dev
support hunyuanvideo v2v
2024-12-23 20:56:04 +08:00
Artiprocher
405ca6be33 support hunyuanvideo v2v 2024-12-23 20:43:47 +08:00
Zhongjie Duan
c06ea2271a Merge pull request #293 from modelscope/dev
hunyuanvideo quantization
2024-12-19 16:20:35 +08:00
Artiprocher
0692e8b1e1 hunyuanvideo quantization 2024-12-19 16:20:11 +08:00
Zhongjie Duan
aa23356420 Merge pull request #292 from modelscope/dev
hunyuanvideo examples
2024-12-19 13:29:51 +08:00
Zhongjie Duan
00a610e5ad Merge branch 'main' into dev 2024-12-19 13:29:40 +08:00
Artiprocher
2e39dcc0d3 hunyuanvideo examples 2024-12-19 13:28:44 +08:00
Zhongjie Duan
03d3a26f6f Merge pull request #291 from modelscope/dev
hunyuanvideo examples
2024-12-19 13:20:18 +08:00
Artiprocher
309fa9cf51 hunyuanvideo examples 2024-12-19 13:19:39 +08:00
Zhongjie Duan
65aab8adea Merge pull request #290 from modelscope/dev
Dev
2024-12-19 13:16:55 +08:00
Artiprocher
3d48b287a3 hunyuanvideo examples 2024-12-19 13:15:06 +08:00
Zhongjie Duan
29cebf0bec Update artaug_flux.py 2024-12-18 20:43:53 +08:00
Zhongjie Duan
95a0f0bedc Update README.md 2024-12-18 20:42:50 +08:00
Zhongjie Duan
77e0617861 Merge pull request #289 from modelscope/artaug
Artaug
2024-12-18 20:40:13 +08:00
Artiprocher
469a0405a1 ArtAug 2024-12-18 20:32:23 +08:00
Zhongjie Duan
46f191ffe7 Merge pull request #288 from mi804/hunyuanvideo
Hunyuanvideo
2024-12-18 19:40:23 +08:00
Artiprocher
ec7ac20def hunyuanvideo text encoder offload 2024-12-18 19:35:04 +08:00
mi804
3f410b0b77 hunyuanvideo_vae_encoder 2024-12-18 19:03:04 +08:00
mi804
8e06cac0df vae_encoder_weightsloading 2024-12-18 17:37:46 +08:00
Artiprocher
e5099f4e74 hunyuanvideo 2024-12-18 16:43:06 +08:00
Zhongjie Duan
447adef472 Merge pull request #287 from modelscope/dev-dzj
hunyuanvideo pipeline
2024-12-18 11:47:44 +08:00
Zhongjie Duan
a849b05e5a Merge branch 'dev' into dev-dzj 2024-12-18 11:47:34 +08:00
Artiprocher
b048f1b1de hunyuanvideo pipeline 2024-12-18 11:42:43 +08:00
Zhongjie Duan
f7848f9560 Merge pull request #286 from mi804/hunyuanvideo
hunyuanvideo_vae_decoder
2024-12-18 11:35:06 +08:00
mi804
236b56d285 hunyuanvideo_vae_decoder_model 2024-12-18 11:31:33 +08:00
Zhongjie Duan
42a717054a Merge branch 'dev' into hunyuanvideo 2024-12-18 11:21:33 +08:00
mi804
263166768e hunyuanvideo_vae_decoder 2024-12-18 11:14:57 +08:00
Zhongjie Duan
7a45b7efa7 Merge pull request #284 from modelscope/dev-dzj
hunyuanvideo dit
2024-12-17 14:50:21 +08:00
Zhongjie Duan
54ed532e3e Merge branch 'dev' into dev-dzj 2024-12-17 14:49:46 +08:00
Artiprocher
05e2028c5d hunyuanvideo dit 2024-12-17 14:45:23 +08:00
Zhongjie Duan
79249063b8 Merge pull request #283 from mi804/hunyuanvideo
hunyuanvideo text encoder
2024-12-17 14:42:46 +08:00
Zhongjie Duan
31ebec7a72 Merge pull request #282 from modelscope/lora-patch-2
support resume from opensource format
2024-12-16 12:26:37 +08:00
Artiprocher
919d399fdb support resume from opensource format 2024-12-16 12:25:05 +08:00
Zhongjie Duan
32a7a1487d Merge pull request #281 from modelscope/lora-patch
support resume training
2024-12-16 11:10:32 +08:00
Artiprocher
8c2671ce40 support resume training 2024-12-16 11:08:14 +08:00
root
5d1005a7c8 hunyuanvideo text encoder 2024-12-11 18:52:42 +08:00
Artiprocher
b84f906964 support artaug 2024-12-03 15:30:01 +08:00
Zhongjie Duan
7c0520d029 Merge pull request #277 from modelscope/sd35-lora
support sd35-lora
2024-11-29 12:35:32 +08:00
Artiprocher
9d09121fbc support sd35-lora 2024-11-29 11:45:40 +08:00
Zhongjie Duan
7f2a5424d4 Merge pull request #276 from modelscope/Artiprocher-patch-2
Update flux_ipadapter example
2024-11-28 10:44:29 +08:00
Zhongjie Duan
00830f0ecd Update flux_ipadapter.py 2024-11-28 10:44:07 +08:00
Zhongjie Duan
fd7737af7d Merge pull request #275 from mi804/flux_ipadapter
Flux ipadapter
2024-11-28 10:43:06 +08:00
root
f2130c4c25 minor 2024-11-26 19:08:41 +08:00
root
4f40683fd8 support flux ipadapter 2024-11-26 18:08:50 +08:00
Zhongjie Duan
5fc9e53eec Merge pull request #272 from modelscope/fix_kolors_pad
fix_kolors_pad
2024-11-21 14:50:21 +08:00
tc2000731
27e3cea285 fix_kolors_pad 2024-11-21 11:39:28 +08:00
Zhongjie Duan
ee770fa68f Merge pull request #271 from modelscope/sd35-series
Sd35 series
2024-11-20 09:54:41 +08:00
Artiprocher
9cb4aa16eb fix cogvideo height width checker 2024-11-20 09:51:31 +08:00
Zhongjie Duan
92d990629f Merge pull request #269 from modelscope/fix_image_resize
fix_image_resize
2024-11-18 19:24:57 +08:00
tc2000731
ba58f1bc0b fix_image_resize 2024-11-18 18:34:21 +08:00
Artiprocher
02fcfd530f support sd3.5 medium and large-turbo 2024-11-15 14:20:39 +08:00
Zhongjie Duan
095e8a3de8 Merge pull request #265 from modelscope/dev
support height width checker
2024-11-13 12:39:56 +08:00
Artiprocher
e17ad83fb5 support height width checker 2024-11-13 12:39:09 +08:00
Zhongjie Duan
e7c41151ec Merge pull request #264 from modelscope/dev
Dev
2024-11-13 09:53:49 +08:00
Artiprocher
7f4ba62d4f support size checker 2024-11-12 19:41:09 +08:00
Artiprocher
71b17a3a53 update mask blur 2024-11-12 19:20:17 +08:00
Artiprocher
d46b8b8fd7 bug fix 2024-11-12 10:17:01 +08:00
Artiprocher
a671070a28 bug fix 2024-11-11 21:01:38 +08:00
Zhongjie Duan
4600d5351b Update model_config.py 2024-11-11 19:26:30 +08:00
Zhongjie Duan
75bba5b8e5 Merge pull request #263 from modelscope/super-alignment
support mask blur
2024-11-11 19:24:30 +08:00
Artiprocher
8d1d1536d3 support mask blur 2024-11-11 18:59:55 +08:00
Zhongjie Duan
a7050a185b Merge pull request #262 from modelscope/sd3.5
Sd3.5
2024-11-11 18:47:49 +08:00
Zhongjie Duan
d345541c2d Merge pull request #261 from modelscope/omnigen
support omnigen
2024-11-11 18:47:09 +08:00
Artiprocher
bd028e4c66 support omnigen 2024-11-11 18:39:40 +08:00
Zhongjie Duan
d6f4fb67cc Merge pull request #260 from mi804/sd3.5
update default t5_sequence_length to 77
2024-11-11 16:39:31 +08:00
mi804
4378b540cf update t5_sequence_length 2024-11-11 16:28:17 +08:00
Artiprocher
39ddb7c3e3 support sd3.5 2024-11-06 19:57:01 +08:00
Zhongjie Duan
344cbd3286 Merge pull request #258 from modelscope/Artiprocher-patch-2
Update README.md
2024-11-05 19:09:04 +08:00
Zhongjie Duan
d4ba173b53 Update README.md 2024-11-05 19:08:52 +08:00
Zhongjie Duan
c56ce656b2 Merge pull request #252 from modelscope/Flux_ControlNet_Quantization
add Flux_ControlNet_Quantization
2024-11-01 14:51:10 +08:00
tc2000731
9377214518 update controlnet_frames, downloads 2024-10-31 17:38:57 +08:00
tc2000731
900a1c095f add Flux_ControlNet_Quantization 2024-10-29 17:29:24 +08:00
Zhongjie Duan
7e97a96840 Merge pull request #249 from modelscope/newpush
update noise generate
2024-10-25 16:43:37 +08:00
Zhongjie Duan
69f272d7ba Merge pull request #251 from modelscope/flux-examples
Flux examples
2024-10-25 16:35:47 +08:00
Artiprocher
a653554bd9 update examples 2024-10-25 16:30:35 +08:00
Artiprocher
6a25006544 update examples 2024-10-25 16:27:19 +08:00
Qianyi Zhao
8cfe4820f6 Update sd_video.py 2024-10-25 03:23:01 -05:00
Qianyi Zhao
c8021d4224 Update svd_video.py 2024-10-25 01:44:09 -05:00
Zhongjie Duan
3a64cc27b5 Merge pull request #250 from modelscope/flux-controlnet
Flux controlnet
2024-10-25 10:58:37 +08:00
Zhongjie Duan
2edc485ec1 Update requirements.txt 2024-10-25 00:16:11 +08:00
Artiprocher
a6d6553cee bug fix 2024-10-24 17:36:22 +08:00
Artiprocher
45feef9413 update model config 2024-10-24 16:10:15 +08:00
Artiprocher
105fe3961c update examples 2024-10-24 15:42:46 +08:00
Qianyi Zhao
d381c7b186 Update svd_video.py 2024-10-23 03:27:59 -05:00
Zhongjie Duan
5e8334c0bf Merge pull request #248 from modelscope/Artiprocher-patch-1
Update requirements.txt
2024-10-23 16:03:35 +08:00
Zhongjie Duan
2ea8a16afb Update requirements.txt 2024-10-23 16:03:21 +08:00
Artiprocher
aa054db1c7 bug fix 2024-10-23 14:24:41 +08:00
Artiprocher
07d70a6a56 support flux-controlnet 2024-10-22 18:52:24 +08:00
Qing112
747572e62c update noise generate 2024-10-21 15:09:21 +08:00
Zhongjie Duan
72ed76e89e Merge pull request #243 from modelscope/flux-lora
support preset lora
2024-10-21 14:04:44 +08:00
Artiprocher
a403cb04f3 support preset lora 2024-10-21 14:03:58 +08:00
Zhongjie Duan
ed71184854 Merge pull request #242 from modelscope/accelerate_load_model
accelerate load model
2024-10-21 10:00:09 +08:00
tc2000731
dfbf43e463 accelerate load model 2024-10-18 15:29:50 +08:00
Zhongjie Duan
7d7d72dcfe Merge pull request #239 from modelscope/flux-lora-update
Flux lora update
2024-10-14 19:12:33 +08:00
Artiprocher
540c036988 add alpha to lora converter 2024-10-14 18:57:54 +08:00
Artiprocher
58f89ceec9 update examples 2024-10-14 17:51:12 +08:00
Artiprocher
4e3a184199 update flux training 2024-10-14 10:00:32 +08:00
Zhongjie Duan
22e4ae99e8 Flux lora update (#237)
* update flux lora

---------

Co-authored-by: tc2000731 <tc2000731@163.com>
2024-10-11 18:41:24 +08:00
Zhongjie Duan
75ab786afc Merge pull request #234 from modelscope/doc-patch
Patch
2024-10-10 19:17:00 +08:00
Artiprocher
e5c72ba1f2 update examples 2024-10-10 18:26:37 +08:00
Artiprocher
66873d7d64 update examples 2024-10-10 18:23:43 +08:00
Artiprocher
a0d1d5bcea update examples 2024-10-10 17:25:55 +08:00
Artiprocher
fa0fa95bb6 update flux pipeline 2024-10-10 17:05:04 +08:00
Artiprocher
41ea2f811a update ESRGAN 2024-10-08 18:23:39 +08:00
Artiprocher
ec352cfce2 update model loader 2024-10-08 16:46:44 +08:00
Zhongjie Duan
aade874241 Merge pull request #232 from modelscope/Artiprocher-patch-1
Update README.md
2024-10-08 13:37:12 +08:00
Zhongjie Duan
c01eb653d7 Update README.md 2024-10-08 13:36:56 +08:00
Zhongjie Duan
892f80c265 Merge pull request #230 from modelscope/Artiprocher-dev
support ExVideo-CogVideoX-LoRA-129f-v1
2024-09-30 17:42:49 +08:00
Artiprocher
2e487a2c55 support ExVideo-CogVideoX-LoRA-129f-v1 2024-09-30 17:33:15 +08:00
Zhongjie Duan
a34e3ba338 Merge pull request #229 from modelscope/flux-enhance
support t5 sequence length
2024-09-30 15:33:51 +08:00
Artiprocher
c414f4cb12 support t5 sequence length 2024-09-30 14:45:30 +08:00
Zhongjie Duan
d91c603875 Flux fp8 lora training (#221)
* flux fp8 lora training

---------

Co-authored-by: tc2000731 <tc2000731@163.com>
2024-09-24 11:12:32 +08:00
Zhongjie Duan
7f899dcfca Merge pull request #216 from modelscope/Artiprocher-bugfix
bug fix
2024-09-19 12:27:22 +08:00
Artiprocher
5f12fd4346 bug fix 2024-09-19 12:26:46 +08:00
Zhongjie Duan
a7197f846b Merge pull request #215 from modelscope/flux-fp8
Support FLUX fp8
2024-09-19 10:36:16 +08:00
Artiprocher
ac81fa7a9f update examples 2024-09-19 10:33:30 +08:00
Artiprocher
091df1f1e7 support flux-fp8 2024-09-19 10:32:16 +08:00
tc2000731
a9fbfa108f float8_flux 2024-09-18 16:10:59 +08:00
Zhongjie Duan
44a8bf4143 Merge pull request #210 from modelscope/opensource-alignment
staticmethod
2024-09-14 17:18:19 +08:00
Artiprocher
3da8aa257b staticmethod 2024-09-14 17:16:59 +08:00
Zhongjie Duan
884dd749a0 Merge pull request #209 from modelscope/Artiprocher-patch-1
Update model_config.py
2024-09-14 11:42:30 +08:00
Zhongjie Duan
c697591d6e Update model_config.py 2024-09-14 11:41:47 +08:00
Zhongjie Duan
0b706e03e7 Merge pull request #208 from Qing112/main
update model_config and downloader
2024-09-14 11:40:42 +08:00
Qing112
447e75cd06 update model_config and downloader 2024-09-14 11:35:01 +08:00
Zhongjie Duan
7f76c8809c Merge pull request #207 from modelscope/flux-schnell
support flux-schnell
2024-09-14 11:17:59 +08:00
Artiprocher
cde1f81df6 support flux-schnell 2024-09-14 11:16:03 +08:00
Zhongjie Duan
c21ed1e478 Flux lora (#205) 2024-09-12 16:49:30 +08:00
Zhongjie Duan
a8cb4a21d1 align flux lora format (#204) 2024-09-12 16:01:27 +08:00
Zhongjie Duan
0b9e673fa2 Merge pull request #199 from modelscope/examples
update examples
2024-09-10 17:45:44 +08:00
Artiprocher
d242af8e22 update examples 2024-09-10 17:36:35 +08:00
Hong Zhang
76bd931d79 refine system_prompt for QwenPrompt (#198) 2024-09-10 15:15:23 +08:00
ZhouTianchen
995f3374f1 update omost (#190)
* update omost
2024-09-09 17:39:46 +08:00
Zhongjie Duan
1887885274 Merge pull request #197 from mi804/cpuoffload
add cpuoffload support for image pipelines
2024-09-09 14:48:26 +08:00
mi804
ce43cf412d add cpuoffload support for image pipelines 2024-09-09 13:50:52 +08:00
Zhongjie Duan
d1712f0594 Merge pull request #194 from modelscope/flux-lora
support flux training
2024-09-06 19:15:42 +08:00
Artiprocher
416b73b8c0 support flux training 2024-09-06 10:37:28 +08:00
Zhongjie Duan
4654aa0cab Merge pull request #188 from modelscope/qwen
support Qwen prompt refine
2024-09-04 17:22:56 +08:00
Zhongjie Duan
6f9d8f465a Merge branch 'main' into qwen 2024-09-04 17:22:38 +08:00
Artiprocher
e5e55345dc support qwen prompt refiner 2024-09-04 17:12:01 +08:00
Zhongjie Duan
8d6eb6d41a Merge pull request #187 from modelscope/omost
support Omost LLM
2024-09-04 12:52:23 +08:00
Zhongjie Duan
1118e67cec Merge branch 'main' into omost 2024-09-04 12:52:03 +08:00
Artiprocher
d70cd04b15 fix bugs 2024-09-04 12:48:32 +08:00
Zhongjie Duan
3d1db23224 Merge pull request #186 from modelscope/flux-lora
support flux lora inference
2024-09-04 09:47:08 +08:00
Artiprocher
a488810693 support flux lora inference 2024-09-04 09:39:39 +08:00
tc2000731
0b066d3cb4 add omost.py + omost_flux_example 2024-09-03 19:40:40 +08:00
Zhongjie Duan
d154bee18a support CogVideoX-5B (#184)
* support cogvideo

* update examples
2024-09-03 11:37:54 +08:00
Yudi
3a8694b642 add qwen prompt refiner 2024-08-27 17:28:32 +08:00
Zhongjie Duan
fe485b3fa1 Merge pull request #176 from modelscope/Artiprocher-dev
remove packages from requirements.txt
2024-08-26 15:02:59 +08:00
Artiprocher
e70eaa6a31 remove packages from requirements.txt 2024-08-26 15:01:35 +08:00
Zhongjie Duan
27ef67306d Merge pull request #175 from modelscope/Artiprocher-dev
model cache
2024-08-26 13:57:48 +08:00
Artiprocher
547aca3db2 model cache 2024-08-26 13:57:03 +08:00
Zhongjie Duan
5f7360e2ce Merge pull request #171 from modelscope/Artiprocher-dev
update README
2024-08-23 16:47:13 +08:00
Artiprocher
23f9675218 update README 2024-08-23 16:46:26 +08:00
Zhongjie Duan
ef1e82076c Merge pull request #170 from modelscope/Artiprocher-dev
update model config
2024-08-23 14:18:15 +08:00
Artiprocher
65d4588cc7 update model config 2024-08-23 14:17:10 +08:00
Zhongjie Duan
0488f90c8f Merge pull request #169 from modelscope/Artiprocher-dev
fix bug
2024-08-23 09:28:46 +08:00
Artiprocher
03d91f6618 fix bug 2024-08-23 09:28:10 +08:00
Zhongjie Duan
ae5e4b67dc Merge pull request #166 from modelscope/Artiprocher-dev
update examples
2024-08-22 11:48:50 +08:00
Artiprocher
a6c6e33d88 update examples 2024-08-22 11:41:48 +08:00
Zhongjie Duan
79d9bf7109 Merge pull request #165 from modelscope/Artiprocher-dev
update UI
2024-08-22 10:45:23 +08:00
Artiprocher
66e1b382cd update examples 2024-08-22 10:37:30 +08:00
Artiprocher
66f1ff43e9 update examples 2024-08-22 10:35:58 +08:00
Artiprocher
d6d14859e3 update UI 2024-08-21 16:57:56 +08:00
Zhongjie Duan
4478bb9bbe Merge pull request #164 from modelscope/Artiprocher-dev
FLUX highres-fix
2024-08-20 13:40:23 +08:00
Artiprocher
a6aaf9da2a support flux UI 2024-08-19 14:24:23 +08:00
Artiprocher
aa908ae0c2 support flux highresfix 2024-08-19 13:35:40 +08:00
Artiprocher
778a2d8f84 support flux highresfix 2024-08-19 13:35:27 +08:00
Zhongjie Duan
508baabf9a Merge pull request #160 from modelscope/Artiprocher-dev
support FLUX
2024-08-17 17:52:59 +08:00
Artiprocher
80aa4d8e19 update examples 2024-08-17 17:51:31 +08:00
Artiprocher
99e11112a7 support FLUX 2024-08-16 20:04:10 +08:00
Zhongjie Duan
1116e6dbc7 Merge pull request #155 from Qing112/main
add Flux text encoder
2024-08-14 11:28:14 +08:00
Qianyi Zhao
d1ac96c1ab add flux_text_encoder.py 2024-08-13 22:26:10 -05:00
Qianyi Zhao
abe88c899e add Flux text encoder 2024-08-14 10:46:52 +08:00
Zhongjie Duan
b1709fcbdb Merge pull request #145 from modelscope/Artiprocher-dev
chatglm quantize
2024-08-02 15:09:41 +08:00
Artiprocher
ec877bf490 chatglm quantize 2024-08-02 14:46:29 +08:00
Zhongjie Duan
a8f1812acf Merge pull request #144 from modelscope/Artiprocher-dev
UI update
2024-08-02 13:49:48 +08:00
Artiprocher
6877b460c4 fix bugs 2024-08-02 13:47:07 +08:00
Artiprocher
f189f9f1be update UI 2024-08-02 10:31:25 +08:00
Artiprocher
6f79fd6d77 support sdxl controlnet union 2024-08-01 10:01:39 +08:00
Zhongjie Duan
60d7bb52d6 Update README.md 2024-07-30 10:42:43 +08:00
Yingda Chen
65a2a0643a add badges 2024-07-30 10:32:03 +08:00
Zhongjie Duan
bc5f151dfa Update setup.py 2024-07-29 20:22:01 +08:00
Zhongjie Duan
5cd6ed0096 Update publish.yaml 2024-07-29 20:12:37 +08:00
Zhongjie Duan
be84b35bfd Update publish.yaml 2024-07-29 19:36:28 +08:00
Zhongjie Duan
d9fc30ffd0 Create publish.yaml 2024-07-29 19:27:14 +08:00
Zhongjie Duan
8f59d00d9e Merge pull request #135 from modelscope/Artiprocher-setup
update setup.py
2024-07-29 19:16:59 +08:00
Artiprocher
3d8ff39aed update setup.py 2024-07-29 19:10:03 +08:00
Zhongjie Duan
b5c194df43 Merge pull request #134 from modelscope/Artiprocher-webui
support kolors in webui
2024-07-29 16:25:25 +08:00
Artiprocher
8680f92b60 support kolors in webui 2024-07-29 16:24:13 +08:00
Zhongjie Duan
05c97bc755 Merge pull request #133 from modelscope/Artiprocher-doc
add general options to lora readme
2024-07-29 14:45:25 +08:00
Artiprocher
db88d60750 add general options to lora readme 2024-07-29 14:44:29 +08:00
Zhongjie Duan
40c6da8075 Merge pull request #132 from modelscope/Artiprocher-rebuild
rebuild base modules
2024-07-29 12:14:26 +08:00
Artiprocher
3981b8084f redirect Kolors 2024-07-29 10:22:47 +08:00
Zhongjie Duan
9dfb7c1c37 Merge pull request #128 from Yuan-ManX/Kolors-1
support Kolors
2024-07-28 17:09:52 +08:00
Artiprocher
9ed54c188e fix bugs 2024-07-26 17:51:03 +08:00
Yuan-Man
6a47a346b1 support Kolors 2024-07-26 16:43:52 +08:00
Artiprocher
e3f8a576cf rebuild base modules 2024-07-26 12:15:40 +08:00
Yingda Chen
0aff733a92 add github trending badge 2024-07-26 11:32:23 +08:00
Zhongjie Duan
9471bff8a4 Merge pull request #107 from modelscope/Artiprocher-dev
reduce VRAM requirements in Kolors LoRA
2024-07-12 17:42:22 +08:00
Artiprocher
3f8eea4687 update downloader 2024-07-12 17:39:26 +08:00
Artiprocher
b1b2d50c0d reduce VRAM requirements in Kolors LoRA 2024-07-12 17:30:19 +08:00
Zhongjie Duan
9c6607f78d support kolors! (#106) 2024-07-11 21:43:45 +08:00
Zhongjie Duan
2a4709e572 Merge pull request #102 from modelscope/Artiprocher-ExVideo
Add ExVideo Demo link
2024-07-10 16:49:03 +08:00
Artiprocher
04f3fce3b0 add ExVideo demo link 2024-07-10 16:45:18 +08:00
Artiprocher
be9c3524a5 add ExVideo demo link 2024-07-10 16:44:32 +08:00
Zhongjie Duan
c3d899dd48 Merge pull request #101 from modelscope/Artiprocher-sd3-lora
Support SD3 LoRA
2024-07-10 13:42:54 +08:00
Artiprocher
6e03ee2a75 update examples 2024-07-10 13:41:11 +08:00
Artiprocher
979a8814f1 support SD3 LoRA 2024-07-10 10:07:02 +08:00
Zhongjie Duan
8be4fad330 Merge pull request #94 from modelscope/Artiprocher-sd3
support SD3
2024-07-05 16:39:59 +08:00
Artiprocher
8113f95278 update README 2024-07-05 16:38:10 +08:00
Artiprocher
9ca6c646df update SD3 examples 2024-07-05 16:35:41 +08:00
Artiprocher
466b37994e SD3 UI 2024-07-05 14:28:24 +08:00
Artiprocher
518c6d6ac3 support SD3 textual inversion 2024-07-05 13:36:54 +08:00
Artiprocher
9920b8d975 support SD3 2024-07-04 16:08:39 +08:00
Artiprocher
237daa2048 Merge pull request #87 from Lupino/main
pass device to processors Annotator
2024-07-04 10:34:40 +08:00
Lupino
e9af28e6a3 pass device to processors Annotator 2024-07-01 17:37:25 +08:00
Artiprocher
996515c7ca Merge pull request #73 from modelscope/tamannaaaaa-my-branch
Improve the script file
2024-06-28 11:21:13 +08:00
Artiprocher
c2ccc39e3c update script file based on tamannaaaaa 2024-06-28 11:16:42 +08:00
Artiprocher
ad24b93431 Merge branch 'my-branch' of https://github.com/tamannaaaaa/DiffSynth-Studio into tamannaaaaa-my-branch 2024-06-28 11:00:53 +08:00
Artiprocher
bd5fc32d79 Merge pull request #72 from modelscope/dev
add downloaders and update examples
2024-06-28 10:04:21 +08:00
Artiprocher
03cefe8f58 update examples 2024-06-28 09:49:52 +08:00
tamannaaaaa
64339f7089 Improved the script file 2024-06-27 18:23:44 +05:30
Artiprocher
0b1704976a update examples and downloaders 2024-06-27 19:43:50 +08:00
wenmeng zhou
0af60b9c73 Update README.md 2024-06-27 16:50:05 +08:00
Artiprocher
280f0eacc0 Merge pull request #65 from modelscope/wenmengzhou-patch-1
Update README.md
2024-06-27 16:32:54 +08:00
wenmeng zhou
03cba5e59e Update README.md 2024-06-27 15:56:51 +08:00
Artiprocher
fa0ea0e1a4 Update README.md 2024-06-25 17:03:52 +08:00
Artiprocher
40d24b8907 Merge pull request #48 from modelscope/package
simplify installation
2024-06-25 15:56:49 +08:00
Artiprocher
1bf02f439f update setup.py 2024-06-25 15:53:35 +08:00
Artiprocher
0489c62550 update setup.py 2024-06-25 15:43:27 +08:00
Artiprocher
ad98602da3 Merge pull request #47 from eltociear/patch-1
docs: update README.md
2024-06-25 14:57:34 +08:00
Ikko Eltociear Ashimine
fb12ac316a docs: update README.md
transfered -> transferred
2024-06-25 11:39:59 +09:00
Artiprocher
e9ec2f2706 add downloader 2024-06-24 16:45:35 +08:00
Artiprocher
00f294454b Merge pull request #43 from modelscope/ExVideo
fix compatibility issues in sd_video_pipeline
2024-06-21 16:25:48 +08:00
Artiprocher
0465d940c7 Merge pull request #42 from modelscope/ExVideo
update ExVideo doc
2024-06-21 12:59:29 +08:00
Artiprocher
2c549598d0 Merge pull request #41 from modelscope/ExVideo
update ExVideo doc
2024-06-21 12:48:50 +08:00
Artiprocher
7d33082d70 Merge pull request #40 from modelscope/ExVideo
ExVideo training
2024-06-21 11:43:48 +08:00
1064 changed files with 89039 additions and 274036 deletions

BIN
.github/workflows/logo.gif vendored Normal file

Binary file not shown.


29
.github/workflows/publish.yaml vendored Normal file
View File

@@ -0,0 +1,29 @@
name: release
on:
  push:
    tags:
      - 'v**'
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-publish
  cancel-in-progress: true
jobs:
  build-n-publish:
    runs-on: ubuntu-20.04
    #if: startsWith(github.event.ref, 'refs/tags')
    steps:
    - uses: actions/checkout@v2
    - name: Set up Python 3.10
      uses: actions/setup-python@v2
      with:
        python-version: '3.10'
    - name: Install wheel
      run: pip install wheel==0.44.0 build && pip install -r requirements.txt
    - name: Build DiffSynth
      run: python -m build
    - name: Publish package to PyPI
      run: |
        pip install twine
        twine upload dist/* --skip-existing -u __token__ -p ${{ secrets.PYPI_API_TOKEN }}
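The `on.push.tags` filter above means the release job fires only for pushed tags whose names begin with `v` (GitHub's `v**` glob). A minimal Python sketch of that matching rule, with illustrative tag names:

```python
from fnmatch import fnmatch

def is_release_tag(tag: str) -> bool:
    # Mirrors the workflow's "v**" tag pattern: any tag name starting with "v".
    return fnmatch(tag, "v*")

print(is_release_tag("v2.0.8"))  # prints True
print(is_release_tag("main"))    # prints False
```

In practice a release is cut by pushing such a tag, e.g. `git tag v2.0.8 && git push origin v2.0.8` (version number taken from the commit log above).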

176
.gitignore vendored Normal file
View File

@@ -0,0 +1,176 @@
/data
/models
/scripts
/diffusers
/.vscode
*.pkl
*.safetensors
*.pth
*.ckpt
*.pt
*.bin
*.DS_Store
*.msc
*.mv
log*.txt
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

View File

@@ -1,15 +0,0 @@
# Set web page format
import streamlit as st
st.set_page_config(layout="wide")
# Disable virtual VRAM on Windows systems
import torch
torch.cuda.set_per_process_memory_fraction(0.999, 0)
st.markdown("""
# DiffSynth Studio
[Source Code](https://github.com/Artiprocher/DiffSynth-Studio)
Welcome to DiffSynth Studio.
""")

View File

@@ -1,267 +0,0 @@
import torch, json, os, imageio
from torchvision.transforms import v2
from einops import rearrange
import lightning as pl
from diffsynth import ModelManager, EnhancedDDIMScheduler, SDVideoPipeline, SDUNet, load_state_dict, SDMotionModel


def lets_dance(
    unet: SDUNet,
    motion_modules: SDMotionModel,
    sample,
    timestep,
    encoder_hidden_states,
    use_gradient_checkpointing=False,
):
    # 1. ControlNet (skip)
    # 2. time
    time_emb = unet.time_proj(timestep[None]).to(sample.dtype)
    time_emb = unet.time_embedding(time_emb)
    # 3. pre-process
    hidden_states = unet.conv_in(sample)
    text_emb = encoder_hidden_states
    res_stack = [hidden_states]
    # 4. blocks
    def create_custom_forward(module):
        def custom_forward(*inputs):
            return module(*inputs)
        return custom_forward
    for block_id, block in enumerate(unet.blocks):
        # 4.1 UNet
        if use_gradient_checkpointing:
            hidden_states, time_emb, text_emb, res_stack = torch.utils.checkpoint.checkpoint(
                create_custom_forward(block),
                hidden_states, time_emb, text_emb, res_stack,
                use_reentrant=False,
            )
        else:
            hidden_states, time_emb, text_emb, res_stack = block(hidden_states, time_emb, text_emb, res_stack)
        # 4.2 AnimateDiff
        if block_id in motion_modules.call_block_id:
            motion_module_id = motion_modules.call_block_id[block_id]
            if use_gradient_checkpointing:
                hidden_states, time_emb, text_emb, res_stack = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(motion_modules.motion_modules[motion_module_id]),
                    hidden_states, time_emb, text_emb, res_stack,
                    use_reentrant=False,
                )
            else:
                hidden_states, time_emb, text_emb, res_stack = motion_modules.motion_modules[motion_module_id](hidden_states, time_emb, text_emb, res_stack)
    # 5. output
    hidden_states = unet.conv_norm_out(hidden_states)
    hidden_states = unet.conv_act(hidden_states)
    hidden_states = unet.conv_out(hidden_states)
    return hidden_states


class TextVideoDataset(torch.utils.data.Dataset):
    def __init__(self, base_path, metadata_path, steps_per_epoch=10000, training_shapes=[(128, 1, 128, 512, 512)]):
        with open(metadata_path, "r") as f:
            metadata = json.load(f)
        self.path = [os.path.join(base_path, i["path"]) for i in metadata]
        self.text = [i["text"] for i in metadata]
        self.steps_per_epoch = steps_per_epoch
        self.training_shapes = training_shapes
        self.frame_process = []
        for max_num_frames, interval, num_frames, height, width in training_shapes:
            self.frame_process.append(v2.Compose([
                v2.Resize(size=max(height, width), antialias=True),
                v2.CenterCrop(size=(height, width)),
                v2.Normalize(mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5]),
            ]))

    def load_frames_using_imageio(self, file_path, max_num_frames, start_frame_id, interval, num_frames, frame_process):
        reader = imageio.get_reader(file_path)
        if reader.count_frames() < max_num_frames or reader.count_frames() - 1 < start_frame_id + (num_frames - 1) * interval:
            reader.close()
            return None
        frames = []
        for frame_id in range(num_frames):
            frame = reader.get_data(start_frame_id + frame_id * interval)
            frame = torch.tensor(frame, dtype=torch.float32)
            frame = rearrange(frame, "H W C -> 1 C H W")
            frame = frame_process(frame)
            frames.append(frame)
        reader.close()
        frames = torch.concat(frames, dim=0)
        frames = rearrange(frames, "T C H W -> C T H W")
        return frames

    def load_video(self, file_path, training_shape_id):
        data = {}
        max_num_frames, interval, num_frames, height, width = self.training_shapes[training_shape_id]
        frame_process = self.frame_process[training_shape_id]
        start_frame_id = torch.randint(0, max_num_frames - (num_frames - 1) * interval, (1,))[0]
        frames = self.load_frames_using_imageio(file_path, max_num_frames, start_frame_id, interval, num_frames, frame_process)
        if frames is None:
            return None
        else:
            data[f"frames_{training_shape_id}"] = frames
            data[f"start_frame_id_{training_shape_id}"] = start_frame_id
        return data

    def __getitem__(self, index):
        video_data = {}
        for training_shape_id in range(len(self.training_shapes)):
            while True:
                data_id = torch.randint(0, len(self.path), (1,))[0]
                data_id = (data_id + index) % len(self.path)  # For fixed seed.
                text = self.text[data_id]
                if isinstance(text, list):
                    text = text[torch.randint(0, len(text), (1,))[0]]
                video_file = self.path[data_id]
                try:
                    data = self.load_video(video_file, training_shape_id)
                except:
                    data = None
                if data is not None:
                    data[f"text_{training_shape_id}"] = text
                    break
            video_data.update(data)
        return video_data

    def __len__(self):
        return self.steps_per_epoch


class LightningModel(pl.LightningModule):
    def __init__(self, learning_rate=1e-5, sd_ckpt_path=None):
        super().__init__()
        # Load models
        model_manager = ModelManager(torch_dtype=torch.float16, device="cpu")
        model_manager.load_stable_diffusion(load_state_dict(sd_ckpt_path))
        # Initialize motion modules
        model_manager.model["motion_modules"] = SDMotionModel().to(dtype=self.dtype, device=self.device)
        # Build pipeline
        self.pipe = SDVideoPipeline.from_model_manager(model_manager)
        self.pipe.vae_encoder.eval()
        self.pipe.vae_encoder.requires_grad_(False)
        self.pipe.vae_decoder.eval()
        self.pipe.vae_decoder.requires_grad_(False)
        self.pipe.text_encoder.eval()
        self.pipe.text_encoder.requires_grad_(False)
        self.pipe.unet.eval()
        self.pipe.unet.requires_grad_(False)
        self.pipe.motion_modules.train()
        self.pipe.motion_modules.requires_grad_(True)
        # Reset the scheduler
        self.pipe.scheduler = EnhancedDDIMScheduler(beta_schedule="scaled_linear")
        self.pipe.scheduler.set_timesteps(1000)
        # Other parameters
        self.learning_rate = learning_rate

    def encode_video_with_vae(self, video):
        video = video.to(device=self.device, dtype=self.dtype)
        video = video.unsqueeze(0)
        latents = self.pipe.vae_encoder.encode_video(video, batch_size=16)
        latents = rearrange(latents[0], "C T H W -> T C H W")
        return latents

    def calculate_loss(self, prompt, frames):
        with torch.no_grad():
            # Call video encoder
            latents = self.encode_video_with_vae(frames)
            # Call text encoder
            prompt_embs = self.pipe.prompter.encode_prompt(self.pipe.text_encoder, prompt, device=self.device, max_length=77)
            prompt_embs = prompt_embs.repeat(latents.shape[0], 1, 1)
            # Call scheduler
            timestep = torch.randint(0, len(self.pipe.scheduler.timesteps), (1,), device=self.device)[0]
            noise = torch.randn_like(latents)
            noisy_latents = self.pipe.scheduler.add_noise(latents, noise, timestep)
        # Calculate loss
        model_pred = lets_dance(
            self.pipe.unet, self.pipe.motion_modules,
            sample=noisy_latents, encoder_hidden_states=prompt_embs, timestep=timestep
        )
        loss = torch.nn.functional.mse_loss(model_pred.float(), noise.float(), reduction="mean")
        return loss

    def training_step(self, batch, batch_idx):
        # Loss
        frames = batch["frames_0"][0]
        prompt = batch["text_0"][0]
        loss = self.calculate_loss(prompt, frames)
        # Record log
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.pipe.motion_modules.parameters(), lr=self.learning_rate)
        return optimizer

    def on_save_checkpoint(self, checkpoint):
        trainable_param_names = list(filter(lambda named_param: named_param[1].requires_grad, self.pipe.motion_modules.named_parameters()))
        trainable_param_names = [named_param[0] for named_param in trainable_param_names]
        checkpoint["trainable_param_names"] = trainable_param_names


if __name__ == '__main__':
    # dataset and data loader
    dataset = TextVideoDataset(
        "/data/zhongjie/datasets/opensoraplan/data/processed",
        "/data/zhongjie/datasets/opensoraplan/data/processed/metadata.json",
        training_shapes=[(16, 1, 16, 512, 512)],
        steps_per_epoch=7*10000,
    )
    train_loader = torch.utils.data.DataLoader(
        dataset,
        shuffle=True,
        batch_size=1,
        num_workers=4
    )
    # model
    model = LightningModel(
        learning_rate=1e-5,
        sd_ckpt_path="models/stable_diffusion/v1-5-pruned-emaonly.safetensors",
    )
    # train
    trainer = pl.Trainer(
        max_epochs=100000,
        accelerator="gpu",
        devices="auto",
        strategy="deepspeed_stage_1",
        precision="16-mixed",
        default_root_dir="/data/zhongjie/models/train_extended_animatediff",
        accumulate_grad_batches=1,
        callbacks=[pl.pytorch.callbacks.ModelCheckpoint(save_top_k=-1)]
    )
    trainer.fit(
        model=model,
        train_dataloaders=train_loader,
        ckpt_path=None
    )
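The loss in `calculate_loss` above is the standard noise-prediction objective: the scheduler mixes clean latents with Gaussian noise as x_t = sqrt(ᾱ_t)·x₀ + sqrt(1−ᾱ_t)·ε, and the network is trained to recover ε under an MSE loss. A dependency-free sketch of that arithmetic (scalar "latents" and an illustrative ᾱ value, not the library's scheduler):

```python
import math
import random

def add_noise(x0, noise, alpha_bar_t):
    # DDPM forward process for one element: x_t = sqrt(a_bar)*x0 + sqrt(1-a_bar)*noise
    return math.sqrt(alpha_bar_t) * x0 + math.sqrt(1.0 - alpha_bar_t) * noise

def mse(pred, target):
    # Mean squared error over two equal-length lists of scalars
    return sum((p - t) ** 2 for p, t in zip(pred, target)) / len(pred)

# Illustrative values: a perfect noise predictor drives the loss to zero.
x0 = [0.2, -0.5, 1.0]
noise = [random.gauss(0.0, 1.0) for _ in x0]
alpha_bar = 0.7  # illustrative cumulative alpha at some timestep t
xt = [add_noise(a, b, alpha_bar) for a, b in zip(x0, noise)]
print(mse(noise, noise))  # prints 0.0
```

Only the motion modules receive gradients here, which is why the VAE/text-encoder/scheduler calls sit inside `torch.no_grad()` while the `lets_dance` forward pass does not.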

1311
README.md

File diff suppressed because it is too large Load Diff

1262
README_zh.md Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,7 +0,0 @@
{
"cls_token": "[CLS]",
"mask_token": "[MASK]",
"pad_token": "[PAD]",
"sep_token": "[SEP]",
"unk_token": "[UNK]"
}

View File

@@ -1,16 +0,0 @@
{
"cls_token": "[CLS]",
"do_basic_tokenize": true,
"do_lower_case": true,
"mask_token": "[MASK]",
"name_or_path": "hfl/chinese-roberta-wwm-ext",
"never_split": null,
"pad_token": "[PAD]",
"sep_token": "[SEP]",
"special_tokens_map_file": "/home/chenweifeng/.cache/huggingface/hub/models--hfl--chinese-roberta-wwm-ext/snapshots/5c58d0b8ec1d9014354d691c538661bf00bfdb44/special_tokens_map.json",
"strip_accents": null,
"tokenize_chinese_chars": true,
"tokenizer_class": "BertTokenizer",
"unk_token": "[UNK]",
"model_max_length": 77
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,28 +0,0 @@
{
"_name_or_path": "/home/patrick/t5/mt5-xl",
"architectures": [
"MT5ForConditionalGeneration"
],
"d_ff": 5120,
"d_kv": 64,
"d_model": 2048,
"decoder_start_token_id": 0,
"dropout_rate": 0.1,
"eos_token_id": 1,
"feed_forward_proj": "gated-gelu",
"initializer_factor": 1.0,
"is_encoder_decoder": true,
"layer_norm_epsilon": 1e-06,
"model_type": "mt5",
"num_decoder_layers": 24,
"num_heads": 32,
"num_layers": 24,
"output_past": true,
"pad_token_id": 0,
"relative_attention_num_buckets": 32,
"tie_word_embeddings": false,
"tokenizer_class": "T5Tokenizer",
"transformers_version": "4.10.0.dev0",
"use_cache": true,
"vocab_size": 250112
}

View File

@@ -1 +0,0 @@
{"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}

View File

@@ -1 +0,0 @@
{"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "extra_ids": 0, "additional_special_tokens": null, "special_tokens_map_file": "", "tokenizer_file": null, "name_or_path": "google/mt5-small", "model_max_length": 256, "legacy": true}

File diff suppressed because it is too large Load Diff

View File

@@ -1,24 +0,0 @@
{
"bos_token": {
"content": "<|startoftext|>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
},
"eos_token": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
},
"pad_token": "<|endoftext|>",
"unk_token": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
}
}

View File

@@ -1,34 +0,0 @@
{
"add_prefix_space": false,
"bos_token": {
"__type": "AddedToken",
"content": "<|startoftext|>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
},
"do_lower_case": true,
"eos_token": {
"__type": "AddedToken",
"content": "<|endoftext|>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
},
"errors": "replace",
"model_max_length": 77,
"name_or_path": "openai/clip-vit-large-patch14",
"pad_token": "<|endoftext|>",
"special_tokens_map_file": "./special_tokens_map.json",
"tokenizer_class": "CLIPTokenizer",
"unk_token": {
"__type": "AddedToken",
"content": "<|endoftext|>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,24 +0,0 @@
{
"bos_token": {
"content": "<|startoftext|>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
},
"eos_token": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
},
"pad_token": "!",
"unk_token": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
}
}

View File

@@ -1,38 +0,0 @@
{
"add_prefix_space": false,
"added_tokens_decoder": {
"0": {
"content": "!",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"49406": {
"content": "<|startoftext|>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false,
"special": true
},
"49407": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false,
"special": true
}
},
"bos_token": "<|startoftext|>",
"clean_up_tokenization_spaces": true,
"do_lower_case": true,
"eos_token": "<|endoftext|>",
"errors": "replace",
"model_max_length": 77,
"pad_token": "!",
"tokenizer_class": "CLIPTokenizer",
"unk_token": "<|endoftext|>"
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,6 +1 @@
from .data import *
from .core import *
from .models import *
from .prompts import *
from .schedulers import *
from .pipelines import *
from .controlnets import *

View File

@@ -0,0 +1,2 @@
from .model_configs import MODEL_CONFIGS
from .vram_management_module_maps import VRAM_MANAGEMENT_MODULE_MAPS, VERSION_CHECKER_MAPS

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,350 @@
flux_general_vram_config = {
    "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
    "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
    "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
    "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
    "torch.nn.GroupNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
    "diffsynth.models.general_modules.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
    "diffsynth.models.flux_lora_encoder.LoRALayerBlock": "diffsynth.core.vram.layers.AutoWrappedModule",
    "diffsynth.models.flux_lora_patcher.LoraMerger": "diffsynth.core.vram.layers.AutoWrappedModule",
}
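Maps like the one above key fully-qualified class paths (strings) to wrapper-class paths. One common way such dotted strings can be turned back into class objects at runtime is `importlib`; this is a minimal sketch of that pattern using a stdlib class as a stand-in, not the library's actual loader:

```python
import importlib

def resolve(path: str):
    # Resolve a dotted "package.module.Name" string to the named attribute.
    module_name, _, attr = path.rpartition(".")
    return getattr(importlib.import_module(module_name), attr)

# Stdlib example standing in for entries like "torch.nn.Linear" above.
cls = resolve("collections.OrderedDict")
print(cls.__name__)  # prints OrderedDict
```

With both sides of an entry resolved this way, each original layer class can be swapped for its `AutoWrapped*` counterpart when VRAM management is enabled.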
VRAM_MANAGEMENT_MODULE_MAPS = {
    "diffsynth.models.qwen_image_dit.QwenImageDiT": {
        "diffsynth.models.qwen_image_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
        "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
    },
    "diffsynth.models.qwen_image_text_encoder.QwenImageTextEncoder": {
        "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
        "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
        "transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLRotaryEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule",
        "transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
        "transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VisionPatchEmbed": "diffsynth.core.vram.layers.AutoWrappedModule",
        "transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VisionRotaryEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule",
    },
    "diffsynth.models.qwen_image_vae.QwenImageVAE": {
        "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
        "torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
        "diffsynth.models.qwen_image_vae.QwenImageRMS_norm": "diffsynth.core.vram.layers.AutoWrappedModule",
    },
    "diffsynth.models.qwen_image_controlnet.BlockWiseControlBlock": {
        "diffsynth.models.qwen_image_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
    },
    "diffsynth.models.siglip2_image_encoder.Siglip2ImageEncoder": {
        "transformers.models.siglip.modeling_siglip.SiglipVisionEmbeddings": "diffsynth.core.vram.layers.AutoWrappedModule",
        "transformers.models.siglip.modeling_siglip.SiglipMultiheadAttentionPoolingHead": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
    },
    "diffsynth.models.dinov3_image_encoder.DINOv3ImageEncoder": {
        "transformers.models.dinov3_vit.modeling_dinov3_vit.DINOv3ViTLayerScale": "diffsynth.core.vram.layers.AutoWrappedModule",
        "transformers.models.dinov3_vit.modeling_dinov3_vit.DINOv3ViTRopePositionEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule",
        "transformers.models.dinov3_vit.modeling_dinov3_vit.DINOv3ViTEmbeddings": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
    },
    "diffsynth.models.qwen_image_image2lora.QwenImageImage2LoRAModel": {
        "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
    },
    "diffsynth.models.wan_video_animate_adapter.WanAnimateAdapter": {
        "diffsynth.models.wan_video_animate_adapter.FaceEncoder": "diffsynth.core.vram.layers.AutoWrappedModule",
        "diffsynth.models.wan_video_animate_adapter.EqualLinear": "diffsynth.core.vram.layers.AutoWrappedModule",
        "diffsynth.models.wan_video_animate_adapter.ConvLayer": "diffsynth.core.vram.layers.AutoWrappedModule",
        "diffsynth.models.wan_video_animate_adapter.FusedLeakyReLU": "diffsynth.core.vram.layers.AutoWrappedModule",
        "diffsynth.models.wan_video_animate_adapter.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
        "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.Conv1d": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
    },
    "diffsynth.models.wan_video_dit_s2v.WanS2VModel": {
        "diffsynth.models.wan_video_dit.Head": "diffsynth.core.vram.layers.AutoWrappedModule",
        "diffsynth.models.wan_video_dit_s2v.WanS2VDiTBlock": "diffsynth.core.vram.layers.AutoWrappedModule",
        "diffsynth.models.wan_video_dit_s2v.CausalAudioEncoder": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
        "torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
        "diffsynth.models.wan_video_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
    },
    "diffsynth.models.wan_video_dit.WanModel": {
        "diffsynth.models.wan_video_dit.MLP": "diffsynth.core.vram.layers.AutoWrappedModule",
        "diffsynth.models.wan_video_dit.DiTBlock": "diffsynth.core.vram.layers.AutoWrappedNonRecurseModule",
        "diffsynth.models.wan_video_dit.Head": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
        "torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
        "diffsynth.models.wan_video_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
    },
    "diffsynth.models.wan_video_image_encoder.WanImageEncoder": {
        "diffsynth.models.wan_video_image_encoder.VisionTransformer": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
        "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
    },
    "diffsynth.models.wan_video_mot.MotWanModel": {
        "diffsynth.models.wan_video_mot.MotWanAttentionBlock": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
        "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
    },
    "diffsynth.models.wan_video_motion_controller.WanMotionControllerModel": {
        "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
    },
    "diffsynth.models.wan_video_text_encoder.WanTextEncoder": {
        "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
        "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
        "diffsynth.models.wan_video_text_encoder.T5RelativeEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule",
        "diffsynth.models.wan_video_text_encoder.T5LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
    },
    "diffsynth.models.wan_video_vace.VaceWanModel": {
        "diffsynth.models.wan_video_dit.DiTBlock": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
        "torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
        "diffsynth.models.wan_video_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
    },
    "diffsynth.models.wan_video_vae.WanVideoVAE": {
        "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
        "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
        "diffsynth.models.wan_video_vae.RMS_norm": "diffsynth.core.vram.layers.AutoWrappedModule",
        "diffsynth.models.wan_video_vae.CausalConv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
        "diffsynth.models.wan_video_vae.Upsample": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.SiLU": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.Dropout": "diffsynth.core.vram.layers.AutoWrappedModule",
    },
    "diffsynth.models.wan_video_vae.WanVideoVAE38": {
        "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
        "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
        "diffsynth.models.wan_video_vae.RMS_norm": "diffsynth.core.vram.layers.AutoWrappedModule",
        "diffsynth.models.wan_video_vae.CausalConv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
        "diffsynth.models.wan_video_vae.Upsample": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.SiLU": "diffsynth.core.vram.layers.AutoWrappedModule",
        "torch.nn.Dropout": "diffsynth.core.vram.layers.AutoWrappedModule",
    },
    "diffsynth.models.wav2vec.WanS2VAudioEncoder": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Conv1d": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.longcat_video_dit.LongCatVideoTransformer3DModel": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.longcat_video_dit.RMSNorm_FP32": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.longcat_video_dit.LayerNorm_FP32": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.flux_dit.FluxDiT": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"diffsynth.models.flux_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.flux_text_encoder_clip.FluxTextEncoderClip": flux_general_vram_config,
"diffsynth.models.flux_vae.FluxVAEEncoder": flux_general_vram_config,
"diffsynth.models.flux_vae.FluxVAEDecoder": flux_general_vram_config,
"diffsynth.models.flux_controlnet.FluxControlNet": flux_general_vram_config,
"diffsynth.models.flux_infiniteyou.InfiniteYouImageProjector": flux_general_vram_config,
"diffsynth.models.flux_ipadapter.FluxIpAdapter": flux_general_vram_config,
"diffsynth.models.flux_lora_patcher.FluxLoraPatcher": flux_general_vram_config,
"diffsynth.models.step1x_connector.Qwen2Connector": flux_general_vram_config,
"diffsynth.models.flux_lora_encoder.FluxLoRAEncoder": flux_general_vram_config,
"diffsynth.models.flux_text_encoder_t5.FluxTextEncoderT5": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.t5.modeling_t5.T5LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.t5.modeling_t5.T5DenseActDense": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.t5.modeling_t5.T5DenseGatedActDense": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.flux_ipadapter.SiglipVisionModelSO400M": {
"transformers.models.siglip.modeling_siglip.SiglipVisionEmbeddings": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.siglip.modeling_siglip.SiglipEncoder": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.siglip.modeling_siglip.SiglipMultiheadAttentionPoolingHead": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.MultiheadAttention": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.flux2_dit.Flux2DiT": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.flux2_text_encoder.Flux2TextEncoder": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.mistral.modeling_mistral.MistralRMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.flux2_vae.Flux2VAE": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.GroupNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.z_image_text_encoder.ZImageTextEncoder": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"transformers.models.qwen3.modeling_qwen3.Qwen3RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.z_image_dit.ZImageDiT": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"diffsynth.models.z_image_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.z_image_controlnet.ZImageControlNet": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"diffsynth.models.z_image_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.z_image_image2lora.ZImageImage2LoRAModel": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
},
"diffsynth.models.siglip2_image_encoder.Siglip2ImageEncoder428M": {
"transformers.models.siglip2.modeling_siglip2.Siglip2VisionEmbeddings": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.siglip2.modeling_siglip2.Siglip2MultiheadAttentionPoolingHead": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
},
"diffsynth.models.ltx2_dit.LTXModel": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.ltx2_upsampler.LTX2LatentUpsampler": {
"torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.GroupNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.ltx2_video_vae.LTX2VideoEncoder": {
"torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.ltx2_video_vae.LTX2VideoDecoder": {
"torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.ltx2_audio_vae.LTX2AudioDecoder": {
"torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.ltx2_audio_vae.LTX2Vocoder": {
"torch.nn.Conv1d": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.ConvTranspose1d": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.ltx2_text_encoder.LTX2TextEncoderPostModules": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.ltx2_text_encoder.Embeddings1DConnector": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.ltx2_text_encoder.LTX2TextEncoder": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"transformers.models.gemma3.modeling_gemma3.Gemma3MultiModalProjector": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.gemma3.modeling_gemma3.Gemma3RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.gemma3.modeling_gemma3.Gemma3TextScaledWordEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.anima_dit.AnimaDiT": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.mova_audio_dit.MovaAudioDit": {
"diffsynth.models.wan_video_dit.DiTBlock": "diffsynth.core.vram.layers.AutoWrappedNonRecurseModule",
"diffsynth.models.wan_video_dit.Head": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.Conv1d": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.wan_video_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.mova_dual_tower_bridge.DualTowerConditionalBridge": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.wan_video_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.mova_audio_vae.DacVAE": {
"diffsynth.models.mova_audio_vae.Snake1d": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Conv1d": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.ConvTranspose1d": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.ernie_image_dit.ErnieImageDiT": {
"diffsynth.models.ernie_image_dit.ErnieImageRMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.ernie_image_text_encoder.ErnieImageTextEncoder": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.ministral3.modeling_ministral3.Ministral3RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.joyai_image_dit.Transformer3DModel": {
"diffsynth.models.joyai_image_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.joyai_image_dit.ModulateWan": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.joyai_image_text_encoder.JoyAIImageTextEncoder": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.qwen3_vl.modeling_qwen3_vl.Qwen3VLVisionModel": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.qwen3_vl.modeling_qwen3_vl.Qwen3VLTextRMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.qwen3_vl.modeling_qwen3_vl.Qwen3VLTextRotaryEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule",
},
# ACE-Step module maps
"diffsynth.models.ace_step_dit.AceStepDiTModel": {
"diffsynth.models.ace_step_dit.AceStepDiTLayer": "diffsynth.core.vram.layers.AutoWrappedNonRecurseModule",
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.Conv1d": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.ConvTranspose1d": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.qwen3.modeling_qwen3.Qwen3RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.qwen3.modeling_qwen3.Qwen3MLP": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.qwen3.modeling_qwen3.Qwen3RotaryEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.ace_step_conditioner.AceStepConditionEncoder": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.qwen3.modeling_qwen3.Qwen3RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.qwen3.modeling_qwen3.Qwen3MLP": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.qwen3.modeling_qwen3.Qwen3RotaryEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.ace_step_text_encoder.AceStepTextEncoder": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.qwen3.modeling_qwen3.Qwen3RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.qwen3.modeling_qwen3.Qwen3MLP": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.qwen3.modeling_qwen3.Qwen3RotaryEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.ace_step_vae.AceStepVAE": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.Conv1d": "diffsynth.core.vram.layers.AutoWrappedModule",
"torch.nn.ConvTranspose1d": "diffsynth.core.vram.layers.AutoWrappedModule",
"diffsynth.models.ace_step_vae.Snake1d": "diffsynth.core.vram.layers.AutoWrappedModule",
},
"diffsynth.models.ace_step_tokenizer.AceStepTokenizer": {
"torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
"torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
"vector_quantize_pytorch.ResidualFSQ": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.qwen3.modeling_qwen3.Qwen3RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.qwen3.modeling_qwen3.Qwen3MLP": "diffsynth.core.vram.layers.AutoWrappedModule",
"transformers.models.qwen3.modeling_qwen3.Qwen3RotaryEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule",
},
}
def QwenImageTextEncoder_Module_Map_Updater():
current = VRAM_MANAGEMENT_MODULE_MAPS["diffsynth.models.qwen_image_text_encoder.QwenImageTextEncoder"]
from packaging import version
import transformers
if version.parse(transformers.__version__) >= version.parse("5.2.0"):
# The Qwen2RMSNorm in transformers 5.2.0+ has been renamed to Qwen2_5_VLRMSNorm, so we need to update the module map accordingly
current.pop("transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2RMSNorm", None)
current["transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLRMSNorm"] = "diffsynth.core.vram.layers.AutoWrappedModule"
return current
VERSION_CHECKER_MAPS = {
"diffsynth.models.qwen_image_text_encoder.QwenImageTextEncoder": QwenImageTextEncoder_Module_Map_Updater,
}
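The updater above gates a module-map rewrite on the installed `transformers` version. The same pattern can be sketched generically; the map and key names below are illustrative stand-ins, not DiffSynth-Studio identifiers, and only the `packaging` library is assumed:

```python
# Version-gated key rename: replace old_key with new_key once the installed
# version of a dependency reaches the release where the rename happened.
from packaging import version

def update_map_for_version(module_map, installed, renamed_at, old_key, new_key, value):
    if version.parse(installed) >= version.parse(renamed_at):
        module_map.pop(old_key, None)  # tolerate maps that never had the old key
        module_map[new_key] = value
    return module_map

fake_map = {"pkg.OldNorm": "AutoWrappedModule"}
update_map_for_version(fake_map, "5.2.0", "5.2.0", "pkg.OldNorm", "pkg.NewNorm", "AutoWrappedModule")
```

Registering such updaters in a dict keyed by model class (as `VERSION_CHECKER_MAPS` does) lets the loader resolve the correct map lazily, at load time rather than import time.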


@@ -1,2 +0,0 @@
from .controlnet_unit import ControlNetConfigUnit, ControlNetUnit, MultiControlNetManager
from .processors import Annotator


@@ -1,53 +0,0 @@
import torch
import numpy as np
from .processors import Processor_id
class ControlNetConfigUnit:
    def __init__(self, processor_id: Processor_id, model_path, scale=1.0):
        self.processor_id = processor_id
        self.model_path = model_path
        self.scale = scale


class ControlNetUnit:
    def __init__(self, processor, model, scale=1.0):
        self.processor = processor
        self.model = model
        self.scale = scale


class MultiControlNetManager:
    def __init__(self, controlnet_units=None):
        # Avoid a mutable default argument; fall back to an empty list.
        controlnet_units = [] if controlnet_units is None else controlnet_units
        self.processors = [unit.processor for unit in controlnet_units]
        self.models = [unit.model for unit in controlnet_units]
        self.scales = [unit.scale for unit in controlnet_units]

    def process_image(self, image, processor_id=None):
        if processor_id is None:
            processed_image = [processor(image) for processor in self.processors]
        else:
            processed_image = [self.processors[processor_id](image)]
        processed_image = torch.concat([
            torch.Tensor(np.array(image_, dtype=np.float32) / 255).permute(2, 0, 1).unsqueeze(0)
            for image_ in processed_image
        ], dim=0)
        return processed_image

    def __call__(
        self,
        sample, timestep, encoder_hidden_states, conditionings,
        tiled=False, tile_size=64, tile_stride=32
    ):
        res_stack = None
        for conditioning, model, scale in zip(conditionings, self.models, self.scales):
            res_stack_ = model(
                sample, timestep, encoder_hidden_states, conditioning,
                tiled=tiled, tile_size=tile_size, tile_stride=tile_stride
            )
            res_stack_ = [res * scale for res in res_stack_]
            if res_stack is None:
                res_stack = res_stack_
            else:
                res_stack = [i + j for i, j in zip(res_stack, res_stack_)]
        return res_stack
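`MultiControlNetManager.__call__` merges the per-ControlNet residual stacks by scaling each model's list of residuals and summing the lists element-wise. A minimal sketch of just that merge, with plain floats standing in for residual tensors:

```python
# Each model contributes a list of residuals; each list is scaled by its
# unit's scale, then the lists are summed position by position.
def merge_residuals(model_outputs, scales):
    res_stack = None
    for res_stack_, scale in zip(model_outputs, scales):
        res_stack_ = [res * scale for res in res_stack_]
        res_stack = res_stack_ if res_stack is None else [i + j for i, j in zip(res_stack, res_stack_)]
    return res_stack

merged = merge_residuals([[1.0, 2.0], [10.0, 20.0]], scales=[1.0, 0.5])
# merged == [6.0, 12.0]
```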


@@ -1,51 +0,0 @@
from typing_extensions import Literal, TypeAlias
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    from controlnet_aux.processor import (
        CannyDetector, MidasDetector, HEDdetector, LineartDetector, LineartAnimeDetector, OpenposeDetector
    )

Processor_id: TypeAlias = Literal[
    "canny", "depth", "softedge", "lineart", "lineart_anime", "openpose", "tile"
]


class Annotator:
    def __init__(self, processor_id: Processor_id, model_path="models/Annotators", detect_resolution=None):
        if processor_id == "canny":
            self.processor = CannyDetector()
        elif processor_id == "depth":
            self.processor = MidasDetector.from_pretrained(model_path).to("cuda")
        elif processor_id == "softedge":
            self.processor = HEDdetector.from_pretrained(model_path).to("cuda")
        elif processor_id == "lineart":
            self.processor = LineartDetector.from_pretrained(model_path).to("cuda")
        elif processor_id == "lineart_anime":
            self.processor = LineartAnimeDetector.from_pretrained(model_path).to("cuda")
        elif processor_id == "openpose":
            self.processor = OpenposeDetector.from_pretrained(model_path).to("cuda")
        elif processor_id == "tile":
            self.processor = None
        else:
            raise ValueError(f"Unsupported processor_id: {processor_id}")
        self.processor_id = processor_id
        self.detect_resolution = detect_resolution

    def __call__(self, image):
        width, height = image.size
        if self.processor_id == "openpose":
            kwargs = {
                "include_body": True,
                "include_hand": True,
                "include_face": True
            }
        else:
            kwargs = {}
        if self.processor is not None:
            detect_resolution = self.detect_resolution if self.detect_resolution is not None else min(width, height)
            image = self.processor(image, detect_resolution=detect_resolution, image_resolution=min(width, height), **kwargs)
            image = image.resize((width, height))
        return image


@@ -0,0 +1,6 @@
from .attention import *
from .data import *
from .gradient import *
from .loader import *
from .vram import *
from .device import *


@@ -0,0 +1 @@
from .attention import attention_forward


@@ -0,0 +1,121 @@
import torch, os
from einops import rearrange
try:
    import flash_attn_interface
    FLASH_ATTN_3_AVAILABLE = True
except ModuleNotFoundError:
    FLASH_ATTN_3_AVAILABLE = False

try:
    import flash_attn
    FLASH_ATTN_2_AVAILABLE = True
except ModuleNotFoundError:
    FLASH_ATTN_2_AVAILABLE = False

try:
    from sageattention import sageattn
    SAGE_ATTN_AVAILABLE = True
except ModuleNotFoundError:
    SAGE_ATTN_AVAILABLE = False

try:
    import xformers.ops as xops
    XFORMERS_AVAILABLE = True
except ModuleNotFoundError:
    XFORMERS_AVAILABLE = False


def initialize_attention_priority():
    if os.environ.get('DIFFSYNTH_ATTENTION_IMPLEMENTATION') is not None:
        return os.environ.get('DIFFSYNTH_ATTENTION_IMPLEMENTATION').lower()
    elif FLASH_ATTN_3_AVAILABLE:
        return "flash_attention_3"
    elif FLASH_ATTN_2_AVAILABLE:
        return "flash_attention_2"
    elif SAGE_ATTN_AVAILABLE:
        return "sage_attention"
    elif XFORMERS_AVAILABLE:
        return "xformers"
    else:
        return "torch"


ATTENTION_IMPLEMENTATION = initialize_attention_priority()


def rearrange_qkv(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", required_in_pattern="b n s d", dims=None):
    dims = {} if dims is None else dims
    if q_pattern != required_in_pattern:
        q = rearrange(q, f"{q_pattern} -> {required_in_pattern}", **dims)
    if k_pattern != required_in_pattern:
        k = rearrange(k, f"{k_pattern} -> {required_in_pattern}", **dims)
    if v_pattern != required_in_pattern:
        v = rearrange(v, f"{v_pattern} -> {required_in_pattern}", **dims)
    return q, k, v


def rearrange_out(out: torch.Tensor, out_pattern="b n s d", required_out_pattern="b n s d", dims=None):
    dims = {} if dims is None else dims
    if out_pattern != required_out_pattern:
        out = rearrange(out, f"{required_out_pattern} -> {out_pattern}", **dims)
    return out


def torch_sdpa(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", out_pattern="b n s d", dims=None, attn_mask=None, scale=None):
    required_in_pattern, required_out_pattern = "b n s d", "b n s d"
    q, k, v = rearrange_qkv(q, k, v, q_pattern, k_pattern, v_pattern, required_in_pattern, dims)
    out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask, scale=scale)
    out = rearrange_out(out, out_pattern, required_out_pattern, dims)
    return out


def flash_attention_3(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", out_pattern="b n s d", dims=None, scale=None):
    required_in_pattern, required_out_pattern = "b s n d", "b s n d"
    q, k, v = rearrange_qkv(q, k, v, q_pattern, k_pattern, v_pattern, required_in_pattern, dims)
    out = flash_attn_interface.flash_attn_func(q, k, v, softmax_scale=scale)
    if isinstance(out, tuple):
        out = out[0]
    out = rearrange_out(out, out_pattern, required_out_pattern, dims)
    return out


def flash_attention_2(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", out_pattern="b n s d", dims=None, scale=None):
    required_in_pattern, required_out_pattern = "b s n d", "b s n d"
    q, k, v = rearrange_qkv(q, k, v, q_pattern, k_pattern, v_pattern, required_in_pattern, dims)
    out = flash_attn.flash_attn_func(q, k, v, softmax_scale=scale)
    out = rearrange_out(out, out_pattern, required_out_pattern, dims)
    return out


def sage_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", out_pattern="b n s d", dims=None, scale=None):
    required_in_pattern, required_out_pattern = "b n s d", "b n s d"
    q, k, v = rearrange_qkv(q, k, v, q_pattern, k_pattern, v_pattern, required_in_pattern, dims)
    out = sageattn(q, k, v, sm_scale=scale)
    out = rearrange_out(out, out_pattern, required_out_pattern, dims)
    return out


def xformers_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", out_pattern="b n s d", dims=None, scale=None):
    required_in_pattern, required_out_pattern = "b s n d", "b s n d"
    q, k, v = rearrange_qkv(q, k, v, q_pattern, k_pattern, v_pattern, required_in_pattern, dims)
    out = xops.memory_efficient_attention(q, k, v, scale=scale)
    out = rearrange_out(out, out_pattern, required_out_pattern, dims)
    return out


def attention_forward(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", out_pattern="b n s d", dims=None, attn_mask=None, scale=None, compatibility_mode=False):
    if compatibility_mode or (attn_mask is not None):
        return torch_sdpa(q, k, v, q_pattern, k_pattern, v_pattern, out_pattern, dims, attn_mask=attn_mask, scale=scale)
    else:
        if ATTENTION_IMPLEMENTATION == "flash_attention_3":
            return flash_attention_3(q, k, v, q_pattern, k_pattern, v_pattern, out_pattern, dims, scale=scale)
        elif ATTENTION_IMPLEMENTATION == "flash_attention_2":
            return flash_attention_2(q, k, v, q_pattern, k_pattern, v_pattern, out_pattern, dims, scale=scale)
        elif ATTENTION_IMPLEMENTATION == "sage_attention":
            return sage_attention(q, k, v, q_pattern, k_pattern, v_pattern, out_pattern, dims, scale=scale)
        elif ATTENTION_IMPLEMENTATION == "xformers":
            return xformers_attention(q, k, v, q_pattern, k_pattern, v_pattern, out_pattern, dims, scale=scale)
        else:
            return torch_sdpa(q, k, v, q_pattern, k_pattern, v_pattern, out_pattern, dims, scale=scale)
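All backends above share one contract: accept q/k/v in a caller-declared layout, normalize to the layout the kernel requires, and restore the caller's layout on output. A sketch of that normalization for the torch SDPA path, using plain transposes instead of einops (the `b s n d` layout used here is the one the flash-attention paths require, and this example assumes only that torch is installed):

```python
import torch

def sdpa_from_bsnd(q, k, v):
    # incoming layout: (batch, seq, heads, dim); SDPA wants (batch, heads, seq, dim)
    q, k, v = (x.transpose(1, 2) for x in (q, k, v))
    out = torch.nn.functional.scaled_dot_product_attention(q, k, v)
    return out.transpose(1, 2)  # restore (batch, seq, heads, dim) for the caller

q = k = v = torch.randn(2, 8, 4, 16)  # b=2, s=8, n=4, d=16
out = sdpa_from_bsnd(q, k, v)
```

This is why `rearrange_qkv`/`rearrange_out` take both the caller's pattern and the backend's required pattern: the dispatch can switch kernels without the caller changing its tensor layout.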


@@ -0,0 +1 @@
from .unified_dataset import UnifiedDataset


@@ -0,0 +1,303 @@
import math, warnings
import torch, torchvision, imageio, os
import imageio.v3 as iio
from PIL import Image
import torchaudio
from diffsynth.utils.data.audio import read_audio
class DataProcessingPipeline:
    def __init__(self, operators=None):
        self.operators: list[DataProcessingOperator] = [] if operators is None else operators

    def __call__(self, data):
        for operator in self.operators:
            data = operator(data)
        return data

    def __rshift__(self, pipe):
        if isinstance(pipe, DataProcessingOperator):
            pipe = DataProcessingPipeline([pipe])
        return DataProcessingPipeline(self.operators + pipe.operators)


class DataProcessingOperator:
    def __call__(self, data):
        raise NotImplementedError("DataProcessingOperator cannot be called directly.")

    def __rshift__(self, pipe):
        if isinstance(pipe, DataProcessingOperator):
            pipe = DataProcessingPipeline([pipe])
        return DataProcessingPipeline([self]).__rshift__(pipe)


class DataProcessingOperatorRaw(DataProcessingOperator):
    def __call__(self, data):
        return data


class ToInt(DataProcessingOperator):
    def __call__(self, data):
        return int(data)


class ToFloat(DataProcessingOperator):
    def __call__(self, data):
        return float(data)


class ToStr(DataProcessingOperator):
    def __init__(self, none_value=""):
        self.none_value = none_value

    def __call__(self, data):
        if data is None:
            data = self.none_value
        return str(data)
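The `__rshift__` overloads above let operators compose with `>>` into pipelines that run left to right. A simplified stand-in (not the DiffSynth classes themselves) showing the composition order:

```python
# Minimal operator type: wraps a function and composes with ">>" so that
# (a >> b)(x) == b(a(x)).
class Op:
    def __init__(self, fn):
        self.fn = fn

    def __call__(self, data):
        return self.fn(data)

    def __rshift__(self, other):
        return Op(lambda data: other(self(data)))

to_int = Op(int)
double = Op(lambda x: x * 2)
pipeline = to_int >> double
result = pipeline("21")  # -> 42
```

The real classes keep an explicit operator list instead of nested closures, which lets a pipeline be inspected and extended, but the evaluation order is the same.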
class LoadImage(DataProcessingOperator):
    def __init__(self, convert_RGB=True, convert_RGBA=False):
        self.convert_RGB = convert_RGB
        self.convert_RGBA = convert_RGBA

    def __call__(self, data: str):
        image = Image.open(data)
        if self.convert_RGB:
            image = image.convert("RGB")
        if self.convert_RGBA:
            image = image.convert("RGBA")
        return image


class ImageCropAndResize(DataProcessingOperator):
    def __init__(self, height=None, width=None, max_pixels=None, height_division_factor=1, width_division_factor=1):
        self.height = height
        self.width = width
        self.max_pixels = max_pixels
        self.height_division_factor = height_division_factor
        self.width_division_factor = width_division_factor

    def crop_and_resize(self, image, target_height, target_width):
        width, height = image.size
        scale = max(target_width / width, target_height / height)
        image = torchvision.transforms.functional.resize(
            image,
            (round(height * scale), round(width * scale)),
            interpolation=torchvision.transforms.InterpolationMode.BILINEAR
        )
        image = torchvision.transforms.functional.center_crop(image, (target_height, target_width))
        return image

    def get_height_width(self, image):
        if self.height is None or self.width is None:
            width, height = image.size
            if width * height > self.max_pixels:
                scale = (width * height / self.max_pixels) ** 0.5
                height, width = int(height / scale), int(width / scale)
            height = height // self.height_division_factor * self.height_division_factor
            width = width // self.width_division_factor * self.width_division_factor
        else:
            height, width = self.height, self.width
        return height, width

    def __call__(self, data: Image.Image):
        image = self.crop_and_resize(data, *self.get_height_width(data))
        return image
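The `max_pixels` branch in `get_height_width` downscales the image so its area fits under the budget, then snaps both sides down to multiples of the division factors (so latent shapes stay valid). The arithmetic in isolation, as a pure function:

```python
# Area-budgeted resize targets: scale so width*height <= max_pixels, then
# round each side down to its division factor.
def fit_to_max_pixels(width, height, max_pixels, h_div=16, w_div=16):
    if width * height > max_pixels:
        scale = (width * height / max_pixels) ** 0.5
        height, width = int(height / scale), int(width / scale)
    height = height // h_div * h_div
    width = width // w_div * w_div
    return height, width

print(fit_to_max_pixels(1920, 1080, 1024 * 1024))  # -> (768, 1360)
```

Note that the snapping step runs even when no downscale was needed, so odd input sizes are still coerced to valid multiples.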
class ToList(DataProcessingOperator):
    def __call__(self, data):
        return [data]


class FrameSamplerByRateMixin:
    def __init__(self, num_frames=81, time_division_factor=4, time_division_remainder=1, frame_rate=24, fix_frame_rate=False):
        self.num_frames = num_frames
        self.time_division_factor = time_division_factor
        self.time_division_remainder = time_division_remainder
        self.frame_rate = frame_rate
        self.fix_frame_rate = fix_frame_rate

    def get_reader(self, data: str):
        return imageio.get_reader(data)

    def get_available_num_frames(self, reader):
        if not self.fix_frame_rate:
            return reader.count_frames()
        meta_data = reader.get_meta_data()
        total_original_frames = int(reader.count_frames())
        duration = meta_data["duration"] if "duration" in meta_data else total_original_frames / meta_data['fps']
        total_available_frames = math.floor(duration * self.frame_rate)
        return int(total_available_frames)

    def get_num_frames(self, reader):
        num_frames = self.num_frames
        total_frames = self.get_available_num_frames(reader)
        if int(total_frames) < num_frames:
            num_frames = total_frames
        while num_frames > 1 and num_frames % self.time_division_factor != self.time_division_remainder:
            num_frames -= 1
        return num_frames

    def map_single_frame_id(self, new_sequence_id: int, raw_frame_rate: float, total_raw_frames: int) -> int:
        if not self.fix_frame_rate:
            return new_sequence_id
        target_time_in_seconds = new_sequence_id / self.frame_rate
        raw_frame_index_float = target_time_in_seconds * raw_frame_rate
        frame_id = int(round(raw_frame_index_float))
        frame_id = min(frame_id, total_raw_frames - 1)
        return frame_id
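The mixin encodes two rules: trim the sampled frame count until it satisfies `count % factor == remainder` (a constraint the temporal VAE imposes), and, when resampling to a fixed frame rate, map each output index to the nearest raw frame at the source fps. Both as pure functions:

```python
# Frame-count trimming: shrink toward the nearest count that satisfies the
# divisibility constraint, never going below 1.
def trim_num_frames(requested, available, factor=4, remainder=1):
    n = min(requested, available)
    while n > 1 and n % factor != remainder:
        n -= 1
    return n

# Index mapping for fixed-rate resampling: output index i corresponds to
# time i/target_fps, which lands on raw frame round(time * raw_fps).
def map_frame_id(new_id, target_fps, raw_fps, total_raw_frames):
    raw_index = round(new_id / target_fps * raw_fps)
    return min(raw_index, total_raw_frames - 1)

print(trim_num_frames(81, 50))         # -> 49, since 49 % 4 == 1
print(map_frame_id(24, 24, 30, 1000))  # -> 30, one second into a 30 fps clip
```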
class LoadVideo(DataProcessingOperator, FrameSamplerByRateMixin):
def __init__(self, num_frames=81, time_division_factor=4, time_division_remainder=1, frame_processor=lambda x: x, frame_rate=24, fix_frame_rate=False):
FrameSamplerByRateMixin.__init__(self, num_frames, time_division_factor, time_division_remainder, frame_rate, fix_frame_rate)
# frame_processor is build in the video loader for high efficiency.
self.frame_processor = frame_processor
def __call__(self, data: str):
reader = self.get_reader(data)
raw_frame_rate = reader.get_meta_data()['fps']
num_frames = self.get_num_frames(reader)
total_raw_frames = reader.count_frames()
frames = []
for frame_id in range(num_frames):
frame_id = self.map_single_frame_id(frame_id, raw_frame_rate, total_raw_frames)
frame = reader.get_data(frame_id)
frame = Image.fromarray(frame)
frame = self.frame_processor(frame)
frames.append(frame)
reader.close()
return frames
class SequencialProcess(DataProcessingOperator):
def __init__(self, operator=lambda x: x):
self.operator = operator
def __call__(self, data):
return [self.operator(i) for i in data]
class LoadGIF(DataProcessingOperator):
def __init__(self, num_frames=81, time_division_factor=4, time_division_remainder=1, frame_processor=lambda x: x):
self.num_frames = num_frames
self.time_division_factor = time_division_factor
self.time_division_remainder = time_division_remainder
# frame_processor is build in the video loader for high efficiency.
self.frame_processor = frame_processor
def get_num_frames(self, path):
num_frames = self.num_frames
images = iio.imread(path, mode="RGB")
if len(images) < num_frames:
num_frames = len(images)
while num_frames > 1 and num_frames % self.time_division_factor != self.time_division_remainder:
num_frames -= 1
return num_frames
def __call__(self, data: str):
num_frames = self.get_num_frames(data)
frames = []
images = iio.imread(data, mode="RGB")
for img in images:
frame = Image.fromarray(img)
frame = self.frame_processor(frame)
frames.append(frame)
if len(frames) >= num_frames:
break
return frames
class RouteByExtensionName(DataProcessingOperator):
def __init__(self, operator_map):
self.operator_map = operator_map
def __call__(self, data: str):
file_ext_name = data.split(".")[-1].lower()
for ext_names, operator in self.operator_map:
if ext_names is None or file_ext_name in ext_names:
return operator(data)
raise ValueError(f"Unsupported file: {data}")
class RouteByType(DataProcessingOperator):
def __init__(self, operator_map):
self.operator_map = operator_map
def __call__(self, data):
for dtype, operator in self.operator_map:
if dtype is None or isinstance(data, dtype):
return operator(data)
raise ValueError(f"Unsupported data: {data}")
class LoadTorchPickle(DataProcessingOperator):
def __init__(self, map_location="cpu"):
self.map_location = map_location
def __call__(self, data):
return torch.load(data, map_location=self.map_location, weights_only=False)
class ToAbsolutePath(DataProcessingOperator):
def __init__(self, base_path=""):
self.base_path = base_path
def __call__(self, data):
return os.path.join(self.base_path, data)
class LoadAudio(DataProcessingOperator):
def __init__(self, sr=16000):
self.sr = sr
def __call__(self, data: str):
import librosa
input_audio, sample_rate = librosa.load(data, sr=self.sr)
return input_audio
class LoadAudioWithTorchaudio(DataProcessingOperator, FrameSamplerByRateMixin):
def __init__(self, num_frames=121, time_division_factor=8, time_division_remainder=1, frame_rate=24, fix_frame_rate=True):
FrameSamplerByRateMixin.__init__(self, num_frames, time_division_factor, time_division_remainder, frame_rate, fix_frame_rate)
def __call__(self, data: str):
try:
reader = self.get_reader(data)
num_frames = self.get_num_frames(reader)
duration = num_frames / self.frame_rate
waveform, sample_rate = torchaudio.load(data)
target_samples = int(duration * sample_rate)
current_samples = waveform.shape[-1]
if current_samples > target_samples:
waveform = waveform[..., :target_samples]
elif current_samples < target_samples:
padding = target_samples - current_samples
waveform = torch.nn.functional.pad(waveform, (0, padding))
return waveform, sample_rate
except Exception as e:
warnings.warn(f"Cannot load audio in '{data}' due to '{e}'. The audio will be `None`.")
return None
class LoadPureAudioWithTorchaudio(DataProcessingOperator):
def __init__(self, target_sample_rate=None, target_duration=None):
self.target_sample_rate = target_sample_rate
self.target_duration = target_duration
self.resample = target_sample_rate is not None
def __call__(self, data: str):
try:
waveform, sample_rate = read_audio(data, resample=self.resample, resample_rate=self.target_sample_rate)
if self.target_duration is not None:
target_samples = int(self.target_duration * sample_rate)
current_samples = waveform.shape[-1]
if current_samples > target_samples:
waveform = waveform[..., :target_samples]
elif current_samples < target_samples:
padding = target_samples - current_samples
waveform = torch.nn.functional.pad(waveform, (0, padding))
return waveform, sample_rate
except Exception as e:
warnings.warn(f"Cannot load audio in '{data}' due to '{e}'. The audio will be `None`.")
return None
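Both torchaudio loaders truncate or zero-pad the waveform so its length matches a target duration. The same arithmetic on a plain Python list (a minimal sketch without torchaudio; `fit_to_duration` is an illustrative name):

```python
def fit_to_duration(samples, sample_rate, target_duration):
    # Target length in samples for the requested duration.
    target = int(target_duration * sample_rate)
    if len(samples) > target:
        return samples[:target]                        # truncate
    return samples + [0.0] * (target - len(samples))   # zero-pad

clip = [0.1, 0.2, 0.3, 0.4]
assert fit_to_duration(clip, sample_rate=2, target_duration=1.0) == [0.1, 0.2]
assert fit_to_duration(clip, sample_rate=2, target_duration=3.0) == [0.1, 0.2, 0.3, 0.4, 0.0, 0.0]
```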


@@ -0,0 +1,118 @@
from .operators import *
import torch, json, pandas
class UnifiedDataset(torch.utils.data.Dataset):
def __init__(
self,
base_path=None, metadata_path=None,
repeat=1,
data_file_keys=tuple(),
main_data_operator=lambda x: x,
special_operator_map=None,
max_data_items=None,
):
self.base_path = base_path
self.metadata_path = metadata_path
self.repeat = repeat
self.data_file_keys = data_file_keys
self.main_data_operator = main_data_operator
self.cached_data_operator = LoadTorchPickle()
self.special_operator_map = {} if special_operator_map is None else special_operator_map
self.max_data_items = max_data_items
self.data = []
self.cached_data = []
self.load_from_cache = metadata_path is None
self.load_metadata(metadata_path)
@staticmethod
def default_image_operator(
base_path="",
max_pixels=1920*1080, height=None, width=None,
height_division_factor=16, width_division_factor=16,
):
return RouteByType(operator_map=[
(str, ToAbsolutePath(base_path) >> LoadImage() >> ImageCropAndResize(height, width, max_pixels, height_division_factor, width_division_factor)),
(list, SequencialProcess(ToAbsolutePath(base_path) >> LoadImage() >> ImageCropAndResize(height, width, max_pixels, height_division_factor, width_division_factor))),
])
@staticmethod
def default_video_operator(
base_path="",
max_pixels=1920*1080, height=None, width=None,
height_division_factor=16, width_division_factor=16,
num_frames=81, time_division_factor=4, time_division_remainder=1,
frame_rate=24, fix_frame_rate=False,
):
return RouteByType(operator_map=[
(str, ToAbsolutePath(base_path) >> RouteByExtensionName(operator_map=[
(("jpg", "jpeg", "png", "webp"), LoadImage() >> ImageCropAndResize(height, width, max_pixels, height_division_factor, width_division_factor) >> ToList()),
(("gif",), LoadGIF(
num_frames, time_division_factor, time_division_remainder,
frame_processor=ImageCropAndResize(height, width, max_pixels, height_division_factor, width_division_factor),
)),
(("mp4", "avi", "mov", "wmv", "mkv", "flv", "webm"), LoadVideo(
num_frames, time_division_factor, time_division_remainder,
frame_processor=ImageCropAndResize(height, width, max_pixels, height_division_factor, width_division_factor),
frame_rate=frame_rate, fix_frame_rate=fix_frame_rate,
)),
])),
])
def search_for_cached_data_files(self, path):
for file_name in os.listdir(path):
subpath = os.path.join(path, file_name)
if os.path.isdir(subpath):
self.search_for_cached_data_files(subpath)
elif subpath.endswith(".pth"):
self.cached_data.append(subpath)
def load_metadata(self, metadata_path):
if metadata_path is None:
print("No metadata_path. Searching for cached data files.")
self.search_for_cached_data_files(self.base_path)
print(f"{len(self.cached_data)} cached data files found.")
elif metadata_path.endswith(".json"):
with open(metadata_path, "r") as f:
metadata = json.load(f)
self.data = metadata
elif metadata_path.endswith(".jsonl"):
metadata = []
with open(metadata_path, 'r') as f:
for line in f:
metadata.append(json.loads(line.strip()))
self.data = metadata
else:
metadata = pandas.read_csv(metadata_path)
self.data = [metadata.iloc[i].to_dict() for i in range(len(metadata))]
def __getitem__(self, data_id):
if self.load_from_cache:
data = self.cached_data[data_id % len(self.cached_data)]
data = self.cached_data_operator(data)
else:
data = self.data[data_id % len(self.data)].copy()
for key in self.data_file_keys:
if key in data:
if key in self.special_operator_map:
data[key] = self.special_operator_map[key](data[key])
else:
data[key] = self.main_data_operator(data[key])
return data
def __len__(self):
if self.max_data_items is not None:
return self.max_data_items
elif self.load_from_cache:
return len(self.cached_data) * self.repeat
else:
return len(self.data) * self.repeat
def check_data_equal(self, data1, data2):
# Debug only
if len(data1) != len(data2):
return False
for k in data1:
if data1[k] != data2[k]:
return False
return True
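`__len__` multiplies by `repeat` while `__getitem__` indexes modulo the true item count, so one pass over the dataset visits each item `repeat` times. A stdlib sketch of that contract (illustrative class, no torch):

```python
class RepeatedList:
    def __init__(self, items, repeat=1):
        self.items, self.repeat = items, repeat

    def __len__(self):
        # Reported length is the real item count times repeat.
        return len(self.items) * self.repeat

    def __getitem__(self, i):
        # Indexing wraps around the real items.
        return self.items[i % len(self.items)]

ds = RepeatedList(["a", "b", "c"], repeat=2)
assert len(ds) == 6
assert [ds[i] for i in range(6)] == ["a", "b", "c", "a", "b", "c"]
```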


@@ -0,0 +1,2 @@
from .npu_compatible_device import parse_device_type, parse_nccl_backend, get_available_device_type, get_device_name
from .npu_compatible_device import IS_NPU_AVAILABLE, IS_CUDA_AVAILABLE


@@ -0,0 +1,107 @@
import importlib
import torch
from typing import Any
def is_torch_npu_available():
return importlib.util.find_spec("torch_npu") is not None
IS_CUDA_AVAILABLE = torch.cuda.is_available()
IS_NPU_AVAILABLE = is_torch_npu_available() and torch.npu.is_available()
if IS_NPU_AVAILABLE:
import torch_npu
torch.npu.config.allow_internal_format = False
def get_device_type() -> str:
"""Get device type based on current machine, currently only support CPU, CUDA, NPU."""
if IS_CUDA_AVAILABLE:
device = "cuda"
elif IS_NPU_AVAILABLE:
device = "npu"
else:
device = "cpu"
return device
def get_torch_device() -> Any:
"""Get torch attribute based on device type, e.g. torch.cuda or torch.npu"""
device_name = get_device_type()
try:
return getattr(torch, device_name)
except AttributeError:
print(f"Device namespace '{device_name}' not found in torch, try to load 'torch.cuda'.")
return torch.cuda
def get_device_id() -> int:
"""Get current device id based on device type."""
return get_torch_device().current_device()
def get_device_name() -> str:
"""Get current device name based on device type."""
return f"{get_device_type()}:{get_device_id()}"
def synchronize() -> None:
"""Execute torch synchronize operation."""
get_torch_device().synchronize()
def empty_cache() -> None:
"""Execute torch empty cache operation."""
get_torch_device().empty_cache()
def get_nccl_backend() -> str:
"""Return distributed communication backend type based on device type."""
if IS_CUDA_AVAILABLE:
return "nccl"
elif IS_NPU_AVAILABLE:
return "hccl"
else:
raise RuntimeError(f"No available distributed communication backend found on device type {get_device_type()}.")
def enable_high_precision_for_bf16():
"""
Set high accumulation dtype for matmul and reduction.
"""
if IS_CUDA_AVAILABLE:
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
if IS_NPU_AVAILABLE:
torch.npu.matmul.allow_tf32 = False
torch.npu.matmul.allow_bf16_reduced_precision_reduction = False
def parse_device_type(device):
if isinstance(device, str):
if device.startswith("cuda"):
return "cuda"
elif device.startswith("npu"):
return "npu"
else:
return "cpu"
elif isinstance(device, torch.device):
return device.type
def parse_nccl_backend(device_type):
if device_type == "cuda":
return "nccl"
elif device_type == "npu":
return "hccl"
else:
raise RuntimeError(f"No available distributed communication backend found on device type {device_type}.")
def get_available_device_type():
return get_device_type()
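`parse_device_type` only inspects the string prefix (or `torch.device.type`), so `"cuda:1"` and `"cuda"` resolve to the same backend and anything unrecognized falls back to `"cpu"`. The string branch in isolation:

```python
def parse_device_type(device: str) -> str:
    # Only the prefix matters; the device index is ignored.
    if device.startswith("cuda"):
        return "cuda"
    if device.startswith("npu"):
        return "npu"
    return "cpu"

assert parse_device_type("cuda:1") == "cuda"
assert parse_device_type("npu:0") == "npu"
assert parse_device_type("mps") == "cpu"  # anything else falls back to cpu
```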


@@ -0,0 +1 @@
from .gradient_checkpoint import gradient_checkpoint_forward


@@ -0,0 +1,65 @@
import torch
try:
import deepspeed
_HAS_DEEPSPEED = True
except ModuleNotFoundError:
_HAS_DEEPSPEED = False
def create_custom_forward(module):
def custom_forward(*inputs, **kwargs):
return module(*inputs, **kwargs)
return custom_forward
def create_custom_forward_use_reentrant(module):
def custom_forward(*inputs):
return module(*inputs)
return custom_forward
def judge_args_requires_grad(*args):
for arg in args:
if isinstance(arg, torch.Tensor) and arg.requires_grad:
return True
return False
def gradient_checkpoint_forward(
model,
use_gradient_checkpointing,
use_gradient_checkpointing_offload,
*args,
**kwargs,
):
if use_gradient_checkpointing and _HAS_DEEPSPEED and deepspeed.checkpointing.is_configured():
all_args = args + tuple(kwargs.values())
if not judge_args_requires_grad(*all_args):
# get the first grad_enabled tensor from un_checkpointed forward
model_output = model(*args, **kwargs)
else:
model_output = deepspeed.checkpointing.checkpoint(
create_custom_forward_use_reentrant(model),
*all_args,
)
return model_output
if use_gradient_checkpointing_offload:
with torch.autograd.graph.save_on_cpu():
model_output = torch.utils.checkpoint.checkpoint(
create_custom_forward(model),
*args,
**kwargs,
use_reentrant=False,
)
elif use_gradient_checkpointing:
model_output = torch.utils.checkpoint.checkpoint(
create_custom_forward(model),
*args,
**kwargs,
use_reentrant=False,
)
else:
model_output = model(*args, **kwargs)
return model_output
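On the plain PyTorch path, `torch.utils.checkpoint` drops intermediate activations and recomputes them during backward; the forward output matches the uncheckpointed call. A minimal check (assumes PyTorch is installed; the layer and shapes are arbitrary):

```python
import torch
from torch.utils.checkpoint import checkpoint

layer = torch.nn.Linear(4, 4)
x = torch.randn(2, 4, requires_grad=True)

y_plain = layer(x)
y_ckpt = checkpoint(layer, x, use_reentrant=False)  # activations recomputed in backward

assert torch.allclose(y_plain, y_ckpt)
y_ckpt.sum().backward()       # gradients flow through the checkpointed call
assert x.grad is not None
```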


@@ -0,0 +1,3 @@
from .file import load_state_dict, hash_state_dict_keys, hash_model_file
from .model import load_model, load_model_with_disk_offload
from .config import ModelConfig


@@ -0,0 +1,119 @@
import torch, glob, os
from typing import Optional, Union, Dict
from dataclasses import dataclass
from modelscope import snapshot_download
from huggingface_hub import snapshot_download as hf_snapshot_download
@dataclass
class ModelConfig:
path: Union[str, list[str]] = None
model_id: str = None
origin_file_pattern: Union[str, list[str]] = None
download_source: str = None
local_model_path: str = None
skip_download: bool = None
offload_device: Optional[Union[str, torch.device]] = None
offload_dtype: Optional[torch.dtype] = None
onload_device: Optional[Union[str, torch.device]] = None
onload_dtype: Optional[torch.dtype] = None
preparing_device: Optional[Union[str, torch.device]] = None
preparing_dtype: Optional[torch.dtype] = None
computation_device: Optional[Union[str, torch.device]] = None
computation_dtype: Optional[torch.dtype] = None
clear_parameters: bool = False
state_dict: Dict[str, torch.Tensor] = None
def check_input(self):
if self.path is None and self.model_id is None:
raise ValueError(f"""No valid model files. Please use `ModelConfig(path="xxx")` or `ModelConfig(model_id="xxx/yyy", origin_file_pattern="zzz")`. `skip_download=True` only supports the first one.""")
def parse_original_file_pattern(self):
if self.origin_file_pattern in [None, "", "./"]:
return "*"
elif self.origin_file_pattern.endswith("/"):
return self.origin_file_pattern + "*"
else:
return self.origin_file_pattern
def parse_download_source(self):
if self.download_source is None:
if os.environ.get('DIFFSYNTH_DOWNLOAD_SOURCE') is not None:
return os.environ.get('DIFFSYNTH_DOWNLOAD_SOURCE')
else:
return "modelscope"
else:
return self.download_source
def parse_skip_download(self):
if self.skip_download is None:
if os.environ.get('DIFFSYNTH_SKIP_DOWNLOAD') is not None:
if os.environ.get('DIFFSYNTH_SKIP_DOWNLOAD').lower() == "true":
return True
elif os.environ.get('DIFFSYNTH_SKIP_DOWNLOAD').lower() == "false":
return False
else:
return False
else:
return self.skip_download
def download(self):
origin_file_pattern = self.parse_original_file_pattern()
downloaded_files = glob.glob(origin_file_pattern, root_dir=os.path.join(self.local_model_path, self.model_id))
download_source = self.parse_download_source()
if download_source.lower() == "modelscope":
snapshot_download(
self.model_id,
local_dir=os.path.join(self.local_model_path, self.model_id),
allow_file_pattern=origin_file_pattern,
ignore_file_pattern=downloaded_files,
local_files_only=False
)
elif download_source.lower() == "huggingface":
hf_snapshot_download(
self.model_id,
local_dir=os.path.join(self.local_model_path, self.model_id),
allow_patterns=origin_file_pattern,
ignore_patterns=downloaded_files,
local_files_only=False
)
else:
raise ValueError("`download_source` should be `modelscope` or `huggingface`.")
def require_downloading(self):
if self.path is not None:
return False
skip_download = self.parse_skip_download()
return not skip_download
def reset_local_model_path(self):
if os.environ.get('DIFFSYNTH_MODEL_BASE_PATH') is not None:
self.local_model_path = os.environ.get('DIFFSYNTH_MODEL_BASE_PATH')
elif self.local_model_path is None:
self.local_model_path = "./models"
def download_if_necessary(self):
self.check_input()
self.reset_local_model_path()
if self.require_downloading():
self.download()
if self.path is None:
if self.origin_file_pattern in [None, "", "./"]:
self.path = os.path.join(self.local_model_path, self.model_id)
else:
self.path = glob.glob(os.path.join(self.local_model_path, self.model_id, self.origin_file_pattern))
if isinstance(self.path, list) and len(self.path) == 1:
self.path = self.path[0]
def vram_config(self):
return {
"offload_device": self.offload_device,
"offload_dtype": self.offload_dtype,
"onload_device": self.onload_device,
"onload_dtype": self.onload_dtype,
"preparing_device": self.preparing_device,
"preparing_dtype": self.preparing_dtype,
"computation_device": self.computation_device,
"computation_dtype": self.computation_dtype,
}
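`parse_download_source` resolves the source with the precedence: explicit field, then the `DIFFSYNTH_DOWNLOAD_SOURCE` environment variable, then the `"modelscope"` default. A condensed sketch of that precedence (a standalone function, not the method itself):

```python
import os

def parse_download_source(explicit=None, default="modelscope"):
    # Explicit value wins; otherwise fall back to the env var, then the default.
    if explicit is not None:
        return explicit
    return os.environ.get("DIFFSYNTH_DOWNLOAD_SOURCE", default)

os.environ.pop("DIFFSYNTH_DOWNLOAD_SOURCE", None)
assert parse_download_source() == "modelscope"
os.environ["DIFFSYNTH_DOWNLOAD_SOURCE"] = "huggingface"
assert parse_download_source() == "huggingface"
assert parse_download_source(explicit="modelscope") == "modelscope"  # field wins over env
```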


@@ -0,0 +1,130 @@
from safetensors import safe_open
import torch, hashlib
def load_state_dict(file_path, torch_dtype=None, device="cpu", pin_memory=False, verbose=0):
if isinstance(file_path, list):
state_dict = {}
for file_path_ in file_path:
state_dict.update(load_state_dict(file_path_, torch_dtype, device, pin_memory=pin_memory, verbose=verbose))
else:
if verbose >= 1:
print(f"Loading file [started]: {file_path}")
if file_path.endswith(".safetensors"):
state_dict = load_state_dict_from_safetensors(file_path, torch_dtype=torch_dtype, device=device)
else:
state_dict = load_state_dict_from_bin(file_path, torch_dtype=torch_dtype, device=device)
# If the state dict is loaded into CPU memory, `pin_memory=True` makes a later `model.to("cuda")` faster.
if pin_memory:
for i in state_dict:
state_dict[i] = state_dict[i].pin_memory()
if verbose >= 1:
print(f"Loading file [done]: {file_path}")
return state_dict
def load_state_dict_from_safetensors(file_path, torch_dtype=None, device="cpu"):
state_dict = {}
with safe_open(file_path, framework="pt", device=str(device)) as f:
for k in f.keys():
state_dict[k] = f.get_tensor(k)
if torch_dtype is not None:
state_dict[k] = state_dict[k].to(torch_dtype)
return state_dict
def load_state_dict_from_bin(file_path, torch_dtype=None, device="cpu"):
state_dict = torch.load(file_path, map_location=device, weights_only=True)
if len(state_dict) == 1:
if "state_dict" in state_dict:
state_dict = state_dict["state_dict"]
elif "module" in state_dict:
state_dict = state_dict["module"]
elif "model_state" in state_dict:
state_dict = state_dict["model_state"]
if torch_dtype is not None:
for i in state_dict:
if isinstance(state_dict[i], torch.Tensor):
state_dict[i] = state_dict[i].to(torch_dtype)
return state_dict
def convert_state_dict_keys_to_single_str(state_dict, with_shape=True):
keys = []
for key, value in state_dict.items():
if isinstance(key, str):
if isinstance(value, torch.Tensor):
if with_shape:
shape = "_".join(map(str, list(value.shape)))
keys.append(key + ":" + shape)
keys.append(key)
elif isinstance(value, dict):
keys.append(key + "|" + convert_state_dict_keys_to_single_str(value, with_shape=with_shape))
keys.sort()
keys_str = ",".join(keys)
return keys_str
def hash_state_dict_keys(state_dict, with_shape=True):
keys_str = convert_state_dict_keys_to_single_str(state_dict, with_shape=with_shape)
keys_str = keys_str.encode(encoding="UTF-8")
return hashlib.md5(keys_str).hexdigest()
def load_keys_dict(file_path):
if isinstance(file_path, list):
state_dict = {}
for file_path_ in file_path:
state_dict.update(load_keys_dict(file_path_))
return state_dict
if file_path.endswith(".safetensors"):
return load_keys_dict_from_safetensors(file_path)
else:
return load_keys_dict_from_bin(file_path)
def load_keys_dict_from_safetensors(file_path):
keys_dict = {}
with safe_open(file_path, framework="pt", device="cpu") as f:
for k in f.keys():
keys_dict[k] = f.get_slice(k).get_shape()
return keys_dict
def convert_state_dict_to_keys_dict(state_dict):
keys_dict = {}
for k, v in state_dict.items():
if isinstance(v, torch.Tensor):
keys_dict[k] = list(v.shape)
else:
keys_dict[k] = convert_state_dict_to_keys_dict(v)
return keys_dict
def load_keys_dict_from_bin(file_path):
state_dict = load_state_dict_from_bin(file_path)
keys_dict = convert_state_dict_to_keys_dict(state_dict)
return keys_dict
def convert_keys_dict_to_single_str(state_dict, with_shape=True):
keys = []
for key, value in state_dict.items():
if isinstance(key, str):
if isinstance(value, dict):
keys.append(key + "|" + convert_keys_dict_to_single_str(value, with_shape=with_shape))
else:
if with_shape:
shape = "_".join(map(str, list(value)))
keys.append(key + ":" + shape)
keys.append(key)
keys.sort()
keys_str = ",".join(keys)
return keys_str
def hash_model_file(path, with_shape=True):
keys_dict = load_keys_dict(path)
keys_str = convert_keys_dict_to_single_str(keys_dict, with_shape=with_shape)
keys_str = keys_str.encode(encoding="UTF-8")
return hashlib.md5(keys_str).hexdigest()
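`hash_model_file` fingerprints a checkpoint from key names and tensor shapes alone, never reading tensor data, so it stays cheap even for large files and two files with identical layouts hash identically. The flat (non-nested) case of the scheme, sketched with plain shape lists:

```python
import hashlib

def hash_keys(keys_dict, with_shape=True):
    # Each key contributes "name:shape" plus the bare name; sorting makes
    # the result independent of dict ordering.
    keys = []
    for key, shape in keys_dict.items():
        if with_shape:
            keys.append(key + ":" + "_".join(map(str, shape)))
        keys.append(key)
    keys.sort()
    return hashlib.md5(",".join(keys).encode("utf-8")).hexdigest()

a = hash_keys({"w": [4, 4], "b": [4]})
b = hash_keys({"b": [4], "w": [4, 4]})          # same layout, different order
assert a == b
assert hash_keys({"w": [8, 4], "b": [4]}) != a  # a shape change alters the hash
```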


@@ -0,0 +1,105 @@
from ..vram.initialization import skip_model_initialization
from ..vram.disk_map import DiskMap
from ..vram.layers import enable_vram_management
from .file import load_state_dict
import torch
from contextlib import contextmanager
from transformers.integrations import is_deepspeed_zero3_enabled
from transformers.utils import ContextManagers
def load_model(model_class, path, config=None, torch_dtype=torch.bfloat16, device="cpu", state_dict_converter=None, use_disk_map=False, module_map=None, vram_config=None, vram_limit=None, state_dict=None):
config = {} if config is None else config
with ContextManagers(get_init_context(torch_dtype=torch_dtype, device=device)):
model = model_class(**config)
# What is `module_map`?
# This is a module mapping table for VRAM management.
if module_map is not None:
devices = [vram_config["offload_device"], vram_config["onload_device"], vram_config["preparing_device"], vram_config["computation_device"]]
device = [d for d in devices if d != "disk"][0]
dtypes = [vram_config["offload_dtype"], vram_config["onload_dtype"], vram_config["preparing_dtype"], vram_config["computation_dtype"]]
dtype = [d for d in dtypes if d != "disk"][0]
if vram_config["offload_device"] != "disk":
if state_dict is None: state_dict = DiskMap(path, device, torch_dtype=dtype)
if state_dict_converter is not None:
state_dict = state_dict_converter(state_dict)
else:
state_dict = {i: state_dict[i] for i in state_dict}
model.load_state_dict(state_dict, assign=True)
model = enable_vram_management(model, module_map, vram_config=vram_config, disk_map=None, vram_limit=vram_limit)
else:
disk_map = DiskMap(path, device, state_dict_converter=state_dict_converter)
model = enable_vram_management(model, module_map, vram_config=vram_config, disk_map=disk_map, vram_limit=vram_limit)
else:
# Why do we use `DiskMap`?
# Sometimes a model file contains multiple models,
# and DiskMap can load only the parameters of a single model,
# avoiding the need to load all parameters in the file.
if state_dict is not None:
pass
elif use_disk_map:
state_dict = DiskMap(path, device, torch_dtype=torch_dtype)
else:
state_dict = load_state_dict(path, torch_dtype, device)
# Why do we use `state_dict_converter`?
# Some models are saved in complex formats,
# and we need to convert the state dict into the appropriate format.
if state_dict_converter is not None:
state_dict = state_dict_converter(state_dict)
else:
state_dict = {i: state_dict[i] for i in state_dict}
# Why does DeepSpeed ZeRO Stage 3 need to be handled separately?
# Because at this stage, model parameters are partitioned across multiple GPUs.
# Loading them directly could lead to excessive GPU memory consumption.
if is_deepspeed_zero3_enabled():
from transformers.integrations.deepspeed import _load_state_dict_into_zero3_model
_load_state_dict_into_zero3_model(model, state_dict)
else:
model.load_state_dict(state_dict, assign=True)
# Why do we call `to()`?
# Because some models override the behavior of `to()`,
# especially those from libraries like Transformers.
model = model.to(dtype=torch_dtype, device=device)
if hasattr(model, "eval"):
model = model.eval()
return model
def load_model_with_disk_offload(model_class, path, config=None, torch_dtype=torch.bfloat16, device="cpu", state_dict_converter=None, module_map=None):
if isinstance(path, str):
path = [path]
config = {} if config is None else config
with skip_model_initialization():
model = model_class(**config)
if hasattr(model, "eval"):
model = model.eval()
disk_map = DiskMap(path, device, state_dict_converter=state_dict_converter)
vram_config = {
"offload_dtype": "disk",
"offload_device": "disk",
"onload_dtype": "disk",
"onload_device": "disk",
"preparing_dtype": torch.float8_e4m3fn,
"preparing_device": device,
"computation_dtype": torch_dtype,
"computation_device": device,
}
enable_vram_management(model, module_map, vram_config=vram_config, disk_map=disk_map, vram_limit=80)
return model
def get_init_context(torch_dtype, device):
if is_deepspeed_zero3_enabled():
from transformers.modeling_utils import set_zero3_state
import deepspeed
# Why do we use "deepspeed.zero.Init"?
# It partitions the model weights on the CPU side
# and loads each partition onto the target compute device.
init_contexts = [deepspeed.zero.Init(remote_device=device, dtype=torch_dtype), set_zero3_state()]
else:
# Why do we use `skip_model_initialization`?
# It skips the random initialization of model parameters,
# thereby speeding up model loading and avoiding excessive memory usage.
init_contexts = [skip_model_initialization()]
return init_contexts


@@ -0,0 +1,30 @@
import torch
from ..device.npu_compatible_device import get_device_type
try:
import torch_npu
except ImportError:
pass
def rms_norm_forward_npu(self, hidden_states):
"npu rms fused operator for RMSNorm.forward from diffsynth/models/general_modules.py"
if hidden_states.dtype != self.weight.dtype:
hidden_states = hidden_states.to(self.weight.dtype)
return torch_npu.npu_rms_norm(hidden_states, self.weight, self.eps)[0]
def rms_norm_forward_transformers_npu(self, hidden_states):
"npu rms fused operator for transformers"
if hidden_states.dtype != self.weight.dtype:
hidden_states = hidden_states.to(self.weight.dtype)
return torch_npu.npu_rms_norm(hidden_states, self.weight, self.variance_epsilon)[0]
def rotary_emb_Zimage_npu(self, x_in: torch.Tensor, freqs_cis: torch.Tensor):
"npu rope fused operator for Zimage"
with torch.amp.autocast(get_device_type(), enabled=False):
freqs_cis = freqs_cis.unsqueeze(2)
cos, sin = torch.chunk(torch.view_as_real(freqs_cis), 2, dim=-1)
cos = cos.expand(-1, -1, -1, -1, 2).flatten(-2)
sin = sin.expand(-1, -1, -1, -1, 2).flatten(-2)
return torch_npu.npu_rotary_mul(x_in, cos, sin, rotary_mode="interleave").to(x_in)


@@ -0,0 +1,2 @@
from .initialization import skip_model_initialization
from .layers import *


@@ -0,0 +1,93 @@
from safetensors import safe_open
import torch, os
class SafetensorsCompatibleTensor:
def __init__(self, tensor):
self.tensor = tensor
def get_shape(self):
return list(self.tensor.shape)
class SafetensorsCompatibleBinaryLoader:
def __init__(self, path, device):
print("Detected non-safetensors files, which may cause slower loading. It's recommended to convert it to a safetensors file.")
self.state_dict = torch.load(path, weights_only=True, map_location=device)
def keys(self):
return self.state_dict.keys()
def get_tensor(self, name):
return self.state_dict[name]
def get_slice(self, name):
return SafetensorsCompatibleTensor(self.state_dict[name])
class DiskMap:
def __init__(self, path, device, torch_dtype=None, state_dict_converter=None, buffer_size=10**9):
self.path = path if isinstance(path, list) else [path]
self.device = device
self.torch_dtype = torch_dtype
if os.environ.get('DIFFSYNTH_DISK_MAP_BUFFER_SIZE') is not None:
self.buffer_size = int(os.environ.get('DIFFSYNTH_DISK_MAP_BUFFER_SIZE'))
else:
self.buffer_size = buffer_size
self.files = []
self.flush_files()
self.name_map = {}
for file_id, file in enumerate(self.files):
for name in file.keys():
self.name_map[name] = file_id
self.rename_dict = self.fetch_rename_dict(state_dict_converter)
def flush_files(self):
if len(self.files) == 0:
for path in self.path:
if path.endswith(".safetensors"):
self.files.append(safe_open(path, framework="pt", device=str(self.device)))
else:
self.files.append(SafetensorsCompatibleBinaryLoader(path, device=self.device))
else:
for i, path in enumerate(self.path):
if path.endswith(".safetensors"):
self.files[i] = safe_open(path, framework="pt", device=str(self.device))
self.num_params = 0
def __getitem__(self, name):
if self.rename_dict is not None: name = self.rename_dict[name]
file_id = self.name_map[name]
param = self.files[file_id].get_tensor(name)
if self.torch_dtype is not None and isinstance(param, torch.Tensor):
param = param.to(self.torch_dtype)
if isinstance(param, torch.Tensor) and param.device == "cpu":
param = param.clone()
if isinstance(param, torch.Tensor):
self.num_params += param.numel()
if self.num_params > self.buffer_size:
self.flush_files()
return param
def fetch_rename_dict(self, state_dict_converter):
if state_dict_converter is None:
return None
state_dict = {}
for file in self.files:
for name in file.keys():
state_dict[name] = name
state_dict = state_dict_converter(state_dict)
return state_dict
def __iter__(self):
if self.rename_dict is not None:
return self.rename_dict.__iter__()
else:
return self.name_map.__iter__()
def __contains__(self, x):
if self.rename_dict is not None:
return x in self.rename_dict
else:
return x in self.name_map
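`DiskMap` builds a one-time index from parameter name to source file, then reads tensors lazily through it. The index-building pass, sketched with plain dicts standing in for `safe_open` handles (hypothetical stand-ins; only `.keys()` matters here):

```python
# Each stand-in file exposes .keys(), like a safetensors handle.
files = [
    {"a.weight": None, "a.bias": None},  # file 0
    {"b.weight": None},                  # file 1
]

# First pass: remember which file each parameter lives in.
name_map = {}
for file_id, f in enumerate(files):
    for name in f.keys():
        name_map[name] = file_id

assert name_map["a.bias"] == 0
assert name_map["b.weight"] == 1
```

A lookup then goes name → file id → `get_tensor(name)` on that one file, so only the requested parameter is read from disk.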


@@ -0,0 +1,21 @@
import torch
from contextlib import contextmanager
@contextmanager
def skip_model_initialization(device=torch.device("meta")):
def register_empty_parameter(module, name, param):
old_register_parameter(module, name, param)
if param is not None:
param_cls = type(module._parameters[name])
kwargs = module._parameters[name].__dict__
kwargs["requires_grad"] = param.requires_grad
module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs)
old_register_parameter = torch.nn.Module.register_parameter
torch.nn.Module.register_parameter = register_empty_parameter
try:
yield
finally:
torch.nn.Module.register_parameter = old_register_parameter


@@ -0,0 +1,479 @@
import torch, copy
from typing import Union
from .initialization import skip_model_initialization
from .disk_map import DiskMap
from ..device import parse_device_type, get_device_name, IS_NPU_AVAILABLE
class AutoTorchModule(torch.nn.Module):
def __init__(
self,
offload_dtype: torch.dtype = None,
offload_device: Union[str, torch.device] = None,
onload_dtype: torch.dtype = None,
onload_device: Union[str, torch.device] = None,
preparing_dtype: torch.dtype = None,
preparing_device: Union[str, torch.device] = None,
computation_dtype: torch.dtype = None,
computation_device: Union[str, torch.device] = None,
vram_limit: float = None,
):
super().__init__()
self.set_dtype_and_device(
offload_dtype,
offload_device,
onload_dtype,
onload_device,
preparing_dtype,
preparing_device,
computation_dtype,
computation_device,
vram_limit,
)
self.state = 0
self.name = ""
self.computation_device_type = parse_device_type(self.computation_device)
def set_dtype_and_device(
self,
offload_dtype: torch.dtype = None,
offload_device: Union[str, torch.device] = None,
onload_dtype: torch.dtype = None,
onload_device: Union[str, torch.device] = None,
preparing_dtype: torch.dtype = None,
preparing_device: Union[str, torch.device] = None,
computation_dtype: torch.dtype = None,
computation_device: Union[str, torch.device] = None,
vram_limit: float = None,
):
self.offload_dtype = offload_dtype or computation_dtype
self.offload_device = offload_device or computation_device
self.onload_dtype = onload_dtype or computation_dtype
self.onload_device = onload_device or computation_device
self.preparing_dtype = preparing_dtype or computation_dtype
self.preparing_device = preparing_device or computation_device
self.computation_dtype = computation_dtype
self.computation_device = computation_device
self.vram_limit = vram_limit
def cast_to(self, weight, dtype, device):
r = torch.empty_like(weight, dtype=dtype, device=device)
r.copy_(weight)
return r
def check_free_vram(self):
device = self.computation_device if not IS_NPU_AVAILABLE else get_device_name()
gpu_mem_state = getattr(torch, self.computation_device_type).mem_get_info(device)
used_memory = (gpu_mem_state[1] - gpu_mem_state[0]) / (1024**3)
return used_memory < self.vram_limit
def offload(self):
if self.state != 0:
self.to(dtype=self.offload_dtype, device=self.offload_device)
self.state = 0
def onload(self):
if self.state != 1:
self.to(dtype=self.onload_dtype, device=self.onload_device)
self.state = 1
def param_name(self, name):
if self.name == "":
return name
else:
return self.name + "." + name
class AutoWrappedModule(AutoTorchModule):
def __init__(
self,
module: torch.nn.Module,
offload_dtype: torch.dtype = None,
offload_device: Union[str, torch.device] = None,
onload_dtype: torch.dtype = None,
onload_device: Union[str, torch.device] = None,
preparing_dtype: torch.dtype = None,
preparing_device: Union[str, torch.device] = None,
computation_dtype: torch.dtype = None,
computation_device: Union[str, torch.device] = None,
vram_limit: float = None,
name: str = "",
disk_map: DiskMap = None,
**kwargs
):
super().__init__(
offload_dtype,
offload_device,
onload_dtype,
onload_device,
preparing_dtype,
preparing_device,
computation_dtype,
computation_device,
vram_limit,
)
self.module = module
if offload_dtype == "disk":
self.name = name
self.disk_map = disk_map
self.required_params = [name for name, _ in self.module.named_parameters()]
self.disk_offload = True
else:
self.disk_offload = False
def load_from_disk(self, torch_dtype, device, copy_module=False):
if copy_module:
module = copy.deepcopy(self.module)
else:
module = self.module
state_dict = {}
for name in self.required_params:
param = self.disk_map[self.param_name(name)]
param = param.to(dtype=torch_dtype, device=device)
state_dict[name] = param
module.load_state_dict(state_dict, assign=True)
module.to(dtype=torch_dtype, device=device)
return module
def offload_to_disk(self, model: torch.nn.Module):
# If some parameters are registered as buffers (not in the state dict),
# we cannot move the whole model to "meta" directly; recurse into children instead.
if next(iter(model.buffers()), None) is not None:
for child in model.children():
self.offload_to_disk(child)
else:
model.to("meta")
def offload(self):
# offload / onload / preparing -> offload
if self.state != 0:
if self.disk_offload:
self.offload_to_disk(self.module)
else:
self.to(dtype=self.offload_dtype, device=self.offload_device)
self.state = 0
def onload(self):
# offload / onload / preparing -> onload
if self.state < 1:
if self.disk_offload and self.onload_device != "disk" and self.offload_device == "disk":
self.load_from_disk(self.onload_dtype, self.onload_device)
elif self.onload_device != "disk":
self.to(dtype=self.onload_dtype, device=self.onload_device)
self.state = 1
def preparing(self):
# onload / preparing -> preparing
if self.state != 2:
if self.disk_offload and self.preparing_device != "disk" and self.onload_device == "disk":
self.load_from_disk(self.preparing_dtype, self.preparing_device)
elif self.preparing_device != "disk":
self.to(dtype=self.preparing_dtype, device=self.preparing_device)
self.state = 2
def cast_to(self, module, dtype, device):
return copy.deepcopy(module).to(dtype=dtype, device=device)
def computation(self):
# onload / preparing -> computation (temporary)
if self.state == 2:
torch_dtype, device = self.preparing_dtype, self.preparing_device
else:
torch_dtype, device = self.onload_dtype, self.onload_device
if torch_dtype == self.computation_dtype and device == self.computation_device:
module = self.module
elif self.disk_offload and device == "disk":
module = self.load_from_disk(self.computation_dtype, self.computation_device, copy_module=True)
else:
module = self.cast_to(self.module, dtype=self.computation_dtype, device=self.computation_device)
return module
def forward(self, *args, **kwargs):
if self.state == 1 and (self.vram_limit is None or self.check_free_vram()):
self.preparing()
module = self.computation()
return module(*args, **kwargs)
def __getattr__(self, name):
if name in self.__dict__ or name == "module":
return super().__getattr__(name)
else:
return getattr(self.module, name)
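The wrapped modules above cycle through three numeric states (0 = offloaded, 1 = onloaded, 2 = preparing) before a temporary computation copy is produced. A minimal standalone sketch of that lifecycle, with illustrative device strings that are not taken from the source:

```python
OFFLOAD, ONLOAD, PREPARING = 0, 1, 2

class LayerState:
    """Schematic three-state lifecycle mirroring AutoWrappedModule (devices illustrative)."""
    def __init__(self, offload_device="disk", onload_device="cpu", preparing_device="cuda"):
        self.offload_device = offload_device
        self.onload_device = onload_device
        self.preparing_device = preparing_device
        self.state, self.device = OFFLOAD, offload_device

    def onload(self):
        # offload -> onload; onload/preparing states stay where they are
        if self.state < ONLOAD:
            self.device = self.onload_device
            self.state = ONLOAD

    def preparing(self):
        # onload (or offload) -> preparing: weights staged on the compute device
        if self.state != PREPARING:
            self.device = self.preparing_device
            self.state = PREPARING

    def offload(self):
        # any state -> offload
        if self.state != OFFLOAD:
            self.device = self.offload_device
            self.state = OFFLOAD

layer = LayerState()
layer.onload()
layer.preparing()
# layer.device is now "cuda"; layer.offload() would return it to "disk"
```

The actual classes additionally produce a temporary casted copy for computation, so the resident weights never leave the onload/preparing device.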
class AutoWrappedNonRecurseModule(AutoWrappedModule):
def __init__(
self,
module: torch.nn.Module,
offload_dtype: torch.dtype = None,
offload_device: Union[str, torch.device] = None,
onload_dtype: torch.dtype = None,
onload_device: Union[str, torch.device] = None,
preparing_dtype: torch.dtype = None,
preparing_device: Union[str, torch.device] = None,
computation_dtype: torch.dtype = None,
computation_device: Union[str, torch.device] = None,
vram_limit: float = None,
name: str = "",
disk_map: DiskMap = None,
**kwargs
):
super().__init__(
module,
offload_dtype,
offload_device,
onload_dtype,
onload_device,
preparing_dtype,
preparing_device,
computation_dtype,
computation_device,
vram_limit,
name,
disk_map,
**kwargs
)
if self.disk_offload:
self.required_params = [name for name, _ in self.module.named_parameters(recurse=False)]
def load_from_disk(self, torch_dtype, device, copy_module=False):
if copy_module:
module = copy.deepcopy(self.module)
else:
module = self.module
state_dict = {}
for name in self.required_params:
param = self.disk_map[self.param_name(name)]
param = param.to(dtype=torch_dtype, device=device)
state_dict[name] = param
module.load_state_dict(state_dict, assign=True, strict=False)
return module
def offload_to_disk(self, model: torch.nn.Module):
for name in self.required_params:
getattr(self, name).to("meta")
def cast_to(self, module, dtype, device):
# Parameter casting is implemented in the model architecture.
return module
def __getattr__(self, name):
if name in self.__dict__ or name == "module":
return super().__getattr__(name)
else:
return getattr(self.module, name)
class AutoWrappedLinear(torch.nn.Linear, AutoTorchModule):
def __init__(
self,
module: torch.nn.Linear,
offload_dtype: torch.dtype = None,
offload_device: Union[str, torch.device] = None,
onload_dtype: torch.dtype = None,
onload_device: Union[str, torch.device] = None,
preparing_dtype: torch.dtype = None,
preparing_device: Union[str, torch.device] = None,
computation_dtype: torch.dtype = None,
computation_device: Union[str, torch.device] = None,
vram_limit: float = None,
name: str = "",
disk_map: DiskMap = None,
**kwargs
):
with skip_model_initialization():
super().__init__(
in_features=module.in_features,
out_features=module.out_features,
bias=module.bias is not None,
)
self.set_dtype_and_device(
offload_dtype,
offload_device,
onload_dtype,
onload_device,
preparing_dtype,
preparing_device,
computation_dtype,
computation_device,
vram_limit,
)
self.weight = module.weight
self.bias = module.bias
self.state = 0
self.name = name
self.lora_A_weights = []
self.lora_B_weights = []
self.lora_merger = None
self.enable_fp8 = computation_dtype in [torch.float8_e4m3fn, torch.float8_e4m3fnuz]
self.computation_device_type = parse_device_type(self.computation_device)
if offload_dtype == "disk":
self.disk_map = disk_map
self.disk_offload = True
else:
self.disk_offload = False
def fp8_linear(
self,
input: torch.Tensor,
weight: torch.Tensor,
bias: torch.Tensor = None,
) -> torch.Tensor:
device = input.device
origin_dtype = input.dtype
origin_shape = input.shape
input = input.reshape(-1, origin_shape[-1])
x_max = torch.max(torch.abs(input), dim=-1, keepdim=True).values
fp8_max = 448.0
# For float8_e4m3fnuz, the maximum representable value is half of that of e4m3fn.
# To avoid overflow and ensure numerical compatibility during FP8 computation,
# we scale down the input by 2.0 in advance.
# This scaling will be compensated later during the final result scaling.
if self.computation_dtype == torch.float8_e4m3fnuz:
fp8_max = fp8_max / 2.0
scale_a = torch.clamp(x_max / fp8_max, min=1.0).float().to(device=device)
scale_b = torch.ones((weight.shape[0], 1)).to(device=device)
input = input / (scale_a + 1e-8)
input = input.to(self.computation_dtype)
weight = weight.to(self.computation_dtype)
bias = None if bias is None else bias.to(torch.bfloat16)
result = torch._scaled_mm(
input,
weight.T,
scale_a=scale_a,
scale_b=scale_b.T,
bias=bias,
out_dtype=origin_dtype,
)
new_shape = origin_shape[:-1] + result.shape[-1:]
result = result.reshape(new_shape)
return result
def load_from_disk(self, torch_dtype, device, assign=True):
weight = self.disk_map[self.name + ".weight"].to(dtype=torch_dtype, device=device)
bias = None if self.bias is None else self.disk_map[self.name + ".bias"].to(dtype=torch_dtype, device=device)
if assign:
state_dict = {"weight": weight}
if bias is not None: state_dict["bias"] = bias
self.load_state_dict(state_dict, assign=True)
return weight, bias
def offload(self):
# offload / onload / preparing -> offload
if self.state != 0:
if self.disk_offload:
self.to("meta")
else:
self.to(dtype=self.offload_dtype, device=self.offload_device)
self.state = 0
def onload(self):
# offload / onload / preparing -> onload
if self.state < 1:
if self.disk_offload and self.onload_device != "disk" and self.offload_device == "disk":
self.load_from_disk(self.onload_dtype, self.onload_device)
elif self.onload_device != "disk":
self.to(dtype=self.onload_dtype, device=self.onload_device)
self.state = 1
def preparing(self):
# onload / preparing -> preparing
if self.state != 2:
if self.disk_offload and self.preparing_device != "disk" and self.onload_device == "disk":
self.load_from_disk(self.preparing_dtype, self.preparing_device)
elif self.preparing_device != "disk":
self.to(dtype=self.preparing_dtype, device=self.preparing_device)
self.state = 2
def computation(self):
# onload / preparing -> computation (temporary)
if self.state == 2:
torch_dtype, device = self.preparing_dtype, self.preparing_device
else:
torch_dtype, device = self.onload_dtype, self.onload_device
if torch_dtype == self.computation_dtype and device == self.computation_device:
weight, bias = self.weight, self.bias
elif self.disk_offload and device == "disk":
weight, bias = self.load_from_disk(self.computation_dtype, self.computation_device, assign=False)
else:
weight = self.cast_to(self.weight, self.computation_dtype, self.computation_device)
bias = None if self.bias is None else self.cast_to(self.bias, self.computation_dtype, self.computation_device)
return weight, bias
def linear_forward(self, x, weight, bias):
if self.enable_fp8:
out = self.fp8_linear(x, weight, bias)
else:
out = torch.nn.functional.linear(x, weight, bias)
return out
def lora_forward(self, x, out):
if self.lora_merger is None:
for lora_A, lora_B in zip(self.lora_A_weights, self.lora_B_weights):
out = out + x @ lora_A.T.to(device=x.device, dtype=x.dtype) @ lora_B.T.to(device=x.device, dtype=x.dtype)
else:
lora_output = []
for lora_A, lora_B in zip(self.lora_A_weights, self.lora_B_weights):
lora_output.append(x @ lora_A.T @ lora_B.T)
lora_output = torch.stack(lora_output)
out = self.lora_merger(out, lora_output)
return out
def forward(self, x, *args, **kwargs):
if self.state == 1 and (self.vram_limit is None or self.check_free_vram()):
self.preparing()
weight, bias = self.computation()
out = self.linear_forward(x, weight, bias)
if len(self.lora_A_weights) > 0:
out = self.lora_forward(x, out)
return out
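`fp8_linear` above rescales each input row so it fits into FP8's representable range (448 for e4m3fn), and `torch._scaled_mm` multiplies the per-row scale back into the result via `scale_a`. The compensation identity behind this, sketched in plain Python with illustrative numbers:

```python
FP8_E4M3_MAX = 448.0

def dynamic_scale(row):
    # clamp(max|x| / 448, min=1): rows already in range keep scale 1.0
    return max(1.0, max(abs(v) for v in row) / FP8_E4M3_MAX)

row = [120.0, -900.0, 3.5]
scale = dynamic_scale(row)            # ~2.009 here
scaled = [v / scale for v in row]     # every entry now fits in the FP8 range
assert all(abs(v) <= FP8_E4M3_MAX for v in scaled)

# Since ((x / s) @ W) * s == x @ W, scaling the matmul result back by
# `scale` (what scale_a does inside _scaled_mm) recovers the product
# up to quantization error:
recovered = [v * scale for v in scaled]
assert all(abs(a - b) < 1e-9 for a, b in zip(recovered, row))
```

The e4m3fnuz halving of `fp8_max` in the source follows the same logic with a smaller representable maximum.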
def enable_vram_management_recursively(model: torch.nn.Module, module_map: dict, vram_config: dict, vram_limit=None, name_prefix="", disk_map=None, **kwargs):
if isinstance(model, AutoWrappedNonRecurseModule):
model = model.module
for name, module in model.named_children():
layer_name = name if name_prefix == "" else name_prefix + "." + name
for source_module, target_module in module_map.items():
if isinstance(module, source_module):
module_ = target_module(module, **vram_config, vram_limit=vram_limit, name=layer_name, disk_map=disk_map, **kwargs)
if isinstance(module_, AutoWrappedNonRecurseModule):
enable_vram_management_recursively(module_, module_map, vram_config, vram_limit=vram_limit, name_prefix=layer_name, disk_map=disk_map, **kwargs)
setattr(model, name, module_)
break
else:
enable_vram_management_recursively(module, module_map, vram_config, vram_limit=vram_limit, name_prefix=layer_name, disk_map=disk_map, **kwargs)
def fill_vram_config(model, vram_config):
vram_config_ = vram_config.copy()
vram_config_["onload_dtype"] = vram_config["computation_dtype"]
vram_config_["onload_device"] = vram_config["computation_device"]
vram_config_["preparing_dtype"] = vram_config["computation_dtype"]
vram_config_["preparing_device"] = vram_config["computation_device"]
for k in vram_config:
if vram_config[k] != vram_config_[k]:
print(f"No fine-grained VRAM configuration is provided for {model.__class__.__name__}. [`onload`, `preparing`, `computation`] will share the same dtype and device. `vram_config` is set to {vram_config_}")
break
return vram_config_
def enable_vram_management(model: torch.nn.Module, module_map: dict, vram_config: dict, vram_limit=None, disk_map=None, **kwargs):
for source_module, target_module in module_map.items():
# If no fine-grained VRAM configuration is provided, the entire model will be managed uniformly.
if isinstance(model, source_module):
vram_config = fill_vram_config(model, vram_config)
model = target_module(model, **vram_config, vram_limit=vram_limit, disk_map=disk_map, **kwargs)
break
else:
enable_vram_management_recursively(model, module_map, vram_config, vram_limit=vram_limit, disk_map=disk_map, **kwargs)
# `vram_management_enabled` is a flag that allows the pipeline to determine whether VRAM management is enabled.
model.vram_management_enabled = True
return model
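`fill_vram_config` above collapses the onload and preparing stages onto the computation stage when no fine-grained configuration is given. A standalone sketch of that dict transformation (keys follow the source; values are illustrative, and the source's warning print is omitted):

```python
def fill_vram_config(vram_config):
    cfg = vram_config.copy()
    # onload and preparing fall back to the computation dtype/device
    cfg["onload_dtype"] = cfg["preparing_dtype"] = vram_config["computation_dtype"]
    cfg["onload_device"] = cfg["preparing_device"] = vram_config["computation_device"]
    return cfg

cfg = fill_vram_config({
    "offload_dtype": "float8", "offload_device": "cpu",
    "onload_dtype": None, "onload_device": None,
    "preparing_dtype": None, "preparing_device": None,
    "computation_dtype": "bfloat16", "computation_device": "cuda",
})
# cfg["onload_device"] == cfg["preparing_device"] == "cuda"
```

Only the offload settings keep their original values, so a coarse config still controls where idle weights live.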

View File

@@ -1 +0,0 @@
from .video import VideoData, save_video, save_frames

View File

@@ -0,0 +1,6 @@
from .flow_match import FlowMatchScheduler
from .training_module import DiffusionTrainingModule
from .logger import ModelLogger
from .runner import launch_training_task, launch_data_process_task
from .parsers import *
from .loss import *

View File

@@ -0,0 +1,500 @@
from PIL import Image
import torch
import numpy as np
from einops import repeat, reduce
from typing import Union
from ..core import AutoTorchModule, AutoWrappedLinear, load_state_dict, ModelConfig, parse_device_type
from ..core.device.npu_compatible_device import get_device_type
from ..utils.lora import GeneralLoRALoader
from ..models.model_loader import ModelPool
from ..utils.controlnet import ControlNetInput
from ..core.device import get_device_name, IS_NPU_AVAILABLE
class PipelineUnit:
def __init__(
self,
seperate_cfg: bool = False,
take_over: bool = False,
input_params: tuple[str] = None,
output_params: tuple[str] = None,
input_params_posi: dict[str, str] = None,
input_params_nega: dict[str, str] = None,
onload_model_names: tuple[str] = None
):
self.seperate_cfg = seperate_cfg
self.take_over = take_over
self.input_params = input_params
self.output_params = output_params
self.input_params_posi = input_params_posi
self.input_params_nega = input_params_nega
self.onload_model_names = onload_model_names
def fetch_input_params(self):
params = []
if self.input_params is not None:
for param in self.input_params:
params.append(param)
if self.input_params_posi is not None:
for _, param in self.input_params_posi.items():
params.append(param)
if self.input_params_nega is not None:
for _, param in self.input_params_nega.items():
params.append(param)
params = sorted(list(set(params)))
return params
def fetch_output_params(self):
params = []
if self.output_params is not None:
for param in self.output_params:
params.append(param)
return params
def process(self, pipe, **kwargs) -> dict:
return {}
def post_process(self, pipe, **kwargs) -> dict:
return {}
class BasePipeline(torch.nn.Module):
def __init__(
self,
device=get_device_type(), torch_dtype=torch.float16,
height_division_factor=64, width_division_factor=64,
time_division_factor=None, time_division_remainder=None,
):
super().__init__()
# The device and torch_dtype are used for the storage of intermediate variables, not models.
self.device = device
self.torch_dtype = torch_dtype
self.device_type = parse_device_type(device)
# The following parameters are used for shape check.
self.height_division_factor = height_division_factor
self.width_division_factor = width_division_factor
self.time_division_factor = time_division_factor
self.time_division_remainder = time_division_remainder
# VRAM management
self.vram_management_enabled = False
# Pipeline Unit Runner
self.unit_runner = PipelineUnitRunner()
# LoRA Loader
self.lora_loader = GeneralLoRALoader
def to(self, *args, **kwargs):
device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
if device is not None:
self.device = device
if dtype is not None:
self.torch_dtype = dtype
super().to(*args, **kwargs)
return self
def check_resize_height_width(self, height, width, num_frames=None, verbose=1):
# Shape check
if height % self.height_division_factor != 0:
height = (height + self.height_division_factor - 1) // self.height_division_factor * self.height_division_factor
if verbose > 0:
print(f"height % {self.height_division_factor} != 0. We round it up to {height}.")
if width % self.width_division_factor != 0:
width = (width + self.width_division_factor - 1) // self.width_division_factor * self.width_division_factor
if verbose > 0:
print(f"width % {self.width_division_factor} != 0. We round it up to {width}.")
if num_frames is None:
return height, width
else:
if num_frames % self.time_division_factor != self.time_division_remainder:
num_frames = (num_frames + self.time_division_factor - 1) // self.time_division_factor * self.time_division_factor + self.time_division_remainder
if verbose > 0:
print(f"num_frames % {self.time_division_factor} != {self.time_division_remainder}. We round it up to {num_frames}.")
return height, width, num_frames
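`check_resize_height_width` rounds spatial sizes up to the nearest multiple of the division factor, and frame counts up to the nearest value with the required remainder. The integer arithmetic, isolated as a standalone sketch:

```python
def round_up(x, factor):
    # smallest multiple of `factor` that is >= x
    return (x + factor - 1) // factor * factor

def round_up_frames(n, factor, remainder):
    # round n up so that n % factor == remainder (matches the source formula)
    if n % factor != remainder:
        n = round_up(n, factor) + remainder
    return n

round_up(720, 64)          # 768
round_up_frames(14, 4, 1)  # 17, since 17 % 4 == 1
```

Values that already satisfy the constraint pass through unchanged.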
def preprocess_image(self, image, torch_dtype=None, device=None, pattern="B C H W", min_value=-1, max_value=1):
# Transform a PIL.Image to torch.Tensor
image = torch.Tensor(np.array(image, dtype=np.float32))
image = image.to(dtype=torch_dtype or self.torch_dtype, device=device or self.device)
image = image * ((max_value - min_value) / 255) + min_value
image = repeat(image, f"H W C -> {pattern}", **({"B": 1} if "B" in pattern else {}))
return image
def preprocess_video(self, video, torch_dtype=None, device=None, pattern="B C T H W", min_value=-1, max_value=1):
# Transform a list of PIL.Image to torch.Tensor
video = [self.preprocess_image(image, torch_dtype=torch_dtype, device=device, min_value=min_value, max_value=max_value) for image in video]
video = torch.stack(video, dim=pattern.index("T") // 2)
return video
def vae_output_to_image(self, vae_output, pattern="B C H W", min_value=-1, max_value=1):
# Transform a torch.Tensor to PIL.Image
if pattern != "H W C":
vae_output = reduce(vae_output, f"{pattern} -> H W C", reduction="mean")
image = ((vae_output - min_value) * (255 / (max_value - min_value))).clip(0, 255)
image = image.to(device="cpu", dtype=torch.uint8)
image = Image.fromarray(image.numpy())
return image
def vae_output_to_video(self, vae_output, pattern="B C T H W", min_value=-1, max_value=1):
# Transform a torch.Tensor to list of PIL.Image
if pattern != "T H W C":
vae_output = reduce(vae_output, f"{pattern} -> T H W C", reduction="mean")
video = [self.vae_output_to_image(image, pattern="H W C", min_value=min_value, max_value=max_value) for image in vae_output]
return video
def output_audio_format_check(self, audio_output):
# Standard output format: [C, T]; output dtype: float32.
# Remove the batch dim if present.
if audio_output.ndim == 3:
audio_output = audio_output.squeeze(0)
return audio_output.float().cpu()
def load_models_to_device(self, model_names):
if self.vram_management_enabled:
# offload models
for name, model in self.named_children():
if name not in model_names:
if hasattr(model, "vram_management_enabled") and model.vram_management_enabled:
if hasattr(model, "offload"):
model.offload()
else:
for module in model.modules():
if hasattr(module, "offload"):
module.offload()
getattr(torch, self.device_type).empty_cache()
# onload models
for name, model in self.named_children():
if name in model_names:
if hasattr(model, "vram_management_enabled") and model.vram_management_enabled:
if hasattr(model, "onload"):
model.onload()
else:
for module in model.modules():
if hasattr(module, "onload"):
module.onload()
def generate_noise(self, shape, seed=None, rand_device="cpu", rand_torch_dtype=torch.float32, device=None, torch_dtype=None):
# Initialize Gaussian noise
generator = None if seed is None else torch.Generator(rand_device).manual_seed(seed)
noise = torch.randn(shape, generator=generator, device=rand_device, dtype=rand_torch_dtype)
noise = noise.to(dtype=torch_dtype or self.torch_dtype, device=device or self.device)
return noise
def get_vram(self):
device = self.device if not IS_NPU_AVAILABLE else get_device_name()
return getattr(torch, self.device_type).mem_get_info(device)[1] / (1024 ** 3)
def get_module(self, model, name):
if "." in name:
name, suffix = name[:name.index(".")], name[name.index(".") + 1:]
if name.isdigit():
return self.get_module(model[int(name)], suffix)
else:
return self.get_module(getattr(model, name), suffix)
else:
return getattr(model, name)
def freeze_except(self, model_names):
self.eval()
self.requires_grad_(False)
for name in model_names:
module = self.get_module(self, name)
if module is None:
print(f"No {name} module in the pipeline, so training cannot be enabled on it. If this occurs during the data processing stage, it is normal.")
continue
module.train()
module.requires_grad_(True)
def blend_with_mask(self, base, addition, mask):
return base * (1 - mask) + addition * mask
def step(self, scheduler, latents, progress_id, noise_pred, input_latents=None, inpaint_mask=None, **kwargs):
timestep = scheduler.timesteps[progress_id]
if inpaint_mask is not None:
noise_pred_expected = scheduler.return_to_timestep(scheduler.timesteps[progress_id], latents, input_latents)
noise_pred = self.blend_with_mask(noise_pred_expected, noise_pred, inpaint_mask)
latents_next = scheduler.step(noise_pred, timestep, latents)
return latents_next
def split_pipeline_units(self, model_names: list[str]):
return PipelineUnitGraph().split_pipeline_units(self.units, model_names)
def flush_vram_management_device(self, device):
for module in self.modules():
if isinstance(module, AutoTorchModule):
module.offload_device = device
module.onload_device = device
module.preparing_device = device
module.computation_device = device
def load_lora(
self,
module: torch.nn.Module,
lora_config: Union[ModelConfig, str] = None,
alpha=1,
hotload=None,
state_dict=None,
verbose=1,
):
if state_dict is None:
if isinstance(lora_config, str):
lora = load_state_dict(lora_config, torch_dtype=self.torch_dtype, device=self.device)
else:
lora_config.download_if_necessary()
lora = load_state_dict(lora_config.path, torch_dtype=self.torch_dtype, device=self.device)
else:
lora = state_dict
lora_loader = self.lora_loader(torch_dtype=self.torch_dtype, device=self.device)
lora = lora_loader.convert_state_dict(lora)
if hotload is None:
hotload = hasattr(module, "vram_management_enabled") and getattr(module, "vram_management_enabled")
if hotload:
if not (hasattr(module, "vram_management_enabled") and getattr(module, "vram_management_enabled")):
raise ValueError("VRAM Management is not enabled. LoRA hotloading is not supported.")
updated_num = 0
for _, module in module.named_modules():
if isinstance(module, AutoWrappedLinear):
name = module.name
lora_a_name = f'{name}.lora_A.weight'
lora_b_name = f'{name}.lora_B.weight'
if lora_a_name in lora and lora_b_name in lora:
updated_num += 1
module.lora_A_weights.append(lora[lora_a_name] * alpha)
module.lora_B_weights.append(lora[lora_b_name])
if verbose >= 1:
print(f"{updated_num} tensors are patched by LoRA. You can use `pipe.clear_lora()` to clear all LoRA layers.")
else:
lora_loader.fuse_lora_to_base_model(module, lora, alpha=alpha)
def clear_lora(self, verbose=1):
cleared_num = 0
for name, module in self.named_modules():
if isinstance(module, AutoWrappedLinear):
if hasattr(module, "lora_A_weights"):
if len(module.lora_A_weights) > 0:
cleared_num += 1
module.lora_A_weights.clear()
if hasattr(module, "lora_B_weights"):
module.lora_B_weights.clear()
if verbose >= 1:
print(f"{cleared_num} LoRA layers are cleared.")
def download_and_load_models(self, model_configs: list[ModelConfig] = [], vram_limit: float = None):
model_pool = ModelPool()
for model_config in model_configs:
model_config.download_if_necessary()
vram_config = model_config.vram_config()
vram_config["computation_dtype"] = vram_config["computation_dtype"] or self.torch_dtype
vram_config["computation_device"] = vram_config["computation_device"] or self.device
model_pool.auto_load_model(
model_config.path,
vram_config=vram_config,
vram_limit=vram_limit,
clear_parameters=model_config.clear_parameters,
state_dict=model_config.state_dict,
)
return model_pool
def check_vram_management_state(self):
vram_management_enabled = False
for module in self.children():
if hasattr(module, "vram_management_enabled") and getattr(module, "vram_management_enabled"):
vram_management_enabled = True
return vram_management_enabled
def cfg_guided_model_fn(self, model_fn, cfg_scale, inputs_shared, inputs_posi, inputs_nega, **inputs_others):
if inputs_shared.get("positive_only_lora", None) is not None:
self.clear_lora(verbose=0)
self.load_lora(self.dit, state_dict=inputs_shared["positive_only_lora"], verbose=0)
noise_pred_posi = model_fn(**inputs_posi, **inputs_shared, **inputs_others)
if cfg_scale != 1.0:
if inputs_shared.get("positive_only_lora", None) is not None:
self.clear_lora(verbose=0)
noise_pred_nega = model_fn(**inputs_nega, **inputs_shared, **inputs_others)
if isinstance(noise_pred_posi, tuple):
# Handle different latent output types separately, e.g., video and audio latents.
noise_pred = tuple(
n_nega + cfg_scale * (n_posi - n_nega)
for n_posi, n_nega in zip(noise_pred_posi, noise_pred_nega)
)
else:
noise_pred = noise_pred_nega + cfg_scale * (noise_pred_posi - noise_pred_nega)
else:
noise_pred = noise_pred_posi
return noise_pred
def compile_pipeline(self, mode: str = "default", dynamic: bool = True, fullgraph: bool = False, compile_models: list = None, **kwargs):
"""
Compile the pipeline with torch.compile. The models to be compiled are determined by the `compilable_models` attribute of the pipeline.
If a model has a `_repeated_blocks` attribute, those blocks are compiled with regional compilation; otherwise the whole model is compiled.
See https://docs.pytorch.org/docs/stable/generated/torch.compile.html#torch.compile for details about compilation arguments.
Args:
mode: The compilation mode passed to `torch.compile`; options are "default", "reduce-overhead", "max-autotune" and "max-autotune-no-cudagraphs". Defaults to "default".
dynamic: Whether to enable dynamic-shape compilation to support dynamic input shapes, passed to `torch.compile`. Defaults to True (recommended).
fullgraph: Whether to require full-graph compilation, passed to `torch.compile`. Defaults to False (recommended).
compile_models: The list of model names to compile. If None, the models in `pipeline.compilable_models` are compiled. Defaults to None.
**kwargs: Other arguments for `torch.compile`.
"""
compile_models = compile_models or getattr(self, "compilable_models", [])
if len(compile_models) == 0:
print("No compilable models in the pipeline. Skip compilation.")
return
for name in compile_models:
model = getattr(self, name, None)
if model is None:
print(f"Model '{name}' not found in the pipeline.")
continue
repeated_blocks = getattr(model, "_repeated_blocks", None)
# regional compilation for repeated blocks.
if repeated_blocks is not None:
for submod in model.modules():
if submod.__class__.__name__ in repeated_blocks:
submod.compile(mode=mode, dynamic=dynamic, fullgraph=fullgraph, **kwargs)
# compile the whole model.
else:
model.compile(mode=mode, dynamic=dynamic, fullgraph=fullgraph, **kwargs)
print(f"{name} is compiled with mode={mode}, dynamic={dynamic}, fullgraph={fullgraph}.")
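`cfg_guided_model_fn` above implements standard classifier-free guidance: the negative prediction is pushed toward the positive one by `cfg_scale`. The core combination, as a standalone sketch on plain lists:

```python
def cfg_combine(noise_posi, noise_nega, cfg_scale):
    # nega + scale * (posi - nega); scale == 1.0 returns posi exactly
    return [n + cfg_scale * (p - n) for p, n in zip(noise_posi, noise_nega)]

cfg_combine([1.0, 2.0], [0.0, 1.0], 1.0)  # [1.0, 2.0]
cfg_combine([1.0, 2.0], [0.0, 1.0], 3.0)  # [3.0, 4.0]
```

This is why the pipeline skips the negative forward pass entirely when `cfg_scale` is 1.0: the combination would return the positive prediction unchanged.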
class PipelineUnitGraph:
def __init__(self):
pass
def build_edges(self, units: list[PipelineUnit]):
# Establish dependencies between units
# to search for subsequent related computation units.
last_compute_unit_id = {}
edges = []
for unit_id, unit in enumerate(units):
for input_param in unit.fetch_input_params():
if input_param in last_compute_unit_id:
edges.append((last_compute_unit_id[input_param], unit_id))
for output_param in unit.fetch_output_params():
last_compute_unit_id[output_param] = unit_id
return edges
def build_chains(self, units: list[PipelineUnit]):
# Establish updating chains for each variable
# to track their computation process.
params = sum([unit.fetch_input_params() + unit.fetch_output_params() for unit in units], [])
params = sorted(list(set(params)))
chains = {param: [] for param in params}
for unit_id, unit in enumerate(units):
for param in unit.fetch_output_params():
chains[param].append(unit_id)
return chains
def search_direct_unit_ids(self, units: list[PipelineUnit], model_names: list[str]):
# Search for units that directly participate in the model's computation.
related_unit_ids = []
for unit_id, unit in enumerate(units):
for model_name in model_names:
if unit.onload_model_names is not None and model_name in unit.onload_model_names:
related_unit_ids.append(unit_id)
break
return related_unit_ids
def search_related_unit_ids(self, edges, start_unit_ids, direction="target"):
# Search for subsequent related computation units.
related_unit_ids = [unit_id for unit_id in start_unit_ids]
while True:
neighbors = []
for source, target in edges:
if direction == "target" and source in related_unit_ids and target not in related_unit_ids:
neighbors.append(target)
elif direction == "source" and source not in related_unit_ids and target in related_unit_ids:
neighbors.append(source)
neighbors = sorted(list(set(neighbors)))
if len(neighbors) == 0:
break
else:
related_unit_ids.extend(neighbors)
related_unit_ids = sorted(list(set(related_unit_ids)))
return related_unit_ids
def search_updating_unit_ids(self, units: list[PipelineUnit], chains, related_unit_ids):
# If the input parameters of this subgraph are updated outside the subgraph,
# search for the units where these updates occur.
first_compute_unit_id = {}
for unit_id in related_unit_ids:
for param in units[unit_id].fetch_input_params():
if param not in first_compute_unit_id:
first_compute_unit_id[param] = unit_id
updating_unit_ids = []
for param in first_compute_unit_id:
unit_id = first_compute_unit_id[param]
chain = chains[param]
if unit_id in chain and chain.index(unit_id) != len(chain) - 1:
for unit_id_ in chain[chain.index(unit_id) + 1:]:
if unit_id_ not in related_unit_ids:
updating_unit_ids.append(unit_id_)
related_unit_ids.extend(updating_unit_ids)
related_unit_ids = sorted(list(set(related_unit_ids)))
return related_unit_ids
def split_pipeline_units(self, units: list[PipelineUnit], model_names: list[str]):
# Split the computation graph,
# separating all model-related computations.
related_unit_ids = self.search_direct_unit_ids(units, model_names)
edges = self.build_edges(units)
chains = self.build_chains(units)
while True:
num_related_unit_ids = len(related_unit_ids)
related_unit_ids = self.search_related_unit_ids(edges, related_unit_ids, "target")
related_unit_ids = self.search_updating_unit_ids(units, chains, related_unit_ids)
if len(related_unit_ids) == num_related_unit_ids:
break
else:
num_related_unit_ids = len(related_unit_ids)
related_units = [units[i] for i in related_unit_ids]
unrelated_units = [units[i] for i in range(len(units)) if i not in related_unit_ids]
return related_units, unrelated_units
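`build_edges` links each unit to the most recent producer of every parameter it reads, which is what lets `split_pipeline_units` isolate all computation touching a given model. The dependency construction, reduced to a standalone sketch where `(inputs, outputs)` tuples stand in for `PipelineUnit` objects:

```python
def build_edges(units):
    # units: list of (input_params, output_params) tuples
    last_producer = {}
    edges = []
    for unit_id, (inputs, outputs) in enumerate(units):
        for param in inputs:
            if param in last_producer:
                edges.append((last_producer[param], unit_id))
        # outputs are recorded after inputs, so a unit that reads and
        # rewrites the same param links to the previous producer, not itself
        for param in outputs:
            last_producer[param] = unit_id
    return edges

units = [
    ((), ("prompt_emb",)),            # e.g. a text encoder
    (("prompt_emb",), ("latents",)),  # e.g. the denoising model
    (("latents",), ("image",)),       # e.g. the VAE decoder
]
build_edges(units)  # [(0, 1), (1, 2)]
```

The example unit names are illustrative; the real graph is built from the pipeline's declared `input_params`/`output_params`.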
class PipelineUnitRunner:
def __init__(self):
pass
def __call__(self, unit: PipelineUnit, pipe: BasePipeline, inputs_shared: dict, inputs_posi: dict, inputs_nega: dict) -> tuple[dict, dict]:
if unit.take_over:
# Let the pipeline unit take over this function.
inputs_shared, inputs_posi, inputs_nega = unit.process(pipe, inputs_shared=inputs_shared, inputs_posi=inputs_posi, inputs_nega=inputs_nega)
elif unit.seperate_cfg:
# Positive side
processor_inputs = {name: inputs_posi.get(name_) for name, name_ in unit.input_params_posi.items()}
if unit.input_params is not None:
for name in unit.input_params:
processor_inputs[name] = inputs_shared.get(name)
processor_outputs = unit.process(pipe, **processor_inputs)
inputs_posi.update(processor_outputs)
# Negative side
if inputs_shared["cfg_scale"] != 1:
processor_inputs = {name: inputs_nega.get(name_) for name, name_ in unit.input_params_nega.items()}
if unit.input_params is not None:
for name in unit.input_params:
processor_inputs[name] = inputs_shared.get(name)
processor_outputs = unit.process(pipe, **processor_inputs)
inputs_nega.update(processor_outputs)
else:
# CFG is disabled: mirror the positive-side outputs on the negative side.
inputs_nega.update(processor_outputs)
else:
processor_inputs = {name: inputs_shared.get(name) for name in unit.input_params}
processor_outputs = unit.process(pipe, **processor_inputs)
inputs_shared.update(processor_outputs)
return inputs_shared, inputs_posi, inputs_nega
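For units with `seperate_cfg`, the runner remaps per-side parameter names (a `{local_name: side_specific_name}` dict) and merges in shared parameters before calling `process`. The renaming step in isolation, with illustrative names and values:

```python
input_params_posi = {"prompt_emb": "prompt_emb_posi"}  # local name -> per-side key
input_params = ("latents",)                            # names read from shared inputs

inputs_posi = {"prompt_emb_posi": [0.1, 0.2], "other": 42}
inputs_shared = {"latents": [0.0]}

processor_inputs = {name: inputs_posi.get(key) for name, key in input_params_posi.items()}
for name in input_params:
    processor_inputs[name] = inputs_shared.get(name)
# processor_inputs == {"prompt_emb": [0.1, 0.2], "latents": [0.0]}
```

The negative side runs the same mapping with `input_params_nega` against `inputs_nega`.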

View File

@@ -0,0 +1,282 @@
import torch, math
from typing_extensions import Literal
class FlowMatchScheduler():
def __init__(self, template: Literal["FLUX.1", "Wan", "Qwen-Image", "FLUX.2", "Z-Image", "LTX-2", "Qwen-Image-Lightning", "ERNIE-Image", "ACE-Step", "JoyAI-Image"] = "FLUX.1"):
self.set_timesteps_fn = {
"FLUX.1": FlowMatchScheduler.set_timesteps_flux,
"Wan": FlowMatchScheduler.set_timesteps_wan,
"Qwen-Image": FlowMatchScheduler.set_timesteps_qwen_image,
"FLUX.2": FlowMatchScheduler.set_timesteps_flux2,
"Z-Image": FlowMatchScheduler.set_timesteps_z_image,
"LTX-2": FlowMatchScheduler.set_timesteps_ltx2,
"Qwen-Image-Lightning": FlowMatchScheduler.set_timesteps_qwen_image_lightning,
"ERNIE-Image": FlowMatchScheduler.set_timesteps_ernie_image,
"ACE-Step": FlowMatchScheduler.set_timesteps_ace_step,
"JoyAI-Image": FlowMatchScheduler.set_timesteps_joyai_image,
}.get(template, FlowMatchScheduler.set_timesteps_flux)
self.num_train_timesteps = 1000
@staticmethod
def set_timesteps_flux(num_inference_steps=100, denoising_strength=1.0, shift=None):
sigma_min = 0.003/1.002
sigma_max = 1.0
shift = 3 if shift is None else shift
num_train_timesteps = 1000
sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength
sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps)
sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
timesteps = sigmas * num_train_timesteps
return sigmas, timesteps
@staticmethod
def set_timesteps_wan(num_inference_steps=100, denoising_strength=1.0, shift=None):
sigma_min = 0.0
sigma_max = 1.0
shift = 5 if shift is None else shift
num_train_timesteps = 1000
sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength
sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps + 1)[:-1]
sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
timesteps = sigmas * num_train_timesteps
return sigmas, timesteps
@staticmethod
def _calculate_shift_qwen_image(image_seq_len, base_seq_len=256, max_seq_len=8192, base_shift=0.5, max_shift=0.9):
m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
b = base_shift - m * base_seq_len
mu = image_seq_len * m + b
return mu
@staticmethod
def set_timesteps_qwen_image(num_inference_steps=100, denoising_strength=1.0, exponential_shift_mu=None, dynamic_shift_len=None):
sigma_min = 0.0
sigma_max = 1.0
num_train_timesteps = 1000
shift_terminal = 0.02
# Sigmas
sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength
sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps + 1)[:-1]
# Mu
if exponential_shift_mu is not None:
mu = exponential_shift_mu
elif dynamic_shift_len is not None:
mu = FlowMatchScheduler._calculate_shift_qwen_image(dynamic_shift_len)
else:
mu = 0.8
sigmas = math.exp(mu) / (math.exp(mu) + (1 / sigmas - 1))
# Shift terminal
one_minus_z = 1 - sigmas
scale_factor = one_minus_z[-1] / (1 - shift_terminal)
sigmas = 1 - (one_minus_z / scale_factor)
# Timesteps
timesteps = sigmas * num_train_timesteps
return sigmas, timesteps
@staticmethod
def set_timesteps_qwen_image_lightning(num_inference_steps=100, denoising_strength=1.0, exponential_shift_mu=None, dynamic_shift_len=None):
sigma_min = 0.0
sigma_max = 1.0
num_train_timesteps = 1000
base_shift = math.log(3)
max_shift = math.log(3)
# Sigmas
sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength
sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps + 1)[:-1]
# Mu
if exponential_shift_mu is not None:
mu = exponential_shift_mu
elif dynamic_shift_len is not None:
mu = FlowMatchScheduler._calculate_shift_qwen_image(dynamic_shift_len, base_shift=base_shift, max_shift=max_shift)
else:
mu = 0.8
sigmas = math.exp(mu) / (math.exp(mu) + (1 / sigmas - 1))
# Timesteps
timesteps = sigmas * num_train_timesteps
return sigmas, timesteps
@staticmethod
def compute_empirical_mu(image_seq_len, num_steps):
a1, b1 = 8.73809524e-05, 1.89833333
a2, b2 = 0.00016927, 0.45666666
if image_seq_len > 4300:
mu = a2 * image_seq_len + b2
return float(mu)
m_200 = a2 * image_seq_len + b2
m_10 = a1 * image_seq_len + b1
a = (m_200 - m_10) / 190.0
b = m_200 - 200.0 * a
mu = a * num_steps + b
return float(mu)
@staticmethod
def set_timesteps_flux2(num_inference_steps=100, denoising_strength=1.0, dynamic_shift_len=None):
sigma_min = 1 / num_inference_steps
sigma_max = 1.0
num_train_timesteps = 1000
sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength
sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps)
if dynamic_shift_len is None:
# If you ask me why I set mu=0.8,
# I can only say that it yields better training results.
mu = 0.8
else:
mu = FlowMatchScheduler.compute_empirical_mu(dynamic_shift_len, num_inference_steps)
sigmas = math.exp(mu) / (math.exp(mu) + (1 / sigmas - 1))
timesteps = sigmas * num_train_timesteps
return sigmas, timesteps
@staticmethod
def set_timesteps_ernie_image(num_inference_steps=50, denoising_strength=1.0, shift=3.0):
sigma_min = 0.0
sigma_max = 1.0
num_train_timesteps = 1000
sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength
sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps + 1)[:-1]
if shift is not None and shift != 1.0:
sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
timesteps = sigmas * num_train_timesteps
return sigmas, timesteps
@staticmethod
def set_timesteps_ace_step(num_inference_steps=8, denoising_strength=1.0, shift=3.0):
"""ACE-Step flow-matching scheduler.
Sigmas run from 1.0 down towards 0.0; timesteps are the sigmas scaled by num_train_timesteps (1000).
Shift transformation: t = shift * t / (1 + (shift - 1) * t)
Args:
num_inference_steps: Number of diffusion steps.
denoising_strength: Denoising strength (1.0 = full denoising).
shift: Timestep shift parameter (default 3.0 for the turbo model).
"""
num_train_timesteps = 1000
sigma_start = denoising_strength
sigmas = torch.linspace(sigma_start, 0.0, num_inference_steps + 1)[:-1]
if shift is not None and shift != 1.0:
sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
timesteps = sigmas * num_train_timesteps
return sigmas, timesteps
@staticmethod
def set_timesteps_z_image(num_inference_steps=100, denoising_strength=1.0, shift=None, target_timesteps=None):
sigma_min = 0.0
sigma_max = 1.0
shift = 3 if shift is None else shift
num_train_timesteps = 1000
sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength
sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps + 1)[:-1]
sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
timesteps = sigmas * num_train_timesteps
if target_timesteps is not None:
target_timesteps = target_timesteps.to(dtype=timesteps.dtype, device=timesteps.device)
for timestep in target_timesteps:
timestep_id = torch.argmin((timesteps - timestep).abs())
timesteps[timestep_id] = timestep
return sigmas, timesteps
@staticmethod
def set_timesteps_joyai_image(num_inference_steps=100, denoising_strength=1.0, shift=None):
sigma_min = 0.0
sigma_max = 1.0
shift = 4.0 if shift is None else shift
num_train_timesteps = 1000
sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength
sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps + 1)[:-1]
sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
timesteps = sigmas * num_train_timesteps
return sigmas, timesteps
@staticmethod
def set_timesteps_ltx2(num_inference_steps=100, denoising_strength=1.0, dynamic_shift_len=None, terminal=0.1, special_case=None):
num_train_timesteps = 1000
if special_case == "stage2":
sigmas = torch.Tensor([0.909375, 0.725, 0.421875])
elif special_case == "ditilled_stage1":
sigmas = torch.Tensor([1.0, 0.99375, 0.9875, 0.98125, 0.975, 0.909375, 0.725, 0.421875])
else:
dynamic_shift_len = dynamic_shift_len or 4096
sigma_shift = FlowMatchScheduler._calculate_shift_qwen_image(
image_seq_len=dynamic_shift_len,
base_seq_len=1024,
max_seq_len=4096,
base_shift=0.95,
max_shift=2.05,
)
sigma_min = 0.0
sigma_max = 1.0
sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength
sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps + 1)[:-1]
sigmas = math.exp(sigma_shift) / (math.exp(sigma_shift) + (1 / sigmas - 1))
# Shift terminal
one_minus_z = 1.0 - sigmas
scale_factor = one_minus_z[-1] / (1 - terminal)
sigmas = 1.0 - (one_minus_z / scale_factor)
timesteps = sigmas * num_train_timesteps
return sigmas, timesteps
def set_training_weight(self):
steps = 1000
x = self.timesteps
y = torch.exp(-2 * ((x - steps / 2) / steps) ** 2)
y_shifted = y - y.min()
bsmntw_weighing = y_shifted * (steps / y_shifted.sum())
if len(self.timesteps) != 1000:
# This is an empirical formula.
bsmntw_weighing = bsmntw_weighing * (len(self.timesteps) / steps)
bsmntw_weighing = bsmntw_weighing + bsmntw_weighing[1]
self.linear_timesteps_weights = bsmntw_weighing
def set_timesteps(self, num_inference_steps=100, denoising_strength=1.0, training=False, **kwargs):
self.sigmas, self.timesteps = self.set_timesteps_fn(
num_inference_steps=num_inference_steps,
denoising_strength=denoising_strength,
**kwargs,
)
if training:
self.set_training_weight()
self.training = True
else:
self.training = False
def step(self, model_output, timestep, sample, to_final=False, **kwargs):
if isinstance(timestep, torch.Tensor):
timestep = timestep.cpu()
timestep_id = torch.argmin((self.timesteps - timestep).abs())
sigma = self.sigmas[timestep_id]
if to_final or timestep_id + 1 >= len(self.timesteps):
sigma_ = 0
else:
sigma_ = self.sigmas[timestep_id + 1]
prev_sample = sample + model_output * (sigma_ - sigma)
return prev_sample
def return_to_timestep(self, timestep, sample, sample_stablized):
if isinstance(timestep, torch.Tensor):
timestep = timestep.cpu()
timestep_id = torch.argmin((self.timesteps - timestep).abs())
sigma = self.sigmas[timestep_id]
model_output = (sample - sample_stablized) / sigma
return model_output
def add_noise(self, original_samples, noise, timestep):
if isinstance(timestep, torch.Tensor):
timestep = timestep.cpu()
timestep_id = torch.argmin((self.timesteps - timestep).abs())
sigma = self.sigmas[timestep_id]
sample = (1 - sigma) * original_samples + sigma * noise
return sample
def training_target(self, sample, noise, timestep):
target = noise - sample
return target
def training_weight(self, timestep):
timestep_id = torch.argmin((self.timesteps - timestep.to(self.timesteps.device)).abs())
weights = self.linear_timesteps_weights[timestep_id]
return weights
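Most of the templates above share the time-shift transform `sigma' = shift * sigma / (1 + (shift - 1) * sigma)`. A small pure-Python check (values chosen for illustration) shows what it does to a linear schedule:

```python
# The time-shift used by the FLUX.1 / Wan / ERNIE-Image / ACE-Step templates:
# sigma' = shift * sigma / (1 + (shift - 1) * sigma).
# It fixes the endpoints (0 -> 0, 1 -> 1) and pushes intermediate sigmas
# towards 1, so more inference steps are spent at high noise levels.

def shift_sigma(sigma, shift=3.0):
    return shift * sigma / (1 + (shift - 1) * sigma)

# Endpoints are preserved.
print(shift_sigma(0.0), shift_sigma(1.0))  # 0.0 1.0

# Intermediate values move up: 0.5 -> 0.75 with shift=3.
print(shift_sigma(0.5))  # 0.75

# A 5-step schedule analogous to set_timesteps_wan (linspace from 1 to 0, drop last).
n = 5
sigmas = [1.0 - i / n for i in range(n)]  # [1.0, 0.8, 0.6, 0.4, 0.2]
shifted = [shift_sigma(s) for s in sigmas]
timesteps = [1000 * s for s in shifted]
print([round(s, 4) for s in shifted])
```

The same transform appears in exponential form (`math.exp(mu) / (math.exp(mu) + (1 / sigmas - 1))`) in the Qwen-Image, FLUX.2, and LTX-2 templates, where `mu = log(shift)` plays the role of the shift parameter.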


@@ -0,0 +1,43 @@
import os, torch
from accelerate import Accelerator
class ModelLogger:
def __init__(self, output_path, remove_prefix_in_ckpt=None, state_dict_converter=lambda x:x):
self.output_path = output_path
self.remove_prefix_in_ckpt = remove_prefix_in_ckpt
self.state_dict_converter = state_dict_converter
self.num_steps = 0
def on_step_end(self, accelerator: Accelerator, model: torch.nn.Module, save_steps=None, **kwargs):
self.num_steps += 1
if save_steps is not None and self.num_steps % save_steps == 0:
self.save_model(accelerator, model, f"step-{self.num_steps}.safetensors")
def on_epoch_end(self, accelerator: Accelerator, model: torch.nn.Module, epoch_id):
self.save_model(accelerator, model, f"epoch-{epoch_id}.safetensors")
def on_training_end(self, accelerator: Accelerator, model: torch.nn.Module, save_steps=None):
if save_steps is not None and self.num_steps % save_steps != 0:
self.save_model(accelerator, model, f"step-{self.num_steps}.safetensors")
def save_model(self, accelerator: Accelerator, model: torch.nn.Module, file_name):
accelerator.wait_for_everyone()
state_dict = accelerator.get_state_dict(model)
if accelerator.is_main_process:
state_dict = accelerator.unwrap_model(model).export_trainable_state_dict(state_dict, remove_prefix=self.remove_prefix_in_ckpt)
state_dict = self.state_dict_converter(state_dict)
os.makedirs(self.output_path, exist_ok=True)
path = os.path.join(self.output_path, file_name)
accelerator.save(state_dict, path, safe_serialization=True)
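Before saving, `ModelLogger` strips `remove_prefix_in_ckpt` from the checkpoint keys and then applies `state_dict_converter`. A minimal sketch of that key manipulation, independent of accelerate (plain strings stand in for tensors; `export_state_dict` is a hypothetical helper, not part of the module):

```python
# Sketch of what remove_prefix_in_ckpt and state_dict_converter do to the
# checkpoint keys before saving (plain strings stand in for tensors).

def export_state_dict(state_dict, remove_prefix=None, converter=lambda x: x):
    if remove_prefix is not None:
        state_dict = {
            (name[len(remove_prefix):] if name.startswith(remove_prefix) else name): param
            for name, param in state_dict.items()
        }
    # The converter runs last, after prefix removal, matching ModelLogger's order.
    return converter(state_dict)

raw = {"pipe.dit.blocks.0.q.weight": "w0", "pipe.dit.blocks.0.k.weight": "w1"}
exported = export_state_dict(raw, remove_prefix="pipe.dit.")
print(sorted(exported))  # ['blocks.0.k.weight', 'blocks.0.q.weight']

# A converter can then rename keys for another loader, e.g. prepend a namespace.
namespaced = export_state_dict(
    raw, remove_prefix="pipe.dit.",
    converter=lambda sd: {f"diffusion_model.{k}": v for k, v in sd.items()},
)
print(sorted(namespaced)[0])  # diffusion_model.blocks.0.k.weight
```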

158
diffsynth/diffusion/loss.py Normal file

@@ -0,0 +1,158 @@
from .base_pipeline import BasePipeline
import torch
def FlowMatchSFTLoss(pipe: BasePipeline, **inputs):
max_timestep_boundary = int(inputs.get("max_timestep_boundary", 1) * len(pipe.scheduler.timesteps))
min_timestep_boundary = int(inputs.get("min_timestep_boundary", 0) * len(pipe.scheduler.timesteps))
timestep_id = torch.randint(min_timestep_boundary, max_timestep_boundary, (1,))
timestep = pipe.scheduler.timesteps[timestep_id].to(dtype=pipe.torch_dtype, device=pipe.device)
noise = torch.randn_like(inputs["input_latents"])
inputs["latents"] = pipe.scheduler.add_noise(inputs["input_latents"], noise, timestep)
training_target = pipe.scheduler.training_target(inputs["input_latents"], noise, timestep)
if "first_frame_latents" in inputs:
inputs["latents"][:, :, 0:1] = inputs["first_frame_latents"]
models = {name: getattr(pipe, name) for name in pipe.in_iteration_models}
noise_pred = pipe.model_fn(**models, **inputs, timestep=timestep)
if "first_frame_latents" in inputs:
noise_pred = noise_pred[:, :, 1:]
training_target = training_target[:, :, 1:]
loss = torch.nn.functional.mse_loss(noise_pred.float(), training_target.float())
loss = loss * pipe.scheduler.training_weight(timestep)
return loss
def FlowMatchSFTAudioVideoLoss(pipe: BasePipeline, **inputs):
max_timestep_boundary = int(inputs.get("max_timestep_boundary", 1) * len(pipe.scheduler.timesteps))
min_timestep_boundary = int(inputs.get("min_timestep_boundary", 0) * len(pipe.scheduler.timesteps))
timestep_id = torch.randint(min_timestep_boundary, max_timestep_boundary, (1,))
timestep = pipe.scheduler.timesteps[timestep_id].to(dtype=pipe.torch_dtype, device=pipe.device)
# video
noise = torch.randn_like(inputs["input_latents"])
inputs["video_latents"] = pipe.scheduler.add_noise(inputs["input_latents"], noise, timestep)
training_target = pipe.scheduler.training_target(inputs["input_latents"], noise, timestep)
# audio
if inputs.get("audio_input_latents") is not None:
audio_noise = torch.randn_like(inputs["audio_input_latents"])
inputs["audio_latents"] = pipe.scheduler.add_noise(inputs["audio_input_latents"], audio_noise, timestep)
training_target_audio = pipe.scheduler.training_target(inputs["audio_input_latents"], audio_noise, timestep)
models = {name: getattr(pipe, name) for name in pipe.in_iteration_models}
noise_pred, noise_pred_audio = pipe.model_fn(**models, **inputs, timestep=timestep)
loss = torch.nn.functional.mse_loss(noise_pred.float(), training_target.float())
loss = loss * pipe.scheduler.training_weight(timestep)
if inputs.get("audio_input_latents") is not None:
loss_audio = torch.nn.functional.mse_loss(noise_pred_audio.float(), training_target_audio.float())
loss_audio = loss_audio * pipe.scheduler.training_weight(timestep)
loss = loss + loss_audio
return loss
def DirectDistillLoss(pipe: BasePipeline, **inputs):
pipe.scheduler.set_timesteps(inputs["num_inference_steps"])
pipe.scheduler.training = True
models = {name: getattr(pipe, name) for name in pipe.in_iteration_models}
for progress_id, timestep in enumerate(pipe.scheduler.timesteps):
timestep = timestep.unsqueeze(0).to(dtype=pipe.torch_dtype, device=pipe.device)
noise_pred = pipe.model_fn(**models, **inputs, timestep=timestep, progress_id=progress_id)
inputs["latents"] = pipe.step(pipe.scheduler, progress_id=progress_id, noise_pred=noise_pred, **inputs)
loss = torch.nn.functional.mse_loss(inputs["latents"].float(), inputs["input_latents"].float())
return loss
class TrajectoryImitationLoss(torch.nn.Module):
def __init__(self):
super().__init__()
self.initialized = False
def initialize(self, device):
import lpips # TODO: remove it
self.loss_fn = lpips.LPIPS(net='alex').to(device)
self.initialized = True
def fetch_trajectory(self, pipe: BasePipeline, timesteps_student, inputs_shared, inputs_posi, inputs_nega, num_inference_steps, cfg_scale):
trajectory = [inputs_shared["latents"].clone()]
pipe.scheduler.set_timesteps(num_inference_steps, target_timesteps=timesteps_student)
models = {name: getattr(pipe, name) for name in pipe.in_iteration_models}
for progress_id, timestep in enumerate(pipe.scheduler.timesteps):
timestep = timestep.unsqueeze(0).to(dtype=pipe.torch_dtype, device=pipe.device)
noise_pred = pipe.cfg_guided_model_fn(
pipe.model_fn, cfg_scale,
inputs_shared, inputs_posi, inputs_nega,
**models, timestep=timestep, progress_id=progress_id
)
inputs_shared["latents"] = pipe.step(pipe.scheduler, progress_id=progress_id, noise_pred=noise_pred.detach(), **inputs_shared)
trajectory.append(inputs_shared["latents"].clone())
return pipe.scheduler.timesteps, trajectory
def align_trajectory(self, pipe: BasePipeline, timesteps_teacher, trajectory_teacher, inputs_shared, inputs_posi, inputs_nega, num_inference_steps, cfg_scale):
loss = 0
pipe.scheduler.set_timesteps(num_inference_steps, training=True)
models = {name: getattr(pipe, name) for name in pipe.in_iteration_models}
for progress_id, timestep in enumerate(pipe.scheduler.timesteps):
timestep = timestep.unsqueeze(0).to(dtype=pipe.torch_dtype, device=pipe.device)
progress_id_teacher = torch.argmin((timesteps_teacher - timestep).abs())
inputs_shared["latents"] = trajectory_teacher[progress_id_teacher]
noise_pred = pipe.cfg_guided_model_fn(
pipe.model_fn, cfg_scale,
inputs_shared, inputs_posi, inputs_nega,
**models, timestep=timestep, progress_id=progress_id
)
sigma = pipe.scheduler.sigmas[progress_id]
sigma_ = 0 if progress_id + 1 >= len(pipe.scheduler.timesteps) else pipe.scheduler.sigmas[progress_id + 1]
if progress_id + 1 >= len(pipe.scheduler.timesteps):
latents_ = trajectory_teacher[-1]
else:
progress_id_teacher = torch.argmin((timesteps_teacher - pipe.scheduler.timesteps[progress_id + 1]).abs())
latents_ = trajectory_teacher[progress_id_teacher]
denom = sigma_ - sigma
denom = torch.sign(denom) * torch.clamp(denom.abs(), min=1e-6)
target = (latents_ - inputs_shared["latents"]) / denom
loss = loss + torch.nn.functional.mse_loss(noise_pred.float(), target.float()) * pipe.scheduler.training_weight(timestep)
return loss
def compute_regularization(self, pipe: BasePipeline, trajectory_teacher, inputs_shared, inputs_posi, inputs_nega, num_inference_steps, cfg_scale):
inputs_shared["latents"] = trajectory_teacher[0]
pipe.scheduler.set_timesteps(num_inference_steps)
models = {name: getattr(pipe, name) for name in pipe.in_iteration_models}
for progress_id, timestep in enumerate(pipe.scheduler.timesteps):
timestep = timestep.unsqueeze(0).to(dtype=pipe.torch_dtype, device=pipe.device)
noise_pred = pipe.cfg_guided_model_fn(
pipe.model_fn, cfg_scale,
inputs_shared, inputs_posi, inputs_nega,
**models, timestep=timestep, progress_id=progress_id
)
inputs_shared["latents"] = pipe.step(pipe.scheduler, progress_id=progress_id, noise_pred=noise_pred.detach(), **inputs_shared)
image_pred = pipe.vae_decoder(inputs_shared["latents"])
image_real = pipe.vae_decoder(trajectory_teacher[-1])
loss = self.loss_fn(image_pred.float(), image_real.float())
return loss
def forward(self, pipe: BasePipeline, inputs_shared, inputs_posi, inputs_nega):
if not self.initialized:
self.initialize(pipe.device)
with torch.no_grad():
pipe.scheduler.set_timesteps(8)
timesteps_teacher, trajectory_teacher = self.fetch_trajectory(inputs_shared["teacher"], pipe.scheduler.timesteps, inputs_shared, inputs_posi, inputs_nega, 50, 2)
timesteps_teacher = timesteps_teacher.to(dtype=pipe.torch_dtype, device=pipe.device)
loss_1 = self.align_trajectory(pipe, timesteps_teacher, trajectory_teacher, inputs_shared, inputs_posi, inputs_nega, 8, 1)
loss_2 = self.compute_regularization(pipe, trajectory_teacher, inputs_shared, inputs_posi, inputs_nega, 8, 1)
loss = loss_1 + loss_2
return loss
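The losses above lean on three scheduler identities: `add_noise` mixes `x_sigma = (1 - sigma) * x0 + sigma * noise`, `training_target` is `noise - x0`, and `step` integrates `x' = x + v * (sigma' - sigma)`. A one-dimensional pure-Python check (illustrative values) confirms that a perfect prediction integrated from `sigma` down to 0 recovers the clean sample:

```python
# The flow-matching identities used by FlowMatchSFTLoss, in one dimension:
# add_noise:        x_sigma = (1 - sigma) * x0 + sigma * noise
# training_target:  v       = noise - x0
# scheduler.step:   x'      = x + v * (sigma' - sigma)

x0, noise, sigma = 2.0, -1.0, 0.25

x_sigma = (1 - sigma) * x0 + sigma * noise   # 1.25
v = noise - x0                               # -3.0

# One Euler step from sigma to sigma' = 0, as in FlowMatchScheduler.step:
x_final = x_sigma + v * (0.0 - sigma)
print(x_final)  # 2.0 (== x0)
```

This is why the MSE in `FlowMatchSFTLoss` is taken against `noise - x0` rather than against the noise itself.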


@@ -0,0 +1,70 @@
import argparse
def add_dataset_base_config(parser: argparse.ArgumentParser):
parser.add_argument("--dataset_base_path", type=str, required=True, help="Base path of the dataset.")
parser.add_argument("--dataset_metadata_path", type=str, default=None, help="Path to the metadata file of the dataset.")
parser.add_argument("--dataset_repeat", type=int, default=1, help="Number of times to repeat the dataset per epoch.")
parser.add_argument("--dataset_num_workers", type=int, default=0, help="Number of workers for data loading.")
parser.add_argument("--data_file_keys", type=str, default="image,video", help="Data file keys in the metadata. Comma-separated.")
return parser
def add_image_size_config(parser: argparse.ArgumentParser):
parser.add_argument("--height", type=int, default=None, help="Height of images. Leave `height` and `width` empty to enable dynamic resolution.")
parser.add_argument("--width", type=int, default=None, help="Width of images. Leave `height` and `width` empty to enable dynamic resolution.")
parser.add_argument("--max_pixels", type=int, default=1024*1024, help="Maximum number of pixels per frame, used for dynamic resolution.")
return parser
def add_video_size_config(parser: argparse.ArgumentParser):
parser.add_argument("--height", type=int, default=None, help="Height of images. Leave `height` and `width` empty to enable dynamic resolution.")
parser.add_argument("--width", type=int, default=None, help="Width of images. Leave `height` and `width` empty to enable dynamic resolution.")
parser.add_argument("--max_pixels", type=int, default=1024*1024, help="Maximum number of pixels per frame, used for dynamic resolution.")
parser.add_argument("--num_frames", type=int, default=81, help="Number of frames per video. Frames are sampled from the video prefix.")
return parser
def add_model_config(parser: argparse.ArgumentParser):
parser.add_argument("--model_paths", type=str, default=None, help="Paths to load models. In JSON format.")
parser.add_argument("--model_id_with_origin_paths", type=str, default=None, help="Model ID with origin paths, e.g., Wan-AI/Wan2.1-T2V-1.3B:diffusion_pytorch_model*.safetensors. Comma-separated.")
parser.add_argument("--extra_inputs", default=None, help="Additional model inputs, comma-separated.")
parser.add_argument("--fp8_models", default=None, help="Models with FP8 precision, comma-separated.")
parser.add_argument("--offload_models", default=None, help="Models to offload, comma-separated. Only used in split training.")
return parser
def add_training_config(parser: argparse.ArgumentParser):
parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate.")
parser.add_argument("--num_epochs", type=int, default=1, help="Number of epochs.")
parser.add_argument("--trainable_models", type=str, default=None, help="Models to train, e.g., dit, vae, text_encoder.")
parser.add_argument("--find_unused_parameters", default=False, action="store_true", help="Whether to find unused parameters in DDP.")
parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay.")
parser.add_argument("--task", type=str, default="sft", required=False, help="Task type.")
return parser
def add_output_config(parser: argparse.ArgumentParser):
parser.add_argument("--output_path", type=str, default="./models", help="Output save path.")
parser.add_argument("--remove_prefix_in_ckpt", type=str, default="pipe.dit.", help="Remove prefix in ckpt.")
parser.add_argument("--save_steps", type=int, default=None, help="Interval (in steps) between checkpoint saves. If None, checkpoints are saved every epoch.")
return parser
def add_lora_config(parser: argparse.ArgumentParser):
parser.add_argument("--lora_base_model", type=str, default=None, help="Which model LoRA is added to.")
parser.add_argument("--lora_target_modules", type=str, default="q,k,v,o,ffn.0,ffn.2", help="Which layers LoRA is added to.")
parser.add_argument("--lora_rank", type=int, default=32, help="Rank of LoRA.")
parser.add_argument("--lora_checkpoint", type=str, default=None, help="Path to the LoRA checkpoint. If provided, LoRA will be loaded from this checkpoint.")
parser.add_argument("--preset_lora_path", type=str, default=None, help="Path to the preset LoRA checkpoint. If provided, this LoRA will be fused to the base model.")
parser.add_argument("--preset_lora_model", type=str, default=None, help="Which model the preset LoRA is fused to.")
return parser
def add_gradient_config(parser: argparse.ArgumentParser):
parser.add_argument("--use_gradient_checkpointing", default=False, action="store_true", help="Whether to use gradient checkpointing.")
parser.add_argument("--use_gradient_checkpointing_offload", default=False, action="store_true", help="Whether to offload gradient checkpointing to CPU memory.")
parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Gradient accumulation steps.")
return parser
def add_general_config(parser: argparse.ArgumentParser):
parser = add_dataset_base_config(parser)
parser = add_model_config(parser)
parser = add_training_config(parser)
parser = add_output_config(parser)
parser = add_lora_config(parser)
parser = add_gradient_config(parser)
return parser
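The functions above compose into a single parser via `add_general_config`. A standalone sketch that re-declares a few of the same flags (so the snippet runs on its own, without importing this module):

```python
import argparse

# Self-contained sketch mirroring a few of the flags declared above,
# re-declared here so the snippet runs without importing the module.
parser = argparse.ArgumentParser()
parser.add_argument("--dataset_base_path", type=str, required=True)
parser.add_argument("--learning_rate", type=float, default=1e-4)
parser.add_argument("--lora_rank", type=int, default=32)
parser.add_argument("--use_gradient_checkpointing", default=False, action="store_true")

args = parser.parse_args([
    "--dataset_base_path", "data/",
    "--lora_rank", "16",
    "--use_gradient_checkpointing",
])
print(args.lora_rank, args.learning_rate, args.use_gradient_checkpointing)
# 16 0.0001 True
```

Omitting `--dataset_base_path` raises a parse error, since it is the one required flag in the sketch, just as in `add_dataset_base_config`.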


@@ -0,0 +1,88 @@
import os, torch
from tqdm import tqdm
from accelerate import Accelerator
from .training_module import DiffusionTrainingModule
from .logger import ModelLogger
def launch_training_task(
accelerator: Accelerator,
dataset: torch.utils.data.Dataset,
model: DiffusionTrainingModule,
model_logger: ModelLogger,
learning_rate: float = 1e-5,
weight_decay: float = 1e-2,
num_workers: int = 1,
save_steps: int = None,
num_epochs: int = 1,
args = None,
):
if args is not None:
learning_rate = args.learning_rate
weight_decay = args.weight_decay
num_workers = args.dataset_num_workers
save_steps = args.save_steps
num_epochs = args.num_epochs
optimizer = torch.optim.AdamW(model.trainable_modules(), lr=learning_rate, weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.ConstantLR(optimizer)
dataloader = torch.utils.data.DataLoader(dataset, shuffle=True, collate_fn=lambda x: x[0], num_workers=num_workers)
model.to(device=accelerator.device)
model, optimizer, dataloader, scheduler = accelerator.prepare(model, optimizer, dataloader, scheduler)
initialize_deepspeed_gradient_checkpointing(accelerator)
for epoch_id in range(num_epochs):
for data in tqdm(dataloader):
with accelerator.accumulate(model):
if dataset.load_from_cache:
loss = model({}, inputs=data)
else:
loss = model(data)
accelerator.backward(loss)
optimizer.step()
scheduler.step()
optimizer.zero_grad()
model_logger.on_step_end(accelerator, model, save_steps, loss=loss)
if save_steps is None:
model_logger.on_epoch_end(accelerator, model, epoch_id)
model_logger.on_training_end(accelerator, model, save_steps)
def launch_data_process_task(
accelerator: Accelerator,
dataset: torch.utils.data.Dataset,
model: DiffusionTrainingModule,
model_logger: ModelLogger,
num_workers: int = 8,
args = None,
):
if args is not None:
num_workers = args.dataset_num_workers
dataloader = torch.utils.data.DataLoader(dataset, shuffle=False, collate_fn=lambda x: x[0], num_workers=num_workers)
model.to(device=accelerator.device)
model, dataloader = accelerator.prepare(model, dataloader)
for data_id, data in enumerate(tqdm(dataloader)):
with accelerator.accumulate(model):
with torch.no_grad():
folder = os.path.join(model_logger.output_path, str(accelerator.process_index))
os.makedirs(folder, exist_ok=True)
save_path = os.path.join(model_logger.output_path, str(accelerator.process_index), f"{data_id}.pth")
data = model(data)
torch.save(data, save_path)
def initialize_deepspeed_gradient_checkpointing(accelerator: Accelerator):
if getattr(accelerator.state, "deepspeed_plugin", None) is not None:
ds_config = accelerator.state.deepspeed_plugin.deepspeed_config
if "activation_checkpointing" in ds_config:
import deepspeed
act_config = ds_config["activation_checkpointing"]
deepspeed.checkpointing.configure(
mpu_=None,
partition_activations=act_config.get("partition_activations", False),
checkpoint_in_cpu=act_config.get("cpu_checkpointing", False),
contiguous_checkpointing=act_config.get("contiguous_memory_optimization", False)
)
else:
print("No activation_checkpointing section found in the DeepSpeed config; skipping DeepSpeed gradient checkpointing initialization.")
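`initialize_deepspeed_gradient_checkpointing` reads the `activation_checkpointing` section of the DeepSpeed config and forwards the three keys it consumes to `deepspeed.checkpointing.configure`. A config fragment with those keys might look like this (values illustrative):

```json
{
  "activation_checkpointing": {
    "partition_activations": true,
    "cpu_checkpointing": true,
    "contiguous_memory_optimization": false
  }
}
```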


@@ -0,0 +1,302 @@
import torch, json, os, inspect
from ..core import ModelConfig, load_state_dict
from ..utils.controlnet import ControlNetInput
from .base_pipeline import PipelineUnit
from peft import LoraConfig, inject_adapter_in_model
class GeneralUnit_RemoveCache(PipelineUnit):
def __init__(self, required_params=tuple(), force_remove_params_shared=tuple(), force_remove_params_posi=tuple(), force_remove_params_nega=tuple()):
super().__init__(take_over=True)
self.required_params = required_params
self.force_remove_params_shared = force_remove_params_shared
self.force_remove_params_posi = force_remove_params_posi
self.force_remove_params_nega = force_remove_params_nega
def process_params(self, inputs, required_params, force_remove_params):
inputs_ = {}
for name, param in inputs.items():
if name in required_params and name not in force_remove_params:
inputs_[name] = param
return inputs_
def process(self, pipe, inputs_shared, inputs_posi, inputs_nega):
inputs_shared = self.process_params(inputs_shared, self.required_params, self.force_remove_params_shared)
inputs_posi = self.process_params(inputs_posi, self.required_params, self.force_remove_params_posi)
inputs_nega = self.process_params(inputs_nega, self.required_params, self.force_remove_params_nega)
return inputs_shared, inputs_posi, inputs_nega
class DiffusionTrainingModule(torch.nn.Module):
def __init__(self):
super().__init__()
def to(self, *args, **kwargs):
for name, model in self.named_children():
model.to(*args, **kwargs)
return self
def trainable_modules(self):
trainable_modules = filter(lambda p: p.requires_grad, self.parameters())
return trainable_modules
def trainable_param_names(self):
trainable_param_names = list(filter(lambda named_param: named_param[1].requires_grad, self.named_parameters()))
trainable_param_names = set([named_param[0] for named_param in trainable_param_names])
return trainable_param_names
def add_lora_to_model(self, model, target_modules, lora_rank, lora_alpha=None, upcast_dtype=None):
if lora_alpha is None:
lora_alpha = lora_rank
if isinstance(target_modules, list) and len(target_modules) == 1:
target_modules = target_modules[0]
lora_config = LoraConfig(r=lora_rank, lora_alpha=lora_alpha, target_modules=target_modules)
model = inject_adapter_in_model(lora_config, model)
if upcast_dtype is not None:
for param in model.parameters():
if param.requires_grad:
param.data = param.to(upcast_dtype)
return model
def mapping_lora_state_dict(self, state_dict):
new_state_dict = {}
for key, value in state_dict.items():
if "lora_A.weight" in key or "lora_B.weight" in key:
new_key = key.replace("lora_A.weight", "lora_A.default.weight").replace("lora_B.weight", "lora_B.default.weight")
new_state_dict[new_key] = value
elif "lora_A.default.weight" in key or "lora_B.default.weight" in key:
new_state_dict[key] = value
return new_state_dict
def export_trainable_state_dict(self, state_dict, remove_prefix=None):
trainable_param_names = self.trainable_param_names()
state_dict = {name: param for name, param in state_dict.items() if name in trainable_param_names}
if remove_prefix is not None:
state_dict_ = {}
for name, param in state_dict.items():
if name.startswith(remove_prefix):
name = name[len(remove_prefix):]
state_dict_[name] = param
state_dict = state_dict_
return state_dict
def transfer_data_to_device(self, data, device, torch_float_dtype=None):
if data is None:
return data
elif isinstance(data, torch.Tensor):
data = data.to(device)
if torch_float_dtype is not None and data.dtype in [torch.float, torch.float16, torch.bfloat16]:
data = data.to(torch_float_dtype)
return data
elif isinstance(data, tuple):
data = tuple(self.transfer_data_to_device(x, device, torch_float_dtype) for x in data)
return data
elif isinstance(data, list):
data = list(self.transfer_data_to_device(x, device, torch_float_dtype) for x in data)
return data
elif isinstance(data, dict):
data = {i: self.transfer_data_to_device(data[i], device, torch_float_dtype) for i in data}
return data
else:
return data
def parse_vram_config(self, fp8=False, offload=False, device="cpu"):
if fp8:
return {
"offload_dtype": torch.float8_e4m3fn,
"offload_device": device,
"onload_dtype": torch.float8_e4m3fn,
"onload_device": device,
"preparing_dtype": torch.float8_e4m3fn,
"preparing_device": device,
"computation_dtype": torch.bfloat16,
"computation_device": device,
}
elif offload:
return {
"offload_dtype": "disk",
"offload_device": "disk",
"onload_dtype": "disk",
"onload_device": "disk",
"preparing_dtype": torch.bfloat16,
"preparing_device": device,
"computation_dtype": torch.bfloat16,
"computation_device": device,
"clear_parameters": True,
}
else:
return {}
def parse_model_configs(self, model_paths, model_id_with_origin_paths, fp8_models=None, offload_models=None, device="cpu"):
fp8_models = [] if fp8_models is None else fp8_models.split(",")
offload_models = [] if offload_models is None else offload_models.split(",")
model_configs = []
if model_paths is not None:
model_paths = json.loads(model_paths)
for path in model_paths:
vram_config = self.parse_vram_config(
fp8=path in fp8_models,
offload=path in offload_models,
device=device
)
model_configs.append(ModelConfig(path=path, **vram_config))
if model_id_with_origin_paths is not None:
model_id_with_origin_paths = model_id_with_origin_paths.split(",")
for model_id_with_origin_path in model_id_with_origin_paths:
vram_config = self.parse_vram_config(
fp8=model_id_with_origin_path in fp8_models,
offload=model_id_with_origin_path in offload_models,
device=device
)
config = self.parse_path_or_model_id(model_id_with_origin_path)
model_configs.append(ModelConfig(model_id=config.model_id, origin_file_pattern=config.origin_file_pattern, **vram_config))
return model_configs
def parse_path_or_model_id(self, model_id_with_origin_path, default_value=None):
if model_id_with_origin_path is None:
return default_value
elif os.path.exists(model_id_with_origin_path):
return ModelConfig(path=model_id_with_origin_path)
else:
if ":" not in model_id_with_origin_path:
raise ValueError(f"Failed to parse model config: {model_id_with_origin_path}. This is neither a valid path nor in the format of `model_id:origin_file_pattern`.")
split_id = model_id_with_origin_path.rfind(":")
model_id = model_id_with_origin_path[:split_id]
origin_file_pattern = model_id_with_origin_path[split_id + 1:]
return ModelConfig(model_id=model_id, origin_file_pattern=origin_file_pattern)
def auto_detect_lora_target_modules(
self,
model: torch.nn.Module,
search_for_linear=False,
linear_detector=lambda x: min(x.weight.shape) >= 512,
block_list_detector=lambda x: isinstance(x, torch.nn.ModuleList) and len(x) > 1,
name_prefix="",
):
lora_target_modules = []
if search_for_linear:
for name, module in model.named_modules():
module_name = f"{name_prefix}.{name}" if name_prefix else name
if isinstance(module, torch.nn.Linear) and linear_detector(module):
lora_target_modules.append(module_name)
else:
for name, module in model.named_children():
module_name = f"{name_prefix}.{name}" if name_prefix else name
lora_target_modules += self.auto_detect_lora_target_modules(
module,
search_for_linear=block_list_detector(module),
linear_detector=linear_detector,
block_list_detector=block_list_detector,
name_prefix=module_name,
)
return lora_target_modules
def parse_lora_target_modules(self, model, lora_target_modules):
if lora_target_modules == "":
print("No LoRA target modules specified. The framework will automatically search for them.")
lora_target_modules = self.auto_detect_lora_target_modules(model)
print(f"LoRA will be patched at {lora_target_modules}.")
else:
lora_target_modules = lora_target_modules.split(",")
return lora_target_modules
def switch_pipe_to_training_mode(
self,
pipe,
trainable_models=None,
lora_base_model=None, lora_target_modules="", lora_rank=32, lora_checkpoint=None,
preset_lora_path=None, preset_lora_model=None,
task="sft",
):
# Scheduler
pipe.scheduler.set_timesteps(1000, training=True)
# Freeze untrainable models
pipe.freeze_except([] if trainable_models is None else trainable_models.split(","))
# Preset LoRA
if preset_lora_path is not None:
pipe.load_lora(getattr(pipe, preset_lora_model), preset_lora_path)
# FP8
# FP8 relies on a model-specific memory management scheme.
# It is delegated to the subclass.
# Add LoRA to the base models
if lora_base_model is not None and not task.endswith(":data_process"):
if (not hasattr(pipe, lora_base_model)) or getattr(pipe, lora_base_model) is None:
print(f"No {lora_base_model} model in the pipeline, so LoRA cannot be patched onto it. If this occurs during the data processing stage, it is expected.")
return
model = self.add_lora_to_model(
getattr(pipe, lora_base_model),
target_modules=self.parse_lora_target_modules(getattr(pipe, lora_base_model), lora_target_modules),
lora_rank=lora_rank,
upcast_dtype=pipe.torch_dtype,
)
if lora_checkpoint is not None:
state_dict = load_state_dict(lora_checkpoint)
state_dict = self.mapping_lora_state_dict(state_dict)
load_result = model.load_state_dict(state_dict, strict=False)
print(f"LoRA checkpoint loaded: {lora_checkpoint}, total {len(state_dict)} keys")
if len(load_result[1]) > 0:
print(f"Warning, LoRA key mismatch! Unexpected keys in LoRA checkpoint: {load_result[1]}")
setattr(pipe, lora_base_model, model)
def split_pipeline_units(
self, task, pipe,
trainable_models=None, lora_base_model=None,
# TODO: set `remove_unnecessary_params` to `True` by default
remove_unnecessary_params=False,
# TODO: move `loss_required_params` to `loss.py`
loss_required_params=("input_latents", "max_timestep_boundary", "min_timestep_boundary", "first_frame_latents", "video_latents", "audio_input_latents", "num_inference_steps"),
force_remove_params_shared=tuple(),
force_remove_params_posi=tuple(),
force_remove_params_nega=tuple(),
):
models_require_backward = []
if trainable_models is not None:
models_require_backward += trainable_models.split(",")
if lora_base_model is not None:
models_require_backward += [lora_base_model]
if task.endswith(":data_process"):
other_units, pipe.units = pipe.split_pipeline_units(models_require_backward)
if remove_unnecessary_params:
required_params = list(loss_required_params) + [i for i in inspect.signature(self.pipe.model_fn).parameters]
for unit in other_units:
required_params.extend(unit.fetch_input_params())
required_params = sorted(list(set(required_params)))
pipe.units.append(GeneralUnit_RemoveCache(required_params, force_remove_params_shared, force_remove_params_posi, force_remove_params_nega))
elif task.endswith(":train"):
pipe.units, _ = pipe.split_pipeline_units(models_require_backward)
return pipe
def parse_extra_inputs(self, data, extra_inputs, inputs_shared):
controlnet_keys_map = (
("blockwise_controlnet_", "blockwise_controlnet_inputs",),
("controlnet_", "controlnet_inputs"),
)
controlnet_inputs = {}
for extra_input in extra_inputs:
for prefix, name in controlnet_keys_map:
if extra_input.startswith(prefix):
if name not in controlnet_inputs:
controlnet_inputs[name] = {}
controlnet_inputs[name][extra_input.replace(prefix, "")] = data[extra_input]
break
else:
inputs_shared[extra_input] = data[extra_input]
for name, params in controlnet_inputs.items():
inputs_shared[name] = [ControlNetInput(**params)]
return inputs_shared
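The `parse_path_or_model_id` helper above splits a `model_id:origin_file_pattern` string on the *last* colon (via `rfind`), so colons inside the model ID itself survive. A minimal standalone sketch of that parsing rule (the function name and the example model ID are illustrative, not the pipeline's API):

```python
def split_model_ref(ref: str):
    """Split 'model_id:origin_file_pattern' on the LAST colon.

    rfind keeps any earlier colons as part of the model ID; a ref with
    no colon at all is rejected, mirroring the check in the helper above.
    """
    if ":" not in ref:
        raise ValueError(f"not a 'model_id:origin_file_pattern' ref: {ref}")
    split_id = ref.rfind(":")
    return ref[:split_id], ref[split_id + 1:]

print(split_model_ref("org/model:transformer/*.safetensors"))
# → ('org/model', 'transformer/*.safetensors')
```

Paths that exist on disk take a different branch in the helper and never reach this split.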


@@ -1,118 +0,0 @@
import torch
from einops import repeat
from PIL import Image
import numpy as np
class ResidualDenseBlock(torch.nn.Module):
def __init__(self, num_feat=64, num_grow_ch=32):
super(ResidualDenseBlock, self).__init__()
self.conv1 = torch.nn.Conv2d(num_feat, num_grow_ch, 3, 1, 1)
self.conv2 = torch.nn.Conv2d(num_feat + num_grow_ch, num_grow_ch, 3, 1, 1)
self.conv3 = torch.nn.Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, 3, 1, 1)
self.conv4 = torch.nn.Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, 3, 1, 1)
self.conv5 = torch.nn.Conv2d(num_feat + 4 * num_grow_ch, num_feat, 3, 1, 1)
self.lrelu = torch.nn.LeakyReLU(negative_slope=0.2, inplace=True)
def forward(self, x):
x1 = self.lrelu(self.conv1(x))
x2 = self.lrelu(self.conv2(torch.cat((x, x1), 1)))
x3 = self.lrelu(self.conv3(torch.cat((x, x1, x2), 1)))
x4 = self.lrelu(self.conv4(torch.cat((x, x1, x2, x3), 1)))
x5 = self.conv5(torch.cat((x, x1, x2, x3, x4), 1))
return x5 * 0.2 + x
class RRDB(torch.nn.Module):
def __init__(self, num_feat, num_grow_ch=32):
super(RRDB, self).__init__()
self.rdb1 = ResidualDenseBlock(num_feat, num_grow_ch)
self.rdb2 = ResidualDenseBlock(num_feat, num_grow_ch)
self.rdb3 = ResidualDenseBlock(num_feat, num_grow_ch)
def forward(self, x):
out = self.rdb1(x)
out = self.rdb2(out)
out = self.rdb3(out)
return out * 0.2 + x
class RRDBNet(torch.nn.Module):
def __init__(self, num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32):
super(RRDBNet, self).__init__()
self.conv_first = torch.nn.Conv2d(num_in_ch, num_feat, 3, 1, 1)
self.body = torch.nn.Sequential(*[RRDB(num_feat=num_feat, num_grow_ch=num_grow_ch) for _ in range(num_block)])
self.conv_body = torch.nn.Conv2d(num_feat, num_feat, 3, 1, 1)
# upsample
self.conv_up1 = torch.nn.Conv2d(num_feat, num_feat, 3, 1, 1)
self.conv_up2 = torch.nn.Conv2d(num_feat, num_feat, 3, 1, 1)
self.conv_hr = torch.nn.Conv2d(num_feat, num_feat, 3, 1, 1)
self.conv_last = torch.nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
self.lrelu = torch.nn.LeakyReLU(negative_slope=0.2, inplace=True)
def forward(self, x):
feat = x
feat = self.conv_first(feat)
body_feat = self.conv_body(self.body(feat))
feat = feat + body_feat
# upsample: two 2x nearest-neighbor steps (pixel repetition), 4x in total
feat = repeat(feat, "B C H W -> B C (H 2) (W 2)")
feat = self.lrelu(self.conv_up1(feat))
feat = repeat(feat, "B C H W -> B C (H 2) (W 2)")
feat = self.lrelu(self.conv_up2(feat))
out = self.conv_last(self.lrelu(self.conv_hr(feat)))
return out
class ESRGAN(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
@staticmethod
def from_pretrained(model_path):
model = RRDBNet()
state_dict = torch.load(model_path, map_location="cpu")["params_ema"]
model.load_state_dict(state_dict)
model.eval()
return ESRGAN(model)
def process_image(self, image):
image = torch.Tensor(np.array(image, dtype=np.float32) / 255).permute(2, 0, 1)
return image
def process_images(self, images):
images = [self.process_image(image) for image in images]
images = torch.stack(images)
return images
def decode_images(self, images):
images = (images.permute(0, 2, 3, 1) * 255).clip(0, 255).numpy().astype(np.uint8)
images = [Image.fromarray(image) for image in images]
return images
@torch.no_grad()
def upscale(self, images, batch_size=4, progress_bar=lambda x:x):
# Preprocess
input_tensor = self.process_images(images)
# Interpolate
output_tensor = []
for batch_id in progress_bar(range(0, input_tensor.shape[0], batch_size)):
batch_id_ = min(batch_id + batch_size, input_tensor.shape[0])
batch_input_tensor = input_tensor[batch_id: batch_id_]
batch_input_tensor = batch_input_tensor.to(
device=self.model.conv_first.weight.device,
dtype=self.model.conv_first.weight.dtype)
batch_output_tensor = self.model(batch_input_tensor)
output_tensor.append(batch_output_tensor.cpu())
# Output
output_tensor = torch.concat(output_tensor, dim=0)
# To images
output_images = self.decode_images(output_tensor)
return output_images
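In `RRDBNet.forward`, `repeat(feat, "B C H W -> B C (H 2) (W 2)")` performs a 2x nearest-neighbor upsample: with the named axis first inside the parentheses, einops duplicates each pixel twice along height and width. A dependency-free sketch of the same expansion on a single-channel image as nested lists (the function name is illustrative):

```python
def upsample2x_nearest(img):
    """Duplicate each pixel along both axes (2x nearest-neighbor),
    matching repeat(x, 'H W -> (H 2) (W 2)') for one channel."""
    out = []
    for row in img:
        wide = [v for v in row for _ in range(2)]  # repeat along width
        out.append(wide)
        out.append(list(wide))  # repeat the whole row along height
    return out

print(upsample2x_nearest([[1, 2], [3, 4]]))
# → [[1, 1, 2, 2], [1, 1, 2, 2], [3, 3, 4, 4], [3, 3, 4, 4]]
```

Note the ordering matters in einops: `(H 2)` repeats each row consecutively (nearest-neighbor), whereas `(2 H)` would tile the whole image instead.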


@@ -1,63 +0,0 @@
from .runners.fast import TableManager, PyramidPatchMatcher
from PIL import Image
import numpy as np
import cupy as cp
class FastBlendSmoother:
def __init__(self):
self.batch_size = 8
self.window_size = 64
self.ebsynth_config = {
"minimum_patch_size": 5,
"threads_per_block": 8,
"num_iter": 5,
"gpu_id": 0,
"guide_weight": 10.0,
"initialize": "identity",
"tracking_window_size": 0,
}
@staticmethod
def from_model_manager(model_manager):
# TODO: fetch GPU ID from model_manager
return FastBlendSmoother()
def run(self, frames_guide, frames_style, batch_size, window_size, ebsynth_config):
frames_guide = [np.array(frame) for frame in frames_guide]
frames_style = [np.array(frame) for frame in frames_style]
table_manager = TableManager()
patch_match_engine = PyramidPatchMatcher(
image_height=frames_style[0].shape[0],
image_width=frames_style[0].shape[1],
channel=3,
**ebsynth_config
)
# left part
table_l = table_manager.build_remapping_table(frames_guide, frames_style, patch_match_engine, batch_size, desc="FastBlend Step 1/4")
table_l = table_manager.remapping_table_to_blending_table(table_l)
table_l = table_manager.process_window_sum(frames_guide, table_l, patch_match_engine, window_size, batch_size, desc="FastBlend Step 2/4")
# right part
table_r = table_manager.build_remapping_table(frames_guide[::-1], frames_style[::-1], patch_match_engine, batch_size, desc="FastBlend Step 3/4")
table_r = table_manager.remapping_table_to_blending_table(table_r)
table_r = table_manager.process_window_sum(frames_guide[::-1], table_r, patch_match_engine, window_size, batch_size, desc="FastBlend Step 4/4")[::-1]
# merge
frames = []
for (frame_l, weight_l), frame_m, (frame_r, weight_r) in zip(table_l, frames_style, table_r):
weight_m = -1  # the style frame is counted in both window sums; subtract one copy
weight = weight_l + weight_m + weight_r
frame = frame_l * (weight_l / weight) + frame_m * (weight_m / weight) + frame_r * (weight_r / weight)
frames.append(frame)
frames = [Image.fromarray(frame.clip(0, 255).astype("uint8")) for frame in frames]
return frames
def __call__(self, rendered_frames, original_frames=None, **kwargs):
frames = self.run(
original_frames, rendered_frames,
self.batch_size, self.window_size, self.ebsynth_config
)
mempool = cp.get_default_memory_pool()
pinned_mempool = cp.get_default_pinned_memory_pool()
mempool.free_all_blocks()
pinned_mempool.free_all_blocks()
return frames
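The merge loop above combines a left window sum, the style frame itself, and a right window sum with `weight_m = -1`. A plausible reading (not confirmed by this file alone) is that the middle frame is already included once in each window sum, so one copy is subtracted before normalizing. A minimal numeric sketch of that weighting, with hypothetical scalar "frames":

```python
def merge_weighted(frame_l, w_l, frame_m, frame_r, w_r):
    """Blend left/right window averages with the middle frame.

    Assumption (hypothetical, mirrors the loop above): frame_l / frame_r
    are window averages with counts w_l / w_r that each include the
    middle frame once, so w_m = -1 removes the double-counted copy.
    """
    w_m = -1
    w = w_l + w_m + w_r
    return frame_l * (w_l / w) + frame_m * (w_m / w) + frame_r * (w_r / w)
```

For example, a left average 0.5 over 2 frames {1, 0}, middle frame 0, and right average 1.0 over 2 frames {0, 2} yields the mean of the 3 distinct frames, 1.0.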


@@ -1,397 +0,0 @@
from .runners import AccurateModeRunner, FastModeRunner, BalancedModeRunner, InterpolationModeRunner, InterpolationModeSingleFrameRunner
from .data import VideoData, get_video_fps, save_video, search_for_images
import os
import gradio as gr
def check_input_for_blending(video_guide, video_guide_folder, video_style, video_style_folder):
frames_guide = VideoData(video_guide, video_guide_folder)
frames_style = VideoData(video_style, video_style_folder)
message = ""
if len(frames_guide) < len(frames_style):
message += f"The numbers of frames do not match. Only the first {len(frames_guide)} frames of the style video will be used.\n"
frames_style.set_length(len(frames_guide))
elif len(frames_guide) > len(frames_style):
message += f"The numbers of frames do not match. Only the first {len(frames_style)} frames of the guide video will be used.\n"
frames_guide.set_length(len(frames_style))
height_guide, width_guide = frames_guide.shape()
height_style, width_style = frames_style.shape()
if height_guide != height_style or width_guide != width_style:
message += f"The frame shapes do not match. The frames in the style video will be resized to (height: {height_guide}, width: {width_guide}).\n"
frames_style.set_shape(height_guide, width_guide)
return frames_guide, frames_style, message
def smooth_video(
video_guide,
video_guide_folder,
video_style,
video_style_folder,
mode,
window_size,
batch_size,
tracking_window_size,
output_path,
fps,
minimum_patch_size,
num_iter,
guide_weight,
initialize,
progress = None,
):
# input
frames_guide, frames_style, message = check_input_for_blending(video_guide, video_guide_folder, video_style, video_style_folder)
if len(message) > 0:
print(message)
# output
if output_path == "":
if video_style is None:
output_path = os.path.join(video_style_folder, "output")
else:
output_path = os.path.join(os.path.split(video_style)[0], "output")
os.makedirs(output_path, exist_ok=True)
print("No valid output_path. Your video will be saved here:", output_path)
elif not os.path.exists(output_path):
os.makedirs(output_path, exist_ok=True)
print("Your video will be saved here:", output_path)
frames_path = os.path.join(output_path, "frames")
video_path = os.path.join(output_path, "video.mp4")
os.makedirs(frames_path, exist_ok=True)
# process
if mode == "Fast" or mode == "Balanced":
tracking_window_size = 0
ebsynth_config = {
"minimum_patch_size": minimum_patch_size,
"threads_per_block": 8,
"num_iter": num_iter,
"gpu_id": 0,
"guide_weight": guide_weight,
"initialize": initialize,
"tracking_window_size": tracking_window_size,
}
if mode == "Fast":
FastModeRunner().run(frames_guide, frames_style, batch_size=batch_size, window_size=window_size, ebsynth_config=ebsynth_config, save_path=frames_path)
elif mode == "Balanced":
BalancedModeRunner().run(frames_guide, frames_style, batch_size=batch_size, window_size=window_size, ebsynth_config=ebsynth_config, save_path=frames_path)
elif mode == "Accurate":
AccurateModeRunner().run(frames_guide, frames_style, batch_size=batch_size, window_size=window_size, ebsynth_config=ebsynth_config, save_path=frames_path)
# output
try:
fps = int(fps)
except (TypeError, ValueError):
fps = get_video_fps(video_style) if video_style is not None else 30
print("Fps:", fps)
print("Saving video...")
video_path = save_video(frames_path, video_path, num_frames=len(frames_style), fps=fps)
print("Success!")
print("Your frames are here:", frames_path)
print("Your video is here:", video_path)
return output_path, fps, video_path
class KeyFrameMatcher:
def __init__(self):
pass
def extract_number_from_filename(self, file_name):
result = []
number = -1
for i in file_name:
if ord(i)>=ord("0") and ord(i)<=ord("9"):
if number == -1:
number = 0
number = number*10 + ord(i) - ord("0")
else:
if number != -1:
result.append(number)
number = -1
if number != -1:
result.append(number)
result = tuple(result)
return result
def extract_number_from_filenames(self, file_names):
numbers = [self.extract_number_from_filename(file_name) for file_name in file_names]
min_length = min(len(i) for i in numbers)
for i in range(min_length-1, -1, -1):
if len(set(number[i] for number in numbers))==len(file_names):
return [number[i] for number in numbers]
return list(range(len(file_names)))
def match_using_filename(self, file_names_a, file_names_b):
file_names_b_set = set(file_names_b)
matched_file_name = []
for file_name in file_names_a:
if file_name not in file_names_b_set:
matched_file_name.append(None)
else:
matched_file_name.append(file_name)
return matched_file_name
def match_using_numbers(self, file_names_a, file_names_b):
numbers_a = self.extract_number_from_filenames(file_names_a)
numbers_b = self.extract_number_from_filenames(file_names_b)
numbers_b_dict = {number: file_name for number, file_name in zip(numbers_b, file_names_b)}
matched_file_name = []
for number in numbers_a:
if number in numbers_b_dict:
matched_file_name.append(numbers_b_dict[number])
else:
matched_file_name.append(None)
return matched_file_name
def match_filenames(self, file_names_a, file_names_b):
matched_file_name = self.match_using_filename(file_names_a, file_names_b)
if sum([i is not None for i in matched_file_name]) > 0:
return matched_file_name
matched_file_name = self.match_using_numbers(file_names_a, file_names_b)
return matched_file_name
def detect_frames(frames_path, keyframes_path):
if not os.path.exists(frames_path) and not os.path.exists(keyframes_path):
return "Please input the directory of guide video and rendered frames"
elif not os.path.exists(frames_path):
return "Please input the directory of guide video"
elif not os.path.exists(keyframes_path):
return "Please input the directory of rendered frames"
frames = [os.path.split(i)[-1] for i in search_for_images(frames_path)]
keyframes = [os.path.split(i)[-1] for i in search_for_images(keyframes_path)]
if len(frames)==0:
return f"No images detected in {frames_path}"
if len(keyframes)==0:
return f"No images detected in {keyframes_path}"
matched_keyframes = KeyFrameMatcher().match_filenames(frames, keyframes)
max_filename_length = max([len(i) for i in frames])
if sum([i is not None for i in matched_keyframes])==0:
message = ""
for frame, matched_keyframe in zip(frames, matched_keyframes):
message += frame + " " * (max_filename_length - len(frame) + 1)
message += "--> No matched keyframes\n"
else:
message = ""
for frame, matched_keyframe in zip(frames, matched_keyframes):
message += frame + " " * (max_filename_length - len(frame) + 1)
if matched_keyframe is None:
message += "--> [to be rendered]\n"
else:
message += f"--> {matched_keyframe}\n"
return message
def check_input_for_interpolating(frames_path, keyframes_path):
# search for images
frames = [os.path.split(i)[-1] for i in search_for_images(frames_path)]
keyframes = [os.path.split(i)[-1] for i in search_for_images(keyframes_path)]
# match frames
matched_keyframes = KeyFrameMatcher().match_filenames(frames, keyframes)
file_list = [file_name for file_name in matched_keyframes if file_name is not None]
index_style = [i for i, file_name in enumerate(matched_keyframes) if file_name is not None]
frames_guide = VideoData(None, frames_path)
frames_style = VideoData(None, keyframes_path, file_list=file_list)
# match shape
message = ""
height_guide, width_guide = frames_guide.shape()
height_style, width_style = frames_style.shape()
if height_guide != height_style or width_guide != width_style:
message += f"The frame shapes do not match. The rendered keyframes will be resized to (height: {height_guide}, width: {width_guide}).\n"
frames_style.set_shape(height_guide, width_guide)
return frames_guide, frames_style, index_style, message
def interpolate_video(
frames_path,
keyframes_path,
output_path,
fps,
batch_size,
tracking_window_size,
minimum_patch_size,
num_iter,
guide_weight,
initialize,
progress = None,
):
# input
frames_guide, frames_style, index_style, message = check_input_for_interpolating(frames_path, keyframes_path)
if len(message) > 0:
print(message)
# output
if output_path == "":
output_path = os.path.join(keyframes_path, "output")
os.makedirs(output_path, exist_ok=True)
print("No valid output_path. Your video will be saved here:", output_path)
elif not os.path.exists(output_path):
os.makedirs(output_path, exist_ok=True)
print("Your video will be saved here:", output_path)
output_frames_path = os.path.join(output_path, "frames")
output_video_path = os.path.join(output_path, "video.mp4")
os.makedirs(output_frames_path, exist_ok=True)
# process
ebsynth_config = {
"minimum_patch_size": minimum_patch_size,
"threads_per_block": 8,
"num_iter": num_iter,
"gpu_id": 0,
"guide_weight": guide_weight,
"initialize": initialize,
"tracking_window_size": tracking_window_size
}
if len(index_style)==1:
InterpolationModeSingleFrameRunner().run(frames_guide, frames_style, index_style, batch_size=batch_size, ebsynth_config=ebsynth_config, save_path=output_frames_path)
else:
InterpolationModeRunner().run(frames_guide, frames_style, index_style, batch_size=batch_size, ebsynth_config=ebsynth_config, save_path=output_frames_path)
try:
fps = int(fps)
except (TypeError, ValueError):
fps = 30
print("Fps:", fps)
print("Saving video...")
video_path = save_video(output_frames_path, output_video_path, num_frames=len(frames_guide), fps=fps)
print("Success!")
print("Your frames are here:", output_frames_path)
print("Your video is here:", video_path)
return output_path, fps, video_path
def on_ui_tabs():
with gr.Blocks(analytics_enabled=False) as ui_component:
with gr.Tab("Blend"):
gr.Markdown("""
# Blend
Given a guide video and a style video, this algorithm will make the style video fluent according to the motion features of the guide video. Click [here](https://github.com/Artiprocher/sd-webui-fastblend/assets/35051019/208d902d-6aba-48d7-b7d5-cd120ebd306d) to see the example. Note that this extension doesn't support long videos. Please use short videos (e.g., several seconds). The algorithm is mainly designed for 512*512 resolution. Please use a larger `Minimum patch size` for higher resolution.
""")
with gr.Row():
with gr.Column():
with gr.Tab("Guide video"):
video_guide = gr.Video(label="Guide video")
with gr.Tab("Guide video (images format)"):
video_guide_folder = gr.Textbox(label="Guide video (images format)", value="")
with gr.Column():
with gr.Tab("Style video"):
video_style = gr.Video(label="Style video")
with gr.Tab("Style video (images format)"):
video_style_folder = gr.Textbox(label="Style video (images format)", value="")
with gr.Column():
output_path = gr.Textbox(label="Output directory", value="", placeholder="Leave empty to use the directory of style video")
fps = gr.Textbox(label="Fps", value="", placeholder="Leave empty to use the default fps")
video_output = gr.Video(label="Output video", interactive=False, show_share_button=True)
btn = gr.Button(value="Blend")
with gr.Row():
with gr.Column():
gr.Markdown("# Settings")
mode = gr.Radio(["Fast", "Balanced", "Accurate"], label="Inference mode", value="Fast", interactive=True)
window_size = gr.Slider(label="Sliding window size", value=15, minimum=1, maximum=1000, step=1, interactive=True)
batch_size = gr.Slider(label="Batch size", value=8, minimum=1, maximum=128, step=1, interactive=True)
tracking_window_size = gr.Slider(label="Tracking window size (only for accurate mode)", value=0, minimum=0, maximum=10, step=1, interactive=True)
gr.Markdown("## Advanced Settings")
minimum_patch_size = gr.Slider(label="Minimum patch size (odd number)", value=5, minimum=5, maximum=99, step=2, interactive=True)
num_iter = gr.Slider(label="Number of iterations", value=5, minimum=1, maximum=10, step=1, interactive=True)
guide_weight = gr.Slider(label="Guide weight", value=10.0, minimum=0.0, maximum=100.0, step=0.1, interactive=True)
initialize = gr.Radio(["identity", "random"], label="NNF initialization", value="identity", interactive=True)
with gr.Column():
gr.Markdown("""
# Reference
* Output directory: the directory to save the video.
* Inference mode
|Mode|Time|Memory|Quality|Frame by frame output|Description|
|-|-|-|-|-|-|
|Fast|■|■■■|■■|No|Blend the frames using a tree-like data structure, which requires much RAM but is fast.|
|Balanced|■■|■|■■|Yes|Blend the frames naively.|
|Accurate|■■■|■|■■■|Yes|Blend the frames and align them together for higher video quality. When [batch size] >= [sliding window size] * 2 + 1, the performance is the best.|
* Sliding window size: our algorithm blends the frames in a sliding window. If the size is n, each frame will be blended with the previous n frames and the next n frames. A large sliding window can make the video fluent but sometimes blurry.
* Batch size: a larger batch size makes the program faster but requires more VRAM.
* Tracking window size (only for accurate mode): the size of the window in which our algorithm tracks moving objects. Empirically, 1 is enough.
* Advanced settings
* Minimum patch size (odd number): the minimum patch size used for patch matching. (Default: 5)
* Number of iterations: the number of iterations of patch matching. (Default: 5)
* Guide weight: a parameter that determines how strongly the motion features are applied to the style video. (Default: 10)
* NNF initialization: how to initialize the NNF (Nearest Neighbor Field). (Default: identity)
""")
btn.click(
smooth_video,
inputs=[
video_guide,
video_guide_folder,
video_style,
video_style_folder,
mode,
window_size,
batch_size,
tracking_window_size,
output_path,
fps,
minimum_patch_size,
num_iter,
guide_weight,
initialize
],
outputs=[output_path, fps, video_output]
)
with gr.Tab("Interpolate"):
gr.Markdown("""
# Interpolate
Given a guide video and some rendered keyframes, this algorithm will render the remaining frames. Click [here](https://github.com/Artiprocher/sd-webui-fastblend/assets/35051019/3490c5b4-8f67-478f-86de-f9adc2ace16a) to see the example. The algorithm is experimental and is only tested for 512*512 resolution.
""")
with gr.Row():
with gr.Column():
with gr.Row():
with gr.Column():
video_guide_folder_ = gr.Textbox(label="Guide video (images format)", value="")
with gr.Column():
rendered_keyframes_ = gr.Textbox(label="Rendered keyframes (images format)", value="")
with gr.Row():
detected_frames = gr.Textbox(label="Detected frames", value="Please input the directory of guide video and rendered frames", lines=9, max_lines=9, interactive=False)
video_guide_folder_.change(detect_frames, inputs=[video_guide_folder_, rendered_keyframes_], outputs=detected_frames)
rendered_keyframes_.change(detect_frames, inputs=[video_guide_folder_, rendered_keyframes_], outputs=detected_frames)
with gr.Column():
output_path_ = gr.Textbox(label="Output directory", value="", placeholder="Leave empty to use the directory of rendered keyframes")
fps_ = gr.Textbox(label="Fps", value="", placeholder="Leave empty to use the default fps")
video_output_ = gr.Video(label="Output video", interactive=False, show_share_button=True)
btn_ = gr.Button(value="Interpolate")
with gr.Row():
with gr.Column():
gr.Markdown("# Settings")
batch_size_ = gr.Slider(label="Batch size", value=8, minimum=1, maximum=128, step=1, interactive=True)
tracking_window_size_ = gr.Slider(label="Tracking window size", value=0, minimum=0, maximum=10, step=1, interactive=True)
gr.Markdown("## Advanced Settings")
minimum_patch_size_ = gr.Slider(label="Minimum patch size (odd number, larger is better)", value=15, minimum=5, maximum=99, step=2, interactive=True)
num_iter_ = gr.Slider(label="Number of iterations", value=5, minimum=1, maximum=10, step=1, interactive=True)
guide_weight_ = gr.Slider(label="Guide weight", value=10.0, minimum=0.0, maximum=100.0, step=0.1, interactive=True)
initialize_ = gr.Radio(["identity", "random"], label="NNF initialization", value="identity", interactive=True)
with gr.Column():
gr.Markdown("""
# Reference
* Output directory: the directory to save the video.
* Batch size: a larger batch size makes the program faster but requires more VRAM.
* Tracking window size: the size of the window in which our algorithm tracks moving objects. Empirically, 1 is enough.
* Advanced settings
* Minimum patch size (odd number): the minimum patch size used for patch matching. **This parameter should be larger than that in blending. (Default: 15)**
* Number of iterations: the number of iterations of patch matching. (Default: 5)
* Guide weight: a parameter that determines how strongly the motion features are applied to the style video. (Default: 10)
* NNF initialization: how to initialize the NNF (Nearest Neighbor Field). (Default: identity)
""")
btn_.click(
interpolate_video,
inputs=[
video_guide_folder_,
rendered_keyframes_,
output_path_,
fps_,
batch_size_,
tracking_window_size_,
minimum_patch_size_,
num_iter_,
guide_weight_,
initialize_,
],
outputs=[output_path_, fps_, video_output_]
)
return [(ui_component, "FastBlend", "FastBlend_ui")]
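`KeyFrameMatcher.extract_number_from_filename` above scans a filename character by character, collecting each contiguous run of ASCII digits as an int. The same rule can be expressed compactly with a regular expression; a sketch (the function name is illustrative, and behavior matches only for ASCII digits):

```python
import re

def extract_numbers(file_name: str):
    """Return every contiguous ASCII digit run in file_name as a tuple of
    ints, mirroring KeyFrameMatcher.extract_number_from_filename."""
    return tuple(int(run) for run in re.findall(r"[0-9]+", file_name))

print(extract_numbers("frame_0012_v3.png"))
# → (12, 3)
```

`extract_number_from_filenames` then picks the rightmost position at which these tuples are pairwise distinct, so e.g. `keyframe_1_0007.png` and `keyframe_2_0007.png` are distinguished by the second run rather than the shared `0007`.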


@@ -1,119 +0,0 @@
import cupy as cp
remapping_kernel = cp.RawKernel(r'''
extern "C" __global__
void remap(
const int height,
const int width,
const int channel,
const int patch_size,
const int pad_size,
const float* source_style,
const int* nnf,
float* target_style
) {
const int r = (patch_size - 1) / 2;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x >= height or y >= width) return;
const int z = blockIdx.z * (height + pad_size * 2) * (width + pad_size * 2) * channel;
const int pid = (x + pad_size) * (width + pad_size * 2) + (y + pad_size);
const int min_px = x < r ? -x : -r;
const int max_px = x + r > height - 1 ? height - 1 - x : r;
const int min_py = y < r ? -y : -r;
const int max_py = y + r > width - 1 ? width - 1 - y : r;
int num = 0;
for (int px = min_px; px <= max_px; px++){
for (int py = min_py; py <= max_py; py++){
const int nid = (x + px) * width + y + py;
const int x_ = nnf[blockIdx.z * height * width * 2 + nid*2 + 0] - px;
const int y_ = nnf[blockIdx.z * height * width * 2 + nid*2 + 1] - py;
if (x_ < 0 or y_ < 0 or x_ >= height or y_ >= width)continue;
const int pid_ = (x_ + pad_size) * (width + pad_size * 2) + (y_ + pad_size);
num++;
for (int c = 0; c < channel; c++){
target_style[z + pid * channel + c] += source_style[z + pid_ * channel + c];
}
}
}
for (int c = 0; c < channel; c++){
target_style[z + pid * channel + c] /= num;
}
}
''', 'remap')
patch_error_kernel = cp.RawKernel(r'''
extern "C" __global__
void patch_error(
const int height,
const int width,
const int channel,
const int patch_size,
const int pad_size,
const float* source,
const int* nnf,
const float* target,
float* error
) {
const int r = (patch_size - 1) / 2;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
const int z = blockIdx.z * (height + pad_size * 2) * (width + pad_size * 2) * channel;
if (x >= height or y >= width) return;
const int x_ = nnf[blockIdx.z * height * width * 2 + (x * width + y)*2 + 0];
const int y_ = nnf[blockIdx.z * height * width * 2 + (x * width + y)*2 + 1];
float e = 0;
for (int px = -r; px <= r; px++){
for (int py = -r; py <= r; py++){
const int pid = (x + pad_size + px) * (width + pad_size * 2) + y + pad_size + py;
const int pid_ = (x_ + pad_size + px) * (width + pad_size * 2) + y_ + pad_size + py;
for (int c = 0; c < channel; c++){
const float diff = target[z + pid * channel + c] - source[z + pid_ * channel + c];
e += diff * diff;
}
}
}
error[blockIdx.z * height * width + x * width + y] = e;
}
''', 'patch_error')
pairwise_patch_error_kernel = cp.RawKernel(r'''
extern "C" __global__
void pairwise_patch_error(
const int height,
const int width,
const int channel,
const int patch_size,
const int pad_size,
const float* source_a,
const int* nnf_a,
const float* source_b,
const int* nnf_b,
float* error
) {
const int r = (patch_size - 1) / 2;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
const int z = blockIdx.z * (height + pad_size * 2) * (width + pad_size * 2) * channel;
if (x >= height or y >= width) return;
const int z_nnf = blockIdx.z * height * width * 2 + (x * width + y) * 2;
const int x_a = nnf_a[z_nnf + 0];
const int y_a = nnf_a[z_nnf + 1];
const int x_b = nnf_b[z_nnf + 0];
const int y_b = nnf_b[z_nnf + 1];
float e = 0;
for (int px = -r; px <= r; px++){
for (int py = -r; py <= r; py++){
const int pid_a = (x_a + pad_size + px) * (width + pad_size * 2) + y_a + pad_size + py;
const int pid_b = (x_b + pad_size + px) * (width + pad_size * 2) + y_b + pad_size + py;
for (int c = 0; c < channel; c++){
const float diff = source_a[z + pid_a * channel + c] - source_b[z + pid_b * channel + c];
e += diff * diff;
}
}
}
error[blockIdx.z * height * width + x * width + y] = e;
}
''', 'pairwise_patch_error')
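For reference, here is a NumPy sketch of what the `patch_error` kernel computes for a single image, with Python loops standing in for CUDA threads. The helper name `patch_ssd` is ours, not part of the codebase:

```python
import numpy as np

def patch_ssd(source, target, nnf, patch_size):
    # For every pixel (x, y): sum of squared differences between the target
    # patch centered at (x, y) and the source patch centered at nnf[x, y].
    r = (patch_size - 1) // 2
    H, W, C = source.shape
    src = np.pad(source, ((r, r), (r, r), (0, 0))).astype(np.float32)
    tgt = np.pad(target, ((r, r), (r, r), (0, 0))).astype(np.float32)
    error = np.zeros((H, W), dtype=np.float32)
    for x in range(H):
        for y in range(W):
            x_, y_ = nnf[x, y]
            diff = tgt[x:x + 2 * r + 1, y:y + 2 * r + 1] - src[x_:x_ + 2 * r + 1, y_:y_ + 2 * r + 1]
            error[x, y] = float((diff * diff).sum())
    return error
```

With an identity nearest-neighbor field and `target == source`, every patch matches itself, so the error map is zero everywhere.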


@@ -1,146 +0,0 @@
import imageio, os
import numpy as np
from PIL import Image
def read_video(file_name):
reader = imageio.get_reader(file_name)
video = []
for frame in reader:
frame = np.array(frame)
video.append(frame)
reader.close()
return video
def get_video_fps(file_name):
reader = imageio.get_reader(file_name)
fps = reader.get_meta_data()["fps"]
reader.close()
return fps
def save_video(frames_path, video_path, num_frames, fps):
writer = imageio.get_writer(video_path, fps=fps, quality=9)
for i in range(num_frames):
frame = np.array(Image.open(os.path.join(frames_path, "%05d.png" % i)))
writer.append_data(frame)
writer.close()
return video_path
class LowMemoryVideo:
def __init__(self, file_name):
self.reader = imageio.get_reader(file_name)
def __len__(self):
return self.reader.count_frames()
def __getitem__(self, item):
return np.array(self.reader.get_data(item))
def __del__(self):
self.reader.close()
def split_file_name(file_name):
result = []
number = -1
for i in file_name:
if ord(i)>=ord("0") and ord(i)<=ord("9"):
if number == -1:
number = 0
number = number*10 + ord(i) - ord("0")
else:
if number != -1:
result.append(number)
number = -1
result.append(i)
if number != -1:
result.append(number)
result = tuple(result)
return result
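`split_file_name` builds a natural-sort key so `frame10.png` sorts after `frame2.png`. An equivalent sketch using `re.split` (the helper name `natural_key` is ours, and it assumes the files share one naming pattern so the key tuples stay comparable):

```python
import re

def natural_key(file_name):
    # Split "frame10.png" into ("frame", 10, ".png") so runs of digits
    # compare as integers rather than character by character.
    parts = re.split(r"(\d+)", file_name)
    return tuple(int(p) if p.isdigit() else p for p in parts if p != "")
```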
def search_for_images(folder):
file_list = [i for i in os.listdir(folder) if i.endswith(".jpg") or i.endswith(".png")]
file_list = [(split_file_name(file_name), file_name) for file_name in file_list]
file_list = [i[1] for i in sorted(file_list)]
file_list = [os.path.join(folder, i) for i in file_list]
return file_list
def read_images(folder):
file_list = search_for_images(folder)
frames = [np.array(Image.open(i)) for i in file_list]
return frames
class LowMemoryImageFolder:
def __init__(self, folder, file_list=None):
if file_list is None:
self.file_list = search_for_images(folder)
else:
self.file_list = [os.path.join(folder, file_name) for file_name in file_list]
def __len__(self):
return len(self.file_list)
def __getitem__(self, item):
return np.array(Image.open(self.file_list[item]))
def __del__(self):
pass
class VideoData:
def __init__(self, video_file, image_folder, **kwargs):
if video_file is not None:
self.data_type = "video"
self.data = LowMemoryVideo(video_file, **kwargs)
elif image_folder is not None:
self.data_type = "images"
self.data = LowMemoryImageFolder(image_folder, **kwargs)
else:
raise ValueError("Cannot open video or image folder")
self.length = None
self.height = None
self.width = None
def raw_data(self):
frames = []
for i in range(self.__len__()):
frames.append(self.__getitem__(i))
return frames
def set_length(self, length):
self.length = length
def set_shape(self, height, width):
self.height = height
self.width = width
def __len__(self):
if self.length is None:
return len(self.data)
else:
return self.length
def shape(self):
if self.height is not None and self.width is not None:
return self.height, self.width
else:
height, width, _ = self.__getitem__(0).shape
return height, width
def __getitem__(self, item):
frame = self.data.__getitem__(item)
height, width, _ = frame.shape
if self.height is not None and self.width is not None:
if self.height != height or self.width != width:
frame = Image.fromarray(frame).resize((self.width, self.height))
frame = np.array(frame)
return frame
def __del__(self):
pass


@@ -1,298 +0,0 @@
from .cupy_kernels import remapping_kernel, patch_error_kernel, pairwise_patch_error_kernel
import numpy as np
import cupy as cp
import cv2
class PatchMatcher:
def __init__(
self, height, width, channel, minimum_patch_size,
threads_per_block=8, num_iter=5, gpu_id=0, guide_weight=10.0,
random_search_steps=3, random_search_range=4,
use_mean_target_style=False, use_pairwise_patch_error=False,
tracking_window_size=0
):
self.height = height
self.width = width
self.channel = channel
self.minimum_patch_size = minimum_patch_size
self.threads_per_block = threads_per_block
self.num_iter = num_iter
self.gpu_id = gpu_id
self.guide_weight = guide_weight
self.random_search_steps = random_search_steps
self.random_search_range = random_search_range
self.use_mean_target_style = use_mean_target_style
self.use_pairwise_patch_error = use_pairwise_patch_error
self.tracking_window_size = tracking_window_size
self.patch_size_list = [minimum_patch_size + i*2 for i in range(num_iter)][::-1]
self.pad_size = self.patch_size_list[0] // 2
self.grid = (
(height + threads_per_block - 1) // threads_per_block,
(width + threads_per_block - 1) // threads_per_block
)
self.block = (threads_per_block, threads_per_block)
def pad_image(self, image):
return cp.pad(image, ((0, 0), (self.pad_size, self.pad_size), (self.pad_size, self.pad_size), (0, 0)))
def unpad_image(self, image):
return image[:, self.pad_size: -self.pad_size, self.pad_size: -self.pad_size, :]
def apply_nnf_to_image(self, nnf, source):
batch_size = source.shape[0]
target = cp.zeros((batch_size, self.height + self.pad_size * 2, self.width + self.pad_size * 2, self.channel), dtype=cp.float32)
remapping_kernel(
self.grid + (batch_size,),
self.block,
(self.height, self.width, self.channel, self.patch_size, self.pad_size, source, nnf, target)
)
return target
def get_patch_error(self, source, nnf, target):
batch_size = source.shape[0]
error = cp.zeros((batch_size, self.height, self.width), dtype=cp.float32)
patch_error_kernel(
self.grid + (batch_size,),
self.block,
(self.height, self.width, self.channel, self.patch_size, self.pad_size, source, nnf, target, error)
)
return error
def get_pairwise_patch_error(self, source, nnf):
batch_size = source.shape[0]//2
error = cp.zeros((batch_size, self.height, self.width), dtype=cp.float32)
source_a, nnf_a = source[0::2].copy(), nnf[0::2].copy()
source_b, nnf_b = source[1::2].copy(), nnf[1::2].copy()
pairwise_patch_error_kernel(
self.grid + (batch_size,),
self.block,
(self.height, self.width, self.channel, self.patch_size, self.pad_size, source_a, nnf_a, source_b, nnf_b, error)
)
error = error.repeat(2, axis=0)
return error
def get_error(self, source_guide, target_guide, source_style, target_style, nnf):
error_guide = self.get_patch_error(source_guide, nnf, target_guide)
if self.use_mean_target_style:
target_style = self.apply_nnf_to_image(nnf, source_style)
target_style = target_style.mean(axis=0, keepdims=True)
target_style = target_style.repeat(source_guide.shape[0], axis=0)
if self.use_pairwise_patch_error:
error_style = self.get_pairwise_patch_error(source_style, nnf)
else:
error_style = self.get_patch_error(source_style, nnf, target_style)
error = error_guide * self.guide_weight + error_style
return error
def clamp_bound(self, nnf):
nnf[:,:,:,0] = cp.clip(nnf[:,:,:,0], 0, self.height-1)
nnf[:,:,:,1] = cp.clip(nnf[:,:,:,1], 0, self.width-1)
return nnf
def random_step(self, nnf, r):
batch_size = nnf.shape[0]
step = cp.random.randint(-r, r+1, size=(batch_size, self.height, self.width, 2), dtype=cp.int32)
upd_nnf = self.clamp_bound(nnf + step)
return upd_nnf
def neighboor_step(self, nnf, d):
if d==0:
upd_nnf = cp.concatenate([nnf[:, :1, :], nnf[:, :-1, :]], axis=1)
upd_nnf[:, :, :, 0] += 1
elif d==1:
upd_nnf = cp.concatenate([nnf[:, :, :1], nnf[:, :, :-1]], axis=2)
upd_nnf[:, :, :, 1] += 1
elif d==2:
upd_nnf = cp.concatenate([nnf[:, 1:, :], nnf[:, -1:, :]], axis=1)
upd_nnf[:, :, :, 0] -= 1
elif d==3:
upd_nnf = cp.concatenate([nnf[:, :, 1:], nnf[:, :, -1:]], axis=2)
upd_nnf[:, :, :, 1] -= 1
upd_nnf = self.clamp_bound(upd_nnf)
return upd_nnf
def shift_nnf(self, nnf, d):
if d>0:
d = min(nnf.shape[0], d)
upd_nnf = cp.concatenate([nnf[d:]] + [nnf[-1:]] * d, axis=0)
else:
d = max(-nnf.shape[0], d)
upd_nnf = cp.concatenate([nnf[:1]] * (-d) + [nnf[:d]], axis=0)
return upd_nnf
def track_step(self, nnf, d):
if self.use_pairwise_patch_error:
upd_nnf = cp.zeros_like(nnf)
upd_nnf[0::2] = self.shift_nnf(nnf[0::2], d)
upd_nnf[1::2] = self.shift_nnf(nnf[1::2], d)
else:
upd_nnf = self.shift_nnf(nnf, d)
return upd_nnf
def C(self, n, m):
# not used
c = 1
for i in range(1, n+1):
c *= i
for i in range(1, m+1):
c //= i
for i in range(1, n-m+1):
c //= i
return c
def bezier_step(self, nnf, r):
# not used
n = r * 2 - 1
upd_nnf = cp.zeros(shape=nnf.shape, dtype=cp.float32)
for i, d in enumerate(list(range(-r, 0)) + list(range(1, r+1))):
if d>0:
ctl_nnf = cp.concatenate([nnf[d:]] + [nnf[-1:]] * d, axis=0)
elif d<0:
ctl_nnf = cp.concatenate([nnf[:1]] * (-d) + [nnf[:d]], axis=0)
upd_nnf += ctl_nnf * (self.C(n, i) / 2**n)
upd_nnf = self.clamp_bound(upd_nnf).astype(nnf.dtype)
return upd_nnf
def update(self, source_guide, target_guide, source_style, target_style, nnf, err, upd_nnf):
upd_err = self.get_error(source_guide, target_guide, source_style, target_style, upd_nnf)
upd_idx = (upd_err < err)
nnf[upd_idx] = upd_nnf[upd_idx]
err[upd_idx] = upd_err[upd_idx]
return nnf, err
def propagation(self, source_guide, target_guide, source_style, target_style, nnf, err):
for d in cp.random.permutation(4):
upd_nnf = self.neighboor_step(nnf, d)
nnf, err = self.update(source_guide, target_guide, source_style, target_style, nnf, err, upd_nnf)
return nnf, err
def random_search(self, source_guide, target_guide, source_style, target_style, nnf, err):
for i in range(self.random_search_steps):
upd_nnf = self.random_step(nnf, self.random_search_range)
nnf, err = self.update(source_guide, target_guide, source_style, target_style, nnf, err, upd_nnf)
return nnf, err
def track(self, source_guide, target_guide, source_style, target_style, nnf, err):
for d in range(1, self.tracking_window_size + 1):
upd_nnf = self.track_step(nnf, d)
nnf, err = self.update(source_guide, target_guide, source_style, target_style, nnf, err, upd_nnf)
upd_nnf = self.track_step(nnf, -d)
nnf, err = self.update(source_guide, target_guide, source_style, target_style, nnf, err, upd_nnf)
return nnf, err
def iteration(self, source_guide, target_guide, source_style, target_style, nnf, err):
nnf, err = self.propagation(source_guide, target_guide, source_style, target_style, nnf, err)
nnf, err = self.random_search(source_guide, target_guide, source_style, target_style, nnf, err)
nnf, err = self.track(source_guide, target_guide, source_style, target_style, nnf, err)
return nnf, err
def estimate_nnf(self, source_guide, target_guide, source_style, nnf):
with cp.cuda.Device(self.gpu_id):
source_guide = self.pad_image(source_guide)
target_guide = self.pad_image(target_guide)
source_style = self.pad_image(source_style)
for it in range(self.num_iter):
self.patch_size = self.patch_size_list[it]
target_style = self.apply_nnf_to_image(nnf, source_style)
err = self.get_error(source_guide, target_guide, source_style, target_style, nnf)
nnf, err = self.iteration(source_guide, target_guide, source_style, target_style, nnf, err)
target_style = self.unpad_image(self.apply_nnf_to_image(nnf, source_style))
return nnf, target_style
class PyramidPatchMatcher:
def __init__(
self, image_height, image_width, channel, minimum_patch_size,
threads_per_block=8, num_iter=5, gpu_id=0, guide_weight=10.0,
use_mean_target_style=False, use_pairwise_patch_error=False,
tracking_window_size=0,
initialize="identity"
):
maximum_patch_size = minimum_patch_size + (num_iter - 1) * 2
self.pyramid_level = int(np.log2(min(image_height, image_width) / maximum_patch_size))
self.pyramid_heights = []
self.pyramid_widths = []
self.patch_matchers = []
self.minimum_patch_size = minimum_patch_size
self.num_iter = num_iter
self.gpu_id = gpu_id
self.initialize = initialize
for level in range(self.pyramid_level):
height = image_height//(2**(self.pyramid_level - 1 - level))
width = image_width//(2**(self.pyramid_level - 1 - level))
self.pyramid_heights.append(height)
self.pyramid_widths.append(width)
self.patch_matchers.append(PatchMatcher(
height, width, channel, minimum_patch_size=minimum_patch_size,
threads_per_block=threads_per_block, num_iter=num_iter, gpu_id=gpu_id, guide_weight=guide_weight,
use_mean_target_style=use_mean_target_style, use_pairwise_patch_error=use_pairwise_patch_error,
tracking_window_size=tracking_window_size
))
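The constructor's pyramid geometry can be sketched standalone: the number of levels is chosen so the coarsest level is still larger than the biggest patch used in the final iteration, and each level doubles the previous one. The helper name `pyramid_sizes` is ours:

```python
import math

def pyramid_sizes(image_height, image_width, minimum_patch_size, num_iter):
    # Patch size grows by 2 per iteration, so the largest patch is
    # minimum_patch_size + (num_iter - 1) * 2; levels halve down from full size.
    maximum_patch_size = minimum_patch_size + (num_iter - 1) * 2
    levels = int(math.log2(min(image_height, image_width) / maximum_patch_size))
    return [
        (image_height // 2 ** (levels - 1 - level), image_width // 2 ** (levels - 1 - level))
        for level in range(levels)
    ]
```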
def resample_image(self, images, level):
height, width = self.pyramid_heights[level], self.pyramid_widths[level]
images = images.get()
images_resample = []
for image in images:
image_resample = cv2.resize(image, (width, height), interpolation=cv2.INTER_AREA)
images_resample.append(image_resample)
images_resample = cp.array(np.stack(images_resample), dtype=cp.float32)
return images_resample
def initialize_nnf(self, batch_size):
if self.initialize == "random":
height, width = self.pyramid_heights[0], self.pyramid_widths[0]
nnf = cp.stack([
cp.random.randint(0, height, (batch_size, height, width), dtype=cp.int32),
cp.random.randint(0, width, (batch_size, height, width), dtype=cp.int32)
], axis=3)
elif self.initialize == "identity":
height, width = self.pyramid_heights[0], self.pyramid_widths[0]
nnf = cp.stack([
cp.repeat(cp.arange(height), width).reshape(height, width),
cp.tile(cp.arange(width), height).reshape(height, width)
], axis=2)
nnf = cp.stack([nnf] * batch_size)
else:
raise NotImplementedError()
return nnf
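The `"identity"` branch above makes every pixel map to itself. A compact NumPy sketch of the same initialization (hypothetical helper name):

```python
import numpy as np

def identity_nnf(batch_size, height, width):
    # nnf[b, x, y] == (x, y): each pixel initially maps to itself.
    nnf = np.stack(np.indices((height, width)), axis=-1).astype(np.int32)
    return np.broadcast_to(nnf, (batch_size, height, width, 2)).copy()
```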
def update_nnf(self, nnf, level):
# upscale
nnf = nnf.repeat(2, axis=1).repeat(2, axis=2) * 2
nnf[:, 1::2, :, 0] += 1
nnf[:, :, 1::2, 1] += 1
# fall back to resizing when the next level is not exactly 2x
height, width = self.pyramid_heights[level], self.pyramid_widths[level]
if height != nnf.shape[1] or width != nnf.shape[2]:
nnf = nnf.get().astype(np.float32)
nnf = [cv2.resize(n, (width, height), interpolation=cv2.INTER_LINEAR) for n in nnf]
nnf = cp.array(np.stack(nnf), dtype=cp.int32)
nnf = self.patch_matchers[level].clamp_bound(nnf)
return nnf
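A single-image sketch of the intended 2x upscale: coordinates double with the resolution, and odd rows/columns point one pixel further down/right so the mapping stays dense. The helper name `upscale_nnf` is ours:

```python
import numpy as np

def upscale_nnf(nnf):
    # nnf: (H, W, 2) int array mapping each pixel to a source pixel.
    up = nnf.repeat(2, axis=0).repeat(2, axis=1) * 2
    up[1::2, :, 0] += 1  # odd rows: one pixel lower in the source
    up[:, 1::2, 1] += 1  # odd columns: one pixel further right
    return up
```

Upscaling an identity field yields the identity field at double resolution, which is a quick sanity check of the offsets.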
def apply_nnf_to_image(self, nnf, image):
with cp.cuda.Device(self.gpu_id):
image = self.patch_matchers[-1].pad_image(image)
image = self.patch_matchers[-1].apply_nnf_to_image(nnf, image)
return image
def estimate_nnf(self, source_guide, target_guide, source_style):
with cp.cuda.Device(self.gpu_id):
if not isinstance(source_guide, cp.ndarray):
source_guide = cp.array(source_guide, dtype=cp.float32)
if not isinstance(target_guide, cp.ndarray):
target_guide = cp.array(target_guide, dtype=cp.float32)
if not isinstance(source_style, cp.ndarray):
source_style = cp.array(source_style, dtype=cp.float32)
for level in range(self.pyramid_level):
nnf = self.initialize_nnf(source_guide.shape[0]) if level==0 else self.update_nnf(nnf, level)
source_guide_ = self.resample_image(source_guide, level)
target_guide_ = self.resample_image(target_guide, level)
source_style_ = self.resample_image(source_style, level)
nnf, target_style = self.patch_matchers[level].estimate_nnf(
source_guide_, target_guide_, source_style_, nnf
)
return nnf.get(), target_style.get()


@@ -1,4 +0,0 @@
from .accurate import AccurateModeRunner
from .fast import FastModeRunner
from .balanced import BalancedModeRunner
from .interpolation import InterpolationModeRunner, InterpolationModeSingleFrameRunner


@@ -1,35 +0,0 @@
from ..patch_match import PyramidPatchMatcher
import os
import numpy as np
from PIL import Image
from tqdm import tqdm
class AccurateModeRunner:
def __init__(self):
pass
def run(self, frames_guide, frames_style, batch_size, window_size, ebsynth_config, desc="Accurate Mode", save_path=None):
patch_match_engine = PyramidPatchMatcher(
image_height=frames_style[0].shape[0],
image_width=frames_style[0].shape[1],
channel=3,
use_mean_target_style=True,
**ebsynth_config
)
# run
n = len(frames_style)
for target in tqdm(range(n), desc=desc):
l, r = max(target - window_size, 0), min(target + window_size + 1, n)
remapped_frames = []
for i in range(l, r, batch_size):
j = min(i + batch_size, r)
source_guide = np.stack([frames_guide[source] for source in range(i, j)])
target_guide = np.stack([frames_guide[target]] * (j - i))
source_style = np.stack([frames_style[source] for source in range(i, j)])
_, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style)
remapped_frames.append(target_style)
frame = np.concatenate(remapped_frames, axis=0).mean(axis=0)
frame = frame.clip(0, 255).astype("uint8")
if save_path is not None:
Image.fromarray(frame).save(os.path.join(save_path, "%05d.png" % target))


@@ -1,46 +0,0 @@
from ..patch_match import PyramidPatchMatcher
import os
import numpy as np
from PIL import Image
from tqdm import tqdm
class BalancedModeRunner:
def __init__(self):
pass
def run(self, frames_guide, frames_style, batch_size, window_size, ebsynth_config, desc="Balanced Mode", save_path=None):
patch_match_engine = PyramidPatchMatcher(
image_height=frames_style[0].shape[0],
image_width=frames_style[0].shape[1],
channel=3,
**ebsynth_config
)
# tasks
n = len(frames_style)
tasks = []
for target in range(n):
for source in range(target - window_size, target + window_size + 1):
if source >= 0 and source < n and source != target:
tasks.append((source, target))
# run
frames = [(None, 1) for i in range(n)]
for batch_id in tqdm(range(0, len(tasks), batch_size), desc=desc):
tasks_batch = tasks[batch_id: min(batch_id+batch_size, len(tasks))]
source_guide = np.stack([frames_guide[source] for source, target in tasks_batch])
target_guide = np.stack([frames_guide[target] for source, target in tasks_batch])
source_style = np.stack([frames_style[source] for source, target in tasks_batch])
_, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style)
for (source, target), result in zip(tasks_batch, target_style):
frame, weight = frames[target]
if frame is None:
frame = frames_style[target]
frames[target] = (
frame * (weight / (weight + 1)) + result / (weight + 1),
weight + 1
)
if weight + 1 == min(n, target + window_size + 1) - max(0, target - window_size):
frame = frames[target][0].clip(0, 255).astype("uint8")
if save_path is not None:
Image.fromarray(frame).save(os.path.join(save_path, "%05d.png" % target))
frames[target] = (None, 1)
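The per-frame update `frame * (weight / (weight + 1)) + result / (weight + 1)` is an incremental running mean, which lets the runner blend results batch by batch without storing them all. A sketch of the equivalence (hypothetical helper name):

```python
import numpy as np

def running_mean(values):
    # Incremental form of the arithmetic mean:
    # mean_{k+1} = mean_k * k / (k + 1) + v / (k + 1).
    mean, count = None, 0
    for v in values:
        if mean is None:
            mean, count = np.asarray(v, dtype=np.float64), 1
        else:
            mean = mean * (count / (count + 1)) + np.asarray(v) / (count + 1)
            count += 1
    return mean
```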


@@ -1,141 +0,0 @@
from ..patch_match import PyramidPatchMatcher
import functools, os
import numpy as np
from PIL import Image
from tqdm import tqdm
class TableManager:
def __init__(self):
pass
def task_list(self, n):
tasks = []
max_level = 1
while (1<<max_level)<=n:
max_level += 1
for i in range(n):
j = i
for level in range(max_level):
if i&(1<<level):
continue
j |= 1<<level
if j>=n:
break
meta_data = {
"source": i,
"target": j,
"level": level + 1
}
tasks.append(meta_data)
tasks.sort(key=functools.cmp_to_key(lambda u, v: u["level"]-v["level"]))
return tasks
def build_remapping_table(self, frames_guide, frames_style, patch_match_engine, batch_size, desc=""):
n = len(frames_guide)
tasks = self.task_list(n)
remapping_table = [[(frames_style[i], 1)] for i in range(n)]
for batch_id in tqdm(range(0, len(tasks), batch_size), desc=desc):
tasks_batch = tasks[batch_id: min(batch_id+batch_size, len(tasks))]
source_guide = np.stack([frames_guide[task["source"]] for task in tasks_batch])
target_guide = np.stack([frames_guide[task["target"]] for task in tasks_batch])
source_style = np.stack([frames_style[task["source"]] for task in tasks_batch])
_, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style)
for task, result in zip(tasks_batch, target_style):
target, level = task["target"], task["level"]
if len(remapping_table[target])==level:
remapping_table[target].append((result, 1))
else:
frame, weight = remapping_table[target][level]
remapping_table[target][level] = (
frame * (weight / (weight + 1)) + result / (weight + 1),
weight + 1
)
return remapping_table
def remapping_table_to_blending_table(self, table):
for i in range(len(table)):
for j in range(1, len(table[i])):
frame_1, weight_1 = table[i][j-1]
frame_2, weight_2 = table[i][j]
frame = (frame_1 + frame_2) / 2
weight = weight_1 + weight_2
table[i][j] = (frame, weight)
return table
def tree_query(self, leftbound, rightbound):
node_list = []
node_index = rightbound
while node_index>=leftbound:
node_level = 0
while (1<<node_level)&node_index and node_index-(1<<node_level+1)+1>=leftbound:
node_level += 1
node_list.append((node_index, node_level))
node_index -= 1<<node_level
return node_list
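`tree_query` decomposes a window `[leftbound, rightbound]` into O(log n) precomputed blocks, where a node `(index, level)` stands for the `2^level` frames ending at `index`. Restated standalone (the name `interval_cover` is ours) so the covering property can be checked:

```python
def interval_cover(leftbound, rightbound):
    # Greedily take the largest power-of-two block ending at node_index
    # that still fits inside the window.
    nodes = []
    node_index = rightbound
    while node_index >= leftbound:
        node_level = 0
        while (1 << node_level) & node_index and node_index - (1 << (node_level + 1)) + 1 >= leftbound:
            node_level += 1
        nodes.append((node_index, node_level))
        node_index -= 1 << node_level
    return nodes
```

For `[0, 6]` this yields blocks covering {6}, {4, 5}, and {0..3}: three nodes instead of seven frames.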
def process_window_sum(self, frames_guide, blending_table, patch_match_engine, window_size, batch_size, desc=""):
n = len(blending_table)
tasks = []
frames_result = []
for target in range(n):
node_list = self.tree_query(max(target-window_size, 0), target)
for source, level in node_list:
if source!=target:
meta_data = {
"source": source,
"target": target,
"level": level
}
tasks.append(meta_data)
else:
frames_result.append(blending_table[target][level])
for batch_id in tqdm(range(0, len(tasks), batch_size), desc=desc):
tasks_batch = tasks[batch_id: min(batch_id+batch_size, len(tasks))]
source_guide = np.stack([frames_guide[task["source"]] for task in tasks_batch])
target_guide = np.stack([frames_guide[task["target"]] for task in tasks_batch])
source_style = np.stack([blending_table[task["source"]][task["level"]][0] for task in tasks_batch])
_, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style)
for task, frame_2 in zip(tasks_batch, target_style):
source, target, level = task["source"], task["target"], task["level"]
frame_1, weight_1 = frames_result[target]
weight_2 = blending_table[source][level][1]
weight = weight_1 + weight_2
frame = frame_1 * (weight_1 / weight) + frame_2 * (weight_2 / weight)
frames_result[target] = (frame, weight)
return frames_result
class FastModeRunner:
def __init__(self):
pass
def run(self, frames_guide, frames_style, batch_size, window_size, ebsynth_config, save_path=None):
frames_guide = frames_guide.raw_data()
frames_style = frames_style.raw_data()
table_manager = TableManager()
patch_match_engine = PyramidPatchMatcher(
image_height=frames_style[0].shape[0],
image_width=frames_style[0].shape[1],
channel=3,
**ebsynth_config
)
# left part
table_l = table_manager.build_remapping_table(frames_guide, frames_style, patch_match_engine, batch_size, desc="Fast Mode Step 1/4")
table_l = table_manager.remapping_table_to_blending_table(table_l)
table_l = table_manager.process_window_sum(frames_guide, table_l, patch_match_engine, window_size, batch_size, desc="Fast Mode Step 2/4")
# right part
table_r = table_manager.build_remapping_table(frames_guide[::-1], frames_style[::-1], patch_match_engine, batch_size, desc="Fast Mode Step 3/4")
table_r = table_manager.remapping_table_to_blending_table(table_r)
table_r = table_manager.process_window_sum(frames_guide[::-1], table_r, patch_match_engine, window_size, batch_size, desc="Fast Mode Step 4/4")[::-1]
# merge
frames = []
for (frame_l, weight_l), frame_m, (frame_r, weight_r) in zip(table_l, frames_style, table_r):
weight_m = -1  # the style frame is counted once in both table_l and table_r; subtract the duplicate
weight = weight_l + weight_m + weight_r
frame = frame_l * (weight_l / weight) + frame_m * (weight_m / weight) + frame_r * (weight_r / weight)
frames.append(frame)
frames = [frame.clip(0, 255).astype("uint8") for frame in frames]
if save_path is not None:
for target, frame in enumerate(frames):
Image.fromarray(frame).save(os.path.join(save_path, "%05d.png" % target))


@@ -1,121 +0,0 @@
from ..patch_match import PyramidPatchMatcher
import os
import numpy as np
from PIL import Image
from tqdm import tqdm
class InterpolationModeRunner:
def __init__(self):
pass
def get_index_dict(self, index_style):
index_dict = {}
for i, index in enumerate(index_style):
index_dict[index] = i
return index_dict
def get_weight(self, l, m, r):
weight_l, weight_r = abs(m - r), abs(m - l)
if weight_l + weight_r == 0:
weight_l, weight_r = 0.5, 0.5
else:
weight_l, weight_r = weight_l / (weight_l + weight_r), weight_r / (weight_l + weight_r)
return weight_l, weight_r
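The blending weights are inverse-distance: the keyframe closer to frame `m` gets the larger weight, since `weight_l` is proportional to `|m - r|` and `weight_r` to `|m - l|`. Restated standalone (hypothetical helper name):

```python
def interp_weights(l, m, r):
    # weight_l grows as m approaches l (i.e., moves away from r), and vice versa.
    weight_l, weight_r = abs(m - r), abs(m - l)
    total = weight_l + weight_r
    if total == 0:
        return 0.5, 0.5  # degenerate case: l == m == r
    return weight_l / total, weight_r / total
```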
def get_task_group(self, index_style, n):
task_group = []
index_style = sorted(index_style)
# first frame
if index_style[0]>0:
tasks = []
for m in range(index_style[0]):
tasks.append((index_style[0], m, index_style[0]))
task_group.append(tasks)
# middle frames
for l, r in zip(index_style[:-1], index_style[1:]):
tasks = []
for m in range(l, r):
tasks.append((l, m, r))
task_group.append(tasks)
# last frame
tasks = []
for m in range(index_style[-1], n):
tasks.append((index_style[-1], m, index_style[-1]))
task_group.append(tasks)
return task_group
def run(self, frames_guide, frames_style, index_style, batch_size, ebsynth_config, save_path=None):
patch_match_engine = PyramidPatchMatcher(
image_height=frames_style[0].shape[0],
image_width=frames_style[0].shape[1],
channel=3,
use_mean_target_style=False,
use_pairwise_patch_error=True,
**ebsynth_config
)
# task
index_dict = self.get_index_dict(index_style)
task_group = self.get_task_group(index_style, len(frames_guide))
# run
for tasks in task_group:
index_start, index_end = min([i[1] for i in tasks]), max([i[1] for i in tasks])
for batch_id in tqdm(range(0, len(tasks), batch_size), desc=f"Rendering frames {index_start}...{index_end}"):
tasks_batch = tasks[batch_id: min(batch_id+batch_size, len(tasks))]
source_guide, target_guide, source_style = [], [], []
for l, m, r in tasks_batch:
# l -> m
source_guide.append(frames_guide[l])
target_guide.append(frames_guide[m])
source_style.append(frames_style[index_dict[l]])
# r -> m
source_guide.append(frames_guide[r])
target_guide.append(frames_guide[m])
source_style.append(frames_style[index_dict[r]])
source_guide = np.stack(source_guide)
target_guide = np.stack(target_guide)
source_style = np.stack(source_style)
_, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style)
if save_path is not None:
for frame_l, frame_r, (l, m, r) in zip(target_style[0::2], target_style[1::2], tasks_batch):
weight_l, weight_r = self.get_weight(l, m, r)
frame = frame_l * weight_l + frame_r * weight_r
frame = frame.clip(0, 255).astype("uint8")
Image.fromarray(frame).save(os.path.join(save_path, "%05d.png" % m))
class InterpolationModeSingleFrameRunner:
def __init__(self):
pass
def run(self, frames_guide, frames_style, index_style, batch_size, ebsynth_config, save_path=None):
# check input
tracking_window_size = ebsynth_config["tracking_window_size"]
if tracking_window_size * 2 >= batch_size:
raise ValueError("batch_size should be larger than tracking_window_size * 2")
frame_style = frames_style[0]
frame_guide = frames_guide[index_style[0]]
patch_match_engine = PyramidPatchMatcher(
image_height=frame_style.shape[0],
image_width=frame_style.shape[1],
channel=3,
**ebsynth_config
)
# run
frame_id, n = 0, len(frames_guide)
for i in tqdm(range(0, n, batch_size - tracking_window_size * 2), desc=f"Rendering frames 0...{n}"):
if i + batch_size > n:
l, r = max(n - batch_size, 0), n
else:
l, r = i, i + batch_size
source_guide = np.stack([frame_guide] * (r-l))
target_guide = np.stack([frames_guide[i] for i in range(l, r)])
source_style = np.stack([frame_style] * (r-l))
_, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style)
for idx, frame in zip(range(l, r), target_style):
if idx == frame_id:
frame = frame.clip(0, 255).astype("uint8")
Image.fromarray(frame).save(os.path.join(save_path, "%05d.png" % frame_id))
frame_id += 1
if r < n and r-frame_id <= tracking_window_size:
break


@@ -1,241 +0,0 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from PIL import Image
def warp(tenInput, tenFlow, device):
backwarp_tenGrid = {}
k = (str(tenFlow.device), str(tenFlow.size()))
if k not in backwarp_tenGrid:
tenHorizontal = torch.linspace(-1.0, 1.0, tenFlow.shape[3], device=device).view(
1, 1, 1, tenFlow.shape[3]).expand(tenFlow.shape[0], -1, tenFlow.shape[2], -1)
tenVertical = torch.linspace(-1.0, 1.0, tenFlow.shape[2], device=device).view(
1, 1, tenFlow.shape[2], 1).expand(tenFlow.shape[0], -1, -1, tenFlow.shape[3])
backwarp_tenGrid[k] = torch.cat(
[tenHorizontal, tenVertical], 1).to(device)
tenFlow = torch.cat([tenFlow[:, 0:1, :, :] / ((tenInput.shape[3] - 1.0) / 2.0),
tenFlow[:, 1:2, :, :] / ((tenInput.shape[2] - 1.0) / 2.0)], 1)
g = (backwarp_tenGrid[k] + tenFlow).permute(0, 2, 3, 1)
return torch.nn.functional.grid_sample(input=tenInput, grid=g, mode='bilinear', padding_mode='border', align_corners=True)
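`grid_sample` expects sampling coordinates in `[-1, 1]`, so `warp` rescales pixel-space flow by `(size - 1) / 2` per axis, matching `align_corners=True` where the corner pixels sit exactly at -1 and 1. A plain-Python sketch of that normalization (hypothetical helper name):

```python
def normalize_flow(flow_x_pixels, flow_y_pixels, width, height):
    # A displacement of (width - 1) / 2 pixels spans half the normalized
    # [-1, 1] grid under align_corners=True.
    return (
        flow_x_pixels / ((width - 1) / 2.0),
        flow_y_pixels / ((height - 1) / 2.0),
    )
```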
def conv(in_planes, out_planes, kernel_size=3, stride=1, padding=1, dilation=1):
return nn.Sequential(
nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride,
padding=padding, dilation=dilation, bias=True),
nn.PReLU(out_planes)
)
class IFBlock(nn.Module):
def __init__(self, in_planes, c=64):
super(IFBlock, self).__init__()
self.conv0 = nn.Sequential(conv(in_planes, c//2, 3, 2, 1), conv(c//2, c, 3, 2, 1),)
self.convblock0 = nn.Sequential(conv(c, c), conv(c, c))
self.convblock1 = nn.Sequential(conv(c, c), conv(c, c))
self.convblock2 = nn.Sequential(conv(c, c), conv(c, c))
self.convblock3 = nn.Sequential(conv(c, c), conv(c, c))
self.conv1 = nn.Sequential(nn.ConvTranspose2d(c, c//2, 4, 2, 1), nn.PReLU(c//2), nn.ConvTranspose2d(c//2, 4, 4, 2, 1))
self.conv2 = nn.Sequential(nn.ConvTranspose2d(c, c//2, 4, 2, 1), nn.PReLU(c//2), nn.ConvTranspose2d(c//2, 1, 4, 2, 1))
def forward(self, x, flow, scale=1):
x = F.interpolate(x, scale_factor= 1. / scale, mode="bilinear", align_corners=False, recompute_scale_factor=False)
flow = F.interpolate(flow, scale_factor= 1. / scale, mode="bilinear", align_corners=False, recompute_scale_factor=False) * 1. / scale
feat = self.conv0(torch.cat((x, flow), 1))
feat = self.convblock0(feat) + feat
feat = self.convblock1(feat) + feat
feat = self.convblock2(feat) + feat
feat = self.convblock3(feat) + feat
flow = self.conv1(feat)
mask = self.conv2(feat)
flow = F.interpolate(flow, scale_factor=scale, mode="bilinear", align_corners=False, recompute_scale_factor=False) * scale
mask = F.interpolate(mask, scale_factor=scale, mode="bilinear", align_corners=False, recompute_scale_factor=False)
return flow, mask
class IFNet(nn.Module):
def __init__(self):
super(IFNet, self).__init__()
self.block0 = IFBlock(7+4, c=90)
self.block1 = IFBlock(7+4, c=90)
self.block2 = IFBlock(7+4, c=90)
self.block_tea = IFBlock(10+4, c=90)
def forward(self, x, scale_list=[4, 2, 1], training=False):
if not training:
channel = x.shape[1] // 2
img0 = x[:, :channel]
img1 = x[:, channel:]
flow_list = []
merged = []
mask_list = []
warped_img0 = img0
warped_img1 = img1
flow = (x[:, :4]).detach() * 0
mask = (x[:, :1]).detach() * 0
block = [self.block0, self.block1, self.block2]
for i in range(3):
f0, m0 = block[i](torch.cat((warped_img0[:, :3], warped_img1[:, :3], mask), 1), flow, scale=scale_list[i])
f1, m1 = block[i](torch.cat((warped_img1[:, :3], warped_img0[:, :3], -mask), 1), torch.cat((flow[:, 2:4], flow[:, :2]), 1), scale=scale_list[i])
flow = flow + (f0 + torch.cat((f1[:, 2:4], f1[:, :2]), 1)) / 2
mask = mask + (m0 + (-m1)) / 2
mask_list.append(mask)
flow_list.append(flow)
warped_img0 = warp(img0, flow[:, :2], device=x.device)
warped_img1 = warp(img1, flow[:, 2:4], device=x.device)
merged.append((warped_img0, warped_img1))
'''
c0 = self.contextnet(img0, flow[:, :2])
c1 = self.contextnet(img1, flow[:, 2:4])
tmp = self.unet(img0, img1, warped_img0, warped_img1, mask, flow, c0, c1)
res = tmp[:, 1:4] * 2 - 1
'''
for i in range(3):
mask_list[i] = torch.sigmoid(mask_list[i])
merged[i] = merged[i][0] * mask_list[i] + merged[i][1] * (1 - mask_list[i])
return flow_list, mask_list[2], merged
def state_dict_converter(self):
return IFNetStateDictConverter()
class IFNetStateDictConverter:
def __init__(self):
pass
def from_diffusers(self, state_dict):
state_dict_ = {k.replace("module.", ""): v for k, v in state_dict.items()}
return state_dict_
def from_civitai(self, state_dict):
return self.from_diffusers(state_dict)
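The converter above strips a DataParallel-style `module.` prefix from checkpoint keys. A minimal standalone sketch of the same idea (`strip_prefix` is a hypothetical helper, and it uses `startswith` rather than `replace` so only a leading prefix is removed):

```python
def strip_prefix(state_dict, prefix="module."):
    # Drop a DataParallel-style prefix from every key; keys without it pass through.
    return {k[len(prefix):] if k.startswith(prefix) else k: v
            for k, v in state_dict.items()}

print(strip_prefix({"module.conv.weight": 1, "bias": 2}))
# → {'conv.weight': 1, 'bias': 2}
```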
class RIFEInterpolater:
def __init__(self, model, device="cuda"):
self.model = model
self.device = device
# IFNet does not support float16; run it in float32.
self.torch_dtype = torch.float32
@staticmethod
def from_model_manager(model_manager):
return RIFEInterpolater(model_manager.RIFE, device=model_manager.device)
def process_image(self, image):
width, height = image.size
if width % 32 != 0 or height % 32 != 0:
width = (width + 31) // 32 * 32
height = (height + 31) // 32 * 32
image = image.resize((width, height))
image = torch.Tensor(np.array(image, dtype=np.float32)[:, :, [2,1,0]] / 255).permute(2, 0, 1)
return image
def process_images(self, images):
images = [self.process_image(image) for image in images]
images = torch.stack(images)
return images
def decode_images(self, images):
images = (images[:, [2,1,0]].permute(0, 2, 3, 1) * 255).clip(0, 255).numpy().astype(np.uint8)
images = [Image.fromarray(image) for image in images]
return images
def add_interpolated_images(self, images, interpolated_images):
output_images = []
for image, interpolated_image in zip(images, interpolated_images):
output_images.append(image)
output_images.append(interpolated_image)
output_images.append(images[-1])
return output_images
@torch.no_grad()
def interpolate_(self, images, scale=1.0):
input_tensor = self.process_images(images)
input_tensor = torch.cat((input_tensor[:-1], input_tensor[1:]), dim=1)
input_tensor = input_tensor.to(device=self.device, dtype=self.torch_dtype)
flow, mask, merged = self.model(input_tensor, [4/scale, 2/scale, 1/scale])
output_images = self.decode_images(merged[2].cpu())
if output_images[0].size != images[0].size:
output_images = [image.resize(images[0].size) for image in output_images]
return output_images
@torch.no_grad()
def interpolate(self, images, scale=1.0, batch_size=4, num_iter=1, progress_bar=lambda x:x):
# Preprocess
processed_images = self.process_images(images)
for iter in range(num_iter):
# Input
input_tensor = torch.cat((processed_images[:-1], processed_images[1:]), dim=1)
# Interpolate
output_tensor = []
for batch_id in progress_bar(range(0, input_tensor.shape[0], batch_size)):
batch_id_ = min(batch_id + batch_size, input_tensor.shape[0])
batch_input_tensor = input_tensor[batch_id: batch_id_]
batch_input_tensor = batch_input_tensor.to(device=self.device, dtype=self.torch_dtype)
flow, mask, merged = self.model(batch_input_tensor, [4/scale, 2/scale, 1/scale])
output_tensor.append(merged[2].cpu())
# Output
output_tensor = torch.concat(output_tensor, dim=0).clip(0, 1)
processed_images = self.add_interpolated_images(processed_images, output_tensor)
processed_images = torch.stack(processed_images)
# To images
output_images = self.decode_images(processed_images)
if output_images[0].size != images[0].size:
output_images = [image.resize(images[0].size) for image in output_images]
return output_images
class RIFESmoother(RIFEInterpolater):
def __init__(self, model, device="cuda"):
super(RIFESmoother, self).__init__(model, device=device)
@staticmethod
def from_model_manager(model_manager):
return RIFESmoother(model_manager.RIFE, device=model_manager.device)
def process_tensors(self, input_tensor, scale=1.0, batch_size=4):
output_tensor = []
for batch_id in range(0, input_tensor.shape[0], batch_size):
batch_id_ = min(batch_id + batch_size, input_tensor.shape[0])
batch_input_tensor = input_tensor[batch_id: batch_id_]
batch_input_tensor = batch_input_tensor.to(device=self.device, dtype=self.torch_dtype)
flow, mask, merged = self.model(batch_input_tensor, [4/scale, 2/scale, 1/scale])
output_tensor.append(merged[2].cpu())
output_tensor = torch.concat(output_tensor, dim=0)
return output_tensor
@torch.no_grad()
def __call__(self, rendered_frames, scale=1.0, batch_size=4, num_iter=1, **kwargs):
# Preprocess
processed_images = self.process_images(rendered_frames)
for iter in range(num_iter):
# Input
input_tensor = torch.cat((processed_images[:-2], processed_images[2:]), dim=1)
# Interpolate
output_tensor = self.process_tensors(input_tensor, scale=scale, batch_size=batch_size)
# Blend
input_tensor = torch.cat((processed_images[1:-1], output_tensor), dim=1)
output_tensor = self.process_tensors(input_tensor, scale=scale, batch_size=batch_size)
# Add to frames
processed_images[1:-1] = output_tensor
# To images
output_images = self.decode_images(processed_images)
if output_images[0].size != rendered_frames[0].size:
output_images = [image.resize(rendered_frames[0].size) for image in output_images]
return output_images
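`process_image` above requires frame dimensions divisible by 32 before feeding them to IFNet. The round-up itself can be sketched in plain Python (`round_up` is a hypothetical helper, not part of the repository):

```python
def round_up(value, multiple=32):
    # Smallest multiple of `multiple` that is >= value.
    return (value + multiple - 1) // multiple * multiple

print(round_up(720), round_up(1280))  # → 736 1280
```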


@@ -1,482 +0,0 @@
import torch, os
from safetensors import safe_open
from .sd_text_encoder import SDTextEncoder
from .sd_unet import SDUNet
from .sd_vae_encoder import SDVAEEncoder
from .sd_vae_decoder import SDVAEDecoder
from .sd_lora import SDLoRA
from .sdxl_text_encoder import SDXLTextEncoder, SDXLTextEncoder2
from .sdxl_unet import SDXLUNet
from .sdxl_vae_decoder import SDXLVAEDecoder
from .sdxl_vae_encoder import SDXLVAEEncoder
from .sd_controlnet import SDControlNet
from .sd_motion import SDMotionModel
from .sdxl_motion import SDXLMotionModel
from .svd_image_encoder import SVDImageEncoder
from .svd_unet import SVDUNet
from .svd_vae_decoder import SVDVAEDecoder
from .svd_vae_encoder import SVDVAEEncoder
from .sd_ipadapter import SDIpAdapter, IpAdapterCLIPImageEmbedder
from .sdxl_ipadapter import SDXLIpAdapter, IpAdapterXLCLIPImageEmbedder
from .hunyuan_dit_text_encoder import HunyuanDiTCLIPTextEncoder, HunyuanDiTT5TextEncoder
from .hunyuan_dit import HunyuanDiT
class ModelManager:
def __init__(self, torch_dtype=torch.float16, device="cuda"):
self.torch_dtype = torch_dtype
self.device = device
self.model = {}
self.model_path = {}
self.textual_inversion_dict = {}
def is_stable_video_diffusion(self, state_dict):
param_name = "model.diffusion_model.output_blocks.9.1.time_stack.0.norm_in.weight"
return param_name in state_dict
def is_RIFE(self, state_dict):
param_name = "block_tea.convblock3.0.1.weight"
return param_name in state_dict or ("module." + param_name) in state_dict
def is_beautiful_prompt(self, state_dict):
param_name = "transformer.h.9.self_attention.query_key_value.weight"
return param_name in state_dict
def is_stabe_diffusion_xl(self, state_dict):
param_name = "conditioner.embedders.0.transformer.text_model.embeddings.position_embedding.weight"
return param_name in state_dict
def is_stable_diffusion(self, state_dict):
if self.is_stabe_diffusion_xl(state_dict):
return False
param_name = "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.norm3.weight"
return param_name in state_dict
def is_controlnet(self, state_dict):
param_name = "control_model.time_embed.0.weight"
param_name_2 = "mid_block.resnets.1.time_emb_proj.weight" # For controlnets in diffusers format
return param_name in state_dict or param_name_2 in state_dict
def is_animatediff(self, state_dict):
param_name = "mid_block.motion_modules.0.temporal_transformer.proj_out.weight"
return param_name in state_dict
def is_animatediff_xl(self, state_dict):
param_name = "up_blocks.2.motion_modules.2.temporal_transformer.transformer_blocks.0.ff_norm.weight"
return param_name in state_dict
def is_sd_lora(self, state_dict):
param_name = "lora_unet_up_blocks_3_attentions_2_transformer_blocks_0_ff_net_2.lora_up.weight"
return param_name in state_dict
def is_translator(self, state_dict):
param_name = "model.encoder.layers.5.self_attn_layer_norm.weight"
return param_name in state_dict and len(state_dict) == 254
def is_ipadapter(self, state_dict):
return "image_proj" in state_dict and "ip_adapter" in state_dict and state_dict["image_proj"]["proj.weight"].shape == torch.Size([3072, 1024])
def is_ipadapter_image_encoder(self, state_dict):
param_name = "vision_model.encoder.layers.31.self_attn.v_proj.weight"
return param_name in state_dict and len(state_dict) == 521
def is_ipadapter_xl(self, state_dict):
return "image_proj" in state_dict and "ip_adapter" in state_dict and state_dict["image_proj"]["proj.weight"].shape == torch.Size([8192, 1280])
def is_ipadapter_xl_image_encoder(self, state_dict):
param_name = "vision_model.encoder.layers.47.self_attn.v_proj.weight"
return param_name in state_dict and len(state_dict) == 777
def is_hunyuan_dit_clip_text_encoder(self, state_dict):
param_name = "bert.encoder.layer.23.attention.output.dense.weight"
return param_name in state_dict
def is_hunyuan_dit_t5_text_encoder(self, state_dict):
param_name = "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"
return param_name in state_dict
def is_hunyuan_dit(self, state_dict):
param_name = "final_layer.adaLN_modulation.1.weight"
return param_name in state_dict
def is_diffusers_vae(self, state_dict):
param_name = "quant_conv.weight"
return param_name in state_dict
def is_ExVideo_StableVideoDiffusion(self, state_dict):
param_name = "blocks.185.positional_embedding.embeddings"
return param_name in state_dict
def load_stable_video_diffusion(self, state_dict, components=None, file_path="", add_positional_conv=None):
component_dict = {
"image_encoder": SVDImageEncoder,
"unet": SVDUNet,
"vae_decoder": SVDVAEDecoder,
"vae_encoder": SVDVAEEncoder,
}
if components is None:
components = ["image_encoder", "unet", "vae_decoder", "vae_encoder"]
for component in components:
if component == "unet":
self.model[component] = component_dict[component](add_positional_conv=add_positional_conv)
self.model[component].load_state_dict(self.model[component].state_dict_converter().from_civitai(state_dict, add_positional_conv=add_positional_conv), strict=False)
else:
self.model[component] = component_dict[component]()
self.model[component].load_state_dict(self.model[component].state_dict_converter().from_civitai(state_dict))
self.model[component].to(self.torch_dtype).to(self.device)
self.model_path[component] = file_path
def load_stable_diffusion(self, state_dict, components=None, file_path=""):
component_dict = {
"text_encoder": SDTextEncoder,
"unet": SDUNet,
"vae_decoder": SDVAEDecoder,
"vae_encoder": SDVAEEncoder,
"refiner": SDXLUNet,
}
if components is None:
components = ["text_encoder", "unet", "vae_decoder", "vae_encoder"]
for component in components:
if component == "text_encoder":
# Add additional token embeddings to text encoder
token_embeddings = [state_dict["cond_stage_model.transformer.text_model.embeddings.token_embedding.weight"]]
for keyword in self.textual_inversion_dict:
_, embeddings = self.textual_inversion_dict[keyword]
token_embeddings.append(embeddings.to(dtype=token_embeddings[0].dtype))
token_embeddings = torch.concat(token_embeddings, dim=0)
state_dict["cond_stage_model.transformer.text_model.embeddings.token_embedding.weight"] = token_embeddings
self.model[component] = component_dict[component](vocab_size=token_embeddings.shape[0])
self.model[component].load_state_dict(self.model[component].state_dict_converter().from_civitai(state_dict))
self.model[component].to(self.torch_dtype).to(self.device)
else:
self.model[component] = component_dict[component]()
self.model[component].load_state_dict(self.model[component].state_dict_converter().from_civitai(state_dict))
self.model[component].to(self.torch_dtype).to(self.device)
self.model_path[component] = file_path
def load_stable_diffusion_xl(self, state_dict, components=None, file_path=""):
component_dict = {
"text_encoder": SDXLTextEncoder,
"text_encoder_2": SDXLTextEncoder2,
"unet": SDXLUNet,
"vae_decoder": SDXLVAEDecoder,
"vae_encoder": SDXLVAEEncoder,
}
if components is None:
components = ["text_encoder", "text_encoder_2", "unet", "vae_decoder", "vae_encoder"]
for component in components:
self.model[component] = component_dict[component]()
self.model[component].load_state_dict(self.model[component].state_dict_converter().from_civitai(state_dict))
if component in ["vae_decoder", "vae_encoder"]:
# These two models output NaN when float16 is enabled.
# The precision problem happens in the last three resnet blocks.
# I do not know how to solve this problem.
self.model[component].to(torch.float32).to(self.device)
else:
self.model[component].to(self.torch_dtype).to(self.device)
self.model_path[component] = file_path
def load_controlnet(self, state_dict, file_path=""):
component = "controlnet"
if component not in self.model:
self.model[component] = []
self.model_path[component] = []
model = SDControlNet()
model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
model.to(self.torch_dtype).to(self.device)
self.model[component].append(model)
self.model_path[component].append(file_path)
def load_animatediff(self, state_dict, file_path="", add_positional_conv=None):
component = "motion_modules"
model = SDMotionModel(add_positional_conv=add_positional_conv)
model.load_state_dict(model.state_dict_converter().from_civitai(state_dict, add_positional_conv=add_positional_conv))
model.to(self.torch_dtype).to(self.device)
self.model[component] = model
self.model_path[component] = file_path
def load_animatediff_xl(self, state_dict, file_path=""):
component = "motion_modules_xl"
model = SDXLMotionModel()
model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
model.to(self.torch_dtype).to(self.device)
self.model[component] = model
self.model_path[component] = file_path
def load_beautiful_prompt(self, state_dict, file_path=""):
component = "beautiful_prompt"
from transformers import AutoModelForCausalLM
model_folder = os.path.dirname(file_path)
model = AutoModelForCausalLM.from_pretrained(
model_folder, state_dict=state_dict, local_files_only=True, torch_dtype=self.torch_dtype
).to(self.device).eval()
self.model[component] = model
self.model_path[component] = file_path
def load_RIFE(self, state_dict, file_path=""):
component = "RIFE"
from ..extensions.RIFE import IFNet
model = IFNet().eval()
model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
model.to(torch.float32).to(self.device)
self.model[component] = model
self.model_path[component] = file_path
def load_sd_lora(self, state_dict, alpha):
SDLoRA().add_lora_to_text_encoder(self.model["text_encoder"], state_dict, alpha=alpha, device=self.device)
SDLoRA().add_lora_to_unet(self.model["unet"], state_dict, alpha=alpha, device=self.device)
def load_translator(self, state_dict, file_path=""):
# This model is lightweight, so we do not place it on the GPU.
component = "translator"
from transformers import AutoModelForSeq2SeqLM
model_folder = os.path.dirname(file_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_folder).eval()
self.model[component] = model
self.model_path[component] = file_path
def load_ipadapter(self, state_dict, file_path=""):
component = "ipadapter"
model = SDIpAdapter()
model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
model.to(self.torch_dtype).to(self.device)
self.model[component] = model
self.model_path[component] = file_path
def load_ipadapter_image_encoder(self, state_dict, file_path=""):
component = "ipadapter_image_encoder"
model = IpAdapterCLIPImageEmbedder()
model.load_state_dict(model.state_dict_converter().from_diffusers(state_dict))
model.to(self.torch_dtype).to(self.device)
self.model[component] = model
self.model_path[component] = file_path
def load_ipadapter_xl(self, state_dict, file_path=""):
component = "ipadapter_xl"
model = SDXLIpAdapter()
model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
model.to(self.torch_dtype).to(self.device)
self.model[component] = model
self.model_path[component] = file_path
def load_ipadapter_xl_image_encoder(self, state_dict, file_path=""):
component = "ipadapter_xl_image_encoder"
model = IpAdapterXLCLIPImageEmbedder()
model.load_state_dict(model.state_dict_converter().from_diffusers(state_dict))
model.to(self.torch_dtype).to(self.device)
self.model[component] = model
self.model_path[component] = file_path
def load_hunyuan_dit_clip_text_encoder(self, state_dict, file_path=""):
component = "hunyuan_dit_clip_text_encoder"
model = HunyuanDiTCLIPTextEncoder()
model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
model.to(self.torch_dtype).to(self.device)
self.model[component] = model
self.model_path[component] = file_path
def load_hunyuan_dit_t5_text_encoder(self, state_dict, file_path=""):
component = "hunyuan_dit_t5_text_encoder"
model = HunyuanDiTT5TextEncoder()
model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
model.to(self.torch_dtype).to(self.device)
self.model[component] = model
self.model_path[component] = file_path
def load_hunyuan_dit(self, state_dict, file_path=""):
component = "hunyuan_dit"
model = HunyuanDiT()
model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
model.to(self.torch_dtype).to(self.device)
self.model[component] = model
self.model_path[component] = file_path
def load_diffusers_vae(self, state_dict, file_path=""):
# TODO: detect SD and SDXL
component = "vae_encoder"
model = SDXLVAEEncoder()
model.load_state_dict(model.state_dict_converter().from_diffusers(state_dict))
model.to(self.torch_dtype).to(self.device)
self.model[component] = model
self.model_path[component] = file_path
component = "vae_decoder"
model = SDXLVAEDecoder()
model.load_state_dict(model.state_dict_converter().from_diffusers(state_dict))
model.to(self.torch_dtype).to(self.device)
self.model[component] = model
self.model_path[component] = file_path
def load_ExVideo_StableVideoDiffusion(self, state_dict, file_path=""):
unet_state_dict = self.model["unet"].state_dict()
self.model["unet"].to("cpu")
del self.model["unet"]
add_positional_conv = state_dict["blocks.185.positional_embedding.embeddings"].shape[0]
self.model["unet"] = SVDUNet(add_positional_conv=add_positional_conv)
self.model["unet"].load_state_dict(unet_state_dict, strict=False)
self.model["unet"].load_state_dict(state_dict, strict=False)
self.model["unet"].to(self.torch_dtype).to(self.device)
def search_for_embeddings(self, state_dict):
embeddings = []
for k in state_dict:
if isinstance(state_dict[k], torch.Tensor):
embeddings.append(state_dict[k])
elif isinstance(state_dict[k], dict):
embeddings += self.search_for_embeddings(state_dict[k])
return embeddings
def load_textual_inversions(self, folder):
# Store additional tokens here
self.textual_inversion_dict = {}
# Load every textual inversion file
for file_name in os.listdir(folder):
if file_name.endswith(".txt"):
continue
keyword = os.path.splitext(file_name)[0]
state_dict = load_state_dict(os.path.join(folder, file_name))
# Search for embeddings
for embeddings in self.search_for_embeddings(state_dict):
if len(embeddings.shape) == 2 and embeddings.shape[1] == 768:
tokens = [f"{keyword}_{i}" for i in range(embeddings.shape[0])]
self.textual_inversion_dict[keyword] = (tokens, embeddings)
break
def load_model(self, file_path, components=None, lora_alphas=[]):
state_dict = load_state_dict(file_path, torch_dtype=self.torch_dtype)
if self.is_stable_video_diffusion(state_dict):
self.load_stable_video_diffusion(state_dict, file_path=file_path)
elif self.is_animatediff(state_dict):
self.load_animatediff(state_dict, file_path=file_path)
elif self.is_animatediff_xl(state_dict):
self.load_animatediff_xl(state_dict, file_path=file_path)
elif self.is_controlnet(state_dict):
self.load_controlnet(state_dict, file_path=file_path)
elif self.is_stabe_diffusion_xl(state_dict):
self.load_stable_diffusion_xl(state_dict, components=components, file_path=file_path)
elif self.is_stable_diffusion(state_dict):
self.load_stable_diffusion(state_dict, components=components, file_path=file_path)
elif self.is_sd_lora(state_dict):
self.load_sd_lora(state_dict, alpha=lora_alphas.pop(0))
elif self.is_beautiful_prompt(state_dict):
self.load_beautiful_prompt(state_dict, file_path=file_path)
elif self.is_RIFE(state_dict):
self.load_RIFE(state_dict, file_path=file_path)
elif self.is_translator(state_dict):
self.load_translator(state_dict, file_path=file_path)
elif self.is_ipadapter(state_dict):
self.load_ipadapter(state_dict, file_path=file_path)
elif self.is_ipadapter_image_encoder(state_dict):
self.load_ipadapter_image_encoder(state_dict, file_path=file_path)
elif self.is_ipadapter_xl(state_dict):
self.load_ipadapter_xl(state_dict, file_path=file_path)
elif self.is_ipadapter_xl_image_encoder(state_dict):
self.load_ipadapter_xl_image_encoder(state_dict, file_path=file_path)
elif self.is_hunyuan_dit_clip_text_encoder(state_dict):
self.load_hunyuan_dit_clip_text_encoder(state_dict, file_path=file_path)
elif self.is_hunyuan_dit_t5_text_encoder(state_dict):
self.load_hunyuan_dit_t5_text_encoder(state_dict, file_path=file_path)
elif self.is_hunyuan_dit(state_dict):
self.load_hunyuan_dit(state_dict, file_path=file_path)
elif self.is_diffusers_vae(state_dict):
self.load_diffusers_vae(state_dict, file_path=file_path)
elif self.is_ExVideo_StableVideoDiffusion(state_dict):
self.load_ExVideo_StableVideoDiffusion(state_dict, file_path=file_path)
def load_models(self, file_path_list, lora_alphas=[]):
for file_path in file_path_list:
self.load_model(file_path, lora_alphas=lora_alphas)
def to(self, device):
for component in self.model:
if isinstance(self.model[component], list):
for model in self.model[component]:
model.to(device)
else:
self.model[component].to(device)
torch.cuda.empty_cache()
def get_model_with_model_path(self, model_path):
for component in self.model_path:
if isinstance(self.model_path[component], str):
if os.path.samefile(self.model_path[component], model_path):
return self.model[component]
elif isinstance(self.model_path[component], list):
for i, model_path_ in enumerate(self.model_path[component]):
if os.path.samefile(model_path_, model_path):
return self.model[component][i]
raise ValueError(f"Please load model {model_path} before you use it.")
def __getattr__(self, __name):
if __name in self.model:
return self.model[__name]
else:
return super().__getattribute__(__name)
def load_state_dict(file_path, torch_dtype=None):
if file_path.endswith(".safetensors"):
return load_state_dict_from_safetensors(file_path, torch_dtype=torch_dtype)
else:
return load_state_dict_from_bin(file_path, torch_dtype=torch_dtype)
def load_state_dict_from_safetensors(file_path, torch_dtype=None):
state_dict = {}
with safe_open(file_path, framework="pt", device="cpu") as f:
for k in f.keys():
state_dict[k] = f.get_tensor(k)
if torch_dtype is not None:
state_dict[k] = state_dict[k].to(torch_dtype)
return state_dict
def load_state_dict_from_bin(file_path, torch_dtype=None):
state_dict = torch.load(file_path, map_location="cpu")
if torch_dtype is not None:
for i in state_dict:
if isinstance(state_dict[i], torch.Tensor):
state_dict[i] = state_dict[i].to(torch_dtype)
return state_dict
def search_parameter(param, state_dict):
for name, param_ in state_dict.items():
if param.numel() == param_.numel():
if param.shape == param_.shape:
if torch.dist(param, param_) < 1e-6:
return name
else:
if torch.dist(param.flatten(), param_.flatten()) < 1e-6:
return name
return None
def build_rename_dict(source_state_dict, target_state_dict, split_qkv=False):
matched_keys = set()
with torch.no_grad():
for name in source_state_dict:
rename = search_parameter(source_state_dict[name], target_state_dict)
if rename is not None:
print(f'"{name}": "{rename}",')
matched_keys.add(rename)
elif split_qkv and len(source_state_dict[name].shape)>=1 and source_state_dict[name].shape[0]%3==0:
length = source_state_dict[name].shape[0] // 3
rename = []
for i in range(3):
rename.append(search_parameter(source_state_dict[name][i*length: i*length+length], target_state_dict))
if None not in rename:
print(f'"{name}": {rename},')
for rename_ in rename:
matched_keys.add(rename_)
for name in target_state_dict:
if name not in matched_keys:
print("Cannot find", name, target_state_dict[name].shape)
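`search_parameter` and `build_rename_dict` above match parameters between two checkpoints purely by value (same element count, near-zero distance). A toy sketch of the same idea over plain lists instead of tensors; `build_rename_map` and the example keys are hypothetical:

```python
def build_rename_map(source, target, tol=1e-6):
    # Match each source parameter to the first target parameter with (near-)equal values.
    rename = {}
    for s_name, s_vals in source.items():
        for t_name, t_vals in target.items():
            if len(s_vals) == len(t_vals) and all(abs(a - b) <= tol for a, b in zip(s_vals, t_vals)):
                rename[s_name] = t_name
                break
    return rename

src = {"conv.weight": [1.0, 2.0], "conv.bias": [0.5]}
tgt = {"block.0.weight": [1.0, 2.0], "block.0.bias": [0.5]}
print(build_rename_map(src, tgt))
# → {'conv.weight': 'block.0.weight', 'conv.bias': 'block.0.bias'}
```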


@@ -0,0 +1,695 @@
# Copyright 2025 The ACE-Step Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import Optional
import torch
import torch.nn.functional as F
from torch import nn
from einops import rearrange
from ..core.attention import attention_forward
from ..core.gradient import gradient_checkpoint_forward
from transformers.cache_utils import Cache
from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
from transformers.modeling_outputs import BaseModelOutput
from transformers.processing_utils import Unpack
from transformers.utils import can_return_tuple, logging
from transformers.models.qwen3.modeling_qwen3 import (
Qwen3MLP,
Qwen3RMSNorm,
Qwen3RotaryEmbedding,
apply_rotary_pos_emb,
)
logger = logging.get_logger(__name__)
def create_4d_mask(
seq_len: int,
dtype: torch.dtype,
device: torch.device,
attention_mask: Optional[torch.Tensor] = None,
sliding_window: Optional[int] = None,
is_sliding_window: bool = False,
is_causal: bool = True,
) -> torch.Tensor:
indices = torch.arange(seq_len, device=device)
diff = indices.unsqueeze(1) - indices.unsqueeze(0)
valid_mask = torch.ones((seq_len, seq_len), device=device, dtype=torch.bool)
if is_causal:
valid_mask = valid_mask & (diff >= 0)
if is_sliding_window and sliding_window is not None:
if is_causal:
valid_mask = valid_mask & (diff <= sliding_window)
else:
valid_mask = valid_mask & (torch.abs(diff) <= sliding_window)
valid_mask = valid_mask.unsqueeze(0).unsqueeze(0)
if attention_mask is not None:
padding_mask_4d = attention_mask.view(attention_mask.shape[0], 1, 1, seq_len).to(torch.bool)
valid_mask = valid_mask & padding_mask_4d
min_dtype = torch.finfo(dtype).min
mask_tensor = torch.full(valid_mask.shape, min_dtype, dtype=dtype, device=device)
mask_tensor.masked_fill_(valid_mask, 0.0)
return mask_tensor
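The boolean logic inside `create_4d_mask` (before the float min-value fill) can be sketched without torch. `valid_positions` is a hypothetical helper mirroring the `diff = i - j` bookkeeping above:

```python
def valid_positions(seq_len, sliding_window=None, is_causal=True):
    # Boolean seq_len x seq_len grid: True where query position i may attend to key j.
    grid = []
    for i in range(seq_len):
        row = []
        for j in range(seq_len):
            ok = (j <= i) if is_causal else True
            if sliding_window is not None:
                diff = i - j if is_causal else abs(i - j)
                ok = ok and diff <= sliding_window
            row.append(ok)
        grid.append(row)
    return grid

# Causal mask with window 1 over 4 positions: each token sees itself and one predecessor.
print(valid_positions(4, sliding_window=1))
```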
def pack_sequences(hidden1: torch.Tensor, hidden2: torch.Tensor, mask1: torch.Tensor, mask2: torch.Tensor):
hidden_cat = torch.cat([hidden1, hidden2], dim=1)
mask_cat = torch.cat([mask1, mask2], dim=1)
B, L, D = hidden_cat.shape
sort_idx = mask_cat.argsort(dim=1, descending=True, stable=True)
hidden_left = torch.gather(hidden_cat, 1, sort_idx.unsqueeze(-1).expand(B, L, D))
lengths = mask_cat.sum(dim=1)
new_mask = (torch.arange(L, dtype=torch.long, device=hidden_cat.device).unsqueeze(0) < lengths.unsqueeze(1))
return hidden_left, new_mask
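`pack_sequences` uses a stable argsort on the concatenated mask to shift valid tokens to the front of the padded pair. The same idea for a single example, with strings standing in for hidden-state vectors (`pack_two` is a hypothetical helper):

```python
def pack_two(seq1, mask1, seq2, mask2):
    # Concatenate, then stably move valid (mask=True) items to the front.
    items = list(zip(seq1 + seq2, mask1 + mask2))
    ordered = sorted(items, key=lambda t: not t[1])  # stable sort keeps relative order
    return [v for v, _ in ordered], [m for _, m in ordered]

vals, mask = pack_two(["a", "PAD"], [True, False], ["b", "c"], [True, True])
print(vals, mask)  # → ['a', 'b', 'c', 'PAD'] [True, True, True, False]
```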
class Lambda(nn.Module):
def __init__(self, func):
super().__init__()
self.func = func
def forward(self, x):
return self.func(x)
class AceStepAttention(nn.Module):
def __init__(
self,
hidden_size: int,
num_attention_heads: int,
num_key_value_heads: int,
rms_norm_eps: float,
attention_bias: bool,
attention_dropout: float,
layer_types: list,
head_dim: Optional[int] = None,
sliding_window: Optional[int] = None,
layer_idx: int = 0,
is_cross_attention: bool = False,
is_causal: bool = False,
):
super().__init__()
self.layer_idx = layer_idx
self.head_dim = head_dim or hidden_size // num_attention_heads
self.num_key_value_groups = num_attention_heads // num_key_value_heads
self.scaling = self.head_dim ** -0.5
self.attention_dropout = attention_dropout
if is_cross_attention:
is_causal = False
self.is_causal = is_causal
self.is_cross_attention = is_cross_attention
self.q_proj = nn.Linear(hidden_size, num_attention_heads * self.head_dim, bias=attention_bias)
self.k_proj = nn.Linear(hidden_size, num_key_value_heads * self.head_dim, bias=attention_bias)
self.v_proj = nn.Linear(hidden_size, num_key_value_heads * self.head_dim, bias=attention_bias)
self.o_proj = nn.Linear(num_attention_heads * self.head_dim, hidden_size, bias=attention_bias)
self.q_norm = Qwen3RMSNorm(self.head_dim, eps=rms_norm_eps)
self.k_norm = Qwen3RMSNorm(self.head_dim, eps=rms_norm_eps)
self.attention_type = layer_types[layer_idx]
self.sliding_window = sliding_window if layer_types[layer_idx] == "sliding_attention" else None
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor],
past_key_value: Optional[Cache] = None,
cache_position: Optional[torch.LongTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
position_embeddings: tuple[torch.Tensor, torch.Tensor] = None,
output_attentions: Optional[bool] = False,
**kwargs: Unpack[FlashAttentionKwargs],
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
input_shape = hidden_states.shape[:-1]
hidden_shape = (*input_shape, -1, self.head_dim)
query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
is_cross_attention = self.is_cross_attention and encoder_hidden_states is not None
if is_cross_attention:
encoder_hidden_shape = (*encoder_hidden_states.shape[:-1], -1, self.head_dim)
if past_key_value is not None:
is_updated = past_key_value.is_updated.get(self.layer_idx)
curr_past_key_value = past_key_value.cross_attention_cache
if not is_updated:
key_states = self.k_norm(self.k_proj(encoder_hidden_states).view(encoder_hidden_shape)).transpose(1, 2)
value_states = self.v_proj(encoder_hidden_states).view(encoder_hidden_shape).transpose(1, 2)
key_states, value_states = curr_past_key_value.update(key_states, value_states, self.layer_idx)
past_key_value.is_updated[self.layer_idx] = True
else:
key_states = curr_past_key_value.layers[self.layer_idx].keys
value_states = curr_past_key_value.layers[self.layer_idx].values
else:
key_states = self.k_norm(self.k_proj(encoder_hidden_states).view(encoder_hidden_shape)).transpose(1, 2)
value_states = self.v_proj(encoder_hidden_states).view(encoder_hidden_shape).transpose(1, 2)
else:
key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
if position_embeddings is not None:
cos, sin = position_embeddings
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
if self.num_key_value_groups > 1:
key_states = key_states.unsqueeze(2).expand(-1, -1, self.num_key_value_groups, -1, -1).flatten(1, 2)
value_states = value_states.unsqueeze(2).expand(-1, -1, self.num_key_value_groups, -1, -1).flatten(1, 2)
attn_output = attention_forward(
query_states, key_states, value_states,
q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", out_pattern="b n s d",
attn_mask=attention_mask,
)
attn_weights = None
attn_output = attn_output.transpose(1, 2).flatten(2, 3).contiguous()
attn_output = self.o_proj(attn_output)
return attn_output, attn_weights
class AceStepEncoderLayer(nn.Module):
def __init__(
self,
hidden_size: int,
intermediate_size: int,
num_attention_heads: int,
num_key_value_heads: int,
rms_norm_eps: float,
attention_bias: bool,
attention_dropout: float,
layer_types: list,
head_dim: Optional[int] = None,
sliding_window: Optional[int] = None,
layer_idx: int = 0,
):
super().__init__()
self.hidden_size = hidden_size
self.layer_idx = layer_idx
self.self_attn = AceStepAttention(
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
num_key_value_heads=num_key_value_heads,
rms_norm_eps=rms_norm_eps,
attention_bias=attention_bias,
attention_dropout=attention_dropout,
layer_types=layer_types,
head_dim=head_dim,
sliding_window=sliding_window,
layer_idx=layer_idx,
is_cross_attention=False,
is_causal=False,
)
self.input_layernorm = Qwen3RMSNorm(hidden_size, eps=rms_norm_eps)
self.post_attention_layernorm = Qwen3RMSNorm(hidden_size, eps=rms_norm_eps)
mlp_config = type('Config', (), {
'hidden_size': hidden_size,
'intermediate_size': intermediate_size,
'hidden_act': 'silu',
})()
self.mlp = Qwen3MLP(mlp_config)
self.attention_type = layer_types[layer_idx]
def forward(
self,
hidden_states: torch.Tensor,
position_embeddings: tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = False,
**kwargs,
) -> tuple[torch.FloatTensor, ...]:
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
hidden_states, self_attn_weights = self.self_attn(
hidden_states=hidden_states,
position_embeddings=position_embeddings,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
use_cache=False,
past_key_value=None,
**kwargs,
)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights,)
return outputs
class AceStepLyricEncoder(nn.Module):
def __init__(
self,
hidden_size: int = 2048,
intermediate_size: int = 6144,
num_hidden_layers: int = 24,
num_attention_heads: int = 16,
num_key_value_heads: int = 8,
rms_norm_eps: float = 1e-6,
attention_bias: bool = False,
attention_dropout: float = 0.0,
layer_types: Optional[list] = None,
head_dim: Optional[int] = None,
sliding_window: Optional[int] = 128,
use_sliding_window: bool = True,
use_cache: bool = True,
rope_theta: float = 1000000,
max_position_embeddings: int = 32768,
initializer_range: float = 0.02,
text_hidden_dim: int = 1024,
num_lyric_encoder_hidden_layers: int = 8,
**kwargs,
):
super().__init__()
self.num_lyric_encoder_hidden_layers = num_lyric_encoder_hidden_layers
self.text_hidden_dim = text_hidden_dim
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.rms_norm_eps = rms_norm_eps
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.layer_types = layer_types or (["sliding_attention", "full_attention"] * (num_hidden_layers // 2))
self.head_dim = head_dim or hidden_size // num_attention_heads
self.sliding_window = sliding_window
self.use_sliding_window = use_sliding_window
self.use_cache = use_cache
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self._attn_implementation = kwargs.get("_attn_implementation", "sdpa")
self.embed_tokens = nn.Linear(text_hidden_dim, hidden_size)
self.norm = Qwen3RMSNorm(hidden_size, eps=rms_norm_eps)
rope_config = type('RopeConfig', (), {
'hidden_size': hidden_size,
'num_attention_heads': num_attention_heads,
'num_key_value_heads': num_key_value_heads,
'head_dim': self.head_dim,
'max_position_embeddings': max_position_embeddings,
'rope_theta': rope_theta,
'rope_parameters': {'rope_type': 'default', 'rope_theta': rope_theta},
'rms_norm_eps': rms_norm_eps,
'attention_bias': attention_bias,
'attention_dropout': attention_dropout,
'hidden_act': 'silu',
'intermediate_size': intermediate_size,
'layer_types': self.layer_types,
'sliding_window': sliding_window,
'_attn_implementation': self._attn_implementation,
})()
self.rotary_emb = Qwen3RotaryEmbedding(rope_config)
self.gradient_checkpointing = False
self.layers = nn.ModuleList([
AceStepEncoderLayer(
hidden_size=hidden_size,
intermediate_size=intermediate_size,
num_attention_heads=num_attention_heads,
num_key_value_heads=num_key_value_heads,
rms_norm_eps=rms_norm_eps,
attention_bias=attention_bias,
attention_dropout=attention_dropout,
layer_types=self.layer_types,
head_dim=head_dim,
sliding_window=sliding_window,
layer_idx=layer_idx,
)
for layer_idx in range(num_lyric_encoder_hidden_layers)
])
@can_return_tuple
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
**flash_attn_kwargs: Unpack[FlashAttentionKwargs],
) -> BaseModelOutput:
output_attentions = output_attentions if output_attentions is not None else False
output_hidden_states = output_hidden_states if output_hidden_states is not None else False
assert input_ids is None, "Only `inputs_embeds` is supported for the lyric encoder."
assert attention_mask is not None, "Attention mask must be provided for the lyric encoder."
assert inputs_embeds is not None, "Inputs embeddings must be provided for the lyric encoder."
inputs_embeds = self.embed_tokens(inputs_embeds)
cache_position = torch.arange(0, inputs_embeds.shape[1], device=inputs_embeds.device)
if position_ids is None:
position_ids = cache_position.unsqueeze(0)
seq_len = inputs_embeds.shape[1]
dtype = inputs_embeds.dtype
device = inputs_embeds.device
full_attn_mask = create_4d_mask(
seq_len=seq_len, dtype=dtype, device=device,
attention_mask=attention_mask, sliding_window=None,
is_sliding_window=False, is_causal=False
)
sliding_attn_mask = None
if self.use_sliding_window:
sliding_attn_mask = create_4d_mask(
seq_len=seq_len, dtype=dtype, device=device,
attention_mask=attention_mask, sliding_window=self.sliding_window,
is_sliding_window=True, is_causal=False
)
self_attn_mask_mapping = {
"full_attention": full_attn_mask,
"sliding_attention": sliding_attn_mask,
}
hidden_states = inputs_embeds
position_embeddings = self.rotary_emb(hidden_states, position_ids)
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
for layer_module in self.layers[: self.num_lyric_encoder_hidden_layers]:
if output_hidden_states:
all_hidden_states += (hidden_states,)
layer_outputs = layer_module(
hidden_states, position_embeddings,
self_attn_mask_mapping[layer_module.attention_type],
position_ids, output_attentions,
**flash_attn_kwargs,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attns += (layer_outputs[1],)
hidden_states = self.norm(hidden_states)
if output_hidden_states:
all_hidden_states += (hidden_states,)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attns,
)
class AceStepTimbreEncoder(nn.Module):
def __init__(
self,
hidden_size: int = 2048,
intermediate_size: int = 6144,
num_hidden_layers: int = 24,
num_attention_heads: int = 16,
num_key_value_heads: int = 8,
rms_norm_eps: float = 1e-6,
attention_bias: bool = False,
attention_dropout: float = 0.0,
layer_types: Optional[list] = None,
head_dim: Optional[int] = None,
sliding_window: Optional[int] = 128,
use_sliding_window: bool = True,
use_cache: bool = True,
rope_theta: float = 1000000,
max_position_embeddings: int = 32768,
initializer_range: float = 0.02,
timbre_hidden_dim: int = 64,
num_timbre_encoder_hidden_layers: int = 4,
**kwargs,
):
super().__init__()
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.rms_norm_eps = rms_norm_eps
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.layer_types = layer_types or (["sliding_attention", "full_attention"] * (num_hidden_layers // 2))
self.head_dim = head_dim or hidden_size // num_attention_heads
self.sliding_window = sliding_window
self.use_sliding_window = use_sliding_window
self.use_cache = use_cache
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.timbre_hidden_dim = timbre_hidden_dim
self.num_timbre_encoder_hidden_layers = num_timbre_encoder_hidden_layers
self._attn_implementation = kwargs.get("_attn_implementation", "sdpa")
self.embed_tokens = nn.Linear(timbre_hidden_dim, hidden_size)
self.norm = Qwen3RMSNorm(hidden_size, eps=rms_norm_eps)
rope_config = type('RopeConfig', (), {
'hidden_size': hidden_size,
'num_attention_heads': num_attention_heads,
'num_key_value_heads': num_key_value_heads,
'head_dim': self.head_dim,
'max_position_embeddings': max_position_embeddings,
'rope_theta': rope_theta,
'rope_parameters': {'rope_type': 'default', 'rope_theta': rope_theta},
'rms_norm_eps': rms_norm_eps,
'attention_bias': attention_bias,
'attention_dropout': attention_dropout,
'hidden_act': 'silu',
'intermediate_size': intermediate_size,
'layer_types': self.layer_types,
'sliding_window': sliding_window,
'_attn_implementation': self._attn_implementation,
})()
self.rotary_emb = Qwen3RotaryEmbedding(rope_config)
self.gradient_checkpointing = False
self.special_token = nn.Parameter(torch.randn(1, 1, hidden_size))
self.layers = nn.ModuleList([
AceStepEncoderLayer(
hidden_size=hidden_size,
intermediate_size=intermediate_size,
num_attention_heads=num_attention_heads,
num_key_value_heads=num_key_value_heads,
rms_norm_eps=rms_norm_eps,
attention_bias=attention_bias,
attention_dropout=attention_dropout,
layer_types=self.layer_types,
head_dim=head_dim,
sliding_window=sliding_window,
layer_idx=layer_idx,
)
for layer_idx in range(num_timbre_encoder_hidden_layers)
])
def unpack_timbre_embeddings(self, timbre_embs_packed, refer_audio_order_mask):
N, d = timbre_embs_packed.shape
device = timbre_embs_packed.device
dtype = timbre_embs_packed.dtype
B = int(refer_audio_order_mask.max().item() + 1)
counts = torch.bincount(refer_audio_order_mask, minlength=B)
max_count = counts.max().item()
sorted_indices = torch.argsort(refer_audio_order_mask * N + torch.arange(N, device=device), stable=True)
sorted_batch_ids = refer_audio_order_mask[sorted_indices]
positions = torch.arange(N, device=device)
batch_starts = torch.cat([torch.tensor([0], device=device), torch.cumsum(counts, dim=0)[:-1]])
positions_in_sorted = positions - batch_starts[sorted_batch_ids]
inverse_indices = torch.empty_like(sorted_indices)
inverse_indices[sorted_indices] = torch.arange(N, device=device)
positions_in_batch = positions_in_sorted[inverse_indices]
indices_2d = refer_audio_order_mask * max_count + positions_in_batch
one_hot = F.one_hot(indices_2d, num_classes=B * max_count).to(dtype)
timbre_embs_flat = one_hot.t() @ timbre_embs_packed
timbre_embs_unpack = timbre_embs_flat.reshape(B, max_count, d)
mask_flat = (one_hot.sum(dim=0) > 0).long()
new_mask = mask_flat.reshape(B, max_count)
return timbre_embs_unpack, new_mask
@can_return_tuple
def forward(
self,
refer_audio_acoustic_hidden_states_packed: Optional[torch.FloatTensor] = None,
refer_audio_order_mask: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
**flash_attn_kwargs: Unpack[FlashAttentionKwargs],
) -> tuple[torch.Tensor, torch.Tensor]:
inputs_embeds = refer_audio_acoustic_hidden_states_packed
inputs_embeds = self.embed_tokens(inputs_embeds)
seq_len = inputs_embeds.shape[1]
cache_position = torch.arange(0, seq_len, device=inputs_embeds.device)
position_ids = cache_position.unsqueeze(0)
dtype = inputs_embeds.dtype
device = inputs_embeds.device
full_attn_mask = create_4d_mask(
seq_len=seq_len, dtype=dtype, device=device,
attention_mask=attention_mask, sliding_window=None,
is_sliding_window=False, is_causal=False
)
sliding_attn_mask = None
if self.use_sliding_window:
sliding_attn_mask = create_4d_mask(
seq_len=seq_len, dtype=dtype, device=device,
attention_mask=attention_mask, sliding_window=self.sliding_window,
is_sliding_window=True, is_causal=False
)
self_attn_mask_mapping = {
"full_attention": full_attn_mask,
"sliding_attention": sliding_attn_mask,
}
hidden_states = inputs_embeds
position_embeddings = self.rotary_emb(hidden_states, position_ids)
for layer_module in self.layers[: self.num_timbre_encoder_hidden_layers]:
layer_outputs = layer_module(
hidden_states, position_embeddings,
self_attn_mask_mapping[layer_module.attention_type],
position_ids,
**flash_attn_kwargs,
)
hidden_states = layer_outputs[0]
hidden_states = self.norm(hidden_states)
# Pool each packed reference audio to its first-token embedding: [N, T, D] -> [N, D]
hidden_states = hidden_states[:, 0, :]
timbre_embs_unpack, timbre_embs_mask = self.unpack_timbre_embeddings(hidden_states, refer_audio_order_mask)
return timbre_embs_unpack, timbre_embs_mask
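In plain terms, `unpack_timbre_embeddings` scatters the N packed per-audio embeddings into a `[B, max_count]` grid keyed by `refer_audio_order_mask` (the batch index each reference audio belongs to), preserving arrival order within each batch element and padding short rows. A pure-Python sketch of the same bookkeeping, using a hypothetical `unpack_rows` helper instead of the one-hot matmul:

```python
def unpack_rows(embs, order_mask):
    # embs: list of N vectors; order_mask[i] is the batch index that
    # reference audio i belongs to (mirrors refer_audio_order_mask).
    B = max(order_mask) + 1
    rows = [[] for _ in range(B)]
    for e, b in zip(embs, order_mask):
        rows[b].append(e)  # appending preserves the packed order
    max_count = max(len(r) for r in rows)
    pad = [0.0] * len(embs[0])
    padded = [r + [pad] * (max_count - len(r)) for r in rows]
    mask = [[1] * len(r) + [0] * (max_count - len(r)) for r in rows]
    return padded, mask
```

The tensor version reaches the same result without Python loops: a one-hot scatter matrix times the packed embeddings, which stays differentiable and device-friendly.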
class AceStepConditionEncoder(nn.Module):
def __init__(
self,
hidden_size: int = 2048,
intermediate_size: int = 6144,
num_hidden_layers: int = 24,
num_attention_heads: int = 16,
num_key_value_heads: int = 8,
rms_norm_eps: float = 1e-6,
attention_bias: bool = False,
attention_dropout: float = 0.0,
layer_types: Optional[list] = None,
head_dim: Optional[int] = None,
sliding_window: Optional[int] = 128,
use_sliding_window: bool = True,
use_cache: bool = True,
rope_theta: float = 1000000,
max_position_embeddings: int = 32768,
initializer_range: float = 0.02,
text_hidden_dim: int = 1024,
timbre_hidden_dim: int = 64,
num_lyric_encoder_hidden_layers: int = 8,
num_timbre_encoder_hidden_layers: int = 4,
**kwargs,
):
super().__init__()
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.rms_norm_eps = rms_norm_eps
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.layer_types = layer_types or (["sliding_attention", "full_attention"] * (num_hidden_layers // 2))
self.head_dim = head_dim or hidden_size // num_attention_heads
self.sliding_window = sliding_window
self.use_sliding_window = use_sliding_window
self.use_cache = use_cache
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.text_hidden_dim = text_hidden_dim
self.timbre_hidden_dim = timbre_hidden_dim
self.num_lyric_encoder_hidden_layers = num_lyric_encoder_hidden_layers
self.num_timbre_encoder_hidden_layers = num_timbre_encoder_hidden_layers
self._attn_implementation = kwargs.get("_attn_implementation", "sdpa")
self.text_projector = nn.Linear(text_hidden_dim, hidden_size, bias=False)
self.null_condition_emb = nn.Parameter(torch.randn(1, 1, hidden_size))
self.lyric_encoder = AceStepLyricEncoder(
hidden_size=hidden_size,
intermediate_size=intermediate_size,
num_attention_heads=num_attention_heads,
num_key_value_heads=num_key_value_heads,
rms_norm_eps=rms_norm_eps,
attention_bias=attention_bias,
attention_dropout=attention_dropout,
layer_types=layer_types,
head_dim=head_dim,
sliding_window=sliding_window,
use_sliding_window=use_sliding_window,
rope_theta=rope_theta,
max_position_embeddings=max_position_embeddings,
initializer_range=initializer_range,
text_hidden_dim=text_hidden_dim,
num_lyric_encoder_hidden_layers=num_lyric_encoder_hidden_layers,
)
self.timbre_encoder = AceStepTimbreEncoder(
hidden_size=hidden_size,
intermediate_size=intermediate_size,
num_attention_heads=num_attention_heads,
num_key_value_heads=num_key_value_heads,
rms_norm_eps=rms_norm_eps,
attention_bias=attention_bias,
attention_dropout=attention_dropout,
layer_types=layer_types,
head_dim=head_dim,
sliding_window=sliding_window,
use_sliding_window=use_sliding_window,
rope_theta=rope_theta,
max_position_embeddings=max_position_embeddings,
initializer_range=initializer_range,
timbre_hidden_dim=timbre_hidden_dim,
num_timbre_encoder_hidden_layers=num_timbre_encoder_hidden_layers,
)
def forward(
self,
text_hidden_states: Optional[torch.FloatTensor] = None,
text_attention_mask: Optional[torch.Tensor] = None,
lyric_hidden_states: Optional[torch.LongTensor] = None,
lyric_attention_mask: Optional[torch.Tensor] = None,
reference_latents: Optional[torch.Tensor] = None,
refer_audio_order_mask: Optional[torch.LongTensor] = None,
):
text_hidden_states = self.text_projector(text_hidden_states)
lyric_encoder_outputs = self.lyric_encoder(
inputs_embeds=lyric_hidden_states,
attention_mask=lyric_attention_mask,
)
lyric_hidden_states = lyric_encoder_outputs.last_hidden_state
timbre_embs_unpack, timbre_embs_mask = self.timbre_encoder(reference_latents, refer_audio_order_mask)
encoder_hidden_states, encoder_attention_mask = pack_sequences(
lyric_hidden_states, timbre_embs_unpack, lyric_attention_mask, timbre_embs_mask
)
encoder_hidden_states, encoder_attention_mask = pack_sequences(
encoder_hidden_states, text_hidden_states, encoder_attention_mask, text_attention_mask
)
return encoder_hidden_states, encoder_attention_mask
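Taken together, `AceStepConditionEncoder.forward` produces one conditioning sequence ordered lyric tokens, then timbre embeddings, then projected text tokens, with each step's padding pushed to the tail by `pack_sequences`. A hypothetical pure-Python trace of the valid-token ordering:

```python
def pack_valid(*seqs):
    # Each seq is (tokens, mask); pack_sequences keeps valid tokens
    # first, so chaining it concatenates the valid parts in order.
    out = []
    for tokens, mask in seqs:
        out.extend(t for t, m in zip(tokens, mask) if m)
    return out

cond = pack_valid(
    (["ly0", "ly1", "<pad>"], [1, 1, 0]),  # lyric encoder output
    (["tb0"], [1]),                        # unpacked timbre embeddings
    (["tx0", "tx1"], [1, 1]),              # projected text hidden states
)
```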


@@ -0,0 +1,901 @@
# Copyright 2025 The ACE-Step Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import Optional
import torch
import torch.nn.functional as F
from torch import nn
from ..core.attention.attention import attention_forward
from ..core import gradient_checkpoint_forward
from transformers.cache_utils import Cache, DynamicCache, EncoderDecoderCache
from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
from transformers.modeling_outputs import BaseModelOutput
from transformers.processing_utils import Unpack
from transformers.utils import logging
from transformers.models.qwen3.modeling_qwen3 import (
Qwen3MLP,
Qwen3RMSNorm,
Qwen3RotaryEmbedding,
apply_rotary_pos_emb,
)
logger = logging.get_logger(__name__)
def create_4d_mask(
seq_len: int,
dtype: torch.dtype,
device: torch.device,
attention_mask: Optional[torch.Tensor] = None, # [Batch, Seq_Len]
sliding_window: Optional[int] = None,
is_sliding_window: bool = False,
is_causal: bool = True,
) -> torch.Tensor:
"""
General 4D Attention Mask generator compatible with CPU/Mac/SDPA and Eager mode.
Supports use cases:
1. Causal Full: is_causal=True, is_sliding_window=False (standard GPT)
2. Causal Sliding: is_causal=True, is_sliding_window=True (Mistral/Qwen local window)
3. Bidirectional Full: is_causal=False, is_sliding_window=False (BERT/Encoder)
4. Bidirectional Sliding: is_causal=False, is_sliding_window=True (Longformer local)
Returns:
[Batch, 1, Seq_Len, Seq_Len] additive mask (0.0 for keep, -inf for mask)
"""
# ------------------------------------------------------
# 1. Construct basic geometry mask [Seq_Len, Seq_Len]
# ------------------------------------------------------
# Build index matrices
# i (Query): [0, 1, ..., L-1]
# j (Key): [0, 1, ..., L-1]
indices = torch.arange(seq_len, device=device)
# diff = i - j
diff = indices.unsqueeze(1) - indices.unsqueeze(0)
# Initialize all True (all positions visible)
valid_mask = torch.ones((seq_len, seq_len), device=device, dtype=torch.bool)
# (A) Handle causality (Causal)
if is_causal:
# i >= j => diff >= 0
valid_mask = valid_mask & (diff >= 0)
# (B) Handle sliding window
if is_sliding_window and sliding_window is not None:
if is_causal:
# Causal sliding: only attend to past window steps
# i - j <= window => diff <= window
# (diff >= 0 already handled above)
valid_mask = valid_mask & (diff <= sliding_window)
else:
# Bidirectional sliding: attend past and future window steps
# |i - j| <= window => abs(diff) <= sliding_window
valid_mask = valid_mask & (torch.abs(diff) <= sliding_window)
# Expand dimensions to [1, 1, Seq_Len, Seq_Len] for broadcasting
valid_mask = valid_mask.unsqueeze(0).unsqueeze(0)
# ------------------------------------------------------
# 2. Apply padding mask (Key Masking)
# ------------------------------------------------------
if attention_mask is not None:
# attention_mask shape: [Batch, Seq_Len] (1=valid, 0=padding)
# We want to mask out invalid keys (columns)
# Expand shape: [Batch, 1, 1, Seq_Len]
padding_mask_4d = attention_mask.view(attention_mask.shape[0], 1, 1, seq_len).to(torch.bool)
# Broadcasting: Geometry Mask [1, 1, L, L] & Padding Mask [B, 1, 1, L]
# Result shape: [B, 1, L, L]
valid_mask = valid_mask & padding_mask_4d
# ------------------------------------------------------
# 3. Convert to additive mask
# ------------------------------------------------------
# Get the minimal value for current dtype
min_dtype = torch.finfo(dtype).min
# Create result tensor filled with -inf by default
mask_tensor = torch.full(valid_mask.shape, min_dtype, dtype=dtype, device=device)
# Set valid positions to 0.0
mask_tensor.masked_fill_(valid_mask, 0.0)
return mask_tensor
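The geometry logic above can be sketched position-by-position in plain Python (hypothetical `keep`/`grid` helpers, no torch), showing how the causal and sliding-window conditions combine into a keep/mask grid before the padding mask and the additive conversion are applied:

```python
def keep(i, j, is_causal, sliding_window=None):
    # Mirrors the boolean geometry in create_4d_mask: diff = i - j;
    # causal keeps diff >= 0, a sliding window limits |diff|.
    diff = i - j
    if is_causal and diff < 0:
        return False
    if sliding_window is not None:
        if is_causal:
            return diff <= sliding_window
        return abs(diff) <= sliding_window
    return True

def grid(seq_len, is_causal, sliding_window=None):
    # One row per query position i, one column per key position j.
    return [[keep(i, j, is_causal, sliding_window) for j in range(seq_len)]
            for i in range(seq_len)]
```

`grid(4, True)` reproduces the standard lower-triangular causal mask; the torch version then ANDs this grid with the `[B, 1, 1, L]` padding mask and converts `False` positions to the dtype's minimum value.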
def pack_sequences(hidden1: torch.Tensor, hidden2: torch.Tensor, mask1: torch.Tensor, mask2: torch.Tensor):
"""
Pack two sequences by concatenating and sorting them based on mask values.
Args:
hidden1: First hidden states tensor of shape [B, L1, D]
hidden2: Second hidden states tensor of shape [B, L2, D]
mask1: First mask tensor of shape [B, L1]
mask2: Second mask tensor of shape [B, L2]
Returns:
Tuple of (packed_hidden_states, new_mask) where:
- packed_hidden_states: Packed hidden states with valid tokens (mask=1) first, shape [B, L1+L2, D]
- new_mask: New mask tensor indicating valid positions, shape [B, L1+L2]
"""
# Step 1: Concatenate hidden states and masks along sequence dimension
hidden_cat = torch.cat([hidden1, hidden2], dim=1) # [B, L, D]
mask_cat = torch.cat([mask1, mask2], dim=1) # [B, L]
B, L, D = hidden_cat.shape
# Step 2: Sort indices so that mask values of 1 come before 0
sort_idx = mask_cat.argsort(dim=1, descending=True, stable=True) # [B, L]
# Step 3: Reorder hidden states using sorted indices
hidden_left = torch.gather(hidden_cat, 1, sort_idx.unsqueeze(-1).expand(B, L, D))
# Step 4: Create new mask based on valid sequence lengths
lengths = mask_cat.sum(dim=1) # [B]
new_mask = (torch.arange(L, dtype=torch.long, device=hidden_cat.device).unsqueeze(0) < lengths.unsqueeze(1))
return hidden_left, new_mask
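The packing above can be sketched per batch row in plain Python (a hypothetical `pack_row` helper, not part of the source): a stable sort on the inverted mask moves valid tokens to the front while preserving their relative order, which is exactly what `argsort(descending=True, stable=True)` plus `gather` does batched:

```python
def pack_row(tokens, mask):
    # Stable sort key: 0 for valid (mask=1), 1 for padding, so valid
    # tokens come first and keep their original relative order.
    order = sorted(range(len(tokens)), key=lambda k: 1 - mask[k])
    packed = [tokens[k] for k in order]
    new_mask = [1] * sum(mask) + [0] * (len(mask) - sum(mask))
    return packed, new_mask

# Concatenate two masked sequences, then pack valid tokens to the front,
# mirroring pack_sequences for a single batch element.
tokens = ["a", "<pad>", "b"] + ["x", "<pad>"]
mask = [1, 0, 1] + [1, 0]
packed, new_mask = pack_row(tokens, mask)
```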
class TimestepEmbedding(nn.Module):
"""
Timestep embedding module for diffusion models.
Converts timestep values into high-dimensional embeddings using sinusoidal
positional encoding, followed by MLP layers. Used for conditioning diffusion
models on timestep information.
"""
def __init__(
self,
in_channels: int,
time_embed_dim: int,
scale: float = 1,
):
super().__init__()
self.linear_1 = nn.Linear(in_channels, time_embed_dim, bias=True)
self.act1 = nn.SiLU()
self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim, bias=True)
self.in_channels = in_channels
self.act2 = nn.SiLU()
self.time_proj = nn.Linear(time_embed_dim, time_embed_dim * 6)
self.scale = scale
def timestep_embedding(self, t, dim, max_period=10000):
"""
Create sinusoidal timestep embeddings.
Args:
t: A 1-D tensor of N indices, one per batch element. These may be fractional.
dim: The dimension of the output embeddings.
max_period: Controls the minimum frequency of the embeddings.
Returns:
An (N, D) tensor of positional embeddings.
"""
t = t * self.scale
half = dim // 2
freqs = torch.exp(
-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
).to(device=t.device)
args = t[:, None].float() * freqs[None]
embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
if dim % 2:
embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
return embedding
def forward(self, t):
t_freq = self.timestep_embedding(t, self.in_channels)
temb = self.linear_1(t_freq.to(t.dtype))
temb = self.act1(temb)
temb = self.linear_2(temb)
timestep_proj = self.time_proj(self.act2(temb)).unflatten(1, (6, -1))
return temb, timestep_proj
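The sinusoidal embedding above follows the standard DDPM/Transformer recipe: half the output dimensions carry cosines and half sines, at geometrically spaced frequencies. A minimal dependency-free sketch for a scalar timestep (hypothetical `sin_embed`, omitting the scale factor and odd-dim padding):

```python
import math

def sin_embed(t, dim, max_period=10000):
    # Geometrically spaced frequencies, as in timestep_embedding above:
    # freqs[k] = exp(-log(max_period) * k / half), k = 0..half-1
    half = dim // 2
    freqs = [math.exp(-math.log(max_period) * k / half) for k in range(half)]
    args = [t * f for f in freqs]
    # Cosine half first, then sine half: matches torch.cat([cos, sin], dim=-1)
    return [math.cos(a) for a in args] + [math.sin(a) for a in args]
```

For `t = 0` the cosine half is all ones and the sine half all zeros. In the module, the MLP output `temb` is additionally run through `time_proj` and unflattened into six chunks, which serve as the AdaLN scale/shift/gate vectors in the DiT layers.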
class AceStepAttention(nn.Module):
"""
Multi-headed attention module for AceStep model.
Implements the attention mechanism from 'Attention Is All You Need' paper,
with support for both self-attention and cross-attention modes. Uses RMSNorm
for query and key normalization, and supports sliding window attention for
efficient long-sequence processing.
"""
def __init__(
self,
hidden_size: int,
num_attention_heads: int,
num_key_value_heads: int,
rms_norm_eps: float,
attention_bias: bool,
attention_dropout: float,
layer_types: list,
head_dim: Optional[int] = None,
sliding_window: Optional[int] = None,
layer_idx: int = 0,
is_cross_attention: bool = False,
is_causal: bool = False,
):
super().__init__()
self.layer_idx = layer_idx
self.head_dim = head_dim or hidden_size // num_attention_heads
self.num_key_value_groups = num_attention_heads // num_key_value_heads
self.scaling = self.head_dim ** -0.5
self.attention_dropout = attention_dropout
if is_cross_attention:
is_causal = False
self.is_causal = is_causal
self.is_cross_attention = is_cross_attention
self.q_proj = nn.Linear(hidden_size, num_attention_heads * self.head_dim, bias=attention_bias)
self.k_proj = nn.Linear(hidden_size, num_key_value_heads * self.head_dim, bias=attention_bias)
self.v_proj = nn.Linear(hidden_size, num_key_value_heads * self.head_dim, bias=attention_bias)
self.o_proj = nn.Linear(num_attention_heads * self.head_dim, hidden_size, bias=attention_bias)
self.q_norm = Qwen3RMSNorm(self.head_dim, eps=rms_norm_eps)
self.k_norm = Qwen3RMSNorm(self.head_dim, eps=rms_norm_eps)
self.attention_type = layer_types[layer_idx]
self.sliding_window = sliding_window if layer_types[layer_idx] == "sliding_attention" else None
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor],
past_key_value: Optional[Cache] = None,
cache_position: Optional[torch.LongTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
**kwargs: Unpack[FlashAttentionKwargs],
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
input_shape = hidden_states.shape[:-1]
hidden_shape = (*input_shape, -1, self.head_dim)
# Project and normalize query states
query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
# Determine if this is cross-attention (requires encoder_hidden_states)
is_cross_attention = self.is_cross_attention and encoder_hidden_states is not None
# Cross-attention path: attend to encoder hidden states
if is_cross_attention:
encoder_hidden_shape = (*encoder_hidden_states.shape[:-1], -1, self.head_dim)
if past_key_value is not None:
is_updated = past_key_value.is_updated.get(self.layer_idx)
# After the first generated token, we can reuse all key/value states from cache
curr_past_key_value = past_key_value.cross_attention_cache
# Conditions for calculating key and value states
if not is_updated:
# Compute and cache K/V for the first time
key_states = self.k_norm(self.k_proj(encoder_hidden_states).view(encoder_hidden_shape)).transpose(1, 2)
value_states = self.v_proj(encoder_hidden_states).view(encoder_hidden_shape).transpose(1, 2)
# Update cache: save all key/value states to cache for fast auto-regressive generation
key_states, value_states = curr_past_key_value.update(key_states, value_states, self.layer_idx)
# Set flag that this layer's cross-attention cache is updated
past_key_value.is_updated[self.layer_idx] = True
else:
# Reuse cached key/value states for subsequent tokens
key_states = curr_past_key_value.layers[self.layer_idx].keys
value_states = curr_past_key_value.layers[self.layer_idx].values
else:
# No cache used, compute K/V directly
key_states = self.k_norm(self.k_proj(encoder_hidden_states).view(encoder_hidden_shape)).transpose(1, 2)
value_states = self.v_proj(encoder_hidden_states).view(encoder_hidden_shape).transpose(1, 2)
# Self-attention path: attend to the same sequence
else:
# Project and normalize key/value states for self-attention
key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
# Apply rotary position embeddings (RoPE) if provided
if position_embeddings is not None:
cos, sin = position_embeddings
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
# Update cache for auto-regressive generation
if past_key_value is not None:
# Sin and cos are specific to RoPE models; cache_position needed for the static cache
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
# GQA (grouped-query attention) expansion: if num_key_value_heads < num_attention_heads
if self.num_key_value_groups > 1:
key_states = key_states.unsqueeze(2).expand(-1, -1, self.num_key_value_groups, -1, -1).flatten(1, 2)
value_states = value_states.unsqueeze(2).expand(-1, -1, self.num_key_value_groups, -1, -1).flatten(1, 2)
# Use DiffSynth unified attention
# Tensors are already in (batch, heads, seq, dim) format -> "b n s d"
attn_output = attention_forward(
query_states, key_states, value_states,
q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", out_pattern="b n s d",
attn_mask=attention_mask,
)
attn_weights = None # attention_forward doesn't return weights
# Flatten and project output: (B, n_heads, seq, dim) -> (B, seq, n_heads*dim)
attn_output = attn_output.transpose(1, 2).flatten(2, 3).contiguous()
attn_output = self.o_proj(attn_output)
return attn_output, attn_weights
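The KV-head expansion above is the standard grouped-query attention trick: each of the `num_key_value_heads` K/V heads serves `num_key_value_groups` query heads, so every head is repeated in an adjacent block, which is what unsqueeze, expand, then `flatten(1, 2)` produce on the head dimension. A shape-level sketch with a hypothetical `expand_kv` helper:

```python
def expand_kv(kv_heads, groups):
    # kv_heads: list of per-head tensors; each head is repeated `groups`
    # times, keeping the copies of one head adjacent (as flatten(1, 2) does).
    out = []
    for head in kv_heads:
        out.extend([head] * groups)
    return out
```

With `groups = 1` (full multi-head attention) the expansion is a no-op; `torch.expand` makes the repeated copies views rather than real allocations until a kernel materializes them.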
class AceStepEncoderLayer(nn.Module):
"""
Encoder layer for AceStep model.
Consists of self-attention and MLP (feed-forward) sub-layers with residual connections.
"""
def __init__(
self,
hidden_size: int,
num_attention_heads: int,
num_key_value_heads: int,
intermediate_size: int = 6144,
rms_norm_eps: float = 1e-6,
attention_bias: bool = False,
attention_dropout: float = 0.0,
layer_types: Optional[list] = None,
head_dim: Optional[int] = None,
sliding_window: Optional[int] = None,
layer_idx: int = 0,
):
super().__init__()
self.hidden_size = hidden_size
self.layer_idx = layer_idx
self.self_attn = AceStepAttention(
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
num_key_value_heads=num_key_value_heads,
rms_norm_eps=rms_norm_eps,
attention_bias=attention_bias,
attention_dropout=attention_dropout,
layer_types=layer_types,
head_dim=head_dim,
sliding_window=sliding_window,
layer_idx=layer_idx,
is_cross_attention=False,
is_causal=False,
)
self.input_layernorm = Qwen3RMSNorm(hidden_size, eps=rms_norm_eps)
self.post_attention_layernorm = Qwen3RMSNorm(hidden_size, eps=rms_norm_eps)
# MLP (feed-forward) sub-layer
self.mlp = Qwen3MLP(
config=type('Config', (), {
'hidden_size': hidden_size,
'intermediate_size': intermediate_size,
'hidden_act': 'silu',
})()
)
self.attention_type = layer_types[layer_idx]
def forward(
self,
hidden_states: torch.Tensor,
position_embeddings: tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = False,
**kwargs,
) -> tuple[
torch.FloatTensor,
Optional[tuple[torch.FloatTensor, torch.FloatTensor]],
]:
# Self-attention with residual connection
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
hidden_states, self_attn_weights = self.self_attn(
hidden_states=hidden_states,
position_embeddings=position_embeddings,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
# Encoders don't use cache
use_cache=False,
past_key_value=None,
**kwargs,
)
hidden_states = residual + hidden_states
# MLP with residual connection
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights,)
return outputs
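The encoder layer above applies the standard pre-norm residual pattern twice (once for attention, once for the MLP). A minimal standalone sketch of that pattern; `prenorm_block` is an illustrative helper, not part of the model:

```python
def prenorm_block(x, norm, sublayer):
    # Pre-norm residual as used in AceStepEncoderLayer: the sublayer
    # (attention or MLP) sees normalized input, and its output is added
    # back to the un-normalized residual stream.
    return x + sublayer(norm(x))

# Toy scalar check: identity "norm", doubling sublayer -> 3 + 2*3 = 9.
print(prenorm_block(3.0, lambda v: v, lambda v: 2 * v))  # 9.0
```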
class AceStepDiTLayer(nn.Module):
"""
DiT (Diffusion Transformer) layer for AceStep model.
Implements a transformer layer with three main components:
1. Self-attention with adaptive layer norm (AdaLN)
2. Cross-attention (optional) for conditioning on encoder outputs
3. Feed-forward MLP with adaptive layer norm
Uses scale-shift modulation from timestep embeddings for adaptive normalization.
"""
def __init__(
self,
hidden_size: int,
num_attention_heads: int,
num_key_value_heads: int,
intermediate_size: int,
rms_norm_eps: float,
attention_bias: bool,
attention_dropout: float,
layer_types: list,
head_dim: Optional[int] = None,
sliding_window: Optional[int] = None,
layer_idx: int = 0,
use_cross_attention: bool = True,
):
super().__init__()
self.self_attn_norm = Qwen3RMSNorm(hidden_size, eps=rms_norm_eps)
self.self_attn = AceStepAttention(
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
num_key_value_heads=num_key_value_heads,
rms_norm_eps=rms_norm_eps,
attention_bias=attention_bias,
attention_dropout=attention_dropout,
layer_types=layer_types,
head_dim=head_dim,
sliding_window=sliding_window,
layer_idx=layer_idx,
)
self.use_cross_attention = use_cross_attention
if self.use_cross_attention:
self.cross_attn_norm = Qwen3RMSNorm(hidden_size, eps=rms_norm_eps)
self.cross_attn = AceStepAttention(
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
num_key_value_heads=num_key_value_heads,
rms_norm_eps=rms_norm_eps,
attention_bias=attention_bias,
attention_dropout=attention_dropout,
layer_types=layer_types,
head_dim=head_dim,
sliding_window=sliding_window,
layer_idx=layer_idx,
is_cross_attention=True,
)
self.mlp_norm = Qwen3RMSNorm(hidden_size, eps=rms_norm_eps)
self.mlp = Qwen3MLP(
config=type('Config', (), {
'hidden_size': hidden_size,
'intermediate_size': intermediate_size,
'hidden_act': 'silu',
})()
)
self.scale_shift_table = nn.Parameter(torch.randn(1, 6, hidden_size) / hidden_size**0.5)
self.attention_type = layer_types[layer_idx]
def forward(
self,
hidden_states: torch.Tensor,
position_embeddings: tuple[torch.Tensor, torch.Tensor],
temb: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[EncoderDecoderCache] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
**kwargs,
) -> torch.Tensor:
# Extract scale-shift parameters for adaptive layer norm from timestep embeddings
# 6 values: (shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa)
shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (
self.scale_shift_table.to(temb.device) + temb
).chunk(6, dim=1)
# Step 1: Self-attention with adaptive layer norm (AdaLN)
# Apply adaptive normalization: norm(x) * (1 + scale) + shift
norm_hidden_states = (self.self_attn_norm(hidden_states) * (1 + scale_msa) + shift_msa).type_as(hidden_states)
attn_output, self_attn_weights = self.self_attn(
hidden_states=norm_hidden_states,
position_embeddings=position_embeddings,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
use_cache=False,
past_key_value=None,
**kwargs,
)
# Apply gated residual connection: x = x + attn_output * gate
hidden_states = (hidden_states + attn_output * gate_msa).type_as(hidden_states)
# Step 2: Cross-attention (if enabled) for conditioning on encoder outputs
cross_attn_weights = None  # defined even when cross-attention is skipped below
if self.use_cross_attention:
norm_hidden_states = self.cross_attn_norm(hidden_states).type_as(hidden_states)
attn_output, cross_attn_weights = self.cross_attn(
hidden_states=norm_hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=encoder_attention_mask,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
**kwargs,
)
# Standard residual connection for cross-attention
hidden_states = hidden_states + attn_output
# Step 3: Feed-forward (MLP) with adaptive layer norm
# Apply adaptive normalization for MLP: norm(x) * (1 + scale) + shift
norm_hidden_states = (self.mlp_norm(hidden_states) * (1 + c_scale_msa) + c_shift_msa).type_as(hidden_states)
ff_output = self.mlp(norm_hidden_states)
# Apply gated residual connection: x = x + mlp_output * gate
hidden_states = (hidden_states + ff_output * c_gate_msa).type_as(hidden_states)
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights, cross_attn_weights)
return outputs
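The scale/shift/gate modulation used in steps 1 and 3 above follows the AdaLN pattern: `norm(x) * (1 + scale) + shift` into the sublayer, then a gated residual. A scalar sketch (illustrative only; the RMSNorm is omitted to keep it one-dimensional):

```python
def adaln_step(x, shift, scale, gate, sublayer):
    # norm(x) * (1 + scale) + shift, then gated residual x + out * gate.
    h = x * (1 + scale) + shift
    return x + sublayer(h) * gate

# With gate = 0 the sublayer contributes nothing and the step reduces
# to the identity, regardless of shift and scale.
print(adaln_step(2.0, 1.0, 0.5, 0.0, lambda v: v))  # 2.0
```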
class Lambda(nn.Module):
"""
Wrapper module for arbitrary lambda functions.
Allows using lambda functions in nn.Sequential by wrapping them in a Module.
Useful for simple transformations like transpose operations.
"""
def __init__(self, func):
super().__init__()
self.func = func
def forward(self, x):
return self.func(x)
class AceStepDiTModel(nn.Module):
"""
DiT (Diffusion Transformer) model for AceStep.
Main diffusion model that generates audio latents conditioned on text, lyrics,
and timbre. Uses patch-based processing with transformer layers, timestep
conditioning, and cross-attention to encoder outputs.
"""
def __init__(
self,
hidden_size: int = 2048,
intermediate_size: int = 6144,
num_hidden_layers: int = 24,
num_attention_heads: int = 16,
num_key_value_heads: int = 8,
rms_norm_eps: float = 1e-6,
attention_bias: bool = False,
attention_dropout: float = 0.0,
layer_types: Optional[list] = None,
head_dim: Optional[int] = None,
sliding_window: Optional[int] = 128,
use_sliding_window: bool = True,
use_cache: bool = True,
rope_theta: float = 1000000,
max_position_embeddings: int = 32768,
initializer_range: float = 0.02,
patch_size: int = 2,
in_channels: int = 192,
audio_acoustic_hidden_dim: int = 64,
encoder_hidden_size: Optional[int] = None,
**kwargs,
):
super().__init__()
self.layer_types = layer_types or (["sliding_attention", "full_attention"] * (num_hidden_layers // 2))
self.use_sliding_window = use_sliding_window
self.sliding_window = sliding_window
self.use_cache = use_cache
encoder_hidden_size = encoder_hidden_size or hidden_size
# Rotary position embeddings for transformer layers
rope_config = type('RopeConfig', (), {
'hidden_size': hidden_size,
'num_attention_heads': num_attention_heads,
'num_key_value_heads': num_key_value_heads,
'head_dim': head_dim,
'max_position_embeddings': max_position_embeddings,
'rope_theta': rope_theta,
'rope_parameters': {'rope_type': 'default', 'rope_theta': rope_theta},
'rms_norm_eps': rms_norm_eps,
'attention_bias': attention_bias,
'attention_dropout': attention_dropout,
'hidden_act': 'silu',
'intermediate_size': intermediate_size,
'layer_types': self.layer_types,
'sliding_window': sliding_window,
})()
self.rotary_emb = Qwen3RotaryEmbedding(rope_config)
# Stack of DiT transformer layers
self.layers = nn.ModuleList([
AceStepDiTLayer(
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
num_key_value_heads=num_key_value_heads,
intermediate_size=intermediate_size,
rms_norm_eps=rms_norm_eps,
attention_bias=attention_bias,
attention_dropout=attention_dropout,
layer_types=self.layer_types,
head_dim=head_dim,
sliding_window=sliding_window,
layer_idx=layer_idx,
)
for layer_idx in range(num_hidden_layers)
])
self.patch_size = patch_size
# Input projection: patch embedding using 1D convolution
self.proj_in = nn.Sequential(
Lambda(lambda x: x.transpose(1, 2)),
nn.Conv1d(
in_channels=in_channels,
out_channels=hidden_size,
kernel_size=patch_size,
stride=patch_size,
padding=0,
),
Lambda(lambda x: x.transpose(1, 2)),
)
# Timestep embeddings for diffusion conditioning
self.time_embed = TimestepEmbedding(in_channels=256, time_embed_dim=hidden_size)
self.time_embed_r = TimestepEmbedding(in_channels=256, time_embed_dim=hidden_size)
# Project encoder hidden states to model dimension
self.condition_embedder = nn.Linear(encoder_hidden_size, hidden_size, bias=True)
# Output normalization and projection
self.norm_out = Qwen3RMSNorm(hidden_size, eps=rms_norm_eps)
self.proj_out = nn.Sequential(
Lambda(lambda x: x.transpose(1, 2)),
nn.ConvTranspose1d(
in_channels=hidden_size,
out_channels=audio_acoustic_hidden_dim,
kernel_size=patch_size,
stride=patch_size,
padding=0,
),
Lambda(lambda x: x.transpose(1, 2)),
)
self.scale_shift_table = nn.Parameter(torch.randn(1, 2, hidden_size) / hidden_size**0.5)
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
timestep: torch.Tensor,
timestep_r: torch.Tensor,
attention_mask: torch.Tensor,
encoder_hidden_states: torch.Tensor,
encoder_attention_mask: torch.Tensor,
context_latents: torch.Tensor,
use_cache: Optional[bool] = False,
past_key_values: Optional[EncoderDecoderCache] = None,
cache_position: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = False,
return_hidden_states: bool = False,
custom_layers_config: Optional[dict] = None,
enable_early_exit: bool = False,
use_gradient_checkpointing: bool = False,
use_gradient_checkpointing_offload: bool = False,
**flash_attn_kwargs: Unpack[FlashAttentionKwargs],
):
use_cache = use_cache if use_cache is not None else self.use_cache
# Disable cache during training or when gradient checkpointing is enabled
if self.gradient_checkpointing and self.training and use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
)
use_cache = False
if self.training:
use_cache = False
# Initialize cache if needed (only during inference for auto-regressive generation)
if not self.training and use_cache and past_key_values is None:
past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache())
# Compute timestep embeddings for diffusion conditioning
# Two embeddings: one for timestep t, one for timestep difference (t - r)
temb_t, timestep_proj_t = self.time_embed(timestep)
temb_r, timestep_proj_r = self.time_embed_r(timestep - timestep_r)
# Combine embeddings
temb = temb_t + temb_r
timestep_proj = timestep_proj_t + timestep_proj_r
# Concatenate context latents (source latents + chunk masks) with hidden states
hidden_states = torch.cat([context_latents, hidden_states], dim=-1)
# Record original sequence length for later restoration after padding
original_seq_len = hidden_states.shape[1]
# Apply padding if sequence length is not divisible by patch_size
# This ensures proper patch extraction
pad_length = 0
if hidden_states.shape[1] % self.patch_size != 0:
pad_length = self.patch_size - (hidden_states.shape[1] % self.patch_size)
hidden_states = F.pad(hidden_states, (0, 0, 0, pad_length), mode='constant', value=0)
# Project input to patches and project encoder states
hidden_states = self.proj_in(hidden_states)
encoder_hidden_states = self.condition_embedder(encoder_hidden_states)
# Cache positions
if cache_position is None:
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
cache_position = torch.arange(
past_seen_tokens, past_seen_tokens + hidden_states.shape[1], device=hidden_states.device
)
# Position IDs
if position_ids is None:
position_ids = cache_position.unsqueeze(0)
seq_len = hidden_states.shape[1]
encoder_seq_len = encoder_hidden_states.shape[1]
dtype = hidden_states.dtype
device = hidden_states.device
# Initialize Mask variables
full_attn_mask = None
sliding_attn_mask = None
encoder_attn_mask = None
decoder_attn_mask = None
# Target library discards the passed-in attention_mask for 4D mask
# construction (line 1384: attention_mask = None)
attention_mask = None
# 1. Full Attention (Bidirectional, Global)
full_attn_mask = create_4d_mask(
seq_len=seq_len,
dtype=dtype,
device=device,
attention_mask=attention_mask,
sliding_window=None,
is_sliding_window=False,
is_causal=False
)
max_len = max(seq_len, encoder_seq_len)
encoder_attn_mask = create_4d_mask(
seq_len=max_len,
dtype=dtype,
device=device,
attention_mask=attention_mask,
sliding_window=None,
is_sliding_window=False,
is_causal=False
)
encoder_attn_mask = encoder_attn_mask[:, :, :seq_len, :encoder_seq_len]
# 2. Sliding Attention (Bidirectional, Local)
if self.use_sliding_window:
sliding_attn_mask = create_4d_mask(
seq_len=seq_len,
dtype=dtype,
device=device,
attention_mask=attention_mask,
sliding_window=self.sliding_window,
is_sliding_window=True,
is_causal=False
)
# Build mask mapping
self_attn_mask_mapping = {
"full_attention": full_attn_mask,
"sliding_attention": sliding_attn_mask,
"encoder_attention_mask": encoder_attn_mask,
}
# Create position embeddings to be shared across all decoder layers
position_embeddings = self.rotary_emb(hidden_states, position_ids)
all_cross_attentions = () if output_attentions else None
# Handle early exit for custom layer configurations
max_needed_layer = float('inf')
if custom_layers_config is not None and enable_early_exit:
max_needed_layer = max(custom_layers_config.keys())
output_attentions = True
if all_cross_attentions is None:
all_cross_attentions = ()
# Process through transformer layers
for index_block, layer_module in enumerate(self.layers):
# Early exit optimization
if index_block > max_needed_layer:
break
# Prepare layer arguments
layer_args = (
hidden_states,
position_embeddings,
timestep_proj,
self_attn_mask_mapping[layer_module.attention_type],
position_ids,
past_key_values,
output_attentions,
use_cache,
cache_position,
encoder_hidden_states,
self_attn_mask_mapping["encoder_attention_mask"],
)
layer_kwargs = flash_attn_kwargs
# Use gradient checkpointing if enabled
layer_outputs = gradient_checkpoint_forward(
layer_module,
use_gradient_checkpointing,
use_gradient_checkpointing_offload,
*layer_args,
**layer_kwargs,
)
hidden_states = layer_outputs[0]
if output_attentions and self.layers[index_block].use_cross_attention:
# layer_outputs structure: (hidden_states, self_attn_weights, cross_attn_weights)
if len(layer_outputs) >= 3:
all_cross_attentions += (layer_outputs[2],)
if return_hidden_states:
return hidden_states
# Extract scale-shift parameters for adaptive output normalization
shift, scale = (self.scale_shift_table.to(temb.device) + temb.unsqueeze(1)).chunk(2, dim=1)
shift = shift.to(hidden_states.device)
scale = scale.to(hidden_states.device)
# Apply adaptive layer norm: norm(x) * (1 + scale) + shift
hidden_states = (self.norm_out(hidden_states) * (1 + scale) + shift).type_as(hidden_states)
# Project output: de-patchify back to original sequence format
hidden_states = self.proj_out(hidden_states)
# Crop back to original sequence length to ensure exact length match (remove padding)
hidden_states = hidden_states[:, :original_seq_len, :]
outputs = (hidden_states, past_key_values)
if output_attentions:
outputs += (all_cross_attentions,)
return outputs
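The pad-then-crop round trip in `forward` (pad up to a multiple of `patch_size` before `proj_in`, crop back to `original_seq_len` after `proj_out`) reduces to simple length arithmetic. A sketch of the padding rule, mirroring the computation above:

```python
def patch_pad_length(seq_len, patch_size):
    # Pad the sequence up to the next multiple of patch_size;
    # already-aligned lengths get no padding.
    if seq_len % patch_size != 0:
        return patch_size - (seq_len % patch_size)
    return 0

print(patch_pad_length(7, 2))  # 1 -> padded length 8, i.e. 4 patches
print(patch_pad_length(8, 2))  # 0 -> no padding needed
```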


@@ -0,0 +1,53 @@
import torch
class AceStepTextEncoder(torch.nn.Module):
def __init__(
self,
):
super().__init__()
from transformers import Qwen3Config, Qwen3Model
config = Qwen3Config(
attention_bias=False,
attention_dropout=0.0,
bos_token_id=151643,
dtype="bfloat16",
eos_token_id=151643,
head_dim=128,
hidden_act="silu",
hidden_size=1024,
initializer_range=0.02,
intermediate_size=3072,
layer_types=["full_attention"] * 28,
max_position_embeddings=32768,
max_window_layers=28,
model_type="qwen3",
num_attention_heads=16,
num_hidden_layers=28,
num_key_value_heads=8,
pad_token_id=151643,
rms_norm_eps=1e-06,
rope_scaling=None,
rope_theta=1000000,
sliding_window=None,
tie_word_embeddings=True,
use_cache=True,
use_sliding_window=False,
vocab_size=151669,
)
self.model = Qwen3Model(config)
@torch.no_grad()
def forward(
self,
input_ids: torch.LongTensor,
attention_mask: torch.Tensor,
):
outputs = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
return_dict=True,
)
return outputs.last_hidden_state


@@ -0,0 +1,732 @@
# Copyright 2025 The ACE-Step Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ACE-Step Audio Tokenizer — VAE latent discretization pathway.
Contains:
- AceStepAudioTokenizer: continuous VAE latent → discrete FSQ tokens
- AudioTokenDetokenizer: discrete tokens → continuous VAE-latent-shaped features
Only used in cover song mode (is_covers=True). Bypassed in text-to-music.
"""
from typing import Optional
import torch
import torch.nn as nn
from einops import rearrange
from ..core.attention import attention_forward
from ..core.gradient import gradient_checkpoint_forward
from transformers.cache_utils import Cache
from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
from transformers.modeling_outputs import BaseModelOutput
from transformers.processing_utils import Unpack
from transformers.utils import can_return_tuple, logging
from transformers.models.qwen3.modeling_qwen3 import (
Qwen3MLP,
Qwen3RMSNorm,
Qwen3RotaryEmbedding,
apply_rotary_pos_emb,
)
from vector_quantize_pytorch import ResidualFSQ
logger = logging.get_logger(__name__)
def create_4d_mask(
seq_len: int,
dtype: torch.dtype,
device: torch.device,
attention_mask: Optional[torch.Tensor] = None,
sliding_window: Optional[int] = None,
is_sliding_window: bool = False,
is_causal: bool = True,
) -> torch.Tensor:
indices = torch.arange(seq_len, device=device)
diff = indices.unsqueeze(1) - indices.unsqueeze(0)
valid_mask = torch.ones((seq_len, seq_len), device=device, dtype=torch.bool)
if is_causal:
valid_mask = valid_mask & (diff >= 0)
if is_sliding_window and sliding_window is not None:
if is_causal:
valid_mask = valid_mask & (diff <= sliding_window)
else:
valid_mask = valid_mask & (torch.abs(diff) <= sliding_window)
valid_mask = valid_mask.unsqueeze(0).unsqueeze(0)
if attention_mask is not None:
padding_mask_4d = attention_mask.view(attention_mask.shape[0], 1, 1, seq_len).to(torch.bool)
valid_mask = valid_mask & padding_mask_4d
min_dtype = torch.finfo(dtype).min
mask_tensor = torch.full(valid_mask.shape, min_dtype, dtype=dtype, device=device)
mask_tensor.masked_fill_(valid_mask, 0.0)
return mask_tensor
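The boolean band logic inside `create_4d_mask` (before the float conversion) can be re-stated in pure Python for sanity-checking window behaviour; `band_allowed` below is an illustrative helper, not used by the model:

```python
def band_allowed(seq_len, sliding_window=None, is_causal=True):
    # True where attention is permitted, matching the diff-based tests above.
    rows = []
    for i in range(seq_len):
        row = []
        for j in range(seq_len):
            diff = i - j
            ok = diff >= 0 if is_causal else True
            if sliding_window is not None:
                if is_causal:
                    ok = ok and diff <= sliding_window
                else:
                    ok = ok and abs(diff) <= sliding_window
            row.append(ok)
        rows.append(row)
    return rows

# Bidirectional window of 1: position 2 attends to positions 1, 2, 3 only.
m = band_allowed(5, sliding_window=1, is_causal=False)
print([j for j in range(5) if m[2][j]])  # [1, 2, 3]
```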
class Lambda(nn.Module):
def __init__(self, func):
super().__init__()
self.func = func
def forward(self, x):
return self.func(x)
class AceStepAttention(nn.Module):
def __init__(
self,
hidden_size: int,
num_attention_heads: int,
num_key_value_heads: int,
rms_norm_eps: float,
attention_bias: bool,
attention_dropout: float,
layer_types: list,
head_dim: Optional[int] = None,
sliding_window: Optional[int] = None,
layer_idx: int = 0,
is_cross_attention: bool = False,
is_causal: bool = False,
):
super().__init__()
self.layer_idx = layer_idx
self.head_dim = head_dim or hidden_size // num_attention_heads
self.num_key_value_groups = num_attention_heads // num_key_value_heads
self.scaling = self.head_dim ** -0.5
self.attention_dropout = attention_dropout
if is_cross_attention:
is_causal = False
self.is_causal = is_causal
self.is_cross_attention = is_cross_attention
self.q_proj = nn.Linear(hidden_size, num_attention_heads * self.head_dim, bias=attention_bias)
self.k_proj = nn.Linear(hidden_size, num_key_value_heads * self.head_dim, bias=attention_bias)
self.v_proj = nn.Linear(hidden_size, num_key_value_heads * self.head_dim, bias=attention_bias)
self.o_proj = nn.Linear(num_attention_heads * self.head_dim, hidden_size, bias=attention_bias)
self.q_norm = Qwen3RMSNorm(self.head_dim, eps=rms_norm_eps)
self.k_norm = Qwen3RMSNorm(self.head_dim, eps=rms_norm_eps)
self.attention_type = layer_types[layer_idx]
self.sliding_window = sliding_window if layer_types[layer_idx] == "sliding_attention" else None
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor],
past_key_value: Optional[Cache] = None,
cache_position: Optional[torch.LongTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
**kwargs: Unpack[FlashAttentionKwargs],
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
input_shape = hidden_states.shape[:-1]
hidden_shape = (*input_shape, -1, self.head_dim)
query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
is_cross_attention = self.is_cross_attention and encoder_hidden_states is not None
if is_cross_attention:
encoder_hidden_shape = (*encoder_hidden_states.shape[:-1], -1, self.head_dim)
if past_key_value is not None:
is_updated = past_key_value.is_updated.get(self.layer_idx)
curr_past_key_value = past_key_value.cross_attention_cache
if not is_updated:
key_states = self.k_norm(self.k_proj(encoder_hidden_states).view(encoder_hidden_shape)).transpose(1, 2)
value_states = self.v_proj(encoder_hidden_states).view(encoder_hidden_shape).transpose(1, 2)
key_states, value_states = curr_past_key_value.update(key_states, value_states, self.layer_idx)
past_key_value.is_updated[self.layer_idx] = True
else:
key_states = curr_past_key_value.layers[self.layer_idx].keys
value_states = curr_past_key_value.layers[self.layer_idx].values
else:
key_states = self.k_norm(self.k_proj(encoder_hidden_states).view(encoder_hidden_shape)).transpose(1, 2)
value_states = self.v_proj(encoder_hidden_states).view(encoder_hidden_shape).transpose(1, 2)
else:
key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
if position_embeddings is not None:
cos, sin = position_embeddings
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
if self.num_key_value_groups > 1:
key_states = key_states.unsqueeze(2).expand(-1, -1, self.num_key_value_groups, -1, -1).flatten(1, 2)
value_states = value_states.unsqueeze(2).expand(-1, -1, self.num_key_value_groups, -1, -1).flatten(1, 2)
attn_output = attention_forward(
query_states, key_states, value_states,
q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", out_pattern="b n s d",
attn_mask=attention_mask,
)
attn_weights = None
attn_output = attn_output.transpose(1, 2).flatten(2, 3).contiguous()
attn_output = self.o_proj(attn_output)
return attn_output, attn_weights
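The `unsqueeze(2).expand(...).flatten(1, 2)` trick above implements grouped-query attention: each KV head is repeated for its group of query heads, in kv-major order. A list-based sketch of the resulting head ordering (illustrative only):

```python
def repeat_kv(kv_heads, num_key_value_groups):
    # Head ordering after unsqueeze(2).expand(...).flatten(1, 2):
    # each KV head appears num_key_value_groups times in a row.
    out = []
    for head in kv_heads:
        out.extend([head] * num_key_value_groups)
    return out

# e.g. 16 query heads over 8 KV heads -> groups of 2 per KV head.
print(repeat_kv(["kv0", "kv1"], 2))  # ['kv0', 'kv0', 'kv1', 'kv1']
```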
class AceStepEncoderLayer(nn.Module):
def __init__(
self,
hidden_size: int,
intermediate_size: int,
num_attention_heads: int,
num_key_value_heads: int,
rms_norm_eps: float,
attention_bias: bool,
attention_dropout: float,
layer_types: list,
head_dim: Optional[int] = None,
sliding_window: Optional[int] = None,
layer_idx: int = 0,
):
super().__init__()
self.hidden_size = hidden_size
self.layer_idx = layer_idx
self.self_attn = AceStepAttention(
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
num_key_value_heads=num_key_value_heads,
rms_norm_eps=rms_norm_eps,
attention_bias=attention_bias,
attention_dropout=attention_dropout,
layer_types=layer_types,
head_dim=head_dim,
sliding_window=sliding_window,
layer_idx=layer_idx,
is_cross_attention=False,
is_causal=False,
)
self.input_layernorm = Qwen3RMSNorm(hidden_size, eps=rms_norm_eps)
self.post_attention_layernorm = Qwen3RMSNorm(hidden_size, eps=rms_norm_eps)
mlp_config = type('Config', (), {
'hidden_size': hidden_size,
'intermediate_size': intermediate_size,
'hidden_act': 'silu',
})()
self.mlp = Qwen3MLP(mlp_config)
self.attention_type = layer_types[layer_idx]
def forward(
self,
hidden_states: torch.Tensor,
position_embeddings: tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = False,
**kwargs,
) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
hidden_states, self_attn_weights = self.self_attn(
hidden_states=hidden_states,
position_embeddings=position_embeddings,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
use_cache=False,
past_key_value=None,
**kwargs,
)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights,)
return outputs
class AttentionPooler(nn.Module):
"""Pools every pool_window_size frames into 1 representation via transformer + CLS token."""
def __init__(
self,
hidden_size: int = 2048,
intermediate_size: int = 6144,
num_attention_heads: int = 16,
num_key_value_heads: int = 8,
rms_norm_eps: float = 1e-6,
attention_bias: bool = False,
attention_dropout: float = 0.0,
layer_types: Optional[list] = None,
head_dim: Optional[int] = None,
sliding_window: Optional[int] = 128,
use_sliding_window: bool = True,
rope_theta: float = 1000000,
max_position_embeddings: int = 32768,
initializer_range: float = 0.02,
num_attention_pooler_hidden_layers: int = 2,
**kwargs,
):
super().__init__()
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_attention_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.rms_norm_eps = rms_norm_eps
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
# Default matches target library config (24 alternating entries).
self.layer_types = layer_types or (["sliding_attention", "full_attention"] * 12)
self.head_dim = head_dim or hidden_size // num_attention_heads
self.sliding_window = sliding_window
self.use_sliding_window = use_sliding_window
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.num_attention_pooler_hidden_layers = num_attention_pooler_hidden_layers
self._attn_implementation = kwargs.get("_attn_implementation", "sdpa")
self.embed_tokens = nn.Linear(hidden_size, hidden_size)
self.norm = Qwen3RMSNorm(hidden_size, eps=rms_norm_eps)
# Slice layer_types to our own layer count
pooler_layer_types = self.layer_types[:num_attention_pooler_hidden_layers]
rope_config = type('RopeConfig', (), {
'hidden_size': hidden_size,
'num_attention_heads': num_attention_heads,
'num_key_value_heads': num_key_value_heads,
'head_dim': head_dim,
'max_position_embeddings': max_position_embeddings,
'rope_theta': rope_theta,
'rope_parameters': {'rope_type': 'default', 'rope_theta': rope_theta},
'rms_norm_eps': rms_norm_eps,
'attention_bias': attention_bias,
'attention_dropout': attention_dropout,
'hidden_act': 'silu',
'intermediate_size': intermediate_size,
'layer_types': pooler_layer_types,
'sliding_window': sliding_window,
'_attn_implementation': self._attn_implementation,
})()
self.rotary_emb = Qwen3RotaryEmbedding(rope_config)
self.gradient_checkpointing = False
self.special_token = nn.Parameter(torch.randn(1, 1, hidden_size) * 0.02)
self.layers = nn.ModuleList([
AceStepEncoderLayer(
hidden_size=hidden_size,
intermediate_size=intermediate_size,
num_attention_heads=num_attention_heads,
num_key_value_heads=num_key_value_heads,
rms_norm_eps=rms_norm_eps,
attention_bias=attention_bias,
attention_dropout=attention_dropout,
layer_types=pooler_layer_types,
head_dim=head_dim,
sliding_window=sliding_window,
layer_idx=layer_idx,
)
for layer_idx in range(num_attention_pooler_hidden_layers)
])
@can_return_tuple
def forward(
self,
x,
attention_mask: Optional[torch.Tensor] = None,
**flash_attn_kwargs: Unpack[FlashAttentionKwargs],
) -> torch.Tensor:
B, T, P, D = x.shape
x = self.embed_tokens(x)
special_tokens = self.special_token.expand(B, T, 1, -1).to(x.device)
x = torch.cat([special_tokens, x], dim=2)
x = rearrange(x, "b t p c -> (b t) p c")
cache_position = torch.arange(0, x.shape[1], device=x.device)
position_ids = cache_position.unsqueeze(0)
hidden_states = x
position_embeddings = self.rotary_emb(hidden_states, position_ids)
seq_len = x.shape[1]
dtype = x.dtype
device = x.device
full_attn_mask = create_4d_mask(
seq_len=seq_len, dtype=dtype, device=device,
attention_mask=attention_mask, sliding_window=None,
is_sliding_window=False, is_causal=False
)
sliding_attn_mask = None
if self.use_sliding_window:
sliding_attn_mask = create_4d_mask(
seq_len=seq_len, dtype=dtype, device=device,
attention_mask=attention_mask, sliding_window=self.sliding_window,
is_sliding_window=True, is_causal=False
)
self_attn_mask_mapping = {
"full_attention": full_attn_mask,
"sliding_attention": sliding_attn_mask,
}
for layer_module in self.layers:
layer_outputs = layer_module(
hidden_states, position_embeddings,
attention_mask=self_attn_mask_mapping[layer_module.attention_type],
**flash_attn_kwargs,
)
hidden_states = layer_outputs[0]
hidden_states = self.norm(hidden_states)
cls_output = hidden_states[:, 0, :]
return rearrange(cls_output, "(b t) c -> b t c", b=B)
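The pooler's shape bookkeeping in `forward` (prepend one CLS slot per window, fold windows into the batch with the `rearrange`, run the layers, then keep only the CLS output) reduces to simple arithmetic. A sketch of the shape flow with assumed example sizes:

```python
def pooler_shapes(B, T, P, D):
    # [B, T, P, D] -> cat CLS -> [B, T, P+1, D]
    # -> rearrange "b t p c -> (b t) p c" -> [(B*T), P+1, D]
    # -> take hidden_states[:, 0, :] and unfold back -> [B, T, D]
    merged = (B * T, P + 1, D)
    pooled = (B, T, D)
    return merged, pooled

print(pooler_shapes(2, 100, 5, 2048))  # ((200, 6, 2048), (2, 100, 2048))
```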
class AceStepAudioTokenizer(nn.Module):
"""Converts continuous acoustic features (VAE latents) into discrete quantized tokens.
Input: [B, T, 64] (VAE latent dim)
Output: quantized [B, T/5, 2048], indices [B, T/5, 1]
"""
def __init__(
self,
hidden_size: int = 2048,
intermediate_size: int = 6144,
num_attention_heads: int = 16,
num_key_value_heads: int = 8,
rms_norm_eps: float = 1e-6,
attention_bias: bool = False,
attention_dropout: float = 0.0,
layer_types: Optional[list] = None,
head_dim: Optional[int] = None,
sliding_window: Optional[int] = 128,
use_sliding_window: bool = True,
rope_theta: float = 1000000,
max_position_embeddings: int = 32768,
initializer_range: float = 0.02,
audio_acoustic_hidden_dim: int = 64,
pool_window_size: int = 5,
fsq_dim: int = 2048,
fsq_input_levels: list = None,
fsq_input_num_quantizers: int = 1,
num_attention_pooler_hidden_layers: int = 2,
**kwargs,
):
super().__init__()
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_attention_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.rms_norm_eps = rms_norm_eps
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
# Default matches target library config (24 alternating entries).
self.layer_types = layer_types or (["sliding_attention", "full_attention"] * 12)
self.head_dim = head_dim or hidden_size // num_attention_heads
self.sliding_window = sliding_window
self.use_sliding_window = use_sliding_window
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.audio_acoustic_hidden_dim = audio_acoustic_hidden_dim
self.pool_window_size = pool_window_size
self.fsq_dim = fsq_dim
self.fsq_input_levels = fsq_input_levels or [8, 8, 8, 5, 5, 5]
self.fsq_input_num_quantizers = fsq_input_num_quantizers
self.num_attention_pooler_hidden_layers = num_attention_pooler_hidden_layers
self._attn_implementation = kwargs.get("_attn_implementation", "sdpa")
self.audio_acoustic_proj = nn.Linear(audio_acoustic_hidden_dim, hidden_size)
# Slice layer_types for the attention pooler
pooler_layer_types = self.layer_types[:num_attention_pooler_hidden_layers]
self.attention_pooler = AttentionPooler(
hidden_size=hidden_size,
intermediate_size=intermediate_size,
num_attention_heads=num_attention_heads,
num_key_value_heads=num_key_value_heads,
rms_norm_eps=rms_norm_eps,
attention_bias=attention_bias,
attention_dropout=attention_dropout,
layer_types=pooler_layer_types,
head_dim=head_dim,
sliding_window=sliding_window,
use_sliding_window=use_sliding_window,
rope_theta=rope_theta,
max_position_embeddings=max_position_embeddings,
initializer_range=initializer_range,
num_attention_pooler_hidden_layers=num_attention_pooler_hidden_layers,
)
self.quantizer = ResidualFSQ(
dim=self.fsq_dim,
levels=self.fsq_input_levels,
num_quantizers=self.fsq_input_num_quantizers,
force_quantization_f32=False, # avoid autocast bug in vector_quantize_pytorch
)
@can_return_tuple
def forward(
self,
hidden_states: Optional[torch.FloatTensor] = None,
**flash_attn_kwargs: Unpack[FlashAttentionKwargs],
) -> tuple[torch.Tensor, torch.Tensor]:
hidden_states = self.audio_acoustic_proj(hidden_states)
hidden_states = self.attention_pooler(hidden_states)
quantized, indices = self.quantizer(hidden_states)
return quantized, indices
def tokenize(self, x):
"""Convenience: takes [B, T, 64], rearranges to patches, runs forward."""
x = rearrange(x, 'n (t_patch p) d -> n t_patch p d', p=self.pool_window_size)
return self.forward(x)
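A note on quantizer capacity: with the default `fsq_input_levels`, each pooled frame is quantized to one code drawn from the Cartesian product of the per-dimension levels. A quick dependency-free sanity check using the defaults above:

```python
import math

# Default FSQ levels from AceStepAudioTokenizer.__init__.
fsq_input_levels = [8, 8, 8, 5, 5, 5]

# Effective codebook size is the product of the per-dimension levels.
codebook_size = math.prod(fsq_input_levels)
print(codebook_size)  # 64000 distinct codes per quantizer
```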
class AudioTokenDetokenizer(nn.Module):
"""Converts quantized audio tokens back to continuous acoustic representations.
Input: [B, T/5, hidden_size] (quantized vectors)
Output: [B, T, 64] (VAE-latent-shaped continuous features)
"""
def __init__(
self,
hidden_size: int = 2048,
intermediate_size: int = 6144,
num_attention_heads: int = 16,
num_key_value_heads: int = 8,
rms_norm_eps: float = 1e-6,
attention_bias: bool = False,
attention_dropout: float = 0.0,
layer_types: Optional[list] = None,
head_dim: Optional[int] = None,
sliding_window: Optional[int] = 128,
use_sliding_window: bool = True,
rope_theta: float = 1000000,
max_position_embeddings: int = 32768,
initializer_range: float = 0.02,
pool_window_size: int = 5,
audio_acoustic_hidden_dim: int = 64,
num_attention_pooler_hidden_layers: int = 2,
**kwargs,
):
super().__init__()
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_attention_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.rms_norm_eps = rms_norm_eps
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
# Default matches target library config (24 alternating entries).
self.layer_types = layer_types or (["sliding_attention", "full_attention"] * 12)
self.head_dim = head_dim or hidden_size // num_attention_heads
self.sliding_window = sliding_window
self.use_sliding_window = use_sliding_window
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.pool_window_size = pool_window_size
self.audio_acoustic_hidden_dim = audio_acoustic_hidden_dim
self.num_attention_pooler_hidden_layers = num_attention_pooler_hidden_layers
self._attn_implementation = kwargs.get("_attn_implementation", "sdpa")
self.embed_tokens = nn.Linear(hidden_size, hidden_size)
self.norm = Qwen3RMSNorm(hidden_size, eps=rms_norm_eps)
# Slice layer_types to this module's own layer count (num_attention_pooler_hidden_layers entries)
detok_layer_types = self.layer_types[:num_attention_pooler_hidden_layers]
rope_config = type('RopeConfig', (), {
'hidden_size': hidden_size,
'num_attention_heads': num_attention_heads,
'num_key_value_heads': num_key_value_heads,
'head_dim': head_dim,
'max_position_embeddings': max_position_embeddings,
'rope_theta': rope_theta,
'rope_parameters': {'rope_type': 'default', 'rope_theta': rope_theta},
'rms_norm_eps': rms_norm_eps,
'attention_bias': attention_bias,
'attention_dropout': attention_dropout,
'hidden_act': 'silu',
'intermediate_size': intermediate_size,
'layer_types': detok_layer_types,
'sliding_window': sliding_window,
'_attn_implementation': self._attn_implementation,
})()
self.rotary_emb = Qwen3RotaryEmbedding(rope_config)
self.gradient_checkpointing = False
self.special_tokens = nn.Parameter(torch.randn(1, pool_window_size, hidden_size) * 0.02)
self.layers = nn.ModuleList([
AceStepEncoderLayer(
hidden_size=hidden_size,
intermediate_size=intermediate_size,
num_attention_heads=num_attention_heads,
num_key_value_heads=num_key_value_heads,
rms_norm_eps=rms_norm_eps,
attention_bias=attention_bias,
attention_dropout=attention_dropout,
layer_types=detok_layer_types,
head_dim=head_dim,
sliding_window=sliding_window,
layer_idx=layer_idx,
)
for layer_idx in range(num_attention_pooler_hidden_layers)
])
self.proj_out = nn.Linear(hidden_size, audio_acoustic_hidden_dim)
@can_return_tuple
def forward(
self,
x,
attention_mask: Optional[torch.Tensor] = None,
**flash_attn_kwargs: Unpack[FlashAttentionKwargs],
) -> torch.Tensor:
B, T, D = x.shape
x = self.embed_tokens(x)
x = x.unsqueeze(2).repeat(1, 1, self.pool_window_size, 1)
special_tokens = self.special_tokens.expand(B, T, -1, -1)
x = x + special_tokens.to(x.device)
x = rearrange(x, "b t p c -> (b t) p c")
cache_position = torch.arange(0, x.shape[1], device=x.device)
position_ids = cache_position.unsqueeze(0)
hidden_states = x
position_embeddings = self.rotary_emb(hidden_states, position_ids)
seq_len = x.shape[1]
dtype = x.dtype
device = x.device
full_attn_mask = create_4d_mask(
seq_len=seq_len, dtype=dtype, device=device,
attention_mask=attention_mask, sliding_window=None,
is_sliding_window=False, is_causal=False
)
sliding_attn_mask = None
if self.use_sliding_window:
sliding_attn_mask = create_4d_mask(
seq_len=seq_len, dtype=dtype, device=device,
attention_mask=attention_mask, sliding_window=self.sliding_window,
is_sliding_window=True, is_causal=False
)
self_attn_mask_mapping = {
"full_attention": full_attn_mask,
"sliding_attention": sliding_attn_mask,
}
for layer_module in self.layers:
layer_outputs = layer_module(
hidden_states, position_embeddings,
attention_mask=self_attn_mask_mapping[layer_module.attention_type],
**flash_attn_kwargs,
)
hidden_states = layer_outputs[0]
hidden_states = self.norm(hidden_states)
hidden_states = self.proj_out(hidden_states)
return rearrange(hidden_states, "(b t) p c -> b (t p) c", b=B, p=self.pool_window_size)
class AceStepTokenizer(nn.Module):
"""Container for AceStepAudioTokenizer + AudioTokenDetokenizer.
Provides encode/decode convenience methods for VAE latent discretization.
Used in cover song mode to convert source audio latents to discrete tokens
and back to continuous conditioning hints.
"""
def __init__(
self,
hidden_size: int = 2048,
intermediate_size: int = 6144,
num_attention_heads: int = 16,
num_key_value_heads: int = 8,
rms_norm_eps: float = 1e-6,
attention_bias: bool = False,
attention_dropout: float = 0.0,
layer_types: Optional[list] = None,
head_dim: Optional[int] = None,
sliding_window: Optional[int] = 128,
use_sliding_window: bool = True,
rope_theta: float = 1000000,
max_position_embeddings: int = 32768,
initializer_range: float = 0.02,
audio_acoustic_hidden_dim: int = 64,
pool_window_size: int = 5,
fsq_dim: int = 2048,
fsq_input_levels: list = None,
fsq_input_num_quantizers: int = 1,
num_attention_pooler_hidden_layers: int = 2,
num_audio_decoder_hidden_layers: int = 24,
**kwargs,
):
super().__init__()
# Default layer_types matches target library config (24 alternating entries).
# Sub-modules (pooler/detokenizer) slice first N entries for their own layer count.
if layer_types is None:
layer_types = ["sliding_attention", "full_attention"] * 12
self.tokenizer = AceStepAudioTokenizer(
hidden_size=hidden_size,
intermediate_size=intermediate_size,
num_attention_heads=num_attention_heads,
num_key_value_heads=num_key_value_heads,
rms_norm_eps=rms_norm_eps,
attention_bias=attention_bias,
attention_dropout=attention_dropout,
layer_types=layer_types,
head_dim=head_dim,
sliding_window=sliding_window,
use_sliding_window=use_sliding_window,
rope_theta=rope_theta,
max_position_embeddings=max_position_embeddings,
initializer_range=initializer_range,
audio_acoustic_hidden_dim=audio_acoustic_hidden_dim,
pool_window_size=pool_window_size,
fsq_dim=fsq_dim,
fsq_input_levels=fsq_input_levels,
fsq_input_num_quantizers=fsq_input_num_quantizers,
num_attention_pooler_hidden_layers=num_attention_pooler_hidden_layers,
**kwargs,
)
self.detokenizer = AudioTokenDetokenizer(
hidden_size=hidden_size,
intermediate_size=intermediate_size,
num_attention_heads=num_attention_heads,
num_key_value_heads=num_key_value_heads,
rms_norm_eps=rms_norm_eps,
attention_bias=attention_bias,
attention_dropout=attention_dropout,
layer_types=layer_types,
head_dim=head_dim,
sliding_window=sliding_window,
use_sliding_window=use_sliding_window,
rope_theta=rope_theta,
max_position_embeddings=max_position_embeddings,
initializer_range=initializer_range,
pool_window_size=pool_window_size,
audio_acoustic_hidden_dim=audio_acoustic_hidden_dim,
num_attention_pooler_hidden_layers=num_attention_pooler_hidden_layers,
**kwargs,
)
def encode(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
"""VAE latent [B, T, 64] → discrete tokens."""
return self.tokenizer(hidden_states)
def decode(self, quantized: torch.Tensor) -> torch.Tensor:
"""Discrete tokens [B, T/5, hidden_size] → continuous [B, T, 64]."""
return self.detokenizer(quantized)
def tokenize(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
"""Convenience: [B, T, 64] → quantized + indices via patch rearrangement."""
return self.tokenizer.tokenize(x)
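The encode/decode shape contract documented above can be summarized with a small helper. `token_count` is a hypothetical illustration (not part of the module): it mirrors the `[B, T, 64] -> [B, T/5, ...]` mapping and the implicit requirement that the latent length divide evenly by the pooling window.

```python
def token_count(num_latent_frames: int, pool_window_size: int = 5) -> int:
    """Hypothetical helper mirroring the tokenizer's [B, T, 64] -> [B, T/5, ...]
    contract; T must be a multiple of the pooling window."""
    if num_latent_frames % pool_window_size != 0:
        raise ValueError("latent length must be a multiple of pool_window_size")
    return num_latent_frames // pool_window_size
```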


@@ -0,0 +1,287 @@
# Copyright 2025 The ACE-Step Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ACE-Step Audio VAE (AutoencoderOobleck CNN architecture).
This is a CNN-based VAE for audio waveform encoding/decoding.
It uses weight-normalized convolutions and Snake1d activations.
Does NOT depend on diffusers — pure nn.Module implementation.
"""
import math
from typing import Optional
import torch
import torch.nn as nn
from torch.nn.utils import weight_norm, remove_weight_norm
class Snake1d(nn.Module):
"""Snake activation: x + 1/(beta+eps) * sin(alpha*x)^2."""
def __init__(self, hidden_dim: int, logscale: bool = True):
super().__init__()
self.alpha = nn.Parameter(torch.zeros(1, hidden_dim, 1))
self.beta = nn.Parameter(torch.zeros(1, hidden_dim, 1))
self.logscale = logscale
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
shape = hidden_states.shape
alpha = torch.exp(self.alpha) if self.logscale else self.alpha
beta = torch.exp(self.beta) if self.logscale else self.beta
hidden_states = hidden_states.reshape(shape[0], shape[1], -1)
hidden_states = hidden_states + (beta + 1e-9).reciprocal() * torch.sin(alpha * hidden_states).pow(2)
return hidden_states.reshape(shape)
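The activation itself is simple enough to state as a scalar sketch. In the module `alpha`/`beta` are learned per channel (and exponentiated first when `logscale=True`); here they are plain positive floats:

```python
import math

def snake(x: float, alpha: float, beta: float, eps: float = 1e-9) -> float:
    """Scalar sketch of Snake1d: x + sin(alpha * x)**2 / (beta + eps)."""
    return x + (1.0 / (beta + eps)) * math.sin(alpha * x) ** 2
```

Note that `snake(0) == 0` for any parameters, so the activation preserves silence.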
class OobleckResidualUnit(nn.Module):
"""Residual unit: Snake1d → Conv1d(dilated) → Snake1d → Conv1d(1×1) + skip."""
def __init__(self, dimension: int = 16, dilation: int = 1):
super().__init__()
pad = ((7 - 1) * dilation) // 2
self.snake1 = Snake1d(dimension)
self.conv1 = weight_norm(nn.Conv1d(dimension, dimension, kernel_size=7, dilation=dilation, padding=pad))
self.snake2 = Snake1d(dimension)
self.conv2 = weight_norm(nn.Conv1d(dimension, dimension, kernel_size=1))
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
output = self.conv1(self.snake1(hidden_state))
output = self.conv2(self.snake2(output))
padding = (hidden_state.shape[-1] - output.shape[-1]) // 2
if padding > 0:
hidden_state = hidden_state[..., padding:-padding]
return hidden_state + output
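The padding choice `pad = ((7 - 1) * dilation) // 2` makes the dilated convolution exactly length-preserving for the odd kernel size used here, so the trimming branch in `forward` is a no-op for these layers. A pure-Python check against PyTorch's standard Conv1d output-length formula:

```python
def conv1d_out_len(length: int, kernel_size: int, dilation: int,
                   padding: int, stride: int = 1) -> int:
    # Standard PyTorch Conv1d output-length formula.
    return (length + 2 * padding - dilation * (kernel_size - 1) - 1) // stride + 1

for dilation in (1, 3, 9):
    pad = ((7 - 1) * dilation) // 2
    # Length preserved for every dilation used by OobleckResidualUnit.
    assert conv1d_out_len(120, kernel_size=7, dilation=dilation, padding=pad) == 120
```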
class OobleckEncoderBlock(nn.Module):
"""Encoder block: 3 residual units + downsampling conv."""
def __init__(self, input_dim: int, output_dim: int, stride: int = 1):
super().__init__()
self.res_unit1 = OobleckResidualUnit(input_dim, dilation=1)
self.res_unit2 = OobleckResidualUnit(input_dim, dilation=3)
self.res_unit3 = OobleckResidualUnit(input_dim, dilation=9)
self.snake1 = Snake1d(input_dim)
self.conv1 = weight_norm(
nn.Conv1d(input_dim, output_dim, kernel_size=2 * stride, stride=stride, padding=math.ceil(stride / 2))
)
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
hidden_state = self.res_unit1(hidden_state)
hidden_state = self.res_unit2(hidden_state)
hidden_state = self.snake1(self.res_unit3(hidden_state))
return self.conv1(hidden_state)
class OobleckDecoderBlock(nn.Module):
"""Decoder block: upsampling conv + 3 residual units."""
def __init__(self, input_dim: int, output_dim: int, stride: int = 1):
super().__init__()
self.snake1 = Snake1d(input_dim)
self.conv_t1 = weight_norm(
nn.ConvTranspose1d(
input_dim, output_dim, kernel_size=2 * stride, stride=stride, padding=math.ceil(stride / 2),
)
)
self.res_unit1 = OobleckResidualUnit(output_dim, dilation=1)
self.res_unit2 = OobleckResidualUnit(output_dim, dilation=3)
self.res_unit3 = OobleckResidualUnit(output_dim, dilation=9)
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
hidden_state = self.snake1(hidden_state)
hidden_state = self.conv_t1(hidden_state)
hidden_state = self.res_unit1(hidden_state)
hidden_state = self.res_unit2(hidden_state)
return self.res_unit3(hidden_state)
class OobleckEncoder(nn.Module):
"""Full encoder: audio → latent representation [B, encoder_hidden_size, T'].
conv1 → [blocks] → snake1 → conv2
"""
def __init__(
self,
encoder_hidden_size: int = 128,
audio_channels: int = 2,
downsampling_ratios: list = None,
channel_multiples: list = None,
):
super().__init__()
downsampling_ratios = downsampling_ratios or [2, 4, 4, 6, 10]
channel_multiples = channel_multiples or [1, 2, 4, 8, 16]
channel_multiples = [1] + channel_multiples
self.conv1 = weight_norm(nn.Conv1d(audio_channels, encoder_hidden_size, kernel_size=7, padding=3))
self.block = nn.ModuleList()
for stride_index, stride in enumerate(downsampling_ratios):
self.block.append(
OobleckEncoderBlock(
input_dim=encoder_hidden_size * channel_multiples[stride_index],
output_dim=encoder_hidden_size * channel_multiples[stride_index + 1],
stride=stride,
)
)
d_model = encoder_hidden_size * channel_multiples[-1]
self.snake1 = Snake1d(d_model)
self.conv2 = weight_norm(nn.Conv1d(d_model, encoder_hidden_size, kernel_size=3, padding=1))
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
hidden_state = self.conv1(hidden_state)
for block in self.block:
hidden_state = block(hidden_state)
hidden_state = self.snake1(hidden_state)
return self.conv2(hidden_state)
class OobleckDecoder(nn.Module):
"""Full decoder: latent → audio waveform [B, audio_channels, T].
conv1 → [blocks] → snake1 → conv2(no bias)
"""
def __init__(
self,
channels: int = 128,
input_channels: int = 64,
audio_channels: int = 2,
upsampling_ratios: list = None,
channel_multiples: list = None,
):
super().__init__()
upsampling_ratios = upsampling_ratios or [10, 6, 4, 4, 2]
channel_multiples = channel_multiples or [1, 2, 4, 8, 16]
channel_multiples = [1] + channel_multiples
self.conv1 = weight_norm(nn.Conv1d(input_channels, channels * channel_multiples[-1], kernel_size=7, padding=3))
self.block = nn.ModuleList()
for stride_index, stride in enumerate(upsampling_ratios):
self.block.append(
OobleckDecoderBlock(
input_dim=channels * channel_multiples[len(upsampling_ratios) - stride_index],
output_dim=channels * channel_multiples[len(upsampling_ratios) - stride_index - 1],
stride=stride,
)
)
self.snake1 = Snake1d(channels)
# conv2 has no bias (matches checkpoint: only weight_g/weight_v, no bias key)
self.conv2 = weight_norm(nn.Conv1d(channels, audio_channels, kernel_size=7, padding=3, bias=False))
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
hidden_state = self.conv1(hidden_state)
for block in self.block:
hidden_state = block(hidden_state)
hidden_state = self.snake1(hidden_state)
return self.conv2(hidden_state)
class OobleckDiagonalGaussianDistribution(object):
def __init__(self, parameters: torch.Tensor, deterministic: bool = False):
self.parameters = parameters
self.mean, self.scale = parameters.chunk(2, dim=1)
self.std = nn.functional.softplus(self.scale) + 1e-4
self.var = self.std * self.std
self.logvar = torch.log(self.var)
self.deterministic = deterministic
def sample(self, generator: torch.Generator | None = None) -> torch.Tensor:
# make sure sample is on the same device as the parameters and has same dtype
sample = torch.randn(
self.mean.shape,
generator=generator,
device=self.parameters.device,
dtype=self.parameters.dtype,
)
x = self.mean + self.std * sample
return x
def kl(self, other: "OobleckDiagonalGaussianDistribution" = None) -> torch.Tensor:
if self.deterministic:
return torch.Tensor([0.0])
else:
if other is None:
return (self.mean * self.mean + self.var - self.logvar - 1.0).sum(1).mean()
else:
normalized_diff = torch.pow(self.mean - other.mean, 2) / other.var
var_ratio = self.var / other.var
logvar_diff = self.logvar - other.logvar
kl = normalized_diff + var_ratio + logvar_diff - 1
kl = kl.sum(1).mean()
return kl
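Note that the expression above is twice the textbook KL divergence (the conventional 1/2 factor is omitted, presumably absorbed into a loss weight elsewhere). A scalar sketch of the per-dimension term against a standard normal:

```python
import math

def kl_term(mean: float, std: float) -> float:
    """Per-dimension KL term as written in OobleckDiagonalGaussianDistribution.kl
    against N(0, 1); note this is 2x the textbook KL (no 1/2 factor)."""
    var = std * std
    return mean * mean + var - math.log(var) - 1.0
```

A standard-normal posterior gives zero, as expected.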
class AceStepVAE(nn.Module):
"""Audio VAE for ACE-Step (AutoencoderOobleck architecture).
Encodes audio waveform → latent, decodes latent → audio waveform.
Uses Snake1d activations and weight-normalized convolutions.
"""
def __init__(
self,
encoder_hidden_size: int = 128,
downsampling_ratios: list = None,
channel_multiples: list = None,
decoder_channels: int = 128,
decoder_input_channels: int = 64,
audio_channels: int = 2,
sampling_rate: int = 48000,
):
super().__init__()
downsampling_ratios = downsampling_ratios or [2, 4, 4, 6, 10]
channel_multiples = channel_multiples or [1, 2, 4, 8, 16]
upsampling_ratios = downsampling_ratios[::-1]
self.encoder = OobleckEncoder(
encoder_hidden_size=encoder_hidden_size,
audio_channels=audio_channels,
downsampling_ratios=downsampling_ratios,
channel_multiples=channel_multiples,
)
self.decoder = OobleckDecoder(
channels=decoder_channels,
input_channels=decoder_input_channels,
audio_channels=audio_channels,
upsampling_ratios=upsampling_ratios,
channel_multiples=channel_multiples,
)
self.sampling_rate = sampling_rate
def encode(self, x: torch.Tensor) -> torch.Tensor:
"""Audio waveform [B, audio_channels, T] → latent [B, decoder_input_channels, T']."""
h = self.encoder(x)
output = OobleckDiagonalGaussianDistribution(h).sample()
return output
def decode(self, z: torch.Tensor) -> torch.Tensor:
"""Latent [B, decoder_input_channels, T] → audio waveform [B, audio_channels, T']."""
return self.decoder(z)
def forward(self, sample: torch.Tensor) -> torch.Tensor:
"""Full round-trip: encode → decode."""
z = self.encode(sample)
return self.decode(z)
def remove_weight_norm(self):
"""Remove weight normalization from all conv layers (for export/inference)."""
for module in self.modules():
if isinstance(module, nn.Conv1d) or isinstance(module, nn.ConvTranspose1d):
remove_weight_norm(module)
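The overall temporal compression of this VAE follows directly from the defaults above: the encoder hop is the product of the downsampling ratios, giving the latent frame rate at the default 48 kHz sampling rate.

```python
import math

downsampling_ratios = [2, 4, 4, 6, 10]  # AceStepVAE defaults
hop = math.prod(downsampling_ratios)    # audio samples per latent frame

print(hop)           # 1920
print(48000 // hop)  # 25 latent frames per second at 48 kHz
```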

File diff suppressed because it is too large


@@ -1,89 +0,0 @@
import torch
from einops import rearrange
def low_version_attention(query, key, value, attn_bias=None):
scale = 1 / query.shape[-1] ** 0.5
query = query * scale
attn = torch.matmul(query, key.transpose(-2, -1))
if attn_bias is not None:
attn = attn + attn_bias
attn = attn.softmax(-1)
return attn @ value
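The fallback above is the plain scale → matmul → (+bias) → softmax → weighted-sum pipeline. The same computation for a single query, as a dependency-free sketch rather than the tensor implementation:

```python
import math

def softmax(xs):
    m = max(xs)
    es = [math.exp(x - m) for x in xs]
    s = sum(es)
    return [e / s for e in es]

def attention_1q(query, keys, values):
    """Single-query dot-product attention: scale, score, softmax, mix values."""
    scale = 1 / math.sqrt(len(query))
    scores = [scale * sum(q * k for q, k in zip(query, key)) for key in keys]
    weights = softmax(scores)
    dim = len(values[0])
    return [sum(w * v[d] for w, v in zip(weights, values)) for d in range(dim)]
```

With identical keys the weights are uniform, so the output is the mean of the values.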
class Attention(torch.nn.Module):
def __init__(self, q_dim, num_heads, head_dim, kv_dim=None, bias_q=False, bias_kv=False, bias_out=False):
super().__init__()
dim_inner = head_dim * num_heads
kv_dim = kv_dim if kv_dim is not None else q_dim
self.num_heads = num_heads
self.head_dim = head_dim
self.to_q = torch.nn.Linear(q_dim, dim_inner, bias=bias_q)
self.to_k = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv)
self.to_v = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv)
self.to_out = torch.nn.Linear(dim_inner, q_dim, bias=bias_out)
def interact_with_ipadapter(self, hidden_states, q, ip_k, ip_v, scale=1.0):
batch_size = q.shape[0]
ip_k = ip_k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
ip_v = ip_v.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
ip_hidden_states = torch.nn.functional.scaled_dot_product_attention(q, ip_k, ip_v)
hidden_states = hidden_states + scale * ip_hidden_states
return hidden_states
def torch_forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None, ipadapter_kwargs=None, qkv_preprocessor=None):
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
batch_size = encoder_hidden_states.shape[0]
q = self.to_q(hidden_states)
k = self.to_k(encoder_hidden_states)
v = self.to_v(encoder_hidden_states)
q = q.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
k = k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
v = v.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
if qkv_preprocessor is not None:
q, k, v = qkv_preprocessor(q, k, v)
hidden_states = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
if ipadapter_kwargs is not None:
hidden_states = self.interact_with_ipadapter(hidden_states, q, **ipadapter_kwargs)
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)
hidden_states = hidden_states.to(q.dtype)
hidden_states = self.to_out(hidden_states)
return hidden_states
def xformers_forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None):
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
q = self.to_q(hidden_states)
k = self.to_k(encoder_hidden_states)
v = self.to_v(encoder_hidden_states)
q = rearrange(q, "b f (n d) -> (b n) f d", n=self.num_heads)
k = rearrange(k, "b f (n d) -> (b n) f d", n=self.num_heads)
v = rearrange(v, "b f (n d) -> (b n) f d", n=self.num_heads)
if attn_mask is not None:
hidden_states = low_version_attention(q, k, v, attn_bias=attn_mask)
else:
import xformers.ops as xops
hidden_states = xops.memory_efficient_attention(q, k, v)
hidden_states = rearrange(hidden_states, "(b n) f d -> b f (n d)", n=self.num_heads)
hidden_states = hidden_states.to(q.dtype)
hidden_states = self.to_out(hidden_states)
return hidden_states
def forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None, ipadapter_kwargs=None, qkv_preprocessor=None):
return self.torch_forward(hidden_states, encoder_hidden_states=encoder_hidden_states, attn_mask=attn_mask, ipadapter_kwargs=ipadapter_kwargs, qkv_preprocessor=qkv_preprocessor)


@@ -0,0 +1,96 @@
from transformers import DINOv3ViTModel, DINOv3ViTImageProcessorFast
from transformers.models.dinov3_vit.modeling_dinov3_vit import DINOv3ViTConfig
import torch
from ..core.device.npu_compatible_device import get_device_type
class DINOv3ImageEncoder(DINOv3ViTModel):
def __init__(self):
config = DINOv3ViTConfig(
architectures = [
"DINOv3ViTModel"
],
attention_dropout = 0.0,
drop_path_rate = 0.0,
dtype = "float32",
hidden_act = "silu",
hidden_size = 4096,
image_size = 224,
initializer_range = 0.02,
intermediate_size = 8192,
key_bias = False,
layer_norm_eps = 1e-05,
layerscale_value = 1.0,
mlp_bias = True,
model_type = "dinov3_vit",
num_attention_heads = 32,
num_channels = 3,
num_hidden_layers = 40,
num_register_tokens = 4,
patch_size = 16,
pos_embed_jitter = None,
pos_embed_rescale = 2.0,
pos_embed_shift = None,
proj_bias = True,
query_bias = False,
rope_theta = 100.0,
transformers_version = "4.56.1",
use_gated_mlp = True,
value_bias = False
)
super().__init__(config)
self.processor = DINOv3ViTImageProcessorFast(
crop_size = None,
data_format = "channels_first",
default_to_square = True,
device = None,
disable_grouping = None,
do_center_crop = None,
do_convert_rgb = None,
do_normalize = True,
do_rescale = True,
do_resize = True,
image_mean = [
0.485,
0.456,
0.406
],
image_processor_type = "DINOv3ViTImageProcessorFast",
image_std = [
0.229,
0.224,
0.225
],
input_data_format = None,
resample = 2,
rescale_factor = 0.00392156862745098,
return_tensors = None,
size = {
"height": 224,
"width": 224
}
)
def forward(self, image, torch_dtype=torch.bfloat16, device=get_device_type()):
inputs = self.processor(images=image, return_tensors="pt")
pixel_values = inputs["pixel_values"].to(dtype=torch_dtype, device=device)
bool_masked_pos = None
head_mask = None
hidden_states = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
position_embeddings = self.rope_embeddings(pixel_values)
for i, layer_module in enumerate(self.layer):
layer_head_mask = head_mask[i] if head_mask is not None else None
hidden_states = layer_module(
hidden_states,
attention_mask=layer_head_mask,
position_embeddings=position_embeddings,
)
sequence_output = self.norm(hidden_states)
pooled_output = sequence_output[:, 0, :]
return pooled_output
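Assuming the usual DINOv3 token layout (one CLS token, `num_register_tokens` registers, then patch tokens), the config above implies the following sequence length:

```python
image_size, patch_size = 224, 16  # from the DINOv3ViTConfig above
num_register_tokens = 4

num_patches = (image_size // patch_size) ** 2
seq_len = 1 + num_register_tokens + num_patches  # CLS + registers + patches
print(num_patches, seq_len)  # 196 201
```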


@@ -0,0 +1,362 @@
"""
Ernie-Image DiT for DiffSynth-Studio.
Refactored from diffusers ErnieImageTransformer2DModel to use DiffSynth core modules.
Default parameters from actual checkpoint config.json (PaddlePaddle/ERNIE-Image transformer).
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Tuple
from ..core.attention import attention_forward
from ..core.gradient import gradient_checkpoint_forward
from .flux2_dit import Timesteps, TimestepEmbedding
def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
assert dim % 2 == 0
scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
omega = 1.0 / (theta ** scale)
out = torch.einsum("...n,d->...nd", pos, omega)
return out.float()
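The frequency schedule computed by `rope` above is the standard RoPE one: each channel pair `i` rotates with inverse frequency `1 / theta**(i / dim)`, fastest at index 0. A list-based sketch:

```python
def rope_frequencies(dim: int, theta: float) -> list:
    """Per-pair inverse frequencies as computed in `rope`:
    omega = 1 / theta ** (i / dim) for even i in [0, dim)."""
    assert dim % 2 == 0
    return [1.0 / theta ** (i / dim) for i in range(0, dim, 2)]
```

Frequencies start at 1.0 and decay monotonically toward `1 / theta`.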
class ErnieImageEmbedND3(nn.Module):
def __init__(self, dim: int, theta: int, axes_dim: Tuple[int, int, int]):
super().__init__()
self.dim = dim
self.theta = theta
self.axes_dim = list(axes_dim)
def forward(self, ids: torch.Tensor) -> torch.Tensor:
emb = torch.cat([rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(3)], dim=-1)
emb = emb.unsqueeze(2)
return torch.stack([emb, emb], dim=-1).reshape(*emb.shape[:-1], -1)
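Since the three per-axis embeddings are concatenated on the last axis, `axes_dim` must sum to the attention head dimension it rotates. A quick consistency check using the `ErnieImageDiT` defaults later in this file:

```python
rope_axes_dim = (32, 48, 48)           # ErnieImageDiT default
head_dim = 4096 // 32                  # hidden_size // num_attention_heads
assert sum(rope_axes_dim) == head_dim  # 128
```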
class ErnieImagePatchEmbedDynamic(nn.Module):
def __init__(self, in_channels: int, embed_dim: int, patch_size: int):
super().__init__()
self.patch_size = patch_size
self.proj = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size, bias=True)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.proj(x)
batch_size, dim, height, width = x.shape
return x.reshape(batch_size, dim, height * width).transpose(1, 2).contiguous()
class ErnieImageSingleStreamAttnProcessor:
def __call__(
self,
attn: "ErnieImageAttention",
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
freqs_cis: Optional[torch.Tensor] = None,
) -> torch.Tensor:
query = attn.to_q(hidden_states)
key = attn.to_k(hidden_states)
value = attn.to_v(hidden_states)
query = query.unflatten(-1, (attn.heads, -1))
key = key.unflatten(-1, (attn.heads, -1))
value = value.unflatten(-1, (attn.heads, -1))
if attn.norm_q is not None:
query = attn.norm_q(query)
if attn.norm_k is not None:
key = attn.norm_k(key)
def apply_rotary_emb(x_in: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
rot_dim = freqs_cis.shape[-1]
x, x_pass = x_in[..., :rot_dim], x_in[..., rot_dim:]
cos_ = torch.cos(freqs_cis).to(x.dtype)
sin_ = torch.sin(freqs_cis).to(x.dtype)
x1, x2 = x.chunk(2, dim=-1)
x_rotated = torch.cat((-x2, x1), dim=-1)
return torch.cat((x * cos_ + x_rotated * sin_, x_pass), dim=-1)
if freqs_cis is not None:
query = apply_rotary_emb(query, freqs_cis)
key = apply_rotary_emb(key, freqs_cis)
if attention_mask is not None and attention_mask.ndim == 2:
attention_mask = attention_mask[:, None, None, :]
hidden_states = attention_forward(
query, key, value,
q_pattern="b s n d",
k_pattern="b s n d",
v_pattern="b s n d",
out_pattern="b s n d",
attn_mask=attention_mask,
)
hidden_states = hidden_states.flatten(2, 3)
hidden_states = hidden_states.to(query.dtype)
output = attn.to_out[0](hidden_states)
return output
class ErnieImageAttention(nn.Module):
def __init__(
self,
query_dim: int,
heads: int = 8,
dim_head: int = 64,
dropout: float = 0.0,
bias: bool = False,
qk_norm: Optional[str] = "rms_norm",
out_bias: bool = True,
eps: float = 1e-5,
out_dim: int = None,
elementwise_affine: bool = True,
):
super().__init__()
self.head_dim = dim_head
self.inner_dim = out_dim if out_dim is not None else dim_head * heads
self.query_dim = query_dim
self.out_dim = out_dim if out_dim is not None else query_dim
self.heads = out_dim // dim_head if out_dim is not None else heads
self.use_bias = bias
self.dropout = dropout
self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias)
self.to_k = nn.Linear(query_dim, self.inner_dim, bias=bias)
self.to_v = nn.Linear(query_dim, self.inner_dim, bias=bias)
if qk_norm == "layer_norm":
self.norm_q = nn.LayerNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
self.norm_k = nn.LayerNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
elif qk_norm == "rms_norm":
self.norm_q = nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
self.norm_k = nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
elif qk_norm is None:
# No q/k normalization (e.g. when qk_layernorm=False in the caller).
self.norm_q = None
self.norm_k = None
else:
raise ValueError(
f"unknown qk_norm: {qk_norm}. Should be one of None, 'layer_norm', 'rms_norm'."
)
self.to_out = nn.ModuleList([])
self.to_out.append(nn.Linear(self.inner_dim, self.out_dim, bias=out_bias))
self.processor = ErnieImageSingleStreamAttnProcessor()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
image_rotary_emb: Optional[torch.Tensor] = None,
) -> torch.Tensor:
return self.processor(self, hidden_states, attention_mask, image_rotary_emb)
class ErnieImageFeedForward(nn.Module):
def __init__(self, hidden_size: int, ffn_hidden_size: int):
super().__init__()
self.gate_proj = nn.Linear(hidden_size, ffn_hidden_size, bias=False)
self.up_proj = nn.Linear(hidden_size, ffn_hidden_size, bias=False)
self.linear_fc2 = nn.Linear(ffn_hidden_size, hidden_size, bias=False)
def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.linear_fc2(self.up_proj(x) * F.gelu(self.gate_proj(x)))
class ErnieImageRMSNorm(nn.Module):
def __init__(self, dim: int, eps: float = 1e-6):
super().__init__()
self.eps = eps
self.weight = nn.Parameter(torch.ones(dim))
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
input_dtype = hidden_states.dtype
variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
hidden_states = hidden_states * self.weight
return hidden_states.to(input_dtype)
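The normalization above scales each vector by the reciprocal of its root-mean-square before applying the learned weight. A list-based sketch with unit weight:

```python
import math

def rms_norm(xs, eps: float = 1e-6):
    """Pure-Python sketch of ErnieImageRMSNorm (unit weight):
    scale the vector by 1 / sqrt(mean(x^2) + eps)."""
    inv_rms = 1.0 / math.sqrt(sum(x * x for x in xs) / len(xs) + eps)
    return [x * inv_rms for x in xs]
```

After normalization the mean square of the output is (up to eps) exactly 1.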
class ErnieImageSharedAdaLNBlock(nn.Module):
def __init__(
self,
hidden_size: int,
num_heads: int,
ffn_hidden_size: int,
eps: float = 1e-6,
qk_layernorm: bool = True,
):
super().__init__()
self.adaLN_sa_ln = ErnieImageRMSNorm(hidden_size, eps=eps)
self.self_attention = ErnieImageAttention(
query_dim=hidden_size,
dim_head=hidden_size // num_heads,
heads=num_heads,
qk_norm="rms_norm" if qk_layernorm else None,
eps=eps,
bias=False,
out_bias=False,
)
self.adaLN_mlp_ln = ErnieImageRMSNorm(hidden_size, eps=eps)
self.mlp = ErnieImageFeedForward(hidden_size, ffn_hidden_size)
def forward(
self,
x: torch.Tensor,
rotary_pos_emb: torch.Tensor,
temb: Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = temb
residual = x
x = self.adaLN_sa_ln(x)
x = (x.float() * (1 + scale_msa.float()) + shift_msa.float()).to(x.dtype)
x_bsh = x.permute(1, 0, 2)
attn_out = self.self_attention(x_bsh, attention_mask=attention_mask, image_rotary_emb=rotary_pos_emb)
attn_out = attn_out.permute(1, 0, 2)
x = residual + (gate_msa.float() * attn_out.float()).to(x.dtype)
residual = x
x = self.adaLN_mlp_ln(x)
x = (x.float() * (1 + scale_mlp.float()) + shift_mlp.float()).to(x.dtype)
return residual + (gate_mlp.float() * self.mlp(x).float()).to(x.dtype)
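`ErnieImageSharedAdaLNBlock` applies the same modulated-residual pattern twice, once around attention and once around the MLP. A scalar sketch of one such step, with a hypothetical `shared_adaln_step` helper (the real block also applies RMSNorm before modulation and does the arithmetic in float32):

```python
def shared_adaln_step(x, f, shift, scale, gate):
    # One modulated sub-block: residual + gate * f(x * (1 + scale) + shift).
    return x + gate * f(x * (1.0 + scale) + shift)

# With f doubling its input: f(1*(1+0)+0.5) = 3.0, result = 1.0 + 0.1*3.0 = 1.3
y = shared_adaln_step(1.0, lambda v: 2.0 * v, shift=0.5, scale=0.0, gate=0.1)
```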
class ErnieImageAdaLNContinuous(nn.Module):
def __init__(self, hidden_size: int, eps: float = 1e-6):
super().__init__()
self.norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=eps)
self.linear = nn.Linear(hidden_size, hidden_size * 2)
def forward(self, x: torch.Tensor, conditioning: torch.Tensor) -> torch.Tensor:
scale, shift = self.linear(conditioning).chunk(2, dim=-1)
x = self.norm(x)
x = x * (1 + scale.unsqueeze(0)) + shift.unsqueeze(0)
return x
class ErnieImageDiT(nn.Module):
"""
Ernie-Image DiT model for DiffSynth-Studio.
Architecture: SharedAdaLN + RoPE 3D + Joint Image-Text Attention.
Internal format: [S, B, H] for transformer blocks, [B, S, H] for attention.
"""
def __init__(
self,
hidden_size: int = 4096,
num_attention_heads: int = 32,
num_layers: int = 36,
ffn_hidden_size: int = 12288,
in_channels: int = 128,
out_channels: int = 128,
patch_size: int = 1,
text_in_dim: int = 3072,
rope_theta: int = 256,
rope_axes_dim: Tuple[int, int, int] = (32, 48, 48),
eps: float = 1e-6,
qk_layernorm: bool = True,
):
super().__init__()
self.hidden_size = hidden_size
self.num_heads = num_attention_heads
self.head_dim = hidden_size // num_attention_heads
self.num_layers = num_layers
self.patch_size = patch_size
self.in_channels = in_channels
self.out_channels = out_channels
self.text_in_dim = text_in_dim
self.x_embedder = ErnieImagePatchEmbedDynamic(in_channels, hidden_size, patch_size)
self.text_proj = nn.Linear(text_in_dim, hidden_size, bias=False) if text_in_dim != hidden_size else None
self.time_proj = Timesteps(hidden_size, flip_sin_to_cos=False, downscale_freq_shift=0)
self.time_embedding = TimestepEmbedding(hidden_size, hidden_size)
self.pos_embed = ErnieImageEmbedND3(dim=self.head_dim, theta=rope_theta, axes_dim=rope_axes_dim)
self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size))
nn.init.zeros_(self.adaLN_modulation[-1].weight)
nn.init.zeros_(self.adaLN_modulation[-1].bias)
self.layers = nn.ModuleList([
ErnieImageSharedAdaLNBlock(hidden_size, num_attention_heads, ffn_hidden_size, eps, qk_layernorm=qk_layernorm)
for _ in range(num_layers)
])
self.final_norm = ErnieImageAdaLNContinuous(hidden_size, eps)
self.final_linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels)
nn.init.zeros_(self.final_linear.weight)
nn.init.zeros_(self.final_linear.bias)
def forward(
self,
hidden_states: torch.Tensor,
timestep: torch.Tensor,
text_bth: torch.Tensor,
text_lens: torch.Tensor,
use_gradient_checkpointing: bool = False,
use_gradient_checkpointing_offload: bool = False,
) -> torch.Tensor:
device, dtype = hidden_states.device, hidden_states.dtype
B, C, H, W = hidden_states.shape
p, Hp, Wp = self.patch_size, H // self.patch_size, W // self.patch_size
N_img = Hp * Wp
img_sbh = self.x_embedder(hidden_states).transpose(0, 1).contiguous()
if self.text_proj is not None and text_bth.numel() > 0:
text_bth = self.text_proj(text_bth)
Tmax = text_bth.shape[1]
text_sbh = text_bth.transpose(0, 1).contiguous()
x = torch.cat([img_sbh, text_sbh], dim=0)
S = x.shape[0]
text_ids = torch.cat([
torch.arange(Tmax, device=device, dtype=torch.float32).view(1, Tmax, 1).expand(B, -1, -1),
torch.zeros((B, Tmax, 2), device=device)
], dim=-1) if Tmax > 0 else torch.zeros((B, 0, 3), device=device)
grid_yx = torch.stack(
torch.meshgrid(torch.arange(Hp, device=device, dtype=torch.float32),
torch.arange(Wp, device=device, dtype=torch.float32), indexing="ij"),
dim=-1
).reshape(-1, 2)
image_ids = torch.cat([
text_lens.float().view(B, 1, 1).expand(-1, N_img, -1),
grid_yx.view(1, N_img, 2).expand(B, -1, -1)
], dim=-1)
rotary_pos_emb = self.pos_embed(torch.cat([image_ids, text_ids], dim=1))
valid_text = torch.arange(Tmax, device=device).view(1, Tmax) < text_lens.view(B, 1) if Tmax > 0 else torch.zeros((B, 0), device=device, dtype=torch.bool)
attention_mask = torch.cat([
torch.ones((B, N_img), device=device, dtype=torch.bool),
valid_text
], dim=1)[:, None, None, :]
sample = self.time_proj(timestep.to(dtype))
sample = sample.to(self.time_embedding.linear_1.weight.dtype)
c = self.time_embedding(sample)
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = [
t.unsqueeze(0).expand(S, -1, -1).contiguous()
for t in self.adaLN_modulation(c).chunk(6, dim=-1)
]
for layer in self.layers:
temb = [shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp]
if torch.is_grad_enabled() and use_gradient_checkpointing:
x = gradient_checkpoint_forward(
layer,
use_gradient_checkpointing,
use_gradient_checkpointing_offload,
x,
rotary_pos_emb,
temb,
attention_mask,
)
else:
x = layer(x, rotary_pos_emb, temb, attention_mask)
x = self.final_norm(x, c).type_as(x)
patches = self.final_linear(x)[:N_img].transpose(0, 1).contiguous()
output = patches.view(B, Hp, Wp, p, p, self.out_channels).permute(0, 5, 1, 3, 2, 4).contiguous().view(B, self.out_channels, H, W)
return output
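The view/permute chain at the end of `forward` is the standard unpatchify: token `patch_row * Wp + patch_col` supplies pixel block `(patch_row*p : patch_row*p+p, patch_col*p : patch_col*p+p)`. The index arithmetic can be checked in isolation with a hypothetical helper:

```python
def unpatchify_index(h, w, p, Wp):
    # Which token and which intra-patch element land at pixel (h, w)?
    # Mirrors the view(B, Hp, Wp, p, p, C).permute(...) at the end of
    # ErnieImageDiT.forward, for a single channel.
    patch_row, patch_col = h // p, w // p
    in_row, in_col = h % p, w % p
    patch_index = patch_row * Wp + patch_col  # position in the token sequence
    within = in_row * p + in_col              # position inside the p*p patch
    return patch_index, within

# For p=2 and Wp=2: pixel (3, 1) sits in patch (1, 0) at intra-patch offset (1, 1).
assert unpatchify_index(3, 1, 2, 2) == (2, 3)
```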


@@ -0,0 +1,76 @@
"""
Ernie-Image TextEncoder for DiffSynth-Studio.
Wraps transformers Ministral3Model to output text embeddings.
Pattern: lazy import + manual config dict + torch.nn.Module wrapper.
Only loads the text (language) model, ignoring vision components.
"""
import torch
class ErnieImageTextEncoder(torch.nn.Module):
"""
Text encoder using Ministral3Model (transformers).
Loads only the text_config portion of the full Mistral3 checkpoint.
Uses the base model (no lm_head), since the checkpoint ships only the tied embedding weights.
"""
def __init__(self):
super().__init__()
from transformers import Ministral3Config, Ministral3Model
text_config = {
"attention_dropout": 0.0,
"bos_token_id": 1,
"dtype": "bfloat16",
"eos_token_id": 2,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 3072,
"initializer_range": 0.02,
"intermediate_size": 9216,
"max_position_embeddings": 262144,
"model_type": "ministral3",
"num_attention_heads": 32,
"num_hidden_layers": 26,
"num_key_value_heads": 8,
"pad_token_id": 11,
"rms_norm_eps": 1e-05,
"rope_parameters": {
"beta_fast": 32.0,
"beta_slow": 1.0,
"factor": 16.0,
"llama_4_scaling_beta": 0.1,
"mscale": 1.0,
"mscale_all_dim": 1.0,
"original_max_position_embeddings": 16384,
"rope_theta": 1000000.0,
"rope_type": "yarn",
"type": "yarn",
},
"sliding_window": None,
"tie_word_embeddings": True,
"use_cache": True,
"vocab_size": 131072,
}
config = Ministral3Config(**text_config)
self.model = Ministral3Model(config)
self.config = config
def forward(
self,
input_ids=None,
attention_mask=None,
position_ids=None,
**kwargs,
):
outputs = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_hidden_states=True,
return_dict=True,
**kwargs,
)
return (outputs.hidden_states,)

File diff suppressed because it is too large


@@ -0,0 +1,58 @@
from transformers import Mistral3ForConditionalGeneration, Mistral3Config
class Flux2TextEncoder(Mistral3ForConditionalGeneration):
def __init__(self):
config = Mistral3Config(**{
"architectures": [
"Mistral3ForConditionalGeneration"
],
"dtype": "bfloat16",
"image_token_index": 10,
"model_type": "mistral3",
"multimodal_projector_bias": False,
"projector_hidden_act": "gelu",
"spatial_merge_size": 2,
"text_config": {
"attention_dropout": 0.0,
"dtype": "bfloat16",
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 5120,
"initializer_range": 0.02,
"intermediate_size": 32768,
"max_position_embeddings": 131072,
"model_type": "mistral",
"num_attention_heads": 32,
"num_hidden_layers": 40,
"num_key_value_heads": 8,
"rms_norm_eps": 1e-05,
"rope_theta": 1000000000.0,
"sliding_window": None,
"use_cache": True,
"vocab_size": 131072
},
"transformers_version": "4.57.1",
"vision_config": {
"attention_dropout": 0.0,
"dtype": "bfloat16",
"head_dim": 64,
"hidden_act": "silu",
"hidden_size": 1024,
"image_size": 1540,
"initializer_range": 0.02,
"intermediate_size": 4096,
"model_type": "pixtral",
"num_attention_heads": 16,
"num_channels": 3,
"num_hidden_layers": 24,
"patch_size": 14,
"rope_theta": 10000.0
},
"vision_feature_layer": -1
})
super().__init__(config)
def forward(self, input_ids=None, pixel_values=None, attention_mask=None, position_ids=None, past_key_values=None, inputs_embeds=None, labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None, cache_position=None, logits_to_keep=0, image_sizes=None, **kwargs):
return super().forward(input_ids, pixel_values, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position, logits_to_keep, image_sizes, **kwargs)

File diff suppressed because it is too large


@@ -0,0 +1,384 @@
import hashlib
import torch
from einops import rearrange, repeat
from .flux_dit import RoPEEmbedding, TimestepEmbeddings, FluxJointTransformerBlock, FluxSingleTransformerBlock, RMSNorm
# from .utils import hash_state_dict_keys, init_weights_on_device
from contextlib import contextmanager
def hash_state_dict_keys(state_dict, with_shape=True):
# convert_state_dict_keys_to_single_str must be in scope (in DiffSynth-Studio it lives in .utils).
keys_str = convert_state_dict_keys_to_single_str(state_dict, with_shape=with_shape)
keys_str = keys_str.encode(encoding="UTF-8")
return hashlib.md5(keys_str).hexdigest()
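`convert_state_dict_keys_to_single_str` is not shown in this diff; a self-contained stand-in that captures the idea (serialize the key/shape pairs, hash the UTF-8 bytes) might look like the sketch below. The serialization format here is an assumption, so these digests will not match the hashes the converter checks against.

```python
import hashlib

def hash_keys(keys_with_shapes):
    # Hypothetical serializer: sorted "key:shape" pairs joined and MD5-hashed.
    keys_str = ";".join(f"{k}:{s}" for k, s in sorted(keys_with_shapes))
    return hashlib.md5(keys_str.encode("utf-8")).hexdigest()

h = hash_keys([("x_embedder.weight", (3072, 64))])
```

The point is that the hash depends only on key names (and optionally shapes), never on weight values, which is why it can cheaply identify checkpoint variants.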
@contextmanager
def init_weights_on_device(device=torch.device("meta"), include_buffers: bool = False):
old_register_parameter = torch.nn.Module.register_parameter
if include_buffers:
old_register_buffer = torch.nn.Module.register_buffer
def register_empty_parameter(module, name, param):
old_register_parameter(module, name, param)
if param is not None:
param_cls = type(module._parameters[name])
kwargs = module._parameters[name].__dict__
kwargs["requires_grad"] = param.requires_grad
module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs)
def register_empty_buffer(module, name, buffer, persistent=True):
old_register_buffer(module, name, buffer, persistent=persistent)
if buffer is not None:
module._buffers[name] = module._buffers[name].to(device)
def patch_tensor_constructor(fn):
def wrapper(*args, **kwargs):
kwargs["device"] = device
return fn(*args, **kwargs)
return wrapper
if include_buffers:
tensor_constructors_to_patch = {
torch_function_name: getattr(torch, torch_function_name)
for torch_function_name in ["empty", "zeros", "ones", "full"]
}
else:
tensor_constructors_to_patch = {}
try:
torch.nn.Module.register_parameter = register_empty_parameter
if include_buffers:
torch.nn.Module.register_buffer = register_empty_buffer
for torch_function_name in tensor_constructors_to_patch.keys():
setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name)))
yield
finally:
torch.nn.Module.register_parameter = old_register_parameter
if include_buffers:
torch.nn.Module.register_buffer = old_register_buffer
for torch_function_name, old_torch_function in tensor_constructors_to_patch.items():
setattr(torch, torch_function_name, old_torch_function)
class FluxControlNet(torch.nn.Module):
def __init__(self, disable_guidance_embedder=False, num_joint_blocks=5, num_single_blocks=10, num_mode=0, mode_dict={}, additional_input_dim=0):
super().__init__()
self.pos_embedder = RoPEEmbedding(3072, 10000, [16, 56, 56])
self.time_embedder = TimestepEmbeddings(256, 3072)
self.guidance_embedder = None if disable_guidance_embedder else TimestepEmbeddings(256, 3072)
self.pooled_text_embedder = torch.nn.Sequential(torch.nn.Linear(768, 3072), torch.nn.SiLU(), torch.nn.Linear(3072, 3072))
self.context_embedder = torch.nn.Linear(4096, 3072)
self.x_embedder = torch.nn.Linear(64, 3072)
self.blocks = torch.nn.ModuleList([FluxJointTransformerBlock(3072, 24) for _ in range(num_joint_blocks)])
self.single_blocks = torch.nn.ModuleList([FluxSingleTransformerBlock(3072, 24) for _ in range(num_single_blocks)])
self.controlnet_blocks = torch.nn.ModuleList([torch.nn.Linear(3072, 3072) for _ in range(num_joint_blocks)])
self.controlnet_single_blocks = torch.nn.ModuleList([torch.nn.Linear(3072, 3072) for _ in range(num_single_blocks)])
self.mode_dict = mode_dict
self.controlnet_mode_embedder = torch.nn.Embedding(num_mode, 3072) if len(mode_dict) > 0 else None
self.controlnet_x_embedder = torch.nn.Linear(64 + additional_input_dim, 3072)
def prepare_image_ids(self, latents):
batch_size, _, height, width = latents.shape
latent_image_ids = torch.zeros(height // 2, width // 2, 3)
latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
latent_image_ids = latent_image_ids[None, :].repeat(batch_size, 1, 1, 1)
latent_image_ids = latent_image_ids.reshape(
batch_size, latent_image_id_height * latent_image_id_width, latent_image_id_channels
)
latent_image_ids = latent_image_ids.to(device=latents.device, dtype=latents.dtype)
return latent_image_ids
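For a single sample, `prepare_image_ids` just enumerates the grid of 2x2 latent patches with ids `(0, row, col)`. A pure-Python equivalent (hypothetical helper, lists standing in for tensors):

```python
def prepare_image_ids(height, width):
    # Mirrors FluxControlNet.prepare_image_ids for one sample: each 2x2
    # latent patch gets an id (0, row, col), flattened row-major.
    h2, w2 = height // 2, width // 2
    return [[0.0, float(r), float(c)] for r in range(h2) for c in range(w2)]

ids = prepare_image_ids(4, 6)  # 2x3 patch grid -> 6 ids
```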
def patchify(self, hidden_states):
hidden_states = rearrange(hidden_states, "B C (H P) (W Q) -> B (H W) (C P Q)", P=2, Q=2)
return hidden_states
def align_res_stack_to_original_blocks(self, res_stack, num_blocks, hidden_states):
if len(res_stack) == 0:
return [torch.zeros_like(hidden_states)] * num_blocks
interval = (num_blocks + len(res_stack) - 1) // len(res_stack)
aligned_res_stack = [res_stack[block_id // interval] for block_id in range(num_blocks)]
return aligned_res_stack
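`align_res_stack_to_original_blocks` spreads a short list of ControlNet residuals over the full DiT block count by ceil-dividing to get a reuse interval. The index mapping can be checked in isolation (hypothetical helper):

```python
def align_indices(num_blocks, num_res):
    # Same arithmetic as align_res_stack_to_original_blocks: ceil-divide,
    # then map each DiT block to one ControlNet residual.
    interval = (num_blocks + num_res - 1) // num_res
    return [block_id // interval for block_id in range(num_blocks)]

# 5 ControlNet joint blocks feeding 19 DiT joint blocks: interval = 4,
# so residuals are reused 4, 4, 4, 4, and 3 times respectively.
assert align_indices(19, 5) == [0]*4 + [1]*4 + [2]*4 + [3]*4 + [4]*3
```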
def forward(
self,
hidden_states,
controlnet_conditioning,
timestep, prompt_emb, pooled_prompt_emb, guidance, text_ids, image_ids=None,
processor_id=None,
tiled=False, tile_size=128, tile_stride=64,
**kwargs
):
if image_ids is None:
image_ids = self.prepare_image_ids(hidden_states)
conditioning = self.time_embedder(timestep, hidden_states.dtype) + self.pooled_text_embedder(pooled_prompt_emb)
if self.guidance_embedder is not None:
guidance = guidance * 1000
conditioning = conditioning + self.guidance_embedder(guidance, hidden_states.dtype)
prompt_emb = self.context_embedder(prompt_emb)
if self.controlnet_mode_embedder is not None: # Different from FluxDiT
processor_id = torch.tensor([self.mode_dict[processor_id]], dtype=torch.int)
processor_id = repeat(processor_id, "D -> B D", B=1).to(text_ids.device)
prompt_emb = torch.concat([self.controlnet_mode_embedder(processor_id), prompt_emb], dim=1)
text_ids = torch.cat([text_ids[:, :1], text_ids], dim=1)
image_rotary_emb = self.pos_embedder(torch.cat((text_ids, image_ids), dim=1))
hidden_states = self.patchify(hidden_states)
hidden_states = self.x_embedder(hidden_states)
controlnet_conditioning = self.patchify(controlnet_conditioning) # Different from FluxDiT
hidden_states = hidden_states + self.controlnet_x_embedder(controlnet_conditioning) # Different from FluxDiT
controlnet_res_stack = []
for block, controlnet_block in zip(self.blocks, self.controlnet_blocks):
hidden_states, prompt_emb = block(hidden_states, prompt_emb, conditioning, image_rotary_emb)
controlnet_res_stack.append(controlnet_block(hidden_states))
controlnet_single_res_stack = []
hidden_states = torch.cat([prompt_emb, hidden_states], dim=1)
for block, controlnet_block in zip(self.single_blocks, self.controlnet_single_blocks):
hidden_states, prompt_emb = block(hidden_states, prompt_emb, conditioning, image_rotary_emb)
controlnet_single_res_stack.append(controlnet_block(hidden_states[:, prompt_emb.shape[1]:]))
controlnet_res_stack = self.align_res_stack_to_original_blocks(controlnet_res_stack, 19, hidden_states[:, prompt_emb.shape[1]:])
controlnet_single_res_stack = self.align_res_stack_to_original_blocks(controlnet_single_res_stack, 38, hidden_states[:, prompt_emb.shape[1]:])
return controlnet_res_stack, controlnet_single_res_stack
# @staticmethod
# def state_dict_converter():
# return FluxControlNetStateDictConverter()
def quantize(self):
def cast_to(weight, dtype=None, device=None, copy=False):
if device is None or weight.device == device:
if not copy:
if dtype is None or weight.dtype == dtype:
return weight
return weight.to(dtype=dtype, copy=copy)
r = torch.empty_like(weight, dtype=dtype, device=device)
r.copy_(weight)
return r
def cast_weight(s, input=None, dtype=None, device=None):
if input is not None:
if dtype is None:
dtype = input.dtype
if device is None:
device = input.device
weight = cast_to(s.weight, dtype, device)
return weight
def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None):
if input is not None:
if dtype is None:
dtype = input.dtype
if bias_dtype is None:
bias_dtype = dtype
if device is None:
device = input.device
bias = None
weight = cast_to(s.weight, dtype, device)
bias = cast_to(s.bias, bias_dtype, device)
return weight, bias
class quantized_layer:
class QLinear(torch.nn.Linear):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def forward(self, input, **kwargs):
weight, bias = cast_bias_weight(self, input)
return torch.nn.functional.linear(input, weight, bias)
class QRMSNorm(torch.nn.Module):
def __init__(self, module):
super().__init__()
self.module = module
def forward(self, hidden_states, **kwargs):
weight = cast_weight(self.module, hidden_states)
input_dtype = hidden_states.dtype
variance = hidden_states.to(torch.float32).square().mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.module.eps)
hidden_states = hidden_states.to(input_dtype) * weight
return hidden_states
class QEmbedding(torch.nn.Embedding):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def forward(self, input, **kwargs):
weight = cast_weight(self, input)
return torch.nn.functional.embedding(
input, weight, self.padding_idx, self.max_norm,
self.norm_type, self.scale_grad_by_freq, self.sparse)
def replace_layer(model):
for name, module in model.named_children():
if isinstance(module,quantized_layer.QRMSNorm):
continue
if isinstance(module, torch.nn.Linear):
with init_weights_on_device():
new_layer = quantized_layer.QLinear(module.in_features,module.out_features)
new_layer.weight = module.weight
if module.bias is not None:
new_layer.bias = module.bias
setattr(model, name, new_layer)
elif isinstance(module, RMSNorm):
if hasattr(module, "quantized"):
continue
module.quantized = True
new_layer = quantized_layer.QRMSNorm(module)
setattr(model, name, new_layer)
elif isinstance(module,torch.nn.Embedding):
rows, cols = module.weight.shape
new_layer = quantized_layer.QEmbedding(
num_embeddings=rows,
embedding_dim=cols,
_weight=module.weight,
# _freeze=module.freeze,
padding_idx=module.padding_idx,
max_norm=module.max_norm,
norm_type=module.norm_type,
scale_grad_by_freq=module.scale_grad_by_freq,
sparse=module.sparse)
setattr(model, name, new_layer)
else:
replace_layer(module)
replace_layer(self)
class FluxControlNetStateDictConverter:
def __init__(self):
pass
def from_diffusers(self, state_dict):
hash_value = hash_state_dict_keys(state_dict)
global_rename_dict = {
"context_embedder": "context_embedder",
"x_embedder": "x_embedder",
"time_text_embed.timestep_embedder.linear_1": "time_embedder.timestep_embedder.0",
"time_text_embed.timestep_embedder.linear_2": "time_embedder.timestep_embedder.2",
"time_text_embed.guidance_embedder.linear_1": "guidance_embedder.timestep_embedder.0",
"time_text_embed.guidance_embedder.linear_2": "guidance_embedder.timestep_embedder.2",
"time_text_embed.text_embedder.linear_1": "pooled_text_embedder.0",
"time_text_embed.text_embedder.linear_2": "pooled_text_embedder.2",
"norm_out.linear": "final_norm_out.linear",
"proj_out": "final_proj_out",
}
rename_dict = {
"proj_out": "proj_out",
"norm1.linear": "norm1_a.linear",
"norm1_context.linear": "norm1_b.linear",
"attn.to_q": "attn.a_to_q",
"attn.to_k": "attn.a_to_k",
"attn.to_v": "attn.a_to_v",
"attn.to_out.0": "attn.a_to_out",
"attn.add_q_proj": "attn.b_to_q",
"attn.add_k_proj": "attn.b_to_k",
"attn.add_v_proj": "attn.b_to_v",
"attn.to_add_out": "attn.b_to_out",
"ff.net.0.proj": "ff_a.0",
"ff.net.2": "ff_a.2",
"ff_context.net.0.proj": "ff_b.0",
"ff_context.net.2": "ff_b.2",
"attn.norm_q": "attn.norm_q_a",
"attn.norm_k": "attn.norm_k_a",
"attn.norm_added_q": "attn.norm_q_b",
"attn.norm_added_k": "attn.norm_k_b",
}
rename_dict_single = {
"attn.to_q": "a_to_q",
"attn.to_k": "a_to_k",
"attn.to_v": "a_to_v",
"attn.norm_q": "norm_q_a",
"attn.norm_k": "norm_k_a",
"norm.linear": "norm.linear",
"proj_mlp": "proj_in_besides_attn",
"proj_out": "proj_out",
}
state_dict_ = {}
for name, param in state_dict.items():
if name.endswith(".weight") or name.endswith(".bias"):
suffix = ".weight" if name.endswith(".weight") else ".bias"
prefix = name[:-len(suffix)]
if prefix in global_rename_dict:
state_dict_[global_rename_dict[prefix] + suffix] = param
elif prefix.startswith("transformer_blocks."):
names = prefix.split(".")
names[0] = "blocks"
middle = ".".join(names[2:])
if middle in rename_dict:
name_ = ".".join(names[:2] + [rename_dict[middle]] + [suffix[1:]])
state_dict_[name_] = param
elif prefix.startswith("single_transformer_blocks."):
names = prefix.split(".")
names[0] = "single_blocks"
middle = ".".join(names[2:])
if middle in rename_dict_single:
name_ = ".".join(names[:2] + [rename_dict_single[middle]] + [suffix[1:]])
state_dict_[name_] = param
else:
state_dict_[name] = param
else:
state_dict_[name] = param
for name in list(state_dict_.keys()):
if ".proj_in_besides_attn." in name:
name_ = name.replace(".proj_in_besides_attn.", ".to_qkv_mlp.")
param = torch.concat([
state_dict_[name.replace(".proj_in_besides_attn.", ".a_to_q.")],
state_dict_[name.replace(".proj_in_besides_attn.", ".a_to_k.")],
state_dict_[name.replace(".proj_in_besides_attn.", ".a_to_v.")],
state_dict_[name],
], dim=0)
state_dict_[name_] = param
state_dict_.pop(name.replace(".proj_in_besides_attn.", ".a_to_q."))
state_dict_.pop(name.replace(".proj_in_besides_attn.", ".a_to_k."))
state_dict_.pop(name.replace(".proj_in_besides_attn.", ".a_to_v."))
state_dict_.pop(name)
for name in list(state_dict_.keys()):
for component in ["a", "b"]:
if f".{component}_to_q." in name:
name_ = name.replace(f".{component}_to_q.", f".{component}_to_qkv.")
param = torch.concat([
state_dict_[name],
state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_k.")],
state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_v.")],
], dim=0)
state_dict_[name_] = param
state_dict_.pop(name)
state_dict_.pop(name.replace(f".{component}_to_q.", f".{component}_to_k."))
state_dict_.pop(name.replace(f".{component}_to_q.", f".{component}_to_v."))
if hash_value == "78d18b9101345ff695f312e7e62538c0":
extra_kwargs = {"num_mode": 10, "mode_dict": {"canny": 0, "tile": 1, "depth": 2, "blur": 3, "pose": 4, "gray": 5, "lq": 6}}
elif hash_value == "b001c89139b5f053c715fe772362dd2a":
extra_kwargs = {"num_single_blocks": 0}
elif hash_value == "52357cb26250681367488a8954c271e8":
extra_kwargs = {"num_joint_blocks": 6, "num_single_blocks": 0, "additional_input_dim": 4}
elif hash_value == "0cfd1740758423a2a854d67c136d1e8c":
extra_kwargs = {"num_joint_blocks": 4, "num_single_blocks": 1}
elif hash_value == "7f9583eb8ba86642abb9a21a4b2c9e16":
extra_kwargs = {"num_joint_blocks": 4, "num_single_blocks": 10}
elif hash_value == "43ad5aaa27dd4ee01b832ed16773fa52":
extra_kwargs = {"num_joint_blocks": 6, "num_single_blocks": 0}
else:
extra_kwargs = {}
return state_dict_, extra_kwargs
def from_civitai(self, state_dict):
return self.from_diffusers(state_dict)
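The q/k/v fusion pass in `from_diffusers` can be sketched on plain dicts, with list concatenation standing in for `torch.concat(dim=0)` (the helper name is ours):

```python
def fuse_qkv_keys(state_dict, component="a"):
    # Sketch of the fusion pass: three separate projection weights are
    # stacked row-wise into a single `_to_qkv` entry, and the originals
    # are removed from the state dict.
    fused = dict(state_dict)
    for name in list(fused.keys()):
        marker = f".{component}_to_q."
        if marker in name:
            qkv = (fused.pop(name)
                   + fused.pop(name.replace(marker, f".{component}_to_k."))
                   + fused.pop(name.replace(marker, f".{component}_to_v.")))
            fused[name.replace(marker, f".{component}_to_qkv.")] = qkv
    return fused

sd = {"blocks.0.attn.a_to_q.weight": [1],
      "blocks.0.attn.a_to_k.weight": [2],
      "blocks.0.attn.a_to_v.weight": [3]}
out = fuse_qkv_keys(sd)
assert out == {"blocks.0.attn.a_to_qkv.weight": [1, 2, 3]}
```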


@@ -0,0 +1,398 @@
import torch
from .general_modules import TimestepEmbeddings, AdaLayerNorm, RMSNorm
from einops import rearrange
def interact_with_ipadapter(hidden_states, q, ip_k, ip_v, scale=1.0):
batch_size, num_tokens = hidden_states.shape[0:2]
ip_hidden_states = torch.nn.functional.scaled_dot_product_attention(q, ip_k, ip_v)
ip_hidden_states = ip_hidden_states.transpose(1, 2).reshape(batch_size, num_tokens, -1)
hidden_states = hidden_states + scale * ip_hidden_states
return hidden_states
class RoPEEmbedding(torch.nn.Module):
def __init__(self, dim, theta, axes_dim):
super().__init__()
self.dim = dim
self.theta = theta
self.axes_dim = axes_dim
def rope(self, pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
assert dim % 2 == 0, "The dimension must be even."
scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
omega = 1.0 / (theta**scale)
batch_size, seq_length = pos.shape
out = torch.einsum("...n,d->...nd", pos, omega)
cos_out = torch.cos(out)
sin_out = torch.sin(out)
stacked_out = torch.stack([cos_out, -sin_out, sin_out, cos_out], dim=-1)
out = stacked_out.view(batch_size, -1, dim // 2, 2, 2)
return out.float()
def forward(self, ids):
n_axes = ids.shape[-1]
emb = torch.cat([self.rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)], dim=-3)
return emb.unsqueeze(1)
class FluxJointAttention(torch.nn.Module):
def __init__(self, dim_a, dim_b, num_heads, head_dim, only_out_a=False):
super().__init__()
self.num_heads = num_heads
self.head_dim = head_dim
self.only_out_a = only_out_a
self.a_to_qkv = torch.nn.Linear(dim_a, dim_a * 3)
self.b_to_qkv = torch.nn.Linear(dim_b, dim_b * 3)
self.norm_q_a = RMSNorm(head_dim, eps=1e-6)
self.norm_k_a = RMSNorm(head_dim, eps=1e-6)
self.norm_q_b = RMSNorm(head_dim, eps=1e-6)
self.norm_k_b = RMSNorm(head_dim, eps=1e-6)
self.a_to_out = torch.nn.Linear(dim_a, dim_a)
if not only_out_a:
self.b_to_out = torch.nn.Linear(dim_b, dim_b)
def apply_rope(self, xq, xk, freqs_cis):
xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
def forward(self, hidden_states_a, hidden_states_b, image_rotary_emb, attn_mask=None, ipadapter_kwargs_list=None):
batch_size = hidden_states_a.shape[0]
# Part A
qkv_a = self.a_to_qkv(hidden_states_a)
qkv_a = qkv_a.view(batch_size, -1, 3 * self.num_heads, self.head_dim).transpose(1, 2)
q_a, k_a, v_a = qkv_a.chunk(3, dim=1)
q_a, k_a = self.norm_q_a(q_a), self.norm_k_a(k_a)
# Part B
qkv_b = self.b_to_qkv(hidden_states_b)
qkv_b = qkv_b.view(batch_size, -1, 3 * self.num_heads, self.head_dim).transpose(1, 2)
q_b, k_b, v_b = qkv_b.chunk(3, dim=1)
q_b, k_b = self.norm_q_b(q_b), self.norm_k_b(k_b)
q = torch.concat([q_b, q_a], dim=2)
k = torch.concat([k_b, k_a], dim=2)
v = torch.concat([v_b, v_a], dim=2)
q, k = self.apply_rope(q, k, image_rotary_emb)
hidden_states = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)
hidden_states = hidden_states.to(q.dtype)
hidden_states_b, hidden_states_a = hidden_states[:, :hidden_states_b.shape[1]], hidden_states[:, hidden_states_b.shape[1]:]
if ipadapter_kwargs_list is not None:
hidden_states_a = interact_with_ipadapter(hidden_states_a, q_a, **ipadapter_kwargs_list)
hidden_states_a = self.a_to_out(hidden_states_a)
if self.only_out_a:
return hidden_states_a
else:
hidden_states_b = self.b_to_out(hidden_states_b)
return hidden_states_a, hidden_states_b
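Each (even, odd) channel pair in the `rope()`/`apply_rope()` pair above amounts to a 2D rotation by `pos * theta**(-2*i/dim)` for axis-frequency index `i`. A pure-Python sketch (hypothetical helper) showing the norm-preserving rotation:

```python
import math

def rope_rotate(x0, x1, pos, dim_index, dim, theta=10000.0):
    # One 2x2 rotation block from rope(): omega = theta**(-2*i/dim),
    # angle = pos * omega; apply_rope() multiplies by [[cos, -sin], [sin, cos]].
    omega = theta ** (-(2.0 * dim_index) / dim)
    angle = pos * omega
    c, s = math.cos(angle), math.sin(angle)
    return c * x0 - s * x1, s * x0 + c * x1

# Rotations preserve the norm of each channel pair, so attention logits
# depend only on relative positions.
y0, y1 = rope_rotate(3.0, 4.0, pos=7, dim_index=0, dim=16)
```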
class FluxJointTransformerBlock(torch.nn.Module):
def __init__(self, dim, num_attention_heads):
super().__init__()
self.norm1_a = AdaLayerNorm(dim)
self.norm1_b = AdaLayerNorm(dim)
self.attn = FluxJointAttention(dim, dim, num_attention_heads, dim // num_attention_heads)
self.norm2_a = torch.nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
self.ff_a = torch.nn.Sequential(
torch.nn.Linear(dim, dim*4),
torch.nn.GELU(approximate="tanh"),
torch.nn.Linear(dim*4, dim)
)
self.norm2_b = torch.nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
self.ff_b = torch.nn.Sequential(
torch.nn.Linear(dim, dim*4),
torch.nn.GELU(approximate="tanh"),
torch.nn.Linear(dim*4, dim)
)
def forward(self, hidden_states_a, hidden_states_b, temb, image_rotary_emb, attn_mask=None, ipadapter_kwargs_list=None):
norm_hidden_states_a, gate_msa_a, shift_mlp_a, scale_mlp_a, gate_mlp_a = self.norm1_a(hidden_states_a, emb=temb)
norm_hidden_states_b, gate_msa_b, shift_mlp_b, scale_mlp_b, gate_mlp_b = self.norm1_b(hidden_states_b, emb=temb)
# Attention
attn_output_a, attn_output_b = self.attn(norm_hidden_states_a, norm_hidden_states_b, image_rotary_emb, attn_mask, ipadapter_kwargs_list)
# Part A
hidden_states_a = hidden_states_a + gate_msa_a * attn_output_a
norm_hidden_states_a = self.norm2_a(hidden_states_a) * (1 + scale_mlp_a) + shift_mlp_a
hidden_states_a = hidden_states_a + gate_mlp_a * self.ff_a(norm_hidden_states_a)
# Part B
hidden_states_b = hidden_states_b + gate_msa_b * attn_output_b
norm_hidden_states_b = self.norm2_b(hidden_states_b) * (1 + scale_mlp_b) + shift_mlp_b
hidden_states_b = hidden_states_b + gate_mlp_b * self.ff_b(norm_hidden_states_b)
return hidden_states_a, hidden_states_b
class FluxSingleAttention(torch.nn.Module):
def __init__(self, dim_a, dim_b, num_heads, head_dim):
super().__init__()
self.num_heads = num_heads
self.head_dim = head_dim
self.a_to_qkv = torch.nn.Linear(dim_a, dim_a * 3)
self.norm_q_a = RMSNorm(head_dim, eps=1e-6)
self.norm_k_a = RMSNorm(head_dim, eps=1e-6)
def apply_rope(self, xq, xk, freqs_cis):
xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
def forward(self, hidden_states, image_rotary_emb):
batch_size = hidden_states.shape[0]
qkv_a = self.a_to_qkv(hidden_states)
qkv_a = qkv_a.view(batch_size, -1, 3 * self.num_heads, self.head_dim).transpose(1, 2)
q_a, k_a, v = qkv_a.chunk(3, dim=1)
q_a, k_a = self.norm_q_a(q_a), self.norm_k_a(k_a)
q, k = self.apply_rope(q_a, k_a, image_rotary_emb)
hidden_states = torch.nn.functional.scaled_dot_product_attention(q, k, v)
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)
hidden_states = hidden_states.to(q.dtype)
return hidden_states
class AdaLayerNormSingle(torch.nn.Module):
def __init__(self, dim):
super().__init__()
self.silu = torch.nn.SiLU()
self.linear = torch.nn.Linear(dim, 3 * dim, bias=True)
self.norm = torch.nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
def forward(self, x, emb):
emb = self.linear(self.silu(emb))
shift_msa, scale_msa, gate_msa = emb.chunk(3, dim=1)
x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
return x, gate_msa
class FluxSingleTransformerBlock(torch.nn.Module):
def __init__(self, dim, num_attention_heads):
super().__init__()
self.num_heads = num_attention_heads
self.head_dim = dim // num_attention_heads
self.dim = dim
self.norm = AdaLayerNormSingle(dim)
self.to_qkv_mlp = torch.nn.Linear(dim, dim * (3 + 4))
self.norm_q_a = RMSNorm(self.head_dim, eps=1e-6)
self.norm_k_a = RMSNorm(self.head_dim, eps=1e-6)
self.proj_out = torch.nn.Linear(dim * 5, dim)
def apply_rope(self, xq, xk, freqs_cis):
xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
def process_attention(self, hidden_states, image_rotary_emb, attn_mask=None, ipadapter_kwargs_list=None):
batch_size = hidden_states.shape[0]
qkv = hidden_states.view(batch_size, -1, 3 * self.num_heads, self.head_dim).transpose(1, 2)
q, k, v = qkv.chunk(3, dim=1)
q, k = self.norm_q_a(q), self.norm_k_a(k)
q, k = self.apply_rope(q, k, image_rotary_emb)
hidden_states = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)
hidden_states = hidden_states.to(q.dtype)
if ipadapter_kwargs_list is not None:
hidden_states = interact_with_ipadapter(hidden_states, q, **ipadapter_kwargs_list)
return hidden_states
def forward(self, hidden_states_a, hidden_states_b, temb, image_rotary_emb, attn_mask=None, ipadapter_kwargs_list=None):
residual = hidden_states_a
norm_hidden_states, gate = self.norm(hidden_states_a, emb=temb)
hidden_states_a = self.to_qkv_mlp(norm_hidden_states)
attn_output, mlp_hidden_states = hidden_states_a[:, :, :self.dim * 3], hidden_states_a[:, :, self.dim * 3:]
attn_output = self.process_attention(attn_output, image_rotary_emb, attn_mask, ipadapter_kwargs_list)
mlp_hidden_states = torch.nn.functional.gelu(mlp_hidden_states, approximate="tanh")
hidden_states_a = torch.cat([attn_output, mlp_hidden_states], dim=2)
hidden_states_a = gate.unsqueeze(1) * self.proj_out(hidden_states_a)
hidden_states_a = residual + hidden_states_a
return hidden_states_a, hidden_states_b
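The fused projection in the single block can be traced standalone; `dim` and the tensor sizes below are demo values, and `attn_out` is a stand-in for the real attention output.

```python
import torch

# Sketch of the fused projection above: one linear emits 3*dim (QKV) plus
# 4*dim (MLP hidden) in a single matmul; the halves are split, and after
# attention / tanh-GELU they are concatenated back into a 5*dim proj_out input.
dim = 8
to_qkv_mlp = torch.nn.Linear(dim, dim * (3 + 4))
proj_out = torch.nn.Linear(dim * 5, dim)
x = torch.randn(2, 5, dim)
fused = to_qkv_mlp(x)
qkv, mlp_hidden = fused[..., :dim * 3], fused[..., dim * 3:]
attn_out = torch.randn(2, 5, dim)  # stand-in for the attention output
out = proj_out(torch.cat(
    [attn_out, torch.nn.functional.gelu(mlp_hidden, approximate="tanh")], dim=2))
```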
class AdaLayerNormContinuous(torch.nn.Module):
def __init__(self, dim):
super().__init__()
self.silu = torch.nn.SiLU()
self.linear = torch.nn.Linear(dim, dim * 2, bias=True)
self.norm = torch.nn.LayerNorm(dim, eps=1e-6, elementwise_affine=False)
def forward(self, x, conditioning):
emb = self.linear(self.silu(conditioning))
shift, scale = torch.chunk(emb, 2, dim=1)
x = self.norm(x) * (1 + scale)[:, None] + shift[:, None]
return x
class FluxDiT(torch.nn.Module):
_repeated_blocks = ["FluxJointTransformerBlock", "FluxSingleTransformerBlock"]
def __init__(self, disable_guidance_embedder=False, input_dim=64, num_blocks=19):
super().__init__()
self.pos_embedder = RoPEEmbedding(3072, 10000, [16, 56, 56])
self.time_embedder = TimestepEmbeddings(256, 3072)
self.guidance_embedder = None if disable_guidance_embedder else TimestepEmbeddings(256, 3072)
self.pooled_text_embedder = torch.nn.Sequential(torch.nn.Linear(768, 3072), torch.nn.SiLU(), torch.nn.Linear(3072, 3072))
self.context_embedder = torch.nn.Linear(4096, 3072)
self.x_embedder = torch.nn.Linear(input_dim, 3072)
self.blocks = torch.nn.ModuleList([FluxJointTransformerBlock(3072, 24) for _ in range(num_blocks)])
self.single_blocks = torch.nn.ModuleList([FluxSingleTransformerBlock(3072, 24) for _ in range(38)])
self.final_norm_out = AdaLayerNormContinuous(3072)
self.final_proj_out = torch.nn.Linear(3072, 64)
self.input_dim = input_dim
def patchify(self, hidden_states):
hidden_states = rearrange(hidden_states, "B C (H P) (W Q) -> B (H W) (C P Q)", P=2, Q=2)
return hidden_states
def unpatchify(self, hidden_states, height, width):
hidden_states = rearrange(hidden_states, "B (H W) (C P Q) -> B C (H P) (W Q)", P=2, Q=2, H=height//2, W=width//2)
return hidden_states
def prepare_image_ids(self, latents):
batch_size, _, height, width = latents.shape
latent_image_ids = torch.zeros(height // 2, width // 2, 3)
latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
latent_image_ids = latent_image_ids[None, :].repeat(batch_size, 1, 1, 1)
latent_image_ids = latent_image_ids.reshape(
batch_size, latent_image_id_height * latent_image_id_width, latent_image_id_channels
)
latent_image_ids = latent_image_ids.to(device=latents.device, dtype=latents.dtype)
return latent_image_ids
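The position-id layout built by `prepare_image_ids` can be sketched standalone; the latent size below is a demo value.

```python
import torch

# Sketch of the position-id layout above: channel 0 is left at zero (text
# tokens use it), channels 1 and 2 carry the patch row and column indices.
height, width = 8, 6           # latent H/W; each position id covers a 2x2 patch
ids = torch.zeros(height // 2, width // 2, 3)
ids[..., 1] += torch.arange(height // 2)[:, None]
ids[..., 2] += torch.arange(width // 2)[None, :]
ids = ids.reshape(-1, 3)       # (num_patches, 3)
```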
def construct_mask(self, entity_masks, prompt_seq_len, image_seq_len):
N = len(entity_masks)
batch_size = entity_masks[0].shape[0]
total_seq_len = N * prompt_seq_len + image_seq_len
patched_masks = [self.patchify(entity_masks[i]) for i in range(N)]
attention_mask = torch.ones((batch_size, total_seq_len, total_seq_len), dtype=torch.bool).to(device=entity_masks[0].device)
image_start = N * prompt_seq_len
image_end = N * prompt_seq_len + image_seq_len
# prompt-image mask
for i in range(N):
prompt_start = i * prompt_seq_len
prompt_end = (i + 1) * prompt_seq_len
image_mask = torch.sum(patched_masks[i], dim=-1) > 0
image_mask = image_mask.unsqueeze(1).repeat(1, prompt_seq_len, 1)
# prompt update with image
attention_mask[:, prompt_start:prompt_end, image_start:image_end] = image_mask
# image update with prompt
attention_mask[:, image_start:image_end, prompt_start:prompt_end] = image_mask.transpose(1, 2)
# prompt-prompt mask
for i in range(N):
for j in range(N):
if i != j:
prompt_start_i = i * prompt_seq_len
prompt_end_i = (i + 1) * prompt_seq_len
prompt_start_j = j * prompt_seq_len
prompt_end_j = (j + 1) * prompt_seq_len
attention_mask[:, prompt_start_i:prompt_end_i, prompt_start_j:prompt_end_j] = False
attention_mask = attention_mask.float()
attention_mask[attention_mask == 0] = float('-inf')
attention_mask[attention_mask == 1] = 0
return attention_mask
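The boolean-to-additive conversion at the end of `construct_mask` can be illustrated standalone with a tiny mask.

```python
import torch

# Illustration of the additive-mask convention used above: allowed positions
# become 0 and blocked positions become -inf, so the mask can be added to
# attention logits before softmax.
allowed = torch.tensor([[True, False], [True, True]])
mask = allowed.float()
mask[mask == 0] = float("-inf")
mask[mask == 1] = 0.0
```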
def process_entity_masks(self, hidden_states, prompt_emb, entity_prompt_emb, entity_masks, text_ids, image_ids, repeat_dim):
max_masks = 0
attention_mask = None
prompt_embs = [prompt_emb]
if entity_masks is not None:
# entity_masks
batch_size, max_masks = entity_masks.shape[0], entity_masks.shape[1]
entity_masks = entity_masks.repeat(1, 1, repeat_dim, 1, 1)
entity_masks = [entity_masks[:, i] for i in range(max_masks)]
# global mask
global_mask = torch.ones_like(entity_masks[0]).to(device=hidden_states.device, dtype=hidden_states.dtype)
entity_masks = entity_masks + [global_mask] # append global to last
# attention mask
attention_mask = self.construct_mask(entity_masks, prompt_emb.shape[1], hidden_states.shape[1])
attention_mask = attention_mask.to(device=hidden_states.device, dtype=hidden_states.dtype)
attention_mask = attention_mask.unsqueeze(1)
# embeds: n_masks * b * seq * d
local_embs = [entity_prompt_emb[:, i] for i in range(max_masks)]
prompt_embs = local_embs + prompt_embs # append global to last
prompt_embs = [self.context_embedder(prompt_emb) for prompt_emb in prompt_embs]
prompt_emb = torch.cat(prompt_embs, dim=1)
# positional embedding
text_ids = torch.cat([text_ids] * (max_masks + 1), dim=1)
image_rotary_emb = self.pos_embedder(torch.cat((text_ids, image_ids), dim=1))
return prompt_emb, image_rotary_emb, attention_mask
def forward(
self,
hidden_states,
timestep, prompt_emb, pooled_prompt_emb, guidance, text_ids, image_ids=None,
tiled=False, tile_size=128, tile_stride=64, entity_prompt_emb=None, entity_masks=None,
use_gradient_checkpointing=False,
**kwargs
):
# (Deprecated) The real forward is in `pipelines.flux_image`.
return None


@@ -0,0 +1,129 @@
import math
import torch
import torch.nn as nn
# FFN
def FeedForward(dim, mult=4):
inner_dim = int(dim * mult)
return nn.Sequential(
nn.LayerNorm(dim),
nn.Linear(dim, inner_dim, bias=False),
nn.GELU(),
nn.Linear(inner_dim, dim, bias=False),
)
def reshape_tensor(x, heads):
bs, length, width = x.shape
# (bs, length, width) --> (bs, length, n_heads, dim_per_head)
x = x.view(bs, length, heads, -1)
# (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
x = x.transpose(1, 2)
# shape is unchanged: (bs, n_heads, length, dim_per_head); reshape gives a clean layout
x = x.reshape(bs, heads, length, -1)
return x
class PerceiverAttention(nn.Module):
def __init__(self, *, dim, dim_head=64, heads=8):
super().__init__()
self.scale = dim_head**-0.5
self.dim_head = dim_head
self.heads = heads
inner_dim = dim_head * heads
self.norm1 = nn.LayerNorm(dim)
self.norm2 = nn.LayerNorm(dim)
self.to_q = nn.Linear(dim, inner_dim, bias=False)
self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
self.to_out = nn.Linear(inner_dim, dim, bias=False)
def forward(self, x, latents):
"""
Args:
x (torch.Tensor): image features
shape (b, n1, D)
latents (torch.Tensor): latent features
shape (b, n2, D)
"""
x = self.norm1(x)
latents = self.norm2(latents)
b, l, _ = latents.shape
q = self.to_q(latents)
kv_input = torch.cat((x, latents), dim=-2)
k, v = self.to_kv(kv_input).chunk(2, dim=-1)
q = reshape_tensor(q, self.heads)
k = reshape_tensor(k, self.heads)
v = reshape_tensor(v, self.heads)
# attention
scale = 1 / math.sqrt(math.sqrt(self.dim_head))
weight = (q * scale) @ (k * scale).transpose(-2, -1) # More stable with f16 than dividing afterwards
weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
out = weight @ v
out = out.permute(0, 2, 1, 3).reshape(b, l, -1)
return self.to_out(out)
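The attention above scales q and k each by `dim_head**-0.25` before the matmul; a standalone check that this matches the usual `q @ k^T / sqrt(d)` (and, as the comment notes, is friendlier to fp16 since neither operand is divided by a large number). Shapes below are demo values.

```python
import math
import torch

# Check that splitting the 1/sqrt(d) scale across q and k reproduces the
# standard scaled dot-product logits.
d = 64
q = torch.randn(2, 8, 10, d)
k = torch.randn(2, 8, 12, d)
half_scale = 1 / math.sqrt(math.sqrt(d))
logits_split = (q * half_scale) @ (k * half_scale).transpose(-2, -1)
logits_plain = (q @ k.transpose(-2, -1)) / math.sqrt(d)
```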
class InfiniteYouImageProjector(nn.Module):
def __init__(
self,
dim=1280,
depth=4,
dim_head=64,
heads=20,
num_queries=8,
embedding_dim=512,
output_dim=4096,
ff_mult=4,
):
super().__init__()
self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5)
self.proj_in = nn.Linear(embedding_dim, dim)
self.proj_out = nn.Linear(dim, output_dim)
self.norm_out = nn.LayerNorm(output_dim)
self.layers = nn.ModuleList([])
for _ in range(depth):
self.layers.append(
nn.ModuleList([
PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
FeedForward(dim=dim, mult=ff_mult),
]))
def forward(self, x):
latents = self.latents.repeat(x.size(0), 1, 1)
latents = latents.to(dtype=x.dtype, device=x.device)
x = self.proj_in(x)
for attn, ff in self.layers:
latents = attn(x, latents) + latents
latents = ff(latents) + latents
latents = self.proj_out(latents)
return self.norm_out(latents)
@staticmethod
def state_dict_converter():
return FluxInfiniteYouImageProjectorStateDictConverter()
class FluxInfiniteYouImageProjectorStateDictConverter:
def __init__(self):
pass
def from_diffusers(self, state_dict):
return state_dict['image_proj']


@@ -0,0 +1,110 @@
from .general_modules import RMSNorm
from transformers import SiglipVisionModel, SiglipVisionConfig
import torch
class SiglipVisionModelSO400M(SiglipVisionModel):
def __init__(self):
config = SiglipVisionConfig(
hidden_size=1152,
image_size=384,
intermediate_size=4304,
model_type="siglip_vision_model",
num_attention_heads=16,
num_hidden_layers=27,
patch_size=14,
architectures=["SiglipModel"],
initializer_factor=1.0,
torch_dtype="float32",
transformers_version="4.37.0.dev0"
)
super().__init__(config)
class MLPProjModel(torch.nn.Module):
def __init__(self, cross_attention_dim=768, id_embeddings_dim=512, num_tokens=4):
super().__init__()
self.cross_attention_dim = cross_attention_dim
self.num_tokens = num_tokens
self.proj = torch.nn.Sequential(
torch.nn.Linear(id_embeddings_dim, id_embeddings_dim*2),
torch.nn.GELU(),
torch.nn.Linear(id_embeddings_dim*2, cross_attention_dim*num_tokens),
)
self.norm = torch.nn.LayerNorm(cross_attention_dim)
def forward(self, id_embeds):
x = self.proj(id_embeds)
x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
x = self.norm(x)
return x
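The projection in `MLPProjModel` can be sketched standalone: one identity embedding is expanded into `num_tokens` cross-attention tokens. All dimensions below are demo values, not the model's real sizes.

```python
import torch

# Sketch of the id-embedding projection above: a single embedding per sample
# is mapped to num_tokens tokens of width cross_dim via one MLP and a reshape.
num_tokens, cross_dim, id_dim = 4, 16, 8
proj = torch.nn.Sequential(
    torch.nn.Linear(id_dim, id_dim * 2),
    torch.nn.GELU(),
    torch.nn.Linear(id_dim * 2, cross_dim * num_tokens),
)
id_embeds = torch.randn(3, id_dim)
tokens = proj(id_embeds).reshape(-1, num_tokens, cross_dim)
```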
class IpAdapterModule(torch.nn.Module):
def __init__(self, num_attention_heads, attention_head_dim, input_dim):
super().__init__()
self.num_heads = num_attention_heads
self.head_dim = attention_head_dim
output_dim = num_attention_heads * attention_head_dim
self.to_k_ip = torch.nn.Linear(input_dim, output_dim, bias=False)
self.to_v_ip = torch.nn.Linear(input_dim, output_dim, bias=False)
self.norm_added_k = RMSNorm(attention_head_dim, eps=1e-5, elementwise_affine=False)
def forward(self, hidden_states):
batch_size = hidden_states.shape[0]
# ip_k
ip_k = self.to_k_ip(hidden_states)
ip_k = ip_k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
ip_k = self.norm_added_k(ip_k)
# ip_v
ip_v = self.to_v_ip(hidden_states)
ip_v = ip_v.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
return ip_k, ip_v
class FluxIpAdapter(torch.nn.Module):
def __init__(self, num_attention_heads=24, attention_head_dim=128, cross_attention_dim=4096, num_tokens=128, num_blocks=57):
super().__init__()
self.ipadapter_modules = torch.nn.ModuleList([IpAdapterModule(num_attention_heads, attention_head_dim, cross_attention_dim) for _ in range(num_blocks)])
self.image_proj = MLPProjModel(cross_attention_dim=cross_attention_dim, id_embeddings_dim=1152, num_tokens=num_tokens)
self.set_adapter()
def set_adapter(self):
self.call_block_id = {i:i for i in range(len(self.ipadapter_modules))}
def forward(self, hidden_states, scale=1.0):
hidden_states = self.image_proj(hidden_states)
hidden_states = hidden_states.view(1, -1, hidden_states.shape[-1])
ip_kv_dict = {}
for block_id in self.call_block_id:
ipadapter_id = self.call_block_id[block_id]
ip_k, ip_v = self.ipadapter_modules[ipadapter_id](hidden_states)
ip_kv_dict[block_id] = {
"ip_k": ip_k,
"ip_v": ip_v,
"scale": scale
}
return ip_kv_dict
@staticmethod
def state_dict_converter():
return FluxIpAdapterStateDictConverter()
class FluxIpAdapterStateDictConverter:
def __init__(self):
pass
def from_diffusers(self, state_dict):
state_dict_ = {}
for name in state_dict["ip_adapter"]:
name_ = 'ipadapter_modules.' + name
state_dict_[name_] = state_dict["ip_adapter"][name]
for name in state_dict["image_proj"]:
name_ = "image_proj." + name
state_dict_[name_] = state_dict["image_proj"][name]
return state_dict_
def from_civitai(self, state_dict):
return self.from_diffusers(state_dict)


@@ -0,0 +1,521 @@
import torch
from einops import rearrange
def low_version_attention(query, key, value, attn_bias=None):
scale = 1 / query.shape[-1] ** 0.5
query = query * scale
attn = torch.matmul(query, key.transpose(-2, -1))
if attn_bias is not None:
attn = attn + attn_bias
attn = attn.softmax(-1)
return attn @ value
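The manual attention fallback above should agree with PyTorch's fused SDPA; a quick standalone check with small random tensors (the local `naive_attention` mirrors the function above).

```python
import torch

# Reimplementation of the fallback above, checked against the fused kernel.
def naive_attention(query, key, value, attn_bias=None):
    scale = 1 / query.shape[-1] ** 0.5
    attn = torch.matmul(query * scale, key.transpose(-2, -1))
    if attn_bias is not None:
        attn = attn + attn_bias
    return attn.softmax(-1) @ value

q = torch.randn(2, 4, 6, 8)
k = torch.randn(2, 4, 6, 8)
v = torch.randn(2, 4, 6, 8)
ref = torch.nn.functional.scaled_dot_product_attention(q, k, v)
out = naive_attention(q, k, v)
```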
class Attention(torch.nn.Module):
def __init__(self, q_dim, num_heads, head_dim, kv_dim=None, bias_q=False, bias_kv=False, bias_out=False):
super().__init__()
dim_inner = head_dim * num_heads
kv_dim = kv_dim if kv_dim is not None else q_dim
self.num_heads = num_heads
self.head_dim = head_dim
self.to_q = torch.nn.Linear(q_dim, dim_inner, bias=bias_q)
self.to_k = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv)
self.to_v = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv)
self.to_out = torch.nn.Linear(dim_inner, q_dim, bias=bias_out)
def interact_with_ipadapter(self, hidden_states, q, ip_k, ip_v, scale=1.0):
batch_size = q.shape[0]
ip_k = ip_k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
ip_v = ip_v.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
ip_hidden_states = torch.nn.functional.scaled_dot_product_attention(q, ip_k, ip_v)
hidden_states = hidden_states + scale * ip_hidden_states
return hidden_states
def torch_forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None, ipadapter_kwargs=None, qkv_preprocessor=None):
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
batch_size = encoder_hidden_states.shape[0]
q = self.to_q(hidden_states)
k = self.to_k(encoder_hidden_states)
v = self.to_v(encoder_hidden_states)
q = q.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
k = k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
v = v.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
if qkv_preprocessor is not None:
q, k, v = qkv_preprocessor(q, k, v)
hidden_states = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
if ipadapter_kwargs is not None:
hidden_states = self.interact_with_ipadapter(hidden_states, q, **ipadapter_kwargs)
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)
hidden_states = hidden_states.to(q.dtype)
hidden_states = self.to_out(hidden_states)
return hidden_states
def xformers_forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None):
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
q = self.to_q(hidden_states)
k = self.to_k(encoder_hidden_states)
v = self.to_v(encoder_hidden_states)
q = rearrange(q, "b f (n d) -> (b n) f d", n=self.num_heads)
k = rearrange(k, "b f (n d) -> (b n) f d", n=self.num_heads)
v = rearrange(v, "b f (n d) -> (b n) f d", n=self.num_heads)
if attn_mask is not None:
hidden_states = low_version_attention(q, k, v, attn_bias=attn_mask)
else:
import xformers.ops as xops
hidden_states = xops.memory_efficient_attention(q, k, v)
hidden_states = rearrange(hidden_states, "(b n) f d -> b f (n d)", n=self.num_heads)
hidden_states = hidden_states.to(q.dtype)
hidden_states = self.to_out(hidden_states)
return hidden_states
def forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None, ipadapter_kwargs=None, qkv_preprocessor=None):
return self.torch_forward(hidden_states, encoder_hidden_states=encoder_hidden_states, attn_mask=attn_mask, ipadapter_kwargs=ipadapter_kwargs, qkv_preprocessor=qkv_preprocessor)
class CLIPEncoderLayer(torch.nn.Module):
def __init__(self, embed_dim, intermediate_size, num_heads=12, head_dim=64, use_quick_gelu=True):
super().__init__()
self.attn = Attention(q_dim=embed_dim, num_heads=num_heads, head_dim=head_dim, bias_q=True, bias_kv=True, bias_out=True)
self.layer_norm1 = torch.nn.LayerNorm(embed_dim)
self.layer_norm2 = torch.nn.LayerNorm(embed_dim)
self.fc1 = torch.nn.Linear(embed_dim, intermediate_size)
self.fc2 = torch.nn.Linear(intermediate_size, embed_dim)
self.use_quick_gelu = use_quick_gelu
def quickGELU(self, x):
return x * torch.sigmoid(1.702 * x)
def forward(self, hidden_states, attn_mask=None):
residual = hidden_states
hidden_states = self.layer_norm1(hidden_states)
hidden_states = self.attn(hidden_states, attn_mask=attn_mask)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.layer_norm2(hidden_states)
hidden_states = self.fc1(hidden_states)
if self.use_quick_gelu:
hidden_states = self.quickGELU(hidden_states)
else:
hidden_states = torch.nn.functional.gelu(hidden_states)
hidden_states = self.fc2(hidden_states)
hidden_states = residual + hidden_states
return hidden_states
class SDTextEncoder(torch.nn.Module):
def __init__(self, embed_dim=768, vocab_size=49408, max_position_embeddings=77, num_encoder_layers=12, encoder_intermediate_size=3072):
super().__init__()
# token_embedding
self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
# position_embeds (This is a fixed tensor)
self.position_embeds = torch.nn.Parameter(torch.zeros(1, max_position_embeddings, embed_dim))
# encoders
self.encoders = torch.nn.ModuleList([CLIPEncoderLayer(embed_dim, encoder_intermediate_size) for _ in range(num_encoder_layers)])
# attn_mask
self.attn_mask = self.attention_mask(max_position_embeddings)
# final_layer_norm
self.final_layer_norm = torch.nn.LayerNorm(embed_dim)
def attention_mask(self, length):
mask = torch.empty(length, length)
mask.fill_(float("-inf"))
mask.triu_(1)
return mask
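The causal mask built above can be sketched standalone: `-inf` strictly above the diagonal blocks attention to future tokens, while 0 on and below the diagonal allows it.

```python
import torch

# Build the same causal mask as above with a tiny demo length.
length = 4
mask = torch.empty(length, length)
mask.fill_(float("-inf"))
mask.triu_(1)  # zero on and below the diagonal, -inf strictly above
```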
def forward(self, input_ids, clip_skip=1):
embeds = self.token_embedding(input_ids) + self.position_embeds
attn_mask = self.attn_mask.to(device=embeds.device, dtype=embeds.dtype)
for encoder_id, encoder in enumerate(self.encoders):
embeds = encoder(embeds, attn_mask=attn_mask)
if encoder_id + clip_skip == len(self.encoders):
break
embeds = self.final_layer_norm(embeds)
return embeds
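`clip_skip` in the forward above stops the encoder early: layer `encoder_id` runs, then the loop breaks once `encoder_id + clip_skip` equals the layer count, so `clip_skip=1` runs all layers and `clip_skip=2` skips the last one. A standalone trace of that stopping rule with a demo layer count.

```python
# Count how many layers the loop above would execute for a given clip_skip.
def layers_run(num_layers, clip_skip):
    run = 0
    for encoder_id in range(num_layers):
        run += 1
        if encoder_id + clip_skip == num_layers:
            break
    return run
```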
@staticmethod
def state_dict_converter():
return SDTextEncoderStateDictConverter()
class SDTextEncoderStateDictConverter:
def __init__(self):
pass
def from_diffusers(self, state_dict):
rename_dict = {
"text_model.embeddings.token_embedding.weight": "token_embedding.weight",
"text_model.embeddings.position_embedding.weight": "position_embeds",
"text_model.final_layer_norm.weight": "final_layer_norm.weight",
"text_model.final_layer_norm.bias": "final_layer_norm.bias"
}
attn_rename_dict = {
"self_attn.q_proj": "attn.to_q",
"self_attn.k_proj": "attn.to_k",
"self_attn.v_proj": "attn.to_v",
"self_attn.out_proj": "attn.to_out",
"layer_norm1": "layer_norm1",
"layer_norm2": "layer_norm2",
"mlp.fc1": "fc1",
"mlp.fc2": "fc2",
}
state_dict_ = {}
for name in state_dict:
if name in rename_dict:
param = state_dict[name]
if name == "text_model.embeddings.position_embedding.weight":
param = param.reshape((1, param.shape[0], param.shape[1]))
state_dict_[rename_dict[name]] = param
elif name.startswith("text_model.encoder.layers."):
param = state_dict[name]
names = name.split(".")
layer_id, layer_type, tail = names[3], ".".join(names[4:-1]), names[-1]
name_ = ".".join(["encoders", layer_id, attn_rename_dict[layer_type], tail])
state_dict_[name_] = param
return state_dict_
def from_civitai(self, state_dict):
rename_dict = {
"cond_stage_model.transformer.text_model.embeddings.token_embedding.weight": "token_embedding.weight",
"cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.bias": "encoders.0.layer_norm1.bias",
"cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.weight": "encoders.0.layer_norm1.weight",
"cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.bias": "encoders.0.layer_norm2.bias",
"cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.weight": "encoders.0.layer_norm2.weight",
"cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.bias": "encoders.0.fc1.bias",
"cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.weight": "encoders.0.fc1.weight",
"cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.bias": "encoders.0.fc2.bias",
"cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.weight": "encoders.0.fc2.weight",
"cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.bias": "encoders.0.attn.to_k.bias",
"cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.weight": "encoders.0.attn.to_k.weight",
"cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.bias": "encoders.0.attn.to_out.bias",
"cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.weight": "encoders.0.attn.to_out.weight",
"cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.bias": "encoders.0.attn.to_q.bias",
"cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.weight": "encoders.0.attn.to_q.weight",
"cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.bias": "encoders.0.attn.to_v.bias",
"cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.weight": "encoders.0.attn.to_v.weight",
"cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.bias": "encoders.1.layer_norm1.bias",
"cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.weight": "encoders.1.layer_norm1.weight",
"cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.bias": "encoders.1.layer_norm2.bias",
"cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.weight": "encoders.1.layer_norm2.weight",
"cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.bias": "encoders.1.fc1.bias",
"cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.weight": "encoders.1.fc1.weight",
"cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.bias": "encoders.1.fc2.bias",
"cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.weight": "encoders.1.fc2.weight",
"cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.bias": "encoders.1.attn.to_k.bias",
"cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.weight": "encoders.1.attn.to_k.weight",
"cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.bias": "encoders.1.attn.to_out.bias",
"cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.weight": "encoders.1.attn.to_out.weight",
"cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.bias": "encoders.1.attn.to_q.bias",
"cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.weight": "encoders.1.attn.to_q.weight",
"cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.bias": "encoders.1.attn.to_v.bias",
"cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.weight": "encoders.1.attn.to_v.weight",
"cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.bias": "encoders.10.layer_norm1.bias",
"cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.weight": "encoders.10.layer_norm1.weight",
"cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.bias": "encoders.10.layer_norm2.bias",
"cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.weight": "encoders.10.layer_norm2.weight",
"cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.bias": "encoders.10.fc1.bias",
"cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.weight": "encoders.10.fc1.weight",
"cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.bias": "encoders.10.fc2.bias",
"cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.weight": "encoders.10.fc2.weight",
"cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.bias": "encoders.10.attn.to_k.bias",
"cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.weight": "encoders.10.attn.to_k.weight",
"cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.bias": "encoders.10.attn.to_out.bias",
"cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.weight": "encoders.10.attn.to_out.weight",
"cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.bias": "encoders.10.attn.to_q.bias",
"cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.weight": "encoders.10.attn.to_q.weight",
"cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.bias": "encoders.10.attn.to_v.bias",
"cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.weight": "encoders.10.attn.to_v.weight",
"cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.bias": "encoders.11.layer_norm1.bias",
"cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.weight": "encoders.11.layer_norm1.weight",
"cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.bias": "encoders.11.layer_norm2.bias",
"cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.weight": "encoders.11.layer_norm2.weight",
"cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.bias": "encoders.11.fc1.bias",
"cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.weight": "encoders.11.fc1.weight",
"cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.bias": "encoders.11.fc2.bias",
"cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.weight": "encoders.11.fc2.weight",
"cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.bias": "encoders.11.attn.to_k.bias",
"cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.weight": "encoders.11.attn.to_k.weight",
"cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.bias": "encoders.11.attn.to_out.bias",
"cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.weight": "encoders.11.attn.to_out.weight",
"cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.bias": "encoders.11.attn.to_q.bias",
"cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.weight": "encoders.11.attn.to_q.weight",
"cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.bias": "encoders.11.attn.to_v.bias",
"cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.weight": "encoders.11.attn.to_v.weight",
"cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.bias": "encoders.2.layer_norm1.bias",
"cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.weight": "encoders.2.layer_norm1.weight",
"cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.bias": "encoders.2.layer_norm2.bias",
"cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.weight": "encoders.2.layer_norm2.weight",
"cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.bias": "encoders.2.fc1.bias",
"cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.weight": "encoders.2.fc1.weight",
"cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.bias": "encoders.2.fc2.bias",
"cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.weight": "encoders.2.fc2.weight",
"cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.bias": "encoders.2.attn.to_k.bias",
"cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.weight": "encoders.2.attn.to_k.weight",
"cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.bias": "encoders.2.attn.to_out.bias",
"cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.weight": "encoders.2.attn.to_out.weight",
"cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.bias": "encoders.2.attn.to_q.bias",
"cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.weight": "encoders.2.attn.to_q.weight",
"cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.bias": "encoders.2.attn.to_v.bias",
"cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.weight": "encoders.2.attn.to_v.weight",
"cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.bias": "encoders.3.layer_norm1.bias",
"cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.weight": "encoders.3.layer_norm1.weight",
"cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.bias": "encoders.3.layer_norm2.bias",
"cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.weight": "encoders.3.layer_norm2.weight",
"cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.bias": "encoders.3.fc1.bias",
"cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.weight": "encoders.3.fc1.weight",
"cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.bias": "encoders.3.fc2.bias",
"cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.weight": "encoders.3.fc2.weight",
"cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.bias": "encoders.3.attn.to_k.bias",
"cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.weight": "encoders.3.attn.to_k.weight",
"cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.bias": "encoders.3.attn.to_out.bias",
"cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.weight": "encoders.3.attn.to_out.weight",
"cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.bias": "encoders.3.attn.to_q.bias",
"cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.weight": "encoders.3.attn.to_q.weight",
"cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.bias": "encoders.3.attn.to_v.bias",
"cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.weight": "encoders.3.attn.to_v.weight",
"cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.bias": "encoders.4.layer_norm1.bias",
"cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.weight": "encoders.4.layer_norm1.weight",
"cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.bias": "encoders.4.layer_norm2.bias",
"cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.weight": "encoders.4.layer_norm2.weight",
"cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.bias": "encoders.4.fc1.bias",
"cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.weight": "encoders.4.fc1.weight",
"cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.bias": "encoders.4.fc2.bias",
"cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.weight": "encoders.4.fc2.weight",
"cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.bias": "encoders.4.attn.to_k.bias",
"cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.weight": "encoders.4.attn.to_k.weight",
"cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.bias": "encoders.4.attn.to_out.bias",
"cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.weight": "encoders.4.attn.to_out.weight",
"cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.bias": "encoders.4.attn.to_q.bias",
"cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.weight": "encoders.4.attn.to_q.weight",
"cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.bias": "encoders.4.attn.to_v.bias",
"cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.weight": "encoders.4.attn.to_v.weight",
"cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.bias": "encoders.5.layer_norm1.bias",
"cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.weight": "encoders.5.layer_norm1.weight",
"cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.bias": "encoders.5.layer_norm2.bias",
"cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.weight": "encoders.5.layer_norm2.weight",
"cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.bias": "encoders.5.fc1.bias",
"cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.weight": "encoders.5.fc1.weight",
"cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.bias": "encoders.5.fc2.bias",
"cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.weight": "encoders.5.fc2.weight",
"cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.bias": "encoders.5.attn.to_k.bias",
"cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.weight": "encoders.5.attn.to_k.weight",
"cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.bias": "encoders.5.attn.to_out.bias",
"cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.weight": "encoders.5.attn.to_out.weight",
"cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.bias": "encoders.5.attn.to_q.bias",
"cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.weight": "encoders.5.attn.to_q.weight",
"cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.bias": "encoders.5.attn.to_v.bias",
"cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.weight": "encoders.5.attn.to_v.weight",
"cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.bias": "encoders.6.layer_norm1.bias",
"cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.weight": "encoders.6.layer_norm1.weight",
"cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.bias": "encoders.6.layer_norm2.bias",
"cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.weight": "encoders.6.layer_norm2.weight",
"cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.bias": "encoders.6.fc1.bias",
"cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.weight": "encoders.6.fc1.weight",
"cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.bias": "encoders.6.fc2.bias",
"cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.weight": "encoders.6.fc2.weight",
"cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.bias": "encoders.6.attn.to_k.bias",
"cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.weight": "encoders.6.attn.to_k.weight",
"cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.bias": "encoders.6.attn.to_out.bias",
"cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.weight": "encoders.6.attn.to_out.weight",
"cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.bias": "encoders.6.attn.to_q.bias",
"cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.weight": "encoders.6.attn.to_q.weight",
"cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.bias": "encoders.6.attn.to_v.bias",
"cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.weight": "encoders.6.attn.to_v.weight",
"cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.bias": "encoders.7.layer_norm1.bias",
"cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.weight": "encoders.7.layer_norm1.weight",
"cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.bias": "encoders.7.layer_norm2.bias",
"cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.weight": "encoders.7.layer_norm2.weight",
"cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.bias": "encoders.7.fc1.bias",
"cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.weight": "encoders.7.fc1.weight",
"cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.bias": "encoders.7.fc2.bias",
"cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.weight": "encoders.7.fc2.weight",
"cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.bias": "encoders.7.attn.to_k.bias",
"cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.weight": "encoders.7.attn.to_k.weight",
"cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.bias": "encoders.7.attn.to_out.bias",
"cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.weight": "encoders.7.attn.to_out.weight",
"cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.bias": "encoders.7.attn.to_q.bias",
"cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.weight": "encoders.7.attn.to_q.weight",
"cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.bias": "encoders.7.attn.to_v.bias",
"cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.weight": "encoders.7.attn.to_v.weight",
"cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.bias": "encoders.8.layer_norm1.bias",
"cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.weight": "encoders.8.layer_norm1.weight",
"cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.bias": "encoders.8.layer_norm2.bias",
"cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.weight": "encoders.8.layer_norm2.weight",
"cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.bias": "encoders.8.fc1.bias",
"cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.weight": "encoders.8.fc1.weight",
"cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.bias": "encoders.8.fc2.bias",
"cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.weight": "encoders.8.fc2.weight",
"cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.bias": "encoders.8.attn.to_k.bias",
"cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.weight": "encoders.8.attn.to_k.weight",
"cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.bias": "encoders.8.attn.to_out.bias",
"cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.weight": "encoders.8.attn.to_out.weight",
"cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.bias": "encoders.8.attn.to_q.bias",
"cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.weight": "encoders.8.attn.to_q.weight",
"cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.bias": "encoders.8.attn.to_v.bias",
"cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.weight": "encoders.8.attn.to_v.weight",
"cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.bias": "encoders.9.layer_norm1.bias",
"cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.weight": "encoders.9.layer_norm1.weight",
"cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.bias": "encoders.9.layer_norm2.bias",
"cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.weight": "encoders.9.layer_norm2.weight",
"cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.bias": "encoders.9.fc1.bias",
"cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.weight": "encoders.9.fc1.weight",
"cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.bias": "encoders.9.fc2.bias",
"cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.weight": "encoders.9.fc2.weight",
"cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.bias": "encoders.9.attn.to_k.bias",
"cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.weight": "encoders.9.attn.to_k.weight",
"cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.bias": "encoders.9.attn.to_out.bias",
"cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.weight": "encoders.9.attn.to_out.weight",
"cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.bias": "encoders.9.attn.to_q.bias",
"cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.weight": "encoders.9.attn.to_q.weight",
"cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.bias": "encoders.9.attn.to_v.bias",
"cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.weight": "encoders.9.attn.to_v.weight",
"cond_stage_model.transformer.text_model.final_layer_norm.bias": "final_layer_norm.bias",
"cond_stage_model.transformer.text_model.final_layer_norm.weight": "final_layer_norm.weight",
"cond_stage_model.transformer.text_model.embeddings.position_embedding.weight": "position_embeds"
}
state_dict_ = {}
for name in state_dict:
if name in rename_dict:
param = state_dict[name]
if name == "cond_stage_model.transformer.text_model.embeddings.position_embedding.weight":
param = param.reshape((1, param.shape[0], param.shape[1]))
state_dict_[rename_dict[name]] = param
return state_dict_
class LoRALayerBlock(torch.nn.Module):
def __init__(self, L, dim_in, dim_out):
super().__init__()
self.x = torch.nn.Parameter(torch.randn(1, L, dim_in))
self.layer_norm = torch.nn.LayerNorm(dim_out)
def forward(self, lora_A, lora_B):
x = self.x @ lora_A.T @ lora_B.T
x = self.layer_norm(x)
return x
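A minimal standalone sketch of what `LoRALayerBlock` computes, using assumed toy shapes rather than the real layer dimensions: a learned query `x` probes one LoRA pair via `x @ A^T @ B^T`, then layer-normalizes, yielding `L` embedding tokens for that layer.

```python
import torch

torch.manual_seed(0)
# Toy shapes (assumptions for illustration only, not the real dims).
L_tokens, dim_in, dim_out, rank = 2, 16, 8, 4
x = torch.randn(1, L_tokens, dim_in)   # stands in for the learned query parameter
lora_A = torch.randn(rank, dim_in)     # LoRA down-projection
lora_B = torch.randn(dim_out, rank)    # LoRA up-projection
# Same computation as LoRALayerBlock.forward: probe, then normalize.
out = torch.nn.functional.layer_norm(x @ lora_A.T @ lora_B.T, (dim_out,))
ok = out.shape == (1, L_tokens, dim_out)
```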
class LoRAEmbedder(torch.nn.Module):
def __init__(self, lora_patterns=None, L=1, out_dim=2048):
super().__init__()
if lora_patterns is None:
lora_patterns = self.default_lora_patterns()
model_dict = {}
for lora_pattern in lora_patterns:
name, dim = lora_pattern["name"], lora_pattern["dim"]
model_dict[name.replace(".", "___")] = LoRALayerBlock(L, dim[0], dim[1])
self.model_dict = torch.nn.ModuleDict(model_dict)
proj_dict = {}
for lora_pattern in lora_patterns:
layer_type, dim = lora_pattern["type"], lora_pattern["dim"]
if layer_type not in proj_dict:
proj_dict[layer_type.replace(".", "___")] = torch.nn.Linear(dim[1], out_dim)
self.proj_dict = torch.nn.ModuleDict(proj_dict)
self.lora_patterns = lora_patterns
def default_lora_patterns(self):
lora_patterns = []
lora_dict = {
"attn.a_to_qkv": (3072, 9216), "attn.a_to_out": (3072, 3072), "ff_a.0": (3072, 12288), "ff_a.2": (12288, 3072), "norm1_a.linear": (3072, 18432),
"attn.b_to_qkv": (3072, 9216), "attn.b_to_out": (3072, 3072), "ff_b.0": (3072, 12288), "ff_b.2": (12288, 3072), "norm1_b.linear": (3072, 18432),
}
for i in range(19):
for suffix in lora_dict:
lora_patterns.append({
"name": f"blocks.{i}.{suffix}",
"dim": lora_dict[suffix],
"type": suffix,
})
lora_dict = {"to_qkv_mlp": (3072, 21504), "proj_out": (15360, 3072), "norm.linear": (3072, 9216)}
for i in range(38):
for suffix in lora_dict:
lora_patterns.append({
"name": f"single_blocks.{i}.{suffix}",
"dim": lora_dict[suffix],
"type": suffix,
})
return lora_patterns
def forward(self, lora):
lora_emb = []
for lora_pattern in self.lora_patterns:
name, layer_type = lora_pattern["name"], lora_pattern["type"]
lora_A = lora[name + ".lora_A.weight"]
lora_B = lora[name + ".lora_B.weight"]
lora_out = self.model_dict[name.replace(".", "___")](lora_A, lora_B)
lora_out = self.proj_dict[layer_type.replace(".", "___")](lora_out)
lora_emb.append(lora_out)
lora_emb = torch.concat(lora_emb, dim=1)
return lora_emb
class FluxLoRAEncoder(torch.nn.Module):
def __init__(self, embed_dim=4096, encoder_intermediate_size=8192, num_encoder_layers=1, num_embeds_per_lora=16, num_special_embeds=1):
super().__init__()
self.num_embeds_per_lora = num_embeds_per_lora
# embedder
self.embedder = LoRAEmbedder(L=num_embeds_per_lora, out_dim=embed_dim)
# encoders
self.encoders = torch.nn.ModuleList([CLIPEncoderLayer(embed_dim, encoder_intermediate_size, num_heads=32, head_dim=128) for _ in range(num_encoder_layers)])
# special embedding
self.special_embeds = torch.nn.Parameter(torch.randn(1, num_special_embeds, embed_dim))
self.num_special_embeds = num_special_embeds
# final layer
self.final_layer_norm = torch.nn.LayerNorm(embed_dim)
self.final_linear = torch.nn.Linear(embed_dim, embed_dim)
def forward(self, lora):
lora_embeds = self.embedder(lora)
special_embeds = self.special_embeds.to(dtype=lora_embeds.dtype, device=lora_embeds.device)
embeds = torch.concat([special_embeds, lora_embeds], dim=1)
for encoder_id, encoder in enumerate(self.encoders):
embeds = encoder(embeds)
embeds = embeds[:, :self.num_special_embeds]
embeds = self.final_layer_norm(embeds)
embeds = self.final_linear(embeds)
return embeds
@staticmethod
def state_dict_converter():
return FluxLoRAEncoderStateDictConverter()
class FluxLoRAEncoderStateDictConverter:
def from_civitai(self, state_dict):
return state_dict


@@ -0,0 +1,306 @@
import torch, math
from ..core.loader import load_state_dict
from typing import Union
class GeneralLoRALoader:
def __init__(self, device="cpu", torch_dtype=torch.float32):
self.device = device
self.torch_dtype = torch_dtype
def get_name_dict(self, lora_state_dict):
lora_name_dict = {}
for key in lora_state_dict:
if ".lora_B." not in key:
continue
keys = key.split(".")
if len(keys) > keys.index("lora_B") + 2:
keys.pop(keys.index("lora_B") + 1)
keys.pop(keys.index("lora_B"))
if keys[0] == "diffusion_model":
keys.pop(0)
keys.pop(-1)
target_name = ".".join(keys)
lora_name_dict[target_name] = (key, key.replace(".lora_B.", ".lora_A."))
return lora_name_dict
def load(self, model: torch.nn.Module, state_dict_lora, alpha=1.0):
updated_num = 0
lora_name_dict = self.get_name_dict(state_dict_lora)
for name, module in model.named_modules():
if name in lora_name_dict:
weight_up = state_dict_lora[lora_name_dict[name][0]].to(device=self.device, dtype=self.torch_dtype)
weight_down = state_dict_lora[lora_name_dict[name][1]].to(device=self.device, dtype=self.torch_dtype)
if len(weight_up.shape) == 4:
weight_up = weight_up.squeeze(3).squeeze(2)
weight_down = weight_down.squeeze(3).squeeze(2)
weight_lora = alpha * torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3)
else:
weight_lora = alpha * torch.mm(weight_up, weight_down)
state_dict = module.state_dict()
state_dict["weight"] = state_dict["weight"].to(device=self.device, dtype=self.torch_dtype) + weight_lora
module.load_state_dict(state_dict)
updated_num += 1
print(f"{updated_num} tensors are updated by LoRA.")
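A standalone sketch of the merge rule `GeneralLoRALoader.load` applies per layer, on assumed toy shapes: folding the low-rank update into the weight, `W' = W + alpha * (B @ A)`, gives the same output as running the base layer plus the two-matrix LoRA path.

```python
import torch

torch.manual_seed(0)
# Toy layer and rank (assumptions for illustration only).
linear = torch.nn.Linear(4, 4, bias=False)
rank, alpha = 2, 1.0
lora_A = torch.randn(rank, 4)   # "lora_down": in_features -> rank
lora_B = torch.randn(4, rank)   # "lora_up":   rank -> out_features
base = linear.weight.detach().clone()
with torch.no_grad():
    linear.weight += alpha * (lora_B @ lora_A)   # fold the update in

x = torch.randn(1, 4)
merged = linear(x)                                    # patched layer
two_path = x @ base.T + alpha * (x @ lora_A.T @ lora_B.T)  # base + LoRA path
ok = torch.allclose(merged, two_path, atol=1e-5)
```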
class FluxLoRALoader(GeneralLoRALoader):
def __init__(self, device="cpu", torch_dtype=torch.float32):
super().__init__(device=device, torch_dtype=torch_dtype)
self.diffusers_rename_dict = {
"transformer.single_transformer_blocks.blockid.attn.to_k.lora_A.weight":"single_blocks.blockid.a_to_k.lora_A.default.weight",
"transformer.single_transformer_blocks.blockid.attn.to_k.lora_B.weight":"single_blocks.blockid.a_to_k.lora_B.default.weight",
"transformer.single_transformer_blocks.blockid.attn.to_q.lora_A.weight":"single_blocks.blockid.a_to_q.lora_A.default.weight",
"transformer.single_transformer_blocks.blockid.attn.to_q.lora_B.weight":"single_blocks.blockid.a_to_q.lora_B.default.weight",
"transformer.single_transformer_blocks.blockid.attn.to_v.lora_A.weight":"single_blocks.blockid.a_to_v.lora_A.default.weight",
"transformer.single_transformer_blocks.blockid.attn.to_v.lora_B.weight":"single_blocks.blockid.a_to_v.lora_B.default.weight",
"transformer.single_transformer_blocks.blockid.norm.linear.lora_A.weight":"single_blocks.blockid.norm.linear.lora_A.default.weight",
"transformer.single_transformer_blocks.blockid.norm.linear.lora_B.weight":"single_blocks.blockid.norm.linear.lora_B.default.weight",
"transformer.single_transformer_blocks.blockid.proj_mlp.lora_A.weight":"single_blocks.blockid.proj_in_besides_attn.lora_A.default.weight",
"transformer.single_transformer_blocks.blockid.proj_mlp.lora_B.weight":"single_blocks.blockid.proj_in_besides_attn.lora_B.default.weight",
"transformer.single_transformer_blocks.blockid.proj_out.lora_A.weight":"single_blocks.blockid.proj_out.lora_A.default.weight",
"transformer.single_transformer_blocks.blockid.proj_out.lora_B.weight":"single_blocks.blockid.proj_out.lora_B.default.weight",
"transformer.transformer_blocks.blockid.attn.add_k_proj.lora_A.weight":"blocks.blockid.attn.b_to_k.lora_A.default.weight",
"transformer.transformer_blocks.blockid.attn.add_k_proj.lora_B.weight":"blocks.blockid.attn.b_to_k.lora_B.default.weight",
"transformer.transformer_blocks.blockid.attn.add_q_proj.lora_A.weight":"blocks.blockid.attn.b_to_q.lora_A.default.weight",
"transformer.transformer_blocks.blockid.attn.add_q_proj.lora_B.weight":"blocks.blockid.attn.b_to_q.lora_B.default.weight",
"transformer.transformer_blocks.blockid.attn.add_v_proj.lora_A.weight":"blocks.blockid.attn.b_to_v.lora_A.default.weight",
"transformer.transformer_blocks.blockid.attn.add_v_proj.lora_B.weight":"blocks.blockid.attn.b_to_v.lora_B.default.weight",
"transformer.transformer_blocks.blockid.attn.to_add_out.lora_A.weight":"blocks.blockid.attn.b_to_out.lora_A.default.weight",
"transformer.transformer_blocks.blockid.attn.to_add_out.lora_B.weight":"blocks.blockid.attn.b_to_out.lora_B.default.weight",
"transformer.transformer_blocks.blockid.attn.to_k.lora_A.weight":"blocks.blockid.attn.a_to_k.lora_A.default.weight",
"transformer.transformer_blocks.blockid.attn.to_k.lora_B.weight":"blocks.blockid.attn.a_to_k.lora_B.default.weight",
"transformer.transformer_blocks.blockid.attn.to_out.0.lora_A.weight":"blocks.blockid.attn.a_to_out.lora_A.default.weight",
"transformer.transformer_blocks.blockid.attn.to_out.0.lora_B.weight":"blocks.blockid.attn.a_to_out.lora_B.default.weight",
"transformer.transformer_blocks.blockid.attn.to_q.lora_A.weight":"blocks.blockid.attn.a_to_q.lora_A.default.weight",
"transformer.transformer_blocks.blockid.attn.to_q.lora_B.weight":"blocks.blockid.attn.a_to_q.lora_B.default.weight",
"transformer.transformer_blocks.blockid.attn.to_v.lora_A.weight":"blocks.blockid.attn.a_to_v.lora_A.default.weight",
"transformer.transformer_blocks.blockid.attn.to_v.lora_B.weight":"blocks.blockid.attn.a_to_v.lora_B.default.weight",
"transformer.transformer_blocks.blockid.ff.net.0.proj.lora_A.weight":"blocks.blockid.ff_a.0.lora_A.default.weight",
"transformer.transformer_blocks.blockid.ff.net.0.proj.lora_B.weight":"blocks.blockid.ff_a.0.lora_B.default.weight",
"transformer.transformer_blocks.blockid.ff.net.2.lora_A.weight":"blocks.blockid.ff_a.2.lora_A.default.weight",
"transformer.transformer_blocks.blockid.ff.net.2.lora_B.weight":"blocks.blockid.ff_a.2.lora_B.default.weight",
"transformer.transformer_blocks.blockid.ff_context.net.0.proj.lora_A.weight":"blocks.blockid.ff_b.0.lora_A.default.weight",
"transformer.transformer_blocks.blockid.ff_context.net.0.proj.lora_B.weight":"blocks.blockid.ff_b.0.lora_B.default.weight",
"transformer.transformer_blocks.blockid.ff_context.net.2.lora_A.weight":"blocks.blockid.ff_b.2.lora_A.default.weight",
"transformer.transformer_blocks.blockid.ff_context.net.2.lora_B.weight":"blocks.blockid.ff_b.2.lora_B.default.weight",
"transformer.transformer_blocks.blockid.norm1.linear.lora_A.weight":"blocks.blockid.norm1_a.linear.lora_A.default.weight",
"transformer.transformer_blocks.blockid.norm1.linear.lora_B.weight":"blocks.blockid.norm1_a.linear.lora_B.default.weight",
"transformer.transformer_blocks.blockid.norm1_context.linear.lora_A.weight":"blocks.blockid.norm1_b.linear.lora_A.default.weight",
"transformer.transformer_blocks.blockid.norm1_context.linear.lora_B.weight":"blocks.blockid.norm1_b.linear.lora_B.default.weight",
}
self.civitai_rename_dict = {
"lora_unet_double_blocks_blockid_img_mod_lin.lora_down.weight": "blocks.blockid.norm1_a.linear.lora_A.default.weight",
"lora_unet_double_blocks_blockid_img_mod_lin.lora_up.weight": "blocks.blockid.norm1_a.linear.lora_B.default.weight",
"lora_unet_double_blocks_blockid_txt_mod_lin.lora_down.weight": "blocks.blockid.norm1_b.linear.lora_A.default.weight",
"lora_unet_double_blocks_blockid_txt_mod_lin.lora_up.weight": "blocks.blockid.norm1_b.linear.lora_B.default.weight",
"lora_unet_double_blocks_blockid_img_attn_qkv.lora_down.weight": "blocks.blockid.attn.a_to_qkv.lora_A.default.weight",
"lora_unet_double_blocks_blockid_img_attn_qkv.lora_up.weight": "blocks.blockid.attn.a_to_qkv.lora_B.default.weight",
"lora_unet_double_blocks_blockid_txt_attn_qkv.lora_down.weight": "blocks.blockid.attn.b_to_qkv.lora_A.default.weight",
"lora_unet_double_blocks_blockid_txt_attn_qkv.lora_up.weight": "blocks.blockid.attn.b_to_qkv.lora_B.default.weight",
"lora_unet_double_blocks_blockid_img_attn_proj.lora_down.weight": "blocks.blockid.attn.a_to_out.lora_A.default.weight",
"lora_unet_double_blocks_blockid_img_attn_proj.lora_up.weight": "blocks.blockid.attn.a_to_out.lora_B.default.weight",
"lora_unet_double_blocks_blockid_txt_attn_proj.lora_down.weight": "blocks.blockid.attn.b_to_out.lora_A.default.weight",
"lora_unet_double_blocks_blockid_txt_attn_proj.lora_up.weight": "blocks.blockid.attn.b_to_out.lora_B.default.weight",
"lora_unet_double_blocks_blockid_img_mlp_0.lora_down.weight": "blocks.blockid.ff_a.0.lora_A.default.weight",
"lora_unet_double_blocks_blockid_img_mlp_0.lora_up.weight": "blocks.blockid.ff_a.0.lora_B.default.weight",
"lora_unet_double_blocks_blockid_img_mlp_2.lora_down.weight": "blocks.blockid.ff_a.2.lora_A.default.weight",
"lora_unet_double_blocks_blockid_img_mlp_2.lora_up.weight": "blocks.blockid.ff_a.2.lora_B.default.weight",
"lora_unet_double_blocks_blockid_txt_mlp_0.lora_down.weight": "blocks.blockid.ff_b.0.lora_A.default.weight",
"lora_unet_double_blocks_blockid_txt_mlp_0.lora_up.weight": "blocks.blockid.ff_b.0.lora_B.default.weight",
"lora_unet_double_blocks_blockid_txt_mlp_2.lora_down.weight": "blocks.blockid.ff_b.2.lora_A.default.weight",
"lora_unet_double_blocks_blockid_txt_mlp_2.lora_up.weight": "blocks.blockid.ff_b.2.lora_B.default.weight",
"lora_unet_single_blocks_blockid_modulation_lin.lora_down.weight": "single_blocks.blockid.norm.linear.lora_A.default.weight",
"lora_unet_single_blocks_blockid_modulation_lin.lora_up.weight": "single_blocks.blockid.norm.linear.lora_B.default.weight",
"lora_unet_single_blocks_blockid_linear1.lora_down.weight": "single_blocks.blockid.to_qkv_mlp.lora_A.default.weight",
"lora_unet_single_blocks_blockid_linear1.lora_up.weight": "single_blocks.blockid.to_qkv_mlp.lora_B.default.weight",
"lora_unet_single_blocks_blockid_linear2.lora_down.weight": "single_blocks.blockid.proj_out.lora_A.default.weight",
"lora_unet_single_blocks_blockid_linear2.lora_up.weight": "single_blocks.blockid.proj_out.lora_B.default.weight",
}
def load(self, model: torch.nn.Module, state_dict_lora, alpha=1.0):
super().load(model, state_dict_lora, alpha)
def convert_state_dict(self,state_dict):
def guess_block_id(name,model_resource):
if model_resource == 'civitai':
names = name.split("_")
for i in names:
if i.isdigit():
return i, name.replace(f"_{i}_", "_blockid_")
if model_resource == 'diffusers':
names = name.split(".")
for i in names:
if i.isdigit():
return i, name.replace(f"transformer_blocks.{i}.", "transformer_blocks.blockid.")
return None, None
def guess_resource(state_dict):
for k in state_dict:
if "lora_unet_" in k:
return 'civitai'
elif k.startswith("transformer."):
return 'diffusers'
return None
model_resource = guess_resource(state_dict)
if model_resource is None:
return state_dict
rename_dict = self.diffusers_rename_dict if model_resource == 'diffusers' else self.civitai_rename_dict
def guess_alpha(state_dict):
for name, param in state_dict.items():
if ".alpha" in name:
for suffix in [".lora_down.weight", ".lora_A.weight"]:
name_ = name.replace(".alpha", suffix)
if name_ in state_dict:
lora_alpha = param.item() / state_dict[name_].shape[0]
lora_alpha = math.sqrt(lora_alpha)
return lora_alpha
return 1
alpha = guess_alpha(state_dict)
state_dict_ = {}
for name, param in state_dict.items():
block_id, source_name = guess_block_id(name,model_resource)
if alpha != 1:
param *= alpha
if source_name in rename_dict:
target_name = rename_dict[source_name]
target_name = target_name.replace(".blockid.", f".{block_id}.")
state_dict_[target_name] = param
else:
state_dict_[name] = param
if model_resource == 'diffusers':
for name in list(state_dict_.keys()):
if "single_blocks." in name and ".a_to_q." in name:
mlp = state_dict_.get(name.replace(".a_to_q.", ".proj_in_besides_attn."), None)
if mlp is None:
dim = 4
if 'lora_A' in name:
dim = 1
mlp = torch.zeros(dim * state_dict_[name].shape[0],
*state_dict_[name].shape[1:],
dtype=state_dict_[name].dtype)
else:
state_dict_.pop(name.replace(".a_to_q.", ".proj_in_besides_attn."))
if 'lora_A' in name:
param = torch.concat([
state_dict_.pop(name),
state_dict_.pop(name.replace(".a_to_q.", ".a_to_k.")),
state_dict_.pop(name.replace(".a_to_q.", ".a_to_v.")),
mlp,
], dim=0)
elif 'lora_B' in name:
d, r = state_dict_[name].shape
param = torch.zeros((3*d+mlp.shape[0], 3*r+mlp.shape[1]), dtype=state_dict_[name].dtype, device=state_dict_[name].device)
param[:d, :r] = state_dict_.pop(name)
param[d:2*d, r:2*r] = state_dict_.pop(name.replace(".a_to_q.", ".a_to_k."))
param[2*d:3*d, 2*r:3*r] = state_dict_.pop(name.replace(".a_to_q.", ".a_to_v."))
param[3*d:, 3*r:] = mlp
else:
param = torch.concat([
state_dict_.pop(name),
state_dict_.pop(name.replace(".a_to_q.", ".a_to_k.")),
state_dict_.pop(name.replace(".a_to_q.", ".a_to_v.")),
mlp,
], dim=0)
name_ = name.replace(".a_to_q.", ".to_qkv_mlp.")
state_dict_[name_] = param
for name in list(state_dict_.keys()):
for component in ["a", "b"]:
if f".{component}_to_q." in name:
name_ = name.replace(f".{component}_to_q.", f".{component}_to_qkv.")
if 'lora_A' in name:
param = torch.concat([
state_dict_[name],
state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_k.")],
state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_v.")],
], dim=0)
elif 'lora_B' in name:
origin = state_dict_[name]
d, r = origin.shape
param = torch.zeros((3*d, 3*r), dtype=origin.dtype, device=origin.device)
param[:d, :r] = origin
param[d:2*d, r:2*r] = state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_k.")]
param[2*d:3*d, 2*r:3*r] = state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_v.")]
else:
param = torch.concat([
state_dict_[name],
state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_k.")],
state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_v.")],
], dim=0)
state_dict_[name_] = param
state_dict_.pop(name)
state_dict_.pop(name.replace(f".{component}_to_q.", f".{component}_to_k."))
state_dict_.pop(name.replace(f".{component}_to_q.", f".{component}_to_v."))
return state_dict_
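The fusion rule above can be checked standalone on assumed toy sizes: stacking the three `lora_A` matrices along dim 0 while placing the three `lora_B` matrices on a block diagonal reproduces the concatenated q/k/v outputs, which is why `lora_B` cannot simply be concatenated.

```python
import torch

torch.manual_seed(0)
# Toy sizes (assumptions for illustration only).
d, r, features = 4, 2, 6
A = [torch.randn(r, features) for _ in range(3)]   # lora_A for q, k, v
B = [torch.randn(d, r) for _ in range(3)]          # lora_B for q, k, v

x = torch.randn(1, features)
separate = torch.concat([x @ A[i].T @ B[i].T for i in range(3)], dim=1)

A_fused = torch.concat(A, dim=0)                   # (3r, features): plain stack
B_fused = torch.zeros(3 * d, 3 * r)
for i in range(3):                                 # block diagonal placement
    B_fused[i * d:(i + 1) * d, i * r:(i + 1) * r] = B[i]
fused = x @ A_fused.T @ B_fused.T

ok = torch.allclose(separate, fused, atol=1e-5)
```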
class LoraMerger(torch.nn.Module):
def __init__(self, dim):
super().__init__()
self.weight_base = torch.nn.Parameter(torch.randn((dim,)))
self.weight_lora = torch.nn.Parameter(torch.randn((dim,)))
self.weight_cross = torch.nn.Parameter(torch.randn((dim,)))
self.weight_out = torch.nn.Parameter(torch.ones((dim,)))
self.bias = torch.nn.Parameter(torch.randn((dim,)))
self.activation = torch.nn.Sigmoid()
self.norm_base = torch.nn.LayerNorm(dim, eps=1e-5)
self.norm_lora = torch.nn.LayerNorm(dim, eps=1e-5)
def forward(self, base_output, lora_outputs):
norm_base_output = self.norm_base(base_output)
norm_lora_outputs = self.norm_lora(lora_outputs)
gate = self.activation(
norm_base_output * self.weight_base \
+ norm_lora_outputs * self.weight_lora \
+ norm_base_output * norm_lora_outputs * self.weight_cross + self.bias
)
output = base_output + (self.weight_out * gate * lora_outputs).sum(dim=0)
return output
class FluxLoraPatcher(torch.nn.Module):
def __init__(self, lora_patterns=None):
super().__init__()
if lora_patterns is None:
lora_patterns = self.default_lora_patterns()
model_dict = {}
for lora_pattern in lora_patterns:
name, dim = lora_pattern["name"], lora_pattern["dim"]
model_dict[name.replace(".", "___")] = LoraMerger(dim)
self.model_dict = torch.nn.ModuleDict(model_dict)
def default_lora_patterns(self):
lora_patterns = []
lora_dict = {
"attn.a_to_qkv": 9216, "attn.a_to_out": 3072, "ff_a.0": 12288, "ff_a.2": 3072, "norm1_a.linear": 18432,
"attn.b_to_qkv": 9216, "attn.b_to_out": 3072, "ff_b.0": 12288, "ff_b.2": 3072, "norm1_b.linear": 18432,
}
for i in range(19):
for suffix in lora_dict:
lora_patterns.append({
"name": f"blocks.{i}.{suffix}",
"dim": lora_dict[suffix]
})
lora_dict = {"to_qkv_mlp": 21504, "proj_out": 3072, "norm.linear": 9216}
for i in range(38):
for suffix in lora_dict:
lora_patterns.append({
"name": f"single_blocks.{i}.{suffix}",
"dim": lora_dict[suffix]
})
return lora_patterns
def forward(self, base_output, lora_outputs, name):
return self.model_dict[name.replace(".", "___")](base_output, lora_outputs)
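A standalone sketch of `LoraMerger`'s gated sum, with assumed toy shapes and untrained parameters: several LoRA branch outputs (stacked on dim 0) are gated against the base output and summed back onto it.

```python
import torch

torch.manual_seed(0)
# Toy shapes (assumptions for illustration only).
dim, num_loras, seq = 8, 2, 3
base = torch.randn(seq, dim)
loras = torch.randn(num_loras, seq, dim)           # one slice per LoRA branch
w_b, w_l, w_c = torch.randn(dim), torch.randn(dim), torch.randn(dim)
w_out, bias = torch.ones(dim), torch.randn(dim)
# gate = sigmoid(w_b*nb + w_l*nl + w_c*nb*nl + bias), as in LoraMerger.forward
nb = torch.nn.functional.layer_norm(base, (dim,))
nl = torch.nn.functional.layer_norm(loras, (dim,))
gate = torch.sigmoid(nb * w_b + nl * w_l + nb * nl * w_c + bias)
out = base + (w_out * gate * loras).sum(dim=0)     # sum over the LoRA axis
ok = out.shape == (seq, dim)
```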


@@ -0,0 +1,112 @@
import torch
class Attention(torch.nn.Module):
def __init__(self, q_dim, num_heads, head_dim, kv_dim=None, bias_q=False, bias_kv=False, bias_out=False):
super().__init__()
dim_inner = head_dim * num_heads
kv_dim = kv_dim if kv_dim is not None else q_dim
self.num_heads = num_heads
self.head_dim = head_dim
self.to_q = torch.nn.Linear(q_dim, dim_inner, bias=bias_q)
self.to_k = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv)
self.to_v = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv)
self.to_out = torch.nn.Linear(dim_inner, q_dim, bias=bias_out)
def forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None):
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
batch_size = encoder_hidden_states.shape[0]
q = self.to_q(hidden_states)
k = self.to_k(encoder_hidden_states)
v = self.to_v(encoder_hidden_states)
q = q.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
k = k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
v = v.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
hidden_states = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)
hidden_states = hidden_states.to(q.dtype)
hidden_states = self.to_out(hidden_states)
return hidden_states
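The head split/merge in `Attention.forward` can be sketched standalone with assumed toy shapes: project to `(B, L, H*D)`, reshape into heads, attend, and merge the heads back.

```python
import torch

torch.manual_seed(0)
# Toy shapes (assumptions for illustration only).
batch, seq, num_heads, head_dim = 2, 5, 4, 8
dim = num_heads * head_dim
q = torch.randn(batch, seq, dim)
k = torch.randn(batch, seq, dim)
v = torch.randn(batch, seq, dim)

def split(t):
    # (B, L, H*D) -> (B, H, L, D), same reshape as Attention.forward
    return t.view(batch, -1, num_heads, head_dim).transpose(1, 2)

out = torch.nn.functional.scaled_dot_product_attention(split(q), split(k), split(v))
out = out.transpose(1, 2).reshape(batch, -1, dim)  # merge heads back
ok = out.shape == (batch, seq, dim)
```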
class CLIPEncoderLayer(torch.nn.Module):
def __init__(self, embed_dim, intermediate_size, num_heads=12, head_dim=64, use_quick_gelu=True):
super().__init__()
self.attn = Attention(q_dim=embed_dim, num_heads=num_heads, head_dim=head_dim, bias_q=True, bias_kv=True, bias_out=True)
self.layer_norm1 = torch.nn.LayerNorm(embed_dim)
self.layer_norm2 = torch.nn.LayerNorm(embed_dim)
self.fc1 = torch.nn.Linear(embed_dim, intermediate_size)
self.fc2 = torch.nn.Linear(intermediate_size, embed_dim)
self.use_quick_gelu = use_quick_gelu
def quickGELU(self, x):
return x * torch.sigmoid(1.702 * x)
def forward(self, hidden_states, attn_mask=None):
residual = hidden_states
hidden_states = self.layer_norm1(hidden_states)
hidden_states = self.attn(hidden_states, attn_mask=attn_mask)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.layer_norm2(hidden_states)
hidden_states = self.fc1(hidden_states)
if self.use_quick_gelu:
hidden_states = self.quickGELU(hidden_states)
else:
hidden_states = torch.nn.functional.gelu(hidden_states)
hidden_states = self.fc2(hidden_states)
hidden_states = residual + hidden_states
return hidden_states
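`CLIPEncoderLayer` uses the "quick GELU" approximation `x * sigmoid(1.702 * x)` rather than the exact GELU. A standalone sketch comparing the two (helper names `quick_gelu` and `exact_gelu` are illustrative):

```python
import math

def quick_gelu(x):
    # QuickGELU: x * sigmoid(1.702 * x), the approximation used by CLIP
    return x / (1.0 + math.exp(-1.702 * x))

def exact_gelu(x):
    # Exact GELU: x * Phi(x), with Phi the standard normal CDF
    return x * 0.5 * (1.0 + math.erf(x / math.sqrt(2.0)))

for x in (-2.0, -0.5, 0.0, 0.5, 2.0):
    print(f"x={x:+.1f}  quick={quick_gelu(x):+.4f}  exact={exact_gelu(x):+.4f}")
```

The two stay within roughly 0.02 of each other over typical activation ranges, which is why the approximation is a safe drop-in.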
class FluxTextEncoderClip(torch.nn.Module):
def __init__(self, embed_dim=768, vocab_size=49408, max_position_embeddings=77, num_encoder_layers=12, encoder_intermediate_size=3072):
super().__init__()
# token_embedding
self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
# position_embeds (This is a fixed tensor)
self.position_embeds = torch.nn.Parameter(torch.zeros(1, max_position_embeddings, embed_dim))
# encoders
self.encoders = torch.nn.ModuleList([CLIPEncoderLayer(embed_dim, encoder_intermediate_size) for _ in range(num_encoder_layers)])
# attn_mask
self.attn_mask = self.attention_mask(max_position_embeddings)
# final_layer_norm
self.final_layer_norm = torch.nn.LayerNorm(embed_dim)
def attention_mask(self, length):
mask = torch.empty(length, length)
mask.fill_(float("-inf"))
mask.triu_(1)
return mask
def forward(self, input_ids, clip_skip=2, extra_mask=None):
embeds = self.token_embedding(input_ids)
embeds = embeds + self.position_embeds.to(dtype=embeds.dtype, device=input_ids.device)
attn_mask = self.attn_mask.to(device=embeds.device, dtype=embeds.dtype)
if extra_mask is not None:
attn_mask = attn_mask.clone()  # avoid mutating the cached mask in-place across calls
attn_mask[:, extra_mask[0]==0] = float("-inf")
for encoder_id, encoder in enumerate(self.encoders):
embeds = encoder(embeds, attn_mask=attn_mask)
if encoder_id + clip_skip == len(self.encoders):
hidden_states = embeds
embeds = self.final_layer_norm(embeds)
pooled_embeds = embeds[torch.arange(embeds.shape[0]), input_ids.to(dtype=torch.int).argmax(dim=-1)]  # pool at the EOS token (the highest token id in CLIP's vocabulary)
return pooled_embeds, hidden_states
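The causal mask built by `attention_mask` can be inspected directly; a short standalone sketch at length 4 for readability:

```python
import torch

# Entries above the diagonal are -inf, so token i can only attend to tokens 0..i.
length = 4
mask = torch.empty(length, length)
mask.fill_(float("-inf"))
mask.triu_(1)
print(mask)
```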

View File

@@ -0,0 +1,43 @@
import torch
from transformers import T5EncoderModel, T5Config
class FluxTextEncoderT5(T5EncoderModel):
def __init__(self):
config = T5Config(**{
"architectures": [
"T5EncoderModel"
],
"classifier_dropout": 0.0,
"d_ff": 10240,
"d_kv": 64,
"d_model": 4096,
"decoder_start_token_id": 0,
"dense_act_fn": "gelu_new",
"dropout_rate": 0.1,
"dtype": "bfloat16",
"eos_token_id": 1,
"feed_forward_proj": "gated-gelu",
"initializer_factor": 1.0,
"is_encoder_decoder": True,
"is_gated_act": True,
"layer_norm_epsilon": 1e-06,
"model_type": "t5",
"num_decoder_layers": 24,
"num_heads": 64,
"num_layers": 24,
"output_past": True,
"pad_token_id": 0,
"relative_attention_max_distance": 128,
"relative_attention_num_buckets": 32,
"tie_word_embeddings": False,
"transformers_version": "4.57.1",
"use_cache": True,
"vocab_size": 32128
})
super().__init__(config)
def forward(self, input_ids):
outputs = super().forward(input_ids=input_ids)
prompt_emb = outputs.last_hidden_state
return prompt_emb

View File

@@ -0,0 +1,451 @@
import torch
from einops import rearrange, repeat
class TileWorker:
def __init__(self):
pass
def mask(self, height, width, border_width):
# Create a mask with shape (height, width).
# The centre area is filled with 1, and the border ramps linearly with values in (0, 1].
x = torch.arange(height).repeat(width, 1).T
y = torch.arange(width).repeat(height, 1)
mask = torch.stack([x + 1, height - x, y + 1, width - y]).min(dim=0).values
mask = (mask / border_width).clip(0, 1)
return mask
def tile(self, model_input, tile_size, tile_stride, tile_device, tile_dtype):
# Convert a tensor (b, c, h, w) to (b, c, tile_size, tile_size, tile_num)
batch_size, channel, _, _ = model_input.shape
model_input = model_input.to(device=tile_device, dtype=tile_dtype)
unfold_operator = torch.nn.Unfold(
kernel_size=(tile_size, tile_size),
stride=(tile_stride, tile_stride)
)
model_input = unfold_operator(model_input)
model_input = model_input.view((batch_size, channel, tile_size, tile_size, -1))
return model_input
def tiled_inference(self, forward_fn, model_input, tile_batch_size, inference_device, inference_dtype, tile_device, tile_dtype):
# Call y=forward_fn(x) for each tile
tile_num = model_input.shape[-1]
model_output_stack = []
for tile_id in range(0, tile_num, tile_batch_size):
# process input
tile_id_ = min(tile_id + tile_batch_size, tile_num)
x = model_input[:, :, :, :, tile_id: tile_id_]
x = x.to(device=inference_device, dtype=inference_dtype)
x = rearrange(x, "b c h w n -> (n b) c h w")
# process output
y = forward_fn(x)
y = rearrange(y, "(n b) c h w -> b c h w n", n=tile_id_-tile_id)
y = y.to(device=tile_device, dtype=tile_dtype)
model_output_stack.append(y)
model_output = torch.concat(model_output_stack, dim=-1)
return model_output
def io_scale(self, model_output, tile_size):
# Determine the scale factor that forward_fn applied to the input size.
# We assume the same scale on height and width.
io_scale = model_output.shape[2] / tile_size
return io_scale
def untile(self, model_output, height, width, tile_size, tile_stride, border_width, tile_device, tile_dtype):
# The inverse operation of tile
mask = self.mask(tile_size, tile_size, border_width)
mask = mask.to(device=tile_device, dtype=tile_dtype)
mask = rearrange(mask, "h w -> 1 1 h w 1")
model_output = model_output * mask
fold_operator = torch.nn.Fold(
output_size=(height, width),
kernel_size=(tile_size, tile_size),
stride=(tile_stride, tile_stride)
)
mask = repeat(mask[0, 0, :, :, 0], "h w -> 1 (h w) n", n=model_output.shape[-1])
model_output = rearrange(model_output, "b c h w n -> b (c h w) n")
model_output = fold_operator(model_output) / fold_operator(mask)
return model_output
def tiled_forward(self, forward_fn, model_input, tile_size, tile_stride, tile_batch_size=1, tile_device="cpu", tile_dtype=torch.float32, border_width=None):
# Prepare
inference_device, inference_dtype = model_input.device, model_input.dtype
height, width = model_input.shape[2], model_input.shape[3]
border_width = int(tile_stride*0.5) if border_width is None else border_width
# tile
model_input = self.tile(model_input, tile_size, tile_stride, tile_device, tile_dtype)
# inference
model_output = self.tiled_inference(forward_fn, model_input, tile_batch_size, inference_device, inference_dtype, tile_device, tile_dtype)
# resize
io_scale = self.io_scale(model_output, tile_size)
height, width = int(height*io_scale), int(width*io_scale)
tile_size, tile_stride = int(tile_size*io_scale), int(tile_stride*io_scale)
border_width = int(border_width*io_scale)
# untile
model_output = self.untile(model_output, height, width, tile_size, tile_stride, border_width, tile_device, tile_dtype)
# Done!
model_output = model_output.to(device=inference_device, dtype=inference_dtype)
return model_output
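A minimal sketch of the Unfold/Fold round trip that `TileWorker` builds on: folding the tiles back and dividing by the fold of an all-ones mask recovers the input exactly (shown here without the soft border blending):

```python
import torch

# Tile an 8x8 image into overlapping 4x4 tiles with stride 2, then reassemble.
x = torch.arange(64, dtype=torch.float32).view(1, 1, 8, 8)
tile_size, tile_stride = 4, 2
unfold = torch.nn.Unfold(kernel_size=tile_size, stride=tile_stride)
fold = torch.nn.Fold(output_size=(8, 8), kernel_size=tile_size, stride=tile_stride)
tiles = unfold(x)  # (1, tile_size*tile_size, num_tiles) = (1, 16, 9)
recon = fold(tiles) / fold(torch.ones_like(tiles))  # divide by per-pixel coverage count
print(torch.allclose(recon, x))
```

`untile` above follows the same pattern, but replaces the all-ones mask with the soft border mask so overlapping tiles blend smoothly.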
class ConvAttention(torch.nn.Module):
def __init__(self, q_dim, num_heads, head_dim, kv_dim=None, bias_q=False, bias_kv=False, bias_out=False):
super().__init__()
dim_inner = head_dim * num_heads
kv_dim = kv_dim if kv_dim is not None else q_dim
self.num_heads = num_heads
self.head_dim = head_dim
self.to_q = torch.nn.Conv2d(q_dim, dim_inner, kernel_size=(1, 1), bias=bias_q)
self.to_k = torch.nn.Conv2d(kv_dim, dim_inner, kernel_size=(1, 1), bias=bias_kv)
self.to_v = torch.nn.Conv2d(kv_dim, dim_inner, kernel_size=(1, 1), bias=bias_kv)
self.to_out = torch.nn.Conv2d(dim_inner, q_dim, kernel_size=(1, 1), bias=bias_out)
def forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None):
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
batch_size = encoder_hidden_states.shape[0]
conv_input = rearrange(hidden_states, "B L C -> B C L 1")
q = self.to_q(conv_input)
q = rearrange(q[:, :, :, 0], "B C L -> B L C")
conv_input = rearrange(encoder_hidden_states, "B L C -> B C L 1")
k = self.to_k(conv_input)
v = self.to_v(conv_input)
k = rearrange(k[:, :, :, 0], "B C L -> B L C")
v = rearrange(v[:, :, :, 0], "B C L -> B L C")
q = q.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
k = k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
v = v.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
hidden_states = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)
hidden_states = hidden_states.to(q.dtype)
conv_input = rearrange(hidden_states, "B L C -> B C L 1")
hidden_states = self.to_out(conv_input)
hidden_states = rearrange(hidden_states[:, :, :, 0], "B C L -> B L C")
return hidden_states
class Attention(torch.nn.Module):
def __init__(self, q_dim, num_heads, head_dim, kv_dim=None, bias_q=False, bias_kv=False, bias_out=False):
super().__init__()
dim_inner = head_dim * num_heads
kv_dim = kv_dim if kv_dim is not None else q_dim
self.num_heads = num_heads
self.head_dim = head_dim
self.to_q = torch.nn.Linear(q_dim, dim_inner, bias=bias_q)
self.to_k = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv)
self.to_v = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv)
self.to_out = torch.nn.Linear(dim_inner, q_dim, bias=bias_out)
def forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None):
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
batch_size = encoder_hidden_states.shape[0]
q = self.to_q(hidden_states)
k = self.to_k(encoder_hidden_states)
v = self.to_v(encoder_hidden_states)
q = q.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
k = k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
v = v.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
hidden_states = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)
hidden_states = hidden_states.to(q.dtype)
hidden_states = self.to_out(hidden_states)
return hidden_states
class VAEAttentionBlock(torch.nn.Module):
def __init__(self, num_attention_heads, attention_head_dim, in_channels, num_layers=1, norm_num_groups=32, eps=1e-5, use_conv_attention=True):
super().__init__()
inner_dim = num_attention_heads * attention_head_dim
self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=eps, affine=True)
if use_conv_attention:
self.transformer_blocks = torch.nn.ModuleList([
ConvAttention(
inner_dim,
num_attention_heads,
attention_head_dim,
bias_q=True,
bias_kv=True,
bias_out=True
)
for d in range(num_layers)
])
else:
self.transformer_blocks = torch.nn.ModuleList([
Attention(
inner_dim,
num_attention_heads,
attention_head_dim,
bias_q=True,
bias_kv=True,
bias_out=True
)
for d in range(num_layers)
])
def forward(self, hidden_states, time_emb, text_emb, res_stack):
batch, _, height, width = hidden_states.shape
residual = hidden_states
hidden_states = self.norm(hidden_states)
inner_dim = hidden_states.shape[1]
hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
for block in self.transformer_blocks:
hidden_states = block(hidden_states)
hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
hidden_states = hidden_states + residual
return hidden_states, time_emb, text_emb, res_stack
class ResnetBlock(torch.nn.Module):
def __init__(self, in_channels, out_channels, temb_channels=None, groups=32, eps=1e-5):
super().__init__()
self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
if temb_channels is not None:
self.time_emb_proj = torch.nn.Linear(temb_channels, out_channels)
self.norm2 = torch.nn.GroupNorm(num_groups=groups, num_channels=out_channels, eps=eps, affine=True)
self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
self.nonlinearity = torch.nn.SiLU()
self.conv_shortcut = None
if in_channels != out_channels:
self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=True)
def forward(self, hidden_states, time_emb, text_emb, res_stack, **kwargs):
x = hidden_states
x = self.norm1(x)
x = self.nonlinearity(x)
x = self.conv1(x)
if time_emb is not None:
emb = self.nonlinearity(time_emb)
emb = self.time_emb_proj(emb)[:, :, None, None]
x = x + emb
x = self.norm2(x)
x = self.nonlinearity(x)
x = self.conv2(x)
if self.conv_shortcut is not None:
hidden_states = self.conv_shortcut(hidden_states)
hidden_states = hidden_states + x
return hidden_states, time_emb, text_emb, res_stack
class UpSampler(torch.nn.Module):
def __init__(self, channels):
super().__init__()
self.conv = torch.nn.Conv2d(channels, channels, 3, padding=1)
def forward(self, hidden_states, time_emb, text_emb, res_stack, **kwargs):
hidden_states = torch.nn.functional.interpolate(hidden_states, scale_factor=2.0, mode="nearest")
hidden_states = self.conv(hidden_states)
return hidden_states, time_emb, text_emb, res_stack
class DownSampler(torch.nn.Module):
def __init__(self, channels, padding=1, extra_padding=False):
super().__init__()
self.conv = torch.nn.Conv2d(channels, channels, 3, stride=2, padding=padding)
self.extra_padding = extra_padding
def forward(self, hidden_states, time_emb, text_emb, res_stack, **kwargs):
if self.extra_padding:
hidden_states = torch.nn.functional.pad(hidden_states, (0, 1, 0, 1), mode="constant", value=0)
hidden_states = self.conv(hidden_states)
return hidden_states, time_emb, text_emb, res_stack
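The `DownSampler` above (with `extra_padding=True`) pads one row/column on the right and bottom, then applies a stride-2 conv with `padding=0`, which halves each spatial dimension exactly for even inputs. A standalone shape check:

```python
import torch

# Pad (right, bottom) by 1, then stride-2 conv: 64 -> (65 - 3) // 2 + 1 = 32.
conv = torch.nn.Conv2d(8, 8, 3, stride=2, padding=0)
x = torch.nn.functional.pad(torch.randn(1, 8, 64, 64), (0, 1, 0, 1), mode="constant", value=0)
print(conv(x).shape)  # torch.Size([1, 8, 32, 32])
```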
class FluxVAEDecoder(torch.nn.Module):
def __init__(self, use_conv_attention=True):
super().__init__()
self.scaling_factor = 0.3611
self.shift_factor = 0.1159
self.conv_in = torch.nn.Conv2d(16, 512, kernel_size=3, padding=1) # Different from SD 1.x
self.blocks = torch.nn.ModuleList([
# UNetMidBlock2D
ResnetBlock(512, 512, eps=1e-6),
VAEAttentionBlock(1, 512, 512, 1, eps=1e-6, use_conv_attention=use_conv_attention),
ResnetBlock(512, 512, eps=1e-6),
# UpDecoderBlock2D
ResnetBlock(512, 512, eps=1e-6),
ResnetBlock(512, 512, eps=1e-6),
ResnetBlock(512, 512, eps=1e-6),
UpSampler(512),
# UpDecoderBlock2D
ResnetBlock(512, 512, eps=1e-6),
ResnetBlock(512, 512, eps=1e-6),
ResnetBlock(512, 512, eps=1e-6),
UpSampler(512),
# UpDecoderBlock2D
ResnetBlock(512, 256, eps=1e-6),
ResnetBlock(256, 256, eps=1e-6),
ResnetBlock(256, 256, eps=1e-6),
UpSampler(256),
# UpDecoderBlock2D
ResnetBlock(256, 128, eps=1e-6),
ResnetBlock(128, 128, eps=1e-6),
ResnetBlock(128, 128, eps=1e-6),
])
self.conv_norm_out = torch.nn.GroupNorm(num_channels=128, num_groups=32, eps=1e-6)
self.conv_act = torch.nn.SiLU()
self.conv_out = torch.nn.Conv2d(128, 3, kernel_size=3, padding=1)
def tiled_forward(self, sample, tile_size=64, tile_stride=32):
hidden_states = TileWorker().tiled_forward(
lambda x: self.forward(x),
sample,
tile_size,
tile_stride,
tile_device=sample.device,
tile_dtype=sample.dtype
)
return hidden_states
def forward(self, sample, tiled=False, tile_size=64, tile_stride=32, **kwargs):
# For VAE Decoder, we do not need to apply the tiler on each layer.
if tiled:
return self.tiled_forward(sample, tile_size=tile_size, tile_stride=tile_stride)
# 1. pre-process
hidden_states = sample / self.scaling_factor + self.shift_factor
hidden_states = self.conv_in(hidden_states)
time_emb = None
text_emb = None
res_stack = None
# 2. blocks
for i, block in enumerate(self.blocks):
hidden_states, time_emb, text_emb, res_stack = block(hidden_states, time_emb, text_emb, res_stack)
# 3. output
hidden_states = self.conv_norm_out(hidden_states)
hidden_states = self.conv_act(hidden_states)
hidden_states = self.conv_out(hidden_states)
return hidden_states
class FluxVAEEncoder(torch.nn.Module):
def __init__(self, use_conv_attention=True):
super().__init__()
self.scaling_factor = 0.3611
self.shift_factor = 0.1159
self.conv_in = torch.nn.Conv2d(3, 128, kernel_size=3, padding=1)
self.blocks = torch.nn.ModuleList([
# DownEncoderBlock2D
ResnetBlock(128, 128, eps=1e-6),
ResnetBlock(128, 128, eps=1e-6),
DownSampler(128, padding=0, extra_padding=True),
# DownEncoderBlock2D
ResnetBlock(128, 256, eps=1e-6),
ResnetBlock(256, 256, eps=1e-6),
DownSampler(256, padding=0, extra_padding=True),
# DownEncoderBlock2D
ResnetBlock(256, 512, eps=1e-6),
ResnetBlock(512, 512, eps=1e-6),
DownSampler(512, padding=0, extra_padding=True),
# DownEncoderBlock2D
ResnetBlock(512, 512, eps=1e-6),
ResnetBlock(512, 512, eps=1e-6),
# UNetMidBlock2D
ResnetBlock(512, 512, eps=1e-6),
VAEAttentionBlock(1, 512, 512, 1, eps=1e-6, use_conv_attention=use_conv_attention),
ResnetBlock(512, 512, eps=1e-6),
])
self.conv_norm_out = torch.nn.GroupNorm(num_channels=512, num_groups=32, eps=1e-6)
self.conv_act = torch.nn.SiLU()
self.conv_out = torch.nn.Conv2d(512, 32, kernel_size=3, padding=1)
def tiled_forward(self, sample, tile_size=64, tile_stride=32):
hidden_states = TileWorker().tiled_forward(
lambda x: self.forward(x),
sample,
tile_size,
tile_stride,
tile_device=sample.device,
tile_dtype=sample.dtype
)
return hidden_states
def forward(self, sample, tiled=False, tile_size=64, tile_stride=32, **kwargs):
# For the VAE Encoder, we do not need to apply the tiler on each layer.
if tiled:
return self.tiled_forward(sample, tile_size=tile_size, tile_stride=tile_stride)
# 1. pre-process
hidden_states = self.conv_in(sample)
time_emb = None
text_emb = None
res_stack = None
# 2. blocks
for i, block in enumerate(self.blocks):
hidden_states, time_emb, text_emb, res_stack = block(hidden_states, time_emb, text_emb, res_stack)
# 3. output
hidden_states = self.conv_norm_out(hidden_states)
hidden_states = self.conv_act(hidden_states)
hidden_states = self.conv_out(hidden_states)
hidden_states = hidden_states[:, :16]
hidden_states = (hidden_states - self.shift_factor) * self.scaling_factor
return hidden_states
def encode_video(self, sample, batch_size=8):
B = sample.shape[0]
hidden_states = []
for i in range(0, sample.shape[2], batch_size):
j = min(i + batch_size, sample.shape[2])
sample_batch = rearrange(sample[:,:,i:j], "B C T H W -> (B T) C H W")
hidden_states_batch = self(sample_batch)
hidden_states_batch = rearrange(hidden_states_batch, "(B T) C H W -> B C T H W", B=B)
hidden_states.append(hidden_states_batch)
hidden_states = torch.concat(hidden_states, dim=2)
return hidden_states
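A sketch of the latent scaling shared by `FluxVAEEncoder` and `FluxVAEDecoder`: the encoder applies `(h - shift_factor) * scaling_factor` and the decoder inverts it with `h / scaling_factor + shift_factor`. Helper names below are illustrative:

```python
# The two transforms are exact inverses (up to float rounding).
scaling_factor, shift_factor = 0.3611, 0.1159

def encode_scale(h):
    return (h - shift_factor) * scaling_factor

def decode_scale(h):
    return h / scaling_factor + shift_factor

print(decode_scale(encode_scale(0.75)))  # ~0.75
```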

View File

@@ -0,0 +1,56 @@
import torch
from .general_modules import TemporalTimesteps
class MultiValueEncoder(torch.nn.Module):
def __init__(self, encoders=()):
super().__init__()
if not isinstance(encoders, list):
encoders = [encoders]
self.encoders = torch.nn.ModuleList(encoders)
def __call__(self, values, dtype):
emb = []
for encoder, value in zip(self.encoders, values):
if value is not None:
value = value.unsqueeze(0)
emb.append(encoder(value, dtype))
emb = torch.concat(emb, dim=0)
return emb
class SingleValueEncoder(torch.nn.Module):
def __init__(self, dim_in=256, dim_out=4096, prefer_len=32, computation_device=None):
super().__init__()
self.prefer_len = prefer_len
self.prefer_proj = TemporalTimesteps(num_channels=dim_in, flip_sin_to_cos=True, downscale_freq_shift=0, computation_device=computation_device)
self.prefer_value_embedder = torch.nn.Sequential(
torch.nn.Linear(dim_in, dim_out), torch.nn.SiLU(), torch.nn.Linear(dim_out, dim_out)
)
self.positional_embedding = torch.nn.Parameter(
torch.randn(self.prefer_len, dim_out)
)
def forward(self, value, dtype):
value = value * 1000
emb = self.prefer_proj(value).to(dtype)
emb = self.prefer_value_embedder(emb).squeeze(0)
base_embeddings = emb.expand(self.prefer_len, -1)
positional_embedding = self.positional_embedding.to(dtype=base_embeddings.dtype, device=base_embeddings.device)
learned_embeddings = base_embeddings + positional_embedding
return learned_embeddings
@staticmethod
def state_dict_converter():
return SingleValueEncoderStateDictConverter()
class SingleValueEncoderStateDictConverter:
def __init__(self):
pass
def from_diffusers(self, state_dict):
return state_dict
def from_civitai(self, state_dict):
return state_dict

View File

@@ -0,0 +1,146 @@
import torch, math
def get_timestep_embedding(
timesteps: torch.Tensor,
embedding_dim: int,
flip_sin_to_cos: bool = False,
downscale_freq_shift: float = 1,
scale: float = 1,
max_period: int = 10000,
computation_device = None,
align_dtype_to_timestep = False,
):
assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
half_dim = embedding_dim // 2
exponent = -math.log(max_period) * torch.arange(
start=0, end=half_dim, dtype=torch.float32, device=timesteps.device if computation_device is None else computation_device
)
exponent = exponent / (half_dim - downscale_freq_shift)
emb = torch.exp(exponent)
if align_dtype_to_timestep:
emb = emb.to(timesteps.dtype)
emb = timesteps[:, None].float() * emb[None, :]
# scale embeddings
emb = scale * emb
# concat sine and cosine embeddings
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
# flip sine and cosine embeddings
if flip_sin_to_cos:
emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
# zero pad
if embedding_dim % 2 == 1:
emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
return emb
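A standalone recomputation of the frequency schedule inside `get_timestep_embedding` (using `downscale_freq_shift=0`, as `TemporalTimesteps` configures it): `half_dim` frequencies decaying exponentially from 1 toward `1/max_period`, each producing a sine and a cosine feature:

```python
import math

embedding_dim, max_period = 8, 10000
half_dim = embedding_dim // 2
freqs = [math.exp(-math.log(max_period) * i / half_dim) for i in range(half_dim)]
t = 3.0
emb = [math.sin(t * f) for f in freqs] + [math.cos(t * f) for f in freqs]
print(len(emb), freqs[0])  # 8 1.0
```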
class TemporalTimesteps(torch.nn.Module):
def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float, computation_device = None, scale=1, align_dtype_to_timestep=False):
super().__init__()
self.num_channels = num_channels
self.flip_sin_to_cos = flip_sin_to_cos
self.downscale_freq_shift = downscale_freq_shift
self.computation_device = computation_device
self.scale = scale
self.align_dtype_to_timestep = align_dtype_to_timestep
def forward(self, timesteps):
t_emb = get_timestep_embedding(
timesteps,
self.num_channels,
flip_sin_to_cos=self.flip_sin_to_cos,
downscale_freq_shift=self.downscale_freq_shift,
computation_device=self.computation_device,
scale=self.scale,
align_dtype_to_timestep=self.align_dtype_to_timestep,
)
return t_emb
class DiffusersCompatibleTimestepProj(torch.nn.Module):
def __init__(self, dim_in, dim_out):
super().__init__()
self.linear_1 = torch.nn.Linear(dim_in, dim_out)
self.act = torch.nn.SiLU()
self.linear_2 = torch.nn.Linear(dim_out, dim_out)
def forward(self, x):
x = self.linear_1(x)
x = self.act(x)
x = self.linear_2(x)
return x
class TimestepEmbeddings(torch.nn.Module):
def __init__(self, dim_in, dim_out, computation_device=None, diffusers_compatible_format=False, scale=1, align_dtype_to_timestep=False, use_additional_t_cond=False):
super().__init__()
self.time_proj = TemporalTimesteps(num_channels=dim_in, flip_sin_to_cos=True, downscale_freq_shift=0, computation_device=computation_device, scale=scale, align_dtype_to_timestep=align_dtype_to_timestep)
if diffusers_compatible_format:
self.timestep_embedder = DiffusersCompatibleTimestepProj(dim_in, dim_out)
else:
self.timestep_embedder = torch.nn.Sequential(
torch.nn.Linear(dim_in, dim_out), torch.nn.SiLU(), torch.nn.Linear(dim_out, dim_out)
)
self.use_additional_t_cond = use_additional_t_cond
if use_additional_t_cond:
self.addition_t_embedding = torch.nn.Embedding(2, dim_out)
def forward(self, timestep, dtype, addition_t_cond=None):
time_emb = self.time_proj(timestep).to(dtype)
time_emb = self.timestep_embedder(time_emb)
if addition_t_cond is not None:
addition_t_emb = self.addition_t_embedding(addition_t_cond)
addition_t_emb = addition_t_emb.to(dtype=dtype)
time_emb = time_emb + addition_t_emb
return time_emb
class RMSNorm(torch.nn.Module):
def __init__(self, dim, eps, elementwise_affine=True):
super().__init__()
self.eps = eps
if elementwise_affine:
self.weight = torch.nn.Parameter(torch.ones((dim,)))
else:
self.weight = None
def forward(self, hidden_states):
input_dtype = hidden_states.dtype
variance = hidden_states.to(torch.float32).square().mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
hidden_states = hidden_states.to(input_dtype)
if self.weight is not None:
hidden_states = hidden_states * self.weight
return hidden_states
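`RMSNorm` as implemented above scales by the reciprocal root-mean-square of the last dimension (no mean subtraction, unlike LayerNorm). A tiny worked example:

```python
import torch

# rms([3, 4]) = sqrt((9 + 16) / 2) = sqrt(12.5), so out = x / sqrt(12.5).
x = torch.tensor([[3.0, 4.0]])
eps = 1e-6
out = x * torch.rsqrt(x.square().mean(-1, keepdim=True) + eps)
print(out)  # ~[[0.8485, 1.1314]]
```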
class AdaLayerNorm(torch.nn.Module):
def __init__(self, dim, single=False, dual=False):
super().__init__()
self.single = single
self.dual = dual
self.linear = torch.nn.Linear(dim, dim * [[6, 2][single], 9][dual])  # 9 chunks if dual, else 2 if single, else 6
self.norm = torch.nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
def forward(self, x, emb):
emb = self.linear(torch.nn.functional.silu(emb))
if self.single:
scale, shift = emb.unsqueeze(1).chunk(2, dim=2)
x = self.norm(x) * (1 + scale) + shift
return x
elif self.dual:
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp, shift_msa2, scale_msa2, gate_msa2 = emb.unsqueeze(1).chunk(9, dim=2)
norm_x = self.norm(x)
x = norm_x * (1 + scale_msa) + shift_msa
norm_x2 = norm_x * (1 + scale_msa2) + shift_msa2
return x, gate_msa, shift_mlp, scale_mlp, gate_mlp, norm_x2, gate_msa2
else:
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.unsqueeze(1).chunk(6, dim=2)
x = self.norm(x) * (1 + scale_msa) + shift_msa
return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
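The chunk count in `AdaLayerNorm.__init__` comes from nested bool indexing: `[[6, 2][single], 9][dual]` selects 9 when `dual`, else 2 when `single`, else 6, matching the three branches of `forward`. A pure-Python check (the helper name `n_chunks` is illustrative):

```python
def n_chunks(single, dual):
    # bools index as 0/1: dual wins, then single, then the default 6-way split
    return [[6, 2][single], 9][dual]

print(n_chunks(False, False), n_chunks(True, False), n_chunks(False, True))  # 6 2 9
```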

View File

@@ -1,451 +0,0 @@
from .attention import Attention
from .tiler import TileWorker
from einops import repeat, rearrange
import math
import torch
class HunyuanDiTRotaryEmbedding(torch.nn.Module):
def __init__(self, q_norm_shape=88, k_norm_shape=88, rotary_emb_on_k=True):
super().__init__()
self.q_norm = torch.nn.LayerNorm((q_norm_shape,), elementwise_affine=True, eps=1e-06)
self.k_norm = torch.nn.LayerNorm((k_norm_shape,), elementwise_affine=True, eps=1e-06)
self.rotary_emb_on_k = rotary_emb_on_k
self.k_cache, self.v_cache = [], []
def reshape_for_broadcast(self, freqs_cis, x):
ndim = x.ndim
shape = [d if i == ndim - 2 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
return freqs_cis[0].view(*shape), freqs_cis[1].view(*shape)
def rotate_half(self, x):
x_real, x_imag = x.float().reshape(*x.shape[:-1], -1, 2).unbind(-1)
return torch.stack([-x_imag, x_real], dim=-1).flatten(3)
def apply_rotary_emb(self, xq, xk, freqs_cis):
xk_out = None
cos, sin = self.reshape_for_broadcast(freqs_cis, xq)
cos, sin = cos.to(xq.device), sin.to(xq.device)
xq_out = (xq.float() * cos + self.rotate_half(xq.float()) * sin).type_as(xq)
if xk is not None:
xk_out = (xk.float() * cos + self.rotate_half(xk.float()) * sin).type_as(xk)
return xq_out, xk_out
def forward(self, q, k, v, freqs_cis_img, to_cache=False):
# norm
q = self.q_norm(q)
k = self.k_norm(k)
# RoPE
if self.rotary_emb_on_k:
q, k = self.apply_rotary_emb(q, k, freqs_cis_img)
else:
q, _ = self.apply_rotary_emb(q, None, freqs_cis_img)
if to_cache:
self.k_cache.append(k)
self.v_cache.append(v)
elif len(self.k_cache) > 0 and len(self.v_cache) > 0:
k = torch.concat([k] + self.k_cache, dim=2)
v = torch.concat([v] + self.v_cache, dim=2)
self.k_cache, self.v_cache = [], []
return q, k, v
class FP32_Layernorm(torch.nn.LayerNorm):
def forward(self, inputs):
origin_dtype = inputs.dtype
return torch.nn.functional.layer_norm(inputs.float(), self.normalized_shape, self.weight.float(), self.bias.float(), self.eps).to(origin_dtype)
class FP32_SiLU(torch.nn.SiLU):
def forward(self, inputs):
origin_dtype = inputs.dtype
return torch.nn.functional.silu(inputs.float(), inplace=False).to(origin_dtype)
class HunyuanDiTFinalLayer(torch.nn.Module):
def __init__(self, final_hidden_size=1408, condition_dim=1408, patch_size=2, out_channels=8):
super().__init__()
self.norm_final = torch.nn.LayerNorm(final_hidden_size, elementwise_affine=False, eps=1e-6)
self.linear = torch.nn.Linear(final_hidden_size, patch_size * patch_size * out_channels, bias=True)
self.adaLN_modulation = torch.nn.Sequential(
FP32_SiLU(),
torch.nn.Linear(condition_dim, 2 * final_hidden_size, bias=True)
)
def modulate(self, x, shift, scale):
return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
def forward(self, hidden_states, condition_emb):
shift, scale = self.adaLN_modulation(condition_emb).chunk(2, dim=1)
hidden_states = self.modulate(self.norm_final(hidden_states), shift, scale)
hidden_states = self.linear(hidden_states)
return hidden_states
class HunyuanDiTBlock(torch.nn.Module):
def __init__(
self,
hidden_dim=1408,
condition_dim=1408,
num_heads=16,
mlp_ratio=4.3637,
text_dim=1024,
skip_connection=False
):
super().__init__()
self.norm1 = FP32_Layernorm((hidden_dim,), eps=1e-6, elementwise_affine=True)
self.rota1 = HunyuanDiTRotaryEmbedding(hidden_dim//num_heads, hidden_dim//num_heads)
self.attn1 = Attention(hidden_dim, num_heads, hidden_dim//num_heads, bias_q=True, bias_kv=True, bias_out=True)
self.norm2 = FP32_Layernorm((hidden_dim,), eps=1e-6, elementwise_affine=True)
self.rota2 = HunyuanDiTRotaryEmbedding(hidden_dim//num_heads, hidden_dim//num_heads, rotary_emb_on_k=False)
self.attn2 = Attention(hidden_dim, num_heads, hidden_dim//num_heads, kv_dim=text_dim, bias_q=True, bias_kv=True, bias_out=True)
self.norm3 = FP32_Layernorm((hidden_dim,), eps=1e-6, elementwise_affine=True)
self.modulation = torch.nn.Sequential(FP32_SiLU(), torch.nn.Linear(condition_dim, hidden_dim, bias=True))
self.mlp = torch.nn.Sequential(
torch.nn.Linear(hidden_dim, int(hidden_dim*mlp_ratio), bias=True),
torch.nn.GELU(approximate="tanh"),
torch.nn.Linear(int(hidden_dim*mlp_ratio), hidden_dim, bias=True)
)
if skip_connection:
self.skip_norm = FP32_Layernorm((hidden_dim * 2,), eps=1e-6, elementwise_affine=True)
self.skip_linear = torch.nn.Linear(hidden_dim * 2, hidden_dim, bias=True)
else:
self.skip_norm, self.skip_linear = None, None
def forward(self, hidden_states, condition_emb, text_emb, freq_cis_img, residual=None, to_cache=False):
# Long Skip Connection
if self.skip_norm is not None and self.skip_linear is not None:
hidden_states = torch.cat([hidden_states, residual], dim=-1)
hidden_states = self.skip_norm(hidden_states)
hidden_states = self.skip_linear(hidden_states)
# Self-Attention
shift_msa = self.modulation(condition_emb).unsqueeze(dim=1)
attn_input = self.norm1(hidden_states) + shift_msa
hidden_states = hidden_states + self.attn1(attn_input, qkv_preprocessor=lambda q, k, v: self.rota1(q, k, v, freq_cis_img, to_cache=to_cache))
# Cross-Attention
attn_input = self.norm3(hidden_states)
hidden_states = hidden_states + self.attn2(attn_input, text_emb, qkv_preprocessor=lambda q, k, v: self.rota2(q, k, v, freq_cis_img))
# FFN Layer
mlp_input = self.norm2(hidden_states)
hidden_states = hidden_states + self.mlp(mlp_input)
return hidden_states
class AttentionPool(torch.nn.Module):
def __init__(self, spacial_dim, embed_dim, num_heads, output_dim = None):
super().__init__()
self.positional_embedding = torch.nn.Parameter(torch.randn(spacial_dim + 1, embed_dim) / embed_dim ** 0.5)
self.k_proj = torch.nn.Linear(embed_dim, embed_dim)
self.q_proj = torch.nn.Linear(embed_dim, embed_dim)
self.v_proj = torch.nn.Linear(embed_dim, embed_dim)
self.c_proj = torch.nn.Linear(embed_dim, output_dim or embed_dim)
self.num_heads = num_heads
def forward(self, x):
x = x.permute(1, 0, 2) # NLC -> LNC
x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (L+1)NC
x = x + self.positional_embedding[:, None, :].to(x.dtype) # (L+1)NC
x, _ = torch.nn.functional.multi_head_attention_forward(
query=x[:1], key=x, value=x,
embed_dim_to_check=x.shape[-1],
num_heads=self.num_heads,
q_proj_weight=self.q_proj.weight,
k_proj_weight=self.k_proj.weight,
v_proj_weight=self.v_proj.weight,
in_proj_weight=None,
in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
bias_k=None,
bias_v=None,
add_zero_attn=False,
dropout_p=0,
out_proj_weight=self.c_proj.weight,
out_proj_bias=self.c_proj.bias,
use_separate_proj_weight=True,
training=self.training,
need_weights=False
)
return x.squeeze(0)
class PatchEmbed(torch.nn.Module):
def __init__(
self,
patch_size=(2, 2),
in_chans=4,
embed_dim=1408,
bias=True,
):
super().__init__()
self.proj = torch.nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias)
def forward(self, x):
x = self.proj(x)
x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
return x
def timestep_embedding(t, dim, max_period=10000, repeat_only=False):
# https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
if not repeat_only:
half = dim // 2
freqs = torch.exp(
-math.log(max_period)
* torch.arange(start=0, end=half, dtype=torch.float32)
/ half
).to(device=t.device) # size: [dim/2], an exponentially decaying frequency curve
args = t[:, None].float() * freqs[None]
embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
if dim % 2:
embedding = torch.cat(
[embedding, torch.zeros_like(embedding[:, :1])], dim=-1
)
else:
embedding = repeat(t, "b -> b d", d=dim)
return embedding
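The non-repeat branch above can be sketched in isolation: each scalar timestep is mapped to `dim` sinusoidal features (cos first, then sin in this layout), so `t=0` yields all-ones cosines and all-zeros sines.

```python
import math
import torch

# Minimal sketch of the sinusoidal timestep embedding (cos-first layout).
def sinusoidal_embedding(t, dim, max_period=10000):
    half = dim // 2
    freqs = torch.exp(
        -math.log(max_period) * torch.arange(half, dtype=torch.float32) / half
    )
    args = t[:, None].float() * freqs[None]
    return torch.cat([torch.cos(args), torch.sin(args)], dim=-1)

emb = sinusoidal_embedding(torch.tensor([0, 500, 999]), 8)
print(emb.shape)  # torch.Size([3, 8])
print(emb[0])     # t=0: cos terms are 1, sin terms are 0
```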
class TimestepEmbedder(torch.nn.Module):
def __init__(self, hidden_size=1408, frequency_embedding_size=256):
super().__init__()
self.mlp = torch.nn.Sequential(
torch.nn.Linear(frequency_embedding_size, hidden_size, bias=True),
torch.nn.SiLU(),
torch.nn.Linear(hidden_size, hidden_size, bias=True),
)
self.frequency_embedding_size = frequency_embedding_size
def forward(self, t):
t_freq = timestep_embedding(t, self.frequency_embedding_size).type(self.mlp[0].weight.dtype)
t_emb = self.mlp(t_freq)
return t_emb
class HunyuanDiT(torch.nn.Module):
def __init__(self, num_layers_down=21, num_layers_up=19, in_channels=4, out_channels=8, hidden_dim=1408, text_dim=1024, t5_dim=2048, text_length=77, t5_length=256):
super().__init__()
# Embedders
self.text_emb_padding = torch.nn.Parameter(torch.randn(text_length + t5_length, text_dim, dtype=torch.float32))
self.t5_embedder = torch.nn.Sequential(
torch.nn.Linear(t5_dim, t5_dim * 4, bias=True),
FP32_SiLU(),
torch.nn.Linear(t5_dim * 4, text_dim, bias=True),
)
self.t5_pooler = AttentionPool(t5_length, t5_dim, num_heads=8, output_dim=1024)
self.style_embedder = torch.nn.Parameter(torch.randn(hidden_dim))
self.patch_embedder = PatchEmbed(in_chans=in_channels)
self.timestep_embedder = TimestepEmbedder()
self.extra_embedder = torch.nn.Sequential(
torch.nn.Linear(256 * 6 + 1024 + hidden_dim, hidden_dim * 4),
FP32_SiLU(),
torch.nn.Linear(hidden_dim * 4, hidden_dim),
)
# Transformer blocks
self.num_layers_down = num_layers_down
self.num_layers_up = num_layers_up
self.blocks = torch.nn.ModuleList(
[HunyuanDiTBlock(skip_connection=False) for _ in range(num_layers_down)] + \
[HunyuanDiTBlock(skip_connection=True) for _ in range(num_layers_up)]
)
# Output layers
self.final_layer = HunyuanDiTFinalLayer()
self.out_channels = out_channels
def prepare_text_emb(self, text_emb, text_emb_t5, text_emb_mask, text_emb_mask_t5):
text_emb_mask = text_emb_mask.bool()
text_emb_mask_t5 = text_emb_mask_t5.bool()
text_emb_t5 = self.t5_embedder(text_emb_t5)
text_emb = torch.cat([text_emb, text_emb_t5], dim=1)
text_emb_mask = torch.cat([text_emb_mask, text_emb_mask_t5], dim=-1)
text_emb = torch.where(text_emb_mask.unsqueeze(2), text_emb, self.text_emb_padding.to(text_emb))
return text_emb
def prepare_extra_emb(self, text_emb_t5, timestep, size_emb, dtype, batch_size):
# Text embedding
pooled_text_emb_t5 = self.t5_pooler(text_emb_t5)
# Timestep embedding
timestep_emb = self.timestep_embedder(timestep)
# Size embedding
size_emb = timestep_embedding(size_emb.view(-1), 256).to(dtype)
size_emb = size_emb.view(-1, 6 * 256)
# Style embedding
style_emb = repeat(self.style_embedder, "D -> B D", B=batch_size)
# Concatenate all extra vectors
extra_emb = torch.cat([pooled_text_emb_t5, size_emb, style_emb], dim=1)
condition_emb = timestep_emb + self.extra_embedder(extra_emb)
return condition_emb
def unpatchify(self, x, h, w):
return rearrange(x, "B (H W) (P Q C) -> B C (H P) (W Q)", H=h, W=w, P=2, Q=2)
def build_mask(self, data, is_bound):
_, _, H, W = data.shape
h = repeat(torch.arange(H), "H -> H W", H=H, W=W)
w = repeat(torch.arange(W), "W -> H W", H=H, W=W)
border_width = (H + W) // 4
pad = torch.ones_like(h) * border_width
mask = torch.stack([
pad if is_bound[0] else h + 1,
pad if is_bound[1] else H - h,
pad if is_bound[2] else w + 1,
pad if is_bound[3] else W - w
]).min(dim=0).values
mask = mask.clip(1, border_width)
mask = (mask / border_width).to(dtype=data.dtype, device=data.device)
mask = rearrange(mask, "H W -> 1 H W")
return mask
def tiled_block_forward(self, block, hidden_states, condition_emb, text_emb, freq_cis_img, residual, torch_dtype, data_device, computation_device, tile_size, tile_stride):
B, C, H, W = hidden_states.shape
weight = torch.zeros((1, 1, H, W), dtype=torch_dtype, device=data_device)
values = torch.zeros((B, C, H, W), dtype=torch_dtype, device=data_device)
# Split tasks
tasks = []
for h in range(0, H, tile_stride):
for w in range(0, W, tile_stride):
if (h-tile_stride >= 0 and h-tile_stride+tile_size >= H) or (w-tile_stride >= 0 and w-tile_stride+tile_size >= W):
continue
h_, w_ = h + tile_size, w + tile_size
if h_ > H: h, h_ = H - tile_size, H
if w_ > W: w, w_ = W - tile_size, W
tasks.append((h, h_, w, w_))
# Run
for hl, hr, wl, wr in tasks:
hidden_states_batch = hidden_states[:, :, hl:hr, wl:wr].to(computation_device)
hidden_states_batch = rearrange(hidden_states_batch, "B C H W -> B (H W) C")
if residual is not None:
residual_batch = residual[:, :, hl:hr, wl:wr].to(computation_device)
residual_batch = rearrange(residual_batch, "B C H W -> B (H W) C")
else:
residual_batch = None
# Forward
hidden_states_batch = block(hidden_states_batch, condition_emb, text_emb, freq_cis_img, residual_batch).to(data_device)
hidden_states_batch = rearrange(hidden_states_batch, "B (H W) C -> B C H W", H=hr-hl)
mask = self.build_mask(hidden_states_batch, is_bound=(hl==0, hr>=H, wl==0, wr>=W))
values[:, :, hl:hr, wl:wr] += hidden_states_batch * mask
weight[:, :, hl:hr, wl:wr] += mask
values /= weight
return values
def forward(
self, hidden_states, text_emb, text_emb_t5, text_emb_mask, text_emb_mask_t5, timestep, size_emb, freq_cis_img,
tiled=False, tile_size=64, tile_stride=32,
to_cache=False,
use_gradient_checkpointing=False,
):
# Embeddings
text_emb = self.prepare_text_emb(text_emb, text_emb_t5, text_emb_mask, text_emb_mask_t5)
condition_emb = self.prepare_extra_emb(text_emb_t5, timestep, size_emb, hidden_states.dtype, hidden_states.shape[0])
# Input
height, width = hidden_states.shape[-2], hidden_states.shape[-1]
hidden_states = self.patch_embedder(hidden_states)
# Blocks
def create_custom_forward(module):
def custom_forward(*inputs):
return module(*inputs)
return custom_forward
if tiled:
hidden_states = rearrange(hidden_states, "B (H W) C -> B C H W", H=height//2)
residuals = []
for block_id, block in enumerate(self.blocks):
residual = residuals.pop() if block_id >= self.num_layers_down else None
hidden_states = self.tiled_block_forward(
block, hidden_states, condition_emb, text_emb, freq_cis_img, residual,
torch_dtype=hidden_states.dtype, data_device=hidden_states.device, computation_device=hidden_states.device,
tile_size=tile_size, tile_stride=tile_stride
)
if block_id < self.num_layers_down - 2:
residuals.append(hidden_states)
hidden_states = rearrange(hidden_states, "B C H W -> B (H W) C")
else:
residuals = []
for block_id, block in enumerate(self.blocks):
residual = residuals.pop() if block_id >= self.num_layers_down else None
if self.training and use_gradient_checkpointing:
hidden_states = torch.utils.checkpoint.checkpoint(
create_custom_forward(block),
hidden_states, condition_emb, text_emb, freq_cis_img, residual,
use_reentrant=False,
)
else:
hidden_states = block(hidden_states, condition_emb, text_emb, freq_cis_img, residual, to_cache=to_cache)
if block_id < self.num_layers_down - 2:
residuals.append(hidden_states)
# Output
hidden_states = self.final_layer(hidden_states, condition_emb)
hidden_states = self.unpatchify(hidden_states, height//2, width//2)
hidden_states, _ = hidden_states.chunk(2, dim=1)
return hidden_states
def state_dict_converter(self):
return HunyuanDiTStateDictConverter()
class HunyuanDiTStateDictConverter():
def __init__(self):
pass
def from_diffusers(self, state_dict):
state_dict_ = {}
for name, param in state_dict.items():
name_ = name
name_ = name_.replace(".default_modulation.", ".modulation.")
name_ = name_.replace(".mlp.fc1.", ".mlp.0.")
name_ = name_.replace(".mlp.fc2.", ".mlp.2.")
name_ = name_.replace(".attn1.q_norm.", ".rota1.q_norm.")
name_ = name_.replace(".attn2.q_norm.", ".rota2.q_norm.")
name_ = name_.replace(".attn1.k_norm.", ".rota1.k_norm.")
name_ = name_.replace(".attn2.k_norm.", ".rota2.k_norm.")
name_ = name_.replace(".q_proj.", ".to_q.")
name_ = name_.replace(".out_proj.", ".to_out.")
name_ = name_.replace("text_embedding_padding", "text_emb_padding")
name_ = name_.replace("mlp_t5.0.", "t5_embedder.0.")
name_ = name_.replace("mlp_t5.2.", "t5_embedder.2.")
name_ = name_.replace("pooler.", "t5_pooler.")
name_ = name_.replace("x_embedder.", "patch_embedder.")
name_ = name_.replace("t_embedder.", "timestep_embedder.")
name_ = name_.replace("t5_pooler.to_q.", "t5_pooler.q_proj.")
name_ = name_.replace("style_embedder.weight", "style_embedder")
if ".kv_proj." in name_:
param_k = param[:param.shape[0]//2]
param_v = param[param.shape[0]//2:]
state_dict_[name_.replace(".kv_proj.", ".to_k.")] = param_k
state_dict_[name_.replace(".kv_proj.", ".to_v.")] = param_v
elif ".Wqkv." in name_:
param_q = param[:param.shape[0]//3]
param_k = param[param.shape[0]//3:param.shape[0]//3*2]
param_v = param[param.shape[0]//3*2:]
state_dict_[name_.replace(".Wqkv.", ".to_q.")] = param_q
state_dict_[name_.replace(".Wqkv.", ".to_k.")] = param_k
state_dict_[name_.replace(".Wqkv.", ".to_v.")] = param_v
elif "style_embedder" in name_:
state_dict_[name_] = param.squeeze()
else:
state_dict_[name_] = param
return state_dict_
def from_civitai(self, state_dict):
return self.from_diffusers(state_dict)
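The fused-QKV handling in the converter can be sketched in isolation (the key name below is hypothetical): a stacked `.Wqkv.` weight of shape `(3*D, D)` is split row-wise into separate `to_q`/`to_k`/`to_v` entries.

```python
import torch

# Sketch of the fused-QKV split performed by the converter (toy size D=4).
D = 4
state_dict = {
    "blocks.0.attn1.Wqkv.weight": torch.arange(3 * D * D, dtype=torch.float32).view(3 * D, D)
}

converted = {}
for name, param in state_dict.items():
    if ".Wqkv." in name:
        q, k, v = param.chunk(3, dim=0)  # split the stacked rows into thirds
        converted[name.replace(".Wqkv.", ".to_q.")] = q
        converted[name.replace(".Wqkv.", ".to_k.")] = k
        converted[name.replace(".Wqkv.", ".to_v.")] = v

print(sorted(converted))  # to_k, to_q, to_v weights, each of shape (4, 4)
```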


@@ -1,161 +0,0 @@
from transformers import BertModel, BertConfig, T5EncoderModel, T5Config
import torch
class HunyuanDiTCLIPTextEncoder(BertModel):
def __init__(self):
config = BertConfig(
_name_or_path = "",
architectures = ["BertModel"],
attention_probs_dropout_prob = 0.1,
bos_token_id = 0,
classifier_dropout = None,
directionality = "bidi",
eos_token_id = 2,
hidden_act = "gelu",
hidden_dropout_prob = 0.1,
hidden_size = 1024,
initializer_range = 0.02,
intermediate_size = 4096,
layer_norm_eps = 1e-12,
max_position_embeddings = 512,
model_type = "bert",
num_attention_heads = 16,
num_hidden_layers = 24,
output_past = True,
pad_token_id = 0,
pooler_fc_size = 768,
pooler_num_attention_heads = 12,
pooler_num_fc_layers = 3,
pooler_size_per_head = 128,
pooler_type = "first_token_transform",
position_embedding_type = "absolute",
torch_dtype = "float32",
transformers_version = "4.37.2",
type_vocab_size = 2,
use_cache = True,
vocab_size = 47020
)
super().__init__(config, add_pooling_layer=False)
self.eval()
def forward(self, input_ids, attention_mask, clip_skip=1):
input_shape = input_ids.size()
batch_size, seq_length = input_shape
device = input_ids.device
past_key_values_length = 0
if attention_mask is None:
attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)
extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
embedding_output = self.embeddings(
input_ids=input_ids,
position_ids=None,
token_type_ids=None,
inputs_embeds=None,
past_key_values_length=0,
)
encoder_outputs = self.encoder(
embedding_output,
attention_mask=extended_attention_mask,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_values=None,
use_cache=False,
output_attentions=False,
output_hidden_states=True,
return_dict=True,
)
all_hidden_states = encoder_outputs.hidden_states
prompt_emb = all_hidden_states[-clip_skip]
if clip_skip > 1:
mean, std = all_hidden_states[-1].mean(), all_hidden_states[-1].std()
prompt_emb = (prompt_emb - prompt_emb.mean()) / prompt_emb.std() * std + mean
return prompt_emb
def state_dict_converter(self):
return HunyuanDiTCLIPTextEncoderStateDictConverter()
class HunyuanDiTT5TextEncoder(T5EncoderModel):
def __init__(self):
config = T5Config(
_name_or_path = "../HunyuanDiT/t2i/mt5",
architectures = ["MT5ForConditionalGeneration"],
classifier_dropout = 0.0,
d_ff = 5120,
d_kv = 64,
d_model = 2048,
decoder_start_token_id = 0,
dense_act_fn = "gelu_new",
dropout_rate = 0.1,
eos_token_id = 1,
feed_forward_proj = "gated-gelu",
initializer_factor = 1.0,
is_encoder_decoder = True,
is_gated_act = True,
layer_norm_epsilon = 1e-06,
model_type = "t5",
num_decoder_layers = 24,
num_heads = 32,
num_layers = 24,
output_past = True,
pad_token_id = 0,
relative_attention_max_distance = 128,
relative_attention_num_buckets = 32,
tie_word_embeddings = False,
tokenizer_class = "T5Tokenizer",
transformers_version = "4.37.2",
use_cache = True,
vocab_size = 250112
)
super().__init__(config)
self.eval()
def forward(self, input_ids, attention_mask, clip_skip=1):
outputs = super().forward(
input_ids=input_ids,
attention_mask=attention_mask,
output_hidden_states=True,
)
prompt_emb = outputs.hidden_states[-clip_skip]
if clip_skip > 1:
mean, std = outputs.hidden_states[-1].mean(), outputs.hidden_states[-1].std()
prompt_emb = (prompt_emb - prompt_emb.mean()) / prompt_emb.std() * std + mean
return prompt_emb
def state_dict_converter(self):
return HunyuanDiTT5TextEncoderStateDictConverter()
class HunyuanDiTCLIPTextEncoderStateDictConverter():
def __init__(self):
pass
def from_diffusers(self, state_dict):
state_dict_ = {name[5:]: param for name, param in state_dict.items() if name.startswith("bert.")}
return state_dict_
def from_civitai(self, state_dict):
return self.from_diffusers(state_dict)
class HunyuanDiTT5TextEncoderStateDictConverter():
def __init__(self):
pass
def from_diffusers(self, state_dict):
state_dict_ = {name: param for name, param in state_dict.items() if name.startswith("encoder.")}
state_dict_["shared.weight"] = state_dict["shared.weight"]
return state_dict_
def from_civitai(self, state_dict):
return self.from_diffusers(state_dict)


@@ -0,0 +1,636 @@
import math
from typing import Dict, List, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from ..core.attention import attention_forward
from ..core.gradient import gradient_checkpoint_forward
def get_timestep_embedding(
timesteps: torch.Tensor,
embedding_dim: int,
flip_sin_to_cos: bool = False,
downscale_freq_shift: float = 1,
scale: float = 1,
max_period: int = 10000,
) -> torch.Tensor:
assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
half_dim = embedding_dim // 2
exponent = -math.log(max_period) * torch.arange(
start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
)
exponent = exponent / (half_dim - downscale_freq_shift)
emb = torch.exp(exponent)
emb = timesteps[:, None].float() * emb[None, :]
emb = scale * emb
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
if flip_sin_to_cos:
emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
if embedding_dim % 2 == 1:
emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
return emb
class Timesteps(nn.Module):
def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float, scale: int = 1):
super().__init__()
self.num_channels = num_channels
self.flip_sin_to_cos = flip_sin_to_cos
self.downscale_freq_shift = downscale_freq_shift
self.scale = scale
def forward(self, timesteps: torch.Tensor) -> torch.Tensor:
return get_timestep_embedding(
timesteps,
self.num_channels,
flip_sin_to_cos=self.flip_sin_to_cos,
downscale_freq_shift=self.downscale_freq_shift,
scale=self.scale,
)
class TimestepEmbedding(nn.Module):
def __init__(
self,
in_channels: int,
time_embed_dim: int,
act_fn: str = "silu",
out_dim: int = None,
post_act_fn: Optional[str] = None,
cond_proj_dim=None,
sample_proj_bias=True,
):
super().__init__()
self.linear_1 = nn.Linear(in_channels, time_embed_dim, sample_proj_bias)
if cond_proj_dim is not None:
self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False)
else:
self.cond_proj = None
self.act = nn.SiLU()
time_embed_dim_out = out_dim if out_dim is not None else time_embed_dim
self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out, sample_proj_bias)
self.post_act = nn.SiLU() if post_act_fn == "silu" else None
def forward(self, sample, condition=None):
if condition is not None:
sample = sample + self.cond_proj(condition)
sample = self.linear_1(sample)
if self.act is not None:
sample = self.act(sample)
sample = self.linear_2(sample)
if self.post_act is not None:
sample = self.post_act(sample)
return sample
class PixArtAlphaTextProjection(nn.Module):
def __init__(self, in_features, hidden_size, out_features=None, act_fn="gelu_tanh"):
super().__init__()
if out_features is None:
out_features = hidden_size
self.linear_1 = nn.Linear(in_features=in_features, out_features=hidden_size, bias=True)
if act_fn == "gelu_tanh":
self.act_1 = nn.GELU(approximate="tanh")
elif act_fn == "silu":
self.act_1 = nn.SiLU()
else:
self.act_1 = nn.GELU(approximate="tanh")
self.linear_2 = nn.Linear(in_features=hidden_size, out_features=out_features, bias=True)
def forward(self, caption):
hidden_states = self.linear_1(caption)
hidden_states = self.act_1(hidden_states)
hidden_states = self.linear_2(hidden_states)
return hidden_states
class GELU(nn.Module):
def __init__(self, dim_in: int, dim_out: int, approximate: str = "none", bias: bool = True):
super().__init__()
self.proj = nn.Linear(dim_in, dim_out, bias=bias)
self.approximate = approximate
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.proj(hidden_states)
hidden_states = F.gelu(hidden_states, approximate=self.approximate)
return hidden_states
class FeedForward(nn.Module):
def __init__(
self,
dim: int,
dim_out: Optional[int] = None,
mult: int = 4,
dropout: float = 0.0,
activation_fn: str = "geglu",
final_dropout: bool = False,
inner_dim=None,
bias: bool = True,
):
super().__init__()
if inner_dim is None:
inner_dim = int(dim * mult)
dim_out = dim_out if dim_out is not None else dim
# Build activation + projection matching diffusers pattern
if activation_fn == "gelu":
act_fn = GELU(dim, inner_dim, bias=bias)
elif activation_fn == "gelu-approximate":
act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias)
else:
act_fn = GELU(dim, inner_dim, bias=bias)
self.net = nn.ModuleList([])
self.net.append(act_fn)
self.net.append(nn.Dropout(dropout))
self.net.append(nn.Linear(inner_dim, dim_out, bias=bias))
if final_dropout:
self.net.append(nn.Dropout(dropout))
def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor:
for module in self.net:
hidden_states = module(hidden_states)
return hidden_states
def _to_tuple(x, dim=2):
if isinstance(x, int):
return (x,) * dim
elif len(x) == dim:
return x
else:
raise ValueError(f"Expected length {dim} or int, but got {x}")
def get_meshgrid_nd(start, *args, dim=2):
if len(args) == 0:
num = _to_tuple(start, dim=dim)
start = (0,) * dim
stop = num
elif len(args) == 1:
start = _to_tuple(start, dim=dim)
stop = _to_tuple(args[0], dim=dim)
num = [stop[i] - start[i] for i in range(dim)]
elif len(args) == 2:
start = _to_tuple(start, dim=dim)
stop = _to_tuple(args[0], dim=dim)
num = _to_tuple(args[1], dim=dim)
else:
raise ValueError(f"len(args) should be 0, 1 or 2, but got {len(args)}")
axis_grid = []
for i in range(dim):
a, b, n = start[i], stop[i], num[i]
g = torch.linspace(a, b, n + 1, dtype=torch.float32)[:n]
axis_grid.append(g)
grid = torch.meshgrid(*axis_grid, indexing="ij")
grid = torch.stack(grid, dim=0)
return grid
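Concretely, the helper builds `num` evenly spaced samples over `[start, stop)` per axis (via `linspace(a, b, n + 1)[:n]`) and stacks the coordinate grids into a `(dim, *sizes)` tensor; a 2-axis sketch:

```python
import torch

# Sketch of the n-d meshgrid helper for start=(0, 0), stop=(2, 3), num=(2, 3).
grid = torch.stack(torch.meshgrid(
    torch.linspace(0, 2, 3)[:2],  # axis 0: 2 samples of [0, 2) -> [0, 1]
    torch.linspace(0, 3, 4)[:3],  # axis 1: 3 samples of [0, 3) -> [0, 1, 2]
    indexing="ij",
), dim=0)
print(grid.shape)  # torch.Size([2, 2, 3])
```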
def reshape_for_broadcast(freqs_cis, x, head_first=False):
ndim = x.ndim
assert ndim > 1
if isinstance(freqs_cis, tuple):
if head_first:
assert freqs_cis[0].shape == (x.shape[-2], x.shape[-1])
shape = [d if i == ndim - 2 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
else:
assert freqs_cis[0].shape == (x.shape[1], x.shape[-1])
shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
return freqs_cis[0].view(*shape), freqs_cis[1].view(*shape)
else:
if head_first:
assert freqs_cis.shape == (x.shape[-2], x.shape[-1])
shape = [d if i == ndim - 2 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
else:
assert freqs_cis.shape == (x.shape[1], x.shape[-1])
shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
return freqs_cis.view(*shape)
def rotate_half(x):
x_real, x_imag = x.float().reshape(*x.shape[:-1], -1, 2).unbind(-1)
return torch.stack([-x_imag, x_real], dim=-1).flatten(3)
def apply_rotary_emb(xq, xk, freqs_cis, head_first=False):
cos, sin = reshape_for_broadcast(freqs_cis, xq, head_first)
cos, sin = cos.to(xq.device), sin.to(xq.device)
xq_out = (xq.float() * cos + rotate_half(xq.float()) * sin).type_as(xq)
xk_out = (xk.float() * cos + rotate_half(xk.float()) * sin).type_as(xk)
return xq_out, xk_out
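A self-contained sketch of the rotary embedding above (interleaved cos/sin layout, toy sizes): each channel pair is rotated by a position-dependent angle, which mixes positional information into q/k while leaving vector norms unchanged.

```python
import torch

# Sketch of RoPE on a single head; `flatten(-2)` replaces the 4D-specific
# `flatten(3)` used above so the helper works for any rank.
def rotate_half(x):
    x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
    return torch.stack([-x_imag, x_real], dim=-1).flatten(-2)

B, L, H, D = 1, 4, 1, 8
q = torch.randn(B, L, H, D)
pos = torch.arange(L).float()
freqs = 1.0 / (10000 ** (torch.arange(0, D, 2).float() / D))
angles = torch.outer(pos, freqs)                            # (L, D/2)
cos = angles.cos().repeat_interleave(2, dim=-1).view(1, L, 1, D)
sin = angles.sin().repeat_interleave(2, dim=-1).view(1, L, 1, D)

q_rot = q * cos + rotate_half(q) * sin                      # pairwise 2D rotation
print(torch.allclose(q.norm(dim=-1), q_rot.norm(dim=-1), atol=1e-5))  # True
```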
def get_1d_rotary_pos_embed(dim, pos, theta=10000.0, use_real=False, theta_rescale_factor=1.0, interpolation_factor=1.0):
if isinstance(pos, int):
pos = torch.arange(pos).float()
if theta_rescale_factor != 1.0:
theta *= theta_rescale_factor ** (dim / (dim - 2))
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
freqs = torch.outer(pos * interpolation_factor, freqs)
if use_real:
freqs_cos = freqs.cos().repeat_interleave(2, dim=1)
freqs_sin = freqs.sin().repeat_interleave(2, dim=1)
return freqs_cos, freqs_sin
else:
return torch.polar(torch.ones_like(freqs), freqs)
def get_nd_rotary_pos_embed(rope_dim_list, start, *args, theta=10000.0, use_real=False,
txt_rope_size=None, theta_rescale_factor=1.0, interpolation_factor=1.0):
grid = get_meshgrid_nd(start, *args, dim=len(rope_dim_list))
if isinstance(theta_rescale_factor, (int, float)):
theta_rescale_factor = [theta_rescale_factor] * len(rope_dim_list)
elif isinstance(theta_rescale_factor, list) and len(theta_rescale_factor) == 1:
theta_rescale_factor = [theta_rescale_factor[0]] * len(rope_dim_list)
if isinstance(interpolation_factor, (int, float)):
interpolation_factor = [interpolation_factor] * len(rope_dim_list)
elif isinstance(interpolation_factor, list) and len(interpolation_factor) == 1:
interpolation_factor = [interpolation_factor[0]] * len(rope_dim_list)
embs = []
for i in range(len(rope_dim_list)):
emb = get_1d_rotary_pos_embed(
rope_dim_list[i], grid[i].reshape(-1), theta,
use_real=use_real, theta_rescale_factor=theta_rescale_factor[i],
interpolation_factor=interpolation_factor[i],
)
embs.append(emb)
if use_real:
vis_emb = (torch.cat([emb[0] for emb in embs], dim=1), torch.cat([emb[1] for emb in embs], dim=1))
else:
vis_emb = torch.cat(embs, dim=1)
if txt_rope_size is not None:
embs_txt = []
vis_max_ids = grid.view(-1).max().item()
grid_txt = torch.arange(txt_rope_size) + vis_max_ids + 1
for i in range(len(rope_dim_list)):
emb = get_1d_rotary_pos_embed(
rope_dim_list[i], grid_txt, theta,
use_real=use_real, theta_rescale_factor=theta_rescale_factor[i],
interpolation_factor=interpolation_factor[i],
)
embs_txt.append(emb)
if use_real:
txt_emb = (torch.cat([emb[0] for emb in embs_txt], dim=1), torch.cat([emb[1] for emb in embs_txt], dim=1))
else:
txt_emb = torch.cat(embs_txt, dim=1)
else:
txt_emb = None
return vis_emb, txt_emb
class ModulateWan(nn.Module):
def __init__(self, hidden_size: int, factor: int, dtype=None, device=None):
super().__init__()
self.factor = factor
factory_kwargs = {"dtype": dtype, "device": device}
self.modulate_table = nn.Parameter(
torch.zeros(1, factor, hidden_size, **factory_kwargs) / hidden_size**0.5,
requires_grad=True
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
if len(x.shape) != 3:
x = x.unsqueeze(1)
return [o.squeeze(1) for o in (self.modulate_table + x).chunk(self.factor, dim=1)]
def modulate(x, shift=None, scale=None):
if scale is None and shift is None:
return x
elif shift is None:
return x * (1 + scale.unsqueeze(1))
elif scale is None:
return x + shift.unsqueeze(1)
else:
return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
def apply_gate(x, gate=None, tanh=False):
if gate is None:
return x
if tanh:
return x * gate.unsqueeze(1).tanh()
else:
return x * gate.unsqueeze(1)
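The two helpers above implement the adaLN-style conditioning used throughout the blocks: per-sample shift/scale vectors are broadcast across the sequence dimension, and a per-sample gate scales the residual branch (optionally through tanh). A small sketch with equivalent, slightly condensed logic:

```python
import torch

# Sketch of adaLN-style modulation and gating (toy sizes).
def modulate(x, shift=None, scale=None):
    if scale is not None:
        x = x * (1 + scale.unsqueeze(1))  # broadcast (B, D) over sequence dim
    if shift is not None:
        x = x + shift.unsqueeze(1)
    return x

def apply_gate(x, gate, tanh=False):
    gate = gate.unsqueeze(1)
    return x * (gate.tanh() if tanh else gate)

B, L, D = 2, 3, 4
x = torch.ones(B, L, D)
y = modulate(x, shift=torch.full((B, D), 0.5), scale=torch.full((B, D), 1.0))
print(y[0, 0, 0].item())                                 # 1 * (1 + 1) + 0.5 = 2.5
print(apply_gate(y, torch.zeros(B, D))[0, 0, 0].item())  # gate of 0 zeroes the branch
```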
def load_modulation(modulate_type: str, hidden_size: int, factor: int, dtype=None, device=None):
factory_kwargs = {"dtype": dtype, "device": device}
if modulate_type == 'wanx':
return ModulateWan(hidden_size, factor, **factory_kwargs)
raise ValueError(f"Unknown modulation type: {modulate_type}. Only 'wanx' is supported.")
class RMSNorm(nn.Module):
def __init__(self, dim: int, elementwise_affine=True, eps: float = 1e-6, device=None, dtype=None):
factory_kwargs = {"device": device, "dtype": dtype}
super().__init__()
self.eps = eps
if elementwise_affine:
self.weight = nn.Parameter(torch.ones(dim, **factory_kwargs))
def _norm(self, x):
return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
def forward(self, x):
output = self._norm(x.float()).type_as(x)
if hasattr(self, "weight"):
output = output * self.weight
return output
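A functional sketch of the same normalization: unlike LayerNorm, no mean is subtracted, so the output simply has (approximately) unit root-mean-square along the last dimension before the learned scale is applied.

```python
import torch

# Functional sketch of RMSNorm: normalize by the RMS of the last dim, no
# mean subtraction, then apply an elementwise learned weight.
def rms_norm(x, weight, eps=1e-6):
    rms = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
    return x * rms * weight

x = torch.randn(2, 5, 8)
y = rms_norm(x, torch.ones(8))
print(torch.allclose(y.pow(2).mean(-1), torch.ones(2, 5), atol=1e-3))  # unit RMS
```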
class MMDoubleStreamBlock(nn.Module):
"""
A multimodal DiT block with separate modulation for text and image/video tokens.
See SD3 (https://arxiv.org/abs/2403.03206) and Flux.1
(https://github.com/black-forest-labs/flux) for more details.
"""
def __init__(
self,
hidden_size: int,
heads_num: int,
mlp_width_ratio: float,
mlp_act_type: str = "gelu_tanh",
dtype: Optional[torch.dtype] = None,
device: Optional[torch.device] = None,
dit_modulation_type: Optional[str] = "wanx",
):
factory_kwargs = {"device": device, "dtype": dtype}
super().__init__()
self.dit_modulation_type = dit_modulation_type
self.heads_num = heads_num
head_dim = hidden_size // heads_num
mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
self.img_mod = load_modulation(
modulate_type=self.dit_modulation_type,
hidden_size=hidden_size, factor=6, **factory_kwargs,
)
self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
self.img_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=True, **factory_kwargs)
self.img_attn_q_norm = RMSNorm(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
self.img_attn_k_norm = RMSNorm(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
self.img_attn_proj = nn.Linear(hidden_size, hidden_size, bias=True, **factory_kwargs)
self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
self.img_mlp = FeedForward(hidden_size, inner_dim=mlp_hidden_dim, activation_fn="gelu-approximate")
self.txt_mod = load_modulation(
modulate_type=self.dit_modulation_type,
hidden_size=hidden_size, factor=6, **factory_kwargs,
)
self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
self.txt_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=True, **factory_kwargs)
self.txt_attn_q_norm = RMSNorm(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
self.txt_attn_k_norm = RMSNorm(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
self.txt_attn_proj = nn.Linear(hidden_size, hidden_size, bias=True, **factory_kwargs)
self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
self.txt_mlp = FeedForward(hidden_size, inner_dim=mlp_hidden_dim, activation_fn="gelu-approximate")
def forward(
self,
img: torch.Tensor,
txt: torch.Tensor,
vec: torch.Tensor,
vis_freqs_cis: tuple = None,
txt_freqs_cis: tuple = None,
attn_kwargs: Optional[dict] = {},
) -> Tuple[torch.Tensor, torch.Tensor]:
(
img_mod1_shift, img_mod1_scale, img_mod1_gate,
img_mod2_shift, img_mod2_scale, img_mod2_gate,
) = self.img_mod(vec)
(
txt_mod1_shift, txt_mod1_scale, txt_mod1_gate,
txt_mod2_shift, txt_mod2_scale, txt_mod2_gate,
) = self.txt_mod(vec)
img_modulated = self.img_norm1(img)
img_modulated = modulate(img_modulated, shift=img_mod1_shift, scale=img_mod1_scale)
img_qkv = self.img_attn_qkv(img_modulated)
img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
img_q = self.img_attn_q_norm(img_q).to(img_v)
img_k = self.img_attn_k_norm(img_k).to(img_v)
if vis_freqs_cis is not None:
img_qq, img_kk = apply_rotary_emb(img_q, img_k, vis_freqs_cis, head_first=False)
img_q, img_k = img_qq, img_kk
txt_modulated = self.txt_norm1(txt)
txt_modulated = modulate(txt_modulated, shift=txt_mod1_shift, scale=txt_mod1_scale)
txt_qkv = self.txt_attn_qkv(txt_modulated)
txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
txt_q = self.txt_attn_q_norm(txt_q).to(txt_v)
txt_k = self.txt_attn_k_norm(txt_k).to(txt_v)
if txt_freqs_cis is not None:
raise NotImplementedError("RoPE text is not supported for inference")
q = torch.cat((img_q, txt_q), dim=1)
k = torch.cat((img_k, txt_k), dim=1)
v = torch.cat((img_v, txt_v), dim=1)
# Use DiffSynth unified attention
attn_out = attention_forward(
q, k, v,
q_pattern="b s n d", k_pattern="b s n d", v_pattern="b s n d", out_pattern="b s n d",
)
attn_out = attn_out.flatten(2, 3)
img_attn, txt_attn = attn_out[:, : img.shape[1]], attn_out[:, img.shape[1]:]
img = img + apply_gate(self.img_attn_proj(img_attn), gate=img_mod1_gate)
img = img + apply_gate(
self.img_mlp(modulate(self.img_norm2(img), shift=img_mod2_shift, scale=img_mod2_scale)),
gate=img_mod2_gate,
)
txt = txt + apply_gate(self.txt_attn_proj(txt_attn), gate=txt_mod1_gate)
txt = txt + apply_gate(
self.txt_mlp(modulate(self.txt_norm2(txt), shift=txt_mod2_shift, scale=txt_mod2_scale)),
gate=txt_mod2_gate,
)
return img, txt
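The block's core attention step can be sketched without the repo's `attention_forward` wrapper, using `torch.nn.functional.scaled_dot_product_attention` (toy sizes): image and text tokens keep separate QKV projections, are concatenated along the sequence dimension for one joint attention pass, then split back into their own streams.

```python
import torch
import torch.nn.functional as F

# Sketch of double-stream joint attention (toy sizes, "b s n d" layout).
B, Li, Lt, H, D = 1, 6, 4, 2, 8
img_q, img_k, img_v = (torch.randn(B, Li, H, D) for _ in range(3))
txt_q, txt_k, txt_v = (torch.randn(B, Lt, H, D) for _ in range(3))

q = torch.cat([img_q, txt_q], dim=1).transpose(1, 2)  # b s n d -> b n s d
k = torch.cat([img_k, txt_k], dim=1).transpose(1, 2)
v = torch.cat([img_v, txt_v], dim=1).transpose(1, 2)
out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2)

img_attn, txt_attn = out[:, :Li], out[:, Li:]         # split the streams back
print(img_attn.shape, txt_attn.shape)  # torch.Size([1, 6, 2, 8]) torch.Size([1, 4, 2, 8])
```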
class WanTimeTextImageEmbedding(nn.Module):
def __init__(
self,
dim: int,
time_freq_dim: int,
time_proj_dim: int,
text_embed_dim: int,
image_embed_dim: Optional[int] = None,
pos_embed_seq_len: Optional[int] = None,
):
super().__init__()
self.timesteps_proj = Timesteps(num_channels=time_freq_dim, flip_sin_to_cos=True, downscale_freq_shift=0)
self.time_embedder = TimestepEmbedding(in_channels=time_freq_dim, time_embed_dim=dim)
self.act_fn = nn.SiLU()
self.time_proj = nn.Linear(dim, time_proj_dim)
self.text_embedder = PixArtAlphaTextProjection(text_embed_dim, dim, act_fn="gelu_tanh")
def forward(self, timestep: torch.Tensor, encoder_hidden_states: torch.Tensor):
timestep = self.timesteps_proj(timestep)
time_embedder_dtype = next(iter(self.time_embedder.parameters())).dtype
if timestep.dtype != time_embedder_dtype and time_embedder_dtype != torch.int8:
timestep = timestep.to(time_embedder_dtype)
temb = self.time_embedder(timestep).type_as(encoder_hidden_states)
timestep_proj = self.time_proj(self.act_fn(temb))
encoder_hidden_states = self.text_embedder(encoder_hidden_states)
return temb, timestep_proj, encoder_hidden_states
class JoyAIImageDiT(nn.Module):
_supports_gradient_checkpointing = True
def __init__(
self,
patch_size: list = [1, 2, 2],
in_channels: int = 16,
out_channels: int = 16,
hidden_size: int = 4096,
heads_num: int = 32,
text_states_dim: int = 4096,
mlp_width_ratio: float = 4.0,
mm_double_blocks_depth: int = 40,
rope_dim_list: List[int] = [16, 56, 56],
rope_type: str = 'rope',
dtype: Optional[torch.dtype] = None,
device: Optional[torch.device] = None,
dit_modulation_type: str = "wanx",
theta: int = 10000,
):
super().__init__()
self.out_channels = out_channels or in_channels
self.patch_size = patch_size
self.hidden_size = hidden_size
self.heads_num = heads_num
self.rope_dim_list = rope_dim_list
self.dit_modulation_type = dit_modulation_type
self.mm_double_blocks_depth = mm_double_blocks_depth
self.rope_type = rope_type
self.theta = theta
factory_kwargs = {"device": device, "dtype": dtype}
if hidden_size % heads_num != 0:
raise ValueError(f"Hidden size {hidden_size} must be divisible by heads_num {heads_num}")
self.img_in = nn.Conv3d(in_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
self.condition_embedder = WanTimeTextImageEmbedding(
dim=hidden_size,
time_freq_dim=256,
time_proj_dim=hidden_size * 6,
text_embed_dim=text_states_dim,
)
self.double_blocks = nn.ModuleList([
MMDoubleStreamBlock(
self.hidden_size, self.heads_num,
mlp_width_ratio=mlp_width_ratio,
dit_modulation_type=self.dit_modulation_type,
**factory_kwargs,
)
for _ in range(mm_double_blocks_depth)
])
self.norm_out = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
self.proj_out = nn.Linear(hidden_size, self.out_channels * math.prod(patch_size), **factory_kwargs)
def get_rotary_pos_embed(self, vis_rope_size, txt_rope_size=None):
target_ndim = 3
if len(vis_rope_size) != target_ndim:
vis_rope_size = [1] * (target_ndim - len(vis_rope_size)) + vis_rope_size
head_dim = self.hidden_size // self.heads_num
rope_dim_list = self.rope_dim_list
if rope_dim_list is None:
rope_dim_list = [head_dim // target_ndim for _ in range(target_ndim)]
assert sum(rope_dim_list) == head_dim
vis_freqs, txt_freqs = get_nd_rotary_pos_embed(
rope_dim_list, vis_rope_size,
txt_rope_size=txt_rope_size if self.rope_type == 'mrope' else None,
theta=self.theta, use_real=True, theta_rescale_factor=1,
)
return vis_freqs, txt_freqs
def forward(
self,
hidden_states: torch.Tensor,
timestep: torch.Tensor,
encoder_hidden_states: torch.Tensor = None,
encoder_hidden_states_mask: torch.Tensor = None,
return_dict: bool = True,
use_gradient_checkpointing: bool = False,
use_gradient_checkpointing_offload: bool = False,
) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
is_multi_item = (len(hidden_states.shape) == 6)
num_items = 0
if is_multi_item:
num_items = hidden_states.shape[1]
if num_items > 1:
assert self.patch_size[0] == 1, "For multi-item input, patch_size[0] must be 1"
hidden_states = torch.cat([hidden_states[:, -1:], hidden_states[:, :-1]], dim=1)
hidden_states = rearrange(hidden_states, 'b n c t h w -> b c (n t) h w')
batch_size, _, ot, oh, ow = hidden_states.shape
tt, th, tw = ot // self.patch_size[0], oh // self.patch_size[1], ow // self.patch_size[2]
if encoder_hidden_states_mask is None:
encoder_hidden_states_mask = torch.ones(
(encoder_hidden_states.shape[0], encoder_hidden_states.shape[1]),
dtype=torch.bool,
).to(encoder_hidden_states.device)
img = self.img_in(hidden_states).flatten(2).transpose(1, 2)
temb, vec, txt = self.condition_embedder(timestep, encoder_hidden_states)
if vec.shape[-1] > self.hidden_size:
vec = vec.unflatten(1, (6, -1))
txt_seq_len = txt.shape[1]
img_seq_len = img.shape[1]
vis_freqs_cis, txt_freqs_cis = self.get_rotary_pos_embed(
vis_rope_size=(tt, th, tw),
txt_rope_size=txt_seq_len if self.rope_type == 'mrope' else None,
)
for block in self.double_blocks:
img, txt = gradient_checkpoint_forward(
block,
use_gradient_checkpointing=use_gradient_checkpointing,
use_gradient_checkpointing_offload=use_gradient_checkpointing_offload,
img=img, txt=txt, vec=vec,
vis_freqs_cis=vis_freqs_cis, txt_freqs_cis=txt_freqs_cis,
attn_kwargs={},
)
img_len = img.shape[1]
x = torch.cat((img, txt), 1)
img = x[:, :img_len, ...]
img = self.proj_out(self.norm_out(img))
img = self.unpatchify(img, tt, th, tw)
if is_multi_item:
img = rearrange(img, 'b c (n t) h w -> b n c t h w', n=num_items)
if num_items > 1:
img = torch.cat([img[:, 1:], img[:, :1]], dim=1)
return img
def unpatchify(self, x, t, h, w):
c = self.out_channels
pt, ph, pw = self.patch_size
assert t * h * w == x.shape[1]
x = x.reshape(shape=(x.shape[0], t, h, w, pt, ph, pw, c))
x = torch.einsum("nthwopqc->nctohpwq", x)
return x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw))
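The `unpatchify` above inverts the Conv3d patch embedding by folding per-patch features back into a dense video tensor. A NumPy sketch of the same reshape/einsum (hypothetical sizes, not the model's configured dimensions) shows the shape round-trip:

```python
import numpy as np

def unpatchify(x, t, h, w, patch_size, out_channels):
    # Same reshape/einsum as above: fold per-patch features back into
    # a dense (B, C, T*pt, H*ph, W*pw) video tensor.
    pt, ph, pw = patch_size
    c = out_channels
    assert t * h * w == x.shape[1]
    x = x.reshape(x.shape[0], t, h, w, pt, ph, pw, c)
    x = np.einsum("nthwopqc->nctohpwq", x)
    return x.reshape(x.shape[0], c, t * pt, h * ph, w * pw)

# Hypothetical sizes for illustration.
x = np.random.rand(2, 3 * 4 * 5, 1 * 2 * 2 * 16)
out = unpatchify(x, 3, 4, 5, (1, 2, 2), 16)
print(out.shape)  # (2, 16, 3, 8, 10)
```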


@@ -0,0 +1,82 @@
import torch
from typing import Optional
class JoyAIImageTextEncoder(torch.nn.Module):
def __init__(self):
super().__init__()
from transformers import Qwen3VLConfig, Qwen3VLForConditionalGeneration
config = Qwen3VLConfig(
text_config={
"attention_bias": False,
"attention_dropout": 0.0,
"bos_token_id": 151643,
"eos_token_id": 151645,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 4096,
"initializer_range": 0.02,
"intermediate_size": 12288,
"max_position_embeddings": 262144,
"model_type": "qwen3_vl_text",
"num_attention_heads": 32,
"num_hidden_layers": 36,
"num_key_value_heads": 8,
"rms_norm_eps": 1e-6,
"rope_scaling": {
"mrope_interleaved": True,
"mrope_section": [24, 20, 20],
"rope_type": "default",
},
"rope_theta": 5000000,
"use_cache": True,
"vocab_size": 151936,
},
vision_config={
"deepstack_visual_indexes": [8, 16, 24],
"depth": 27,
"hidden_act": "gelu_pytorch_tanh",
"hidden_size": 1152,
"in_channels": 3,
"initializer_range": 0.02,
"intermediate_size": 4304,
"model_type": "qwen3_vl",
"num_heads": 16,
"num_position_embeddings": 2304,
"out_hidden_size": 4096,
"patch_size": 16,
"spatial_merge_size": 2,
"temporal_patch_size": 2,
},
image_token_id=151655,
video_token_id=151656,
vision_start_token_id=151652,
vision_end_token_id=151653,
tie_word_embeddings=False,
)
self.model = Qwen3VLForConditionalGeneration(config)
self.config = config
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
pixel_values: Optional[torch.Tensor] = None,
image_grid_thw: Optional[torch.LongTensor] = None,
**kwargs,
):
pre_norm_output = [None]
def hook_fn(module, args, output):
pre_norm_output[0] = args[0]  # capture hidden states entering the final norm
hook_handle = self.model.model.language_model.norm.register_forward_hook(hook_fn)
_ = self.model(
input_ids=input_ids,
pixel_values=pixel_values,
image_grid_thw=image_grid_thw,
attention_mask=attention_mask,
output_hidden_states=True,
**kwargs,
)
hook_handle.remove()  # avoid accumulating hooks across repeated forward calls
return pre_norm_output[0]


@@ -0,0 +1,902 @@
from typing import List, Optional, Tuple
import math
import torch
import torch.nn as nn
import torch.amp as amp
import numpy as np
import torch.nn.functional as F
from einops import rearrange, repeat
from .wan_video_dit import flash_attention
from ..core.device.npu_compatible_device import get_device_type
from ..core.gradient import gradient_checkpoint_forward
class RMSNorm_FP32(torch.nn.Module):
def __init__(self, dim: int, eps: float):
super().__init__()
self.eps = eps
self.weight = nn.Parameter(torch.ones(dim))
def _norm(self, x):
return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
def forward(self, x):
output = self._norm(x.float()).type_as(x)
return output * self.weight
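`RMSNorm_FP32` scales by the inverse root-mean-square of the last dimension, with no mean subtraction (unlike LayerNorm). A quick NumPy check of the normalization, assuming a unit weight:

```python
import numpy as np

def rms_norm(x, eps=1e-6):
    # Same math as RMSNorm_FP32 with a unit weight: scale by the inverse
    # root-mean-square of the last dimension, no mean subtraction.
    return x / np.sqrt((x ** 2).mean(-1, keepdims=True) + eps)

x = np.random.default_rng(1).standard_normal((4, 16))
y = rms_norm(x)
# After normalization, the per-row mean square is ~1.
print(np.allclose((y ** 2).mean(-1), 1.0, atol=1e-4))  # True
```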
def broadcat(tensors, dim=-1):
num_tensors = len(tensors)
shape_lens = set(list(map(lambda t: len(t.shape), tensors)))
assert len(shape_lens) == 1, "tensors must all have the same number of dimensions"
shape_len = list(shape_lens)[0]
dim = (dim + shape_len) if dim < 0 else dim
dims = list(zip(*map(lambda t: list(t.shape), tensors)))
expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim]
assert all(
[*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]
), "invalid dimensions for broadcastable concatentation"
max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims))
expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims))
expanded_dims.insert(dim, (dim, dims[dim]))
expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims)))
tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes)))
return torch.cat(tensors, dim=dim)
def rotate_half(x):
x = rearrange(x, "... (d r) -> ... d r", r=2)
x1, x2 = x.unbind(dim=-1)
x = torch.stack((-x2, x1), dim=-1)
return rearrange(x, "... d r -> ... (d r)")
class RotaryPositionalEmbedding(nn.Module):
def __init__(self,
head_dim,
cp_split_hw=None
):
"""Rotary positional embedding for 3D
Reference : https://blog.eleuther.ai/rotary-embeddings/
Paper: https://arxiv.org/pdf/2104.09864.pdf
Args:
dim: Dimension of embedding
base: Base value for exponential
"""
super().__init__()
self.head_dim = head_dim
assert self.head_dim % 8 == 0, 'Dim must be a multiple of 8 for 3D RoPE.'
self.cp_split_hw = cp_split_hw
# We assume the longest side of the grid will not exceed 512, i.e., 512 * 8 = 4096 input pixels.
self.base = 10000
self.freqs_dict = {}
def register_grid_size(self, grid_size):
if grid_size not in self.freqs_dict:
self.freqs_dict.update({
grid_size: self.precompute_freqs_cis_3d(grid_size)
})
def precompute_freqs_cis_3d(self, grid_size):
num_frames, height, width = grid_size
dim_t = self.head_dim - 4 * (self.head_dim // 6)
dim_h = 2 * (self.head_dim // 6)
dim_w = 2 * (self.head_dim // 6)
freqs_t = 1.0 / (self.base ** (torch.arange(0, dim_t, 2)[: (dim_t // 2)].float() / dim_t))
freqs_h = 1.0 / (self.base ** (torch.arange(0, dim_h, 2)[: (dim_h // 2)].float() / dim_h))
freqs_w = 1.0 / (self.base ** (torch.arange(0, dim_w, 2)[: (dim_w // 2)].float() / dim_w))
grid_t = np.linspace(0, num_frames, num_frames, endpoint=False, dtype=np.float32)
grid_h = np.linspace(0, height, height, endpoint=False, dtype=np.float32)
grid_w = np.linspace(0, width, width, endpoint=False, dtype=np.float32)
grid_t = torch.from_numpy(grid_t).float()
grid_h = torch.from_numpy(grid_h).float()
grid_w = torch.from_numpy(grid_w).float()
freqs_t = torch.einsum("..., f -> ... f", grid_t, freqs_t)
freqs_h = torch.einsum("..., f -> ... f", grid_h, freqs_h)
freqs_w = torch.einsum("..., f -> ... f", grid_w, freqs_w)
freqs_t = repeat(freqs_t, "... n -> ... (n r)", r=2)
freqs_h = repeat(freqs_h, "... n -> ... (n r)", r=2)
freqs_w = repeat(freqs_w, "... n -> ... (n r)", r=2)
freqs = broadcat((freqs_t[:, None, None, :], freqs_h[None, :, None, :], freqs_w[None, None, :, :]), dim=-1)
# (T H W D)
freqs = rearrange(freqs, "T H W D -> (T H W) D")
# if self.cp_split_hw[0] * self.cp_split_hw[1] > 1:
# with torch.no_grad():
# freqs = rearrange(freqs, "(T H W) D -> T H W D", T=num_frames, H=height, W=width)
# freqs = context_parallel_util.split_cp_2d(freqs, seq_dim_hw=(1, 2), split_hw=self.cp_split_hw)
# freqs = rearrange(freqs, "T H W D -> (T H W) D")
return freqs
def forward(self, q, k, grid_size):
"""3D RoPE.
Args:
query: [B, head, seq, head_dim]
key: [B, head, seq, head_dim]
Returns:
query and key with the same shape as input.
"""
if grid_size not in self.freqs_dict:
self.register_grid_size(grid_size)
freqs_cis = self.freqs_dict[grid_size].to(q.device)
q_, k_ = q.float(), k.float()
freqs_cis = freqs_cis.float().to(q.device)
cos, sin = freqs_cis.cos(), freqs_cis.sin()
cos, sin = rearrange(cos, 'n d -> 1 1 n d'), rearrange(sin, 'n d -> 1 1 n d')
q_ = (q_ * cos) + (rotate_half(q_) * sin)
k_ = (k_ * cos) + (rotate_half(k_) * sin)
return q_.type_as(q), k_.type_as(k)
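`precompute_freqs_cis_3d` splits `head_dim` unevenly across the three axes: each spatial axis gets `2 * (head_dim // 6)` dimensions and the temporal axis takes the remainder, so the parts always sum to `head_dim`. A sketch of the split:

```python
def rope3d_dim_split(head_dim):
    # Same arithmetic as precompute_freqs_cis_3d: spatial axes get
    # 2 * (head_dim // 6) dims each, time gets the remainder.
    dim_h = dim_w = 2 * (head_dim // 6)
    dim_t = head_dim - dim_h - dim_w
    return dim_t, dim_h, dim_w

print(rope3d_dim_split(128))  # (44, 42, 42)
```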
class Attention(nn.Module):
def __init__(
self,
dim: int,
num_heads: int,
enable_flashattn3: bool = False,
enable_flashattn2: bool = False,
enable_xformers: bool = False,
enable_bsa: bool = False,
bsa_params: dict = None,
cp_split_hw: Optional[List[int]] = None
) -> None:
super().__init__()
assert dim % num_heads == 0, "dim should be divisible by num_heads"
self.dim = dim
self.num_heads = num_heads
self.head_dim = dim // num_heads
self.scale = self.head_dim**-0.5
self.enable_flashattn3 = enable_flashattn3
self.enable_flashattn2 = enable_flashattn2
self.enable_xformers = enable_xformers
self.enable_bsa = enable_bsa
self.bsa_params = bsa_params
self.cp_split_hw = cp_split_hw
self.qkv = nn.Linear(dim, dim * 3, bias=True)
self.q_norm = RMSNorm_FP32(self.head_dim, eps=1e-6)
self.k_norm = RMSNorm_FP32(self.head_dim, eps=1e-6)
self.proj = nn.Linear(dim, dim)
self.rope_3d = RotaryPositionalEmbedding(
self.head_dim,
cp_split_hw=cp_split_hw
)
def _process_attn(self, q, k, v, shape):
q = rearrange(q, "B H S D -> B S (H D)")
k = rearrange(k, "B H S D -> B S (H D)")
v = rearrange(v, "B H S D -> B S (H D)")
x = flash_attention(q, k, v, num_heads=self.num_heads)
x = rearrange(x, "B S (H D) -> B H S D", H=self.num_heads)
return x
def forward(self, x: torch.Tensor, shape=None, num_cond_latents=None, return_kv=False) -> torch.Tensor:
"""
"""
B, N, C = x.shape
qkv = self.qkv(x)
qkv_shape = (B, N, 3, self.num_heads, self.head_dim)
qkv = qkv.view(qkv_shape).permute((2, 0, 3, 1, 4)) # [3, B, H, N, D]
q, k, v = qkv.unbind(0)
q, k = self.q_norm(q), self.k_norm(k)
if return_kv:
k_cache, v_cache = k.clone(), v.clone()
q, k = self.rope_3d(q, k, shape)
# cond mode
if num_cond_latents is not None and num_cond_latents > 0:
num_cond_latents_thw = num_cond_latents * (N // shape[0])
# process the condition tokens
q_cond = q[:, :, :num_cond_latents_thw].contiguous()
k_cond = k[:, :, :num_cond_latents_thw].contiguous()
v_cond = v[:, :, :num_cond_latents_thw].contiguous()
x_cond = self._process_attn(q_cond, k_cond, v_cond, shape)
# process the noise tokens
q_noise = q[:, :, num_cond_latents_thw:].contiguous()
x_noise = self._process_attn(q_noise, k, v, shape)
# merge x_cond and x_noise
x = torch.cat([x_cond, x_noise], dim=2).contiguous()
else:
x = self._process_attn(q, k, v, shape)
x_output_shape = (B, N, C)
x = x.transpose(1, 2) # [B, H, N, D] --> [B, N, H, D]
x = x.reshape(x_output_shape) # [B, N, H, D] --> [B, N, C]
x = self.proj(x)
if return_kv:
return x, (k_cache, v_cache)
else:
return x
def forward_with_kv_cache(self, x: torch.Tensor, shape=None, num_cond_latents=None, kv_cache=None) -> torch.Tensor:
"""
"""
B, N, C = x.shape
qkv = self.qkv(x)
qkv_shape = (B, N, 3, self.num_heads, self.head_dim)
qkv = qkv.view(qkv_shape).permute((2, 0, 3, 1, 4)) # [3, B, H, N, D]
q, k, v = qkv.unbind(0)
q, k = self.q_norm(q), self.k_norm(k)
T, H, W = shape
k_cache, v_cache = kv_cache
assert k_cache.shape[0] == v_cache.shape[0] and k_cache.shape[0] in [1, B]
if k_cache.shape[0] == 1:
k_cache = k_cache.repeat(B, 1, 1, 1)
v_cache = v_cache.repeat(B, 1, 1, 1)
if num_cond_latents is not None and num_cond_latents > 0:
k_full = torch.cat([k_cache, k], dim=2).contiguous()
v_full = torch.cat([v_cache, v], dim=2).contiguous()
q_padding = torch.cat([torch.empty_like(k_cache), q], dim=2).contiguous()
q_padding, k_full = self.rope_3d(q_padding, k_full, (T + num_cond_latents, H, W))
q = q_padding[:, :, -N:].contiguous()
x = self._process_attn(q, k_full, v_full, shape)
x_output_shape = (B, N, C)
x = x.transpose(1, 2) # [B, H, N, D] --> [B, N, H, D]
x = x.reshape(x_output_shape) # [B, N, H, D] --> [B, N, C]
x = self.proj(x)
return x
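In `forward_with_kv_cache`, cached condition keys/values are prepended to the freshly computed ones along the sequence axis before attention. A shape-only NumPy sketch with hypothetical sizes:

```python
import numpy as np

# Cached condition tokens go in front of the new tokens, matching
# torch.cat([k_cache, k], dim=2) above.
B, H, D = 1, 2, 4
k_cache = np.zeros((B, H, 3, D))  # 3 cached condition-latent tokens
k_new = np.ones((B, H, 5, D))     # 5 freshly computed tokens
k_full = np.concatenate([k_cache, k_new], axis=2)
print(k_full.shape)  # (1, 2, 8, 4)
```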
class MultiHeadCrossAttention(nn.Module):
def __init__(
self,
dim,
num_heads,
enable_flashattn3=False,
enable_flashattn2=False,
enable_xformers=False,
):
super(MultiHeadCrossAttention, self).__init__()
assert dim % num_heads == 0, "d_model must be divisible by num_heads"
self.dim = dim
self.num_heads = num_heads
self.head_dim = dim // num_heads
self.q_linear = nn.Linear(dim, dim)
self.kv_linear = nn.Linear(dim, dim * 2)
self.proj = nn.Linear(dim, dim)
self.q_norm = RMSNorm_FP32(self.head_dim, eps=1e-6)
self.k_norm = RMSNorm_FP32(self.head_dim, eps=1e-6)
self.enable_flashattn3 = enable_flashattn3
self.enable_flashattn2 = enable_flashattn2
self.enable_xformers = enable_xformers
def _process_cross_attn(self, x, cond, kv_seqlen):
B, N, C = x.shape
assert C == self.dim and cond.shape[2] == self.dim
q = self.q_linear(x).view(1, -1, self.num_heads, self.head_dim)
kv = self.kv_linear(cond).view(1, -1, 2, self.num_heads, self.head_dim)
k, v = kv.unbind(2)
q, k = self.q_norm(q), self.k_norm(k)
q = rearrange(q, "B S H D -> B S (H D)")
k = rearrange(k, "B S H D -> B S (H D)")
v = rearrange(v, "B S H D -> B S (H D)")
x = flash_attention(q, k, v, num_heads=self.num_heads)
x = x.view(B, -1, C)
x = self.proj(x)
return x
def forward(self, x, cond, kv_seqlen, num_cond_latents=None, shape=None):
"""
x: [B, N, C]
cond: [B, M, C]
"""
if num_cond_latents is None or num_cond_latents == 0:
return self._process_cross_attn(x, cond, kv_seqlen)
else:
B, N, C = x.shape
if num_cond_latents is not None and num_cond_latents > 0:
assert shape is not None, "SHOULD pass in the shape"
num_cond_latents_thw = num_cond_latents * (N // shape[0])
x_noise = x[:, num_cond_latents_thw:] # [B, N_noise, C]
output_noise = self._process_cross_attn(x_noise, cond, kv_seqlen) # [B, N_noise, C]
output = torch.cat([
torch.zeros((B, num_cond_latents_thw, C), dtype=output_noise.dtype, device=output_noise.device),
output_noise
], dim=1).contiguous()
else:
raise NotImplementedError
return output
class LayerNorm_FP32(nn.LayerNorm):
def __init__(self, dim, eps, elementwise_affine):
super().__init__(dim, eps=eps, elementwise_affine=elementwise_affine)
def forward(self, inputs: torch.Tensor) -> torch.Tensor:
origin_dtype = inputs.dtype
out = F.layer_norm(
inputs.float(),
self.normalized_shape,
None if self.weight is None else self.weight.float(),
None if self.bias is None else self.bias.float(),
self.eps
).to(origin_dtype)
return out
def modulate_fp32(norm_func, x, shift, scale):
# Suppose x is (B, N, D), shift is (B, -1, D), scale is (B, -1, D)
# ensure the modulation params be fp32
assert shift.dtype == torch.float32 and scale.dtype == torch.float32
dtype = x.dtype
x = norm_func(x.to(torch.float32))
x = x * (scale + 1) + shift
x = x.to(dtype)
return x
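`modulate_fp32` applies the adaLN affine `x * (scale + 1) + shift` in fp32. The `+ 1` means zero-initialized modulation heads start out as an identity on the normalized activations, a sketch:

```python
import numpy as np

x = np.random.default_rng(2).standard_normal((2, 5, 8))
shift = np.zeros((2, 1, 8))
scale = np.zeros((2, 1, 8))
# With zero shift/scale, x * (scale + 1) + shift leaves x unchanged.
out = x * (scale + 1) + shift
print(np.allclose(out, x))  # True
```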
class FinalLayer_FP32(nn.Module):
"""
The final layer of DiT.
"""
def __init__(self, hidden_size, num_patch, out_channels, adaln_tembed_dim):
super().__init__()
self.hidden_size = hidden_size
self.num_patch = num_patch
self.out_channels = out_channels
self.adaln_tembed_dim = adaln_tembed_dim
self.norm_final = LayerNorm_FP32(hidden_size, elementwise_affine=False, eps=1e-6)
self.linear = nn.Linear(hidden_size, num_patch * out_channels, bias=True)
self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(adaln_tembed_dim, 2 * hidden_size, bias=True))
def forward(self, x, t, latent_shape):
# timestep shape: [B, T, C]
assert t.dtype == torch.float32
B, N, C = x.shape
T, _, _ = latent_shape
with amp.autocast(get_device_type(), dtype=torch.float32):
shift, scale = self.adaLN_modulation(t).unsqueeze(2).chunk(2, dim=-1) # [B, T, 1, C]
x = modulate_fp32(self.norm_final, x.view(B, T, -1, C), shift, scale).view(B, N, C)
x = self.linear(x)
return x
class FeedForwardSwiGLU(nn.Module):
def __init__(
self,
dim: int,
hidden_dim: int,
multiple_of: int = 256,
ffn_dim_multiplier: Optional[float] = None,
):
super().__init__()
hidden_dim = int(2 * hidden_dim / 3)
# custom dim factor multiplier
if ffn_dim_multiplier is not None:
hidden_dim = int(ffn_dim_multiplier * hidden_dim)
hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
self.dim = dim
self.hidden_dim = hidden_dim
self.w1 = nn.Linear(dim, hidden_dim, bias=False)
self.w2 = nn.Linear(hidden_dim, dim, bias=False)
self.w3 = nn.Linear(dim, hidden_dim, bias=False)
def forward(self, x):
return self.w2(F.silu(self.w1(x)) * self.w3(x))
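`FeedForwardSwiGLU` shrinks the requested hidden width to 2/3 (compensating for the extra gate projection `w3`) and rounds up to `multiple_of`. For example, a nominal 4x expansion of a 4096-dim model:

```python
def swiglu_hidden_dim(hidden_dim, multiple_of=256, ffn_dim_multiplier=None):
    # Same sizing rule as FeedForwardSwiGLU: shrink to 2/3 (SwiGLU has two
    # input projections w1/w3), then round up to a multiple of multiple_of.
    hidden_dim = int(2 * hidden_dim / 3)
    if ffn_dim_multiplier is not None:
        hidden_dim = int(ffn_dim_multiplier * hidden_dim)
    return multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)

# A nominal 4x expansion of a 4096-dim model:
print(swiglu_hidden_dim(4096 * 4))  # 11008
```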
class TimestepEmbedder(nn.Module):
"""
Embeds scalar timesteps into vector representations.
"""
def __init__(self, t_embed_dim, frequency_embedding_size=256):
super().__init__()
self.t_embed_dim = t_embed_dim
self.frequency_embedding_size = frequency_embedding_size
self.mlp = nn.Sequential(
nn.Linear(frequency_embedding_size, t_embed_dim, bias=True),
nn.SiLU(),
nn.Linear(t_embed_dim, t_embed_dim, bias=True),
)
@staticmethod
def timestep_embedding(t, dim, max_period=10000):
"""
Create sinusoidal timestep embeddings.
:param t: a 1-D Tensor of N indices, one per batch element.
These may be fractional.
:param dim: the dimension of the output.
:param max_period: controls the minimum frequency of the embeddings.
:return: an (N, D) Tensor of positional embeddings.
"""
half = dim // 2
freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half)
freqs = freqs.to(device=t.device)
args = t[:, None].float() * freqs[None]
embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
if dim % 2:
embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
return embedding
def forward(self, t, dtype):
t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
if t_freq.dtype != dtype:
t_freq = t_freq.to(dtype)
t_emb = self.mlp(t_freq)
return t_emb
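The sinusoidal embedding above can be sketched in NumPy; note that this file concatenates cos terms before sin terms, and at t=0 the embedding is all ones followed by all zeros:

```python
import math
import numpy as np

def timestep_embedding(t, dim, max_period=10000):
    # Sinusoidal embedding matching TimestepEmbedder above; cos terms
    # come before sin terms in this file.
    half = dim // 2
    freqs = np.exp(-math.log(max_period) * np.arange(half, dtype=np.float32) / half)
    args = t[:, None].astype(np.float32) * freqs[None]
    emb = np.concatenate([np.cos(args), np.sin(args)], axis=-1)
    if dim % 2:
        emb = np.concatenate([emb, np.zeros_like(emb[:, :1])], axis=-1)
    return emb

e = timestep_embedding(np.array([0.0, 500.0]), 256)
print(e.shape)  # (2, 256)
```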
class CaptionEmbedder(nn.Module):
"""
Embeds class labels into vector representations.
"""
def __init__(self, in_channels, hidden_size):
super().__init__()
self.in_channels = in_channels
self.hidden_size = hidden_size
self.y_proj = nn.Sequential(
nn.Linear(in_channels, hidden_size, bias=True),
nn.GELU(approximate="tanh"),
nn.Linear(hidden_size, hidden_size, bias=True),
)
def forward(self, caption):
B, _, N, C = caption.shape
caption = self.y_proj(caption)
return caption
class PatchEmbed3D(nn.Module):
"""Video to Patch Embedding.
Args:
patch_size (tuple[int]): Patch token size. Default: (2, 4, 4).
in_chans (int): Number of input video channels. Default: 3.
embed_dim (int): Number of linear projection output channels. Default: 96.
norm_layer (nn.Module, optional): Normalization layer. Default: None
"""
def __init__(
self,
patch_size=(2, 4, 4),
in_chans=3,
embed_dim=96,
norm_layer=None,
flatten=True,
):
super().__init__()
self.patch_size = patch_size
self.flatten = flatten
self.in_chans = in_chans
self.embed_dim = embed_dim
self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
if norm_layer is not None:
self.norm = norm_layer(embed_dim)
else:
self.norm = None
def forward(self, x):
"""Forward function."""
# padding
_, _, D, H, W = x.size()
if W % self.patch_size[2] != 0:
x = F.pad(x, (0, self.patch_size[2] - W % self.patch_size[2]))
if H % self.patch_size[1] != 0:
x = F.pad(x, (0, 0, 0, self.patch_size[1] - H % self.patch_size[1]))
if D % self.patch_size[0] != 0:
x = F.pad(x, (0, 0, 0, 0, 0, self.patch_size[0] - D % self.patch_size[0]))
B, C, T, H, W = x.shape
x = self.proj(x) # (B C T H W)
if self.norm is not None:
D, Wh, Ww = x.size(2), x.size(3), x.size(4)
x = x.flatten(2).transpose(1, 2)
x = self.norm(x)
x = x.transpose(1, 2).view(-1, self.embed_dim, D, Wh, Ww)
if self.flatten:
x = x.flatten(2).transpose(1, 2) # BCTHW -> BNC
return x
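`PatchEmbed3D.forward` pads each axis up to the next multiple of its patch size before the strided Conv3d. The padded shape can be computed directly:

```python
def padded_shape(d, h, w, patch_size=(2, 4, 4)):
    # Mirror of PatchEmbed3D's F.pad calls: each axis is padded up to
    # the next multiple of its patch size before the strided Conv3d.
    pt, ph, pw = patch_size
    pad = lambda n, p: n + (-n) % p
    return pad(d, pt), pad(h, ph), pad(w, pw)

print(padded_shape(9, 30, 33))  # (10, 32, 36)
```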
class LongCatSingleStreamBlock(nn.Module):
def __init__(
self,
hidden_size: int,
num_heads: int,
mlp_ratio: int,
adaln_tembed_dim: int,
enable_flashattn3: bool = False,
enable_flashattn2: bool = False,
enable_xformers: bool = False,
enable_bsa: bool = False,
bsa_params=None,
cp_split_hw=None
):
super().__init__()
self.hidden_size = hidden_size
# scale and gate modulation
self.adaLN_modulation = nn.Sequential(
nn.SiLU(),
nn.Linear(adaln_tembed_dim, 6 * hidden_size, bias=True)
)
self.mod_norm_attn = LayerNorm_FP32(hidden_size, eps=1e-6, elementwise_affine=False)
self.mod_norm_ffn = LayerNorm_FP32(hidden_size, eps=1e-6, elementwise_affine=False)
self.pre_crs_attn_norm = LayerNorm_FP32(hidden_size, eps=1e-6, elementwise_affine=True)
self.attn = Attention(
dim=hidden_size,
num_heads=num_heads,
enable_flashattn3=enable_flashattn3,
enable_flashattn2=enable_flashattn2,
enable_xformers=enable_xformers,
enable_bsa=enable_bsa,
bsa_params=bsa_params,
cp_split_hw=cp_split_hw
)
self.cross_attn = MultiHeadCrossAttention(
dim=hidden_size,
num_heads=num_heads,
enable_flashattn3=enable_flashattn3,
enable_flashattn2=enable_flashattn2,
enable_xformers=enable_xformers,
)
self.ffn = FeedForwardSwiGLU(dim=hidden_size, hidden_dim=int(hidden_size * mlp_ratio))
def forward(self, x, y, t, y_seqlen, latent_shape, num_cond_latents=None, return_kv=False, kv_cache=None, skip_crs_attn=False):
"""
x: [B, N, C]
y: [1, N_valid_tokens, C]
t: [B, T, C_t]
y_seqlen: [B]; type of a list
latent_shape: latent shape of a single item
"""
x_dtype = x.dtype
B, N, C = x.shape
T, _, _ = latent_shape # S != T*H*W in case of CP split on H*W.
# compute modulation params in fp32
with amp.autocast(device_type=get_device_type(), dtype=torch.float32):
shift_msa, scale_msa, gate_msa, \
shift_mlp, scale_mlp, gate_mlp = \
self.adaLN_modulation(t).unsqueeze(2).chunk(6, dim=-1) # [B, T, 1, C]
# self attn with modulation
x_m = modulate_fp32(self.mod_norm_attn, x.view(B, T, -1, C), shift_msa, scale_msa).view(B, N, C)
if kv_cache is not None:
kv_cache = (kv_cache[0].to(x.device), kv_cache[1].to(x.device))
attn_outputs = self.attn.forward_with_kv_cache(x_m, shape=latent_shape, num_cond_latents=num_cond_latents, kv_cache=kv_cache)
else:
attn_outputs = self.attn(x_m, shape=latent_shape, num_cond_latents=num_cond_latents, return_kv=return_kv)
if return_kv:
x_s, kv_cache = attn_outputs
else:
x_s = attn_outputs
with amp.autocast(device_type=get_device_type(), dtype=torch.float32):
x = x + (gate_msa * x_s.view(B, -1, N//T, C)).view(B, -1, C) # [B, N, C]
x = x.to(x_dtype)
# cross attn
if not skip_crs_attn:
if kv_cache is not None:
num_cond_latents = None
x = x + self.cross_attn(self.pre_crs_attn_norm(x), y, y_seqlen, num_cond_latents=num_cond_latents, shape=latent_shape)
# ffn with modulation
x_m = modulate_fp32(self.mod_norm_ffn, x.view(B, -1, N//T, C), shift_mlp, scale_mlp).view(B, -1, C)
x_s = self.ffn(x_m)
with amp.autocast(device_type=get_device_type(), dtype=torch.float32):
x = x + (gate_mlp * x_s.view(B, -1, N//T, C)).view(B, -1, C) # [B, N, C]
x = x.to(x_dtype)
if return_kv:
return x, kv_cache
else:
return x
class LongCatVideoTransformer3DModel(torch.nn.Module):
def __init__(
self,
in_channels: int = 16,
out_channels: int = 16,
hidden_size: int = 4096,
depth: int = 48,
num_heads: int = 32,
caption_channels: int = 4096,
mlp_ratio: int = 4,
adaln_tembed_dim: int = 512,
frequency_embedding_size: int = 256,
# default params
patch_size: Tuple[int] = (1, 2, 2),
# attention config
enable_flashattn3: bool = False,
enable_flashattn2: bool = True,
enable_xformers: bool = False,
enable_bsa: bool = False,
bsa_params: dict = {'sparsity': 0.9375, 'chunk_3d_shape_q': [4, 4, 4], 'chunk_3d_shape_k': [4, 4, 4]},
cp_split_hw: Optional[List[int]] = [1, 1],
text_tokens_zero_pad: bool = True,
) -> None:
super().__init__()
self.patch_size = patch_size
self.in_channels = in_channels
self.out_channels = out_channels
self.cp_split_hw = cp_split_hw
self.x_embedder = PatchEmbed3D(patch_size, in_channels, hidden_size)
self.t_embedder = TimestepEmbedder(t_embed_dim=adaln_tembed_dim, frequency_embedding_size=frequency_embedding_size)
self.y_embedder = CaptionEmbedder(
in_channels=caption_channels,
hidden_size=hidden_size,
)
self.blocks = nn.ModuleList(
[
LongCatSingleStreamBlock(
hidden_size=hidden_size,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
adaln_tembed_dim=adaln_tembed_dim,
enable_flashattn3=enable_flashattn3,
enable_flashattn2=enable_flashattn2,
enable_xformers=enable_xformers,
enable_bsa=enable_bsa,
bsa_params=bsa_params,
cp_split_hw=cp_split_hw
)
for i in range(depth)
]
)
self.final_layer = FinalLayer_FP32(
hidden_size,
np.prod(self.patch_size),
out_channels,
adaln_tembed_dim,
)
self.gradient_checkpointing = False
self.text_tokens_zero_pad = text_tokens_zero_pad
self.lora_dict = {}
self.active_loras = []
def enable_loras(self, lora_key_list=[]):
self.disable_all_loras()
module_loras = {} # {module_name: [lora1, lora2, ...]}
model_device = next(self.parameters()).device
model_dtype = next(self.parameters()).dtype
for lora_key in lora_key_list:
if lora_key in self.lora_dict:
for lora in self.lora_dict[lora_key].loras:
lora.to(model_device, dtype=model_dtype, non_blocking=True)
module_name = lora.lora_name.replace("lora___lorahyphen___", "").replace("___lorahyphen___", ".")
if module_name not in module_loras:
module_loras[module_name] = []
module_loras[module_name].append(lora)
self.active_loras.append(lora_key)
for module_name, loras in module_loras.items():
module = self._get_module_by_name(module_name)
if not hasattr(module, 'org_forward'):
module.org_forward = module.forward
module.forward = self._create_multi_lora_forward(module, loras)
def _create_multi_lora_forward(self, module, loras):
def multi_lora_forward(x, *args, **kwargs):
weight_dtype = x.dtype
org_output = module.org_forward(x, *args, **kwargs)
total_lora_output = 0
for lora in loras:
if lora.use_lora:
lx = lora.lora_down(x.to(lora.lora_down.weight.dtype))
lx = lora.lora_up(lx)
lora_output = lx.to(weight_dtype) * lora.multiplier * lora.alpha_scale
total_lora_output += lora_output
return org_output + total_lora_output
return multi_lora_forward
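The multi-LoRA forward above leaves the base module untouched and adds `multiplier * alpha_scale * up(down(x))` per active adapter. This is mathematically equivalent to a single pass with a merged weight, as a NumPy sketch shows (the names `A`/`Bu` are illustrative, not from the repo):

```python
import numpy as np

# The base projection stays untouched; each active adapter contributes
# multiplier * up(down(x)).
rng = np.random.default_rng(0)
d, r = 8, 2
W = rng.standard_normal((d, d))   # frozen base weight
A = rng.standard_normal((r, d))   # lora_down
Bu = rng.standard_normal((d, r))  # lora_up
x = rng.standard_normal(d)
multiplier = 0.5

patched = W @ x + multiplier * (Bu @ (A @ x))
merged = (W + multiplier * Bu @ A) @ x  # equivalent merged-weight pass
print(np.allclose(patched, merged))  # True
```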
def _get_module_by_name(self, module_name):
try:
module = self
for part in module_name.split('.'):
module = getattr(module, part)
return module
except AttributeError as e:
raise ValueError(f"Cannot find module: {module_name}, error: {e}")
def disable_all_loras(self):
for name, module in self.named_modules():
if hasattr(module, 'org_forward'):
module.forward = module.org_forward
delattr(module, 'org_forward')
for lora_key, lora_network in self.lora_dict.items():
for lora in lora_network.loras:
lora.to("cpu")
self.active_loras.clear()
def enable_bsa(self):
for block in self.blocks:
block.attn.enable_bsa = True
def disable_bsa(self):
for block in self.blocks:
block.attn.enable_bsa = False
def forward(
self,
hidden_states,
timestep,
encoder_hidden_states,
encoder_attention_mask=None,
num_cond_latents=0,
return_kv=False,
kv_cache_dict={},
skip_crs_attn=False,
offload_kv_cache=False,
use_gradient_checkpointing=False,
use_gradient_checkpointing_offload=False,
):
B, _, T, H, W = hidden_states.shape
N_t = T // self.patch_size[0]
N_h = H // self.patch_size[1]
N_w = W // self.patch_size[2]
assert self.patch_size[0] == 1, "Currently, 3D x_embedder should not compress the temporal dimension."
# expand the shape of timestep from [B] to [B, T]
if len(timestep.shape) == 1:
timestep = timestep.unsqueeze(1).expand(-1, N_t).clone() # [B, T]
timestep[:, :num_cond_latents] = 0
dtype = hidden_states.dtype
hidden_states = hidden_states.to(dtype)
timestep = timestep.to(dtype)
encoder_hidden_states = encoder_hidden_states.to(dtype)
hidden_states = self.x_embedder(hidden_states) # [B, N, C]
with amp.autocast(device_type=get_device_type(), dtype=torch.float32):
t = self.t_embedder(timestep.float().flatten(), dtype=torch.float32).reshape(B, N_t, -1) # [B, T, C_t]
encoder_hidden_states = self.y_embedder(encoder_hidden_states) # [B, 1, N_token, C]
if self.text_tokens_zero_pad and encoder_attention_mask is not None:
encoder_hidden_states = encoder_hidden_states * encoder_attention_mask[:, None, :, None]
encoder_attention_mask = (encoder_attention_mask * 0 + 1).to(encoder_attention_mask.dtype)
if encoder_attention_mask is not None:
encoder_attention_mask = encoder_attention_mask.squeeze(1).squeeze(1)
encoder_hidden_states = encoder_hidden_states.squeeze(1).masked_select(encoder_attention_mask.unsqueeze(-1) != 0).view(1, -1, hidden_states.shape[-1]) # [1, N_valid_tokens, C]
y_seqlens = encoder_attention_mask.sum(dim=1).tolist() # [B]
else:
y_seqlens = [encoder_hidden_states.shape[2]] * encoder_hidden_states.shape[0]
encoder_hidden_states = encoder_hidden_states.squeeze(1).view(1, -1, hidden_states.shape[-1])
# if self.cp_split_hw[0] * self.cp_split_hw[1] > 1:
# hidden_states = rearrange(hidden_states, "B (T H W) C -> B T H W C", T=N_t, H=N_h, W=N_w)
# hidden_states = context_parallel_util.split_cp_2d(hidden_states, seq_dim_hw=(2, 3), split_hw=self.cp_split_hw)
# hidden_states = rearrange(hidden_states, "B T H W C -> B (T H W) C")
# blocks
kv_cache_dict_ret = {}
for i, block in enumerate(self.blocks):
block_outputs = gradient_checkpoint_forward(
block,
use_gradient_checkpointing=use_gradient_checkpointing,
use_gradient_checkpointing_offload=use_gradient_checkpointing_offload,
x=hidden_states,
y=encoder_hidden_states,
t=t,
y_seqlen=y_seqlens,
latent_shape=(N_t, N_h, N_w),
num_cond_latents=num_cond_latents,
return_kv=return_kv,
kv_cache=kv_cache_dict.get(i, None),
skip_crs_attn=skip_crs_attn,
)
if return_kv:
hidden_states, kv_cache = block_outputs
if offload_kv_cache:
kv_cache_dict_ret[i] = (kv_cache[0].cpu(), kv_cache[1].cpu())
else:
kv_cache_dict_ret[i] = (kv_cache[0].contiguous(), kv_cache[1].contiguous())
else:
hidden_states = block_outputs
hidden_states = self.final_layer(hidden_states, t, (N_t, N_h, N_w)) # [B, N, C=T_p*H_p*W_p*C_out]
# if self.cp_split_hw[0] * self.cp_split_hw[1] > 1:
# hidden_states = context_parallel_util.gather_cp_2d(hidden_states, shape=(N_t, N_h, N_w), split_hw=self.cp_split_hw)
hidden_states = self.unpatchify(hidden_states, N_t, N_h, N_w) # [B, C_out, T, H, W]
# cast to float32 for better accuracy
hidden_states = hidden_states.to(torch.float32)
if return_kv:
return hidden_states, kv_cache_dict_ret
else:
return hidden_states
def unpatchify(self, x, N_t, N_h, N_w):
"""
Args:
x (torch.Tensor): of shape [B, N, C]
Return:
x (torch.Tensor): of shape [B, C_out, T, H, W]
"""
T_p, H_p, W_p = self.patch_size
x = rearrange(
x,
"B (N_t N_h N_w) (T_p H_p W_p C_out) -> B C_out (N_t T_p) (N_h H_p) (N_w W_p)",
N_t=N_t,
N_h=N_h,
N_w=N_w,
T_p=T_p,
H_p=H_p,
W_p=W_p,
C_out=self.out_channels,
)
return x
@staticmethod
def state_dict_converter():
return LongCatVideoTransformer3DModelDictConverter()
class LongCatVideoTransformer3DModelDictConverter:
def __init__(self):
pass
def from_diffusers(self, state_dict):
return state_dict
def from_civitai(self, state_dict):
return state_dict
