# Dotted paths of the wrapper classes that appear as values in the tables
# below. Only these three wrapper paths are used anywhere in this module.
# NOTE(review): the wrapper implementations live in diffsynth.core.vram.layers
# and are not visible here; their exact behavior should be confirmed there.
_WRAPPED_MODULE = "diffsynth.core.vram.layers.AutoWrappedModule"
_WRAPPED_LINEAR = "diffsynth.core.vram.layers.AutoWrappedLinear"
_WRAPPED_NON_RECURSE_MODULE = "diffsynth.core.vram.layers.AutoWrappedNonRecurseModule"


# Layer-path -> wrapper-path table that is reused (as the very same dict
# object) by several Flux-related entries of VRAM_MANAGEMENT_MODULE_MAPS.
flux_general_vram_config = {
    "torch.nn.Linear": _WRAPPED_LINEAR,
    "torch.nn.Embedding": _WRAPPED_MODULE,
    "torch.nn.LayerNorm": _WRAPPED_MODULE,
    "torch.nn.Conv2d": _WRAPPED_MODULE,
    "torch.nn.GroupNorm": _WRAPPED_MODULE,
    "diffsynth.models.general_modules.RMSNorm": _WRAPPED_MODULE,
    "diffsynth.models.flux_lora_encoder.LoRALayerBlock": _WRAPPED_MODULE,
    "diffsynth.models.flux_lora_patcher.LoraMerger": _WRAPPED_MODULE,
}


# Two-level lookup table:
#   outer key   - dotted path of a model class
#   inner key   - dotted path of a layer class found inside that model
#   inner value - dotted path of the wrapper class associated with that layer
# All paths are stored as plain strings; nothing is imported by this module.
# Key order is kept exactly as authored, since dicts preserve insertion order
# and a consumer may depend on it (the consumer is not visible here).
VRAM_MANAGEMENT_MODULE_MAPS = {
    # ----- Qwen-Image models and the image encoders listed alongside them -----
    "diffsynth.models.qwen_image_dit.QwenImageDiT": {
        "diffsynth.models.qwen_image_dit.RMSNorm": _WRAPPED_MODULE,
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "torch.nn.Embedding": _WRAPPED_MODULE,
    },
    "diffsynth.models.qwen_image_text_encoder.QwenImageTextEncoder": {
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "torch.nn.Embedding": _WRAPPED_MODULE,
        "transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLRotaryEmbedding": _WRAPPED_MODULE,
        "transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2RMSNorm": _WRAPPED_MODULE,
        "transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VisionPatchEmbed": _WRAPPED_MODULE,
        "transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VisionRotaryEmbedding": _WRAPPED_MODULE,
    },
    "diffsynth.models.qwen_image_vae.QwenImageVAE": {
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "torch.nn.Conv3d": _WRAPPED_MODULE,
        "torch.nn.Conv2d": _WRAPPED_MODULE,
        "diffsynth.models.qwen_image_vae.QwenImageRMS_norm": _WRAPPED_MODULE,
    },
    "diffsynth.models.qwen_image_controlnet.BlockWiseControlBlock": {
        "diffsynth.models.qwen_image_dit.RMSNorm": _WRAPPED_MODULE,
        "torch.nn.Linear": _WRAPPED_LINEAR,
    },
    "diffsynth.models.siglip2_image_encoder.Siglip2ImageEncoder": {
        "transformers.models.siglip.modeling_siglip.SiglipVisionEmbeddings": _WRAPPED_MODULE,
        "transformers.models.siglip.modeling_siglip.SiglipMultiheadAttentionPoolingHead": _WRAPPED_MODULE,
        "torch.nn.Conv2d": _WRAPPED_MODULE,
        "torch.nn.Embedding": _WRAPPED_MODULE,
        "torch.nn.LayerNorm": _WRAPPED_MODULE,
        "torch.nn.Linear": _WRAPPED_LINEAR,
    },
    "diffsynth.models.dinov3_image_encoder.DINOv3ImageEncoder": {
        "transformers.models.dinov3_vit.modeling_dinov3_vit.DINOv3ViTLayerScale": _WRAPPED_MODULE,
        "transformers.models.dinov3_vit.modeling_dinov3_vit.DINOv3ViTRopePositionEmbedding": _WRAPPED_MODULE,
        "transformers.models.dinov3_vit.modeling_dinov3_vit.DINOv3ViTEmbeddings": _WRAPPED_MODULE,
        "torch.nn.Conv2d": _WRAPPED_MODULE,
        "torch.nn.LayerNorm": _WRAPPED_MODULE,
        "torch.nn.Linear": _WRAPPED_LINEAR,
    },
    "diffsynth.models.qwen_image_image2lora.QwenImageImage2LoRAModel": {
        "torch.nn.Linear": _WRAPPED_LINEAR,
    },

    # ----- Wan video models -----
    "diffsynth.models.wan_video_animate_adapter.WanAnimateAdapter": {
        "diffsynth.models.wan_video_animate_adapter.FaceEncoder": _WRAPPED_MODULE,
        "diffsynth.models.wan_video_animate_adapter.EqualLinear": _WRAPPED_MODULE,
        "diffsynth.models.wan_video_animate_adapter.ConvLayer": _WRAPPED_MODULE,
        "diffsynth.models.wan_video_animate_adapter.FusedLeakyReLU": _WRAPPED_MODULE,
        "diffsynth.models.wan_video_animate_adapter.RMSNorm": _WRAPPED_MODULE,
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "torch.nn.LayerNorm": _WRAPPED_MODULE,
        "torch.nn.Conv1d": _WRAPPED_MODULE,
        "torch.nn.Conv2d": _WRAPPED_MODULE,
        "torch.nn.Conv3d": _WRAPPED_MODULE,
    },
    "diffsynth.models.wan_video_dit_s2v.WanS2VModel": {
        "diffsynth.models.wan_video_dit.Head": _WRAPPED_MODULE,
        "diffsynth.models.wan_video_dit_s2v.WanS2VDiTBlock": _WRAPPED_MODULE,
        "diffsynth.models.wan_video_dit_s2v.CausalAudioEncoder": _WRAPPED_MODULE,
        "torch.nn.Embedding": _WRAPPED_MODULE,
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "torch.nn.Conv3d": _WRAPPED_MODULE,
        "torch.nn.LayerNorm": _WRAPPED_MODULE,
        "diffsynth.models.wan_video_dit.RMSNorm": _WRAPPED_MODULE,
        "torch.nn.Conv2d": _WRAPPED_MODULE,
    },
    "diffsynth.models.wan_video_dit.WanModel": {
        "diffsynth.models.wan_video_dit.MLP": _WRAPPED_MODULE,
        # NOTE(review): this is the only entry in the file that uses the
        # non-recursing wrapper; VaceWanModel below maps the same DiTBlock
        # class to AutoWrappedModule. Confirm the asymmetry is intended.
        "diffsynth.models.wan_video_dit.DiTBlock": _WRAPPED_NON_RECURSE_MODULE,
        "diffsynth.models.wan_video_dit.Head": _WRAPPED_MODULE,
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "torch.nn.Conv3d": _WRAPPED_MODULE,
        "torch.nn.LayerNorm": _WRAPPED_MODULE,
        "diffsynth.models.wan_video_dit.RMSNorm": _WRAPPED_MODULE,
        "torch.nn.Conv2d": _WRAPPED_MODULE,
    },
    "diffsynth.models.wan_video_image_encoder.WanImageEncoder": {
        "diffsynth.models.wan_video_image_encoder.VisionTransformer": _WRAPPED_MODULE,
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "torch.nn.Conv2d": _WRAPPED_MODULE,
        "torch.nn.LayerNorm": _WRAPPED_MODULE,
    },
    "diffsynth.models.wan_video_mot.MotWanModel": {
        "diffsynth.models.wan_video_mot.MotWanAttentionBlock": _WRAPPED_MODULE,
        "torch.nn.Conv3d": _WRAPPED_MODULE,
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "torch.nn.LayerNorm": _WRAPPED_MODULE,
    },
    "diffsynth.models.wan_video_motion_controller.WanMotionControllerModel": {
        "torch.nn.Linear": _WRAPPED_LINEAR,
    },
    "diffsynth.models.wan_video_text_encoder.WanTextEncoder": {
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "torch.nn.Embedding": _WRAPPED_MODULE,
        "diffsynth.models.wan_video_text_encoder.T5RelativeEmbedding": _WRAPPED_MODULE,
        "diffsynth.models.wan_video_text_encoder.T5LayerNorm": _WRAPPED_MODULE,
    },
    "diffsynth.models.wan_video_vace.VaceWanModel": {
        "diffsynth.models.wan_video_dit.DiTBlock": _WRAPPED_MODULE,
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "torch.nn.Conv3d": _WRAPPED_MODULE,
        "torch.nn.LayerNorm": _WRAPPED_MODULE,
        "diffsynth.models.wan_video_dit.RMSNorm": _WRAPPED_MODULE,
    },
    # The two Wan VAE variants carry equal but independent tables; they are
    # deliberately kept as separate dict objects, as in the original.
    "diffsynth.models.wan_video_vae.WanVideoVAE": {
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "torch.nn.Conv2d": _WRAPPED_MODULE,
        "diffsynth.models.wan_video_vae.RMS_norm": _WRAPPED_MODULE,
        "diffsynth.models.wan_video_vae.CausalConv3d": _WRAPPED_MODULE,
        "diffsynth.models.wan_video_vae.Upsample": _WRAPPED_MODULE,
        "torch.nn.SiLU": _WRAPPED_MODULE,
        "torch.nn.Dropout": _WRAPPED_MODULE,
    },
    "diffsynth.models.wan_video_vae.WanVideoVAE38": {
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "torch.nn.Conv2d": _WRAPPED_MODULE,
        "diffsynth.models.wan_video_vae.RMS_norm": _WRAPPED_MODULE,
        "diffsynth.models.wan_video_vae.CausalConv3d": _WRAPPED_MODULE,
        "diffsynth.models.wan_video_vae.Upsample": _WRAPPED_MODULE,
        "torch.nn.SiLU": _WRAPPED_MODULE,
        "torch.nn.Dropout": _WRAPPED_MODULE,
    },
    "diffsynth.models.wav2vec.WanS2VAudioEncoder": {
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "torch.nn.LayerNorm": _WRAPPED_MODULE,
        "torch.nn.Conv1d": _WRAPPED_MODULE,
    },

    # ----- LongCat video model -----
    "diffsynth.models.longcat_video_dit.LongCatVideoTransformer3DModel": {
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "torch.nn.Conv3d": _WRAPPED_MODULE,
        "diffsynth.models.longcat_video_dit.RMSNorm_FP32": _WRAPPED_MODULE,
        "diffsynth.models.longcat_video_dit.LayerNorm_FP32": _WRAPPED_MODULE,
    },

    # ----- Flux models -----
    "diffsynth.models.flux_dit.FluxDiT": {
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "diffsynth.models.flux_dit.RMSNorm": _WRAPPED_MODULE,
    },
    # The next nine entries all point at the single shared
    # flux_general_vram_config object (not copies of it).
    "diffsynth.models.flux_text_encoder_clip.FluxTextEncoderClip": flux_general_vram_config,
    "diffsynth.models.flux_vae.FluxVAEEncoder": flux_general_vram_config,
    "diffsynth.models.flux_vae.FluxVAEDecoder": flux_general_vram_config,
    "diffsynth.models.flux_controlnet.FluxControlNet": flux_general_vram_config,
    "diffsynth.models.flux_infiniteyou.InfiniteYouImageProjector": flux_general_vram_config,
    "diffsynth.models.flux_ipadapter.FluxIpAdapter": flux_general_vram_config,
    "diffsynth.models.flux_lora_patcher.FluxLoraPatcher": flux_general_vram_config,
    "diffsynth.models.step1x_connector.Qwen2Connector": flux_general_vram_config,
    "diffsynth.models.flux_lora_encoder.FluxLoRAEncoder": flux_general_vram_config,
    "diffsynth.models.flux_text_encoder_t5.FluxTextEncoderT5": {
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "torch.nn.Embedding": _WRAPPED_MODULE,
        "transformers.models.t5.modeling_t5.T5LayerNorm": _WRAPPED_MODULE,
        "transformers.models.t5.modeling_t5.T5DenseActDense": _WRAPPED_MODULE,
        "transformers.models.t5.modeling_t5.T5DenseGatedActDense": _WRAPPED_MODULE,
    },
    "diffsynth.models.flux_ipadapter.SiglipVisionModelSO400M": {
        "transformers.models.siglip.modeling_siglip.SiglipVisionEmbeddings": _WRAPPED_MODULE,
        "transformers.models.siglip.modeling_siglip.SiglipEncoder": _WRAPPED_MODULE,
        "transformers.models.siglip.modeling_siglip.SiglipMultiheadAttentionPoolingHead": _WRAPPED_MODULE,
        "torch.nn.MultiheadAttention": _WRAPPED_MODULE,
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "torch.nn.LayerNorm": _WRAPPED_MODULE,
    },

    # ----- Flux2 models -----
    "diffsynth.models.flux2_dit.Flux2DiT": {
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "torch.nn.LayerNorm": _WRAPPED_MODULE,
        "torch.nn.RMSNorm": _WRAPPED_MODULE,
    },
    "diffsynth.models.flux2_text_encoder.Flux2TextEncoder": {
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "torch.nn.Conv2d": _WRAPPED_MODULE,
        "torch.nn.Embedding": _WRAPPED_MODULE,
        "transformers.models.mistral.modeling_mistral.MistralRMSNorm": _WRAPPED_MODULE,
    },
    "diffsynth.models.flux2_vae.Flux2VAE": {
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "torch.nn.Conv2d": _WRAPPED_MODULE,
        "torch.nn.GroupNorm": _WRAPPED_MODULE,
    },

    # ----- Z-Image models -----
    "diffsynth.models.z_image_text_encoder.ZImageTextEncoder": {
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "transformers.models.qwen3.modeling_qwen3.Qwen3RMSNorm": _WRAPPED_MODULE,
        "torch.nn.Embedding": _WRAPPED_MODULE,
    },
    "diffsynth.models.z_image_dit.ZImageDiT": {
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "diffsynth.models.z_image_dit.RMSNorm": _WRAPPED_MODULE,
    },
    "diffsynth.models.z_image_controlnet.ZImageControlNet": {
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "diffsynth.models.z_image_dit.RMSNorm": _WRAPPED_MODULE,
    },
    "diffsynth.models.z_image_image2lora.ZImageImage2LoRAModel": {
        "torch.nn.Linear": _WRAPPED_LINEAR,
    },
    "diffsynth.models.siglip2_image_encoder.Siglip2ImageEncoder428M": {
        "transformers.models.siglip2.modeling_siglip2.Siglip2VisionEmbeddings": _WRAPPED_MODULE,
        "transformers.models.siglip2.modeling_siglip2.Siglip2MultiheadAttentionPoolingHead": _WRAPPED_MODULE,
        "torch.nn.Conv2d": _WRAPPED_MODULE,
        "torch.nn.Embedding": _WRAPPED_MODULE,
        "torch.nn.LayerNorm": _WRAPPED_MODULE,
        "torch.nn.Linear": _WRAPPED_LINEAR,
    },

    # ----- LTX2 models -----
    "diffsynth.models.ltx2_dit.LTXModel": {
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "torch.nn.RMSNorm": _WRAPPED_MODULE,
    },
    "diffsynth.models.ltx2_upsampler.LTX2LatentUpsampler": {
        "torch.nn.Conv2d": _WRAPPED_MODULE,
        "torch.nn.Conv3d": _WRAPPED_MODULE,
        "torch.nn.GroupNorm": _WRAPPED_MODULE,
    },
    "diffsynth.models.ltx2_video_vae.LTX2VideoEncoder": {
        "torch.nn.Conv3d": _WRAPPED_MODULE,
    },
    "diffsynth.models.ltx2_video_vae.LTX2VideoDecoder": {
        "torch.nn.Conv3d": _WRAPPED_MODULE,
    },
    "diffsynth.models.ltx2_audio_vae.LTX2AudioDecoder": {
        "torch.nn.Conv2d": _WRAPPED_MODULE,
    },
    "diffsynth.models.ltx2_audio_vae.LTX2Vocoder": {
        "torch.nn.Conv1d": _WRAPPED_MODULE,
        "torch.nn.ConvTranspose1d": _WRAPPED_MODULE,
    },
    "diffsynth.models.ltx2_text_encoder.LTX2TextEncoderPostModules": {
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "torch.nn.RMSNorm": _WRAPPED_MODULE,
        "diffsynth.models.ltx2_text_encoder.Embeddings1DConnector": _WRAPPED_MODULE,
    },
    "diffsynth.models.ltx2_text_encoder.LTX2TextEncoder": {
        "torch.nn.Linear": _WRAPPED_LINEAR,
        "transformers.models.gemma3.modeling_gemma3.Gemma3MultiModalProjector": _WRAPPED_MODULE,
        "transformers.models.gemma3.modeling_gemma3.Gemma3RMSNorm": _WRAPPED_MODULE,
        "transformers.models.gemma3.modeling_gemma3.Gemma3TextScaledWordEmbedding": _WRAPPED_MODULE,
    },
}