Merge branch 'main' into cuda_replace

2026-03-18 22:08:13 +00:00 · 2026-01-20 10:12:31 +08:00
parent ad91d41601 a8b340c098
commit dd8d902624
57 changed files with 996 additions and 116 deletions
--- a/diffsynth/configs/model_configs.py
+++ b/diffsynth/configs/model_configs.py
@@ -481,6 +481,13 @@ flux_series = [
        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_dit.FluxDiTStateDictConverter",
        "extra_kwargs": {"disable_guidance_embedder": True},
    },
+    {
+        # Example: ModelConfig(model_id="MAILAND/majicflus_v1", origin_file_pattern="majicflus_v134.safetensors")
+        "model_hash": "3394f306c4cbf04334b712bf5aaed95f",
+        "model_name": "flux_dit",
+        "model_class": "diffsynth.models.flux_dit.FluxDiT",
+        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_dit.FluxDiTStateDictConverter",
+    },
 ]

 flux2_series = [
@@ -503,6 +510,28 @@ flux2_series = [
        "model_name": "flux2_vae",
        "model_class": "diffsynth.models.flux2_vae.Flux2VAE",
    },
+    {
+        # Example: ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="transformer/*.safetensors")
+        "model_hash": "3bde7b817fec8143028b6825a63180df",
+        "model_name": "flux2_dit",
+        "model_class": "diffsynth.models.flux2_dit.Flux2DiT",
+        "extra_kwargs": {"guidance_embeds": False, "joint_attention_dim": 7680, "num_attention_heads": 24, "num_layers": 5, "num_single_layers": 20}
+    },
+    {
+        # Example: ModelConfig(model_id="black-forest-labs/FLUX.2-klein-9B", origin_file_pattern="text_encoder/*.safetensors")
+        "model_hash": "9195f3ea256fcd0ae6d929c203470754",
+        "model_name": "z_image_text_encoder",
+        "model_class": "diffsynth.models.z_image_text_encoder.ZImageTextEncoder",
+        "extra_kwargs": {"model_size": "8B"},
+        "state_dict_converter": "diffsynth.utils.state_dict_converters.z_image_text_encoder.ZImageTextEncoderStateDictConverter",
+    },
+    {
+        # Example: ModelConfig(model_id="black-forest-labs/FLUX.2-klein-9B", origin_file_pattern="transformer/*.safetensors")
+        "model_hash": "39c6fc48f07bebecedbbaa971ff466c8",
+        "model_name": "flux2_dit",
+        "model_class": "diffsynth.models.flux2_dit.Flux2DiT",
+        "extra_kwargs": {"guidance_embeds": False, "joint_attention_dim": 12288, "num_attention_heads": 32, "num_layers": 8, "num_single_layers": 24}
+    },
 ]

 z_image_series = [
--- a/diffsynth/core/device/init.py
+++ b/diffsynth/core/device/init.py
@@ -1,2 +1,2 @@
 from .npu_compatible_device import parse_device_type, parse_nccl_backend, get_available_device_type, get_device_name
-from .npu_compatible_device import IS_NPU_AVAILABLE
+from .npu_compatible_device import IS_NPU_AVAILABLE, IS_CUDA_AVAILABLE
--- a/diffsynth/diffusion/training_module.py
+++ b/diffsynth/diffusion/training_module.py
@@ -1,4 +1,4 @@
-import torch, json
+import torch, json, os
 from ..core import ModelConfig, load_state_dict
 from ..utils.controlnet import ControlNetInput
 from peft import LoraConfig, inject_adapter_in_model
@@ -127,15 +127,29 @@ class DiffusionTrainingModule(torch.nn.Module):
        if model_id_with_origin_paths is not None:
            model_id_with_origin_paths = model_id_with_origin_paths.split(",")
            for model_id_with_origin_path in model_id_with_origin_paths:
-                model_id, origin_file_pattern = model_id_with_origin_path.split(":")
                vram_config = self.parse_vram_config(
                    fp8=model_id_with_origin_path in fp8_models,
                    offload=model_id_with_origin_path in offload_models,
                    device=device
                )
-                model_configs.append(ModelConfig(model_id=model_id, origin_file_pattern=origin_file_pattern, **vram_config))
+                config = self.parse_path_or_model_id(model_id_with_origin_path)
+                model_configs.append(ModelConfig(model_id=config.model_id, origin_file_pattern=config.origin_file_pattern, **vram_config))
        return model_configs
    
+
+    def parse_path_or_model_id(self, model_id_with_origin_path, default_value=None):
+        if model_id_with_origin_path is None:
+            return default_value
+        elif os.path.exists(model_id_with_origin_path):
+            return ModelConfig(path=model_id_with_origin_path)
+        else:
+            if ":" not in model_id_with_origin_path:
+                raise ValueError(f"Failed to parse model config: {model_id_with_origin_path}. This is neither a valid path nor in the format of `model_id/origin_file_pattern`.")
+            split_id = model_id_with_origin_path.rfind(":")
+            model_id = model_id_with_origin_path[:split_id]
+            origin_file_pattern = model_id_with_origin_path[split_id + 1:]
+            return ModelConfig(model_id=model_id, origin_file_pattern=origin_file_pattern)
+
    
    def switch_pipe_to_training_mode(
        self,
--- a/diffsynth/models/flux2_dit.py
+++ b/diffsynth/models/flux2_dit.py
@@ -823,7 +823,13 @@ class Flux2PosEmbed(nn.Module):


 class Flux2TimestepGuidanceEmbeddings(nn.Module):
-    def __init__(self, in_channels: int = 256, embedding_dim: int = 6144, bias: bool = False):
+    def __init__(
+        self,
+        in_channels: int = 256,
+        embedding_dim: int = 6144,
+        bias: bool = False,
+        guidance_embeds: bool = True,
+    ):
        super().__init__()

        self.time_proj = Timesteps(num_channels=in_channels, flip_sin_to_cos=True, downscale_freq_shift=0)
@@ -831,20 +837,24 @@ class Flux2TimestepGuidanceEmbeddings(nn.Module):
            in_channels=in_channels, time_embed_dim=embedding_dim, sample_proj_bias=bias
        )

-        self.guidance_embedder = TimestepEmbedding(
-            in_channels=in_channels, time_embed_dim=embedding_dim, sample_proj_bias=bias
-        )
+        if guidance_embeds:
+            self.guidance_embedder = TimestepEmbedding(
+                in_channels=in_channels, time_embed_dim=embedding_dim, sample_proj_bias=bias
+            )
+        else:
+            self.guidance_embedder = None

    def forward(self, timestep: torch.Tensor, guidance: torch.Tensor) -> torch.Tensor:
        timesteps_proj = self.time_proj(timestep)
        timesteps_emb = self.timestep_embedder(timesteps_proj.to(timestep.dtype))  # (N, D)

-        guidance_proj = self.time_proj(guidance)
-        guidance_emb = self.guidance_embedder(guidance_proj.to(guidance.dtype))  # (N, D)
-
-        time_guidance_emb = timesteps_emb + guidance_emb
-
-        return time_guidance_emb
+        if guidance is not None and self.guidance_embedder is not None:
+            guidance_proj = self.time_proj(guidance)
+            guidance_emb = self.guidance_embedder(guidance_proj.to(guidance.dtype))  # (N, D)
+            time_guidance_emb = timesteps_emb + guidance_emb
+            return time_guidance_emb
+        else:
+            return timesteps_emb


 class Flux2Modulation(nn.Module):
@@ -882,6 +892,7 @@ class Flux2DiT(torch.nn.Module):
        axes_dims_rope: Tuple[int, ...] = (32, 32, 32, 32),
        rope_theta: int = 2000,
        eps: float = 1e-6,
+        guidance_embeds: bool = True,
    ):
        super().__init__()
        self.out_channels = out_channels or in_channels
@@ -892,7 +903,10 @@ class Flux2DiT(torch.nn.Module):

        # 2. Combined timestep + guidance embedding
        self.time_guidance_embed = Flux2TimestepGuidanceEmbeddings(
-            in_channels=timestep_guidance_channels, embedding_dim=self.inner_dim, bias=False
+            in_channels=timestep_guidance_channels,
+            embedding_dim=self.inner_dim,
+            bias=False,
+            guidance_embeds=guidance_embeds,
        )

        # 3. Modulation (double stream and single stream blocks share modulation parameters, resp.)
@@ -953,34 +967,9 @@ class Flux2DiT(torch.nn.Module):
        txt_ids: torch.Tensor = None,
        guidance: torch.Tensor = None,
        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
-        return_dict: bool = True,
        use_gradient_checkpointing=False,
        use_gradient_checkpointing_offload=False,
-    ) -> Union[torch.Tensor]:
-        """
-        The [`FluxTransformer2DModel`] forward method.
-
-        Args:
-            hidden_states (`torch.Tensor` of shape `(batch_size, image_sequence_length, in_channels)`):
-                Input `hidden_states`.
-            encoder_hidden_states (`torch.Tensor` of shape `(batch_size, text_sequence_length, joint_attention_dim)`):
-                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
-            timestep ( `torch.LongTensor`):
-                Used to indicate denoising step.
-            block_controlnet_hidden_states: (`list` of `torch.Tensor`):
-                A list of tensors that if specified are added to the residuals of transformer blocks.
-            joint_attention_kwargs (`dict`, *optional*):
-                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
-                `self.processor` in
-                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
-                tuple.
-
-        Returns:
-            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
-            `tuple` where the first element is the sample tensor.
-        """
+    ):
        # 0. Handle input arguments
        if joint_attention_kwargs is not None:
            joint_attention_kwargs = joint_attention_kwargs.copy()
@@ -992,7 +981,9 @@ class Flux2DiT(torch.nn.Module):

        # 1. Calculate timestep embedding and modulation parameters
        timestep = timestep.to(hidden_states.dtype) * 1000
-        guidance = guidance.to(hidden_states.dtype) * 1000
+
+        if guidance is not None:
+            guidance = guidance.to(hidden_states.dtype) * 1000

        temb = self.time_guidance_embed(timestep, guidance)

--- a/diffsynth/models/wan_video_dit.py
+++ b/diffsynth/models/wan_video_dit.py
@@ -5,6 +5,7 @@ import math
 from typing import Tuple, Optional
 from einops import rearrange
 from .wan_video_camera_controller import SimpleAdapter
+
 try:
    import flash_attn_interface
    FLASH_ATTN_3_AVAILABLE = True
@@ -92,6 +93,7 @@ def rope_apply(x, freqs, num_heads):
    x = rearrange(x, "b s (n d) -> b s n d", n=num_heads)
    x_out = torch.view_as_complex(x.to(torch.float64).reshape(
        x.shape[0], x.shape[1], x.shape[2], -1, 2))
+    freqs = freqs.to(torch.complex64) if freqs.device == "npu" else freqs
    x_out = torch.view_as_real(x_out * freqs).flatten(2)
    return x_out.to(x.dtype)

--- a/diffsynth/models/z_image_dit.py
+++ b/diffsynth/models/z_image_dit.py
@@ -6,7 +6,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn.utils.rnn import pad_sequence

-from torch.nn import RMSNorm
+from .general_modules import RMSNorm
 from ..core.attention import attention_forward
 from ..core.device.npu_compatible_device import IS_NPU_AVAILABLE, get_device_type
 from ..core.gradient import gradient_checkpoint_forward
--- a/diffsynth/models/z_image_text_encoder.py
+++ b/diffsynth/models/z_image_text_encoder.py
@@ -3,38 +3,71 @@ import torch


 class ZImageTextEncoder(torch.nn.Module):
-    def __init__(self):
+    def __init__(self, model_size="4B"):
        super().__init__()
-        config = Qwen3Config(**{
-            "architectures": [
-                "Qwen3ForCausalLM"
-            ],
-            "attention_bias": False,
-            "attention_dropout": 0.0,
-            "bos_token_id": 151643,
-            "eos_token_id": 151645,
-            "head_dim": 128,
-            "hidden_act": "silu",
-            "hidden_size": 2560,
-            "initializer_range": 0.02,
-            "intermediate_size": 9728,
-            "max_position_embeddings": 40960,
-            "max_window_layers": 36,
-            "model_type": "qwen3",
-            "num_attention_heads": 32,
-            "num_hidden_layers": 36,
-            "num_key_value_heads": 8,
-            "rms_norm_eps": 1e-06,
-            "rope_scaling": None,
-            "rope_theta": 1000000,
-            "sliding_window": None,
-            "tie_word_embeddings": True,
-            "torch_dtype": "bfloat16",
-            "transformers_version": "4.51.0",
-            "use_cache": True,
-            "use_sliding_window": False,
-            "vocab_size": 151936
-        })
+        config_dict = {
+            "4B": Qwen3Config(**{
+                "architectures": [
+                    "Qwen3ForCausalLM"
+                ],
+                "attention_bias": False,
+                "attention_dropout": 0.0,
+                "bos_token_id": 151643,
+                "eos_token_id": 151645,
+                "head_dim": 128,
+                "hidden_act": "silu",
+                "hidden_size": 2560,
+                "initializer_range": 0.02,
+                "intermediate_size": 9728,
+                "max_position_embeddings": 40960,
+                "max_window_layers": 36,
+                "model_type": "qwen3",
+                "num_attention_heads": 32,
+                "num_hidden_layers": 36,
+                "num_key_value_heads": 8,
+                "rms_norm_eps": 1e-06,
+                "rope_scaling": None,
+                "rope_theta": 1000000,
+                "sliding_window": None,
+                "tie_word_embeddings": True,
+                "torch_dtype": "bfloat16",
+                "transformers_version": "4.51.0",
+                "use_cache": True,
+                "use_sliding_window": False,
+                "vocab_size": 151936
+            }),
+            "8B": Qwen3Config(**{
+                "architectures": [
+                    "Qwen3ForCausalLM"
+                ],
+                "attention_bias": False,
+                "attention_dropout": 0.0,
+                "bos_token_id": 151643,
+                "dtype": "bfloat16",
+                "eos_token_id": 151645,
+                "head_dim": 128,
+                "hidden_act": "silu",
+                "hidden_size": 4096,
+                "initializer_range": 0.02,
+                "intermediate_size": 12288,
+                "max_position_embeddings": 40960,
+                "max_window_layers": 36,
+                "model_type": "qwen3",
+                "num_attention_heads": 32,
+                "num_hidden_layers": 36,
+                "num_key_value_heads": 8,
+                "rms_norm_eps": 1e-06,
+                "rope_scaling": None,
+                "rope_theta": 1000000,
+                "sliding_window": None,
+                "tie_word_embeddings": False,
+                "transformers_version": "4.56.1",
+                "use_cache": True,
+                "use_sliding_window": False,
+                "vocab_size": 151936
+            })
+        }
+        config = config_dict[model_size]
        self.model = Qwen3Model(config)
    
    def forward(self, *args, **kwargs):
--- a/diffsynth/pipelines/flux2_image.py
+++ b/diffsynth/pipelines/flux2_image.py
@@ -11,10 +11,11 @@ from ..diffusion import FlowMatchScheduler
 from ..core import ModelConfig, gradient_checkpoint_forward
 from ..diffusion.base_pipeline import BasePipeline, PipelineUnit, ControlNetInput

-from transformers import AutoProcessor
+from transformers import AutoProcessor, AutoTokenizer
 from ..models.flux2_text_encoder import Flux2TextEncoder
 from ..models.flux2_dit import Flux2DiT
 from ..models.flux2_vae import Flux2VAE
+from ..models.z_image_text_encoder import ZImageTextEncoder


 class Flux2ImagePipeline(BasePipeline):
@@ -26,6 +27,7 @@ class Flux2ImagePipeline(BasePipeline):
        )
        self.scheduler = FlowMatchScheduler("FLUX.2")
        self.text_encoder: Flux2TextEncoder = None
+        self.text_encoder_qwen3: ZImageTextEncoder = None
        self.dit: Flux2DiT = None
        self.vae: Flux2VAE = None
        self.tokenizer: AutoProcessor = None
@@ -33,6 +35,7 @@ class Flux2ImagePipeline(BasePipeline):
        self.units = [
            Flux2Unit_ShapeChecker(),
            Flux2Unit_PromptEmbedder(),
+            Flux2Unit_Qwen3PromptEmbedder(),
            Flux2Unit_NoiseInitializer(),
            Flux2Unit_InputImageEmbedder(),
            Flux2Unit_ImageIDs(),
@@ -54,11 +57,12 @@ class Flux2ImagePipeline(BasePipeline):
        
        # Fetch models
        pipe.text_encoder = model_pool.fetch_model("flux2_text_encoder")
+        pipe.text_encoder_qwen3 = model_pool.fetch_model("z_image_text_encoder")
        pipe.dit = model_pool.fetch_model("flux2_dit")
        pipe.vae = model_pool.fetch_model("flux2_vae")
        if tokenizer_config is not None:
            tokenizer_config.download_if_necessary()
-            pipe.tokenizer = AutoProcessor.from_pretrained(tokenizer_config.path)
+            pipe.tokenizer = AutoTokenizer.from_pretrained(tokenizer_config.path)
        
        # VRAM Management
        pipe.vram_management_enabled = pipe.check_vram_management_state()
@@ -276,6 +280,10 @@ class Flux2Unit_PromptEmbedder(PipelineUnit):
        return prompt_embeds, text_ids

    def process(self, pipe: Flux2ImagePipeline, prompt):
+        # Skip if Qwen3 text encoder is available (handled by Qwen3PromptEmbedder)
+        if pipe.text_encoder_qwen3 is not None:
+            return {}
+        
        pipe.load_models_to_device(self.onload_model_names)
        prompt_embeds, text_ids = self.encode_prompt(
            pipe.text_encoder, pipe.tokenizer, prompt,
@@ -284,6 +292,136 @@ class Flux2Unit_PromptEmbedder(PipelineUnit):
        return {"prompt_embeds": prompt_embeds, "text_ids": text_ids}


+class Flux2Unit_Qwen3PromptEmbedder(PipelineUnit):
+    def __init__(self):
+        super().__init__(
+            seperate_cfg=True,
+            input_params_posi={"prompt": "prompt"},
+            input_params_nega={"prompt": "negative_prompt"},
+            output_params=("prompt_emb", "prompt_emb_mask"),
+            onload_model_names=("text_encoder_qwen3",)
+        )
+        self.hidden_states_layers = (9, 18, 27)  # Qwen3 layers
+
+    def get_qwen3_prompt_embeds(
+        self,
+        text_encoder: ZImageTextEncoder,
+        tokenizer: AutoTokenizer,
+        prompt: Union[str, List[str]],
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        max_sequence_length: int = 512,
+    ):
+        dtype = text_encoder.dtype if dtype is None else dtype
+        device = text_encoder.device if device is None else device
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        all_input_ids = []
+        all_attention_masks = []
+
+        for single_prompt in prompt:
+            messages = [{"role": "user", "content": single_prompt}]
+            text = tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True,
+                enable_thinking=False,
+            )
+            inputs = tokenizer(
+                text,
+                return_tensors="pt",
+                padding="max_length",
+                truncation=True,
+                max_length=max_sequence_length,
+            )
+
+            all_input_ids.append(inputs["input_ids"])
+            all_attention_masks.append(inputs["attention_mask"])
+
+        input_ids = torch.cat(all_input_ids, dim=0).to(device)
+        attention_mask = torch.cat(all_attention_masks, dim=0).to(device)
+
+        # Forward pass through the model
+        with torch.inference_mode():
+            output = text_encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                output_hidden_states=True,
+                use_cache=False,
+            )
+
+        # Only use outputs from intermediate layers and stack them
+        out = torch.stack([output.hidden_states[k] for k in self.hidden_states_layers], dim=1)
+        out = out.to(dtype=dtype, device=device)
+
+        batch_size, num_channels, seq_len, hidden_dim = out.shape
+        prompt_embeds = out.permute(0, 2, 1, 3).reshape(batch_size, seq_len, num_channels * hidden_dim)
+        return prompt_embeds
+
+    def prepare_text_ids(
+        self,
+        x: torch.Tensor,  # (B, L, D) or (L, D)
+        t_coord: Optional[torch.Tensor] = None,
+    ):
+        B, L, _ = x.shape
+        out_ids = []
+
+        for i in range(B):
+            t = torch.arange(1) if t_coord is None else t_coord[i]
+            h = torch.arange(1)
+            w = torch.arange(1)
+            l = torch.arange(L)
+
+            coords = torch.cartesian_prod(t, h, w, l)
+            out_ids.append(coords)
+
+        return torch.stack(out_ids)
+
+    def encode_prompt(
+        self,
+        text_encoder: ZImageTextEncoder,
+        tokenizer: AutoTokenizer,
+        prompt: Union[str, List[str]],
+        dtype = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        max_sequence_length: int = 512,
+    ):
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt_embeds is None:
+            prompt_embeds = self.get_qwen3_prompt_embeds(
+                text_encoder=text_encoder,
+                tokenizer=tokenizer,
+                prompt=prompt,
+                dtype=dtype,
+                device=device,
+                max_sequence_length=max_sequence_length,
+            )
+
+        batch_size, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        text_ids = self.prepare_text_ids(prompt_embeds)
+        text_ids = text_ids.to(device)
+        return prompt_embeds, text_ids
+
+    def process(self, pipe: Flux2ImagePipeline, prompt):
+        # Check if Qwen3 text encoder is available
+        if pipe.text_encoder_qwen3 is None:
+            return {}
+        
+        pipe.load_models_to_device(self.onload_model_names)
+        prompt_embeds, text_ids = self.encode_prompt(
+            pipe.text_encoder_qwen3, pipe.tokenizer, prompt,
+            dtype=pipe.torch_dtype, device=pipe.device,
+        )
+        return {"prompt_embeds": prompt_embeds, "text_ids": text_ids}
+
+
 class Flux2Unit_NoiseInitializer(PipelineUnit):
    def __init__(self):
        super().__init__(
--- a/diffsynth/pipelines/wan_video.py
+++ b/diffsynth/pipelines/wan_video.py
@@ -123,11 +123,15 @@ class WanVideoPipeline(BasePipeline):
                    model_config.model_id = redirect_dict[model_config.origin_file_pattern][0]
                    model_config.origin_file_pattern = redirect_dict[model_config.origin_file_pattern][1]
        
-        # Initialize pipeline
-        pipe = WanVideoPipeline(device=device, torch_dtype=torch_dtype)
        if use_usp:
            from ..utils.xfuser import initialize_usp
            initialize_usp(device)
+            import torch.distributed as dist
+            from ..core.device.npu_compatible_device import get_device_name
+            if dist.is_available() and dist.is_initialized():
+                device = get_device_name()
+        # Initialize pipeline
+        pipe = WanVideoPipeline(device=device, torch_dtype=torch_dtype)
        model_pool = pipe.download_and_load_models(model_configs, vram_limit)
        
        # Fetch models
--- a/diffsynth/utils/state_dict_converters/flux_dit.py
+++ b/diffsynth/utils/state_dict_converters/flux_dit.py
@@ -143,6 +143,8 @@ def FluxDiTStateDictConverterFromDiffusers(state_dict):
            suffix = ".weight" if name.endswith(".weight") else ".bias"
            prefix = name[:-len(suffix)]
            if prefix in global_rename_dict:
+                if global_rename_dict[prefix] == "final_norm_out.linear":
+                    param = torch.concat([param[3072:], param[:3072]], dim=0)
                state_dict_[global_rename_dict[prefix] + suffix] = param
            elif prefix.startswith("transformer_blocks."):
                names = prefix.split(".")
--- a/diffsynth/utils/state_dict_converters/z_image_text_encoder.py
+++ b/diffsynth/utils/state_dict_converters/z_image_text_encoder.py
@@ -0,0 +1,6 @@
+def ZImageTextEncoderStateDictConverter(state_dict):
+    state_dict_ = {}
+    for name in state_dict:
+        if name != "lm_head.weight":
+            state_dict_[name] = state_dict[name]
+    return state_dict_
--- a/diffsynth/utils/xfuser/xdit_context_parallel.py
+++ b/diffsynth/utils/xfuser/xdit_context_parallel.py
@@ -50,7 +50,7 @@ def rope_apply(x, freqs, num_heads):
    sp_rank = get_sequence_parallel_rank()
    freqs = pad_freqs(freqs, s_per_rank * sp_size)
    freqs_rank = freqs[(sp_rank * s_per_rank):((sp_rank + 1) * s_per_rank), :, :]
-
+    freqs_rank = freqs_rank.to(torch.complex64) if freqs_rank.device == "npu" else freqs_rank
    x_out = torch.view_as_real(x_out * freqs_rank).flatten(2)
    return x_out.to(x.dtype)