Merge branch 'main' into ltx-2

2026-03-18 22:08:13 +00:00 · 2026-02-03 13:06:44 +08:00
parent 49bc84f78e a781138413
commit 25c3a3d3e2
34 changed files with 2132 additions and 37 deletions
--- a/diffsynth/configs/model_configs.py
+++ b/diffsynth/configs/model_configs.py
@@ -589,6 +589,14 @@ z_image_series = [
        "model_class": "diffsynth.models.z_image_image2lora.ZImageImage2LoRAModel",
        "extra_kwargs": {"compress_dim": 128},
    },
+    {
+        # Example: ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="model.safetensors")
+        "model_hash": "1392adecee344136041e70553f875f31",
+        "model_name": "z_image_text_encoder",
+        "model_class": "diffsynth.models.z_image_text_encoder.ZImageTextEncoder",
+        "extra_kwargs": {"model_size": "0.6B"},
+        "state_dict_converter": "diffsynth.utils.state_dict_converters.z_image_text_encoder.ZImageTextEncoderStateDictConverter",
+    },
 ]

 ltx2_series = [
--- a/diffsynth/core/loader/config.py
+++ b/diffsynth/core/loader/config.py
@@ -1,5 +1,5 @@
 import torch, glob, os
-from typing import Optional, Union
+from typing import Optional, Union, Dict
 from dataclasses import dataclass
 from modelscope import snapshot_download
 from huggingface_hub import snapshot_download as hf_snapshot_download
@@ -23,13 +23,14 @@ class ModelConfig:
    computation_device: Optional[Union[str, torch.device]] = None
    computation_dtype: Optional[torch.dtype] = None
    clear_parameters: bool = False
+    state_dict: Dict[str, torch.Tensor] = None
    
    def check_input(self):
        if self.path is None and self.model_id is None:
            raise ValueError(f"""No valid model files. Please use `ModelConfig(path="xxx")` or `ModelConfig(model_id="xxx/yyy", origin_file_pattern="zzz")`. `skip_download=True` only supports the first one.""")
    
    def parse_original_file_pattern(self):
-        if self.origin_file_pattern is None or self.origin_file_pattern == "":
+        if self.origin_file_pattern in [None, "", "./"]:
            return "*"
        elif self.origin_file_pattern.endswith("/"):
            return self.origin_file_pattern + "*"
@@ -98,7 +99,7 @@ class ModelConfig:
        if self.require_downloading():
            self.download()
        if self.path is None:
-            if self.origin_file_pattern is None or self.origin_file_pattern == "":
+            if self.origin_file_pattern in [None, "", "./"]:
                self.path = os.path.join(self.local_model_path, self.model_id)
            else:
                self.path = glob.glob(os.path.join(self.local_model_path, self.model_id, self.origin_file_pattern))
--- a/diffsynth/core/loader/file.py
+++ b/diffsynth/core/loader/file.py
@@ -2,16 +2,25 @@ from safetensors import safe_open
 import torch, hashlib


-def load_state_dict(file_path, torch_dtype=None, device="cpu"):
+def load_state_dict(file_path, torch_dtype=None, device="cpu", pin_memory=False, verbose=0):
    if isinstance(file_path, list):
        state_dict = {}
        for file_path_ in file_path:
-            state_dict.update(load_state_dict(file_path_, torch_dtype, device))
-        return state_dict
-    if file_path.endswith(".safetensors"):
-        return load_state_dict_from_safetensors(file_path, torch_dtype=torch_dtype, device=device)
+            state_dict.update(load_state_dict(file_path_, torch_dtype, device, pin_memory=pin_memory, verbose=verbose))
    else:
-        return load_state_dict_from_bin(file_path, torch_dtype=torch_dtype, device=device)
+        if verbose >= 1:
+            print(f"Loading file [started]: {file_path}")
+        if file_path.endswith(".safetensors"):
+            state_dict = load_state_dict_from_safetensors(file_path, torch_dtype=torch_dtype, device=device)
+        else:
+            state_dict = load_state_dict_from_bin(file_path, torch_dtype=torch_dtype, device=device)
+        # If load state dict in CPU memory, `pin_memory=True` will make `model.to("cuda")` faster.
+        if pin_memory:
+            for i in state_dict:
+                state_dict[i] = state_dict[i].pin_memory()
+        if verbose >= 1:
+            print(f"Loading file [done]: {file_path}")
+    return state_dict


 def load_state_dict_from_safetensors(file_path, torch_dtype=None, device="cpu"):
--- a/diffsynth/core/loader/model.py
+++ b/diffsynth/core/loader/model.py
@@ -5,7 +5,7 @@ from .file import load_state_dict
 import torch


-def load_model(model_class, path, config=None, torch_dtype=torch.bfloat16, device="cpu", state_dict_converter=None, use_disk_map=False, module_map=None, vram_config=None, vram_limit=None):
+def load_model(model_class, path, config=None, torch_dtype=torch.bfloat16, device="cpu", state_dict_converter=None, use_disk_map=False, module_map=None, vram_config=None, vram_limit=None, state_dict=None):
    config = {} if config is None else config
    # Why do we use `skip_model_initialization`?
    # It skips the random initialization of model parameters,
@@ -20,7 +20,7 @@ def load_model(model_class, path, config=None, torch_dtype=torch.bfloat16, devic
        dtypes = [vram_config["offload_dtype"], vram_config["onload_dtype"], vram_config["preparing_dtype"], vram_config["computation_dtype"]]
        dtype = [d for d in dtypes if d != "disk"][0]
        if vram_config["offload_device"] != "disk":
-            state_dict = DiskMap(path, device, torch_dtype=dtype)
+            if state_dict is None: state_dict = DiskMap(path, device, torch_dtype=dtype)
            if state_dict_converter is not None:
                state_dict = state_dict_converter(state_dict)
            else:
@@ -35,7 +35,9 @@ def load_model(model_class, path, config=None, torch_dtype=torch.bfloat16, devic
        # Sometimes a model file contains multiple models,
        # and DiskMap can load only the parameters of a single model,
        # avoiding the need to load all parameters in the file.
-        if use_disk_map:
+        if state_dict is not None:
+            pass
+        elif use_disk_map:
            state_dict = DiskMap(path, device, torch_dtype=torch_dtype)
        else:
            state_dict = load_state_dict(path, torch_dtype, device)
--- a/diffsynth/diffusion/base_pipeline.py
+++ b/diffsynth/diffusion/base_pipeline.py
@@ -296,6 +296,7 @@ class BasePipeline(torch.nn.Module):
                vram_config=vram_config,
                vram_limit=vram_limit,
                clear_parameters=model_config.clear_parameters,
+                state_dict=model_config.state_dict,
            )
        return model_pool
    
--- a/diffsynth/diffusion/flow_match.py
+++ b/diffsynth/diffusion/flow_match.py
@@ -4,7 +4,7 @@ from typing_extensions import Literal

 class FlowMatchScheduler():

-    def __init__(self, template: Literal["FLUX.1", "Wan", "Qwen-Image", "FLUX.2", "Z-Image", "LTX-2"] = "FLUX.1"):
+    def __init__(self, template: Literal["FLUX.1", "Wan", "Qwen-Image", "FLUX.2", "Z-Image", "LTX-2", "Qwen-Image-Lightning"] = "FLUX.1"):
        self.set_timesteps_fn = {
            "FLUX.1": FlowMatchScheduler.set_timesteps_flux,
            "Wan": FlowMatchScheduler.set_timesteps_wan,
@@ -12,6 +12,7 @@ class FlowMatchScheduler():
            "FLUX.2": FlowMatchScheduler.set_timesteps_flux2,
            "Z-Image": FlowMatchScheduler.set_timesteps_z_image,
            "LTX-2": FlowMatchScheduler.set_timesteps_ltx2,
+            "Qwen-Image-Lightning": FlowMatchScheduler.set_timesteps_qwen_image_lightning,
        }.get(template, FlowMatchScheduler.set_timesteps_flux)
        self.num_train_timesteps = 1000

@@ -71,6 +72,28 @@ class FlowMatchScheduler():
        timesteps = sigmas * num_train_timesteps
        return sigmas, timesteps
    
+    @staticmethod
+    def set_timesteps_qwen_image_lightning(num_inference_steps=100, denoising_strength=1.0, exponential_shift_mu=None, dynamic_shift_len=None):
+        sigma_min = 0.0
+        sigma_max = 1.0
+        num_train_timesteps = 1000
+        base_shift = math.log(3)
+        max_shift = math.log(3)
+        # Sigmas
+        sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength
+        sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps + 1)[:-1]
+        # Mu
+        if exponential_shift_mu is not None:
+            mu = exponential_shift_mu
+        elif dynamic_shift_len is not None:
+            mu = FlowMatchScheduler._calculate_shift_qwen_image(dynamic_shift_len, base_shift=base_shift, max_shift=max_shift)
+        else:
+            mu = 0.8
+        sigmas = math.exp(mu) / (math.exp(mu) + (1 / sigmas - 1))
+        # Timesteps
+        timesteps = sigmas * num_train_timesteps
+        return sigmas, timesteps
+    
    @staticmethod
    def compute_empirical_mu(image_seq_len, num_steps):
        a1, b1 = 8.73809524e-05, 1.89833333
--- a/diffsynth/diffusion/loss.py
+++ b/diffsynth/diffusion/loss.py
@@ -13,9 +13,16 @@ def FlowMatchSFTLoss(pipe: BasePipeline, **inputs):
    inputs["latents"] = pipe.scheduler.add_noise(inputs["input_latents"], noise, timestep)
    training_target = pipe.scheduler.training_target(inputs["input_latents"], noise, timestep)
    
+    if "first_frame_latents" in inputs:
+        inputs["latents"][:, :, 0:1] = inputs["first_frame_latents"]
+    
    models = {name: getattr(pipe, name) for name in pipe.in_iteration_models}
    noise_pred = pipe.model_fn(**models, **inputs, timestep=timestep)
    
+    if "first_frame_latents" in inputs:
+        noise_pred = noise_pred[:, :, 1:]
+        training_target = training_target[:, :, 1:]
+    
    loss = torch.nn.functional.mse_loss(noise_pred.float(), training_target.float())
    loss = loss * pipe.scheduler.training_weight(timestep)
    return loss
--- a/diffsynth/models/model_loader.py
+++ b/diffsynth/models/model_loader.py
@@ -29,7 +29,7 @@ class ModelPool:
            module_map = None
        return module_map
    
-    def load_model_file(self, config, path, vram_config, vram_limit=None):
+    def load_model_file(self, config, path, vram_config, vram_limit=None, state_dict=None):
        model_class = self.import_model_class(config["model_class"])
        model_config = config.get("extra_kwargs", {})
        if "state_dict_converter" in config:
@@ -43,6 +43,7 @@ class ModelPool:
            state_dict_converter,
            use_disk_map=True,
            vram_config=vram_config, module_map=module_map, vram_limit=vram_limit,
+            state_dict=state_dict,
        )
        return model
    
@@ -59,7 +60,7 @@ class ModelPool:
        }
        return vram_config
    
-    def auto_load_model(self, path, vram_config=None, vram_limit=None, clear_parameters=False):
+    def auto_load_model(self, path, vram_config=None, vram_limit=None, clear_parameters=False, state_dict=None):
        print(f"Loading models from: {json.dumps(path, indent=4)}")
        if vram_config is None:
            vram_config = self.default_vram_config()
@@ -67,7 +68,7 @@ class ModelPool:
        loaded = False
        for config in MODEL_CONFIGS:
            if config["model_hash"] == model_hash:
-                model = self.load_model_file(config, path, vram_config, vram_limit=vram_limit)
+                model = self.load_model_file(config, path, vram_config, vram_limit=vram_limit, state_dict=state_dict)
                if clear_parameters: self.clear_parameters(model)
                self.model.append(model)
                model_name = config["model_name"]
--- a/diffsynth/models/z_image_text_encoder.py
+++ b/diffsynth/models/z_image_text_encoder.py
@@ -6,6 +6,36 @@ class ZImageTextEncoder(torch.nn.Module):
    def __init__(self, model_size="4B"):
        super().__init__()
        config_dict = {
+            "0.6B": Qwen3Config(**{
+                "architectures": [
+                    "Qwen3ForCausalLM"
+                ],
+                "attention_bias": False,
+                "attention_dropout": 0.0,
+                "bos_token_id": 151643,
+                "eos_token_id": 151645,
+                "head_dim": 128,
+                "hidden_act": "silu",
+                "hidden_size": 1024,
+                "initializer_range": 0.02,
+                "intermediate_size": 3072,
+                "max_position_embeddings": 40960,
+                "max_window_layers": 28,
+                "model_type": "qwen3",
+                "num_attention_heads": 16,
+                "num_hidden_layers": 28,
+                "num_key_value_heads": 8,
+                "rms_norm_eps": 1e-06,
+                "rope_scaling": None,
+                "rope_theta": 1000000,
+                "sliding_window": None,
+                "tie_word_embeddings": True,
+                "torch_dtype": "bfloat16",
+                "transformers_version": "4.51.0",
+                "use_cache": True,
+                "use_sliding_window": False,
+                "vocab_size": 151936
+            }),
            "4B": Qwen3Config(**{
                "architectures": [
                    "Qwen3ForCausalLM"