mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-03-18 22:08:13 +00:00
Merge branch 'dev' into dev-dzj
This commit is contained in:
@@ -94,6 +94,7 @@ model_loader_configs = [
|
||||
(None, "51aed3d27d482fceb5e0739b03060e8f", ["sd3_dit", "sd3_vae_encoder", "sd3_vae_decoder"], [SD3DiT, SD3VAEEncoder, SD3VAEDecoder], "civitai"),
|
||||
(None, "98cc34ccc5b54ae0e56bdea8688dcd5a", ["sd3_text_encoder_2"], [SD3TextEncoder2], "civitai"),
|
||||
(None, "77ff18050dbc23f50382e45d51a779fe", ["sd3_dit", "sd3_vae_encoder", "sd3_vae_decoder"], [SD3DiT, SD3VAEEncoder, SD3VAEDecoder], "civitai"),
|
||||
(None, "5da81baee73198a7c19e6d2fe8b5148e", ["sd3_text_encoder_1"], [SD3TextEncoder1], "diffusers"),
|
||||
(None, "b9588f02e78f5ccafc9d7c0294e46308", ["hunyuan_video_dit"], [HunyuanVideoDiT], "civitai"),
|
||||
]
|
||||
huggingface_model_loader_configs = [
|
||||
@@ -103,10 +104,11 @@ huggingface_model_loader_configs = [
|
||||
("MarianMTModel", "transformers.models.marian.modeling_marian", "translator", None),
|
||||
("BloomForCausalLM", "transformers.models.bloom.modeling_bloom", "beautiful_prompt", None),
|
||||
("Qwen2ForCausalLM", "transformers.models.qwen2.modeling_qwen2", "qwen_prompt", None),
|
||||
("LlamaForCausalLM", "transformers.models.llama.modeling_llama", "omost_prompt", None),
|
||||
# ("LlamaForCausalLM", "transformers.models.llama.modeling_llama", "omost_prompt", None),
|
||||
("T5EncoderModel", "diffsynth.models.flux_text_encoder", "flux_text_encoder_2", "FluxTextEncoder2"),
|
||||
("CogVideoXTransformer3DModel", "diffsynth.models.cog_dit", "cog_dit", "CogDiT"),
|
||||
("SiglipModel", "transformers.models.siglip.modeling_siglip", "siglip_vision_model", "SiglipVisionModel")
|
||||
("SiglipModel", "transformers.models.siglip.modeling_siglip", "siglip_vision_model", "SiglipVisionModel"),
|
||||
("LlamaForCausalLM", "transformers.models.llama.modeling_llama", "hunyuan_video_text_encoder_2", "LlamaModel")
|
||||
]
|
||||
patch_model_loader_configs = [
|
||||
# These configs are provided for detecting model type automatically.
|
||||
@@ -629,6 +631,22 @@ preset_models_on_modelscope = {
|
||||
("AI-ModelScope/stable-diffusion-3.5-large", "text_encoders/clip_g.safetensors", "models/stable_diffusion_3/text_encoders"),
|
||||
("AI-ModelScope/stable-diffusion-3.5-large", "text_encoders/t5xxl_fp16.safetensors", "models/stable_diffusion_3/text_encoders"),
|
||||
],
|
||||
"HunyuanVideo":{
|
||||
"file_list": [
|
||||
("AI-ModelScope/clip-vit-large-patch14", "model.safetensors", "models/HunyuanVideo/text_encoder"),
|
||||
("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model-00001-of-00004.safetensors", "models/HunyuanVideo/text_encoder_2"),
|
||||
("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model-00002-of-00004.safetensors", "models/HunyuanVideo/text_encoder_2"),
|
||||
("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model-00003-of-00004.safetensors", "models/HunyuanVideo/text_encoder_2"),
|
||||
("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model-00004-of-00004.safetensors", "models/HunyuanVideo/text_encoder_2"),
|
||||
("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "config.json", "models/HunyuanVideo/text_encoder_2"),
|
||||
("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model.safetensors.index.json", "models/HunyuanVideo/text_encoder_2"),
|
||||
|
||||
],
|
||||
"load_path": [
|
||||
"models/HunyuanVideo/text_encoder/model.safetensors",
|
||||
"models/HunyuanVideo/text_encoder_2",
|
||||
],
|
||||
},
|
||||
}
|
||||
Preset_model_id: TypeAlias = Literal[
|
||||
"HunyuanDiT",
|
||||
@@ -684,4 +702,5 @@ Preset_model_id: TypeAlias = Literal[
|
||||
"Annotators:Openpose",
|
||||
"StableDiffusion3.5-large",
|
||||
"StableDiffusion3.5-medium",
|
||||
"HunyuanVideo",
|
||||
]
|
||||
|
||||
@@ -306,6 +306,53 @@ class FluxLoRAConverter:
|
||||
state_dict_[rename.replace("lora_up.weight", "alpha")] = torch.tensor((alpha,))[0]
|
||||
return state_dict_
|
||||
|
||||
@staticmethod
def align_to_diffsynth_format(state_dict):
    """Convert a kohya-style Flux LoRA state dict to DiffSynth's naming scheme.

    Keys are matched by replacing their numeric block index with the literal
    token ``blockid`` and looking the result up in a rename table; the index
    is then substituted back into the mapped name. Keys that do not match any
    known pattern are passed through unchanged.
    """
    rename_dict = {
        "lora_unet_double_blocks_blockid_img_mod_lin.lora_down.weight": "blocks.blockid.norm1_a.linear.lora_A.default.weight",
        "lora_unet_double_blocks_blockid_img_mod_lin.lora_up.weight": "blocks.blockid.norm1_a.linear.lora_B.default.weight",
        "lora_unet_double_blocks_blockid_txt_mod_lin.lora_down.weight": "blocks.blockid.norm1_b.linear.lora_A.default.weight",
        "lora_unet_double_blocks_blockid_txt_mod_lin.lora_up.weight": "blocks.blockid.norm1_b.linear.lora_B.default.weight",
        "lora_unet_double_blocks_blockid_img_attn_qkv.lora_down.weight": "blocks.blockid.attn.a_to_qkv.lora_A.default.weight",
        "lora_unet_double_blocks_blockid_img_attn_qkv.lora_up.weight": "blocks.blockid.attn.a_to_qkv.lora_B.default.weight",
        "lora_unet_double_blocks_blockid_txt_attn_qkv.lora_down.weight": "blocks.blockid.attn.b_to_qkv.lora_A.default.weight",
        "lora_unet_double_blocks_blockid_txt_attn_qkv.lora_up.weight": "blocks.blockid.attn.b_to_qkv.lora_B.default.weight",
        "lora_unet_double_blocks_blockid_img_attn_proj.lora_down.weight": "blocks.blockid.attn.a_to_out.lora_A.default.weight",
        "lora_unet_double_blocks_blockid_img_attn_proj.lora_up.weight": "blocks.blockid.attn.a_to_out.lora_B.default.weight",
        "lora_unet_double_blocks_blockid_txt_attn_proj.lora_down.weight": "blocks.blockid.attn.b_to_out.lora_A.default.weight",
        "lora_unet_double_blocks_blockid_txt_attn_proj.lora_up.weight": "blocks.blockid.attn.b_to_out.lora_B.default.weight",
        "lora_unet_double_blocks_blockid_img_mlp_0.lora_down.weight": "blocks.blockid.ff_a.0.lora_A.default.weight",
        "lora_unet_double_blocks_blockid_img_mlp_0.lora_up.weight": "blocks.blockid.ff_a.0.lora_B.default.weight",
        "lora_unet_double_blocks_blockid_img_mlp_2.lora_down.weight": "blocks.blockid.ff_a.2.lora_A.default.weight",
        "lora_unet_double_blocks_blockid_img_mlp_2.lora_up.weight": "blocks.blockid.ff_a.2.lora_B.default.weight",
        "lora_unet_double_blocks_blockid_txt_mlp_0.lora_down.weight": "blocks.blockid.ff_b.0.lora_A.default.weight",
        "lora_unet_double_blocks_blockid_txt_mlp_0.lora_up.weight": "blocks.blockid.ff_b.0.lora_B.default.weight",
        "lora_unet_double_blocks_blockid_txt_mlp_2.lora_down.weight": "blocks.blockid.ff_b.2.lora_A.default.weight",
        "lora_unet_double_blocks_blockid_txt_mlp_2.lora_up.weight": "blocks.blockid.ff_b.2.lora_B.default.weight",
        "lora_unet_single_blocks_blockid_modulation_lin.lora_down.weight": "single_blocks.blockid.norm.linear.lora_A.default.weight",
        "lora_unet_single_blocks_blockid_modulation_lin.lora_up.weight": "single_blocks.blockid.norm.linear.lora_B.default.weight",
        "lora_unet_single_blocks_blockid_linear1.lora_down.weight": "single_blocks.blockid.to_qkv_mlp.lora_A.default.weight",
        "lora_unet_single_blocks_blockid_linear1.lora_up.weight": "single_blocks.blockid.to_qkv_mlp.lora_B.default.weight",
        "lora_unet_single_blocks_blockid_linear2.lora_down.weight": "single_blocks.blockid.proj_out.lora_A.default.weight",
        "lora_unet_single_blocks_blockid_linear2.lora_up.weight": "single_blocks.blockid.proj_out.lora_B.default.weight",
    }

    def locate_block(key):
        # The block index is the first purely numeric "_"-separated token.
        for token in key.split("_"):
            if token.isdigit():
                return token, key.replace(f"_{token}_", "_blockid_")
        return None, None

    converted = {}
    for key, value in state_dict.items():
        block_id, generic_key = locate_block(key)
        mapped = rename_dict.get(generic_key) if generic_key is not None else None
        if mapped is None:
            # No numeric token, or an unrecognized pattern: keep as-is.
            converted[key] = value
        else:
            converted[mapped.replace(".blockid.", f".{block_id}.")] = value
    return converted
|
||||
|
||||
|
||||
def get_lora_loaders():
    """Return the LoRA loader instances, in the order they are tried."""
    loader_types = (SDLoRAFromCivitai, SDXLLoRAFromCivitai, FluxLoRAFromCivitai, GeneralLoRAFromPeft)
    return [loader_type() for loader_type in loader_types]
|
||||
|
||||
@@ -9,4 +9,5 @@ from .flux_image import FluxImagePipeline
|
||||
from .cog_video import CogVideoPipeline
|
||||
from .omnigen_image import OmnigenImagePipeline
|
||||
from .pipeline_runner import SDVideoPipelineRunner
|
||||
from .hunyuan_video import HunyuanVideoPipeline
|
||||
KolorsImagePipeline = SDXLImagePipeline
|
||||
|
||||
51
diffsynth/pipelines/hunyuan_video.py
Normal file
51
diffsynth/pipelines/hunyuan_video.py
Normal file
@@ -0,0 +1,51 @@
|
||||
from ..models import ModelManager, SD3TextEncoder1
|
||||
from .base import BasePipeline
|
||||
from ..prompters import HunyuanVideoPrompter
|
||||
import torch
|
||||
from transformers import LlamaModel
|
||||
from tqdm import tqdm
|
||||
|
||||
class HunyuanVideoPipeline(BasePipeline):
    """HunyuanVideo text-to-video pipeline (work in progress).

    Only prompt encoding is implemented so far: ``__call__`` returns the
    positive prompt embeddings rather than generated video frames.
    """

    def __init__(self, device="cuda", torch_dtype=torch.float16):
        super().__init__(device=device, torch_dtype=torch_dtype)
        # Following diffsynth's ordering, text_encoder_1 is CLIP and
        # text_encoder_2 is the LLM -- the opposite of the naming used in the
        # upstream HunyuanVideo code base.
        self.prompter = HunyuanVideoPrompter()
        # Encoders are attached later via fetch_models().
        self.text_encoder_1: SD3TextEncoder1 = None
        self.text_encoder_2: LlamaModel = None

    def fetch_models(self, model_manager: ModelManager):
        """Pull the two text encoders from `model_manager` and wire them into the prompter."""
        self.text_encoder_1 = model_manager.fetch_model("sd3_text_encoder_1")
        self.text_encoder_2 = model_manager.fetch_model("hunyuan_video_text_encoder_2")
        self.prompter.fetch_models(self.text_encoder_1, self.text_encoder_2)

    @staticmethod
    def from_model_manager(model_manager: ModelManager, device=None):
        """Build a pipeline on `device` (defaults to the manager's device) with models fetched."""
        pipe = HunyuanVideoPipeline(
            device=model_manager.device if device is None else device,
            torch_dtype=model_manager.torch_dtype,
        )
        pipe.fetch_models(model_manager)
        return pipe

    def encode_prompt(self, prompt, positive=True, clip_sequence_length=77, llm_sequence_length=256):
        """Encode `prompt`; returns a dict with LLM embeddings and CLIP pooled embeddings."""
        prompt_emb, pooled_prompt_emb = self.prompter.encode_prompt(
            prompt, device=self.device, positive=positive, clip_sequence_length=clip_sequence_length, llm_sequence_length=llm_sequence_length
        )
        return {"prompt_emb": prompt_emb, "pooled_prompt_emb": pooled_prompt_emb}

    @torch.no_grad()
    def __call__(
        self,
        prompt,
        negative_prompt="",
        seed=None,
        progress_bar_cmd=tqdm,
        progress_bar_st=None,
    ):
        # NOTE(review): denoising is not implemented yet; `negative_prompt`,
        # `seed` and the progress-bar arguments are currently unused.
        # A stray no-op `pass` statement that preceded this code was removed.
        prompt_emb_posi = self.encode_prompt(prompt, positive=True)
        return prompt_emb_posi
|
||||
@@ -7,3 +7,4 @@ from .kolors_prompter import KolorsPrompter
|
||||
from .flux_prompter import FluxPrompter
|
||||
from .omost import OmostPromter
|
||||
from .cog_prompter import CogPrompter
|
||||
from .hunyuan_video_prompter import HunyuanVideoPrompter
|
||||
|
||||
149
diffsynth/prompters/hunyuan_video_prompter.py
Normal file
149
diffsynth/prompters/hunyuan_video_prompter.py
Normal file
@@ -0,0 +1,149 @@
|
||||
from .base_prompter import BasePrompter
|
||||
from ..models.sd3_text_encoder import SD3TextEncoder1
|
||||
from transformers import CLIPTokenizer, LlamaTokenizerFast, LlamaModel
|
||||
import os, torch
|
||||
|
||||
# Chat-style template wrapped around the user prompt before it is sent to the
# LLaMA text encoder (image-description variant).
PROMPT_TEMPLATE_ENCODE = (
    "<|start_header_id|>system<|end_header_id|>\n\nDescribe the image by detailing the color, shape, size, texture, "
    "quantity, text, spatial relationships of the objects and background:<|eot_id|>"
    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>")

# Video-description variant of the template above.
PROMPT_TEMPLATE_ENCODE_VIDEO = (
    "<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: "
    "1. The main content and theme of the video."
    "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
    "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
    "4. background environment, light, style and atmosphere."
    "5. camera angles, movements, and transitions used in the video:<|eot_id|>"
    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>")

# Maps template name -> template string plus `crop_start`: the number of leading
# tokens cropped from the LLM hidden states so only the user prompt remains.
# The values 36 and 95 presumably equal the tokenized length of each system
# prefix -- TODO confirm against the tokenizer.
PROMPT_TEMPLATE = {
    "dit-llm-encode": {
        "template": PROMPT_TEMPLATE_ENCODE,
        "crop_start": 36,
    },
    "dit-llm-encode-video": {
        "template": PROMPT_TEMPLATE_ENCODE_VIDEO,
        "crop_start": 95,
    },
}

# Default negative prompt used when the caller does not supply one.
NEGATIVE_PROMPT = "Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion"
|
||||
|
||||
|
||||
class HunyuanVideoPrompter(BasePrompter):
    """Prompter for HunyuanVideo.

    Encodes text twice: with a CLIP text encoder (pooled embedding) and with a
    LLaMA LLM (per-token hidden states, system-prefix tokens cropped off).
    """

    def __init__(
        self,
        tokenizer_1_path=None,
        tokenizer_2_path=None,
    ):
        # Default to the tokenizer configs shipped inside the package.
        if tokenizer_1_path is None:
            base_path = os.path.dirname(os.path.dirname(__file__))
            tokenizer_1_path = os.path.join(
                base_path, "tokenizer_configs/hunyuan_video/tokenizer_1")
        if tokenizer_2_path is None:
            base_path = os.path.dirname(os.path.dirname(__file__))
            tokenizer_2_path = os.path.join(
                base_path, "tokenizer_configs/hunyuan_video/tokenizer_2")
        super().__init__()
        self.tokenizer_1 = CLIPTokenizer.from_pretrained(tokenizer_1_path)
        self.tokenizer_2 = LlamaTokenizerFast.from_pretrained(tokenizer_2_path, padding_side='right')
        # Encoders are injected later via fetch_models().
        self.text_encoder_1: SD3TextEncoder1 = None
        self.text_encoder_2: LlamaModel = None

        self.prompt_template = PROMPT_TEMPLATE['dit-llm-encode']
        self.prompt_template_video = PROMPT_TEMPLATE['dit-llm-encode-video']

    def fetch_models(self, text_encoder_1: SD3TextEncoder1 = None, text_encoder_2: LlamaModel = None):
        """Attach the CLIP and LLaMA text encoders used by encode_prompt."""
        self.text_encoder_1 = text_encoder_1
        self.text_encoder_2 = text_encoder_2

    def apply_text_to_template(self, text, template):
        """Insert `text` (a str, or a list of str handled element-wise) into `template`."""
        assert isinstance(template, str)
        if isinstance(text, list):
            # Bug fix: the recursive call previously dropped the `template`
            # argument, raising a TypeError for any list input.
            return [self.apply_text_to_template(text_, template) for text_ in text]
        elif isinstance(text, str):
            # Will send string to tokenizer. Used for llm
            return template.format(text)
        else:
            raise TypeError(f"Unsupported prompt type: {type(text)}")

    def encode_prompt_using_clip(self, prompt, max_length, device):
        """Return the first output of the CLIP encoder (pooled embedding) for `prompt`."""
        input_ids = self.tokenizer_1(prompt,
                                     return_tensors="pt",
                                     padding="max_length",
                                     max_length=max_length,
                                     truncation=True).input_ids.to(device)
        return self.text_encoder_1(input_ids=input_ids)[0]

    def encode_prompt_using_llm(self,
                                prompt,
                                max_length,
                                device,
                                crop_start,
                                hidden_state_skip_layer=2,
                                apply_final_norm=False,
                                use_attention_mask=True):
        """Encode `prompt` with the LLaMA encoder and return per-token hidden states.

        `crop_start` leading tokens (the rendered template prefix) are cropped
        from the result. `hidden_state_skip_layer` selects hidden states that
        many layers before the last; when it is None the final hidden state is
        used instead.
        """
        # Budget extra tokens for the template prefix that is cropped later.
        max_length += crop_start
        inputs = self.tokenizer_2(prompt,
                                  return_tensors="pt",
                                  padding="max_length",
                                  max_length=max_length,
                                  truncation=True)
        input_ids = inputs.input_ids.to(device)
        attention_mask = inputs.attention_mask.to(device)
        # Hidden states are only needed when skipping back from the last layer.
        output_hidden_states = hidden_state_skip_layer is not None
        outputs = self.text_encoder_2(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=output_hidden_states)

        if hidden_state_skip_layer is not None:
            last_hidden_state = outputs.hidden_states[-(hidden_state_skip_layer + 1)]
            # Re-apply the model's final norm when an intermediate layer is used.
            if hidden_state_skip_layer > 0 and apply_final_norm:
                last_hidden_state = self.text_encoder_2.norm(last_hidden_state)
        else:
            last_hidden_state = outputs['last_hidden_state']
        # crop out
        if crop_start > 0:
            last_hidden_state = last_hidden_state[:, crop_start:]
            # NOTE(review): the cropped attention mask is computed but never
            # returned or read afterwards -- confirm whether callers need it.
            attention_mask = (attention_mask[:, crop_start:] if use_attention_mask else None)

        return last_hidden_state

    def encode_prompt(self,
                      prompt,
                      positive=True,
                      device="cuda",
                      clip_sequence_length=77,
                      llm_sequence_length=256,
                      data_type='video',
                      use_template=True,
                      hidden_state_skip_layer=2,
                      apply_final_norm=False,
                      use_attention_mask=True):
        """Encode `prompt`; returns (llm_hidden_states, clip_pooled_embedding)."""
        prompt = self.process_prompt(prompt, positive=positive)

        # apply template
        if use_template:
            template = self.prompt_template_video if data_type == 'video' else self.prompt_template
            prompt_formated = self.apply_text_to_template(prompt, template['template'])
        else:
            prompt_formated = prompt
        # Text encoder
        if data_type == 'video':
            crop_start = self.prompt_template_video.get("crop_start", 0)
        else:
            crop_start = self.prompt_template.get("crop_start", 0)

        # CLIP: encode the raw (untemplated) prompt.
        pooled_prompt_emb = self.encode_prompt_using_clip(prompt, clip_sequence_length, device)

        # LLM: encode the templated prompt, cropping off the template prefix.
        prompt_emb = self.encode_prompt_using_llm(
            prompt_formated, llm_sequence_length, device, crop_start,
            hidden_state_skip_layer, apply_final_norm, use_attention_mask)

        return prompt_emb, pooled_prompt_emb
|
||||
48895
diffsynth/tokenizer_configs/hunyuan_video/tokenizer_1/merges.txt
Normal file
48895
diffsynth/tokenizer_configs/hunyuan_video/tokenizer_1/merges.txt
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,30 @@
|
||||
{
|
||||
"bos_token": {
|
||||
"content": "<|startoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": true,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"eos_token": {
|
||||
"content": "<|endoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"pad_token": {
|
||||
"content": "<|endoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"unk_token": {
|
||||
"content": "<|endoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,30 @@
|
||||
{
|
||||
"add_prefix_space": false,
|
||||
"added_tokens_decoder": {
|
||||
"49406": {
|
||||
"content": "<|startoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": true,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"49407": {
|
||||
"content": "<|endoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
}
|
||||
},
|
||||
"bos_token": "<|startoftext|>",
|
||||
"clean_up_tokenization_spaces": true,
|
||||
"do_lower_case": true,
|
||||
"eos_token": "<|endoftext|>",
|
||||
"errors": "replace",
|
||||
"model_max_length": 77,
|
||||
"pad_token": "<|endoftext|>",
|
||||
"tokenizer_class": "CLIPTokenizer",
|
||||
"unk_token": "<|endoftext|>"
|
||||
}
|
||||
49410
diffsynth/tokenizer_configs/hunyuan_video/tokenizer_1/vocab.json
Normal file
49410
diffsynth/tokenizer_configs/hunyuan_video/tokenizer_1/vocab.json
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,30 @@
|
||||
{
|
||||
"bos_token": {
|
||||
"content": "<|begin_of_text|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"eos_token": {
|
||||
"content": "<|end_of_text|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"pad_token": {
|
||||
"content": "<pad>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"unk_token": {
|
||||
"content": "<unk>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
}
|
||||
}
|
||||
1251020
diffsynth/tokenizer_configs/hunyuan_video/tokenizer_2/tokenizer.json
Normal file
1251020
diffsynth/tokenizer_configs/hunyuan_video/tokenizer_2/tokenizer.json
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -3,6 +3,7 @@ from peft import LoraConfig, inject_adapter_in_model
|
||||
import torch, os
|
||||
from ..data.simple_text_image import TextImageDataset
|
||||
from modelscope.hub.api import HubApi
|
||||
from ..models.utils import load_state_dict
|
||||
|
||||
|
||||
|
||||
@@ -33,7 +34,7 @@ class LightningModelForT2ILoRA(pl.LightningModule):
|
||||
self.pipe.denoising_model().train()
|
||||
|
||||
|
||||
def add_lora_to_model(self, model, lora_rank=4, lora_alpha=4, lora_target_modules="to_q,to_k,to_v,to_out", init_lora_weights="gaussian"):
|
||||
def add_lora_to_model(self, model, lora_rank=4, lora_alpha=4, lora_target_modules="to_q,to_k,to_v,to_out", init_lora_weights="gaussian", pretrained_lora_path=None, state_dict_converter=None):
|
||||
# Add LoRA to UNet
|
||||
self.lora_alpha = lora_alpha
|
||||
if init_lora_weights == "kaiming":
|
||||
@@ -51,6 +52,17 @@ class LightningModelForT2ILoRA(pl.LightningModule):
|
||||
if param.requires_grad:
|
||||
param.data = param.to(torch.float32)
|
||||
|
||||
# Lora pretrained lora weights
|
||||
if pretrained_lora_path is not None:
|
||||
state_dict = load_state_dict(pretrained_lora_path)
|
||||
if state_dict_converter is not None:
|
||||
state_dict = state_dict_converter(state_dict)
|
||||
missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
|
||||
all_keys = [i for i, _ in model.named_parameters()]
|
||||
num_updated_keys = len(all_keys) - len(missing_keys)
|
||||
num_unexpected_keys = len(unexpected_keys)
|
||||
print(f"{num_updated_keys} parameters are loaded from {pretrained_lora_path}. {num_unexpected_keys} parameters are unexpected.")
|
||||
|
||||
|
||||
def training_step(self, batch, batch_idx):
|
||||
# Data
|
||||
@@ -229,6 +241,12 @@ def add_general_parsers(parser):
|
||||
default=None,
|
||||
help="Access key on ModelScope (https://www.modelscope.cn/). Required if you want to upload the model to ModelScope.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pretrained_lora_path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Pretrained LoRA path. Required if the training is resumed.",
|
||||
)
|
||||
return parser
|
||||
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ class LightningModel(LightningModelForT2ILoRA):
|
||||
self,
|
||||
torch_dtype=torch.float16, pretrained_weights=[], preset_lora_path=None,
|
||||
learning_rate=1e-4, use_gradient_checkpointing=True,
|
||||
lora_rank=4, lora_alpha=4, lora_target_modules="to_q,to_k,to_v,to_out", init_lora_weights="kaiming",
|
||||
lora_rank=4, lora_alpha=4, lora_target_modules="to_q,to_k,to_v,to_out", init_lora_weights="kaiming", pretrained_lora_path=None,
|
||||
state_dict_converter=None, quantize = None
|
||||
):
|
||||
super().__init__(learning_rate=learning_rate, use_gradient_checkpointing=use_gradient_checkpointing, state_dict_converter=state_dict_converter)
|
||||
@@ -34,7 +34,15 @@ class LightningModel(LightningModelForT2ILoRA):
|
||||
self.pipe.scheduler.set_timesteps(1000, training=True)
|
||||
|
||||
self.freeze_parameters()
|
||||
self.add_lora_to_model(self.pipe.denoising_model(), lora_rank=lora_rank, lora_alpha=lora_alpha, lora_target_modules=lora_target_modules, init_lora_weights=init_lora_weights)
|
||||
self.add_lora_to_model(
|
||||
self.pipe.denoising_model(),
|
||||
lora_rank=lora_rank,
|
||||
lora_alpha=lora_alpha,
|
||||
lora_target_modules=lora_target_modules,
|
||||
init_lora_weights=init_lora_weights,
|
||||
pretrained_lora_path=pretrained_lora_path,
|
||||
state_dict_converter=FluxLoRAConverter.align_to_diffsynth_format
|
||||
)
|
||||
|
||||
|
||||
def parse_args():
|
||||
@@ -109,6 +117,7 @@ if __name__ == '__main__':
|
||||
lora_alpha=args.lora_alpha,
|
||||
lora_target_modules=args.lora_target_modules,
|
||||
init_lora_weights=args.init_lora_weights,
|
||||
pretrained_lora_path=args.pretrained_lora_path,
|
||||
state_dict_converter=FluxLoRAConverter.align_to_opensource_format if args.align_to_opensource_format else None,
|
||||
quantize={"float8_e4m3fn": torch.float8_e4m3fn}.get(args.quantize, None),
|
||||
)
|
||||
|
||||
@@ -9,7 +9,7 @@ class LightningModel(LightningModelForT2ILoRA):
|
||||
self,
|
||||
torch_dtype=torch.float16, pretrained_weights=[],
|
||||
learning_rate=1e-4, use_gradient_checkpointing=True,
|
||||
lora_rank=4, lora_alpha=4, lora_target_modules="to_q,to_k,to_v,to_out", init_lora_weights="gaussian",
|
||||
lora_rank=4, lora_alpha=4, lora_target_modules="to_q,to_k,to_v,to_out", init_lora_weights="gaussian", pretrained_lora_path=None,
|
||||
):
|
||||
super().__init__(learning_rate=learning_rate, use_gradient_checkpointing=use_gradient_checkpointing)
|
||||
# Load models
|
||||
@@ -19,7 +19,14 @@ class LightningModel(LightningModelForT2ILoRA):
|
||||
self.pipe.scheduler.set_timesteps(1000)
|
||||
|
||||
self.freeze_parameters()
|
||||
self.add_lora_to_model(self.pipe.denoising_model(), lora_rank=lora_rank, lora_alpha=lora_alpha, lora_target_modules=lora_target_modules, init_lora_weights=init_lora_weights)
|
||||
self.add_lora_to_model(
|
||||
self.pipe.denoising_model(),
|
||||
lora_rank=lora_rank,
|
||||
lora_alpha=lora_alpha,
|
||||
lora_target_modules=lora_target_modules,
|
||||
init_lora_weights=init_lora_weights,
|
||||
pretrained_lora_path=pretrained_lora_path,
|
||||
)
|
||||
|
||||
|
||||
def parse_args():
|
||||
@@ -57,6 +64,7 @@ if __name__ == '__main__':
|
||||
lora_rank=args.lora_rank,
|
||||
lora_alpha=args.lora_alpha,
|
||||
init_lora_weights=args.init_lora_weights,
|
||||
pretrained_lora_path=args.pretrained_lora_path,
|
||||
lora_target_modules=args.lora_target_modules
|
||||
)
|
||||
launch_training_task(model, args)
|
||||
|
||||
@@ -9,7 +9,7 @@ class LightningModel(LightningModelForT2ILoRA):
|
||||
self,
|
||||
torch_dtype=torch.float16, pretrained_weights=[],
|
||||
learning_rate=1e-4, use_gradient_checkpointing=True,
|
||||
lora_rank=4, lora_alpha=4, lora_target_modules="to_q,to_k,to_v,to_out", init_lora_weights="gaussian",
|
||||
lora_rank=4, lora_alpha=4, lora_target_modules="to_q,to_k,to_v,to_out", init_lora_weights="gaussian", pretrained_lora_path=None,
|
||||
):
|
||||
super().__init__(learning_rate=learning_rate, use_gradient_checkpointing=use_gradient_checkpointing)
|
||||
# Load models
|
||||
@@ -22,7 +22,14 @@ class LightningModel(LightningModelForT2ILoRA):
|
||||
self.pipe.vae_encoder.to(torch_dtype)
|
||||
|
||||
self.freeze_parameters()
|
||||
self.add_lora_to_model(self.pipe.denoising_model(), lora_rank=lora_rank, lora_alpha=lora_alpha, lora_target_modules=lora_target_modules, init_lora_weights=init_lora_weights)
|
||||
self.add_lora_to_model(
|
||||
self.pipe.denoising_model(),
|
||||
lora_rank=lora_rank,
|
||||
lora_alpha=lora_alpha,
|
||||
lora_target_modules=lora_target_modules,
|
||||
init_lora_weights=init_lora_weights,
|
||||
pretrained_lora_path=pretrained_lora_path,
|
||||
)
|
||||
|
||||
|
||||
def parse_args():
|
||||
@@ -73,6 +80,7 @@ if __name__ == '__main__':
|
||||
lora_rank=args.lora_rank,
|
||||
lora_alpha=args.lora_alpha,
|
||||
init_lora_weights=args.init_lora_weights,
|
||||
pretrained_lora_path=args.pretrained_lora_path,
|
||||
lora_target_modules=args.lora_target_modules
|
||||
)
|
||||
launch_training_task(model, args)
|
||||
|
||||
@@ -9,7 +9,7 @@ class LightningModel(LightningModelForT2ILoRA):
|
||||
self,
|
||||
torch_dtype=torch.float16, pretrained_weights=[],
|
||||
learning_rate=1e-4, use_gradient_checkpointing=True,
|
||||
lora_rank=4, lora_alpha=4, lora_target_modules="to_q,to_k,to_v,to_out", init_lora_weights="gaussian",
|
||||
lora_rank=4, lora_alpha=4, lora_target_modules="to_q,to_k,to_v,to_out", init_lora_weights="gaussian", pretrained_lora_path=None,
|
||||
):
|
||||
super().__init__(learning_rate=learning_rate, use_gradient_checkpointing=use_gradient_checkpointing)
|
||||
# Load models
|
||||
@@ -19,7 +19,14 @@ class LightningModel(LightningModelForT2ILoRA):
|
||||
self.pipe.scheduler.set_timesteps(1000)
|
||||
|
||||
self.freeze_parameters()
|
||||
self.add_lora_to_model(self.pipe.denoising_model(), lora_rank=lora_rank, lora_alpha=lora_alpha, lora_target_modules=lora_target_modules, init_lora_weights=init_lora_weights)
|
||||
self.add_lora_to_model(
|
||||
self.pipe.denoising_model(),
|
||||
lora_rank=lora_rank,
|
||||
lora_alpha=lora_alpha,
|
||||
lora_target_modules=lora_target_modules,
|
||||
init_lora_weights=init_lora_weights,
|
||||
pretrained_lora_path=pretrained_lora_path,
|
||||
)
|
||||
|
||||
|
||||
def parse_args():
|
||||
@@ -52,6 +59,7 @@ if __name__ == '__main__':
|
||||
lora_rank=args.lora_rank,
|
||||
lora_alpha=args.lora_alpha,
|
||||
init_lora_weights=args.init_lora_weights,
|
||||
pretrained_lora_path=args.pretrained_lora_path,
|
||||
lora_target_modules=args.lora_target_modules
|
||||
)
|
||||
launch_training_task(model, args)
|
||||
|
||||
@@ -9,7 +9,7 @@ class LightningModel(LightningModelForT2ILoRA):
|
||||
self,
|
||||
torch_dtype=torch.float16, pretrained_weights=[], preset_lora_path=None,
|
||||
learning_rate=1e-4, use_gradient_checkpointing=True,
|
||||
lora_rank=4, lora_alpha=4, lora_target_modules="to_q,to_k,to_v,to_out", init_lora_weights="gaussian",
|
||||
lora_rank=4, lora_alpha=4, lora_target_modules="to_q,to_k,to_v,to_out", init_lora_weights="gaussian", pretrained_lora_path=None,
|
||||
):
|
||||
super().__init__(learning_rate=learning_rate, use_gradient_checkpointing=use_gradient_checkpointing)
|
||||
# Load models
|
||||
@@ -24,7 +24,14 @@ class LightningModel(LightningModelForT2ILoRA):
|
||||
model_manager.load_lora(path)
|
||||
|
||||
self.freeze_parameters()
|
||||
self.add_lora_to_model(self.pipe.denoising_model(), lora_rank=lora_rank, lora_alpha=lora_alpha, lora_target_modules=lora_target_modules, init_lora_weights=init_lora_weights)
|
||||
self.add_lora_to_model(
|
||||
self.pipe.denoising_model(),
|
||||
lora_rank=lora_rank,
|
||||
lora_alpha=lora_alpha,
|
||||
lora_target_modules=lora_target_modules,
|
||||
init_lora_weights=init_lora_weights,
|
||||
pretrained_lora_path=pretrained_lora_path,
|
||||
)
|
||||
|
||||
|
||||
def parse_args():
|
||||
@@ -70,6 +77,7 @@ if __name__ == '__main__':
|
||||
lora_rank=args.lora_rank,
|
||||
lora_alpha=args.lora_alpha,
|
||||
init_lora_weights=args.init_lora_weights,
|
||||
pretrained_lora_path=args.pretrained_lora_path,
|
||||
lora_target_modules=args.lora_target_modules
|
||||
)
|
||||
launch_training_task(model, args)
|
||||
|
||||
@@ -9,7 +9,7 @@ class LightningModel(LightningModelForT2ILoRA):
|
||||
self,
|
||||
torch_dtype=torch.float16, pretrained_weights=[],
|
||||
learning_rate=1e-4, use_gradient_checkpointing=True,
|
||||
lora_rank=4, lora_alpha=4, lora_target_modules="to_q,to_k,to_v,to_out", init_lora_weights="gaussian",
|
||||
lora_rank=4, lora_alpha=4, lora_target_modules="to_q,to_k,to_v,to_out", init_lora_weights="gaussian", pretrained_lora_path=None,
|
||||
):
|
||||
super().__init__(learning_rate=learning_rate, use_gradient_checkpointing=use_gradient_checkpointing)
|
||||
# Load models
|
||||
@@ -19,7 +19,14 @@ class LightningModel(LightningModelForT2ILoRA):
|
||||
self.pipe.scheduler.set_timesteps(1000)
|
||||
|
||||
self.freeze_parameters()
|
||||
self.add_lora_to_model(self.pipe.denoising_model(), lora_rank=lora_rank, lora_alpha=lora_alpha, lora_target_modules=lora_target_modules, init_lora_weights=init_lora_weights)
|
||||
self.add_lora_to_model(
|
||||
self.pipe.denoising_model(),
|
||||
lora_rank=lora_rank,
|
||||
lora_alpha=lora_alpha,
|
||||
lora_target_modules=lora_target_modules,
|
||||
init_lora_weights=init_lora_weights,
|
||||
pretrained_lora_path=pretrained_lora_path,
|
||||
)
|
||||
|
||||
|
||||
def parse_args():
|
||||
@@ -52,6 +59,7 @@ if __name__ == '__main__':
|
||||
lora_rank=args.lora_rank,
|
||||
lora_alpha=args.lora_alpha,
|
||||
init_lora_weights=args.init_lora_weights,
|
||||
pretrained_lora_path=args.pretrained_lora_path,
|
||||
lora_target_modules=args.lora_target_modules
|
||||
)
|
||||
launch_training_task(model, args)
|
||||
|
||||
16
examples/video_synthesis/hunyuanvideo.py
Normal file
16
examples/video_synthesis/hunyuanvideo.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from diffsynth import ModelManager, HunyuanVideoPipeline, download_models
import torch


# Download models (automatically)
download_models(["HunyuanVideo"])

# Load models.
# Fixed: the paths previously used a "t2i_models/" prefix, but the
# "HunyuanVideo" preset downloads into "models/HunyuanVideo/..." (see the
# preset's file_list/load_path), so loading would fail with missing files.
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
model_manager.load_models([
    "models/HunyuanVideo/text_encoder/model.safetensors",
    "models/HunyuanVideo/text_encoder_2",
])
pipe = HunyuanVideoPipeline.from_model_manager(model_manager)
prompt = 'A cat walks on the grass, realistic style.'
pipe(prompt)
|
||||
Reference in New Issue
Block a user