Mirror of https://github.com/modelscope/DiffSynth-Studio.git (synced 2026-03-20 23:58:12 +00:00)
diffsynth 2.0 prototype
@@ -1,275 +0,0 @@
import torch, warnings, glob, os
import numpy as np
from PIL import Image
from einops import repeat, reduce
from typing import Optional, Union
from dataclasses import dataclass
from modelscope import snapshot_download

class BasePipeline(torch.nn.Module):

    def __init__(
        self,
        device="cuda", torch_dtype=torch.float16,
        height_division_factor=64, width_division_factor=64,
        time_division_factor=None, time_division_remainder=None,
    ):
        super().__init__()
        # The device and torch_dtype are used for the storage of intermediate variables, not models.
        self.device = device
        self.torch_dtype = torch_dtype
        # The following parameters are used for shape checks.
        self.height_division_factor = height_division_factor
        self.width_division_factor = width_division_factor
        self.time_division_factor = time_division_factor
        self.time_division_remainder = time_division_remainder
        self.vram_management_enabled = False

    def to(self, *args, **kwargs):
        # Keep self.device and self.torch_dtype in sync with the modules.
        device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
        if device is not None:
            self.device = device
        if dtype is not None:
            self.torch_dtype = dtype
        super().to(*args, **kwargs)
        return self
    def check_resize_height_width(self, height, width, num_frames=None):
        # Shape check
        if height % self.height_division_factor != 0:
            height = (height + self.height_division_factor - 1) // self.height_division_factor * self.height_division_factor
            print(f"height % {self.height_division_factor} != 0. We round it up to {height}.")
        if width % self.width_division_factor != 0:
            width = (width + self.width_division_factor - 1) // self.width_division_factor * self.width_division_factor
            print(f"width % {self.width_division_factor} != 0. We round it up to {width}.")
        if num_frames is None:
            return height, width
        else:
            if num_frames % self.time_division_factor != self.time_division_remainder:
                num_frames = (num_frames + self.time_division_factor - 1) // self.time_division_factor * self.time_division_factor + self.time_division_remainder
                print(f"num_frames % {self.time_division_factor} != {self.time_division_remainder}. We round it up to {num_frames}.")
            return height, width, num_frames
    def preprocess_image(self, image, torch_dtype=None, device=None, pattern="B C H W", min_value=-1, max_value=1):
        # Transform a PIL.Image into a torch.Tensor scaled to [min_value, max_value].
        image = torch.Tensor(np.array(image, dtype=np.float32))
        image = image.to(dtype=torch_dtype or self.torch_dtype, device=device or self.device)
        image = image * ((max_value - min_value) / 255) + min_value
        image = repeat(image, f"H W C -> {pattern}", **({"B": 1} if "B" in pattern else {}))
        return image

    def preprocess_video(self, video, torch_dtype=None, device=None, pattern="B C T H W", min_value=-1, max_value=1):
        # Transform a list of PIL.Image into a torch.Tensor by stacking along the T axis.
        video = [self.preprocess_image(image, torch_dtype=torch_dtype, device=device, min_value=min_value, max_value=max_value) for image in video]
        video = torch.stack(video, dim=pattern.index("T") // 2)
        return video
    def vae_output_to_image(self, vae_output, pattern="B C H W", min_value=-1, max_value=1):
        # Transform a torch.Tensor back into a PIL.Image
        if pattern != "H W C":
            vae_output = reduce(vae_output, f"{pattern} -> H W C", reduction="mean")
        image = ((vae_output - min_value) * (255 / (max_value - min_value))).clip(0, 255)
        image = image.to(device="cpu", dtype=torch.uint8)
        image = Image.fromarray(image.numpy())
        return image

    def vae_output_to_video(self, vae_output, pattern="B C T H W", min_value=-1, max_value=1):
        # Transform a torch.Tensor back into a list of PIL.Image
        if pattern != "T H W C":
            vae_output = reduce(vae_output, f"{pattern} -> T H W C", reduction="mean")
        video = [self.vae_output_to_image(image, pattern="H W C", min_value=min_value, max_value=max_value) for image in vae_output]
        return video
    def load_models_to_device(self, model_names=()):
        if self.vram_management_enabled:
            # Offload models that are not needed for the next step
            for name, model in self.named_children():
                if name not in model_names:
                    if hasattr(model, "vram_management_enabled") and model.vram_management_enabled:
                        for module in model.modules():
                            if hasattr(module, "offload"):
                                module.offload()
                    else:
                        model.cpu()
            torch.cuda.empty_cache()
            # Onload the requested models
            for name, model in self.named_children():
                if name in model_names:
                    if hasattr(model, "vram_management_enabled") and model.vram_management_enabled:
                        for module in model.modules():
                            if hasattr(module, "onload"):
                                module.onload()
                    else:
                        model.to(self.device)
    def generate_noise(self, shape, seed=None, rand_device="cpu", rand_torch_dtype=torch.float32, device=None, torch_dtype=None):
        # Initialize Gaussian noise, optionally with a fixed seed for reproducibility
        generator = None if seed is None else torch.Generator(rand_device).manual_seed(seed)
        noise = torch.randn(shape, generator=generator, device=rand_device, dtype=rand_torch_dtype)
        noise = noise.to(dtype=torch_dtype or self.torch_dtype, device=device or self.device)
        return noise
    def enable_cpu_offload(self):
        warnings.warn("`enable_cpu_offload` will be deprecated. Please use `enable_vram_management`.")
        self.vram_management_enabled = True

    def get_vram(self):
        # Total VRAM of the current device, in GB
        return torch.cuda.mem_get_info(self.device)[1] / (1024 ** 3)

    def freeze_except(self, model_names):
        for name, model in self.named_children():
            if name in model_names:
                model.train()
                model.requires_grad_(True)
            else:
                model.eval()
                model.requires_grad_(False)

    def blend_with_mask(self, base, addition, mask):
        return base * (1 - mask) + addition * mask
    def step(self, scheduler, latents, progress_id, noise_pred, input_latents=None, inpaint_mask=None, **kwargs):
        timestep = scheduler.timesteps[progress_id]
        if inpaint_mask is not None:
            # Outside the inpaint mask (mask == 0), steer the trajectory back toward input_latents.
            noise_pred_expected = scheduler.return_to_timestep(scheduler.timesteps[progress_id], latents, input_latents)
            noise_pred = self.blend_with_mask(noise_pred_expected, noise_pred, inpaint_mask)
        latents_next = scheduler.step(noise_pred, timestep, latents)
        return latents_next
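
# Illustrative usage sketch (hypothetical; not part of the original file), showing
# how the helpers above compose on CPU:
#
#   pipe = BasePipeline(device="cpu", torch_dtype=torch.float32)
#   h, w = pipe.check_resize_height_width(516, 518)      # both rounded up to 576
#   x = pipe.preprocess_image(Image.new("RGB", (w, h)))  # (1, 3, 576, 576), values in [-1, 1]
#   noise = pipe.generate_noise(x.shape, seed=0)         # reproducible Gaussian noise
#   out = pipe.vae_output_to_image(x)                    # back to a PIL.Image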

@dataclass
class ModelConfig:
    path: Optional[Union[str, list[str]]] = None
    model_id: Optional[str] = None
    origin_file_pattern: Optional[Union[str, list[str]]] = None
    download_resource: str = "ModelScope"
    offload_device: Optional[Union[str, torch.device]] = None
    offload_dtype: Optional[torch.dtype] = None
    local_model_path: Optional[str] = None
    skip_download: bool = False

    def download_if_necessary(self, use_usp=False):
        if self.path is None:
            # Check model_id and origin_file_pattern
            if self.model_id is None:
                raise ValueError("""No valid model files. Please use `ModelConfig(path="xxx")` or `ModelConfig(model_id="xxx/yyy", origin_file_pattern="zzz")`.""")

            # Skip the download on every rank except rank 0
            if use_usp:
                import torch.distributed as dist
                skip_download = self.skip_download or dist.get_rank() != 0
            else:
                skip_download = self.skip_download

            # Check whether the origin path is a folder
            if self.origin_file_pattern is None or self.origin_file_pattern == "":
                self.origin_file_pattern = ""
                allow_file_pattern = None
                is_folder = True
            elif isinstance(self.origin_file_pattern, str) and self.origin_file_pattern.endswith("/"):
                allow_file_pattern = self.origin_file_pattern + "*"
                is_folder = True
            else:
                allow_file_pattern = self.origin_file_pattern
                is_folder = False

            # Download
            if self.local_model_path is None:
                self.local_model_path = "./models"
            if not skip_download:
                downloaded_files = glob.glob(self.origin_file_pattern, root_dir=os.path.join(self.local_model_path, self.model_id))
                snapshot_download(
                    self.model_id,
                    local_dir=os.path.join(self.local_model_path, self.model_id),
                    allow_file_pattern=allow_file_pattern,
                    ignore_file_pattern=downloaded_files,
                    local_files_only=False
                )

            # Let rank 1, 2, ... wait for rank 0
            if use_usp:
                import torch.distributed as dist
                dist.barrier(device_ids=[dist.get_rank()])

            # Resolve the downloaded file path(s)
            if is_folder:
                self.path = os.path.join(self.local_model_path, self.model_id, self.origin_file_pattern)
            else:
                self.path = glob.glob(os.path.join(self.local_model_path, self.model_id, self.origin_file_pattern))
            if isinstance(self.path, list) and len(self.path) == 1:
                self.path = self.path[0]
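
# Illustrative usage sketch (hypothetical model IDs and patterns; not part of the original file):
#
#   config = ModelConfig(path="models/dit.safetensors")  # a local path is used as-is
#   config.download_if_necessary()                       # no-op, since path is already set
#
#   config = ModelConfig(model_id="org/model", origin_file_pattern="*.safetensors")
#   config.download_if_necessary()  # downloads via ModelScope into ./models/org/model,
#                                   # then resolves config.path to the matching file(s)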

class PipelineUnit:
    def __init__(
        self,
        seperate_cfg: bool = False,
        take_over: bool = False,
        input_params: Optional[tuple[str, ...]] = None,
        input_params_posi: Optional[dict[str, str]] = None,
        input_params_nega: Optional[dict[str, str]] = None,
        onload_model_names: Optional[tuple[str, ...]] = None
    ):
        self.seperate_cfg = seperate_cfg
        self.take_over = take_over
        self.input_params = input_params
        self.input_params_posi = input_params_posi
        self.input_params_nega = input_params_nega
        self.onload_model_names = onload_model_names

    def process(self, pipe: BasePipeline, inputs: dict, positive=True, **kwargs) -> dict:
        raise NotImplementedError("`process` is not implemented.")

class PipelineUnitRunner:
    def __init__(self):
        pass

    def __call__(self, unit: PipelineUnit, pipe: BasePipeline, inputs_shared: dict, inputs_posi: dict, inputs_nega: dict) -> tuple[dict, dict, dict]:
        if unit.take_over:
            # Let the pipeline unit take over this function.
            inputs_shared, inputs_posi, inputs_nega = unit.process(pipe, inputs_shared=inputs_shared, inputs_posi=inputs_posi, inputs_nega=inputs_nega)
        elif unit.seperate_cfg:
            # Positive side
            processor_inputs = {name: inputs_posi.get(name_) for name, name_ in unit.input_params_posi.items()}
            if unit.input_params is not None:
                for name in unit.input_params:
                    processor_inputs[name] = inputs_shared.get(name)
            processor_outputs = unit.process(pipe, **processor_inputs)
            inputs_posi.update(processor_outputs)
            # Negative side
            if inputs_shared["cfg_scale"] != 1:
                processor_inputs = {name: inputs_nega.get(name_) for name, name_ in unit.input_params_nega.items()}
                if unit.input_params is not None:
                    for name in unit.input_params:
                        processor_inputs[name] = inputs_shared.get(name)
                processor_outputs = unit.process(pipe, **processor_inputs)
                inputs_nega.update(processor_outputs)
            else:
                # When CFG is disabled (cfg_scale == 1), reuse the positive outputs.
                inputs_nega.update(processor_outputs)
        else:
            processor_inputs = {name: inputs_shared.get(name) for name in unit.input_params}
            processor_outputs = unit.process(pipe, **processor_inputs)
            inputs_shared.update(processor_outputs)
        return inputs_shared, inputs_posi, inputs_nega
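
A rough usage sketch, assuming only the classes above (the unit below is hypothetical, not part of the commit): a PipelineUnit declares the inputs it consumes, and PipelineUnitRunner routes shared, positive, and negative inputs through it, skipping the negative pass when cfg_scale is 1.

import torch

class ShiftLatentsUnit(PipelineUnit):
    # Hypothetical unit: reads "latents" from the shared inputs, returns an updated copy.
    def __init__(self):
        super().__init__(input_params=("latents",))

    def process(self, pipe, latents):
        return {"latents": latents + 1}

runner = PipelineUnitRunner()
shared = {"latents": torch.zeros(3), "cfg_scale": 1}
shared, posi, nega = runner(ShiftLatentsUnit(), pipe=None, inputs_shared=shared, inputs_posi={}, inputs_nega={})
print(shared["latents"])  # tensor([1., 1., 1.])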

diffsynth/utils/controlnet/__init__.py (new file, 2 lines)
@@ -0,0 +1,2 @@
from .controlnet_input import ControlNetInput
from .annotator import Annotator

diffsynth/utils/controlnet/annotator.py (new file, 62 lines)
@@ -0,0 +1,62 @@
from typing_extensions import Literal, TypeAlias


Processor_id: TypeAlias = Literal[
    "canny", "depth", "softedge", "lineart", "lineart_anime", "openpose", "normal", "tile", "none", "inpaint"
]

class Annotator:
    def __init__(self, processor_id: Processor_id, model_path="models/Annotators", detect_resolution=None, device="cuda", skip_processor=False):
        if not skip_processor:
            if processor_id == "canny":
                from controlnet_aux.processor import CannyDetector
                self.processor = CannyDetector()
            elif processor_id == "depth":
                from controlnet_aux.processor import MidasDetector
                self.processor = MidasDetector.from_pretrained(model_path).to(device)
            elif processor_id == "softedge":
                from controlnet_aux.processor import HEDdetector
                self.processor = HEDdetector.from_pretrained(model_path).to(device)
            elif processor_id == "lineart":
                from controlnet_aux.processor import LineartDetector
                self.processor = LineartDetector.from_pretrained(model_path).to(device)
            elif processor_id == "lineart_anime":
                from controlnet_aux.processor import LineartAnimeDetector
                self.processor = LineartAnimeDetector.from_pretrained(model_path).to(device)
            elif processor_id == "openpose":
                from controlnet_aux.processor import OpenposeDetector
                self.processor = OpenposeDetector.from_pretrained(model_path).to(device)
            elif processor_id == "normal":
                from controlnet_aux.processor import NormalBaeDetector
                self.processor = NormalBaeDetector.from_pretrained(model_path).to(device)
            elif processor_id in ("tile", "none", "inpaint"):
                self.processor = None
            else:
                raise ValueError(f"Unsupported processor_id: {processor_id}")
        else:
            self.processor = None

        self.processor_id = processor_id
        self.detect_resolution = detect_resolution

    def to(self, device):
        if hasattr(self.processor, "model") and hasattr(self.processor.model, "to"):
            self.processor.model.to(device)

    def __call__(self, image, mask=None):
        width, height = image.size
        if self.processor_id == "openpose":
            kwargs = {
                "include_body": True,
                "include_hand": True,
                "include_face": True
            }
        else:
            kwargs = {}
        if self.processor is not None:
            detect_resolution = self.detect_resolution if self.detect_resolution is not None else min(width, height)
            image = self.processor(image, detect_resolution=detect_resolution, image_resolution=min(width, height), **kwargs)
        image = image.resize((width, height))
        return image
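
A minimal calling sketch (assumes controlnet_aux is installed; "canny" needs no pretrained weights, so it works without a models/Annotators download):

from PIL import Image

annotator = Annotator(processor_id="canny")
control_image = annotator(Image.new("RGB", (768, 512)))
# The result is resized back to the input size, here 768x512.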

diffsynth/utils/controlnet/controlnet_input.py (new file, 13 lines)
@@ -0,0 +1,13 @@
from dataclasses import dataclass
from typing import Optional
from PIL import Image


@dataclass
class ControlNetInput:
    controlnet_id: int = 0
    scale: float = 1.0
    start: float = 0.0
    end: float = 1.0
    image: Optional[Image.Image] = None
    inpaint_mask: Optional[Image.Image] = None
    processor_id: Optional[str] = None
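
A construction sketch (field semantics inferred from the names, not stated in the commit: scale presumably weights the ControlNet residual, and start/end bound the fraction of sampling steps where the control is active):

from PIL import Image

control = ControlNetInput(
    scale=0.8,
    image=Image.new("RGB", (1024, 1024)),
    processor_id="canny",
)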

diffsynth/utils/lora/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
from .general import GeneralLoRALoader

diffsynth/utils/lora/general.py (new file, 62 lines)
@@ -0,0 +1,62 @@
import torch


class GeneralLoRALoader:
    def __init__(self, device="cpu", torch_dtype=torch.float32):
        self.device = device
        self.torch_dtype = torch_dtype

    def get_name_dict(self, lora_state_dict):
        # Map each target module name to its (lora_B, lora_A) keys, supporting both
        # the "lora_up"/"lora_down" and the "lora_B"/"lora_A" naming conventions.
        lora_name_dict = {}
        for key in lora_state_dict:
            if ".lora_up." in key:
                lora_A_key = "lora_down"
                lora_B_key = "lora_up"
            else:
                lora_A_key = "lora_A"
                lora_B_key = "lora_B"
            if lora_B_key not in key:
                continue
            keys = key.split(".")
            if len(keys) > keys.index(lora_B_key) + 2:
                keys.pop(keys.index(lora_B_key) + 1)
            keys.pop(keys.index(lora_B_key))
            if keys[0] == "diffusion_model":
                keys.pop(0)
            keys.pop(-1)
            target_name = ".".join(keys)
            lora_name_dict[target_name] = (key, key.replace(lora_B_key, lora_A_key))
        return lora_name_dict

    def convert_state_dict(self, state_dict, suffix=".weight"):
        # Normalize the state dict to the "lora_A"/"lora_B" naming convention.
        name_dict = self.get_name_dict(state_dict)
        state_dict_ = {}
        for name in name_dict:
            weight_up = state_dict[name_dict[name][0]]
            weight_down = state_dict[name_dict[name][1]]
            state_dict_[name + f".lora_B{suffix}"] = weight_up
            state_dict_[name + f".lora_A{suffix}"] = weight_down
        return state_dict_

    def fuse_lora_to_base_model(self, model: torch.nn.Module, state_dict, alpha=1.0):
        updated_num = 0
        state_dict = self.convert_state_dict(state_dict)
        lora_layer_names = set([i.replace(".lora_B.weight", "") for i in state_dict if i.endswith(".lora_B.weight")])
        for name, module in model.named_modules():
            if name in lora_layer_names:
                weight_up = state_dict[name + ".lora_B.weight"].to(device=self.device, dtype=self.torch_dtype)
                weight_down = state_dict[name + ".lora_A.weight"].to(device=self.device, dtype=self.torch_dtype)
                if len(weight_up.shape) == 4:
                    # 1x1 convolution weights: drop the spatial dims, multiply, then restore them.
                    weight_up = weight_up.squeeze(3).squeeze(2)
                    weight_down = weight_down.squeeze(3).squeeze(2)
                    weight_lora = alpha * torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3)
                else:
                    weight_lora = alpha * torch.mm(weight_up, weight_down)
                state_dict_base = module.state_dict()
                state_dict_base["weight"] = state_dict_base["weight"].to(device=self.device, dtype=self.torch_dtype) + weight_lora
                module.load_state_dict(state_dict_base)
                updated_num += 1
        print(f"{updated_num} tensors are fused by LoRA. Fused LoRA layers cannot be cleared by `pipe.clear_lora()`.")

diffsynth/utils/state_dict_converters/__init__.py (new file, empty)

@@ -0,0 +1,10 @@
def QwenImageTextEncoderStateDictConverter(state_dict):
    # Remap text-encoder checkpoint keys: "visual.*" gains a "model." prefix,
    # and the "model.*" prefix becomes "model.language_model.*".
    state_dict_ = {}
    for k in state_dict:
        v = state_dict[k]
        if k.startswith("visual."):
            k = "model." + k
        elif k.startswith("model."):
            k = k.replace("model.", "model.language_model.", 1)
        state_dict_[k] = v
    return state_dict_
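
A quick sketch of the remapping with synthetic keys:

sd = {
    "visual.blocks.0.attn.qkv.weight": 0,
    "model.layers.0.self_attn.q_proj.weight": 1,
}
print(QwenImageTextEncoderStateDictConverter(sd))
# {'model.visual.blocks.0.attn.qkv.weight': 0,
#  'model.language_model.layers.0.self_attn.q_proj.weight': 1}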