support CogVideoX-5B (#184)

* support cogvideo * update examples
2026-03-18 22:08:13 +00:00 · 2024-09-03 11:37:54 +08:00
parent fe485b3fa1
commit d154bee18a
22 changed files with 2653 additions and 107 deletions
--- a/diffsynth/models/cog_dit.py
+++ b/diffsynth/models/cog_dit.py
@@ -0,0 +1,395 @@
+import torch
+from einops import rearrange, repeat
+from .sd3_dit import TimestepEmbeddings
+from .attention import Attention
+from .utils import load_state_dict_from_folder
+from .tiler import TileWorker2Dto3D
+import numpy as np
+
+
+
+class CogPatchify(torch.nn.Module):
+    """Embed a video latent as a flat sequence of patch tokens.
+
+    A 3D convolution whose kernel and stride are both
+    ``(1, patch_size, patch_size)`` projects every non-overlapping spatial
+    patch of every frame to ``dim_out`` channels. The frame axis is neither
+    mixed nor shortened by this layer.
+    """
+
+    def __init__(self, dim_in, dim_out, patch_size) -> None:
+        super().__init__()
+        window = (1, patch_size, patch_size)
+        self.proj = torch.nn.Conv3d(dim_in, dim_out, kernel_size=window, stride=window)
+
+    def forward(self, hidden_states):
+        """Map ``(B, C, T, H, W)`` to tokens of shape ``(B, T*H*W, C)``."""
+        patches = self.proj(hidden_states)
+        return rearrange(patches, "B C T H W -> B (T H W) C")
+    
+
+
+class CogAdaLayerNorm(torch.nn.Module):
+    """LayerNorm whose shift/scale (and gates) are predicted from a condition.
+
+    With ``single=True`` the linear layer predicts one ``(shift, scale)`` pair
+    and only ``hidden_states`` is modulated. Otherwise it predicts six chunks:
+    ``(shift, scale, gate)`` for ``hidden_states`` followed by
+    ``(shift, scale, gate)`` for ``prompt_emb``.
+    """
+
+    def __init__(self, dim, dim_cond, single=False):
+        super().__init__()
+        self.single = single
+        num_chunks = 2 if single else 6
+        self.linear = torch.nn.Linear(dim_cond, dim * num_chunks)
+        self.norm = torch.nn.LayerNorm(dim, elementwise_affine=True, eps=1e-5)
+
+    def forward(self, hidden_states, prompt_emb, emb):
+        """Apply the conditioned normalisation.
+
+        Returns the modulated ``hidden_states`` when ``single`` is set, else
+        the tuple ``(hidden_states, prompt_emb, gate_a, gate_b)``.
+        ``prompt_emb`` is ignored in single mode.
+        """
+        # unsqueeze(1) inserts a length-1 axis so the modulation broadcasts
+        # over dimension 1 of the inputs.
+        modulation = self.linear(torch.nn.functional.silu(emb)).unsqueeze(1)
+        if not self.single:
+            shift_a, scale_a, gate_a, shift_b, scale_b, gate_b = modulation.chunk(6, dim=2)
+            hidden_states = self.norm(hidden_states) * (1 + scale_a) + shift_a
+            prompt_emb = self.norm(prompt_emb) * (1 + scale_b) + shift_b
+            return hidden_states, prompt_emb, gate_a, gate_b
+        shift, scale = modulation.chunk(2, dim=2)
+        return self.norm(hidden_states) * (1 + scale) + shift
+
+
+
+class CogDiTBlock(torch.nn.Module):
+    """One joint text/video transformer block of the CogVideoX DiT.
+
+    Text tokens and video tokens are concatenated (text first) and pass
+    through a shared self-attention and a shared feed-forward network. Each
+    half gets its own adaptive-LayerNorm modulation and residual gate, both
+    predicted from the time embedding.
+    """
+
+    def __init__(self, dim, dim_cond, num_heads):
+        super().__init__()
+        head_dim = dim // num_heads
+        self.norm1 = CogAdaLayerNorm(dim, dim_cond)
+        # Honour the ``num_heads`` argument. It used to be hard-coded to 48
+        # here while ``head_dim`` was derived from the argument, so any other
+        # value produced an inconsistent attention layer.
+        self.attn1 = Attention(q_dim=dim, num_heads=num_heads, head_dim=head_dim, bias_q=True, bias_kv=True, bias_out=True)
+        self.norm_q = torch.nn.LayerNorm((head_dim,), eps=1e-06, elementwise_affine=True)
+        self.norm_k = torch.nn.LayerNorm((head_dim,), eps=1e-06, elementwise_affine=True)
+
+        self.norm2 = CogAdaLayerNorm(dim, dim_cond)
+        self.ff = torch.nn.Sequential(
+            torch.nn.Linear(dim, dim*4),
+            torch.nn.GELU(approximate="tanh"),
+            torch.nn.Linear(dim*4, dim)
+        )
+
+    def apply_rotary_emb(self, x, freqs_cis):
+        """Rotate ``x`` by the rotary embedding given as a ``(cos, sin)`` pair.
+
+        ``cos``/``sin`` are 2-D and get two leading singleton axes so they
+        broadcast against ``x``. The product is computed in float32 and cast
+        back to the dtype of ``x``.
+        """
+        cos, sin = freqs_cis  # each [S, D]
+        cos = cos[None, None]
+        sin = sin[None, None]
+        cos, sin = cos.to(x.device), sin.to(x.device)
+        # Adjacent channel pairs are treated as (real, imag) components.
+        # NOTE(review): x is presumably (B, heads, S, D), since process_qkv
+        # slices the sequence on dim 2 -- confirm against Attention.
+        x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
+        x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
+        out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
+        return out
+
+    def process_qkv(self, q, k, v, image_rotary_emb, text_seq_length):
+        """Normalise q/k per head and apply rotary embedding to video tokens.
+
+        The first ``text_seq_length`` positions along dim 2 (the text tokens)
+        are left without positional rotation. ``v`` is returned untouched.
+        """
+        q = self.norm_q(q)
+        k = self.norm_k(k)
+        # Written in place into the freshly normalised tensors.
+        q[:, :, text_seq_length:] = self.apply_rotary_emb(q[:, :, text_seq_length:], image_rotary_emb)
+        k[:, :, text_seq_length:] = self.apply_rotary_emb(k[:, :, text_seq_length:], image_rotary_emb)
+        return q, k, v
+
+    def forward(self, hidden_states, prompt_emb, time_emb, image_rotary_emb):
+        """Run attention and feed-forward; return ``(hidden_states, prompt_emb)``."""
+        text_len = prompt_emb.shape[1]
+
+        # Attention over the concatenated [text, video] sequence.
+        norm_hidden_states, norm_encoder_hidden_states, gate_a, gate_b = self.norm1(
+            hidden_states, prompt_emb, time_emb
+        )
+        attention_io = torch.cat([norm_encoder_hidden_states, norm_hidden_states], dim=1)
+        attention_io = self.attn1(
+            attention_io,
+            qkv_preprocessor=lambda q, k, v: self.process_qkv(q, k, v, image_rotary_emb, text_len)
+        )
+
+        # Split the joint output back into its halves and add gated residuals.
+        hidden_states = hidden_states + gate_a * attention_io[:, text_len:]
+        prompt_emb = prompt_emb + gate_b * attention_io[:, :text_len]
+
+        # Feed forward, again on the concatenated sequence.
+        norm_hidden_states, norm_encoder_hidden_states, gate_a, gate_b = self.norm2(
+            hidden_states, prompt_emb, time_emb
+        )
+        ff_io = torch.cat([norm_encoder_hidden_states, norm_hidden_states], dim=1)
+        ff_io = self.ff(ff_io)
+
+        hidden_states = hidden_states + gate_a * ff_io[:, text_len:]
+        prompt_emb = prompt_emb + gate_b * ff_io[:, :text_len]
+
+        return hidden_states, prompt_emb
+
+
+
+class CogDiT(torch.nn.Module):
+    """CogVideoX-5B diffusion transformer with a fixed architecture.
+
+    Layout built in ``__init__``: 16 latent channels, 2x2 spatial patches,
+    width 3072, 42 blocks with 48 heads, 4096-d text embeddings and a 512-d
+    time embedding. ``proj_out`` emits 64 values per token, which
+    ``unpatchify`` folds back into 16 channels x (2 x 2) pixels.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.patchify = CogPatchify(16, 3072, 2)
+        self.time_embedder = TimestepEmbeddings(3072, 512)
+        self.context_embedder = torch.nn.Linear(4096, 3072)
+        self.blocks = torch.nn.ModuleList([CogDiTBlock(3072, 512, 48) for _ in range(42)])
+        self.norm_final = torch.nn.LayerNorm((3072,), eps=1e-05, elementwise_affine=True)
+        self.norm_out = CogAdaLayerNorm(3072, 512, single=True)
+        self.proj_out = torch.nn.Linear(3072, 64, bias=True)
+
+    def get_resize_crop_region_for_grid(self, src, tgt_width, tgt_height):
+        """Fit a ``(h, w)`` grid into the target box, centred, keeping aspect.
+
+        Returns ``((top, left), (bottom, right))`` of the fitted region in
+        target-grid coordinates.
+        """
+        tw = tgt_width
+        th = tgt_height
+        h, w = src
+        r = h / w
+        if r > (th / tw):
+            # Source is relatively taller than the target: height is the limit.
+            resize_height = th
+            resize_width = int(round(th / h * w))
+        else:
+            resize_width = tw
+            resize_height = int(round(tw / w * h))
+
+        crop_top = int(round((th - resize_height) / 2.0))
+        crop_left = int(round((tw - resize_width) / 2.0))
+
+        return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
+
+    def get_3d_rotary_pos_embed(
+        self, embed_dim, crops_coords, grid_size, temporal_size, theta: int = 10000, use_real: bool = True
+    ):
+        """Build rotary position embeddings over a (time, height, width) grid.
+
+        ``embed_dim`` is split into ``embed_dim // 4`` channels for time and
+        ``embed_dim // 8 * 3`` each for height and width. Returns
+        ``(cos, sin)``, each of shape ``(T*H*W, channels)``, when ``use_real``
+        is true; otherwise a complex tensor from ``torch.polar``.
+        """
+        start, stop = crops_coords
+        grid_h = np.linspace(start[0], stop[0], grid_size[0], endpoint=False, dtype=np.float32)
+        grid_w = np.linspace(start[1], stop[1], grid_size[1], endpoint=False, dtype=np.float32)
+        grid_t = np.linspace(0, temporal_size, temporal_size, endpoint=False, dtype=np.float32)
+
+        # Compute dimensions for each axis
+        dim_t = embed_dim // 4
+        dim_h = embed_dim // 8 * 3
+        dim_w = embed_dim // 8 * 3
+
+        # Temporal frequencies
+        freqs_t = 1.0 / (theta ** (torch.arange(0, dim_t, 2).float() / dim_t))
+        grid_t = torch.from_numpy(grid_t).float()
+        freqs_t = torch.einsum("n , f -> n f", grid_t, freqs_t)
+        # Each frequency is duplicated so it covers one (real, imag) pair.
+        freqs_t = freqs_t.repeat_interleave(2, dim=-1)
+
+        # Spatial frequencies for height and width
+        freqs_h = 1.0 / (theta ** (torch.arange(0, dim_h, 2).float() / dim_h))
+        freqs_w = 1.0 / (theta ** (torch.arange(0, dim_w, 2).float() / dim_w))
+        grid_h = torch.from_numpy(grid_h).float()
+        grid_w = torch.from_numpy(grid_w).float()
+        freqs_h = torch.einsum("n , f -> n f", grid_h, freqs_h)
+        freqs_w = torch.einsum("n , f -> n f", grid_w, freqs_w)
+        freqs_h = freqs_h.repeat_interleave(2, dim=-1)
+        freqs_w = freqs_w.repeat_interleave(2, dim=-1)
+
+        # Broadcast and concatenate tensors along specified dimension
+        def broadcast(tensors, dim=-1):
+            num_tensors = len(tensors)
+            shape_lens = {len(t.shape) for t in tensors}
+            assert len(shape_lens) == 1, "tensors must all have the same number of dimensions"
+            shape_len = list(shape_lens)[0]
+            dim = (dim + shape_len) if dim < 0 else dim
+            dims = list(zip(*(list(t.shape) for t in tensors)))
+            expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim]
+            assert all(
+                [*(len(set(t[1])) <= 2 for t in expandable_dims)]
+            ), "invalid dimensions for broadcastable concatenation"
+            max_dims = [(t[0], max(t[1])) for t in expandable_dims]
+            expanded_dims = [(t[0], (t[1],) * num_tensors) for t in max_dims]
+            expanded_dims.insert(dim, (dim, dims[dim]))
+            expandable_shapes = list(zip(*(t[1] for t in expanded_dims)))
+            tensors = [t[0].expand(*t[1]) for t in zip(tensors, expandable_shapes)]
+            return torch.cat(tensors, dim=dim)
+
+        freqs = broadcast((freqs_t[:, None, None, :], freqs_h[None, :, None, :], freqs_w[None, None, :, :]), dim=-1)
+
+        # Flatten the grid in (t, h, w) order.
+        t, h, w, d = freqs.shape
+        freqs = freqs.view(t * h * w, d)
+
+        # Generate sine and cosine components
+        sin = freqs.sin()
+        cos = freqs.cos()
+
+        if use_real:
+            return cos, sin
+        else:
+            freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
+            return freqs_cis
+
+    def prepare_rotary_positional_embeddings(
+        self,
+        height: int,
+        width: int,
+        num_frames: int,
+        device: torch.device,
+    ):
+        """Return ``(cos, sin)`` rotary tables for a latent of the given size.
+
+        ``height``/``width`` are latent sizes; they are halved to get the
+        patch grid. Positions are expressed relative to a 720x480-pixel base
+        grid (divided by 8 * 2), and 64 rotary channels are produced.
+        """
+        grid_height = height // 2
+        grid_width = width // 2
+        base_size_width = 720 // (8 * 2)
+        base_size_height = 480 // (8 * 2)
+
+        grid_crops_coords = self.get_resize_crop_region_for_grid(
+            (grid_height, grid_width), base_size_width, base_size_height
+        )
+        freqs_cos, freqs_sin = self.get_3d_rotary_pos_embed(
+            embed_dim=64,
+            crops_coords=grid_crops_coords,
+            grid_size=(grid_height, grid_width),
+            temporal_size=num_frames,
+            use_real=True,
+        )
+
+        freqs_cos = freqs_cos.to(device=device)
+        freqs_sin = freqs_sin.to(device=device)
+        return freqs_cos, freqs_sin
+
+    def unpatchify(self, hidden_states, height, width):
+        """Fold tokens ``(B, T*H*W, C*2*2)`` back into ``(B, C, T, height, width)``."""
+        hidden_states = rearrange(hidden_states, "B (T H W) (C P Q) -> B C T (H P) (W Q)", P=2, Q=2, H=height//2, W=width//2)
+        return hidden_states
+
+    def build_mask(self, T, H, W, dtype, device, is_bound):
+        """Build a ``(1, 1, T, H, W)`` blending weight for one tile.
+
+        The weight ramps linearly from ``1 / border_width`` at a tile face up
+        to 1 over ``border_width = (H + W) // 4`` cells. ``is_bound`` holds
+        six flags (t-start, t-end, h-start, h-end, w-start, w-end); a face
+        whose flag is true gets no ramp.
+        """
+        t = repeat(torch.arange(T), "T -> T H W", T=T, H=H, W=W)
+        h = repeat(torch.arange(H), "H -> T H W", T=T, H=H, W=W)
+        w = repeat(torch.arange(W), "W -> T H W", T=T, H=H, W=W)
+        border_width = (H + W) // 4
+        pad = torch.ones_like(h) * border_width
+        # Distance (in cells, 1-based) to the nearest non-boundary face.
+        mask = torch.stack([
+            pad if is_bound[0] else t + 1,
+            pad if is_bound[1] else T - t,
+            pad if is_bound[2] else h + 1,
+            pad if is_bound[3] else H - h,
+            pad if is_bound[4] else w + 1,
+            pad if is_bound[5] else W - w
+        ]).min(dim=0).values
+        mask = mask.clip(1, border_width)
+        mask = (mask / border_width).to(dtype=dtype, device=device)
+        mask = rearrange(mask, "T H W -> 1 1 T H W")
+        return mask
+
+    def tiled_forward(self, hidden_states, timestep, prompt_emb, tile_size=(60, 90), tile_stride=(30, 45)):
+        """Run ``forward`` on overlapping spatial tiles and blend the results.
+
+        ``tile_size`` and ``tile_stride`` may each be a single int (applied to
+        both axes) or a ``(height, width)`` pair. Tiles are blended with the
+        weights from ``build_mask``; the frame axis is never split.
+        """
+        B, C, T, H, W = hidden_states.shape
+        # The defaults are pairs, but the tiling arithmetic used to treat them
+        # as scalars and raised TypeError. Normalise both forms to per-axis
+        # values up front.
+        size_h, size_w = (tile_size, tile_size) if isinstance(tile_size, int) else tile_size
+        stride_h, stride_w = (tile_stride, tile_stride) if isinstance(tile_stride, int) else tile_stride
+
+        value = torch.zeros((B, C, T, H, W), dtype=hidden_states.dtype, device=hidden_states.device)
+        weight = torch.zeros((B, C, T, H, W), dtype=hidden_states.dtype, device=hidden_states.device)
+
+        # Split tasks
+        tasks = []
+        for h in range(0, H, stride_h):
+            for w in range(0, W, stride_w):
+                # Skip a tile when the previous one on that axis already
+                # reached the far edge.
+                if (h-stride_h >= 0 and h-stride_h+size_h >= H) or (w-stride_w >= 0 and w-stride_w+size_w >= W):
+                    continue
+                hl, hr = h, h + size_h
+                wl, wr = w, w + size_w
+                # Shift tiles that overrun the edge back inside the frame.
+                if hr > H: hl, hr = max(H - size_h, 0), H
+                if wr > W: wl, wr = max(W - size_w, 0), W
+                tasks.append((hl, hr, wl, wr))
+
+        # Run
+        for hl, hr, wl, wr in tasks:
+            mask = self.build_mask(
+                value.shape[2], (hr-hl), (wr-wl),
+                hidden_states.dtype, hidden_states.device,
+                is_bound=(True, True, hl==0, hr>=H, wl==0, wr>=W)
+            )
+            model_output = self.forward(hidden_states[:, :, :, hl:hr, wl:wr], timestep, prompt_emb)
+            value[:, :, :, hl:hr, wl:wr] += model_output * mask
+            weight[:, :, :, hl:hr, wl:wr] += mask
+        value = value / weight
+
+        return value
+
+    def forward(self, hidden_states, timestep, prompt_emb, image_rotary_emb=None, tiled=False, tile_size=90, tile_stride=30):
+        """Run the transformer on a latent video ``(B, 16, T, H, W)``.
+
+        With ``tiled=True`` the work is delegated to ``TileWorker2Dto3D``,
+        which calls this method again per tile (with ``tiled=False`` and
+        freshly computed rotary tables). If ``image_rotary_emb`` is ``None``
+        the tables are built for the input's own size.
+        """
+        if tiled:
+            # NOTE(review): the VAE code added in this same commit passes
+            # (h, w) pairs to TileWorker2Dto3D, while the defaults here are
+            # scalars -- confirm the worker accepts both.
+            return TileWorker2Dto3D().tiled_forward(
+                forward_fn=lambda x: self.forward(x, timestep, prompt_emb),
+                model_input=hidden_states,
+                tile_size=tile_size, tile_stride=tile_stride,
+                tile_device=hidden_states.device, tile_dtype=hidden_states.dtype,
+                computation_device=self.context_embedder.weight.device, computation_dtype=self.context_embedder.weight.dtype
+            )
+        num_frames, height, width = hidden_states.shape[-3:]
+        if image_rotary_emb is None:
+            image_rotary_emb = self.prepare_rotary_positional_embeddings(height, width, num_frames, device=self.context_embedder.weight.device)
+        hidden_states = self.patchify(hidden_states)
+        time_emb = self.time_embedder(timestep, dtype=hidden_states.dtype)
+        prompt_emb = self.context_embedder(prompt_emb)
+        for block in self.blocks:
+            hidden_states, prompt_emb = block(hidden_states, prompt_emb, time_emb, image_rotary_emb)
+
+        # The final LayerNorm runs on the joint [text, video] sequence; the
+        # text part is dropped again right after.
+        hidden_states = torch.cat([prompt_emb, hidden_states], dim=1)
+        hidden_states = self.norm_final(hidden_states)
+        hidden_states = hidden_states[:, prompt_emb.shape[1]:]
+        hidden_states = self.norm_out(hidden_states, prompt_emb, time_emb)
+        hidden_states = self.proj_out(hidden_states)
+        hidden_states = self.unpatchify(hidden_states, height, width)
+
+        return hidden_states
+
+    @staticmethod
+    def state_dict_converter():
+        """Return the converter that maps diffusers checkpoints to this model."""
+        return CogDiTStateDictConverter()
+
+    @staticmethod
+    def from_pretrained(file_path, torch_dtype=torch.bfloat16):
+        """Build the model and load diffusers-format weights from a folder."""
+        model = CogDiT().to(torch_dtype)
+        state_dict = load_state_dict_from_folder(file_path, torch_dtype=torch_dtype)
+        state_dict = CogDiT.state_dict_converter().from_diffusers(state_dict)
+        model.load_state_dict(state_dict)
+        return model
+
+
+
+class CogDiTStateDictConverter:
+    """Rename diffusers CogVideoX transformer weights to ``CogDiT`` names."""
+
+    def __init__(self):
+        pass
+
+    def from_diffusers(self, state_dict):
+        """Return a new dict keyed by ``CogDiT`` parameter names.
+
+        Top-level keys are looked up in ``rename_dict``. Keys that start with
+        ``transformer_blocks.<i>.`` are mapped through ``suffix_dict`` onto
+        ``blocks.<i>.``. Any other key is dropped. An unknown suffix inside a
+        transformer block raises ``KeyError``.
+        """
+        rename_dict = {
+            "patch_embed.proj.weight": "patchify.proj.weight",
+            "patch_embed.proj.bias": "patchify.proj.bias",
+            "patch_embed.text_proj.weight": "context_embedder.weight",
+            "patch_embed.text_proj.bias": "context_embedder.bias",
+            "time_embedding.linear_1.weight": "time_embedder.timestep_embedder.0.weight",
+            "time_embedding.linear_1.bias": "time_embedder.timestep_embedder.0.bias",
+            "time_embedding.linear_2.weight": "time_embedder.timestep_embedder.2.weight",
+            "time_embedding.linear_2.bias": "time_embedder.timestep_embedder.2.bias",
+
+            "norm_final.weight": "norm_final.weight",
+            "norm_final.bias": "norm_final.bias",
+            "norm_out.linear.weight": "norm_out.linear.weight",
+            "norm_out.linear.bias": "norm_out.linear.bias",
+            "norm_out.norm.weight": "norm_out.norm.weight",
+            "norm_out.norm.bias": "norm_out.norm.bias",
+            "proj_out.weight": "proj_out.weight",
+            "proj_out.bias": "proj_out.bias",
+        }
+        suffix_dict = {
+            "norm1.linear.weight": "norm1.linear.weight",
+            "norm1.linear.bias": "norm1.linear.bias",
+            "norm1.norm.weight": "norm1.norm.weight",
+            "norm1.norm.bias": "norm1.norm.bias",
+            "attn1.norm_q.weight": "norm_q.weight",
+            "attn1.norm_q.bias": "norm_q.bias",
+            "attn1.norm_k.weight": "norm_k.weight",
+            "attn1.norm_k.bias": "norm_k.bias",
+            "attn1.to_q.weight": "attn1.to_q.weight",
+            "attn1.to_q.bias": "attn1.to_q.bias",
+            "attn1.to_k.weight": "attn1.to_k.weight",
+            "attn1.to_k.bias": "attn1.to_k.bias",
+            "attn1.to_v.weight": "attn1.to_v.weight",
+            "attn1.to_v.bias": "attn1.to_v.bias",
+            "attn1.to_out.0.weight": "attn1.to_out.weight",
+            "attn1.to_out.0.bias": "attn1.to_out.bias",
+            "norm2.linear.weight": "norm2.linear.weight",
+            "norm2.linear.bias": "norm2.linear.bias",
+            "norm2.norm.weight": "norm2.norm.weight",
+            "norm2.norm.bias": "norm2.norm.bias",
+            "ff.net.0.proj.weight": "ff.0.weight",
+            "ff.net.0.proj.bias": "ff.0.bias",
+            "ff.net.2.weight": "ff.2.weight",
+            "ff.net.2.bias": "ff.2.bias",
+        }
+        converted = {}
+        for name, param in state_dict.items():
+            target = rename_dict.get(name)
+            if target is not None:
+                if name == "patch_embed.proj.weight":
+                    # The checkpoint kernel lacks the frame axis that the
+                    # Conv3d in CogPatchify expects; insert it at dim 2.
+                    param = param.unsqueeze(2)
+                converted[target] = param
+                continue
+            prefix, *rest = name.split(".")
+            if prefix != "transformer_blocks":
+                continue
+            block_index = rest[0]
+            suffix = ".".join(rest[1:])
+            converted[f"blocks.{block_index}." + suffix_dict[suffix]] = param
+        return converted
+
+    def from_civitai(self, state_dict):
+        """Alias of ``from_diffusers``; both formats share the same layout here."""
+        return self.from_diffusers(state_dict)
--- a/diffsynth/models/cog_vae.py
+++ b/diffsynth/models/cog_vae.py
@@ -0,0 +1,518 @@
+import torch
+from einops import rearrange, repeat
+from .tiler import TileWorker2Dto3D
+
+
+
+class Downsample3D(torch.nn.Module):
+    """Spatial (and optionally temporal) downsampling for video features.
+
+    Space is reduced by a 2D convolution applied to every frame after padding
+    one zero column on the right and one zero row at the bottom. With
+    ``compress_time`` the frame count is halved first by average pooling,
+    keeping the first frame unpooled when the count is odd.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int = 3,
+        stride: int = 2,
+        padding: int = 0,
+        compress_time: bool = False,
+    ):
+        super().__init__()
+
+        self.conv = torch.nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
+        self.compress_time = compress_time
+
+    def _halve_frames(self, x: torch.Tensor) -> torch.Tensor:
+        """Average-pool pairs of frames of a ``(B, C, T, H, W)`` tensor."""
+        batch, channels, frames, height, width = x.shape
+        # Move the frame axis last so that avg_pool1d pools over time:
+        # (B, C, T, H, W) -> (B * H * W, C, T)
+        seq = x.permute(0, 3, 4, 1, 2).reshape(batch * height * width, channels, frames)
+        if frames % 2 == 1:
+            # Odd length: the first frame passes through, the rest is pooled.
+            head, tail = seq[..., :1], seq[..., 1:]
+            if tail.shape[-1] > 0:
+                tail = torch.nn.functional.avg_pool1d(tail, kernel_size=2, stride=2)
+            seq = torch.cat([head, tail], dim=-1)
+        else:
+            seq = torch.nn.functional.avg_pool1d(seq, kernel_size=2, stride=2)
+        # (B * H * W, C, T') -> (B, C, T', H, W)
+        return seq.reshape(batch, height, width, channels, seq.shape[-1]).permute(0, 3, 4, 1, 2)
+
+    def forward(self, x: torch.Tensor, xq: torch.Tensor) -> torch.Tensor:
+        """Downsample ``x``; ``xq`` is accepted but not used."""
+        if self.compress_time:
+            x = self._halve_frames(x)
+
+        # Zero-pad the right and bottom edges only.
+        x = torch.nn.functional.pad(x, (0, 1, 0, 1), mode="constant", value=0)
+        batch, channels, frames, height, width = x.shape
+        # Fold frames into the batch so the 2D conv sees one image per frame.
+        images = x.permute(0, 2, 1, 3, 4).reshape(batch * frames, channels, height, width)
+        images = self.conv(images)
+        out_channels, out_height, out_width = images.shape[1:]
+        return images.reshape(batch, frames, out_channels, out_height, out_width).permute(0, 2, 1, 3, 4)
+
+
+
+class Upsample3D(torch.nn.Module):
+    """2x upsampling for video features followed by a per-frame 2D conv.
+
+    Without ``compress_time`` only height and width are doubled. With it, the
+    frame axis is doubled too, except that the first frame of an odd-length
+    clip (and a lone single frame) is upsampled spatially only.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int = 3,
+        stride: int = 1,
+        padding: int = 1,
+        compress_time: bool = False,
+    ) -> None:
+        super().__init__()
+        self.conv = torch.nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
+        self.compress_time = compress_time
+
+    def forward(self, inputs: torch.Tensor, xq: torch.Tensor) -> torch.Tensor:
+        """Upsample ``inputs`` of shape ``(B, C, T, H, W)``; ``xq`` is unused."""
+        interpolate = torch.nn.functional.interpolate
+
+        if not self.compress_time:
+            # Spatial only: treat every frame as an independent image.
+            b, c, t, h, w = inputs.shape
+            images = inputs.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
+            images = interpolate(images, scale_factor=2.0)
+            inputs = images.reshape(b, t, c, *images.shape[2:]).permute(0, 2, 1, 3, 4)
+        elif inputs.shape[2] <= 1:
+            # Single frame: drop the frame axis, upsample in 2D, restore it.
+            inputs = interpolate(inputs.squeeze(2), scale_factor=2.0)[:, :, None, :, :]
+        elif inputs.shape[2] % 2 == 1:
+            # Odd length: the first frame is upsampled in 2D only, the rest
+            # is upsampled along time as well.
+            head = interpolate(inputs[:, :, 0], scale_factor=2.0)[:, :, None, :, :]
+            tail = interpolate(inputs[:, :, 1:], scale_factor=2.0)
+            inputs = torch.cat([head, tail], dim=2)
+        else:
+            inputs = interpolate(inputs, scale_factor=2.0)
+
+        # Per-frame 2D convolution.
+        b, c, t, h, w = inputs.shape
+        images = self.conv(inputs.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w))
+        return images.reshape(b, t, *images.shape[1:]).permute(0, 2, 1, 3, 4)
+
+
+
+class CogVideoXSpatialNorm3D(torch.nn.Module):
+    """GroupNorm whose scale and bias come from a conditioning tensor ``zq``.
+
+    ``zq`` is resized to the (frames, height, width) of the features and then
+    mapped by two 1x1x1 convolutions to a per-position multiplier and offset.
+    """
+
+    def __init__(self, f_channels, zq_channels, groups):
+        super().__init__()
+        self.norm_layer = torch.nn.GroupNorm(num_channels=f_channels, num_groups=groups, eps=1e-6, affine=True)
+        self.conv_y = torch.nn.Conv3d(zq_channels, f_channels, kernel_size=1, stride=1)
+        self.conv_b = torch.nn.Conv3d(zq_channels, f_channels, kernel_size=1, stride=1)
+
+    def forward(self, f: torch.Tensor, zq: torch.Tensor) -> torch.Tensor:
+        """Return ``norm(f) * conv_y(zq') + conv_b(zq')`` with ``zq'`` resized to ``f``."""
+        interpolate = torch.nn.functional.interpolate
+        num_frames = f.shape[2]
+        if num_frames <= 1 or num_frames % 2 == 0:
+            zq = interpolate(zq, size=f.shape[-3:])
+        else:
+            # Odd clip longer than one frame: the first frame of zq is
+            # resized on its own, the remaining frames together.
+            head_size = f[:, :, :1].shape[-3:]
+            tail_size = f[:, :, 1:].shape[-3:]
+            zq = torch.cat(
+                [
+                    interpolate(zq[:, :, :1], size=head_size),
+                    interpolate(zq[:, :, 1:], size=tail_size),
+                ],
+                dim=2,
+            )
+
+        return self.norm_layer(f) * self.conv_y(zq) + self.conv_b(zq)
+
+
+
+class Resnet3DBlock(torch.nn.Module):
+    """Residual block of two (norm -> SiLU -> cached 3D conv) stages.
+
+    When ``spatial_norm_dim`` is ``None`` plain GroupNorm is used; otherwise
+    the norms are ``CogVideoXSpatialNorm3D`` layers conditioned on ``zq``.
+    The skip path is the identity when channel counts match, else a 1x1x1
+    convolution (or a cached 3x3x3 one if ``use_conv_shortcut``).
+    """
+
+    def __init__(self, in_channels, out_channels, spatial_norm_dim, groups, eps=1e-6, use_conv_shortcut=False):
+        super().__init__()
+        self.nonlinearity = torch.nn.SiLU()
+        if spatial_norm_dim is not None:
+            self.norm1 = CogVideoXSpatialNorm3D(in_channels, spatial_norm_dim, groups)
+            self.norm2 = CogVideoXSpatialNorm3D(out_channels, spatial_norm_dim, groups)
+        else:
+            self.norm1 = torch.nn.GroupNorm(num_channels=in_channels, num_groups=groups, eps=eps)
+            self.norm2 = torch.nn.GroupNorm(num_channels=out_channels, num_groups=groups, eps=eps)
+
+        # No temporal padding: CachedConv3d supplies the two preceding frames.
+        self.conv1 = CachedConv3d(in_channels, out_channels, kernel_size=3, padding=(0, 1, 1))
+        self.conv2 = CachedConv3d(out_channels, out_channels, kernel_size=3, padding=(0, 1, 1))
+
+        if in_channels == out_channels:
+            self.conv_shortcut = lambda x: x
+        elif use_conv_shortcut:
+            self.conv_shortcut = CachedConv3d(in_channels, out_channels, kernel_size=3, padding=(0, 1, 1))
+        else:
+            self.conv_shortcut = torch.nn.Conv3d(in_channels, out_channels, kernel_size=1)
+
+    @staticmethod
+    def _apply_norm(norm, hidden_states, zq):
+        """Call ``norm`` with ``zq`` only if it is a spatially conditioned norm."""
+        if isinstance(norm, CogVideoXSpatialNorm3D):
+            return norm(hidden_states, zq)
+        return norm(hidden_states)
+
+    def forward(self, hidden_states, zq):
+        """Return ``conv_shortcut(x) + conv2(act(norm2(conv1(act(norm1(x))))))``."""
+        residual = hidden_states
+
+        out = self._apply_norm(self.norm1, hidden_states, zq)
+        out = self.conv1(self.nonlinearity(out))
+
+        out = self._apply_norm(self.norm2, out, zq)
+        out = self.conv2(self.nonlinearity(out))
+
+        return out + self.conv_shortcut(residual)
+    
+
+
+class CachedConv3d(torch.nn.Conv3d):
+    """Conv3d that prepends two remembered frames to each input.
+
+    On the first call after ``clear_cache`` the input's first frame is
+    repeated twice and used as the history. Afterwards the last two frames of
+    the previous (extended) input are used. This lets a clip be processed
+    chunk by chunk along the frame axis (dim 2).
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
+        super().__init__(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
+        self.cached_tensor = None
+
+    def clear_cache(self):
+        """Forget the stored frames so the next call starts a new clip."""
+        self.cached_tensor = None
+
+    def forward(self, input: torch.Tensor, use_cache = True) -> torch.Tensor:
+        """Convolve ``input``; with ``use_cache`` prepend and update the history."""
+        if not use_cache:
+            return super().forward(input)
+
+        if self.cached_tensor is None:
+            first_frame = input[:, :, :1]
+            self.cached_tensor = torch.concat([first_frame, first_frame], dim=2)
+        input = torch.concat([self.cached_tensor, input], dim=2)
+        # Keep the trailing two frames as history for the next chunk.
+        self.cached_tensor = input[:, :, -2:]
+        return super().forward(input)
+
+
+
+class CogVAEDecoder(torch.nn.Module):
+    """Decoder half of the CogVideoX video VAE (16-channel latent -> RGB).
+
+    The block list holds three ``Upsample3D`` stages, the first two of which
+    also expand time. Every residual block and the output norm are
+    conditioned on the (rescaled) input latent.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.scaling_factor = 0.7
+        self.conv_in = CachedConv3d(16, 512, kernel_size=3, stride=1, padding=(0, 1, 1))
+
+        self.blocks = torch.nn.ModuleList([
+            Resnet3DBlock(512, 512, 16, 32),
+            Resnet3DBlock(512, 512, 16, 32),
+            Resnet3DBlock(512, 512, 16, 32),
+            Resnet3DBlock(512, 512, 16, 32),
+            Resnet3DBlock(512, 512, 16, 32),
+            Resnet3DBlock(512, 512, 16, 32),
+            Upsample3D(512, 512, compress_time=True),
+            Resnet3DBlock(512, 256, 16, 32),
+            Resnet3DBlock(256, 256, 16, 32),
+            Resnet3DBlock(256, 256, 16, 32),
+            Resnet3DBlock(256, 256, 16, 32),
+            Upsample3D(256, 256, compress_time=True),
+            Resnet3DBlock(256, 256, 16, 32),
+            Resnet3DBlock(256, 256, 16, 32),
+            Resnet3DBlock(256, 256, 16, 32),
+            Resnet3DBlock(256, 256, 16, 32),
+            Upsample3D(256, 256, compress_time=False),
+            Resnet3DBlock(256, 128, 16, 32),
+            Resnet3DBlock(128, 128, 16, 32),
+            Resnet3DBlock(128, 128, 16, 32),
+            Resnet3DBlock(128, 128, 16, 32),
+        ])
+
+        self.norm_out = CogVideoXSpatialNorm3D(128, 16, 32)
+        self.conv_act = torch.nn.SiLU()
+        self.conv_out = CachedConv3d(128, 3, kernel_size=3, stride=1, padding=(0, 1, 1))
+
+    def forward(self, sample):
+        """Decode one chunk of latent frames.
+
+        The ``CachedConv3d`` layers keep state between calls, so consecutive
+        chunks of one clip must be passed in order and the caches cleared
+        afterwards (``decode_small_video`` does both).
+        """
+        sample = sample / self.scaling_factor
+        hidden_states = self.conv_in(sample)
+
+        # The rescaled latent also serves as the conditioning input of each block.
+        for block in self.blocks:
+            hidden_states = block(hidden_states, sample)
+
+        hidden_states = self.norm_out(hidden_states, sample)
+        hidden_states = self.conv_act(hidden_states)
+        hidden_states = self.conv_out(hidden_states)
+
+        return hidden_states
+
+    def decode_video(self, sample, tiled=True, tile_size=(60, 90), tile_stride=(30, 45), progress_bar=lambda x:x):
+        """Decode a latent clip ``(B, C, T, H, W)``, optionally tile by tile.
+
+        With ``tiled`` the clip is split spatially by ``TileWorker2Dto3D`` and
+        each tile goes through ``decode_small_video``. ``scales`` gives the
+        worker the output/input size ratio per axis.
+        """
+        if tiled:
+            B, C, T, H, W = sample.shape
+            return TileWorker2Dto3D().tiled_forward(
+                forward_fn=lambda x: self.decode_small_video(x),
+                model_input=sample,
+                tile_size=tile_size, tile_stride=tile_stride,
+                tile_device=sample.device, tile_dtype=sample.dtype,
+                computation_device=sample.device, computation_dtype=sample.dtype,
+                scales=(3/16, (T//2*8+T%2)/T, 8, 8),
+                progress_bar=progress_bar
+            )
+        else:
+            return self.decode_small_video(sample)
+
+    def decode_small_video(self, sample):
+        """Decode a clip chunk by chunk along the frame axis.
+
+        Chunks are two latent frames long; for an odd frame count the first
+        chunk additionally contains the leading frame. A clip too short to
+        form a chunk (``T < 2``) is decoded in one pass instead of failing on
+        an empty concatenation. The conv caches are cleared even if decoding
+        raises, so a failed call cannot contaminate the next clip.
+        """
+        B, C, T, H, W = sample.shape
+        computation_device = self.conv_in.weight.device
+        computation_dtype = self.conv_in.weight.dtype
+
+        odd = T % 2
+        chunks = [(i * 2 + odd if i > 0 else 0, i * 2 + 2 + odd) for i in range(T // 2)]
+        if not chunks:
+            chunks = [(0, T)]
+
+        value = []
+        try:
+            for tl, tr in chunks:
+                model_input = sample[:, :, tl: tr, :, :].to(dtype=computation_dtype, device=computation_device)
+                model_output = self.forward(model_input).to(dtype=sample.dtype, device=sample.device)
+                value.append(model_output)
+        finally:
+            for module in self.modules():
+                if isinstance(module, CachedConv3d):
+                    module.clear_cache()
+        return torch.concat(value, dim=2)
+
+    @staticmethod
+    def state_dict_converter():
+        """Return the checkpoint-key converter for this decoder."""
+        return CogVAEDecoderStateDictConverter()
+    
+
+
+class CogVAEEncoder(torch.nn.Module):
+    """Encoder half of the CogVideoX video VAE (RGB -> 16-channel latent).
+
+    The block list holds three ``Downsample3D`` stages, the first two of
+    which also halve the frame count. All norms are plain GroupNorm.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.scaling_factor = 0.7
+        self.conv_in = CachedConv3d(3, 128, kernel_size=3, stride=1, padding=(0, 1, 1))
+
+        self.blocks = torch.nn.ModuleList([
+            Resnet3DBlock(128, 128, None, 32),
+            Resnet3DBlock(128, 128, None, 32),
+            Resnet3DBlock(128, 128, None, 32),
+            Downsample3D(128, 128, compress_time=True),
+            Resnet3DBlock(128, 256, None, 32),
+            Resnet3DBlock(256, 256, None, 32),
+            Resnet3DBlock(256, 256, None, 32),
+            Downsample3D(256, 256, compress_time=True),
+            Resnet3DBlock(256, 256, None, 32),
+            Resnet3DBlock(256, 256, None, 32),
+            Resnet3DBlock(256, 256, None, 32),
+            Downsample3D(256, 256, compress_time=False),
+            Resnet3DBlock(256, 512, None, 32),
+            Resnet3DBlock(512, 512, None, 32),
+            Resnet3DBlock(512, 512, None, 32),
+            Resnet3DBlock(512, 512, None, 32),
+            Resnet3DBlock(512, 512, None, 32),
+        ])
+
+        self.norm_out = torch.nn.GroupNorm(32, 512, eps=1e-06, affine=True)
+        self.conv_act = torch.nn.SiLU()
+        self.conv_out = CachedConv3d(512, 32, kernel_size=3, stride=1, padding=(0, 1, 1))
+
+    def forward(self, sample):
+        """Encode one chunk of frames.
+
+        The ``CachedConv3d`` layers keep state between calls, so consecutive
+        chunks of one clip must be passed in order and the caches cleared
+        afterwards (``encode_small_video`` does both).
+        """
+        hidden_states = self.conv_in(sample)
+
+        # The second argument is ignored by the blocks used here: the
+        # Resnet3DBlocks have plain GroupNorm and Downsample3D never reads it.
+        for block in self.blocks:
+            hidden_states = block(hidden_states, sample)
+
+        hidden_states = self.norm_out(hidden_states)
+        hidden_states = self.conv_act(hidden_states)
+        # conv_out emits 32 channels; only the first 16 are kept.
+        # NOTE(review): presumably the mean half of a (mean, logvar) pair -- confirm.
+        hidden_states = self.conv_out(hidden_states)[:, :16]
+        hidden_states = hidden_states * self.scaling_factor
+
+        return hidden_states
+
+    def encode_video(self, sample, tiled=True, tile_size=(60, 90), tile_stride=(30, 45), progress_bar=lambda x:x):
+        """Encode a clip ``(B, C, T, H, W)``, optionally tile by tile.
+
+        ``tile_size``/``tile_stride`` are multiplied by 8 before being handed
+        to ``TileWorker2Dto3D``. ``scales`` gives the worker the output/input
+        size ratio per axis.
+        """
+        if tiled:
+            B, C, T, H, W = sample.shape
+            # Materialise real tuples. A generator expression can be consumed
+            # only once and cannot be indexed, unlike the tuples that
+            # decode_video passes to the same worker.
+            pixel_tile_size = tuple(i * 8 for i in tile_size)
+            pixel_tile_stride = tuple(i * 8 for i in tile_stride)
+            return TileWorker2Dto3D().tiled_forward(
+                forward_fn=lambda x: self.encode_small_video(x),
+                model_input=sample,
+                tile_size=pixel_tile_size, tile_stride=pixel_tile_stride,
+                tile_device=sample.device, tile_dtype=sample.dtype,
+                computation_device=sample.device, computation_dtype=sample.dtype,
+                scales=(16/3, (T//4+T%2)/T, 1/8, 1/8),
+                progress_bar=progress_bar
+            )
+        else:
+            return self.encode_small_video(sample)
+
+    def encode_small_video(self, sample):
+        """Encode a clip chunk by chunk along the frame axis.
+
+        Chunks are eight frames long; for an odd frame count the first chunk
+        additionally contains the leading frame. Frames past
+        ``(T // 8) * 8 + T % 2`` are not covered by any chunk. A clip too
+        short to form a chunk (``T < 8``) is encoded in one pass instead of
+        failing on an empty concatenation. The conv caches are cleared even
+        if encoding raises, so a failed call cannot contaminate the next clip.
+        """
+        B, C, T, H, W = sample.shape
+        computation_device = self.conv_in.weight.device
+        computation_dtype = self.conv_in.weight.dtype
+
+        odd = T % 2
+        chunks = [(i * 8 + odd if i > 0 else 0, i * 8 + 8 + odd) for i in range(T // 8)]
+        if not chunks:
+            chunks = [(0, T)]
+
+        value = []
+        try:
+            for t, t_ in chunks:
+                model_input = sample[:, :, t: t_, :, :].to(dtype=computation_dtype, device=computation_device)
+                model_output = self.forward(model_input).to(dtype=sample.dtype, device=sample.device)
+                value.append(model_output)
+        finally:
+            for module in self.modules():
+                if isinstance(module, CachedConv3d):
+                    module.clear_cache()
+        return torch.concat(value, dim=2)
+
+    @staticmethod
+    def state_dict_converter():
+        """Return the checkpoint-key converter for this encoder."""
+        return CogVAEEncoderStateDictConverter()
+
+
+
+class CogVAEEncoderStateDictConverter:
+    """
+    Renames diffusers-style VAE encoder keys to the parameter names of `CogVAEEncoder`.
+    """
+    def __init__(self):
+        pass
+
+
+    def from_diffusers(self, state_dict):
+        """
+        Return a new dict holding the recognized entries of `state_dict` under their new names.
+
+        Keys that match neither a fixed name nor a resnet prefix are dropped. A key under a
+        resnet prefix whose remainder is not a known suffix raises KeyError.
+        """
+        # Keys that are renamed as a whole.
+        direct = {}
+        for leaf in ("weight", "bias"):
+            direct[f"encoder.conv_in.conv.{leaf}"] = f"conv_in.{leaf}"
+            direct[f"encoder.norm_out.{leaf}"] = f"norm_out.{leaf}"
+            direct[f"encoder.conv_out.conv.{leaf}"] = f"conv_out.{leaf}"
+            # The downsampler of down block b is blocks[4*b + 3].
+            for b in range(3):
+                direct[f"encoder.down_blocks.{b}.downsamplers.0.conv.{leaf}"] = f"blocks.{4*b + 3}.conv.{leaf}"
+
+        # Resnet r of down block b is blocks[4*b + r]; the two mid-block resnets come last.
+        heads = {
+            f"encoder.down_blocks.{b}.resnets.{r}.": f"blocks.{4*b + r}."
+            for b in range(4) for r in range(3)
+        }
+        heads["encoder.mid_block.resnets.0."] = "blocks.15."
+        heads["encoder.mid_block.resnets.1."] = "blocks.16."
+
+        # Parameter names inside a resnet; wrapped convolutions lose their inner ".conv".
+        tails = {}
+        for leaf in ("weight", "bias"):
+            for norm in ("norm1", "norm2"):
+                tails[f"{norm}.norm_layer.{leaf}"] = f"{norm}.norm_layer.{leaf}"
+                tails[f"{norm}.conv_y.conv.{leaf}"] = f"{norm}.conv_y.{leaf}"
+                tails[f"{norm}.conv_b.conv.{leaf}"] = f"{norm}.conv_b.{leaf}"
+                tails[f"{norm}.{leaf}"] = f"{norm}.{leaf}"
+            for conv in ("conv1", "conv2"):
+                tails[f"{conv}.conv.{leaf}"] = f"{conv}.{leaf}"
+            tails[f"conv_shortcut.{leaf}"] = f"conv_shortcut.{leaf}"
+
+        converted = {}
+        for key, tensor in state_dict.items():
+            target = direct.get(key)
+            if target is not None:
+                converted[target] = tensor
+                continue
+            for head, new_head in heads.items():
+                if key.startswith(head):
+                    converted[new_head + tails[key[len(head):]]] = tensor
+        return converted
+
+
+    def from_civitai(self, state_dict):
+        """Same conversion as `from_diffusers`."""
+        return self.from_diffusers(state_dict)
+
+
+
+class CogVAEDecoderStateDictConverter:
+    """
+    Renames diffusers-style VAE decoder keys to a flat `blocks.N.` layout.
+    """
+    def __init__(self):
+        pass
+
+
+    def from_diffusers(self, state_dict):
+        """
+        Return a new dict holding the recognized entries of `state_dict` under their new names.
+
+        Keys that match neither a fixed name nor a resnet prefix are dropped. A key under a
+        resnet prefix whose remainder is not a known suffix raises KeyError.
+        """
+        # Keys that are renamed as a whole.
+        direct = {}
+        for leaf in ("weight", "bias"):
+            direct[f"decoder.conv_in.conv.{leaf}"] = f"conv_in.{leaf}"
+            direct[f"decoder.conv_out.conv.{leaf}"] = f"conv_out.{leaf}"
+            direct[f"decoder.norm_out.norm_layer.{leaf}"] = f"norm_out.norm_layer.{leaf}"
+            direct[f"decoder.norm_out.conv_y.conv.{leaf}"] = f"norm_out.conv_y.{leaf}"
+            direct[f"decoder.norm_out.conv_b.conv.{leaf}"] = f"norm_out.conv_b.{leaf}"
+            # The upsampler of up block b is blocks[5*b + 6].
+            for b in range(3):
+                direct[f"decoder.up_blocks.{b}.upsamplers.0.conv.{leaf}"] = f"blocks.{5*b + 6}.conv.{leaf}"
+
+        # The two mid-block resnets come first; resnet r of up block b is blocks[5*b + 2 + r].
+        heads = {
+            "decoder.mid_block.resnets.0.": "blocks.0.",
+            "decoder.mid_block.resnets.1.": "blocks.1.",
+        }
+        for b in range(4):
+            for r in range(4):
+                heads[f"decoder.up_blocks.{b}.resnets.{r}."] = f"blocks.{5*b + 2 + r}."
+
+        # Parameter names inside a resnet; wrapped convolutions lose their inner ".conv".
+        tails = {}
+        for leaf in ("weight", "bias"):
+            for norm in ("norm1", "norm2"):
+                tails[f"{norm}.norm_layer.{leaf}"] = f"{norm}.norm_layer.{leaf}"
+                tails[f"{norm}.conv_y.conv.{leaf}"] = f"{norm}.conv_y.{leaf}"
+                tails[f"{norm}.conv_b.conv.{leaf}"] = f"{norm}.conv_b.{leaf}"
+            for conv in ("conv1", "conv2"):
+                tails[f"{conv}.conv.{leaf}"] = f"{conv}.{leaf}"
+            tails[f"conv_shortcut.{leaf}"] = f"conv_shortcut.{leaf}"
+
+        converted = {}
+        for key, tensor in state_dict.items():
+            target = direct.get(key)
+            if target is not None:
+                converted[target] = tensor
+                continue
+            for head, new_head in heads.items():
+                if key.startswith(head):
+                    converted[new_head + tails[key[len(head):]]] = tensor
+        return converted
+
+
+    def from_civitai(self, state_dict):
+        """Same conversion as `from_diffusers`."""
+        return self.from_diffusers(state_dict)
+
--- a/diffsynth/models/model_manager.py
+++ b/diffsynth/models/model_manager.py
@@ -43,93 +43,17 @@ from .flux_dit import FluxDiT
 from .flux_text_encoder import FluxTextEncoder1, FluxTextEncoder2
 from .flux_vae import FluxVAEEncoder, FluxVAEDecoder

+from .cog_vae import CogVAEEncoder, CogVAEDecoder
+from .cog_dit import CogDiT
+
+from ..extensions.RIFE import IFNet
+from ..extensions.ESRGAN import RRDBNet
+
 from ..configs.model_config import model_loader_configs, huggingface_model_loader_configs, patch_model_loader_configs
+from .utils import load_state_dict



-def load_state_dict(file_path, torch_dtype=None):
-    if file_path.endswith(".safetensors"):
-        return load_state_dict_from_safetensors(file_path, torch_dtype=torch_dtype)
-    else:
-        return load_state_dict_from_bin(file_path, torch_dtype=torch_dtype)
-
-
-def load_state_dict_from_safetensors(file_path, torch_dtype=None):
-    state_dict = {}
-    with safe_open(file_path, framework="pt", device="cpu") as f:
-        for k in f.keys():
-            state_dict[k] = f.get_tensor(k)
-            if torch_dtype is not None:
-                state_dict[k] = state_dict[k].to(torch_dtype)
-    return state_dict
-
-
-def load_state_dict_from_bin(file_path, torch_dtype=None):
-    state_dict = torch.load(file_path, map_location="cpu")
-    if torch_dtype is not None:
-        for i in state_dict:
-            if isinstance(state_dict[i], torch.Tensor):
-                state_dict[i] = state_dict[i].to(torch_dtype)
-    return state_dict
-
-
-def search_for_embeddings(state_dict):
-    embeddings = []
-    for k in state_dict:
-        if isinstance(state_dict[k], torch.Tensor):
-            embeddings.append(state_dict[k])
-        elif isinstance(state_dict[k], dict):
-            embeddings += search_for_embeddings(state_dict[k])
-    return embeddings
-
-
-def search_parameter(param, state_dict):
-    for name, param_ in state_dict.items():
-        if param.numel() == param_.numel():
-            if param.shape == param_.shape:
-                if torch.dist(param, param_) < 1e-3:
-                    return name
-            else:
-                if torch.dist(param.flatten(), param_.flatten()) < 1e-3:
-                    return name
-    return None
-
-
-def build_rename_dict(source_state_dict, target_state_dict, split_qkv=False):
-    matched_keys = set()
-    with torch.no_grad():
-        for name in source_state_dict:
-            rename = search_parameter(source_state_dict[name], target_state_dict)
-            if rename is not None:
-                print(f'"{name}": "{rename}",')
-                matched_keys.add(rename)
-            elif split_qkv and len(source_state_dict[name].shape)>=1 and source_state_dict[name].shape[0]%3==0:
-                length = source_state_dict[name].shape[0] // 3
-                rename = []
-                for i in range(3):
-                    rename.append(search_parameter(source_state_dict[name][i*length: i*length+length], target_state_dict))
-                if None not in rename:
-                    print(f'"{name}": {rename},')
-                    for rename_ in rename:
-                        matched_keys.add(rename_)
-    for name in target_state_dict:
-        if name not in matched_keys:
-            print("Cannot find", name, target_state_dict[name].shape)
-
-
-def search_for_files(folder, extensions):
-    files = []
-    if os.path.isdir(folder):
-        for file in sorted(os.listdir(folder)):
-            files += search_for_files(os.path.join(folder, file), extensions)
-    elif os.path.isfile(folder):
-        for extension in extensions:
-            if folder.endswith(extension):
-                files.append(folder)
-                break
-    return files
-
-
 def convert_state_dict_keys_to_single_str(state_dict, with_shape=True):
    keys = []
    for key, value in state_dict.items():
@@ -356,7 +280,7 @@ class ModelDetectorFromHuggingfaceFolder:
            return False
        with open(os.path.join(file_path, "config.json"), "r") as f:
            config = json.load(f)
-        if "architectures" not in config:
+        if "architectures" not in config and "_class_name" not in config:
            return False
        return True

@@ -365,7 +289,8 @@ class ModelDetectorFromHuggingfaceFolder:
        with open(os.path.join(file_path, "config.json"), "r") as f:
            config = json.load(f)
        loaded_model_names, loaded_models = [], []
-        for architecture in config["architectures"]:
+        architectures = config["architectures"] if "architectures" in config else [config["_class_name"]]
+        for architecture in architectures:
            huggingface_lib, model_name, redirected_architecture = self.architecture_dict[architecture]
            if redirected_architecture is not None:
                architecture = redirected_architecture
--- a/diffsynth/models/tiler.py
+++ b/diffsynth/models/tiler.py
@@ -103,4 +103,78 @@ class TileWorker:
        
        # Done!
        model_output = model_output.to(device=inference_device, dtype=inference_dtype)
-        return model_output
+        return model_output
+    
+
+
+class TileWorker2Dto3D:
+    """
+    Process 3D tensors, but only enable TileWorker on 2D.
+
+    A (B, C, T, H, W) input is cut into overlapping tiles along H and W only; every tile
+    keeps the full T axis. Tile outputs are blended with a weight mask that fades towards
+    tile borders that are not borders of the full input.
+    """
+    def __init__(self):
+        pass
+
+
+    def build_mask(self, T, H, W, dtype, device, is_bound, border_width):
+        """
+        Build the blending weights for one output tile, shaped (1, 1, T, H, W).
+
+        `is_bound` holds six flags, for the (t-start, t-end, h-start, h-end, w-start, w-end)
+        faces of the tile. For every face whose flag is False the weight grows linearly with
+        the 1-based distance from that face; faces flagged True do not reduce the weight.
+        The result is clipped to [1, border_width] and divided by `border_width`, so weights
+        lie in (0, 1]. `border_width=None` selects `(H + W) // 4`, but never less than 1.
+        """
+        t = repeat(torch.arange(T), "T -> T H W", T=T, H=H, W=W)
+        h = repeat(torch.arange(H), "H -> T H W", T=T, H=H, W=W)
+        w = repeat(torch.arange(W), "W -> T H W", T=T, H=H, W=W)
+        # Floor the default at 1: for very small tiles (H + W) // 4 is 0, which would turn
+        # the division below into a division by zero.
+        border_width = max((H + W) // 4, 1) if border_width is None else border_width
+        pad = torch.ones_like(h) * border_width
+        mask = torch.stack([
+            pad if is_bound[0] else t + 1,
+            pad if is_bound[1] else T - t,
+            pad if is_bound[2] else h + 1,
+            pad if is_bound[3] else H - h,
+            pad if is_bound[4] else w + 1,
+            pad if is_bound[5] else W - w
+        ]).min(dim=0).values
+        mask = mask.clip(1, border_width)
+        mask = (mask / border_width).to(dtype=dtype, device=device)
+        mask = rearrange(mask, "T H W -> 1 1 T H W")
+        return mask
+
+
+    def tiled_forward(
+        self,
+        forward_fn,
+        model_input,
+        tile_size, tile_stride,
+        tile_device="cpu", tile_dtype=torch.float32,
+        computation_device="cuda", computation_dtype=torch.float32,
+        border_width=None, scales=(1, 1, 1, 1),
+        progress_bar=lambda x:x
+    ):
+        """
+        Apply `forward_fn` to overlapping spatial tiles of `model_input` and blend the outputs.
+
+        Args:
+            forward_fn: Callable applied to each (B, C, T, tile_H, tile_W) tile.
+            model_input: Tensor of shape (B, C, T, H, W).
+            tile_size: (height, width) of a tile, in input pixels.
+            tile_stride: (height, width) step between tiles, in input pixels.
+            tile_device, tile_dtype: Where the accumulated output and the masks are kept.
+            computation_device, computation_dtype: What each tile is converted to before
+                `forward_fn` is called.
+            border_width: Forwarded to `build_mask`.
+            scales: Output/input size ratios for (C, T, H, W). Output sizes and tile
+                offsets are computed as `int(size * scale)`.
+            progress_bar: Wrapper applied to the list of tiles (e.g. tqdm).
+
+        Returns:
+            The blended output on `tile_device` with dtype `tile_dtype`.
+        """
+        B, C, T, H, W = model_input.shape
+        scale_C, scale_T, scale_H, scale_W = scales
+        tile_size_H, tile_size_W = tile_size
+        tile_stride_H, tile_stride_W = tile_stride
+
+        # Accumulators: weighted sum of tile outputs, and the sum of the weights.
+        value = torch.zeros((B, int(C*scale_C), int(T*scale_T), int(H*scale_H), int(W*scale_W)), dtype=tile_dtype, device=tile_device)
+        weight = torch.zeros((1, 1, int(T*scale_T), int(H*scale_H), int(W*scale_W)), dtype=tile_dtype, device=tile_device)
+
+        # Split tasks
+        tasks = []
+        for h in range(0, H, tile_stride_H):
+            for w in range(0, W, tile_stride_W):
+                # Skip this position if the tile one stride earlier already reached the end of an axis.
+                if (h-tile_stride_H >= 0 and h-tile_stride_H+tile_size_H >= H) or (w-tile_stride_W >= 0 and w-tile_stride_W+tile_size_W >= W):
+                    continue
+                h_, w_ = h + tile_size_H, w + tile_size_W
+                # A tile that would run past the edge is shifted back so that it ends at the edge.
+                if h_ > H: h, h_ = max(H - tile_size_H, 0), H
+                if w_ > W: w, w_ = max(W - tile_size_W, 0), W
+                tasks.append((h, h_, w, w_))
+
+        # Run
+        for hl, hr, wl, wr in progress_bar(tasks):
+            mask = self.build_mask(
+                int(T*scale_T), int((hr-hl)*scale_H), int((wr-wl)*scale_W),
+                tile_dtype, tile_device,
+                # The time axis is never tiled, so both of its faces count as borders of the input.
+                is_bound=(True, True, hl==0, hr>=H, wl==0, wr>=W),
+                border_width=border_width
+            )
+            grid_input = model_input[:, :, :, hl:hr, wl:wr].to(dtype=computation_dtype, device=computation_device)
+            grid_output = forward_fn(grid_input).to(dtype=tile_dtype, device=tile_device)
+            value[:, :, :, int(hl*scale_H):int(hr*scale_H), int(wl*scale_W):int(wr*scale_W)] += grid_output * mask
+            weight[:, :, :, int(hl*scale_H):int(hr*scale_H), int(wl*scale_W):int(wr*scale_W)] += mask
+        # Normalize by the accumulated weights to get the blended result.
+        value = value / weight
+        return value
--- a/diffsynth/models/utils.py
+++ b/diffsynth/models/utils.py
@@ -0,0 +1,96 @@
+import torch, os
+from safetensors import safe_open
+
+
+
+def load_state_dict_from_folder(file_path, torch_dtype=None):
+    """
+    Load every checkpoint file directly inside a folder and merge them into one state dict.
+
+    Only files whose extension is one of safetensors, bin, ckpt, pth or pt are loaded;
+    subfolders are not searched. Files are visited in sorted name order, so when several
+    files contain the same key the value from the last file in that order is kept.
+
+    Args:
+        file_path: Path of the folder.
+        torch_dtype: Forwarded to `load_state_dict`.
+
+    Returns:
+        The merged dict; empty if the folder holds no matching file.
+    """
+    state_dict = {}
+    # os.listdir returns entries in arbitrary order; sort to make the merge deterministic.
+    for file_name in sorted(os.listdir(file_path)):
+        if "." in file_name and file_name.split(".")[-1] in [
+            "safetensors", "bin", "ckpt", "pth", "pt"
+        ]:
+            state_dict.update(load_state_dict(os.path.join(file_path, file_name), torch_dtype=torch_dtype))
+    return state_dict
+
+
+def load_state_dict(file_path, torch_dtype=None):
+    """
+    Load a state dict from a single file.
+
+    Paths ending in ".safetensors" go through `load_state_dict_from_safetensors`; every
+    other path goes through `load_state_dict_from_bin`.
+    """
+    is_safetensors = file_path.endswith(".safetensors")
+    loader = load_state_dict_from_safetensors if is_safetensors else load_state_dict_from_bin
+    return loader(file_path, torch_dtype=torch_dtype)
+
+
+def load_state_dict_from_safetensors(file_path, torch_dtype=None):
+    """
+    Read all tensors of a safetensors file onto the CPU.
+
+    If `torch_dtype` is given, every tensor is converted to it.
+    """
+    tensors = {}
+    with safe_open(file_path, framework="pt", device="cpu") as f:
+        for key in f.keys():
+            tensor = f.get_tensor(key)
+            tensors[key] = tensor if torch_dtype is None else tensor.to(torch_dtype)
+    return tensors
+
+
+def load_state_dict_from_bin(file_path, torch_dtype=None):
+    """
+    Load a checkpoint with `torch.load` onto the CPU.
+
+    If `torch_dtype` is given, every top-level value that is a tensor is converted to it;
+    other values are left untouched.
+    """
+    # NOTE(review): torch.load unpickles the file; only use it on checkpoints from trusted sources.
+    loaded = torch.load(file_path, map_location="cpu")
+    if torch_dtype is None:
+        return loaded
+    for key in loaded:
+        entry = loaded[key]
+        if isinstance(entry, torch.Tensor):
+            loaded[key] = entry.to(torch_dtype)
+    return loaded
+
+
+def search_for_embeddings(state_dict):
+    """
+    Collect every tensor found in a (possibly nested) dict, in iteration order.
+
+    Nested dicts are searched recursively; values that are neither tensors nor dicts
+    are ignored.
+    """
+    found = []
+    for key in state_dict:
+        entry = state_dict[key]
+        if isinstance(entry, torch.Tensor):
+            found.append(entry)
+        elif isinstance(entry, dict):
+            found.extend(search_for_embeddings(entry))
+    return found
+
+
+def search_parameter(param, state_dict):
+    """
+    Find the first entry of `state_dict` that holds (almost) the same values as `param`.
+
+    Only entries with the same number of elements are compared. Entries with a different
+    shape are compared after flattening. Two tensors match when their distance
+    (`torch.dist`) is below 1e-3. Returns the matching name, or None.
+    """
+    for name, candidate in state_dict.items():
+        if param.numel() != candidate.numel():
+            continue
+        if param.shape == candidate.shape:
+            distance = torch.dist(param, candidate)
+        else:
+            distance = torch.dist(param.flatten(), candidate.flatten())
+        if distance < 1e-3:
+            return name
+    return None
+
+
+def build_rename_dict(source_state_dict, target_state_dict, split_qkv=False):
+    """
+    Print a key mapping from `source_state_dict` to `target_state_dict`, matched by value.
+
+    For every source tensor, `search_parameter` looks for a target tensor with the same
+    values and the pair is printed as a dict-literal line. With `split_qkv=True`, a source
+    tensor without a match whose first dimension is divisible by 3 is cut into three equal
+    slices along that dimension; if all three slices have a match, the source name is
+    printed with the list of the three target names. Finally every target key that was
+    never matched is reported. Nothing is returned.
+    """
+    found = set()
+    with torch.no_grad():
+        for name in source_state_dict:
+            tensor = source_state_dict[name]
+            match = search_parameter(tensor, target_state_dict)
+            if match is not None:
+                print(f'"{name}": "{match}",')
+                found.add(match)
+                continue
+            if not (split_qkv and len(tensor.shape)>=1 and tensor.shape[0]%3==0):
+                continue
+            third = tensor.shape[0] // 3
+            parts = [
+                search_parameter(tensor[i*third: i*third+third], target_state_dict)
+                for i in range(3)
+            ]
+            if None not in parts:
+                print(f'"{name}": {parts},')
+                found.update(parts)
+    for name in target_state_dict:
+        if name not in found:
+            print("Cannot find", name, target_state_dict[name].shape)
+
+
+def search_for_files(folder, extensions):
+    """
+    Recursively list the files under `folder` whose path ends with one of `extensions`.
+
+    Directory entries are visited in sorted order. `folder` may also be the path of a
+    single file, in which case the result holds that path or is empty. A path that is
+    neither a directory nor a file yields an empty list.
+    """
+    if os.path.isdir(folder):
+        matches = []
+        for entry in sorted(os.listdir(folder)):
+            matches.extend(search_for_files(os.path.join(folder, entry), extensions))
+        return matches
+    if os.path.isfile(folder) and any(folder.endswith(ext) for ext in extensions):
+        return [folder]
+    return []