From 096547775002877514ac793102de7af8ab23b4ec Mon Sep 17 00:00:00 2001
From: Artiprocher
Date: Sun, 5 May 2024 22:48:38 +0800
Subject: [PATCH] svd

---
 diffsynth/extensions/ESRGAN/__init__.py       |  118 ++
 diffsynth/extensions/RIFE/__init__.py         |    4 +-
 diffsynth/models/__init__.py                  |   28 +-
 diffsynth/models/sd_unet.py                   |    2 +
 diffsynth/models/svd_image_encoder.py         |  413 +++++
 diffsynth/models/svd_unet.py                  | 1512 ++++++++++++++++-
 diffsynth/models/svd_vae_decoder.py           |  349 +++-
 diffsynth/models/svd_vae_encoder.py           |  138 ++
 diffsynth/pipelines/__init__.py               |    1 +
 diffsynth/pipelines/stable_video_diffusion.py |  289 ++++
 diffsynth/schedulers/__init__.py              |   67 +-
 diffsynth/schedulers/continuous_ode.py        |   52 +
 diffsynth/schedulers/ddim.py                  |   60 +
 examples/svd_text_to_video.py                 |   37 +
 ...table Video Diffusion checkpoints here.txt |    0
 15 files changed, 2991 insertions(+), 79 deletions(-)
 create mode 100644 diffsynth/extensions/ESRGAN/__init__.py
 create mode 100644 diffsynth/models/svd_vae_encoder.py
 create mode 100644 diffsynth/pipelines/stable_video_diffusion.py
 create mode 100644 diffsynth/schedulers/continuous_ode.py
 create mode 100644 diffsynth/schedulers/ddim.py
 create mode 100644 examples/svd_text_to_video.py
 create mode 100644 models/stable_video_diffusion/Put Stable Video Diffusion checkpoints here.txt

diff --git a/diffsynth/extensions/ESRGAN/__init__.py b/diffsynth/extensions/ESRGAN/__init__.py
new file mode 100644
index 0000000..e71cd3f
--- /dev/null
+++ b/diffsynth/extensions/ESRGAN/__init__.py
@@ -0,0 +1,118 @@
+import torch
+from einops import repeat
+from PIL import Image
+import numpy as np
+
+
+class ResidualDenseBlock(torch.nn.Module):
+
+    def __init__(self, num_feat=64, num_grow_ch=32):
+        super(ResidualDenseBlock, self).__init__()
+        self.conv1 = torch.nn.Conv2d(num_feat, num_grow_ch, 3, 1, 1)
+        self.conv2 = torch.nn.Conv2d(num_feat + num_grow_ch, num_grow_ch, 3, 1, 1)
+        self.conv3 = torch.nn.Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, 3, 1, 1)
+        self.conv4 = torch.nn.Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, 3, 1, 1)
+        self.conv5 = torch.nn.Conv2d(num_feat + 4 * num_grow_ch, num_feat, 3, 1, 1)
+        self.lrelu = torch.nn.LeakyReLU(negative_slope=0.2, inplace=True)
+
+    def forward(self, x):
+        x1 = self.lrelu(self.conv1(x))
+        x2 = self.lrelu(self.conv2(torch.cat((x, x1), 1)))
+        x3 = self.lrelu(self.conv3(torch.cat((x, x1, x2), 1)))
+        x4 = self.lrelu(self.conv4(torch.cat((x, x1, x2, x3), 1)))
+        x5 = self.conv5(torch.cat((x, x1, x2, x3, x4), 1))
+        return x5 * 0.2 + x
+
+
+class RRDB(torch.nn.Module):
+
+    def __init__(self, num_feat, num_grow_ch=32):
+        super(RRDB, self).__init__()
+        self.rdb1 = ResidualDenseBlock(num_feat, num_grow_ch)
+        self.rdb2 = ResidualDenseBlock(num_feat, num_grow_ch)
+        self.rdb3 = ResidualDenseBlock(num_feat, num_grow_ch)
+
+    def forward(self, x):
+        out = self.rdb1(x)
+        out = self.rdb2(out)
+        out = self.rdb3(out)
+        return out * 0.2 + x
+
+
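+# Note: RRDB stacks three dense blocks and again scales the residual branch by
+# 0.2, the residual scaling trick from the ESRGAN paper. RRDBNet below is the
+# common 4x RRDBNet configuration: a deep RRDB trunk followed by two
+# nearest-neighbor 2x upsampling steps.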
+class RRDBNet(torch.nn.Module):
+
+    def __init__(self, num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32):
+        super(RRDBNet, self).__init__()
+        self.conv_first = torch.nn.Conv2d(num_in_ch, num_feat, 3, 1, 1)
+        self.body = torch.nn.Sequential(*[RRDB(num_feat=num_feat, num_grow_ch=num_grow_ch) for _ in range(num_block)])
+        self.conv_body = torch.nn.Conv2d(num_feat, num_feat, 3, 1, 1)
+        # upsample
+        self.conv_up1 = torch.nn.Conv2d(num_feat, num_feat, 3, 1, 1)
+        self.conv_up2 = torch.nn.Conv2d(num_feat, num_feat, 3, 1, 1)
+        self.conv_hr = torch.nn.Conv2d(num_feat, num_feat, 3, 1, 1)
+        self.conv_last = torch.nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
+        self.lrelu = torch.nn.LeakyReLU(negative_slope=0.2, inplace=True)
+
+    def forward(self, x):
+        feat = x
+        feat = self.conv_first(feat)
+        body_feat = self.conv_body(self.body(feat))
+        feat = feat + body_feat
+        # upsample (each repeat is a nearest-neighbor 2x upscale)
+        feat = repeat(feat, "B C H W -> B C (H 2) (W 2)")
+        feat = self.lrelu(self.conv_up1(feat))
+        feat = repeat(feat, "B C H W -> B C (H 2) (W 2)")
+        feat = self.lrelu(self.conv_up2(feat))
+        out = self.conv_last(self.lrelu(self.conv_hr(feat)))
+        return out
+
+
+class ESRGAN(torch.nn.Module):
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+
+    @staticmethod
+    def from_pretrained(model_path):
+        model = RRDBNet()
+        state_dict = torch.load(model_path, map_location="cpu")["params_ema"]
+        model.load_state_dict(state_dict)
+        model.eval()
+        return ESRGAN(model)
+
+    def process_image(self, image):
+        image = torch.Tensor(np.array(image, dtype=np.float32) / 255).permute(2, 0, 1)
+        return image
+
+    def process_images(self, images):
+        images = [self.process_image(image) for image in images]
+        images = torch.stack(images)
+        return images
+
+    def decode_images(self, images):
+        images = (images.permute(0, 2, 3, 1) * 255).clip(0, 255).numpy().astype(np.uint8)
+        images = [Image.fromarray(image) for image in images]
+        return images
+
+    @torch.no_grad()
+    def upscale(self, images, batch_size=4, progress_bar=lambda x:x):
+        # Preprocess
+        input_tensor = self.process_images(images)
+
+        # Super-resolve in batches
+        output_tensor = []
+        for batch_id in progress_bar(range(0, input_tensor.shape[0], batch_size)):
+            batch_id_ = min(batch_id + batch_size, input_tensor.shape[0])
+            batch_input_tensor = input_tensor[batch_id: batch_id_]
+            batch_input_tensor = batch_input_tensor.to(
+                device=self.model.conv_first.weight.device,
+                dtype=self.model.conv_first.weight.dtype)
+            batch_output_tensor = self.model(batch_input_tensor)
+            output_tensor.append(batch_output_tensor.cpu())
+
+        # Output
+        output_tensor = torch.concat(output_tensor, dim=0)
+
+        # To images
+        output_images = self.decode_images(output_tensor)
+        return output_images
diff --git a/diffsynth/extensions/RIFE/__init__.py b/diffsynth/extensions/RIFE/__init__.py
index 421def9..410550e 100644
--- a/diffsynth/extensions/RIFE/__init__.py
+++ b/diffsynth/extensions/RIFE/__init__.py
@@ -167,7 +167,7 @@ class RIFEInterpolater:
 
     @torch.no_grad()
-    def interpolate(self, images, scale=1.0, batch_size=4, num_iter=1):
+    def interpolate(self, images, scale=1.0, batch_size=4, num_iter=1, progress_bar=lambda x:x):
         # Preprocess
         processed_images = self.process_images(images)
@@ -177,7 +177,7 @@ class RIFEInterpolater:
 
         # Interpolate
         output_tensor = []
-        for batch_id in range(0, input_tensor.shape[0], batch_size):
+        for batch_id in progress_bar(range(0, input_tensor.shape[0], batch_size)):
             batch_id_ = min(batch_id + batch_size, input_tensor.shape[0])
             batch_input_tensor = input_tensor[batch_id: batch_id_]
             batch_input_tensor = batch_input_tensor.to(device=self.device, dtype=self.torch_dtype)
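The new progress_bar hooks in ESRGAN.upscale and RIFEInterpolater.interpolate accept any callable that wraps an iterable. A minimal usage sketch, assuming tqdm is installed and a Real-ESRGAN checkpoint with a "params_ema" entry sits at the (hypothetical) path below:

    from PIL import Image
    from tqdm import tqdm
    from diffsynth.extensions.ESRGAN import ESRGAN

    esrgan = ESRGAN.from_pretrained("models/ESRGAN/RealESRGAN_x4plus.pth")  # hypothetical path
    frames = [Image.open("frame_0.png").convert("RGB")]                     # hypothetical input
    frames = esrgan.upscale(frames, batch_size=4, progress_bar=tqdm)        # optionally move esrgan to GPU first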
diff --git a/diffsynth/models/__init__.py b/diffsynth/models/__init__.py
index 8ed4c93..889eec7 100644
--- a/diffsynth/models/__init__.py
+++ b/diffsynth/models/__init__.py
@@ -16,6 +16,11 @@
 from .sd_controlnet import SDControlNet
 from .sd_motion import SDMotionModel
+from .svd_image_encoder import SVDImageEncoder
+from .svd_unet import SVDUNet
+from .svd_vae_decoder import SVDVAEDecoder
+from .svd_vae_encoder import SVDVAEEncoder
+
 
 class ModelManager:
     def __init__(self, torch_dtype=torch.float16, device="cuda"):
@@ -25,6 +30,10 @@ class ModelManager:
         self.model_path = {}
         self.textual_inversion_dict = {}
 
+    def is_stable_video_diffusion(self, state_dict):
+        param_name = "model.diffusion_model.output_blocks.9.1.time_stack.0.norm_in.weight"
+        return param_name in state_dict
+
     def is_RIFE(self, state_dict):
         param_name = "block_tea.convblock3.0.1.weight"
         return param_name in state_dict or ("module." + param_name) in state_dict
@@ -60,6 +69,21 @@ class ModelManager:
         param_name = "model.encoder.layers.5.self_attn_layer_norm.weight"
         return param_name in state_dict and len(state_dict) == 254
 
+    def load_stable_video_diffusion(self, state_dict, components=None, file_path=""):
+        component_dict = {
+            "image_encoder": SVDImageEncoder,
+            "unet": SVDUNet,
+            "vae_decoder": SVDVAEDecoder,
+            "vae_encoder": SVDVAEEncoder,
+        }
+        if components is None:
+            components = ["image_encoder", "unet", "vae_decoder", "vae_encoder"]
+        for component in components:
+            self.model[component] = component_dict[component]()
+            self.model[component].load_state_dict(self.model[component].state_dict_converter().from_civitai(state_dict))
+            self.model[component].to(self.torch_dtype).to(self.device)
+            self.model_path[component] = file_path
+
     def load_stable_diffusion(self, state_dict, components=None, file_path=""):
         component_dict = {
             "text_encoder": SDTextEncoder,
@@ -190,7 +214,9 @@ class ModelManager:
     def load_model(self, file_path, components=None, lora_alphas=[]):
         state_dict = load_state_dict(file_path, torch_dtype=self.torch_dtype)
-        if self.is_animatediff(state_dict):
+        if self.is_stable_video_diffusion(state_dict):
+            self.load_stable_video_diffusion(state_dict, file_path=file_path)
+        elif self.is_animatediff(state_dict):
             self.load_animatediff(state_dict, file_path=file_path)
         elif self.is_controlnet(state_dict):
             self.load_controlnet(state_dict, file_path=file_path)
diff --git a/diffsynth/models/sd_unet.py b/diffsynth/models/sd_unet.py
index 3f12a22..6c99ae4 100644
--- a/diffsynth/models/sd_unet.py
+++ b/diffsynth/models/sd_unet.py
@@ -165,6 +165,8 @@ class AttentionBlock(torch.nn.Module):
             encoder_hidden_states = text_emb.mean(dim=0, keepdim=True)
         else:
             encoder_hidden_states = text_emb
+        if encoder_hidden_states.shape[0] != hidden_states.shape[0]:
+            encoder_hidden_states = encoder_hidden_states.repeat(hidden_states.shape[0], 1, 1)
 
         if tiled:
             tile_size = min(tile_size, min(height, width))
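With the detection hook above, SVD checkpoints load through the same ModelManager.load_model entry point as Stable Diffusion ones: the state dict is fingerprinted by a characteristic parameter name and dispatched to the matching loader. A minimal sketch of the intended call, assuming a single-file SVD checkpoint at the (hypothetical) path below:

    import torch
    from diffsynth.models import ModelManager

    model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
    model_manager.load_model("models/stable_video_diffusion/svd_xt.safetensors")  # hypothetical filename
    # is_stable_video_diffusion() matched, so "image_encoder", "unet",
    # "vae_decoder" and "vae_encoder" are now populated in model_manager.model.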
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.attn.in_proj_bias": ['encoders.0.attn.to_q.bias', 'encoders.0.attn.to_k.bias', 'encoders.0.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.attn.in_proj_weight": ['encoders.0.attn.to_q.weight', 'encoders.0.attn.to_k.weight', 'encoders.0.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.attn.out_proj.bias": "encoders.0.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.attn.out_proj.weight": "encoders.0.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.ln_1.bias": "encoders.0.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.ln_1.weight": "encoders.0.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.ln_2.bias": "encoders.0.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.ln_2.weight": "encoders.0.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.mlp.c_fc.bias": "encoders.0.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.mlp.c_fc.weight": "encoders.0.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.mlp.c_proj.bias": "encoders.0.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.mlp.c_proj.weight": "encoders.0.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.attn.in_proj_bias": ['encoders.1.attn.to_q.bias', 'encoders.1.attn.to_k.bias', 'encoders.1.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.attn.in_proj_weight": ['encoders.1.attn.to_q.weight', 'encoders.1.attn.to_k.weight', 'encoders.1.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.attn.out_proj.bias": "encoders.1.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.attn.out_proj.weight": "encoders.1.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.ln_1.bias": "encoders.1.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.ln_1.weight": "encoders.1.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.ln_2.bias": "encoders.1.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.ln_2.weight": "encoders.1.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.mlp.c_fc.bias": "encoders.1.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.mlp.c_fc.weight": "encoders.1.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.mlp.c_proj.bias": "encoders.1.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.mlp.c_proj.weight": "encoders.1.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.attn.in_proj_bias": ['encoders.10.attn.to_q.bias', 'encoders.10.attn.to_k.bias', 'encoders.10.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.attn.in_proj_weight": ['encoders.10.attn.to_q.weight', 'encoders.10.attn.to_k.weight', 'encoders.10.attn.to_v.weight'], + 
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.attn.out_proj.bias": "encoders.10.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.attn.out_proj.weight": "encoders.10.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.ln_1.bias": "encoders.10.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.ln_1.weight": "encoders.10.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.ln_2.bias": "encoders.10.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.ln_2.weight": "encoders.10.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.mlp.c_fc.bias": "encoders.10.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.mlp.c_fc.weight": "encoders.10.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.mlp.c_proj.bias": "encoders.10.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.mlp.c_proj.weight": "encoders.10.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.attn.in_proj_bias": ['encoders.11.attn.to_q.bias', 'encoders.11.attn.to_k.bias', 'encoders.11.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.attn.in_proj_weight": ['encoders.11.attn.to_q.weight', 'encoders.11.attn.to_k.weight', 'encoders.11.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.attn.out_proj.bias": "encoders.11.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.attn.out_proj.weight": "encoders.11.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.ln_1.bias": "encoders.11.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.ln_1.weight": "encoders.11.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.ln_2.bias": "encoders.11.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.ln_2.weight": "encoders.11.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.mlp.c_fc.bias": "encoders.11.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.mlp.c_fc.weight": "encoders.11.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.mlp.c_proj.bias": "encoders.11.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.mlp.c_proj.weight": "encoders.11.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.attn.in_proj_bias": ['encoders.12.attn.to_q.bias', 'encoders.12.attn.to_k.bias', 'encoders.12.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.attn.in_proj_weight": ['encoders.12.attn.to_q.weight', 'encoders.12.attn.to_k.weight', 'encoders.12.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.attn.out_proj.bias": "encoders.12.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.attn.out_proj.weight": "encoders.12.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.ln_1.bias": 
"encoders.12.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.ln_1.weight": "encoders.12.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.ln_2.bias": "encoders.12.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.ln_2.weight": "encoders.12.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.mlp.c_fc.bias": "encoders.12.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.mlp.c_fc.weight": "encoders.12.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.mlp.c_proj.bias": "encoders.12.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.mlp.c_proj.weight": "encoders.12.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.attn.in_proj_bias": ['encoders.13.attn.to_q.bias', 'encoders.13.attn.to_k.bias', 'encoders.13.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.attn.in_proj_weight": ['encoders.13.attn.to_q.weight', 'encoders.13.attn.to_k.weight', 'encoders.13.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.attn.out_proj.bias": "encoders.13.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.attn.out_proj.weight": "encoders.13.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.ln_1.bias": "encoders.13.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.ln_1.weight": "encoders.13.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.ln_2.bias": "encoders.13.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.ln_2.weight": "encoders.13.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.mlp.c_fc.bias": "encoders.13.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.mlp.c_fc.weight": "encoders.13.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.mlp.c_proj.bias": "encoders.13.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.mlp.c_proj.weight": "encoders.13.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.attn.in_proj_bias": ['encoders.14.attn.to_q.bias', 'encoders.14.attn.to_k.bias', 'encoders.14.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.attn.in_proj_weight": ['encoders.14.attn.to_q.weight', 'encoders.14.attn.to_k.weight', 'encoders.14.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.attn.out_proj.bias": "encoders.14.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.attn.out_proj.weight": "encoders.14.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.ln_1.bias": "encoders.14.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.ln_1.weight": "encoders.14.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.ln_2.bias": "encoders.14.layer_norm2.bias", + 
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.ln_2.weight": "encoders.14.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.mlp.c_fc.bias": "encoders.14.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.mlp.c_fc.weight": "encoders.14.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.mlp.c_proj.bias": "encoders.14.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.mlp.c_proj.weight": "encoders.14.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.attn.in_proj_bias": ['encoders.15.attn.to_q.bias', 'encoders.15.attn.to_k.bias', 'encoders.15.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.attn.in_proj_weight": ['encoders.15.attn.to_q.weight', 'encoders.15.attn.to_k.weight', 'encoders.15.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.attn.out_proj.bias": "encoders.15.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.attn.out_proj.weight": "encoders.15.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.ln_1.bias": "encoders.15.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.ln_1.weight": "encoders.15.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.ln_2.bias": "encoders.15.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.ln_2.weight": "encoders.15.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.mlp.c_fc.bias": "encoders.15.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.mlp.c_fc.weight": "encoders.15.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.mlp.c_proj.bias": "encoders.15.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.mlp.c_proj.weight": "encoders.15.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.attn.in_proj_bias": ['encoders.16.attn.to_q.bias', 'encoders.16.attn.to_k.bias', 'encoders.16.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.attn.in_proj_weight": ['encoders.16.attn.to_q.weight', 'encoders.16.attn.to_k.weight', 'encoders.16.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.attn.out_proj.bias": "encoders.16.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.attn.out_proj.weight": "encoders.16.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.ln_1.bias": "encoders.16.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.ln_1.weight": "encoders.16.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.ln_2.bias": "encoders.16.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.ln_2.weight": "encoders.16.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.mlp.c_fc.bias": "encoders.16.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.mlp.c_fc.weight": "encoders.16.fc1.weight", + 
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.mlp.c_proj.bias": "encoders.16.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.mlp.c_proj.weight": "encoders.16.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.attn.in_proj_bias": ['encoders.17.attn.to_q.bias', 'encoders.17.attn.to_k.bias', 'encoders.17.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.attn.in_proj_weight": ['encoders.17.attn.to_q.weight', 'encoders.17.attn.to_k.weight', 'encoders.17.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.attn.out_proj.bias": "encoders.17.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.attn.out_proj.weight": "encoders.17.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.ln_1.bias": "encoders.17.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.ln_1.weight": "encoders.17.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.ln_2.bias": "encoders.17.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.ln_2.weight": "encoders.17.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.mlp.c_fc.bias": "encoders.17.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.mlp.c_fc.weight": "encoders.17.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.mlp.c_proj.bias": "encoders.17.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.mlp.c_proj.weight": "encoders.17.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.attn.in_proj_bias": ['encoders.18.attn.to_q.bias', 'encoders.18.attn.to_k.bias', 'encoders.18.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.attn.in_proj_weight": ['encoders.18.attn.to_q.weight', 'encoders.18.attn.to_k.weight', 'encoders.18.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.attn.out_proj.bias": "encoders.18.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.attn.out_proj.weight": "encoders.18.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.ln_1.bias": "encoders.18.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.ln_1.weight": "encoders.18.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.ln_2.bias": "encoders.18.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.ln_2.weight": "encoders.18.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.mlp.c_fc.bias": "encoders.18.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.mlp.c_fc.weight": "encoders.18.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.mlp.c_proj.bias": "encoders.18.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.mlp.c_proj.weight": "encoders.18.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.attn.in_proj_bias": 
['encoders.19.attn.to_q.bias', 'encoders.19.attn.to_k.bias', 'encoders.19.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.attn.in_proj_weight": ['encoders.19.attn.to_q.weight', 'encoders.19.attn.to_k.weight', 'encoders.19.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.attn.out_proj.bias": "encoders.19.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.attn.out_proj.weight": "encoders.19.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.ln_1.bias": "encoders.19.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.ln_1.weight": "encoders.19.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.ln_2.bias": "encoders.19.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.ln_2.weight": "encoders.19.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.mlp.c_fc.bias": "encoders.19.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.mlp.c_fc.weight": "encoders.19.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.mlp.c_proj.bias": "encoders.19.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.mlp.c_proj.weight": "encoders.19.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.attn.in_proj_bias": ['encoders.2.attn.to_q.bias', 'encoders.2.attn.to_k.bias', 'encoders.2.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.attn.in_proj_weight": ['encoders.2.attn.to_q.weight', 'encoders.2.attn.to_k.weight', 'encoders.2.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.attn.out_proj.bias": "encoders.2.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.attn.out_proj.weight": "encoders.2.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.ln_1.bias": "encoders.2.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.ln_1.weight": "encoders.2.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.ln_2.bias": "encoders.2.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.ln_2.weight": "encoders.2.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.mlp.c_fc.bias": "encoders.2.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.mlp.c_fc.weight": "encoders.2.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.mlp.c_proj.bias": "encoders.2.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.mlp.c_proj.weight": "encoders.2.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.attn.in_proj_bias": ['encoders.20.attn.to_q.bias', 'encoders.20.attn.to_k.bias', 'encoders.20.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.attn.in_proj_weight": ['encoders.20.attn.to_q.weight', 'encoders.20.attn.to_k.weight', 'encoders.20.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.attn.out_proj.bias": 
"encoders.20.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.attn.out_proj.weight": "encoders.20.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.ln_1.bias": "encoders.20.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.ln_1.weight": "encoders.20.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.ln_2.bias": "encoders.20.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.ln_2.weight": "encoders.20.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.mlp.c_fc.bias": "encoders.20.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.mlp.c_fc.weight": "encoders.20.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.mlp.c_proj.bias": "encoders.20.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.mlp.c_proj.weight": "encoders.20.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.attn.in_proj_bias": ['encoders.21.attn.to_q.bias', 'encoders.21.attn.to_k.bias', 'encoders.21.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.attn.in_proj_weight": ['encoders.21.attn.to_q.weight', 'encoders.21.attn.to_k.weight', 'encoders.21.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.attn.out_proj.bias": "encoders.21.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.attn.out_proj.weight": "encoders.21.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.ln_1.bias": "encoders.21.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.ln_1.weight": "encoders.21.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.ln_2.bias": "encoders.21.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.ln_2.weight": "encoders.21.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.mlp.c_fc.bias": "encoders.21.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.mlp.c_fc.weight": "encoders.21.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.mlp.c_proj.bias": "encoders.21.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.mlp.c_proj.weight": "encoders.21.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.attn.in_proj_bias": ['encoders.22.attn.to_q.bias', 'encoders.22.attn.to_k.bias', 'encoders.22.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.attn.in_proj_weight": ['encoders.22.attn.to_q.weight', 'encoders.22.attn.to_k.weight', 'encoders.22.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.attn.out_proj.bias": "encoders.22.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.attn.out_proj.weight": "encoders.22.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.ln_1.bias": "encoders.22.layer_norm1.bias", + 
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.ln_1.weight": "encoders.22.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.ln_2.bias": "encoders.22.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.ln_2.weight": "encoders.22.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.mlp.c_fc.bias": "encoders.22.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.mlp.c_fc.weight": "encoders.22.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.mlp.c_proj.bias": "encoders.22.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.mlp.c_proj.weight": "encoders.22.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.attn.in_proj_bias": ['encoders.23.attn.to_q.bias', 'encoders.23.attn.to_k.bias', 'encoders.23.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.attn.in_proj_weight": ['encoders.23.attn.to_q.weight', 'encoders.23.attn.to_k.weight', 'encoders.23.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.attn.out_proj.bias": "encoders.23.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.attn.out_proj.weight": "encoders.23.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.ln_1.bias": "encoders.23.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.ln_1.weight": "encoders.23.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.ln_2.bias": "encoders.23.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.ln_2.weight": "encoders.23.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.mlp.c_fc.bias": "encoders.23.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.mlp.c_fc.weight": "encoders.23.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.mlp.c_proj.bias": "encoders.23.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.mlp.c_proj.weight": "encoders.23.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.attn.in_proj_bias": ['encoders.24.attn.to_q.bias', 'encoders.24.attn.to_k.bias', 'encoders.24.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.attn.in_proj_weight": ['encoders.24.attn.to_q.weight', 'encoders.24.attn.to_k.weight', 'encoders.24.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.attn.out_proj.bias": "encoders.24.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.attn.out_proj.weight": "encoders.24.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.ln_1.bias": "encoders.24.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.ln_1.weight": "encoders.24.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.ln_2.bias": "encoders.24.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.ln_2.weight": 
"encoders.24.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.mlp.c_fc.bias": "encoders.24.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.mlp.c_fc.weight": "encoders.24.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.mlp.c_proj.bias": "encoders.24.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.mlp.c_proj.weight": "encoders.24.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.attn.in_proj_bias": ['encoders.25.attn.to_q.bias', 'encoders.25.attn.to_k.bias', 'encoders.25.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.attn.in_proj_weight": ['encoders.25.attn.to_q.weight', 'encoders.25.attn.to_k.weight', 'encoders.25.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.attn.out_proj.bias": "encoders.25.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.attn.out_proj.weight": "encoders.25.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.ln_1.bias": "encoders.25.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.ln_1.weight": "encoders.25.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.ln_2.bias": "encoders.25.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.ln_2.weight": "encoders.25.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.mlp.c_fc.bias": "encoders.25.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.mlp.c_fc.weight": "encoders.25.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.mlp.c_proj.bias": "encoders.25.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.mlp.c_proj.weight": "encoders.25.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.attn.in_proj_bias": ['encoders.26.attn.to_q.bias', 'encoders.26.attn.to_k.bias', 'encoders.26.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.attn.in_proj_weight": ['encoders.26.attn.to_q.weight', 'encoders.26.attn.to_k.weight', 'encoders.26.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.attn.out_proj.bias": "encoders.26.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.attn.out_proj.weight": "encoders.26.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.ln_1.bias": "encoders.26.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.ln_1.weight": "encoders.26.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.ln_2.bias": "encoders.26.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.ln_2.weight": "encoders.26.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.mlp.c_fc.bias": "encoders.26.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.mlp.c_fc.weight": "encoders.26.fc1.weight", + 
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.mlp.c_proj.bias": "encoders.26.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.mlp.c_proj.weight": "encoders.26.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.attn.in_proj_bias": ['encoders.27.attn.to_q.bias', 'encoders.27.attn.to_k.bias', 'encoders.27.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.attn.in_proj_weight": ['encoders.27.attn.to_q.weight', 'encoders.27.attn.to_k.weight', 'encoders.27.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.attn.out_proj.bias": "encoders.27.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.attn.out_proj.weight": "encoders.27.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.ln_1.bias": "encoders.27.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.ln_1.weight": "encoders.27.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.ln_2.bias": "encoders.27.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.ln_2.weight": "encoders.27.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.mlp.c_fc.bias": "encoders.27.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.mlp.c_fc.weight": "encoders.27.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.mlp.c_proj.bias": "encoders.27.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.mlp.c_proj.weight": "encoders.27.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.attn.in_proj_bias": ['encoders.28.attn.to_q.bias', 'encoders.28.attn.to_k.bias', 'encoders.28.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.attn.in_proj_weight": ['encoders.28.attn.to_q.weight', 'encoders.28.attn.to_k.weight', 'encoders.28.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.attn.out_proj.bias": "encoders.28.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.attn.out_proj.weight": "encoders.28.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.ln_1.bias": "encoders.28.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.ln_1.weight": "encoders.28.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.ln_2.bias": "encoders.28.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.ln_2.weight": "encoders.28.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.mlp.c_fc.bias": "encoders.28.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.mlp.c_fc.weight": "encoders.28.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.mlp.c_proj.bias": "encoders.28.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.mlp.c_proj.weight": "encoders.28.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.attn.in_proj_bias": 
['encoders.29.attn.to_q.bias', 'encoders.29.attn.to_k.bias', 'encoders.29.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.attn.in_proj_weight": ['encoders.29.attn.to_q.weight', 'encoders.29.attn.to_k.weight', 'encoders.29.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.attn.out_proj.bias": "encoders.29.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.attn.out_proj.weight": "encoders.29.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.ln_1.bias": "encoders.29.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.ln_1.weight": "encoders.29.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.ln_2.bias": "encoders.29.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.ln_2.weight": "encoders.29.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.mlp.c_fc.bias": "encoders.29.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.mlp.c_fc.weight": "encoders.29.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.mlp.c_proj.bias": "encoders.29.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.mlp.c_proj.weight": "encoders.29.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.attn.in_proj_bias": ['encoders.3.attn.to_q.bias', 'encoders.3.attn.to_k.bias', 'encoders.3.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.attn.in_proj_weight": ['encoders.3.attn.to_q.weight', 'encoders.3.attn.to_k.weight', 'encoders.3.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.attn.out_proj.bias": "encoders.3.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.attn.out_proj.weight": "encoders.3.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.ln_1.bias": "encoders.3.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.ln_1.weight": "encoders.3.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.ln_2.bias": "encoders.3.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.ln_2.weight": "encoders.3.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.mlp.c_fc.bias": "encoders.3.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.mlp.c_fc.weight": "encoders.3.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.mlp.c_proj.bias": "encoders.3.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.mlp.c_proj.weight": "encoders.3.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.attn.in_proj_bias": ['encoders.30.attn.to_q.bias', 'encoders.30.attn.to_k.bias', 'encoders.30.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.attn.in_proj_weight": ['encoders.30.attn.to_q.weight', 'encoders.30.attn.to_k.weight', 'encoders.30.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.attn.out_proj.bias": 
"encoders.30.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.attn.out_proj.weight": "encoders.30.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.ln_1.bias": "encoders.30.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.ln_1.weight": "encoders.30.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.ln_2.bias": "encoders.30.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.ln_2.weight": "encoders.30.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.mlp.c_fc.bias": "encoders.30.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.mlp.c_fc.weight": "encoders.30.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.mlp.c_proj.bias": "encoders.30.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.mlp.c_proj.weight": "encoders.30.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.attn.in_proj_bias": ['encoders.31.attn.to_q.bias', 'encoders.31.attn.to_k.bias', 'encoders.31.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.attn.in_proj_weight": ['encoders.31.attn.to_q.weight', 'encoders.31.attn.to_k.weight', 'encoders.31.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.attn.out_proj.bias": "encoders.31.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.attn.out_proj.weight": "encoders.31.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.ln_1.bias": "encoders.31.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.ln_1.weight": "encoders.31.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.ln_2.bias": "encoders.31.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.ln_2.weight": "encoders.31.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.mlp.c_fc.bias": "encoders.31.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.mlp.c_fc.weight": "encoders.31.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.mlp.c_proj.bias": "encoders.31.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.mlp.c_proj.weight": "encoders.31.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.attn.in_proj_bias": ['encoders.4.attn.to_q.bias', 'encoders.4.attn.to_k.bias', 'encoders.4.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.attn.in_proj_weight": ['encoders.4.attn.to_q.weight', 'encoders.4.attn.to_k.weight', 'encoders.4.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.attn.out_proj.bias": "encoders.4.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.attn.out_proj.weight": "encoders.4.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.ln_1.bias": "encoders.4.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.ln_1.weight": 
"encoders.4.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.ln_2.bias": "encoders.4.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.ln_2.weight": "encoders.4.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.mlp.c_fc.bias": "encoders.4.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.mlp.c_fc.weight": "encoders.4.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.mlp.c_proj.bias": "encoders.4.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.mlp.c_proj.weight": "encoders.4.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.attn.in_proj_bias": ['encoders.5.attn.to_q.bias', 'encoders.5.attn.to_k.bias', 'encoders.5.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.attn.in_proj_weight": ['encoders.5.attn.to_q.weight', 'encoders.5.attn.to_k.weight', 'encoders.5.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.attn.out_proj.bias": "encoders.5.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.attn.out_proj.weight": "encoders.5.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.ln_1.bias": "encoders.5.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.ln_1.weight": "encoders.5.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.ln_2.bias": "encoders.5.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.ln_2.weight": "encoders.5.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.mlp.c_fc.bias": "encoders.5.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.mlp.c_fc.weight": "encoders.5.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.mlp.c_proj.bias": "encoders.5.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.mlp.c_proj.weight": "encoders.5.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.attn.in_proj_bias": ['encoders.6.attn.to_q.bias', 'encoders.6.attn.to_k.bias', 'encoders.6.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.attn.in_proj_weight": ['encoders.6.attn.to_q.weight', 'encoders.6.attn.to_k.weight', 'encoders.6.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.attn.out_proj.bias": "encoders.6.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.attn.out_proj.weight": "encoders.6.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.ln_1.bias": "encoders.6.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.ln_1.weight": "encoders.6.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.ln_2.bias": "encoders.6.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.ln_2.weight": "encoders.6.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.mlp.c_fc.bias": "encoders.6.fc1.bias", + 
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.mlp.c_fc.weight": "encoders.6.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.mlp.c_proj.bias": "encoders.6.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.mlp.c_proj.weight": "encoders.6.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.attn.in_proj_bias": ['encoders.7.attn.to_q.bias', 'encoders.7.attn.to_k.bias', 'encoders.7.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.attn.in_proj_weight": ['encoders.7.attn.to_q.weight', 'encoders.7.attn.to_k.weight', 'encoders.7.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.attn.out_proj.bias": "encoders.7.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.attn.out_proj.weight": "encoders.7.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.ln_1.bias": "encoders.7.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.ln_1.weight": "encoders.7.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.ln_2.bias": "encoders.7.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.ln_2.weight": "encoders.7.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.mlp.c_fc.bias": "encoders.7.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.mlp.c_fc.weight": "encoders.7.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.mlp.c_proj.bias": "encoders.7.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.mlp.c_proj.weight": "encoders.7.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.attn.in_proj_bias": ['encoders.8.attn.to_q.bias', 'encoders.8.attn.to_k.bias', 'encoders.8.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.attn.in_proj_weight": ['encoders.8.attn.to_q.weight', 'encoders.8.attn.to_k.weight', 'encoders.8.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.attn.out_proj.bias": "encoders.8.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.attn.out_proj.weight": "encoders.8.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.ln_1.bias": "encoders.8.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.ln_1.weight": "encoders.8.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.ln_2.bias": "encoders.8.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.ln_2.weight": "encoders.8.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.mlp.c_fc.bias": "encoders.8.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.mlp.c_fc.weight": "encoders.8.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.mlp.c_proj.bias": "encoders.8.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.mlp.c_proj.weight": "encoders.8.fc2.weight", + 
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.attn.in_proj_bias": ['encoders.9.attn.to_q.bias', 'encoders.9.attn.to_k.bias', 'encoders.9.attn.to_v.bias'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.attn.in_proj_weight": ['encoders.9.attn.to_q.weight', 'encoders.9.attn.to_k.weight', 'encoders.9.attn.to_v.weight'], + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.attn.out_proj.bias": "encoders.9.attn.to_out.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.attn.out_proj.weight": "encoders.9.attn.to_out.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.ln_1.bias": "encoders.9.layer_norm1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.ln_1.weight": "encoders.9.layer_norm1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.ln_2.bias": "encoders.9.layer_norm2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.ln_2.weight": "encoders.9.layer_norm2.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.mlp.c_fc.bias": "encoders.9.fc1.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.mlp.c_fc.weight": "encoders.9.fc1.weight", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.mlp.c_proj.bias": "encoders.9.fc2.bias", + "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.mlp.c_proj.weight": "encoders.9.fc2.weight", + "conditioner.embedders.0.open_clip.model.visual.proj": "visual_projection.weight", + } + state_dict_ = {} + for name in state_dict: + if name in rename_dict: + param = state_dict[name] + if name == "conditioner.embedders.0.open_clip.model.visual.class_embedding": + param = param.reshape((1, 1, param.shape[0])) + elif name == "conditioner.embedders.0.open_clip.model.visual.positional_embedding": + param = param.reshape((1, param.shape[0], param.shape[1])) + elif name == "conditioner.embedders.0.open_clip.model.visual.proj": + param = param.T + if isinstance(rename_dict[name], str): + state_dict_[rename_dict[name]] = param + else: + length = param.shape[0] // 3 + for i, rename in enumerate(rename_dict[name]): + state_dict_[rename] = param[i*length: i*length+length] + return state_dict_ diff --git a/diffsynth/models/svd_unet.py b/diffsynth/models/svd_unet.py index 3ff0d1a..1ec4a76 100644 --- a/diffsynth/models/svd_unet.py +++ b/diffsynth/models/svd_unet.py @@ -17,7 +17,7 @@ class TemporalResnetBlock(torch.nn.Module): if in_channels != out_channels: self.conv_shortcut = torch.nn.Conv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=True) - def forward(self, hidden_states, time_emb, text_emb, res_stack): + def forward(self, hidden_states, time_emb, text_emb, res_stack, **kwargs): x = rearrange(hidden_states, "f c h w -> 1 c f h w") x = self.norm1(x) x = self.nonlinearity(x) @@ -134,10 +134,12 @@ class TemporalAttentionBlock(torch.nn.Module): self.act_fn_out = GEGLU(in_channels, in_channels * 4) self.ff_out = torch.nn.Linear(in_channels * 4, in_channels) - def forward(self, hidden_states, time_emb, text_emb, res_stack): + def forward(self, hidden_states, time_emb, text_emb, res_stack, **kwargs): batch, inner_dim, height, width = hidden_states.shape - pos_emb = torch.arange(batch) + pos_emb = torch.arange(batch, dtype=hidden_states.dtype) + if batch > 25: + pos_emb *= 25 / batch pos_emb = 
self.positional_embedding(pos_emb).to(dtype=hidden_states.dtype, device=hidden_states.device) pos_emb = self.positional_embedding_proj(pos_emb)[None, :, :] @@ -177,7 +179,7 @@ class PopMixBlock(torch.nn.Module): if self.need_proj: self.proj = torch.nn.Linear(in_channels, in_channels) - def forward(self, hidden_states, time_emb, text_emb, res_stack): + def forward(self, hidden_states, time_emb, text_emb, res_stack, **kwargs): res_hidden_states = res_stack.pop() alpha = torch.sigmoid(self.mix_factor) hidden_states = alpha * res_hidden_states + (1 - alpha) * hidden_states @@ -266,10 +268,69 @@ class SVDUNet(torch.nn.Module): self.conv_norm_out = torch.nn.GroupNorm(32, 320, eps=1e-05, affine=True) self.conv_act = torch.nn.SiLU() self.conv_out = torch.nn.Conv2d(320, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + + + def build_mask(self, data, is_bound): + T, C, H, W = data.shape + t = repeat(torch.arange(T), "T -> T H W", T=T, H=H, W=W) + h = repeat(torch.arange(H), "H -> T H W", T=T, H=H, W=W) + w = repeat(torch.arange(W), "W -> T H W", T=T, H=H, W=W) + border_width = (T + H + W) // 6 + pad = torch.ones_like(t) * border_width + mask = torch.stack([ + pad if is_bound[0] else t + 1, + pad if is_bound[1] else T - t, + pad if is_bound[2] else h + 1, + pad if is_bound[3] else H - h, + pad if is_bound[4] else w + 1, + pad if is_bound[5] else W - w + ]).min(dim=0).values + mask = mask.clip(1, border_width) + mask = (mask / border_width).to(dtype=data.dtype, device=data.device) + mask = rearrange(mask, "T H W -> T 1 H W") + return mask + + + def tiled_forward( + self, sample, timestep, encoder_hidden_states, add_time_id, + batch_time=25, batch_height=128, batch_width=128, + stride_time=5, stride_height=64, stride_width=64, + progress_bar=lambda x:x + ): + data_device = sample.device + computation_device = self.conv_in.weight.device + torch_dtype = sample.dtype + T, C, H, W = sample.shape + + weight = torch.zeros((T, 1, H, W), dtype=torch_dtype, device=data_device) + values = torch.zeros((T, 4, H, W), dtype=torch_dtype, device=data_device) + + # Split tasks + tasks = [] + for t in range(0, T, stride_time): + for h in range(0, H, stride_height): + for w in range(0, W, stride_width): + if (t-stride_time >= 0 and t-stride_time+batch_time >= T)\ + or (h-stride_height >= 0 and h-stride_height+batch_height >= H)\ + or (w-stride_width >= 0 and w-stride_width+batch_width >= W): + continue + tasks.append((t, t+batch_time, h, h+batch_height, w, w+batch_width)) + + # Run + for tl, tr, hl, hr, wl, wr in progress_bar(tasks): + sample_batch = sample[tl:tr, :, hl:hr, wl:wr].to(computation_device) + sample_batch = self.forward(sample_batch, timestep, encoder_hidden_states, add_time_id).to(data_device) + mask = self.build_mask(sample_batch, is_bound=(tl==0, tr>=T, hl==0, hr>=H, wl==0, wr>=W)) + values[tl:tr, :, hl:hr, wl:wr] += sample_batch * mask + weight[tl:tr, :, hl:hr, wl:wr] += mask + values /= weight + return values + def forward(self, sample, timestep, encoder_hidden_states, add_time_id, **kwargs): # 1. 
time - t_emb = self.time_proj(timestep[None]).to(sample.dtype) + timestep = torch.tensor((timestep,)).to(sample.device) + t_emb = self.time_proj(timestep).to(sample.dtype) t_emb = self.time_embedding(t_emb) add_embeds = self.add_time_proj(add_time_id.flatten()).to(sample.dtype) @@ -434,3 +495,1444 @@ class SVDUNetStateDictConverter: rename = ".".join(["blocks", str(blocks_rename_dict[block_name]), names[-1]]) state_dict_[rename] = param return state_dict_ + + + def from_civitai(self, state_dict): + rename_dict = { + "model.diffusion_model.input_blocks.0.0.bias": "conv_in.bias", + "model.diffusion_model.input_blocks.0.0.weight": "conv_in.weight", + "model.diffusion_model.input_blocks.1.0.emb_layers.1.bias": "blocks.0.time_emb_proj.bias", + "model.diffusion_model.input_blocks.1.0.emb_layers.1.weight": "blocks.0.time_emb_proj.weight", + "model.diffusion_model.input_blocks.1.0.in_layers.0.bias": "blocks.0.norm1.bias", + "model.diffusion_model.input_blocks.1.0.in_layers.0.weight": "blocks.0.norm1.weight", + "model.diffusion_model.input_blocks.1.0.in_layers.2.bias": "blocks.0.conv1.bias", + "model.diffusion_model.input_blocks.1.0.in_layers.2.weight": "blocks.0.conv1.weight", + "model.diffusion_model.input_blocks.1.0.out_layers.0.bias": "blocks.0.norm2.bias", + "model.diffusion_model.input_blocks.1.0.out_layers.0.weight": "blocks.0.norm2.weight", + "model.diffusion_model.input_blocks.1.0.out_layers.3.bias": "blocks.0.conv2.bias", + "model.diffusion_model.input_blocks.1.0.out_layers.3.weight": "blocks.0.conv2.weight", + "model.diffusion_model.input_blocks.1.0.time_mixer.mix_factor": "blocks.3.mix_factor", + "model.diffusion_model.input_blocks.1.0.time_stack.emb_layers.1.bias": "blocks.2.time_emb_proj.bias", + "model.diffusion_model.input_blocks.1.0.time_stack.emb_layers.1.weight": "blocks.2.time_emb_proj.weight", + "model.diffusion_model.input_blocks.1.0.time_stack.in_layers.0.bias": "blocks.2.norm1.bias", + "model.diffusion_model.input_blocks.1.0.time_stack.in_layers.0.weight": "blocks.2.norm1.weight", + "model.diffusion_model.input_blocks.1.0.time_stack.in_layers.2.bias": "blocks.2.conv1.bias", + "model.diffusion_model.input_blocks.1.0.time_stack.in_layers.2.weight": "blocks.2.conv1.weight", + "model.diffusion_model.input_blocks.1.0.time_stack.out_layers.0.bias": "blocks.2.norm2.bias", + "model.diffusion_model.input_blocks.1.0.time_stack.out_layers.0.weight": "blocks.2.norm2.weight", + "model.diffusion_model.input_blocks.1.0.time_stack.out_layers.3.bias": "blocks.2.conv2.bias", + "model.diffusion_model.input_blocks.1.0.time_stack.out_layers.3.weight": "blocks.2.conv2.weight", + "model.diffusion_model.input_blocks.1.1.norm.bias": "blocks.5.norm.bias", + "model.diffusion_model.input_blocks.1.1.norm.weight": "blocks.5.norm.weight", + "model.diffusion_model.input_blocks.1.1.proj_in.bias": "blocks.5.proj_in.bias", + "model.diffusion_model.input_blocks.1.1.proj_in.weight": "blocks.5.proj_in.weight", + "model.diffusion_model.input_blocks.1.1.proj_out.bias": "blocks.8.proj.bias", + "model.diffusion_model.input_blocks.1.1.proj_out.weight": "blocks.8.proj.weight", + "model.diffusion_model.input_blocks.1.1.time_mixer.mix_factor": "blocks.8.mix_factor", + "model.diffusion_model.input_blocks.1.1.time_pos_embed.0.bias": "blocks.7.positional_embedding_proj.0.bias", + "model.diffusion_model.input_blocks.1.1.time_pos_embed.0.weight": "blocks.7.positional_embedding_proj.0.weight", + "model.diffusion_model.input_blocks.1.1.time_pos_embed.2.bias": "blocks.7.positional_embedding_proj.2.bias", + 
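+            # The numbering pattern above holds throughout this table: the spatial
+            # module keeps the lower block index (blocks.5), its temporal time_stack
+            # twin sits two indices higher (blocks.7), and time_mixer.mix_factor goes
+            # to the PopMixBlock that blends the two (blocks.8).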
"model.diffusion_model.input_blocks.1.1.time_pos_embed.2.weight": "blocks.7.positional_embedding_proj.2.weight", + "model.diffusion_model.input_blocks.1.1.time_stack.0.attn1.to_k.weight": "blocks.7.attn1.to_k.weight", + "model.diffusion_model.input_blocks.1.1.time_stack.0.attn1.to_out.0.bias": "blocks.7.attn1.to_out.bias", + "model.diffusion_model.input_blocks.1.1.time_stack.0.attn1.to_out.0.weight": "blocks.7.attn1.to_out.weight", + "model.diffusion_model.input_blocks.1.1.time_stack.0.attn1.to_q.weight": "blocks.7.attn1.to_q.weight", + "model.diffusion_model.input_blocks.1.1.time_stack.0.attn1.to_v.weight": "blocks.7.attn1.to_v.weight", + "model.diffusion_model.input_blocks.1.1.time_stack.0.attn2.to_k.weight": "blocks.7.attn2.to_k.weight", + "model.diffusion_model.input_blocks.1.1.time_stack.0.attn2.to_out.0.bias": "blocks.7.attn2.to_out.bias", + "model.diffusion_model.input_blocks.1.1.time_stack.0.attn2.to_out.0.weight": "blocks.7.attn2.to_out.weight", + "model.diffusion_model.input_blocks.1.1.time_stack.0.attn2.to_q.weight": "blocks.7.attn2.to_q.weight", + "model.diffusion_model.input_blocks.1.1.time_stack.0.attn2.to_v.weight": "blocks.7.attn2.to_v.weight", + "model.diffusion_model.input_blocks.1.1.time_stack.0.ff.net.0.proj.bias": "blocks.7.act_fn_out.proj.bias", + "model.diffusion_model.input_blocks.1.1.time_stack.0.ff.net.0.proj.weight": "blocks.7.act_fn_out.proj.weight", + "model.diffusion_model.input_blocks.1.1.time_stack.0.ff.net.2.bias": "blocks.7.ff_out.bias", + "model.diffusion_model.input_blocks.1.1.time_stack.0.ff.net.2.weight": "blocks.7.ff_out.weight", + "model.diffusion_model.input_blocks.1.1.time_stack.0.ff_in.net.0.proj.bias": "blocks.7.act_fn_in.proj.bias", + "model.diffusion_model.input_blocks.1.1.time_stack.0.ff_in.net.0.proj.weight": "blocks.7.act_fn_in.proj.weight", + "model.diffusion_model.input_blocks.1.1.time_stack.0.ff_in.net.2.bias": "blocks.7.ff_in.bias", + "model.diffusion_model.input_blocks.1.1.time_stack.0.ff_in.net.2.weight": "blocks.7.ff_in.weight", + "model.diffusion_model.input_blocks.1.1.time_stack.0.norm1.bias": "blocks.7.norm1.bias", + "model.diffusion_model.input_blocks.1.1.time_stack.0.norm1.weight": "blocks.7.norm1.weight", + "model.diffusion_model.input_blocks.1.1.time_stack.0.norm2.bias": "blocks.7.norm2.bias", + "model.diffusion_model.input_blocks.1.1.time_stack.0.norm2.weight": "blocks.7.norm2.weight", + "model.diffusion_model.input_blocks.1.1.time_stack.0.norm3.bias": "blocks.7.norm_out.bias", + "model.diffusion_model.input_blocks.1.1.time_stack.0.norm3.weight": "blocks.7.norm_out.weight", + "model.diffusion_model.input_blocks.1.1.time_stack.0.norm_in.bias": "blocks.7.norm_in.bias", + "model.diffusion_model.input_blocks.1.1.time_stack.0.norm_in.weight": "blocks.7.norm_in.weight", + "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn1.to_k.weight": "blocks.5.transformer_blocks.0.attn1.to_k.weight", + "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.5.transformer_blocks.0.attn1.to_out.bias", + "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.5.transformer_blocks.0.attn1.to_out.weight", + "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn1.to_q.weight": "blocks.5.transformer_blocks.0.attn1.to_q.weight", + "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn1.to_v.weight": "blocks.5.transformer_blocks.0.attn1.to_v.weight", + "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn2.to_k.weight": 
"blocks.5.transformer_blocks.0.attn2.to_k.weight", + "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.5.transformer_blocks.0.attn2.to_out.bias", + "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.5.transformer_blocks.0.attn2.to_out.weight", + "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn2.to_q.weight": "blocks.5.transformer_blocks.0.attn2.to_q.weight", + "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn2.to_v.weight": "blocks.5.transformer_blocks.0.attn2.to_v.weight", + "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.5.transformer_blocks.0.act_fn.proj.bias", + "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.5.transformer_blocks.0.act_fn.proj.weight", + "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.ff.net.2.bias": "blocks.5.transformer_blocks.0.ff.bias", + "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.ff.net.2.weight": "blocks.5.transformer_blocks.0.ff.weight", + "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.norm1.bias": "blocks.5.transformer_blocks.0.norm1.bias", + "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.norm1.weight": "blocks.5.transformer_blocks.0.norm1.weight", + "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.norm2.bias": "blocks.5.transformer_blocks.0.norm2.bias", + "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.norm2.weight": "blocks.5.transformer_blocks.0.norm2.weight", + "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.norm3.bias": "blocks.5.transformer_blocks.0.norm3.bias", + "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.norm3.weight": "blocks.5.transformer_blocks.0.norm3.weight", + "model.diffusion_model.input_blocks.10.0.emb_layers.1.bias": "blocks.66.time_emb_proj.bias", + "model.diffusion_model.input_blocks.10.0.emb_layers.1.weight": "blocks.66.time_emb_proj.weight", + "model.diffusion_model.input_blocks.10.0.in_layers.0.bias": "blocks.66.norm1.bias", + "model.diffusion_model.input_blocks.10.0.in_layers.0.weight": "blocks.66.norm1.weight", + "model.diffusion_model.input_blocks.10.0.in_layers.2.bias": "blocks.66.conv1.bias", + "model.diffusion_model.input_blocks.10.0.in_layers.2.weight": "blocks.66.conv1.weight", + "model.diffusion_model.input_blocks.10.0.out_layers.0.bias": "blocks.66.norm2.bias", + "model.diffusion_model.input_blocks.10.0.out_layers.0.weight": "blocks.66.norm2.weight", + "model.diffusion_model.input_blocks.10.0.out_layers.3.bias": "blocks.66.conv2.bias", + "model.diffusion_model.input_blocks.10.0.out_layers.3.weight": "blocks.66.conv2.weight", + "model.diffusion_model.input_blocks.10.0.time_mixer.mix_factor": "blocks.69.mix_factor", + "model.diffusion_model.input_blocks.10.0.time_stack.emb_layers.1.bias": "blocks.68.time_emb_proj.bias", + "model.diffusion_model.input_blocks.10.0.time_stack.emb_layers.1.weight": "blocks.68.time_emb_proj.weight", + "model.diffusion_model.input_blocks.10.0.time_stack.in_layers.0.bias": "blocks.68.norm1.bias", + "model.diffusion_model.input_blocks.10.0.time_stack.in_layers.0.weight": "blocks.68.norm1.weight", + "model.diffusion_model.input_blocks.10.0.time_stack.in_layers.2.bias": "blocks.68.conv1.bias", + "model.diffusion_model.input_blocks.10.0.time_stack.in_layers.2.weight": "blocks.68.conv1.weight", + "model.diffusion_model.input_blocks.10.0.time_stack.out_layers.0.bias": "blocks.68.norm2.bias", + 
"model.diffusion_model.input_blocks.10.0.time_stack.out_layers.0.weight": "blocks.68.norm2.weight", + "model.diffusion_model.input_blocks.10.0.time_stack.out_layers.3.bias": "blocks.68.conv2.bias", + "model.diffusion_model.input_blocks.10.0.time_stack.out_layers.3.weight": "blocks.68.conv2.weight", + "model.diffusion_model.input_blocks.11.0.emb_layers.1.bias": "blocks.71.time_emb_proj.bias", + "model.diffusion_model.input_blocks.11.0.emb_layers.1.weight": "blocks.71.time_emb_proj.weight", + "model.diffusion_model.input_blocks.11.0.in_layers.0.bias": "blocks.71.norm1.bias", + "model.diffusion_model.input_blocks.11.0.in_layers.0.weight": "blocks.71.norm1.weight", + "model.diffusion_model.input_blocks.11.0.in_layers.2.bias": "blocks.71.conv1.bias", + "model.diffusion_model.input_blocks.11.0.in_layers.2.weight": "blocks.71.conv1.weight", + "model.diffusion_model.input_blocks.11.0.out_layers.0.bias": "blocks.71.norm2.bias", + "model.diffusion_model.input_blocks.11.0.out_layers.0.weight": "blocks.71.norm2.weight", + "model.diffusion_model.input_blocks.11.0.out_layers.3.bias": "blocks.71.conv2.bias", + "model.diffusion_model.input_blocks.11.0.out_layers.3.weight": "blocks.71.conv2.weight", + "model.diffusion_model.input_blocks.11.0.time_mixer.mix_factor": "blocks.74.mix_factor", + "model.diffusion_model.input_blocks.11.0.time_stack.emb_layers.1.bias": "blocks.73.time_emb_proj.bias", + "model.diffusion_model.input_blocks.11.0.time_stack.emb_layers.1.weight": "blocks.73.time_emb_proj.weight", + "model.diffusion_model.input_blocks.11.0.time_stack.in_layers.0.bias": "blocks.73.norm1.bias", + "model.diffusion_model.input_blocks.11.0.time_stack.in_layers.0.weight": "blocks.73.norm1.weight", + "model.diffusion_model.input_blocks.11.0.time_stack.in_layers.2.bias": "blocks.73.conv1.bias", + "model.diffusion_model.input_blocks.11.0.time_stack.in_layers.2.weight": "blocks.73.conv1.weight", + "model.diffusion_model.input_blocks.11.0.time_stack.out_layers.0.bias": "blocks.73.norm2.bias", + "model.diffusion_model.input_blocks.11.0.time_stack.out_layers.0.weight": "blocks.73.norm2.weight", + "model.diffusion_model.input_blocks.11.0.time_stack.out_layers.3.bias": "blocks.73.conv2.bias", + "model.diffusion_model.input_blocks.11.0.time_stack.out_layers.3.weight": "blocks.73.conv2.weight", + "model.diffusion_model.input_blocks.2.0.emb_layers.1.bias": "blocks.10.time_emb_proj.bias", + "model.diffusion_model.input_blocks.2.0.emb_layers.1.weight": "blocks.10.time_emb_proj.weight", + "model.diffusion_model.input_blocks.2.0.in_layers.0.bias": "blocks.10.norm1.bias", + "model.diffusion_model.input_blocks.2.0.in_layers.0.weight": "blocks.10.norm1.weight", + "model.diffusion_model.input_blocks.2.0.in_layers.2.bias": "blocks.10.conv1.bias", + "model.diffusion_model.input_blocks.2.0.in_layers.2.weight": "blocks.10.conv1.weight", + "model.diffusion_model.input_blocks.2.0.out_layers.0.bias": "blocks.10.norm2.bias", + "model.diffusion_model.input_blocks.2.0.out_layers.0.weight": "blocks.10.norm2.weight", + "model.diffusion_model.input_blocks.2.0.out_layers.3.bias": "blocks.10.conv2.bias", + "model.diffusion_model.input_blocks.2.0.out_layers.3.weight": "blocks.10.conv2.weight", + "model.diffusion_model.input_blocks.2.0.time_mixer.mix_factor": "blocks.13.mix_factor", + "model.diffusion_model.input_blocks.2.0.time_stack.emb_layers.1.bias": "blocks.12.time_emb_proj.bias", + "model.diffusion_model.input_blocks.2.0.time_stack.emb_layers.1.weight": "blocks.12.time_emb_proj.weight", + 
"model.diffusion_model.input_blocks.2.0.time_stack.in_layers.0.bias": "blocks.12.norm1.bias", + "model.diffusion_model.input_blocks.2.0.time_stack.in_layers.0.weight": "blocks.12.norm1.weight", + "model.diffusion_model.input_blocks.2.0.time_stack.in_layers.2.bias": "blocks.12.conv1.bias", + "model.diffusion_model.input_blocks.2.0.time_stack.in_layers.2.weight": "blocks.12.conv1.weight", + "model.diffusion_model.input_blocks.2.0.time_stack.out_layers.0.bias": "blocks.12.norm2.bias", + "model.diffusion_model.input_blocks.2.0.time_stack.out_layers.0.weight": "blocks.12.norm2.weight", + "model.diffusion_model.input_blocks.2.0.time_stack.out_layers.3.bias": "blocks.12.conv2.bias", + "model.diffusion_model.input_blocks.2.0.time_stack.out_layers.3.weight": "blocks.12.conv2.weight", + "model.diffusion_model.input_blocks.2.1.norm.bias": "blocks.15.norm.bias", + "model.diffusion_model.input_blocks.2.1.norm.weight": "blocks.15.norm.weight", + "model.diffusion_model.input_blocks.2.1.proj_in.bias": "blocks.15.proj_in.bias", + "model.diffusion_model.input_blocks.2.1.proj_in.weight": "blocks.15.proj_in.weight", + "model.diffusion_model.input_blocks.2.1.proj_out.bias": "blocks.18.proj.bias", + "model.diffusion_model.input_blocks.2.1.proj_out.weight": "blocks.18.proj.weight", + "model.diffusion_model.input_blocks.2.1.time_mixer.mix_factor": "blocks.18.mix_factor", + "model.diffusion_model.input_blocks.2.1.time_pos_embed.0.bias": "blocks.17.positional_embedding_proj.0.bias", + "model.diffusion_model.input_blocks.2.1.time_pos_embed.0.weight": "blocks.17.positional_embedding_proj.0.weight", + "model.diffusion_model.input_blocks.2.1.time_pos_embed.2.bias": "blocks.17.positional_embedding_proj.2.bias", + "model.diffusion_model.input_blocks.2.1.time_pos_embed.2.weight": "blocks.17.positional_embedding_proj.2.weight", + "model.diffusion_model.input_blocks.2.1.time_stack.0.attn1.to_k.weight": "blocks.17.attn1.to_k.weight", + "model.diffusion_model.input_blocks.2.1.time_stack.0.attn1.to_out.0.bias": "blocks.17.attn1.to_out.bias", + "model.diffusion_model.input_blocks.2.1.time_stack.0.attn1.to_out.0.weight": "blocks.17.attn1.to_out.weight", + "model.diffusion_model.input_blocks.2.1.time_stack.0.attn1.to_q.weight": "blocks.17.attn1.to_q.weight", + "model.diffusion_model.input_blocks.2.1.time_stack.0.attn1.to_v.weight": "blocks.17.attn1.to_v.weight", + "model.diffusion_model.input_blocks.2.1.time_stack.0.attn2.to_k.weight": "blocks.17.attn2.to_k.weight", + "model.diffusion_model.input_blocks.2.1.time_stack.0.attn2.to_out.0.bias": "blocks.17.attn2.to_out.bias", + "model.diffusion_model.input_blocks.2.1.time_stack.0.attn2.to_out.0.weight": "blocks.17.attn2.to_out.weight", + "model.diffusion_model.input_blocks.2.1.time_stack.0.attn2.to_q.weight": "blocks.17.attn2.to_q.weight", + "model.diffusion_model.input_blocks.2.1.time_stack.0.attn2.to_v.weight": "blocks.17.attn2.to_v.weight", + "model.diffusion_model.input_blocks.2.1.time_stack.0.ff.net.0.proj.bias": "blocks.17.act_fn_out.proj.bias", + "model.diffusion_model.input_blocks.2.1.time_stack.0.ff.net.0.proj.weight": "blocks.17.act_fn_out.proj.weight", + "model.diffusion_model.input_blocks.2.1.time_stack.0.ff.net.2.bias": "blocks.17.ff_out.bias", + "model.diffusion_model.input_blocks.2.1.time_stack.0.ff.net.2.weight": "blocks.17.ff_out.weight", + "model.diffusion_model.input_blocks.2.1.time_stack.0.ff_in.net.0.proj.bias": "blocks.17.act_fn_in.proj.bias", + "model.diffusion_model.input_blocks.2.1.time_stack.0.ff_in.net.0.proj.weight": "blocks.17.act_fn_in.proj.weight", + 
"model.diffusion_model.input_blocks.2.1.time_stack.0.ff_in.net.2.bias": "blocks.17.ff_in.bias", + "model.diffusion_model.input_blocks.2.1.time_stack.0.ff_in.net.2.weight": "blocks.17.ff_in.weight", + "model.diffusion_model.input_blocks.2.1.time_stack.0.norm1.bias": "blocks.17.norm1.bias", + "model.diffusion_model.input_blocks.2.1.time_stack.0.norm1.weight": "blocks.17.norm1.weight", + "model.diffusion_model.input_blocks.2.1.time_stack.0.norm2.bias": "blocks.17.norm2.bias", + "model.diffusion_model.input_blocks.2.1.time_stack.0.norm2.weight": "blocks.17.norm2.weight", + "model.diffusion_model.input_blocks.2.1.time_stack.0.norm3.bias": "blocks.17.norm_out.bias", + "model.diffusion_model.input_blocks.2.1.time_stack.0.norm3.weight": "blocks.17.norm_out.weight", + "model.diffusion_model.input_blocks.2.1.time_stack.0.norm_in.bias": "blocks.17.norm_in.bias", + "model.diffusion_model.input_blocks.2.1.time_stack.0.norm_in.weight": "blocks.17.norm_in.weight", + "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn1.to_k.weight": "blocks.15.transformer_blocks.0.attn1.to_k.weight", + "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.15.transformer_blocks.0.attn1.to_out.bias", + "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.15.transformer_blocks.0.attn1.to_out.weight", + "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn1.to_q.weight": "blocks.15.transformer_blocks.0.attn1.to_q.weight", + "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn1.to_v.weight": "blocks.15.transformer_blocks.0.attn1.to_v.weight", + "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight": "blocks.15.transformer_blocks.0.attn2.to_k.weight", + "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.15.transformer_blocks.0.attn2.to_out.bias", + "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.15.transformer_blocks.0.attn2.to_out.weight", + "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_q.weight": "blocks.15.transformer_blocks.0.attn2.to_q.weight", + "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_v.weight": "blocks.15.transformer_blocks.0.attn2.to_v.weight", + "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.15.transformer_blocks.0.act_fn.proj.bias", + "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.15.transformer_blocks.0.act_fn.proj.weight", + "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.ff.net.2.bias": "blocks.15.transformer_blocks.0.ff.bias", + "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.ff.net.2.weight": "blocks.15.transformer_blocks.0.ff.weight", + "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.norm1.bias": "blocks.15.transformer_blocks.0.norm1.bias", + "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.norm1.weight": "blocks.15.transformer_blocks.0.norm1.weight", + "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.norm2.bias": "blocks.15.transformer_blocks.0.norm2.bias", + "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.norm2.weight": "blocks.15.transformer_blocks.0.norm2.weight", + "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.norm3.bias": "blocks.15.transformer_blocks.0.norm3.bias", + "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.norm3.weight": 
"blocks.15.transformer_blocks.0.norm3.weight", + "model.diffusion_model.input_blocks.3.0.op.bias": "blocks.20.conv.bias", + "model.diffusion_model.input_blocks.3.0.op.weight": "blocks.20.conv.weight", + "model.diffusion_model.input_blocks.4.0.emb_layers.1.bias": "blocks.22.time_emb_proj.bias", + "model.diffusion_model.input_blocks.4.0.emb_layers.1.weight": "blocks.22.time_emb_proj.weight", + "model.diffusion_model.input_blocks.4.0.in_layers.0.bias": "blocks.22.norm1.bias", + "model.diffusion_model.input_blocks.4.0.in_layers.0.weight": "blocks.22.norm1.weight", + "model.diffusion_model.input_blocks.4.0.in_layers.2.bias": "blocks.22.conv1.bias", + "model.diffusion_model.input_blocks.4.0.in_layers.2.weight": "blocks.22.conv1.weight", + "model.diffusion_model.input_blocks.4.0.out_layers.0.bias": "blocks.22.norm2.bias", + "model.diffusion_model.input_blocks.4.0.out_layers.0.weight": "blocks.22.norm2.weight", + "model.diffusion_model.input_blocks.4.0.out_layers.3.bias": "blocks.22.conv2.bias", + "model.diffusion_model.input_blocks.4.0.out_layers.3.weight": "blocks.22.conv2.weight", + "model.diffusion_model.input_blocks.4.0.skip_connection.bias": "blocks.22.conv_shortcut.bias", + "model.diffusion_model.input_blocks.4.0.skip_connection.weight": "blocks.22.conv_shortcut.weight", + "model.diffusion_model.input_blocks.4.0.time_mixer.mix_factor": "blocks.25.mix_factor", + "model.diffusion_model.input_blocks.4.0.time_stack.emb_layers.1.bias": "blocks.24.time_emb_proj.bias", + "model.diffusion_model.input_blocks.4.0.time_stack.emb_layers.1.weight": "blocks.24.time_emb_proj.weight", + "model.diffusion_model.input_blocks.4.0.time_stack.in_layers.0.bias": "blocks.24.norm1.bias", + "model.diffusion_model.input_blocks.4.0.time_stack.in_layers.0.weight": "blocks.24.norm1.weight", + "model.diffusion_model.input_blocks.4.0.time_stack.in_layers.2.bias": "blocks.24.conv1.bias", + "model.diffusion_model.input_blocks.4.0.time_stack.in_layers.2.weight": "blocks.24.conv1.weight", + "model.diffusion_model.input_blocks.4.0.time_stack.out_layers.0.bias": "blocks.24.norm2.bias", + "model.diffusion_model.input_blocks.4.0.time_stack.out_layers.0.weight": "blocks.24.norm2.weight", + "model.diffusion_model.input_blocks.4.0.time_stack.out_layers.3.bias": "blocks.24.conv2.bias", + "model.diffusion_model.input_blocks.4.0.time_stack.out_layers.3.weight": "blocks.24.conv2.weight", + "model.diffusion_model.input_blocks.4.1.norm.bias": "blocks.27.norm.bias", + "model.diffusion_model.input_blocks.4.1.norm.weight": "blocks.27.norm.weight", + "model.diffusion_model.input_blocks.4.1.proj_in.bias": "blocks.27.proj_in.bias", + "model.diffusion_model.input_blocks.4.1.proj_in.weight": "blocks.27.proj_in.weight", + "model.diffusion_model.input_blocks.4.1.proj_out.bias": "blocks.30.proj.bias", + "model.diffusion_model.input_blocks.4.1.proj_out.weight": "blocks.30.proj.weight", + "model.diffusion_model.input_blocks.4.1.time_mixer.mix_factor": "blocks.30.mix_factor", + "model.diffusion_model.input_blocks.4.1.time_pos_embed.0.bias": "blocks.29.positional_embedding_proj.0.bias", + "model.diffusion_model.input_blocks.4.1.time_pos_embed.0.weight": "blocks.29.positional_embedding_proj.0.weight", + "model.diffusion_model.input_blocks.4.1.time_pos_embed.2.bias": "blocks.29.positional_embedding_proj.2.bias", + "model.diffusion_model.input_blocks.4.1.time_pos_embed.2.weight": "blocks.29.positional_embedding_proj.2.weight", + "model.diffusion_model.input_blocks.4.1.time_stack.0.attn1.to_k.weight": "blocks.29.attn1.to_k.weight", + 
"model.diffusion_model.input_blocks.4.1.time_stack.0.attn1.to_out.0.bias": "blocks.29.attn1.to_out.bias", + "model.diffusion_model.input_blocks.4.1.time_stack.0.attn1.to_out.0.weight": "blocks.29.attn1.to_out.weight", + "model.diffusion_model.input_blocks.4.1.time_stack.0.attn1.to_q.weight": "blocks.29.attn1.to_q.weight", + "model.diffusion_model.input_blocks.4.1.time_stack.0.attn1.to_v.weight": "blocks.29.attn1.to_v.weight", + "model.diffusion_model.input_blocks.4.1.time_stack.0.attn2.to_k.weight": "blocks.29.attn2.to_k.weight", + "model.diffusion_model.input_blocks.4.1.time_stack.0.attn2.to_out.0.bias": "blocks.29.attn2.to_out.bias", + "model.diffusion_model.input_blocks.4.1.time_stack.0.attn2.to_out.0.weight": "blocks.29.attn2.to_out.weight", + "model.diffusion_model.input_blocks.4.1.time_stack.0.attn2.to_q.weight": "blocks.29.attn2.to_q.weight", + "model.diffusion_model.input_blocks.4.1.time_stack.0.attn2.to_v.weight": "blocks.29.attn2.to_v.weight", + "model.diffusion_model.input_blocks.4.1.time_stack.0.ff.net.0.proj.bias": "blocks.29.act_fn_out.proj.bias", + "model.diffusion_model.input_blocks.4.1.time_stack.0.ff.net.0.proj.weight": "blocks.29.act_fn_out.proj.weight", + "model.diffusion_model.input_blocks.4.1.time_stack.0.ff.net.2.bias": "blocks.29.ff_out.bias", + "model.diffusion_model.input_blocks.4.1.time_stack.0.ff.net.2.weight": "blocks.29.ff_out.weight", + "model.diffusion_model.input_blocks.4.1.time_stack.0.ff_in.net.0.proj.bias": "blocks.29.act_fn_in.proj.bias", + "model.diffusion_model.input_blocks.4.1.time_stack.0.ff_in.net.0.proj.weight": "blocks.29.act_fn_in.proj.weight", + "model.diffusion_model.input_blocks.4.1.time_stack.0.ff_in.net.2.bias": "blocks.29.ff_in.bias", + "model.diffusion_model.input_blocks.4.1.time_stack.0.ff_in.net.2.weight": "blocks.29.ff_in.weight", + "model.diffusion_model.input_blocks.4.1.time_stack.0.norm1.bias": "blocks.29.norm1.bias", + "model.diffusion_model.input_blocks.4.1.time_stack.0.norm1.weight": "blocks.29.norm1.weight", + "model.diffusion_model.input_blocks.4.1.time_stack.0.norm2.bias": "blocks.29.norm2.bias", + "model.diffusion_model.input_blocks.4.1.time_stack.0.norm2.weight": "blocks.29.norm2.weight", + "model.diffusion_model.input_blocks.4.1.time_stack.0.norm3.bias": "blocks.29.norm_out.bias", + "model.diffusion_model.input_blocks.4.1.time_stack.0.norm3.weight": "blocks.29.norm_out.weight", + "model.diffusion_model.input_blocks.4.1.time_stack.0.norm_in.bias": "blocks.29.norm_in.bias", + "model.diffusion_model.input_blocks.4.1.time_stack.0.norm_in.weight": "blocks.29.norm_in.weight", + "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn1.to_k.weight": "blocks.27.transformer_blocks.0.attn1.to_k.weight", + "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.27.transformer_blocks.0.attn1.to_out.bias", + "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.27.transformer_blocks.0.attn1.to_out.weight", + "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn1.to_q.weight": "blocks.27.transformer_blocks.0.attn1.to_q.weight", + "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn1.to_v.weight": "blocks.27.transformer_blocks.0.attn1.to_v.weight", + "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn2.to_k.weight": "blocks.27.transformer_blocks.0.attn2.to_k.weight", + "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.27.transformer_blocks.0.attn2.to_out.bias", + 
"model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.27.transformer_blocks.0.attn2.to_out.weight", + "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn2.to_q.weight": "blocks.27.transformer_blocks.0.attn2.to_q.weight", + "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn2.to_v.weight": "blocks.27.transformer_blocks.0.attn2.to_v.weight", + "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.27.transformer_blocks.0.act_fn.proj.bias", + "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.27.transformer_blocks.0.act_fn.proj.weight", + "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.ff.net.2.bias": "blocks.27.transformer_blocks.0.ff.bias", + "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.ff.net.2.weight": "blocks.27.transformer_blocks.0.ff.weight", + "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.norm1.bias": "blocks.27.transformer_blocks.0.norm1.bias", + "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.norm1.weight": "blocks.27.transformer_blocks.0.norm1.weight", + "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.norm2.bias": "blocks.27.transformer_blocks.0.norm2.bias", + "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.norm2.weight": "blocks.27.transformer_blocks.0.norm2.weight", + "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.norm3.bias": "blocks.27.transformer_blocks.0.norm3.bias", + "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.norm3.weight": "blocks.27.transformer_blocks.0.norm3.weight", + "model.diffusion_model.input_blocks.5.0.emb_layers.1.bias": "blocks.32.time_emb_proj.bias", + "model.diffusion_model.input_blocks.5.0.emb_layers.1.weight": "blocks.32.time_emb_proj.weight", + "model.diffusion_model.input_blocks.5.0.in_layers.0.bias": "blocks.32.norm1.bias", + "model.diffusion_model.input_blocks.5.0.in_layers.0.weight": "blocks.32.norm1.weight", + "model.diffusion_model.input_blocks.5.0.in_layers.2.bias": "blocks.32.conv1.bias", + "model.diffusion_model.input_blocks.5.0.in_layers.2.weight": "blocks.32.conv1.weight", + "model.diffusion_model.input_blocks.5.0.out_layers.0.bias": "blocks.32.norm2.bias", + "model.diffusion_model.input_blocks.5.0.out_layers.0.weight": "blocks.32.norm2.weight", + "model.diffusion_model.input_blocks.5.0.out_layers.3.bias": "blocks.32.conv2.bias", + "model.diffusion_model.input_blocks.5.0.out_layers.3.weight": "blocks.32.conv2.weight", + "model.diffusion_model.input_blocks.5.0.time_mixer.mix_factor": "blocks.35.mix_factor", + "model.diffusion_model.input_blocks.5.0.time_stack.emb_layers.1.bias": "blocks.34.time_emb_proj.bias", + "model.diffusion_model.input_blocks.5.0.time_stack.emb_layers.1.weight": "blocks.34.time_emb_proj.weight", + "model.diffusion_model.input_blocks.5.0.time_stack.in_layers.0.bias": "blocks.34.norm1.bias", + "model.diffusion_model.input_blocks.5.0.time_stack.in_layers.0.weight": "blocks.34.norm1.weight", + "model.diffusion_model.input_blocks.5.0.time_stack.in_layers.2.bias": "blocks.34.conv1.bias", + "model.diffusion_model.input_blocks.5.0.time_stack.in_layers.2.weight": "blocks.34.conv1.weight", + "model.diffusion_model.input_blocks.5.0.time_stack.out_layers.0.bias": "blocks.34.norm2.bias", + "model.diffusion_model.input_blocks.5.0.time_stack.out_layers.0.weight": "blocks.34.norm2.weight", + "model.diffusion_model.input_blocks.5.0.time_stack.out_layers.3.bias": "blocks.34.conv2.bias", + 
"model.diffusion_model.input_blocks.5.0.time_stack.out_layers.3.weight": "blocks.34.conv2.weight", + "model.diffusion_model.input_blocks.5.1.norm.bias": "blocks.37.norm.bias", + "model.diffusion_model.input_blocks.5.1.norm.weight": "blocks.37.norm.weight", + "model.diffusion_model.input_blocks.5.1.proj_in.bias": "blocks.37.proj_in.bias", + "model.diffusion_model.input_blocks.5.1.proj_in.weight": "blocks.37.proj_in.weight", + "model.diffusion_model.input_blocks.5.1.proj_out.bias": "blocks.40.proj.bias", + "model.diffusion_model.input_blocks.5.1.proj_out.weight": "blocks.40.proj.weight", + "model.diffusion_model.input_blocks.5.1.time_mixer.mix_factor": "blocks.40.mix_factor", + "model.diffusion_model.input_blocks.5.1.time_pos_embed.0.bias": "blocks.39.positional_embedding_proj.0.bias", + "model.diffusion_model.input_blocks.5.1.time_pos_embed.0.weight": "blocks.39.positional_embedding_proj.0.weight", + "model.diffusion_model.input_blocks.5.1.time_pos_embed.2.bias": "blocks.39.positional_embedding_proj.2.bias", + "model.diffusion_model.input_blocks.5.1.time_pos_embed.2.weight": "blocks.39.positional_embedding_proj.2.weight", + "model.diffusion_model.input_blocks.5.1.time_stack.0.attn1.to_k.weight": "blocks.39.attn1.to_k.weight", + "model.diffusion_model.input_blocks.5.1.time_stack.0.attn1.to_out.0.bias": "blocks.39.attn1.to_out.bias", + "model.diffusion_model.input_blocks.5.1.time_stack.0.attn1.to_out.0.weight": "blocks.39.attn1.to_out.weight", + "model.diffusion_model.input_blocks.5.1.time_stack.0.attn1.to_q.weight": "blocks.39.attn1.to_q.weight", + "model.diffusion_model.input_blocks.5.1.time_stack.0.attn1.to_v.weight": "blocks.39.attn1.to_v.weight", + "model.diffusion_model.input_blocks.5.1.time_stack.0.attn2.to_k.weight": "blocks.39.attn2.to_k.weight", + "model.diffusion_model.input_blocks.5.1.time_stack.0.attn2.to_out.0.bias": "blocks.39.attn2.to_out.bias", + "model.diffusion_model.input_blocks.5.1.time_stack.0.attn2.to_out.0.weight": "blocks.39.attn2.to_out.weight", + "model.diffusion_model.input_blocks.5.1.time_stack.0.attn2.to_q.weight": "blocks.39.attn2.to_q.weight", + "model.diffusion_model.input_blocks.5.1.time_stack.0.attn2.to_v.weight": "blocks.39.attn2.to_v.weight", + "model.diffusion_model.input_blocks.5.1.time_stack.0.ff.net.0.proj.bias": "blocks.39.act_fn_out.proj.bias", + "model.diffusion_model.input_blocks.5.1.time_stack.0.ff.net.0.proj.weight": "blocks.39.act_fn_out.proj.weight", + "model.diffusion_model.input_blocks.5.1.time_stack.0.ff.net.2.bias": "blocks.39.ff_out.bias", + "model.diffusion_model.input_blocks.5.1.time_stack.0.ff.net.2.weight": "blocks.39.ff_out.weight", + "model.diffusion_model.input_blocks.5.1.time_stack.0.ff_in.net.0.proj.bias": "blocks.39.act_fn_in.proj.bias", + "model.diffusion_model.input_blocks.5.1.time_stack.0.ff_in.net.0.proj.weight": "blocks.39.act_fn_in.proj.weight", + "model.diffusion_model.input_blocks.5.1.time_stack.0.ff_in.net.2.bias": "blocks.39.ff_in.bias", + "model.diffusion_model.input_blocks.5.1.time_stack.0.ff_in.net.2.weight": "blocks.39.ff_in.weight", + "model.diffusion_model.input_blocks.5.1.time_stack.0.norm1.bias": "blocks.39.norm1.bias", + "model.diffusion_model.input_blocks.5.1.time_stack.0.norm1.weight": "blocks.39.norm1.weight", + "model.diffusion_model.input_blocks.5.1.time_stack.0.norm2.bias": "blocks.39.norm2.bias", + "model.diffusion_model.input_blocks.5.1.time_stack.0.norm2.weight": "blocks.39.norm2.weight", + "model.diffusion_model.input_blocks.5.1.time_stack.0.norm3.bias": "blocks.39.norm_out.bias", + 
"model.diffusion_model.input_blocks.5.1.time_stack.0.norm3.weight": "blocks.39.norm_out.weight", + "model.diffusion_model.input_blocks.5.1.time_stack.0.norm_in.bias": "blocks.39.norm_in.bias", + "model.diffusion_model.input_blocks.5.1.time_stack.0.norm_in.weight": "blocks.39.norm_in.weight", + "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn1.to_k.weight": "blocks.37.transformer_blocks.0.attn1.to_k.weight", + "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.37.transformer_blocks.0.attn1.to_out.bias", + "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.37.transformer_blocks.0.attn1.to_out.weight", + "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn1.to_q.weight": "blocks.37.transformer_blocks.0.attn1.to_q.weight", + "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn1.to_v.weight": "blocks.37.transformer_blocks.0.attn1.to_v.weight", + "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn2.to_k.weight": "blocks.37.transformer_blocks.0.attn2.to_k.weight", + "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.37.transformer_blocks.0.attn2.to_out.bias", + "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.37.transformer_blocks.0.attn2.to_out.weight", + "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn2.to_q.weight": "blocks.37.transformer_blocks.0.attn2.to_q.weight", + "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn2.to_v.weight": "blocks.37.transformer_blocks.0.attn2.to_v.weight", + "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.37.transformer_blocks.0.act_fn.proj.bias", + "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.37.transformer_blocks.0.act_fn.proj.weight", + "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.ff.net.2.bias": "blocks.37.transformer_blocks.0.ff.bias", + "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.ff.net.2.weight": "blocks.37.transformer_blocks.0.ff.weight", + "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.norm1.bias": "blocks.37.transformer_blocks.0.norm1.bias", + "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.norm1.weight": "blocks.37.transformer_blocks.0.norm1.weight", + "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.norm2.bias": "blocks.37.transformer_blocks.0.norm2.bias", + "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.norm2.weight": "blocks.37.transformer_blocks.0.norm2.weight", + "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.norm3.bias": "blocks.37.transformer_blocks.0.norm3.bias", + "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.norm3.weight": "blocks.37.transformer_blocks.0.norm3.weight", + "model.diffusion_model.input_blocks.6.0.op.bias": "blocks.42.conv.bias", + "model.diffusion_model.input_blocks.6.0.op.weight": "blocks.42.conv.weight", + "model.diffusion_model.input_blocks.7.0.emb_layers.1.bias": "blocks.44.time_emb_proj.bias", + "model.diffusion_model.input_blocks.7.0.emb_layers.1.weight": "blocks.44.time_emb_proj.weight", + "model.diffusion_model.input_blocks.7.0.in_layers.0.bias": "blocks.44.norm1.bias", + "model.diffusion_model.input_blocks.7.0.in_layers.0.weight": "blocks.44.norm1.weight", + "model.diffusion_model.input_blocks.7.0.in_layers.2.bias": "blocks.44.conv1.bias", + 
"model.diffusion_model.input_blocks.7.0.in_layers.2.weight": "blocks.44.conv1.weight", + "model.diffusion_model.input_blocks.7.0.out_layers.0.bias": "blocks.44.norm2.bias", + "model.diffusion_model.input_blocks.7.0.out_layers.0.weight": "blocks.44.norm2.weight", + "model.diffusion_model.input_blocks.7.0.out_layers.3.bias": "blocks.44.conv2.bias", + "model.diffusion_model.input_blocks.7.0.out_layers.3.weight": "blocks.44.conv2.weight", + "model.diffusion_model.input_blocks.7.0.skip_connection.bias": "blocks.44.conv_shortcut.bias", + "model.diffusion_model.input_blocks.7.0.skip_connection.weight": "blocks.44.conv_shortcut.weight", + "model.diffusion_model.input_blocks.7.0.time_mixer.mix_factor": "blocks.47.mix_factor", + "model.diffusion_model.input_blocks.7.0.time_stack.emb_layers.1.bias": "blocks.46.time_emb_proj.bias", + "model.diffusion_model.input_blocks.7.0.time_stack.emb_layers.1.weight": "blocks.46.time_emb_proj.weight", + "model.diffusion_model.input_blocks.7.0.time_stack.in_layers.0.bias": "blocks.46.norm1.bias", + "model.diffusion_model.input_blocks.7.0.time_stack.in_layers.0.weight": "blocks.46.norm1.weight", + "model.diffusion_model.input_blocks.7.0.time_stack.in_layers.2.bias": "blocks.46.conv1.bias", + "model.diffusion_model.input_blocks.7.0.time_stack.in_layers.2.weight": "blocks.46.conv1.weight", + "model.diffusion_model.input_blocks.7.0.time_stack.out_layers.0.bias": "blocks.46.norm2.bias", + "model.diffusion_model.input_blocks.7.0.time_stack.out_layers.0.weight": "blocks.46.norm2.weight", + "model.diffusion_model.input_blocks.7.0.time_stack.out_layers.3.bias": "blocks.46.conv2.bias", + "model.diffusion_model.input_blocks.7.0.time_stack.out_layers.3.weight": "blocks.46.conv2.weight", + "model.diffusion_model.input_blocks.7.1.norm.bias": "blocks.49.norm.bias", + "model.diffusion_model.input_blocks.7.1.norm.weight": "blocks.49.norm.weight", + "model.diffusion_model.input_blocks.7.1.proj_in.bias": "blocks.49.proj_in.bias", + "model.diffusion_model.input_blocks.7.1.proj_in.weight": "blocks.49.proj_in.weight", + "model.diffusion_model.input_blocks.7.1.proj_out.bias": "blocks.52.proj.bias", + "model.diffusion_model.input_blocks.7.1.proj_out.weight": "blocks.52.proj.weight", + "model.diffusion_model.input_blocks.7.1.time_mixer.mix_factor": "blocks.52.mix_factor", + "model.diffusion_model.input_blocks.7.1.time_pos_embed.0.bias": "blocks.51.positional_embedding_proj.0.bias", + "model.diffusion_model.input_blocks.7.1.time_pos_embed.0.weight": "blocks.51.positional_embedding_proj.0.weight", + "model.diffusion_model.input_blocks.7.1.time_pos_embed.2.bias": "blocks.51.positional_embedding_proj.2.bias", + "model.diffusion_model.input_blocks.7.1.time_pos_embed.2.weight": "blocks.51.positional_embedding_proj.2.weight", + "model.diffusion_model.input_blocks.7.1.time_stack.0.attn1.to_k.weight": "blocks.51.attn1.to_k.weight", + "model.diffusion_model.input_blocks.7.1.time_stack.0.attn1.to_out.0.bias": "blocks.51.attn1.to_out.bias", + "model.diffusion_model.input_blocks.7.1.time_stack.0.attn1.to_out.0.weight": "blocks.51.attn1.to_out.weight", + "model.diffusion_model.input_blocks.7.1.time_stack.0.attn1.to_q.weight": "blocks.51.attn1.to_q.weight", + "model.diffusion_model.input_blocks.7.1.time_stack.0.attn1.to_v.weight": "blocks.51.attn1.to_v.weight", + "model.diffusion_model.input_blocks.7.1.time_stack.0.attn2.to_k.weight": "blocks.51.attn2.to_k.weight", + "model.diffusion_model.input_blocks.7.1.time_stack.0.attn2.to_out.0.bias": "blocks.51.attn2.to_out.bias", + 
"model.diffusion_model.input_blocks.7.1.time_stack.0.attn2.to_out.0.weight": "blocks.51.attn2.to_out.weight", + "model.diffusion_model.input_blocks.7.1.time_stack.0.attn2.to_q.weight": "blocks.51.attn2.to_q.weight", + "model.diffusion_model.input_blocks.7.1.time_stack.0.attn2.to_v.weight": "blocks.51.attn2.to_v.weight", + "model.diffusion_model.input_blocks.7.1.time_stack.0.ff.net.0.proj.bias": "blocks.51.act_fn_out.proj.bias", + "model.diffusion_model.input_blocks.7.1.time_stack.0.ff.net.0.proj.weight": "blocks.51.act_fn_out.proj.weight", + "model.diffusion_model.input_blocks.7.1.time_stack.0.ff.net.2.bias": "blocks.51.ff_out.bias", + "model.diffusion_model.input_blocks.7.1.time_stack.0.ff.net.2.weight": "blocks.51.ff_out.weight", + "model.diffusion_model.input_blocks.7.1.time_stack.0.ff_in.net.0.proj.bias": "blocks.51.act_fn_in.proj.bias", + "model.diffusion_model.input_blocks.7.1.time_stack.0.ff_in.net.0.proj.weight": "blocks.51.act_fn_in.proj.weight", + "model.diffusion_model.input_blocks.7.1.time_stack.0.ff_in.net.2.bias": "blocks.51.ff_in.bias", + "model.diffusion_model.input_blocks.7.1.time_stack.0.ff_in.net.2.weight": "blocks.51.ff_in.weight", + "model.diffusion_model.input_blocks.7.1.time_stack.0.norm1.bias": "blocks.51.norm1.bias", + "model.diffusion_model.input_blocks.7.1.time_stack.0.norm1.weight": "blocks.51.norm1.weight", + "model.diffusion_model.input_blocks.7.1.time_stack.0.norm2.bias": "blocks.51.norm2.bias", + "model.diffusion_model.input_blocks.7.1.time_stack.0.norm2.weight": "blocks.51.norm2.weight", + "model.diffusion_model.input_blocks.7.1.time_stack.0.norm3.bias": "blocks.51.norm_out.bias", + "model.diffusion_model.input_blocks.7.1.time_stack.0.norm3.weight": "blocks.51.norm_out.weight", + "model.diffusion_model.input_blocks.7.1.time_stack.0.norm_in.bias": "blocks.51.norm_in.bias", + "model.diffusion_model.input_blocks.7.1.time_stack.0.norm_in.weight": "blocks.51.norm_in.weight", + "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn1.to_k.weight": "blocks.49.transformer_blocks.0.attn1.to_k.weight", + "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.49.transformer_blocks.0.attn1.to_out.bias", + "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.49.transformer_blocks.0.attn1.to_out.weight", + "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn1.to_q.weight": "blocks.49.transformer_blocks.0.attn1.to_q.weight", + "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn1.to_v.weight": "blocks.49.transformer_blocks.0.attn1.to_v.weight", + "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn2.to_k.weight": "blocks.49.transformer_blocks.0.attn2.to_k.weight", + "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.49.transformer_blocks.0.attn2.to_out.bias", + "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.49.transformer_blocks.0.attn2.to_out.weight", + "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn2.to_q.weight": "blocks.49.transformer_blocks.0.attn2.to_q.weight", + "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn2.to_v.weight": "blocks.49.transformer_blocks.0.attn2.to_v.weight", + "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.49.transformer_blocks.0.act_fn.proj.bias", + "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.ff.net.0.proj.weight": 
"blocks.49.transformer_blocks.0.act_fn.proj.weight", + "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.ff.net.2.bias": "blocks.49.transformer_blocks.0.ff.bias", + "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.ff.net.2.weight": "blocks.49.transformer_blocks.0.ff.weight", + "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.norm1.bias": "blocks.49.transformer_blocks.0.norm1.bias", + "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.norm1.weight": "blocks.49.transformer_blocks.0.norm1.weight", + "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.norm2.bias": "blocks.49.transformer_blocks.0.norm2.bias", + "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.norm2.weight": "blocks.49.transformer_blocks.0.norm2.weight", + "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.norm3.bias": "blocks.49.transformer_blocks.0.norm3.bias", + "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.norm3.weight": "blocks.49.transformer_blocks.0.norm3.weight", + "model.diffusion_model.input_blocks.8.0.emb_layers.1.bias": "blocks.54.time_emb_proj.bias", + "model.diffusion_model.input_blocks.8.0.emb_layers.1.weight": "blocks.54.time_emb_proj.weight", + "model.diffusion_model.input_blocks.8.0.in_layers.0.bias": "blocks.54.norm1.bias", + "model.diffusion_model.input_blocks.8.0.in_layers.0.weight": "blocks.54.norm1.weight", + "model.diffusion_model.input_blocks.8.0.in_layers.2.bias": "blocks.54.conv1.bias", + "model.diffusion_model.input_blocks.8.0.in_layers.2.weight": "blocks.54.conv1.weight", + "model.diffusion_model.input_blocks.8.0.out_layers.0.bias": "blocks.54.norm2.bias", + "model.diffusion_model.input_blocks.8.0.out_layers.0.weight": "blocks.54.norm2.weight", + "model.diffusion_model.input_blocks.8.0.out_layers.3.bias": "blocks.54.conv2.bias", + "model.diffusion_model.input_blocks.8.0.out_layers.3.weight": "blocks.54.conv2.weight", + "model.diffusion_model.input_blocks.8.0.time_mixer.mix_factor": "blocks.57.mix_factor", + "model.diffusion_model.input_blocks.8.0.time_stack.emb_layers.1.bias": "blocks.56.time_emb_proj.bias", + "model.diffusion_model.input_blocks.8.0.time_stack.emb_layers.1.weight": "blocks.56.time_emb_proj.weight", + "model.diffusion_model.input_blocks.8.0.time_stack.in_layers.0.bias": "blocks.56.norm1.bias", + "model.diffusion_model.input_blocks.8.0.time_stack.in_layers.0.weight": "blocks.56.norm1.weight", + "model.diffusion_model.input_blocks.8.0.time_stack.in_layers.2.bias": "blocks.56.conv1.bias", + "model.diffusion_model.input_blocks.8.0.time_stack.in_layers.2.weight": "blocks.56.conv1.weight", + "model.diffusion_model.input_blocks.8.0.time_stack.out_layers.0.bias": "blocks.56.norm2.bias", + "model.diffusion_model.input_blocks.8.0.time_stack.out_layers.0.weight": "blocks.56.norm2.weight", + "model.diffusion_model.input_blocks.8.0.time_stack.out_layers.3.bias": "blocks.56.conv2.bias", + "model.diffusion_model.input_blocks.8.0.time_stack.out_layers.3.weight": "blocks.56.conv2.weight", + "model.diffusion_model.input_blocks.8.1.norm.bias": "blocks.59.norm.bias", + "model.diffusion_model.input_blocks.8.1.norm.weight": "blocks.59.norm.weight", + "model.diffusion_model.input_blocks.8.1.proj_in.bias": "blocks.59.proj_in.bias", + "model.diffusion_model.input_blocks.8.1.proj_in.weight": "blocks.59.proj_in.weight", + "model.diffusion_model.input_blocks.8.1.proj_out.bias": "blocks.62.proj.bias", + "model.diffusion_model.input_blocks.8.1.proj_out.weight": "blocks.62.proj.weight", + 
"model.diffusion_model.input_blocks.8.1.time_mixer.mix_factor": "blocks.62.mix_factor", + "model.diffusion_model.input_blocks.8.1.time_pos_embed.0.bias": "blocks.61.positional_embedding_proj.0.bias", + "model.diffusion_model.input_blocks.8.1.time_pos_embed.0.weight": "blocks.61.positional_embedding_proj.0.weight", + "model.diffusion_model.input_blocks.8.1.time_pos_embed.2.bias": "blocks.61.positional_embedding_proj.2.bias", + "model.diffusion_model.input_blocks.8.1.time_pos_embed.2.weight": "blocks.61.positional_embedding_proj.2.weight", + "model.diffusion_model.input_blocks.8.1.time_stack.0.attn1.to_k.weight": "blocks.61.attn1.to_k.weight", + "model.diffusion_model.input_blocks.8.1.time_stack.0.attn1.to_out.0.bias": "blocks.61.attn1.to_out.bias", + "model.diffusion_model.input_blocks.8.1.time_stack.0.attn1.to_out.0.weight": "blocks.61.attn1.to_out.weight", + "model.diffusion_model.input_blocks.8.1.time_stack.0.attn1.to_q.weight": "blocks.61.attn1.to_q.weight", + "model.diffusion_model.input_blocks.8.1.time_stack.0.attn1.to_v.weight": "blocks.61.attn1.to_v.weight", + "model.diffusion_model.input_blocks.8.1.time_stack.0.attn2.to_k.weight": "blocks.61.attn2.to_k.weight", + "model.diffusion_model.input_blocks.8.1.time_stack.0.attn2.to_out.0.bias": "blocks.61.attn2.to_out.bias", + "model.diffusion_model.input_blocks.8.1.time_stack.0.attn2.to_out.0.weight": "blocks.61.attn2.to_out.weight", + "model.diffusion_model.input_blocks.8.1.time_stack.0.attn2.to_q.weight": "blocks.61.attn2.to_q.weight", + "model.diffusion_model.input_blocks.8.1.time_stack.0.attn2.to_v.weight": "blocks.61.attn2.to_v.weight", + "model.diffusion_model.input_blocks.8.1.time_stack.0.ff.net.0.proj.bias": "blocks.61.act_fn_out.proj.bias", + "model.diffusion_model.input_blocks.8.1.time_stack.0.ff.net.0.proj.weight": "blocks.61.act_fn_out.proj.weight", + "model.diffusion_model.input_blocks.8.1.time_stack.0.ff.net.2.bias": "blocks.61.ff_out.bias", + "model.diffusion_model.input_blocks.8.1.time_stack.0.ff.net.2.weight": "blocks.61.ff_out.weight", + "model.diffusion_model.input_blocks.8.1.time_stack.0.ff_in.net.0.proj.bias": "blocks.61.act_fn_in.proj.bias", + "model.diffusion_model.input_blocks.8.1.time_stack.0.ff_in.net.0.proj.weight": "blocks.61.act_fn_in.proj.weight", + "model.diffusion_model.input_blocks.8.1.time_stack.0.ff_in.net.2.bias": "blocks.61.ff_in.bias", + "model.diffusion_model.input_blocks.8.1.time_stack.0.ff_in.net.2.weight": "blocks.61.ff_in.weight", + "model.diffusion_model.input_blocks.8.1.time_stack.0.norm1.bias": "blocks.61.norm1.bias", + "model.diffusion_model.input_blocks.8.1.time_stack.0.norm1.weight": "blocks.61.norm1.weight", + "model.diffusion_model.input_blocks.8.1.time_stack.0.norm2.bias": "blocks.61.norm2.bias", + "model.diffusion_model.input_blocks.8.1.time_stack.0.norm2.weight": "blocks.61.norm2.weight", + "model.diffusion_model.input_blocks.8.1.time_stack.0.norm3.bias": "blocks.61.norm_out.bias", + "model.diffusion_model.input_blocks.8.1.time_stack.0.norm3.weight": "blocks.61.norm_out.weight", + "model.diffusion_model.input_blocks.8.1.time_stack.0.norm_in.bias": "blocks.61.norm_in.bias", + "model.diffusion_model.input_blocks.8.1.time_stack.0.norm_in.weight": "blocks.61.norm_in.weight", + "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn1.to_k.weight": "blocks.59.transformer_blocks.0.attn1.to_k.weight", + "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.59.transformer_blocks.0.attn1.to_out.bias", + 
"model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.59.transformer_blocks.0.attn1.to_out.weight", + "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn1.to_q.weight": "blocks.59.transformer_blocks.0.attn1.to_q.weight", + "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn1.to_v.weight": "blocks.59.transformer_blocks.0.attn1.to_v.weight", + "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn2.to_k.weight": "blocks.59.transformer_blocks.0.attn2.to_k.weight", + "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.59.transformer_blocks.0.attn2.to_out.bias", + "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.59.transformer_blocks.0.attn2.to_out.weight", + "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn2.to_q.weight": "blocks.59.transformer_blocks.0.attn2.to_q.weight", + "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn2.to_v.weight": "blocks.59.transformer_blocks.0.attn2.to_v.weight", + "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.59.transformer_blocks.0.act_fn.proj.bias", + "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.59.transformer_blocks.0.act_fn.proj.weight", + "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.ff.net.2.bias": "blocks.59.transformer_blocks.0.ff.bias", + "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.ff.net.2.weight": "blocks.59.transformer_blocks.0.ff.weight", + "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.norm1.bias": "blocks.59.transformer_blocks.0.norm1.bias", + "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.norm1.weight": "blocks.59.transformer_blocks.0.norm1.weight", + "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.norm2.bias": "blocks.59.transformer_blocks.0.norm2.bias", + "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.norm2.weight": "blocks.59.transformer_blocks.0.norm2.weight", + "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.norm3.bias": "blocks.59.transformer_blocks.0.norm3.bias", + "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.norm3.weight": "blocks.59.transformer_blocks.0.norm3.weight", + "model.diffusion_model.input_blocks.9.0.op.bias": "blocks.64.conv.bias", + "model.diffusion_model.input_blocks.9.0.op.weight": "blocks.64.conv.weight", + "model.diffusion_model.label_emb.0.0.bias": "add_time_embedding.0.bias", + "model.diffusion_model.label_emb.0.0.weight": "add_time_embedding.0.weight", + "model.diffusion_model.label_emb.0.2.bias": "add_time_embedding.2.bias", + "model.diffusion_model.label_emb.0.2.weight": "add_time_embedding.2.weight", + "model.diffusion_model.middle_block.0.emb_layers.1.bias": "blocks.76.time_emb_proj.bias", + "model.diffusion_model.middle_block.0.emb_layers.1.weight": "blocks.76.time_emb_proj.weight", + "model.diffusion_model.middle_block.0.in_layers.0.bias": "blocks.76.norm1.bias", + "model.diffusion_model.middle_block.0.in_layers.0.weight": "blocks.76.norm1.weight", + "model.diffusion_model.middle_block.0.in_layers.2.bias": "blocks.76.conv1.bias", + "model.diffusion_model.middle_block.0.in_layers.2.weight": "blocks.76.conv1.weight", + "model.diffusion_model.middle_block.0.out_layers.0.bias": "blocks.76.norm2.bias", + "model.diffusion_model.middle_block.0.out_layers.0.weight": "blocks.76.norm2.weight", + 
"model.diffusion_model.middle_block.0.out_layers.3.bias": "blocks.76.conv2.bias", + "model.diffusion_model.middle_block.0.out_layers.3.weight": "blocks.76.conv2.weight", + "model.diffusion_model.middle_block.0.time_mixer.mix_factor": "blocks.79.mix_factor", + "model.diffusion_model.middle_block.0.time_stack.emb_layers.1.bias": "blocks.78.time_emb_proj.bias", + "model.diffusion_model.middle_block.0.time_stack.emb_layers.1.weight": "blocks.78.time_emb_proj.weight", + "model.diffusion_model.middle_block.0.time_stack.in_layers.0.bias": "blocks.78.norm1.bias", + "model.diffusion_model.middle_block.0.time_stack.in_layers.0.weight": "blocks.78.norm1.weight", + "model.diffusion_model.middle_block.0.time_stack.in_layers.2.bias": "blocks.78.conv1.bias", + "model.diffusion_model.middle_block.0.time_stack.in_layers.2.weight": "blocks.78.conv1.weight", + "model.diffusion_model.middle_block.0.time_stack.out_layers.0.bias": "blocks.78.norm2.bias", + "model.diffusion_model.middle_block.0.time_stack.out_layers.0.weight": "blocks.78.norm2.weight", + "model.diffusion_model.middle_block.0.time_stack.out_layers.3.bias": "blocks.78.conv2.bias", + "model.diffusion_model.middle_block.0.time_stack.out_layers.3.weight": "blocks.78.conv2.weight", + "model.diffusion_model.middle_block.1.norm.bias": "blocks.81.norm.bias", + "model.diffusion_model.middle_block.1.norm.weight": "blocks.81.norm.weight", + "model.diffusion_model.middle_block.1.proj_in.bias": "blocks.81.proj_in.bias", + "model.diffusion_model.middle_block.1.proj_in.weight": "blocks.81.proj_in.weight", + "model.diffusion_model.middle_block.1.proj_out.bias": "blocks.84.proj.bias", + "model.diffusion_model.middle_block.1.proj_out.weight": "blocks.84.proj.weight", + "model.diffusion_model.middle_block.1.time_mixer.mix_factor": "blocks.84.mix_factor", + "model.diffusion_model.middle_block.1.time_pos_embed.0.bias": "blocks.83.positional_embedding_proj.0.bias", + "model.diffusion_model.middle_block.1.time_pos_embed.0.weight": "blocks.83.positional_embedding_proj.0.weight", + "model.diffusion_model.middle_block.1.time_pos_embed.2.bias": "blocks.83.positional_embedding_proj.2.bias", + "model.diffusion_model.middle_block.1.time_pos_embed.2.weight": "blocks.83.positional_embedding_proj.2.weight", + "model.diffusion_model.middle_block.1.time_stack.0.attn1.to_k.weight": "blocks.83.attn1.to_k.weight", + "model.diffusion_model.middle_block.1.time_stack.0.attn1.to_out.0.bias": "blocks.83.attn1.to_out.bias", + "model.diffusion_model.middle_block.1.time_stack.0.attn1.to_out.0.weight": "blocks.83.attn1.to_out.weight", + "model.diffusion_model.middle_block.1.time_stack.0.attn1.to_q.weight": "blocks.83.attn1.to_q.weight", + "model.diffusion_model.middle_block.1.time_stack.0.attn1.to_v.weight": "blocks.83.attn1.to_v.weight", + "model.diffusion_model.middle_block.1.time_stack.0.attn2.to_k.weight": "blocks.83.attn2.to_k.weight", + "model.diffusion_model.middle_block.1.time_stack.0.attn2.to_out.0.bias": "blocks.83.attn2.to_out.bias", + "model.diffusion_model.middle_block.1.time_stack.0.attn2.to_out.0.weight": "blocks.83.attn2.to_out.weight", + "model.diffusion_model.middle_block.1.time_stack.0.attn2.to_q.weight": "blocks.83.attn2.to_q.weight", + "model.diffusion_model.middle_block.1.time_stack.0.attn2.to_v.weight": "blocks.83.attn2.to_v.weight", + "model.diffusion_model.middle_block.1.time_stack.0.ff.net.0.proj.bias": "blocks.83.act_fn_out.proj.bias", + "model.diffusion_model.middle_block.1.time_stack.0.ff.net.0.proj.weight": "blocks.83.act_fn_out.proj.weight", + 
"model.diffusion_model.middle_block.1.time_stack.0.ff.net.2.bias": "blocks.83.ff_out.bias", + "model.diffusion_model.middle_block.1.time_stack.0.ff.net.2.weight": "blocks.83.ff_out.weight", + "model.diffusion_model.middle_block.1.time_stack.0.ff_in.net.0.proj.bias": "blocks.83.act_fn_in.proj.bias", + "model.diffusion_model.middle_block.1.time_stack.0.ff_in.net.0.proj.weight": "blocks.83.act_fn_in.proj.weight", + "model.diffusion_model.middle_block.1.time_stack.0.ff_in.net.2.bias": "blocks.83.ff_in.bias", + "model.diffusion_model.middle_block.1.time_stack.0.ff_in.net.2.weight": "blocks.83.ff_in.weight", + "model.diffusion_model.middle_block.1.time_stack.0.norm1.bias": "blocks.83.norm1.bias", + "model.diffusion_model.middle_block.1.time_stack.0.norm1.weight": "blocks.83.norm1.weight", + "model.diffusion_model.middle_block.1.time_stack.0.norm2.bias": "blocks.83.norm2.bias", + "model.diffusion_model.middle_block.1.time_stack.0.norm2.weight": "blocks.83.norm2.weight", + "model.diffusion_model.middle_block.1.time_stack.0.norm3.bias": "blocks.83.norm_out.bias", + "model.diffusion_model.middle_block.1.time_stack.0.norm3.weight": "blocks.83.norm_out.weight", + "model.diffusion_model.middle_block.1.time_stack.0.norm_in.bias": "blocks.83.norm_in.bias", + "model.diffusion_model.middle_block.1.time_stack.0.norm_in.weight": "blocks.83.norm_in.weight", + "model.diffusion_model.middle_block.1.transformer_blocks.0.attn1.to_k.weight": "blocks.81.transformer_blocks.0.attn1.to_k.weight", + "model.diffusion_model.middle_block.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.81.transformer_blocks.0.attn1.to_out.bias", + "model.diffusion_model.middle_block.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.81.transformer_blocks.0.attn1.to_out.weight", + "model.diffusion_model.middle_block.1.transformer_blocks.0.attn1.to_q.weight": "blocks.81.transformer_blocks.0.attn1.to_q.weight", + "model.diffusion_model.middle_block.1.transformer_blocks.0.attn1.to_v.weight": "blocks.81.transformer_blocks.0.attn1.to_v.weight", + "model.diffusion_model.middle_block.1.transformer_blocks.0.attn2.to_k.weight": "blocks.81.transformer_blocks.0.attn2.to_k.weight", + "model.diffusion_model.middle_block.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.81.transformer_blocks.0.attn2.to_out.bias", + "model.diffusion_model.middle_block.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.81.transformer_blocks.0.attn2.to_out.weight", + "model.diffusion_model.middle_block.1.transformer_blocks.0.attn2.to_q.weight": "blocks.81.transformer_blocks.0.attn2.to_q.weight", + "model.diffusion_model.middle_block.1.transformer_blocks.0.attn2.to_v.weight": "blocks.81.transformer_blocks.0.attn2.to_v.weight", + "model.diffusion_model.middle_block.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.81.transformer_blocks.0.act_fn.proj.bias", + "model.diffusion_model.middle_block.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.81.transformer_blocks.0.act_fn.proj.weight", + "model.diffusion_model.middle_block.1.transformer_blocks.0.ff.net.2.bias": "blocks.81.transformer_blocks.0.ff.bias", + "model.diffusion_model.middle_block.1.transformer_blocks.0.ff.net.2.weight": "blocks.81.transformer_blocks.0.ff.weight", + "model.diffusion_model.middle_block.1.transformer_blocks.0.norm1.bias": "blocks.81.transformer_blocks.0.norm1.bias", + "model.diffusion_model.middle_block.1.transformer_blocks.0.norm1.weight": "blocks.81.transformer_blocks.0.norm1.weight", + "model.diffusion_model.middle_block.1.transformer_blocks.0.norm2.bias": 
"blocks.81.transformer_blocks.0.norm2.bias", + "model.diffusion_model.middle_block.1.transformer_blocks.0.norm2.weight": "blocks.81.transformer_blocks.0.norm2.weight", + "model.diffusion_model.middle_block.1.transformer_blocks.0.norm3.bias": "blocks.81.transformer_blocks.0.norm3.bias", + "model.diffusion_model.middle_block.1.transformer_blocks.0.norm3.weight": "blocks.81.transformer_blocks.0.norm3.weight", + "model.diffusion_model.middle_block.2.emb_layers.1.bias": "blocks.85.time_emb_proj.bias", + "model.diffusion_model.middle_block.2.emb_layers.1.weight": "blocks.85.time_emb_proj.weight", + "model.diffusion_model.middle_block.2.in_layers.0.bias": "blocks.85.norm1.bias", + "model.diffusion_model.middle_block.2.in_layers.0.weight": "blocks.85.norm1.weight", + "model.diffusion_model.middle_block.2.in_layers.2.bias": "blocks.85.conv1.bias", + "model.diffusion_model.middle_block.2.in_layers.2.weight": "blocks.85.conv1.weight", + "model.diffusion_model.middle_block.2.out_layers.0.bias": "blocks.85.norm2.bias", + "model.diffusion_model.middle_block.2.out_layers.0.weight": "blocks.85.norm2.weight", + "model.diffusion_model.middle_block.2.out_layers.3.bias": "blocks.85.conv2.bias", + "model.diffusion_model.middle_block.2.out_layers.3.weight": "blocks.85.conv2.weight", + "model.diffusion_model.middle_block.2.time_mixer.mix_factor": "blocks.88.mix_factor", + "model.diffusion_model.middle_block.2.time_stack.emb_layers.1.bias": "blocks.87.time_emb_proj.bias", + "model.diffusion_model.middle_block.2.time_stack.emb_layers.1.weight": "blocks.87.time_emb_proj.weight", + "model.diffusion_model.middle_block.2.time_stack.in_layers.0.bias": "blocks.87.norm1.bias", + "model.diffusion_model.middle_block.2.time_stack.in_layers.0.weight": "blocks.87.norm1.weight", + "model.diffusion_model.middle_block.2.time_stack.in_layers.2.bias": "blocks.87.conv1.bias", + "model.diffusion_model.middle_block.2.time_stack.in_layers.2.weight": "blocks.87.conv1.weight", + "model.diffusion_model.middle_block.2.time_stack.out_layers.0.bias": "blocks.87.norm2.bias", + "model.diffusion_model.middle_block.2.time_stack.out_layers.0.weight": "blocks.87.norm2.weight", + "model.diffusion_model.middle_block.2.time_stack.out_layers.3.bias": "blocks.87.conv2.bias", + "model.diffusion_model.middle_block.2.time_stack.out_layers.3.weight": "blocks.87.conv2.weight", + "model.diffusion_model.out.0.bias": "conv_norm_out.bias", + "model.diffusion_model.out.0.weight": "conv_norm_out.weight", + "model.diffusion_model.out.2.bias": "conv_out.bias", + "model.diffusion_model.out.2.weight": "conv_out.weight", + "model.diffusion_model.output_blocks.0.0.emb_layers.1.bias": "blocks.90.time_emb_proj.bias", + "model.diffusion_model.output_blocks.0.0.emb_layers.1.weight": "blocks.90.time_emb_proj.weight", + "model.diffusion_model.output_blocks.0.0.in_layers.0.bias": "blocks.90.norm1.bias", + "model.diffusion_model.output_blocks.0.0.in_layers.0.weight": "blocks.90.norm1.weight", + "model.diffusion_model.output_blocks.0.0.in_layers.2.bias": "blocks.90.conv1.bias", + "model.diffusion_model.output_blocks.0.0.in_layers.2.weight": "blocks.90.conv1.weight", + "model.diffusion_model.output_blocks.0.0.out_layers.0.bias": "blocks.90.norm2.bias", + "model.diffusion_model.output_blocks.0.0.out_layers.0.weight": "blocks.90.norm2.weight", + "model.diffusion_model.output_blocks.0.0.out_layers.3.bias": "blocks.90.conv2.bias", + "model.diffusion_model.output_blocks.0.0.out_layers.3.weight": "blocks.90.conv2.weight", + 
"model.diffusion_model.output_blocks.0.0.skip_connection.bias": "blocks.90.conv_shortcut.bias", + "model.diffusion_model.output_blocks.0.0.skip_connection.weight": "blocks.90.conv_shortcut.weight", + "model.diffusion_model.output_blocks.0.0.time_mixer.mix_factor": "blocks.93.mix_factor", + "model.diffusion_model.output_blocks.0.0.time_stack.emb_layers.1.bias": "blocks.92.time_emb_proj.bias", + "model.diffusion_model.output_blocks.0.0.time_stack.emb_layers.1.weight": "blocks.92.time_emb_proj.weight", + "model.diffusion_model.output_blocks.0.0.time_stack.in_layers.0.bias": "blocks.92.norm1.bias", + "model.diffusion_model.output_blocks.0.0.time_stack.in_layers.0.weight": "blocks.92.norm1.weight", + "model.diffusion_model.output_blocks.0.0.time_stack.in_layers.2.bias": "blocks.92.conv1.bias", + "model.diffusion_model.output_blocks.0.0.time_stack.in_layers.2.weight": "blocks.92.conv1.weight", + "model.diffusion_model.output_blocks.0.0.time_stack.out_layers.0.bias": "blocks.92.norm2.bias", + "model.diffusion_model.output_blocks.0.0.time_stack.out_layers.0.weight": "blocks.92.norm2.weight", + "model.diffusion_model.output_blocks.0.0.time_stack.out_layers.3.bias": "blocks.92.conv2.bias", + "model.diffusion_model.output_blocks.0.0.time_stack.out_layers.3.weight": "blocks.92.conv2.weight", + "model.diffusion_model.output_blocks.1.0.emb_layers.1.bias": "blocks.95.time_emb_proj.bias", + "model.diffusion_model.output_blocks.1.0.emb_layers.1.weight": "blocks.95.time_emb_proj.weight", + "model.diffusion_model.output_blocks.1.0.in_layers.0.bias": "blocks.95.norm1.bias", + "model.diffusion_model.output_blocks.1.0.in_layers.0.weight": "blocks.95.norm1.weight", + "model.diffusion_model.output_blocks.1.0.in_layers.2.bias": "blocks.95.conv1.bias", + "model.diffusion_model.output_blocks.1.0.in_layers.2.weight": "blocks.95.conv1.weight", + "model.diffusion_model.output_blocks.1.0.out_layers.0.bias": "blocks.95.norm2.bias", + "model.diffusion_model.output_blocks.1.0.out_layers.0.weight": "blocks.95.norm2.weight", + "model.diffusion_model.output_blocks.1.0.out_layers.3.bias": "blocks.95.conv2.bias", + "model.diffusion_model.output_blocks.1.0.out_layers.3.weight": "blocks.95.conv2.weight", + "model.diffusion_model.output_blocks.1.0.skip_connection.bias": "blocks.95.conv_shortcut.bias", + "model.diffusion_model.output_blocks.1.0.skip_connection.weight": "blocks.95.conv_shortcut.weight", + "model.diffusion_model.output_blocks.1.0.time_mixer.mix_factor": "blocks.98.mix_factor", + "model.diffusion_model.output_blocks.1.0.time_stack.emb_layers.1.bias": "blocks.97.time_emb_proj.bias", + "model.diffusion_model.output_blocks.1.0.time_stack.emb_layers.1.weight": "blocks.97.time_emb_proj.weight", + "model.diffusion_model.output_blocks.1.0.time_stack.in_layers.0.bias": "blocks.97.norm1.bias", + "model.diffusion_model.output_blocks.1.0.time_stack.in_layers.0.weight": "blocks.97.norm1.weight", + "model.diffusion_model.output_blocks.1.0.time_stack.in_layers.2.bias": "blocks.97.conv1.bias", + "model.diffusion_model.output_blocks.1.0.time_stack.in_layers.2.weight": "blocks.97.conv1.weight", + "model.diffusion_model.output_blocks.1.0.time_stack.out_layers.0.bias": "blocks.97.norm2.bias", + "model.diffusion_model.output_blocks.1.0.time_stack.out_layers.0.weight": "blocks.97.norm2.weight", + "model.diffusion_model.output_blocks.1.0.time_stack.out_layers.3.bias": "blocks.97.conv2.bias", + "model.diffusion_model.output_blocks.1.0.time_stack.out_layers.3.weight": "blocks.97.conv2.weight", + 
"model.diffusion_model.output_blocks.10.0.emb_layers.1.bias": "blocks.178.time_emb_proj.bias", + "model.diffusion_model.output_blocks.10.0.emb_layers.1.weight": "blocks.178.time_emb_proj.weight", + "model.diffusion_model.output_blocks.10.0.in_layers.0.bias": "blocks.178.norm1.bias", + "model.diffusion_model.output_blocks.10.0.in_layers.0.weight": "blocks.178.norm1.weight", + "model.diffusion_model.output_blocks.10.0.in_layers.2.bias": "blocks.178.conv1.bias", + "model.diffusion_model.output_blocks.10.0.in_layers.2.weight": "blocks.178.conv1.weight", + "model.diffusion_model.output_blocks.10.0.out_layers.0.bias": "blocks.178.norm2.bias", + "model.diffusion_model.output_blocks.10.0.out_layers.0.weight": "blocks.178.norm2.weight", + "model.diffusion_model.output_blocks.10.0.out_layers.3.bias": "blocks.178.conv2.bias", + "model.diffusion_model.output_blocks.10.0.out_layers.3.weight": "blocks.178.conv2.weight", + "model.diffusion_model.output_blocks.10.0.skip_connection.bias": "blocks.178.conv_shortcut.bias", + "model.diffusion_model.output_blocks.10.0.skip_connection.weight": "blocks.178.conv_shortcut.weight", + "model.diffusion_model.output_blocks.10.0.time_mixer.mix_factor": "blocks.181.mix_factor", + "model.diffusion_model.output_blocks.10.0.time_stack.emb_layers.1.bias": "blocks.180.time_emb_proj.bias", + "model.diffusion_model.output_blocks.10.0.time_stack.emb_layers.1.weight": "blocks.180.time_emb_proj.weight", + "model.diffusion_model.output_blocks.10.0.time_stack.in_layers.0.bias": "blocks.180.norm1.bias", + "model.diffusion_model.output_blocks.10.0.time_stack.in_layers.0.weight": "blocks.180.norm1.weight", + "model.diffusion_model.output_blocks.10.0.time_stack.in_layers.2.bias": "blocks.180.conv1.bias", + "model.diffusion_model.output_blocks.10.0.time_stack.in_layers.2.weight": "blocks.180.conv1.weight", + "model.diffusion_model.output_blocks.10.0.time_stack.out_layers.0.bias": "blocks.180.norm2.bias", + "model.diffusion_model.output_blocks.10.0.time_stack.out_layers.0.weight": "blocks.180.norm2.weight", + "model.diffusion_model.output_blocks.10.0.time_stack.out_layers.3.bias": "blocks.180.conv2.bias", + "model.diffusion_model.output_blocks.10.0.time_stack.out_layers.3.weight": "blocks.180.conv2.weight", + "model.diffusion_model.output_blocks.10.1.norm.bias": "blocks.183.norm.bias", + "model.diffusion_model.output_blocks.10.1.norm.weight": "blocks.183.norm.weight", + "model.diffusion_model.output_blocks.10.1.proj_in.bias": "blocks.183.proj_in.bias", + "model.diffusion_model.output_blocks.10.1.proj_in.weight": "blocks.183.proj_in.weight", + "model.diffusion_model.output_blocks.10.1.proj_out.bias": "blocks.186.proj.bias", + "model.diffusion_model.output_blocks.10.1.proj_out.weight": "blocks.186.proj.weight", + "model.diffusion_model.output_blocks.10.1.time_mixer.mix_factor": "blocks.186.mix_factor", + "model.diffusion_model.output_blocks.10.1.time_pos_embed.0.bias": "blocks.185.positional_embedding_proj.0.bias", + "model.diffusion_model.output_blocks.10.1.time_pos_embed.0.weight": "blocks.185.positional_embedding_proj.0.weight", + "model.diffusion_model.output_blocks.10.1.time_pos_embed.2.bias": "blocks.185.positional_embedding_proj.2.bias", + "model.diffusion_model.output_blocks.10.1.time_pos_embed.2.weight": "blocks.185.positional_embedding_proj.2.weight", + "model.diffusion_model.output_blocks.10.1.time_stack.0.attn1.to_k.weight": "blocks.185.attn1.to_k.weight", + "model.diffusion_model.output_blocks.10.1.time_stack.0.attn1.to_out.0.bias": "blocks.185.attn1.to_out.bias", + 
"model.diffusion_model.output_blocks.10.1.time_stack.0.attn1.to_out.0.weight": "blocks.185.attn1.to_out.weight", + "model.diffusion_model.output_blocks.10.1.time_stack.0.attn1.to_q.weight": "blocks.185.attn1.to_q.weight", + "model.diffusion_model.output_blocks.10.1.time_stack.0.attn1.to_v.weight": "blocks.185.attn1.to_v.weight", + "model.diffusion_model.output_blocks.10.1.time_stack.0.attn2.to_k.weight": "blocks.185.attn2.to_k.weight", + "model.diffusion_model.output_blocks.10.1.time_stack.0.attn2.to_out.0.bias": "blocks.185.attn2.to_out.bias", + "model.diffusion_model.output_blocks.10.1.time_stack.0.attn2.to_out.0.weight": "blocks.185.attn2.to_out.weight", + "model.diffusion_model.output_blocks.10.1.time_stack.0.attn2.to_q.weight": "blocks.185.attn2.to_q.weight", + "model.diffusion_model.output_blocks.10.1.time_stack.0.attn2.to_v.weight": "blocks.185.attn2.to_v.weight", + "model.diffusion_model.output_blocks.10.1.time_stack.0.ff.net.0.proj.bias": "blocks.185.act_fn_out.proj.bias", + "model.diffusion_model.output_blocks.10.1.time_stack.0.ff.net.0.proj.weight": "blocks.185.act_fn_out.proj.weight", + "model.diffusion_model.output_blocks.10.1.time_stack.0.ff.net.2.bias": "blocks.185.ff_out.bias", + "model.diffusion_model.output_blocks.10.1.time_stack.0.ff.net.2.weight": "blocks.185.ff_out.weight", + "model.diffusion_model.output_blocks.10.1.time_stack.0.ff_in.net.0.proj.bias": "blocks.185.act_fn_in.proj.bias", + "model.diffusion_model.output_blocks.10.1.time_stack.0.ff_in.net.0.proj.weight": "blocks.185.act_fn_in.proj.weight", + "model.diffusion_model.output_blocks.10.1.time_stack.0.ff_in.net.2.bias": "blocks.185.ff_in.bias", + "model.diffusion_model.output_blocks.10.1.time_stack.0.ff_in.net.2.weight": "blocks.185.ff_in.weight", + "model.diffusion_model.output_blocks.10.1.time_stack.0.norm1.bias": "blocks.185.norm1.bias", + "model.diffusion_model.output_blocks.10.1.time_stack.0.norm1.weight": "blocks.185.norm1.weight", + "model.diffusion_model.output_blocks.10.1.time_stack.0.norm2.bias": "blocks.185.norm2.bias", + "model.diffusion_model.output_blocks.10.1.time_stack.0.norm2.weight": "blocks.185.norm2.weight", + "model.diffusion_model.output_blocks.10.1.time_stack.0.norm3.bias": "blocks.185.norm_out.bias", + "model.diffusion_model.output_blocks.10.1.time_stack.0.norm3.weight": "blocks.185.norm_out.weight", + "model.diffusion_model.output_blocks.10.1.time_stack.0.norm_in.bias": "blocks.185.norm_in.bias", + "model.diffusion_model.output_blocks.10.1.time_stack.0.norm_in.weight": "blocks.185.norm_in.weight", + "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn1.to_k.weight": "blocks.183.transformer_blocks.0.attn1.to_k.weight", + "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.183.transformer_blocks.0.attn1.to_out.bias", + "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.183.transformer_blocks.0.attn1.to_out.weight", + "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn1.to_q.weight": "blocks.183.transformer_blocks.0.attn1.to_q.weight", + "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn1.to_v.weight": "blocks.183.transformer_blocks.0.attn1.to_v.weight", + "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn2.to_k.weight": "blocks.183.transformer_blocks.0.attn2.to_k.weight", + "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.183.transformer_blocks.0.attn2.to_out.bias", + 
"model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.183.transformer_blocks.0.attn2.to_out.weight", + "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn2.to_q.weight": "blocks.183.transformer_blocks.0.attn2.to_q.weight", + "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn2.to_v.weight": "blocks.183.transformer_blocks.0.attn2.to_v.weight", + "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.183.transformer_blocks.0.act_fn.proj.bias", + "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.183.transformer_blocks.0.act_fn.proj.weight", + "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.ff.net.2.bias": "blocks.183.transformer_blocks.0.ff.bias", + "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.ff.net.2.weight": "blocks.183.transformer_blocks.0.ff.weight", + "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.norm1.bias": "blocks.183.transformer_blocks.0.norm1.bias", + "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.norm1.weight": "blocks.183.transformer_blocks.0.norm1.weight", + "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.norm2.bias": "blocks.183.transformer_blocks.0.norm2.bias", + "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.norm2.weight": "blocks.183.transformer_blocks.0.norm2.weight", + "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.norm3.bias": "blocks.183.transformer_blocks.0.norm3.bias", + "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.norm3.weight": "blocks.183.transformer_blocks.0.norm3.weight", + "model.diffusion_model.output_blocks.11.0.emb_layers.1.bias": "blocks.188.time_emb_proj.bias", + "model.diffusion_model.output_blocks.11.0.emb_layers.1.weight": "blocks.188.time_emb_proj.weight", + "model.diffusion_model.output_blocks.11.0.in_layers.0.bias": "blocks.188.norm1.bias", + "model.diffusion_model.output_blocks.11.0.in_layers.0.weight": "blocks.188.norm1.weight", + "model.diffusion_model.output_blocks.11.0.in_layers.2.bias": "blocks.188.conv1.bias", + "model.diffusion_model.output_blocks.11.0.in_layers.2.weight": "blocks.188.conv1.weight", + "model.diffusion_model.output_blocks.11.0.out_layers.0.bias": "blocks.188.norm2.bias", + "model.diffusion_model.output_blocks.11.0.out_layers.0.weight": "blocks.188.norm2.weight", + "model.diffusion_model.output_blocks.11.0.out_layers.3.bias": "blocks.188.conv2.bias", + "model.diffusion_model.output_blocks.11.0.out_layers.3.weight": "blocks.188.conv2.weight", + "model.diffusion_model.output_blocks.11.0.skip_connection.bias": "blocks.188.conv_shortcut.bias", + "model.diffusion_model.output_blocks.11.0.skip_connection.weight": "blocks.188.conv_shortcut.weight", + "model.diffusion_model.output_blocks.11.0.time_mixer.mix_factor": "blocks.191.mix_factor", + "model.diffusion_model.output_blocks.11.0.time_stack.emb_layers.1.bias": "blocks.190.time_emb_proj.bias", + "model.diffusion_model.output_blocks.11.0.time_stack.emb_layers.1.weight": "blocks.190.time_emb_proj.weight", + "model.diffusion_model.output_blocks.11.0.time_stack.in_layers.0.bias": "blocks.190.norm1.bias", + "model.diffusion_model.output_blocks.11.0.time_stack.in_layers.0.weight": "blocks.190.norm1.weight", + "model.diffusion_model.output_blocks.11.0.time_stack.in_layers.2.bias": "blocks.190.conv1.bias", + "model.diffusion_model.output_blocks.11.0.time_stack.in_layers.2.weight": "blocks.190.conv1.weight", + 
"model.diffusion_model.output_blocks.11.0.time_stack.out_layers.0.bias": "blocks.190.norm2.bias", + "model.diffusion_model.output_blocks.11.0.time_stack.out_layers.0.weight": "blocks.190.norm2.weight", + "model.diffusion_model.output_blocks.11.0.time_stack.out_layers.3.bias": "blocks.190.conv2.bias", + "model.diffusion_model.output_blocks.11.0.time_stack.out_layers.3.weight": "blocks.190.conv2.weight", + "model.diffusion_model.output_blocks.11.1.norm.bias": "blocks.193.norm.bias", + "model.diffusion_model.output_blocks.11.1.norm.weight": "blocks.193.norm.weight", + "model.diffusion_model.output_blocks.11.1.proj_in.bias": "blocks.193.proj_in.bias", + "model.diffusion_model.output_blocks.11.1.proj_in.weight": "blocks.193.proj_in.weight", + "model.diffusion_model.output_blocks.11.1.proj_out.bias": "blocks.196.proj.bias", + "model.diffusion_model.output_blocks.11.1.proj_out.weight": "blocks.196.proj.weight", + "model.diffusion_model.output_blocks.11.1.time_mixer.mix_factor": "blocks.196.mix_factor", + "model.diffusion_model.output_blocks.11.1.time_pos_embed.0.bias": "blocks.195.positional_embedding_proj.0.bias", + "model.diffusion_model.output_blocks.11.1.time_pos_embed.0.weight": "blocks.195.positional_embedding_proj.0.weight", + "model.diffusion_model.output_blocks.11.1.time_pos_embed.2.bias": "blocks.195.positional_embedding_proj.2.bias", + "model.diffusion_model.output_blocks.11.1.time_pos_embed.2.weight": "blocks.195.positional_embedding_proj.2.weight", + "model.diffusion_model.output_blocks.11.1.time_stack.0.attn1.to_k.weight": "blocks.195.attn1.to_k.weight", + "model.diffusion_model.output_blocks.11.1.time_stack.0.attn1.to_out.0.bias": "blocks.195.attn1.to_out.bias", + "model.diffusion_model.output_blocks.11.1.time_stack.0.attn1.to_out.0.weight": "blocks.195.attn1.to_out.weight", + "model.diffusion_model.output_blocks.11.1.time_stack.0.attn1.to_q.weight": "blocks.195.attn1.to_q.weight", + "model.diffusion_model.output_blocks.11.1.time_stack.0.attn1.to_v.weight": "blocks.195.attn1.to_v.weight", + "model.diffusion_model.output_blocks.11.1.time_stack.0.attn2.to_k.weight": "blocks.195.attn2.to_k.weight", + "model.diffusion_model.output_blocks.11.1.time_stack.0.attn2.to_out.0.bias": "blocks.195.attn2.to_out.bias", + "model.diffusion_model.output_blocks.11.1.time_stack.0.attn2.to_out.0.weight": "blocks.195.attn2.to_out.weight", + "model.diffusion_model.output_blocks.11.1.time_stack.0.attn2.to_q.weight": "blocks.195.attn2.to_q.weight", + "model.diffusion_model.output_blocks.11.1.time_stack.0.attn2.to_v.weight": "blocks.195.attn2.to_v.weight", + "model.diffusion_model.output_blocks.11.1.time_stack.0.ff.net.0.proj.bias": "blocks.195.act_fn_out.proj.bias", + "model.diffusion_model.output_blocks.11.1.time_stack.0.ff.net.0.proj.weight": "blocks.195.act_fn_out.proj.weight", + "model.diffusion_model.output_blocks.11.1.time_stack.0.ff.net.2.bias": "blocks.195.ff_out.bias", + "model.diffusion_model.output_blocks.11.1.time_stack.0.ff.net.2.weight": "blocks.195.ff_out.weight", + "model.diffusion_model.output_blocks.11.1.time_stack.0.ff_in.net.0.proj.bias": "blocks.195.act_fn_in.proj.bias", + "model.diffusion_model.output_blocks.11.1.time_stack.0.ff_in.net.0.proj.weight": "blocks.195.act_fn_in.proj.weight", + "model.diffusion_model.output_blocks.11.1.time_stack.0.ff_in.net.2.bias": "blocks.195.ff_in.bias", + "model.diffusion_model.output_blocks.11.1.time_stack.0.ff_in.net.2.weight": "blocks.195.ff_in.weight", + "model.diffusion_model.output_blocks.11.1.time_stack.0.norm1.bias": "blocks.195.norm1.bias", + 
"model.diffusion_model.output_blocks.11.1.time_stack.0.norm1.weight": "blocks.195.norm1.weight", + "model.diffusion_model.output_blocks.11.1.time_stack.0.norm2.bias": "blocks.195.norm2.bias", + "model.diffusion_model.output_blocks.11.1.time_stack.0.norm2.weight": "blocks.195.norm2.weight", + "model.diffusion_model.output_blocks.11.1.time_stack.0.norm3.bias": "blocks.195.norm_out.bias", + "model.diffusion_model.output_blocks.11.1.time_stack.0.norm3.weight": "blocks.195.norm_out.weight", + "model.diffusion_model.output_blocks.11.1.time_stack.0.norm_in.bias": "blocks.195.norm_in.bias", + "model.diffusion_model.output_blocks.11.1.time_stack.0.norm_in.weight": "blocks.195.norm_in.weight", + "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn1.to_k.weight": "blocks.193.transformer_blocks.0.attn1.to_k.weight", + "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.193.transformer_blocks.0.attn1.to_out.bias", + "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.193.transformer_blocks.0.attn1.to_out.weight", + "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn1.to_q.weight": "blocks.193.transformer_blocks.0.attn1.to_q.weight", + "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn1.to_v.weight": "blocks.193.transformer_blocks.0.attn1.to_v.weight", + "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn2.to_k.weight": "blocks.193.transformer_blocks.0.attn2.to_k.weight", + "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.193.transformer_blocks.0.attn2.to_out.bias", + "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.193.transformer_blocks.0.attn2.to_out.weight", + "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn2.to_q.weight": "blocks.193.transformer_blocks.0.attn2.to_q.weight", + "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn2.to_v.weight": "blocks.193.transformer_blocks.0.attn2.to_v.weight", + "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.193.transformer_blocks.0.act_fn.proj.bias", + "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.193.transformer_blocks.0.act_fn.proj.weight", + "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.ff.net.2.bias": "blocks.193.transformer_blocks.0.ff.bias", + "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.ff.net.2.weight": "blocks.193.transformer_blocks.0.ff.weight", + "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.norm1.bias": "blocks.193.transformer_blocks.0.norm1.bias", + "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.norm1.weight": "blocks.193.transformer_blocks.0.norm1.weight", + "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.norm2.bias": "blocks.193.transformer_blocks.0.norm2.bias", + "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.norm2.weight": "blocks.193.transformer_blocks.0.norm2.weight", + "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.norm3.bias": "blocks.193.transformer_blocks.0.norm3.bias", + "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.norm3.weight": "blocks.193.transformer_blocks.0.norm3.weight", + "model.diffusion_model.output_blocks.2.0.emb_layers.1.bias": "blocks.100.time_emb_proj.bias", + "model.diffusion_model.output_blocks.2.0.emb_layers.1.weight": 
"blocks.100.time_emb_proj.weight", + "model.diffusion_model.output_blocks.2.0.in_layers.0.bias": "blocks.100.norm1.bias", + "model.diffusion_model.output_blocks.2.0.in_layers.0.weight": "blocks.100.norm1.weight", + "model.diffusion_model.output_blocks.2.0.in_layers.2.bias": "blocks.100.conv1.bias", + "model.diffusion_model.output_blocks.2.0.in_layers.2.weight": "blocks.100.conv1.weight", + "model.diffusion_model.output_blocks.2.0.out_layers.0.bias": "blocks.100.norm2.bias", + "model.diffusion_model.output_blocks.2.0.out_layers.0.weight": "blocks.100.norm2.weight", + "model.diffusion_model.output_blocks.2.0.out_layers.3.bias": "blocks.100.conv2.bias", + "model.diffusion_model.output_blocks.2.0.out_layers.3.weight": "blocks.100.conv2.weight", + "model.diffusion_model.output_blocks.2.0.skip_connection.bias": "blocks.100.conv_shortcut.bias", + "model.diffusion_model.output_blocks.2.0.skip_connection.weight": "blocks.100.conv_shortcut.weight", + "model.diffusion_model.output_blocks.2.0.time_mixer.mix_factor": "blocks.103.mix_factor", + "model.diffusion_model.output_blocks.2.0.time_stack.emb_layers.1.bias": "blocks.102.time_emb_proj.bias", + "model.diffusion_model.output_blocks.2.0.time_stack.emb_layers.1.weight": "blocks.102.time_emb_proj.weight", + "model.diffusion_model.output_blocks.2.0.time_stack.in_layers.0.bias": "blocks.102.norm1.bias", + "model.diffusion_model.output_blocks.2.0.time_stack.in_layers.0.weight": "blocks.102.norm1.weight", + "model.diffusion_model.output_blocks.2.0.time_stack.in_layers.2.bias": "blocks.102.conv1.bias", + "model.diffusion_model.output_blocks.2.0.time_stack.in_layers.2.weight": "blocks.102.conv1.weight", + "model.diffusion_model.output_blocks.2.0.time_stack.out_layers.0.bias": "blocks.102.norm2.bias", + "model.diffusion_model.output_blocks.2.0.time_stack.out_layers.0.weight": "blocks.102.norm2.weight", + "model.diffusion_model.output_blocks.2.0.time_stack.out_layers.3.bias": "blocks.102.conv2.bias", + "model.diffusion_model.output_blocks.2.0.time_stack.out_layers.3.weight": "blocks.102.conv2.weight", + "model.diffusion_model.output_blocks.2.1.conv.bias": "blocks.104.conv.bias", + "model.diffusion_model.output_blocks.2.1.conv.weight": "blocks.104.conv.weight", + "model.diffusion_model.output_blocks.3.0.emb_layers.1.bias": "blocks.106.time_emb_proj.bias", + "model.diffusion_model.output_blocks.3.0.emb_layers.1.weight": "blocks.106.time_emb_proj.weight", + "model.diffusion_model.output_blocks.3.0.in_layers.0.bias": "blocks.106.norm1.bias", + "model.diffusion_model.output_blocks.3.0.in_layers.0.weight": "blocks.106.norm1.weight", + "model.diffusion_model.output_blocks.3.0.in_layers.2.bias": "blocks.106.conv1.bias", + "model.diffusion_model.output_blocks.3.0.in_layers.2.weight": "blocks.106.conv1.weight", + "model.diffusion_model.output_blocks.3.0.out_layers.0.bias": "blocks.106.norm2.bias", + "model.diffusion_model.output_blocks.3.0.out_layers.0.weight": "blocks.106.norm2.weight", + "model.diffusion_model.output_blocks.3.0.out_layers.3.bias": "blocks.106.conv2.bias", + "model.diffusion_model.output_blocks.3.0.out_layers.3.weight": "blocks.106.conv2.weight", + "model.diffusion_model.output_blocks.3.0.skip_connection.bias": "blocks.106.conv_shortcut.bias", + "model.diffusion_model.output_blocks.3.0.skip_connection.weight": "blocks.106.conv_shortcut.weight", + "model.diffusion_model.output_blocks.3.0.time_mixer.mix_factor": "blocks.109.mix_factor", + "model.diffusion_model.output_blocks.3.0.time_stack.emb_layers.1.bias": "blocks.108.time_emb_proj.bias", + 
"model.diffusion_model.output_blocks.3.0.time_stack.emb_layers.1.weight": "blocks.108.time_emb_proj.weight", + "model.diffusion_model.output_blocks.3.0.time_stack.in_layers.0.bias": "blocks.108.norm1.bias", + "model.diffusion_model.output_blocks.3.0.time_stack.in_layers.0.weight": "blocks.108.norm1.weight", + "model.diffusion_model.output_blocks.3.0.time_stack.in_layers.2.bias": "blocks.108.conv1.bias", + "model.diffusion_model.output_blocks.3.0.time_stack.in_layers.2.weight": "blocks.108.conv1.weight", + "model.diffusion_model.output_blocks.3.0.time_stack.out_layers.0.bias": "blocks.108.norm2.bias", + "model.diffusion_model.output_blocks.3.0.time_stack.out_layers.0.weight": "blocks.108.norm2.weight", + "model.diffusion_model.output_blocks.3.0.time_stack.out_layers.3.bias": "blocks.108.conv2.bias", + "model.diffusion_model.output_blocks.3.0.time_stack.out_layers.3.weight": "blocks.108.conv2.weight", + "model.diffusion_model.output_blocks.3.1.norm.bias": "blocks.111.norm.bias", + "model.diffusion_model.output_blocks.3.1.norm.weight": "blocks.111.norm.weight", + "model.diffusion_model.output_blocks.3.1.proj_in.bias": "blocks.111.proj_in.bias", + "model.diffusion_model.output_blocks.3.1.proj_in.weight": "blocks.111.proj_in.weight", + "model.diffusion_model.output_blocks.3.1.proj_out.bias": "blocks.114.proj.bias", + "model.diffusion_model.output_blocks.3.1.proj_out.weight": "blocks.114.proj.weight", + "model.diffusion_model.output_blocks.3.1.time_mixer.mix_factor": "blocks.114.mix_factor", + "model.diffusion_model.output_blocks.3.1.time_pos_embed.0.bias": "blocks.113.positional_embedding_proj.0.bias", + "model.diffusion_model.output_blocks.3.1.time_pos_embed.0.weight": "blocks.113.positional_embedding_proj.0.weight", + "model.diffusion_model.output_blocks.3.1.time_pos_embed.2.bias": "blocks.113.positional_embedding_proj.2.bias", + "model.diffusion_model.output_blocks.3.1.time_pos_embed.2.weight": "blocks.113.positional_embedding_proj.2.weight", + "model.diffusion_model.output_blocks.3.1.time_stack.0.attn1.to_k.weight": "blocks.113.attn1.to_k.weight", + "model.diffusion_model.output_blocks.3.1.time_stack.0.attn1.to_out.0.bias": "blocks.113.attn1.to_out.bias", + "model.diffusion_model.output_blocks.3.1.time_stack.0.attn1.to_out.0.weight": "blocks.113.attn1.to_out.weight", + "model.diffusion_model.output_blocks.3.1.time_stack.0.attn1.to_q.weight": "blocks.113.attn1.to_q.weight", + "model.diffusion_model.output_blocks.3.1.time_stack.0.attn1.to_v.weight": "blocks.113.attn1.to_v.weight", + "model.diffusion_model.output_blocks.3.1.time_stack.0.attn2.to_k.weight": "blocks.113.attn2.to_k.weight", + "model.diffusion_model.output_blocks.3.1.time_stack.0.attn2.to_out.0.bias": "blocks.113.attn2.to_out.bias", + "model.diffusion_model.output_blocks.3.1.time_stack.0.attn2.to_out.0.weight": "blocks.113.attn2.to_out.weight", + "model.diffusion_model.output_blocks.3.1.time_stack.0.attn2.to_q.weight": "blocks.113.attn2.to_q.weight", + "model.diffusion_model.output_blocks.3.1.time_stack.0.attn2.to_v.weight": "blocks.113.attn2.to_v.weight", + "model.diffusion_model.output_blocks.3.1.time_stack.0.ff.net.0.proj.bias": "blocks.113.act_fn_out.proj.bias", + "model.diffusion_model.output_blocks.3.1.time_stack.0.ff.net.0.proj.weight": "blocks.113.act_fn_out.proj.weight", + "model.diffusion_model.output_blocks.3.1.time_stack.0.ff.net.2.bias": "blocks.113.ff_out.bias", + "model.diffusion_model.output_blocks.3.1.time_stack.0.ff.net.2.weight": "blocks.113.ff_out.weight", + 
"model.diffusion_model.output_blocks.3.1.time_stack.0.ff_in.net.0.proj.bias": "blocks.113.act_fn_in.proj.bias", + "model.diffusion_model.output_blocks.3.1.time_stack.0.ff_in.net.0.proj.weight": "blocks.113.act_fn_in.proj.weight", + "model.diffusion_model.output_blocks.3.1.time_stack.0.ff_in.net.2.bias": "blocks.113.ff_in.bias", + "model.diffusion_model.output_blocks.3.1.time_stack.0.ff_in.net.2.weight": "blocks.113.ff_in.weight", + "model.diffusion_model.output_blocks.3.1.time_stack.0.norm1.bias": "blocks.113.norm1.bias", + "model.diffusion_model.output_blocks.3.1.time_stack.0.norm1.weight": "blocks.113.norm1.weight", + "model.diffusion_model.output_blocks.3.1.time_stack.0.norm2.bias": "blocks.113.norm2.bias", + "model.diffusion_model.output_blocks.3.1.time_stack.0.norm2.weight": "blocks.113.norm2.weight", + "model.diffusion_model.output_blocks.3.1.time_stack.0.norm3.bias": "blocks.113.norm_out.bias", + "model.diffusion_model.output_blocks.3.1.time_stack.0.norm3.weight": "blocks.113.norm_out.weight", + "model.diffusion_model.output_blocks.3.1.time_stack.0.norm_in.bias": "blocks.113.norm_in.bias", + "model.diffusion_model.output_blocks.3.1.time_stack.0.norm_in.weight": "blocks.113.norm_in.weight", + "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn1.to_k.weight": "blocks.111.transformer_blocks.0.attn1.to_k.weight", + "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.111.transformer_blocks.0.attn1.to_out.bias", + "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.111.transformer_blocks.0.attn1.to_out.weight", + "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn1.to_q.weight": "blocks.111.transformer_blocks.0.attn1.to_q.weight", + "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn1.to_v.weight": "blocks.111.transformer_blocks.0.attn1.to_v.weight", + "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn2.to_k.weight": "blocks.111.transformer_blocks.0.attn2.to_k.weight", + "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.111.transformer_blocks.0.attn2.to_out.bias", + "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.111.transformer_blocks.0.attn2.to_out.weight", + "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn2.to_q.weight": "blocks.111.transformer_blocks.0.attn2.to_q.weight", + "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn2.to_v.weight": "blocks.111.transformer_blocks.0.attn2.to_v.weight", + "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.111.transformer_blocks.0.act_fn.proj.bias", + "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.111.transformer_blocks.0.act_fn.proj.weight", + "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.ff.net.2.bias": "blocks.111.transformer_blocks.0.ff.bias", + "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.ff.net.2.weight": "blocks.111.transformer_blocks.0.ff.weight", + "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.norm1.bias": "blocks.111.transformer_blocks.0.norm1.bias", + "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.norm1.weight": "blocks.111.transformer_blocks.0.norm1.weight", + "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.norm2.bias": "blocks.111.transformer_blocks.0.norm2.bias", + 
"model.diffusion_model.output_blocks.3.1.transformer_blocks.0.norm2.weight": "blocks.111.transformer_blocks.0.norm2.weight", + "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.norm3.bias": "blocks.111.transformer_blocks.0.norm3.bias", + "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.norm3.weight": "blocks.111.transformer_blocks.0.norm3.weight", + "model.diffusion_model.output_blocks.4.0.emb_layers.1.bias": "blocks.116.time_emb_proj.bias", + "model.diffusion_model.output_blocks.4.0.emb_layers.1.weight": "blocks.116.time_emb_proj.weight", + "model.diffusion_model.output_blocks.4.0.in_layers.0.bias": "blocks.116.norm1.bias", + "model.diffusion_model.output_blocks.4.0.in_layers.0.weight": "blocks.116.norm1.weight", + "model.diffusion_model.output_blocks.4.0.in_layers.2.bias": "blocks.116.conv1.bias", + "model.diffusion_model.output_blocks.4.0.in_layers.2.weight": "blocks.116.conv1.weight", + "model.diffusion_model.output_blocks.4.0.out_layers.0.bias": "blocks.116.norm2.bias", + "model.diffusion_model.output_blocks.4.0.out_layers.0.weight": "blocks.116.norm2.weight", + "model.diffusion_model.output_blocks.4.0.out_layers.3.bias": "blocks.116.conv2.bias", + "model.diffusion_model.output_blocks.4.0.out_layers.3.weight": "blocks.116.conv2.weight", + "model.diffusion_model.output_blocks.4.0.skip_connection.bias": "blocks.116.conv_shortcut.bias", + "model.diffusion_model.output_blocks.4.0.skip_connection.weight": "blocks.116.conv_shortcut.weight", + "model.diffusion_model.output_blocks.4.0.time_mixer.mix_factor": "blocks.119.mix_factor", + "model.diffusion_model.output_blocks.4.0.time_stack.emb_layers.1.bias": "blocks.118.time_emb_proj.bias", + "model.diffusion_model.output_blocks.4.0.time_stack.emb_layers.1.weight": "blocks.118.time_emb_proj.weight", + "model.diffusion_model.output_blocks.4.0.time_stack.in_layers.0.bias": "blocks.118.norm1.bias", + "model.diffusion_model.output_blocks.4.0.time_stack.in_layers.0.weight": "blocks.118.norm1.weight", + "model.diffusion_model.output_blocks.4.0.time_stack.in_layers.2.bias": "blocks.118.conv1.bias", + "model.diffusion_model.output_blocks.4.0.time_stack.in_layers.2.weight": "blocks.118.conv1.weight", + "model.diffusion_model.output_blocks.4.0.time_stack.out_layers.0.bias": "blocks.118.norm2.bias", + "model.diffusion_model.output_blocks.4.0.time_stack.out_layers.0.weight": "blocks.118.norm2.weight", + "model.diffusion_model.output_blocks.4.0.time_stack.out_layers.3.bias": "blocks.118.conv2.bias", + "model.diffusion_model.output_blocks.4.0.time_stack.out_layers.3.weight": "blocks.118.conv2.weight", + "model.diffusion_model.output_blocks.4.1.norm.bias": "blocks.121.norm.bias", + "model.diffusion_model.output_blocks.4.1.norm.weight": "blocks.121.norm.weight", + "model.diffusion_model.output_blocks.4.1.proj_in.bias": "blocks.121.proj_in.bias", + "model.diffusion_model.output_blocks.4.1.proj_in.weight": "blocks.121.proj_in.weight", + "model.diffusion_model.output_blocks.4.1.proj_out.bias": "blocks.124.proj.bias", + "model.diffusion_model.output_blocks.4.1.proj_out.weight": "blocks.124.proj.weight", + "model.diffusion_model.output_blocks.4.1.time_mixer.mix_factor": "blocks.124.mix_factor", + "model.diffusion_model.output_blocks.4.1.time_pos_embed.0.bias": "blocks.123.positional_embedding_proj.0.bias", + "model.diffusion_model.output_blocks.4.1.time_pos_embed.0.weight": "blocks.123.positional_embedding_proj.0.weight", + "model.diffusion_model.output_blocks.4.1.time_pos_embed.2.bias": "blocks.123.positional_embedding_proj.2.bias", + 
"model.diffusion_model.output_blocks.4.1.time_pos_embed.2.weight": "blocks.123.positional_embedding_proj.2.weight", + "model.diffusion_model.output_blocks.4.1.time_stack.0.attn1.to_k.weight": "blocks.123.attn1.to_k.weight", + "model.diffusion_model.output_blocks.4.1.time_stack.0.attn1.to_out.0.bias": "blocks.123.attn1.to_out.bias", + "model.diffusion_model.output_blocks.4.1.time_stack.0.attn1.to_out.0.weight": "blocks.123.attn1.to_out.weight", + "model.diffusion_model.output_blocks.4.1.time_stack.0.attn1.to_q.weight": "blocks.123.attn1.to_q.weight", + "model.diffusion_model.output_blocks.4.1.time_stack.0.attn1.to_v.weight": "blocks.123.attn1.to_v.weight", + "model.diffusion_model.output_blocks.4.1.time_stack.0.attn2.to_k.weight": "blocks.123.attn2.to_k.weight", + "model.diffusion_model.output_blocks.4.1.time_stack.0.attn2.to_out.0.bias": "blocks.123.attn2.to_out.bias", + "model.diffusion_model.output_blocks.4.1.time_stack.0.attn2.to_out.0.weight": "blocks.123.attn2.to_out.weight", + "model.diffusion_model.output_blocks.4.1.time_stack.0.attn2.to_q.weight": "blocks.123.attn2.to_q.weight", + "model.diffusion_model.output_blocks.4.1.time_stack.0.attn2.to_v.weight": "blocks.123.attn2.to_v.weight", + "model.diffusion_model.output_blocks.4.1.time_stack.0.ff.net.0.proj.bias": "blocks.123.act_fn_out.proj.bias", + "model.diffusion_model.output_blocks.4.1.time_stack.0.ff.net.0.proj.weight": "blocks.123.act_fn_out.proj.weight", + "model.diffusion_model.output_blocks.4.1.time_stack.0.ff.net.2.bias": "blocks.123.ff_out.bias", + "model.diffusion_model.output_blocks.4.1.time_stack.0.ff.net.2.weight": "blocks.123.ff_out.weight", + "model.diffusion_model.output_blocks.4.1.time_stack.0.ff_in.net.0.proj.bias": "blocks.123.act_fn_in.proj.bias", + "model.diffusion_model.output_blocks.4.1.time_stack.0.ff_in.net.0.proj.weight": "blocks.123.act_fn_in.proj.weight", + "model.diffusion_model.output_blocks.4.1.time_stack.0.ff_in.net.2.bias": "blocks.123.ff_in.bias", + "model.diffusion_model.output_blocks.4.1.time_stack.0.ff_in.net.2.weight": "blocks.123.ff_in.weight", + "model.diffusion_model.output_blocks.4.1.time_stack.0.norm1.bias": "blocks.123.norm1.bias", + "model.diffusion_model.output_blocks.4.1.time_stack.0.norm1.weight": "blocks.123.norm1.weight", + "model.diffusion_model.output_blocks.4.1.time_stack.0.norm2.bias": "blocks.123.norm2.bias", + "model.diffusion_model.output_blocks.4.1.time_stack.0.norm2.weight": "blocks.123.norm2.weight", + "model.diffusion_model.output_blocks.4.1.time_stack.0.norm3.bias": "blocks.123.norm_out.bias", + "model.diffusion_model.output_blocks.4.1.time_stack.0.norm3.weight": "blocks.123.norm_out.weight", + "model.diffusion_model.output_blocks.4.1.time_stack.0.norm_in.bias": "blocks.123.norm_in.bias", + "model.diffusion_model.output_blocks.4.1.time_stack.0.norm_in.weight": "blocks.123.norm_in.weight", + "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn1.to_k.weight": "blocks.121.transformer_blocks.0.attn1.to_k.weight", + "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.121.transformer_blocks.0.attn1.to_out.bias", + "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.121.transformer_blocks.0.attn1.to_out.weight", + "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn1.to_q.weight": "blocks.121.transformer_blocks.0.attn1.to_q.weight", + "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn1.to_v.weight": "blocks.121.transformer_blocks.0.attn1.to_v.weight", + 
"model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn2.to_k.weight": "blocks.121.transformer_blocks.0.attn2.to_k.weight", + "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.121.transformer_blocks.0.attn2.to_out.bias", + "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.121.transformer_blocks.0.attn2.to_out.weight", + "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn2.to_q.weight": "blocks.121.transformer_blocks.0.attn2.to_q.weight", + "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn2.to_v.weight": "blocks.121.transformer_blocks.0.attn2.to_v.weight", + "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.121.transformer_blocks.0.act_fn.proj.bias", + "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.121.transformer_blocks.0.act_fn.proj.weight", + "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.ff.net.2.bias": "blocks.121.transformer_blocks.0.ff.bias", + "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.ff.net.2.weight": "blocks.121.transformer_blocks.0.ff.weight", + "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.norm1.bias": "blocks.121.transformer_blocks.0.norm1.bias", + "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.norm1.weight": "blocks.121.transformer_blocks.0.norm1.weight", + "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.norm2.bias": "blocks.121.transformer_blocks.0.norm2.bias", + "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.norm2.weight": "blocks.121.transformer_blocks.0.norm2.weight", + "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.norm3.bias": "blocks.121.transformer_blocks.0.norm3.bias", + "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.norm3.weight": "blocks.121.transformer_blocks.0.norm3.weight", + "model.diffusion_model.output_blocks.5.0.emb_layers.1.bias": "blocks.126.time_emb_proj.bias", + "model.diffusion_model.output_blocks.5.0.emb_layers.1.weight": "blocks.126.time_emb_proj.weight", + "model.diffusion_model.output_blocks.5.0.in_layers.0.bias": "blocks.126.norm1.bias", + "model.diffusion_model.output_blocks.5.0.in_layers.0.weight": "blocks.126.norm1.weight", + "model.diffusion_model.output_blocks.5.0.in_layers.2.bias": "blocks.126.conv1.bias", + "model.diffusion_model.output_blocks.5.0.in_layers.2.weight": "blocks.126.conv1.weight", + "model.diffusion_model.output_blocks.5.0.out_layers.0.bias": "blocks.126.norm2.bias", + "model.diffusion_model.output_blocks.5.0.out_layers.0.weight": "blocks.126.norm2.weight", + "model.diffusion_model.output_blocks.5.0.out_layers.3.bias": "blocks.126.conv2.bias", + "model.diffusion_model.output_blocks.5.0.out_layers.3.weight": "blocks.126.conv2.weight", + "model.diffusion_model.output_blocks.5.0.skip_connection.bias": "blocks.126.conv_shortcut.bias", + "model.diffusion_model.output_blocks.5.0.skip_connection.weight": "blocks.126.conv_shortcut.weight", + "model.diffusion_model.output_blocks.5.0.time_mixer.mix_factor": "blocks.129.mix_factor", + "model.diffusion_model.output_blocks.5.0.time_stack.emb_layers.1.bias": "blocks.128.time_emb_proj.bias", + "model.diffusion_model.output_blocks.5.0.time_stack.emb_layers.1.weight": "blocks.128.time_emb_proj.weight", + "model.diffusion_model.output_blocks.5.0.time_stack.in_layers.0.bias": "blocks.128.norm1.bias", + 
"model.diffusion_model.output_blocks.5.0.time_stack.in_layers.0.weight": "blocks.128.norm1.weight", + "model.diffusion_model.output_blocks.5.0.time_stack.in_layers.2.bias": "blocks.128.conv1.bias", + "model.diffusion_model.output_blocks.5.0.time_stack.in_layers.2.weight": "blocks.128.conv1.weight", + "model.diffusion_model.output_blocks.5.0.time_stack.out_layers.0.bias": "blocks.128.norm2.bias", + "model.diffusion_model.output_blocks.5.0.time_stack.out_layers.0.weight": "blocks.128.norm2.weight", + "model.diffusion_model.output_blocks.5.0.time_stack.out_layers.3.bias": "blocks.128.conv2.bias", + "model.diffusion_model.output_blocks.5.0.time_stack.out_layers.3.weight": "blocks.128.conv2.weight", + "model.diffusion_model.output_blocks.5.1.norm.bias": "blocks.131.norm.bias", + "model.diffusion_model.output_blocks.5.1.norm.weight": "blocks.131.norm.weight", + "model.diffusion_model.output_blocks.5.1.proj_in.bias": "blocks.131.proj_in.bias", + "model.diffusion_model.output_blocks.5.1.proj_in.weight": "blocks.131.proj_in.weight", + "model.diffusion_model.output_blocks.5.1.proj_out.bias": "blocks.134.proj.bias", + "model.diffusion_model.output_blocks.5.1.proj_out.weight": "blocks.134.proj.weight", + "model.diffusion_model.output_blocks.5.1.time_mixer.mix_factor": "blocks.134.mix_factor", + "model.diffusion_model.output_blocks.5.1.time_pos_embed.0.bias": "blocks.133.positional_embedding_proj.0.bias", + "model.diffusion_model.output_blocks.5.1.time_pos_embed.0.weight": "blocks.133.positional_embedding_proj.0.weight", + "model.diffusion_model.output_blocks.5.1.time_pos_embed.2.bias": "blocks.133.positional_embedding_proj.2.bias", + "model.diffusion_model.output_blocks.5.1.time_pos_embed.2.weight": "blocks.133.positional_embedding_proj.2.weight", + "model.diffusion_model.output_blocks.5.1.time_stack.0.attn1.to_k.weight": "blocks.133.attn1.to_k.weight", + "model.diffusion_model.output_blocks.5.1.time_stack.0.attn1.to_out.0.bias": "blocks.133.attn1.to_out.bias", + "model.diffusion_model.output_blocks.5.1.time_stack.0.attn1.to_out.0.weight": "blocks.133.attn1.to_out.weight", + "model.diffusion_model.output_blocks.5.1.time_stack.0.attn1.to_q.weight": "blocks.133.attn1.to_q.weight", + "model.diffusion_model.output_blocks.5.1.time_stack.0.attn1.to_v.weight": "blocks.133.attn1.to_v.weight", + "model.diffusion_model.output_blocks.5.1.time_stack.0.attn2.to_k.weight": "blocks.133.attn2.to_k.weight", + "model.diffusion_model.output_blocks.5.1.time_stack.0.attn2.to_out.0.bias": "blocks.133.attn2.to_out.bias", + "model.diffusion_model.output_blocks.5.1.time_stack.0.attn2.to_out.0.weight": "blocks.133.attn2.to_out.weight", + "model.diffusion_model.output_blocks.5.1.time_stack.0.attn2.to_q.weight": "blocks.133.attn2.to_q.weight", + "model.diffusion_model.output_blocks.5.1.time_stack.0.attn2.to_v.weight": "blocks.133.attn2.to_v.weight", + "model.diffusion_model.output_blocks.5.1.time_stack.0.ff.net.0.proj.bias": "blocks.133.act_fn_out.proj.bias", + "model.diffusion_model.output_blocks.5.1.time_stack.0.ff.net.0.proj.weight": "blocks.133.act_fn_out.proj.weight", + "model.diffusion_model.output_blocks.5.1.time_stack.0.ff.net.2.bias": "blocks.133.ff_out.bias", + "model.diffusion_model.output_blocks.5.1.time_stack.0.ff.net.2.weight": "blocks.133.ff_out.weight", + "model.diffusion_model.output_blocks.5.1.time_stack.0.ff_in.net.0.proj.bias": "blocks.133.act_fn_in.proj.bias", + "model.diffusion_model.output_blocks.5.1.time_stack.0.ff_in.net.0.proj.weight": "blocks.133.act_fn_in.proj.weight", + 
"model.diffusion_model.output_blocks.5.1.time_stack.0.ff_in.net.2.bias": "blocks.133.ff_in.bias", + "model.diffusion_model.output_blocks.5.1.time_stack.0.ff_in.net.2.weight": "blocks.133.ff_in.weight", + "model.diffusion_model.output_blocks.5.1.time_stack.0.norm1.bias": "blocks.133.norm1.bias", + "model.diffusion_model.output_blocks.5.1.time_stack.0.norm1.weight": "blocks.133.norm1.weight", + "model.diffusion_model.output_blocks.5.1.time_stack.0.norm2.bias": "blocks.133.norm2.bias", + "model.diffusion_model.output_blocks.5.1.time_stack.0.norm2.weight": "blocks.133.norm2.weight", + "model.diffusion_model.output_blocks.5.1.time_stack.0.norm3.bias": "blocks.133.norm_out.bias", + "model.diffusion_model.output_blocks.5.1.time_stack.0.norm3.weight": "blocks.133.norm_out.weight", + "model.diffusion_model.output_blocks.5.1.time_stack.0.norm_in.bias": "blocks.133.norm_in.bias", + "model.diffusion_model.output_blocks.5.1.time_stack.0.norm_in.weight": "blocks.133.norm_in.weight", + "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn1.to_k.weight": "blocks.131.transformer_blocks.0.attn1.to_k.weight", + "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.131.transformer_blocks.0.attn1.to_out.bias", + "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.131.transformer_blocks.0.attn1.to_out.weight", + "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn1.to_q.weight": "blocks.131.transformer_blocks.0.attn1.to_q.weight", + "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn1.to_v.weight": "blocks.131.transformer_blocks.0.attn1.to_v.weight", + "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn2.to_k.weight": "blocks.131.transformer_blocks.0.attn2.to_k.weight", + "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.131.transformer_blocks.0.attn2.to_out.bias", + "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.131.transformer_blocks.0.attn2.to_out.weight", + "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn2.to_q.weight": "blocks.131.transformer_blocks.0.attn2.to_q.weight", + "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn2.to_v.weight": "blocks.131.transformer_blocks.0.attn2.to_v.weight", + "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.131.transformer_blocks.0.act_fn.proj.bias", + "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.131.transformer_blocks.0.act_fn.proj.weight", + "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.ff.net.2.bias": "blocks.131.transformer_blocks.0.ff.bias", + "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.ff.net.2.weight": "blocks.131.transformer_blocks.0.ff.weight", + "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.norm1.bias": "blocks.131.transformer_blocks.0.norm1.bias", + "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.norm1.weight": "blocks.131.transformer_blocks.0.norm1.weight", + "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.norm2.bias": "blocks.131.transformer_blocks.0.norm2.bias", + "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.norm2.weight": "blocks.131.transformer_blocks.0.norm2.weight", + "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.norm3.bias": "blocks.131.transformer_blocks.0.norm3.bias", + 
"model.diffusion_model.output_blocks.5.1.transformer_blocks.0.norm3.weight": "blocks.131.transformer_blocks.0.norm3.weight", + "model.diffusion_model.output_blocks.5.2.conv.bias": "blocks.135.conv.bias", + "model.diffusion_model.output_blocks.5.2.conv.weight": "blocks.135.conv.weight", + "model.diffusion_model.output_blocks.6.0.emb_layers.1.bias": "blocks.137.time_emb_proj.bias", + "model.diffusion_model.output_blocks.6.0.emb_layers.1.weight": "blocks.137.time_emb_proj.weight", + "model.diffusion_model.output_blocks.6.0.in_layers.0.bias": "blocks.137.norm1.bias", + "model.diffusion_model.output_blocks.6.0.in_layers.0.weight": "blocks.137.norm1.weight", + "model.diffusion_model.output_blocks.6.0.in_layers.2.bias": "blocks.137.conv1.bias", + "model.diffusion_model.output_blocks.6.0.in_layers.2.weight": "blocks.137.conv1.weight", + "model.diffusion_model.output_blocks.6.0.out_layers.0.bias": "blocks.137.norm2.bias", + "model.diffusion_model.output_blocks.6.0.out_layers.0.weight": "blocks.137.norm2.weight", + "model.diffusion_model.output_blocks.6.0.out_layers.3.bias": "blocks.137.conv2.bias", + "model.diffusion_model.output_blocks.6.0.out_layers.3.weight": "blocks.137.conv2.weight", + "model.diffusion_model.output_blocks.6.0.skip_connection.bias": "blocks.137.conv_shortcut.bias", + "model.diffusion_model.output_blocks.6.0.skip_connection.weight": "blocks.137.conv_shortcut.weight", + "model.diffusion_model.output_blocks.6.0.time_mixer.mix_factor": "blocks.140.mix_factor", + "model.diffusion_model.output_blocks.6.0.time_stack.emb_layers.1.bias": "blocks.139.time_emb_proj.bias", + "model.diffusion_model.output_blocks.6.0.time_stack.emb_layers.1.weight": "blocks.139.time_emb_proj.weight", + "model.diffusion_model.output_blocks.6.0.time_stack.in_layers.0.bias": "blocks.139.norm1.bias", + "model.diffusion_model.output_blocks.6.0.time_stack.in_layers.0.weight": "blocks.139.norm1.weight", + "model.diffusion_model.output_blocks.6.0.time_stack.in_layers.2.bias": "blocks.139.conv1.bias", + "model.diffusion_model.output_blocks.6.0.time_stack.in_layers.2.weight": "blocks.139.conv1.weight", + "model.diffusion_model.output_blocks.6.0.time_stack.out_layers.0.bias": "blocks.139.norm2.bias", + "model.diffusion_model.output_blocks.6.0.time_stack.out_layers.0.weight": "blocks.139.norm2.weight", + "model.diffusion_model.output_blocks.6.0.time_stack.out_layers.3.bias": "blocks.139.conv2.bias", + "model.diffusion_model.output_blocks.6.0.time_stack.out_layers.3.weight": "blocks.139.conv2.weight", + "model.diffusion_model.output_blocks.6.1.norm.bias": "blocks.142.norm.bias", + "model.diffusion_model.output_blocks.6.1.norm.weight": "blocks.142.norm.weight", + "model.diffusion_model.output_blocks.6.1.proj_in.bias": "blocks.142.proj_in.bias", + "model.diffusion_model.output_blocks.6.1.proj_in.weight": "blocks.142.proj_in.weight", + "model.diffusion_model.output_blocks.6.1.proj_out.bias": "blocks.145.proj.bias", + "model.diffusion_model.output_blocks.6.1.proj_out.weight": "blocks.145.proj.weight", + "model.diffusion_model.output_blocks.6.1.time_mixer.mix_factor": "blocks.145.mix_factor", + "model.diffusion_model.output_blocks.6.1.time_pos_embed.0.bias": "blocks.144.positional_embedding_proj.0.bias", + "model.diffusion_model.output_blocks.6.1.time_pos_embed.0.weight": "blocks.144.positional_embedding_proj.0.weight", + "model.diffusion_model.output_blocks.6.1.time_pos_embed.2.bias": "blocks.144.positional_embedding_proj.2.bias", + "model.diffusion_model.output_blocks.6.1.time_pos_embed.2.weight": 
"blocks.144.positional_embedding_proj.2.weight", + "model.diffusion_model.output_blocks.6.1.time_stack.0.attn1.to_k.weight": "blocks.144.attn1.to_k.weight", + "model.diffusion_model.output_blocks.6.1.time_stack.0.attn1.to_out.0.bias": "blocks.144.attn1.to_out.bias", + "model.diffusion_model.output_blocks.6.1.time_stack.0.attn1.to_out.0.weight": "blocks.144.attn1.to_out.weight", + "model.diffusion_model.output_blocks.6.1.time_stack.0.attn1.to_q.weight": "blocks.144.attn1.to_q.weight", + "model.diffusion_model.output_blocks.6.1.time_stack.0.attn1.to_v.weight": "blocks.144.attn1.to_v.weight", + "model.diffusion_model.output_blocks.6.1.time_stack.0.attn2.to_k.weight": "blocks.144.attn2.to_k.weight", + "model.diffusion_model.output_blocks.6.1.time_stack.0.attn2.to_out.0.bias": "blocks.144.attn2.to_out.bias", + "model.diffusion_model.output_blocks.6.1.time_stack.0.attn2.to_out.0.weight": "blocks.144.attn2.to_out.weight", + "model.diffusion_model.output_blocks.6.1.time_stack.0.attn2.to_q.weight": "blocks.144.attn2.to_q.weight", + "model.diffusion_model.output_blocks.6.1.time_stack.0.attn2.to_v.weight": "blocks.144.attn2.to_v.weight", + "model.diffusion_model.output_blocks.6.1.time_stack.0.ff.net.0.proj.bias": "blocks.144.act_fn_out.proj.bias", + "model.diffusion_model.output_blocks.6.1.time_stack.0.ff.net.0.proj.weight": "blocks.144.act_fn_out.proj.weight", + "model.diffusion_model.output_blocks.6.1.time_stack.0.ff.net.2.bias": "blocks.144.ff_out.bias", + "model.diffusion_model.output_blocks.6.1.time_stack.0.ff.net.2.weight": "blocks.144.ff_out.weight", + "model.diffusion_model.output_blocks.6.1.time_stack.0.ff_in.net.0.proj.bias": "blocks.144.act_fn_in.proj.bias", + "model.diffusion_model.output_blocks.6.1.time_stack.0.ff_in.net.0.proj.weight": "blocks.144.act_fn_in.proj.weight", + "model.diffusion_model.output_blocks.6.1.time_stack.0.ff_in.net.2.bias": "blocks.144.ff_in.bias", + "model.diffusion_model.output_blocks.6.1.time_stack.0.ff_in.net.2.weight": "blocks.144.ff_in.weight", + "model.diffusion_model.output_blocks.6.1.time_stack.0.norm1.bias": "blocks.144.norm1.bias", + "model.diffusion_model.output_blocks.6.1.time_stack.0.norm1.weight": "blocks.144.norm1.weight", + "model.diffusion_model.output_blocks.6.1.time_stack.0.norm2.bias": "blocks.144.norm2.bias", + "model.diffusion_model.output_blocks.6.1.time_stack.0.norm2.weight": "blocks.144.norm2.weight", + "model.diffusion_model.output_blocks.6.1.time_stack.0.norm3.bias": "blocks.144.norm_out.bias", + "model.diffusion_model.output_blocks.6.1.time_stack.0.norm3.weight": "blocks.144.norm_out.weight", + "model.diffusion_model.output_blocks.6.1.time_stack.0.norm_in.bias": "blocks.144.norm_in.bias", + "model.diffusion_model.output_blocks.6.1.time_stack.0.norm_in.weight": "blocks.144.norm_in.weight", + "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn1.to_k.weight": "blocks.142.transformer_blocks.0.attn1.to_k.weight", + "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.142.transformer_blocks.0.attn1.to_out.bias", + "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.142.transformer_blocks.0.attn1.to_out.weight", + "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn1.to_q.weight": "blocks.142.transformer_blocks.0.attn1.to_q.weight", + "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn1.to_v.weight": "blocks.142.transformer_blocks.0.attn1.to_v.weight", + 
"model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn2.to_k.weight": "blocks.142.transformer_blocks.0.attn2.to_k.weight", + "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.142.transformer_blocks.0.attn2.to_out.bias", + "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.142.transformer_blocks.0.attn2.to_out.weight", + "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn2.to_q.weight": "blocks.142.transformer_blocks.0.attn2.to_q.weight", + "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn2.to_v.weight": "blocks.142.transformer_blocks.0.attn2.to_v.weight", + "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.142.transformer_blocks.0.act_fn.proj.bias", + "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.142.transformer_blocks.0.act_fn.proj.weight", + "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.ff.net.2.bias": "blocks.142.transformer_blocks.0.ff.bias", + "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.ff.net.2.weight": "blocks.142.transformer_blocks.0.ff.weight", + "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.norm1.bias": "blocks.142.transformer_blocks.0.norm1.bias", + "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.norm1.weight": "blocks.142.transformer_blocks.0.norm1.weight", + "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.norm2.bias": "blocks.142.transformer_blocks.0.norm2.bias", + "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.norm2.weight": "blocks.142.transformer_blocks.0.norm2.weight", + "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.norm3.bias": "blocks.142.transformer_blocks.0.norm3.bias", + "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.norm3.weight": "blocks.142.transformer_blocks.0.norm3.weight", + "model.diffusion_model.output_blocks.7.0.emb_layers.1.bias": "blocks.147.time_emb_proj.bias", + "model.diffusion_model.output_blocks.7.0.emb_layers.1.weight": "blocks.147.time_emb_proj.weight", + "model.diffusion_model.output_blocks.7.0.in_layers.0.bias": "blocks.147.norm1.bias", + "model.diffusion_model.output_blocks.7.0.in_layers.0.weight": "blocks.147.norm1.weight", + "model.diffusion_model.output_blocks.7.0.in_layers.2.bias": "blocks.147.conv1.bias", + "model.diffusion_model.output_blocks.7.0.in_layers.2.weight": "blocks.147.conv1.weight", + "model.diffusion_model.output_blocks.7.0.out_layers.0.bias": "blocks.147.norm2.bias", + "model.diffusion_model.output_blocks.7.0.out_layers.0.weight": "blocks.147.norm2.weight", + "model.diffusion_model.output_blocks.7.0.out_layers.3.bias": "blocks.147.conv2.bias", + "model.diffusion_model.output_blocks.7.0.out_layers.3.weight": "blocks.147.conv2.weight", + "model.diffusion_model.output_blocks.7.0.skip_connection.bias": "blocks.147.conv_shortcut.bias", + "model.diffusion_model.output_blocks.7.0.skip_connection.weight": "blocks.147.conv_shortcut.weight", + "model.diffusion_model.output_blocks.7.0.time_mixer.mix_factor": "blocks.150.mix_factor", + "model.diffusion_model.output_blocks.7.0.time_stack.emb_layers.1.bias": "blocks.149.time_emb_proj.bias", + "model.diffusion_model.output_blocks.7.0.time_stack.emb_layers.1.weight": "blocks.149.time_emb_proj.weight", + "model.diffusion_model.output_blocks.7.0.time_stack.in_layers.0.bias": "blocks.149.norm1.bias", + 
"model.diffusion_model.output_blocks.7.0.time_stack.in_layers.0.weight": "blocks.149.norm1.weight", + "model.diffusion_model.output_blocks.7.0.time_stack.in_layers.2.bias": "blocks.149.conv1.bias", + "model.diffusion_model.output_blocks.7.0.time_stack.in_layers.2.weight": "blocks.149.conv1.weight", + "model.diffusion_model.output_blocks.7.0.time_stack.out_layers.0.bias": "blocks.149.norm2.bias", + "model.diffusion_model.output_blocks.7.0.time_stack.out_layers.0.weight": "blocks.149.norm2.weight", + "model.diffusion_model.output_blocks.7.0.time_stack.out_layers.3.bias": "blocks.149.conv2.bias", + "model.diffusion_model.output_blocks.7.0.time_stack.out_layers.3.weight": "blocks.149.conv2.weight", + "model.diffusion_model.output_blocks.7.1.norm.bias": "blocks.152.norm.bias", + "model.diffusion_model.output_blocks.7.1.norm.weight": "blocks.152.norm.weight", + "model.diffusion_model.output_blocks.7.1.proj_in.bias": "blocks.152.proj_in.bias", + "model.diffusion_model.output_blocks.7.1.proj_in.weight": "blocks.152.proj_in.weight", + "model.diffusion_model.output_blocks.7.1.proj_out.bias": "blocks.155.proj.bias", + "model.diffusion_model.output_blocks.7.1.proj_out.weight": "blocks.155.proj.weight", + "model.diffusion_model.output_blocks.7.1.time_mixer.mix_factor": "blocks.155.mix_factor", + "model.diffusion_model.output_blocks.7.1.time_pos_embed.0.bias": "blocks.154.positional_embedding_proj.0.bias", + "model.diffusion_model.output_blocks.7.1.time_pos_embed.0.weight": "blocks.154.positional_embedding_proj.0.weight", + "model.diffusion_model.output_blocks.7.1.time_pos_embed.2.bias": "blocks.154.positional_embedding_proj.2.bias", + "model.diffusion_model.output_blocks.7.1.time_pos_embed.2.weight": "blocks.154.positional_embedding_proj.2.weight", + "model.diffusion_model.output_blocks.7.1.time_stack.0.attn1.to_k.weight": "blocks.154.attn1.to_k.weight", + "model.diffusion_model.output_blocks.7.1.time_stack.0.attn1.to_out.0.bias": "blocks.154.attn1.to_out.bias", + "model.diffusion_model.output_blocks.7.1.time_stack.0.attn1.to_out.0.weight": "blocks.154.attn1.to_out.weight", + "model.diffusion_model.output_blocks.7.1.time_stack.0.attn1.to_q.weight": "blocks.154.attn1.to_q.weight", + "model.diffusion_model.output_blocks.7.1.time_stack.0.attn1.to_v.weight": "blocks.154.attn1.to_v.weight", + "model.diffusion_model.output_blocks.7.1.time_stack.0.attn2.to_k.weight": "blocks.154.attn2.to_k.weight", + "model.diffusion_model.output_blocks.7.1.time_stack.0.attn2.to_out.0.bias": "blocks.154.attn2.to_out.bias", + "model.diffusion_model.output_blocks.7.1.time_stack.0.attn2.to_out.0.weight": "blocks.154.attn2.to_out.weight", + "model.diffusion_model.output_blocks.7.1.time_stack.0.attn2.to_q.weight": "blocks.154.attn2.to_q.weight", + "model.diffusion_model.output_blocks.7.1.time_stack.0.attn2.to_v.weight": "blocks.154.attn2.to_v.weight", + "model.diffusion_model.output_blocks.7.1.time_stack.0.ff.net.0.proj.bias": "blocks.154.act_fn_out.proj.bias", + "model.diffusion_model.output_blocks.7.1.time_stack.0.ff.net.0.proj.weight": "blocks.154.act_fn_out.proj.weight", + "model.diffusion_model.output_blocks.7.1.time_stack.0.ff.net.2.bias": "blocks.154.ff_out.bias", + "model.diffusion_model.output_blocks.7.1.time_stack.0.ff.net.2.weight": "blocks.154.ff_out.weight", + "model.diffusion_model.output_blocks.7.1.time_stack.0.ff_in.net.0.proj.bias": "blocks.154.act_fn_in.proj.bias", + "model.diffusion_model.output_blocks.7.1.time_stack.0.ff_in.net.0.proj.weight": "blocks.154.act_fn_in.proj.weight", + 
"model.diffusion_model.output_blocks.7.1.time_stack.0.ff_in.net.2.bias": "blocks.154.ff_in.bias", + "model.diffusion_model.output_blocks.7.1.time_stack.0.ff_in.net.2.weight": "blocks.154.ff_in.weight", + "model.diffusion_model.output_blocks.7.1.time_stack.0.norm1.bias": "blocks.154.norm1.bias", + "model.diffusion_model.output_blocks.7.1.time_stack.0.norm1.weight": "blocks.154.norm1.weight", + "model.diffusion_model.output_blocks.7.1.time_stack.0.norm2.bias": "blocks.154.norm2.bias", + "model.diffusion_model.output_blocks.7.1.time_stack.0.norm2.weight": "blocks.154.norm2.weight", + "model.diffusion_model.output_blocks.7.1.time_stack.0.norm3.bias": "blocks.154.norm_out.bias", + "model.diffusion_model.output_blocks.7.1.time_stack.0.norm3.weight": "blocks.154.norm_out.weight", + "model.diffusion_model.output_blocks.7.1.time_stack.0.norm_in.bias": "blocks.154.norm_in.bias", + "model.diffusion_model.output_blocks.7.1.time_stack.0.norm_in.weight": "blocks.154.norm_in.weight", + "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_k.weight": "blocks.152.transformer_blocks.0.attn1.to_k.weight", + "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.152.transformer_blocks.0.attn1.to_out.bias", + "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.152.transformer_blocks.0.attn1.to_out.weight", + "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_q.weight": "blocks.152.transformer_blocks.0.attn1.to_q.weight", + "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_v.weight": "blocks.152.transformer_blocks.0.attn1.to_v.weight", + "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn2.to_k.weight": "blocks.152.transformer_blocks.0.attn2.to_k.weight", + "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.152.transformer_blocks.0.attn2.to_out.bias", + "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.152.transformer_blocks.0.attn2.to_out.weight", + "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn2.to_q.weight": "blocks.152.transformer_blocks.0.attn2.to_q.weight", + "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn2.to_v.weight": "blocks.152.transformer_blocks.0.attn2.to_v.weight", + "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.152.transformer_blocks.0.act_fn.proj.bias", + "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.152.transformer_blocks.0.act_fn.proj.weight", + "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.ff.net.2.bias": "blocks.152.transformer_blocks.0.ff.bias", + "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.ff.net.2.weight": "blocks.152.transformer_blocks.0.ff.weight", + "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.norm1.bias": "blocks.152.transformer_blocks.0.norm1.bias", + "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.norm1.weight": "blocks.152.transformer_blocks.0.norm1.weight", + "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.norm2.bias": "blocks.152.transformer_blocks.0.norm2.bias", + "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.norm2.weight": "blocks.152.transformer_blocks.0.norm2.weight", + "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.norm3.bias": "blocks.152.transformer_blocks.0.norm3.bias", + 
"model.diffusion_model.output_blocks.7.1.transformer_blocks.0.norm3.weight": "blocks.152.transformer_blocks.0.norm3.weight", + "model.diffusion_model.output_blocks.8.0.emb_layers.1.bias": "blocks.157.time_emb_proj.bias", + "model.diffusion_model.output_blocks.8.0.emb_layers.1.weight": "blocks.157.time_emb_proj.weight", + "model.diffusion_model.output_blocks.8.0.in_layers.0.bias": "blocks.157.norm1.bias", + "model.diffusion_model.output_blocks.8.0.in_layers.0.weight": "blocks.157.norm1.weight", + "model.diffusion_model.output_blocks.8.0.in_layers.2.bias": "blocks.157.conv1.bias", + "model.diffusion_model.output_blocks.8.0.in_layers.2.weight": "blocks.157.conv1.weight", + "model.diffusion_model.output_blocks.8.0.out_layers.0.bias": "blocks.157.norm2.bias", + "model.diffusion_model.output_blocks.8.0.out_layers.0.weight": "blocks.157.norm2.weight", + "model.diffusion_model.output_blocks.8.0.out_layers.3.bias": "blocks.157.conv2.bias", + "model.diffusion_model.output_blocks.8.0.out_layers.3.weight": "blocks.157.conv2.weight", + "model.diffusion_model.output_blocks.8.0.skip_connection.bias": "blocks.157.conv_shortcut.bias", + "model.diffusion_model.output_blocks.8.0.skip_connection.weight": "blocks.157.conv_shortcut.weight", + "model.diffusion_model.output_blocks.8.0.time_mixer.mix_factor": "blocks.160.mix_factor", + "model.diffusion_model.output_blocks.8.0.time_stack.emb_layers.1.bias": "blocks.159.time_emb_proj.bias", + "model.diffusion_model.output_blocks.8.0.time_stack.emb_layers.1.weight": "blocks.159.time_emb_proj.weight", + "model.diffusion_model.output_blocks.8.0.time_stack.in_layers.0.bias": "blocks.159.norm1.bias", + "model.diffusion_model.output_blocks.8.0.time_stack.in_layers.0.weight": "blocks.159.norm1.weight", + "model.diffusion_model.output_blocks.8.0.time_stack.in_layers.2.bias": "blocks.159.conv1.bias", + "model.diffusion_model.output_blocks.8.0.time_stack.in_layers.2.weight": "blocks.159.conv1.weight", + "model.diffusion_model.output_blocks.8.0.time_stack.out_layers.0.bias": "blocks.159.norm2.bias", + "model.diffusion_model.output_blocks.8.0.time_stack.out_layers.0.weight": "blocks.159.norm2.weight", + "model.diffusion_model.output_blocks.8.0.time_stack.out_layers.3.bias": "blocks.159.conv2.bias", + "model.diffusion_model.output_blocks.8.0.time_stack.out_layers.3.weight": "blocks.159.conv2.weight", + "model.diffusion_model.output_blocks.8.1.norm.bias": "blocks.162.norm.bias", + "model.diffusion_model.output_blocks.8.1.norm.weight": "blocks.162.norm.weight", + "model.diffusion_model.output_blocks.8.1.proj_in.bias": "blocks.162.proj_in.bias", + "model.diffusion_model.output_blocks.8.1.proj_in.weight": "blocks.162.proj_in.weight", + "model.diffusion_model.output_blocks.8.1.proj_out.bias": "blocks.165.proj.bias", + "model.diffusion_model.output_blocks.8.1.proj_out.weight": "blocks.165.proj.weight", + "model.diffusion_model.output_blocks.8.1.time_mixer.mix_factor": "blocks.165.mix_factor", + "model.diffusion_model.output_blocks.8.1.time_pos_embed.0.bias": "blocks.164.positional_embedding_proj.0.bias", + "model.diffusion_model.output_blocks.8.1.time_pos_embed.0.weight": "blocks.164.positional_embedding_proj.0.weight", + "model.diffusion_model.output_blocks.8.1.time_pos_embed.2.bias": "blocks.164.positional_embedding_proj.2.bias", + "model.diffusion_model.output_blocks.8.1.time_pos_embed.2.weight": "blocks.164.positional_embedding_proj.2.weight", + "model.diffusion_model.output_blocks.8.1.time_stack.0.attn1.to_k.weight": "blocks.164.attn1.to_k.weight", + 
"model.diffusion_model.output_blocks.8.1.time_stack.0.attn1.to_out.0.bias": "blocks.164.attn1.to_out.bias", + "model.diffusion_model.output_blocks.8.1.time_stack.0.attn1.to_out.0.weight": "blocks.164.attn1.to_out.weight", + "model.diffusion_model.output_blocks.8.1.time_stack.0.attn1.to_q.weight": "blocks.164.attn1.to_q.weight", + "model.diffusion_model.output_blocks.8.1.time_stack.0.attn1.to_v.weight": "blocks.164.attn1.to_v.weight", + "model.diffusion_model.output_blocks.8.1.time_stack.0.attn2.to_k.weight": "blocks.164.attn2.to_k.weight", + "model.diffusion_model.output_blocks.8.1.time_stack.0.attn2.to_out.0.bias": "blocks.164.attn2.to_out.bias", + "model.diffusion_model.output_blocks.8.1.time_stack.0.attn2.to_out.0.weight": "blocks.164.attn2.to_out.weight", + "model.diffusion_model.output_blocks.8.1.time_stack.0.attn2.to_q.weight": "blocks.164.attn2.to_q.weight", + "model.diffusion_model.output_blocks.8.1.time_stack.0.attn2.to_v.weight": "blocks.164.attn2.to_v.weight", + "model.diffusion_model.output_blocks.8.1.time_stack.0.ff.net.0.proj.bias": "blocks.164.act_fn_out.proj.bias", + "model.diffusion_model.output_blocks.8.1.time_stack.0.ff.net.0.proj.weight": "blocks.164.act_fn_out.proj.weight", + "model.diffusion_model.output_blocks.8.1.time_stack.0.ff.net.2.bias": "blocks.164.ff_out.bias", + "model.diffusion_model.output_blocks.8.1.time_stack.0.ff.net.2.weight": "blocks.164.ff_out.weight", + "model.diffusion_model.output_blocks.8.1.time_stack.0.ff_in.net.0.proj.bias": "blocks.164.act_fn_in.proj.bias", + "model.diffusion_model.output_blocks.8.1.time_stack.0.ff_in.net.0.proj.weight": "blocks.164.act_fn_in.proj.weight", + "model.diffusion_model.output_blocks.8.1.time_stack.0.ff_in.net.2.bias": "blocks.164.ff_in.bias", + "model.diffusion_model.output_blocks.8.1.time_stack.0.ff_in.net.2.weight": "blocks.164.ff_in.weight", + "model.diffusion_model.output_blocks.8.1.time_stack.0.norm1.bias": "blocks.164.norm1.bias", + "model.diffusion_model.output_blocks.8.1.time_stack.0.norm1.weight": "blocks.164.norm1.weight", + "model.diffusion_model.output_blocks.8.1.time_stack.0.norm2.bias": "blocks.164.norm2.bias", + "model.diffusion_model.output_blocks.8.1.time_stack.0.norm2.weight": "blocks.164.norm2.weight", + "model.diffusion_model.output_blocks.8.1.time_stack.0.norm3.bias": "blocks.164.norm_out.bias", + "model.diffusion_model.output_blocks.8.1.time_stack.0.norm3.weight": "blocks.164.norm_out.weight", + "model.diffusion_model.output_blocks.8.1.time_stack.0.norm_in.bias": "blocks.164.norm_in.bias", + "model.diffusion_model.output_blocks.8.1.time_stack.0.norm_in.weight": "blocks.164.norm_in.weight", + "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn1.to_k.weight": "blocks.162.transformer_blocks.0.attn1.to_k.weight", + "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.162.transformer_blocks.0.attn1.to_out.bias", + "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.162.transformer_blocks.0.attn1.to_out.weight", + "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn1.to_q.weight": "blocks.162.transformer_blocks.0.attn1.to_q.weight", + "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn1.to_v.weight": "blocks.162.transformer_blocks.0.attn1.to_v.weight", + "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn2.to_k.weight": "blocks.162.transformer_blocks.0.attn2.to_k.weight", + "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn2.to_out.0.bias": 
"blocks.162.transformer_blocks.0.attn2.to_out.bias", + "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.162.transformer_blocks.0.attn2.to_out.weight", + "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn2.to_q.weight": "blocks.162.transformer_blocks.0.attn2.to_q.weight", + "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn2.to_v.weight": "blocks.162.transformer_blocks.0.attn2.to_v.weight", + "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.162.transformer_blocks.0.act_fn.proj.bias", + "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.162.transformer_blocks.0.act_fn.proj.weight", + "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.ff.net.2.bias": "blocks.162.transformer_blocks.0.ff.bias", + "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.ff.net.2.weight": "blocks.162.transformer_blocks.0.ff.weight", + "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.norm1.bias": "blocks.162.transformer_blocks.0.norm1.bias", + "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.norm1.weight": "blocks.162.transformer_blocks.0.norm1.weight", + "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.norm2.bias": "blocks.162.transformer_blocks.0.norm2.bias", + "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.norm2.weight": "blocks.162.transformer_blocks.0.norm2.weight", + "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.norm3.bias": "blocks.162.transformer_blocks.0.norm3.bias", + "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.norm3.weight": "blocks.162.transformer_blocks.0.norm3.weight", + "model.diffusion_model.output_blocks.8.2.conv.bias": "blocks.166.conv.bias", + "model.diffusion_model.output_blocks.8.2.conv.weight": "blocks.166.conv.weight", + "model.diffusion_model.output_blocks.9.0.emb_layers.1.bias": "blocks.168.time_emb_proj.bias", + "model.diffusion_model.output_blocks.9.0.emb_layers.1.weight": "blocks.168.time_emb_proj.weight", + "model.diffusion_model.output_blocks.9.0.in_layers.0.bias": "blocks.168.norm1.bias", + "model.diffusion_model.output_blocks.9.0.in_layers.0.weight": "blocks.168.norm1.weight", + "model.diffusion_model.output_blocks.9.0.in_layers.2.bias": "blocks.168.conv1.bias", + "model.diffusion_model.output_blocks.9.0.in_layers.2.weight": "blocks.168.conv1.weight", + "model.diffusion_model.output_blocks.9.0.out_layers.0.bias": "blocks.168.norm2.bias", + "model.diffusion_model.output_blocks.9.0.out_layers.0.weight": "blocks.168.norm2.weight", + "model.diffusion_model.output_blocks.9.0.out_layers.3.bias": "blocks.168.conv2.bias", + "model.diffusion_model.output_blocks.9.0.out_layers.3.weight": "blocks.168.conv2.weight", + "model.diffusion_model.output_blocks.9.0.skip_connection.bias": "blocks.168.conv_shortcut.bias", + "model.diffusion_model.output_blocks.9.0.skip_connection.weight": "blocks.168.conv_shortcut.weight", + "model.diffusion_model.output_blocks.9.0.time_mixer.mix_factor": "blocks.171.mix_factor", + "model.diffusion_model.output_blocks.9.0.time_stack.emb_layers.1.bias": "blocks.170.time_emb_proj.bias", + "model.diffusion_model.output_blocks.9.0.time_stack.emb_layers.1.weight": "blocks.170.time_emb_proj.weight", + "model.diffusion_model.output_blocks.9.0.time_stack.in_layers.0.bias": "blocks.170.norm1.bias", + "model.diffusion_model.output_blocks.9.0.time_stack.in_layers.0.weight": "blocks.170.norm1.weight", + 
"model.diffusion_model.output_blocks.9.0.time_stack.in_layers.2.bias": "blocks.170.conv1.bias", + "model.diffusion_model.output_blocks.9.0.time_stack.in_layers.2.weight": "blocks.170.conv1.weight", + "model.diffusion_model.output_blocks.9.0.time_stack.out_layers.0.bias": "blocks.170.norm2.bias", + "model.diffusion_model.output_blocks.9.0.time_stack.out_layers.0.weight": "blocks.170.norm2.weight", + "model.diffusion_model.output_blocks.9.0.time_stack.out_layers.3.bias": "blocks.170.conv2.bias", + "model.diffusion_model.output_blocks.9.0.time_stack.out_layers.3.weight": "blocks.170.conv2.weight", + "model.diffusion_model.output_blocks.9.1.norm.bias": "blocks.173.norm.bias", + "model.diffusion_model.output_blocks.9.1.norm.weight": "blocks.173.norm.weight", + "model.diffusion_model.output_blocks.9.1.proj_in.bias": "blocks.173.proj_in.bias", + "model.diffusion_model.output_blocks.9.1.proj_in.weight": "blocks.173.proj_in.weight", + "model.diffusion_model.output_blocks.9.1.proj_out.bias": "blocks.176.proj.bias", + "model.diffusion_model.output_blocks.9.1.proj_out.weight": "blocks.176.proj.weight", + "model.diffusion_model.output_blocks.9.1.time_mixer.mix_factor": "blocks.176.mix_factor", + "model.diffusion_model.output_blocks.9.1.time_pos_embed.0.bias": "blocks.175.positional_embedding_proj.0.bias", + "model.diffusion_model.output_blocks.9.1.time_pos_embed.0.weight": "blocks.175.positional_embedding_proj.0.weight", + "model.diffusion_model.output_blocks.9.1.time_pos_embed.2.bias": "blocks.175.positional_embedding_proj.2.bias", + "model.diffusion_model.output_blocks.9.1.time_pos_embed.2.weight": "blocks.175.positional_embedding_proj.2.weight", + "model.diffusion_model.output_blocks.9.1.time_stack.0.attn1.to_k.weight": "blocks.175.attn1.to_k.weight", + "model.diffusion_model.output_blocks.9.1.time_stack.0.attn1.to_out.0.bias": "blocks.175.attn1.to_out.bias", + "model.diffusion_model.output_blocks.9.1.time_stack.0.attn1.to_out.0.weight": "blocks.175.attn1.to_out.weight", + "model.diffusion_model.output_blocks.9.1.time_stack.0.attn1.to_q.weight": "blocks.175.attn1.to_q.weight", + "model.diffusion_model.output_blocks.9.1.time_stack.0.attn1.to_v.weight": "blocks.175.attn1.to_v.weight", + "model.diffusion_model.output_blocks.9.1.time_stack.0.attn2.to_k.weight": "blocks.175.attn2.to_k.weight", + "model.diffusion_model.output_blocks.9.1.time_stack.0.attn2.to_out.0.bias": "blocks.175.attn2.to_out.bias", + "model.diffusion_model.output_blocks.9.1.time_stack.0.attn2.to_out.0.weight": "blocks.175.attn2.to_out.weight", + "model.diffusion_model.output_blocks.9.1.time_stack.0.attn2.to_q.weight": "blocks.175.attn2.to_q.weight", + "model.diffusion_model.output_blocks.9.1.time_stack.0.attn2.to_v.weight": "blocks.175.attn2.to_v.weight", + "model.diffusion_model.output_blocks.9.1.time_stack.0.ff.net.0.proj.bias": "blocks.175.act_fn_out.proj.bias", + "model.diffusion_model.output_blocks.9.1.time_stack.0.ff.net.0.proj.weight": "blocks.175.act_fn_out.proj.weight", + "model.diffusion_model.output_blocks.9.1.time_stack.0.ff.net.2.bias": "blocks.175.ff_out.bias", + "model.diffusion_model.output_blocks.9.1.time_stack.0.ff.net.2.weight": "blocks.175.ff_out.weight", + "model.diffusion_model.output_blocks.9.1.time_stack.0.ff_in.net.0.proj.bias": "blocks.175.act_fn_in.proj.bias", + "model.diffusion_model.output_blocks.9.1.time_stack.0.ff_in.net.0.proj.weight": "blocks.175.act_fn_in.proj.weight", + "model.diffusion_model.output_blocks.9.1.time_stack.0.ff_in.net.2.bias": "blocks.175.ff_in.bias", + 
"model.diffusion_model.output_blocks.9.1.time_stack.0.ff_in.net.2.weight": "blocks.175.ff_in.weight", + "model.diffusion_model.output_blocks.9.1.time_stack.0.norm1.bias": "blocks.175.norm1.bias", + "model.diffusion_model.output_blocks.9.1.time_stack.0.norm1.weight": "blocks.175.norm1.weight", + "model.diffusion_model.output_blocks.9.1.time_stack.0.norm2.bias": "blocks.175.norm2.bias", + "model.diffusion_model.output_blocks.9.1.time_stack.0.norm2.weight": "blocks.175.norm2.weight", + "model.diffusion_model.output_blocks.9.1.time_stack.0.norm3.bias": "blocks.175.norm_out.bias", + "model.diffusion_model.output_blocks.9.1.time_stack.0.norm3.weight": "blocks.175.norm_out.weight", + "model.diffusion_model.output_blocks.9.1.time_stack.0.norm_in.bias": "blocks.175.norm_in.bias", + "model.diffusion_model.output_blocks.9.1.time_stack.0.norm_in.weight": "blocks.175.norm_in.weight", + "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn1.to_k.weight": "blocks.173.transformer_blocks.0.attn1.to_k.weight", + "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.173.transformer_blocks.0.attn1.to_out.bias", + "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.173.transformer_blocks.0.attn1.to_out.weight", + "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn1.to_q.weight": "blocks.173.transformer_blocks.0.attn1.to_q.weight", + "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn1.to_v.weight": "blocks.173.transformer_blocks.0.attn1.to_v.weight", + "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn2.to_k.weight": "blocks.173.transformer_blocks.0.attn2.to_k.weight", + "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.173.transformer_blocks.0.attn2.to_out.bias", + "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.173.transformer_blocks.0.attn2.to_out.weight", + "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn2.to_q.weight": "blocks.173.transformer_blocks.0.attn2.to_q.weight", + "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn2.to_v.weight": "blocks.173.transformer_blocks.0.attn2.to_v.weight", + "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.173.transformer_blocks.0.act_fn.proj.bias", + "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.173.transformer_blocks.0.act_fn.proj.weight", + "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.ff.net.2.bias": "blocks.173.transformer_blocks.0.ff.bias", + "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.ff.net.2.weight": "blocks.173.transformer_blocks.0.ff.weight", + "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.norm1.bias": "blocks.173.transformer_blocks.0.norm1.bias", + "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.norm1.weight": "blocks.173.transformer_blocks.0.norm1.weight", + "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.norm2.bias": "blocks.173.transformer_blocks.0.norm2.bias", + "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.norm2.weight": "blocks.173.transformer_blocks.0.norm2.weight", + "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.norm3.bias": "blocks.173.transformer_blocks.0.norm3.bias", + "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.norm3.weight": "blocks.173.transformer_blocks.0.norm3.weight", + 
"model.diffusion_model.time_embed.0.bias": "time_embedding.0.bias", + "model.diffusion_model.time_embed.0.weight": "time_embedding.0.weight", + "model.diffusion_model.time_embed.2.bias": "time_embedding.2.bias", + "model.diffusion_model.time_embed.2.weight": "time_embedding.2.weight", + } + state_dict_ = {} + for name in state_dict: + if name in rename_dict: + param = state_dict[name] + if ".proj_in." in name or ".proj_out." in name: + param = param.squeeze() + state_dict_[rename_dict[name]] = param + return state_dict_ diff --git a/diffsynth/models/svd_vae_decoder.py b/diffsynth/models/svd_vae_decoder.py index 9c8f4bc..3fe6446 100644 --- a/diffsynth/models/svd_vae_decoder.py +++ b/diffsynth/models/svd_vae_decoder.py @@ -2,7 +2,7 @@ import torch from .attention import Attention from .sd_unet import ResnetBlock, UpSampler from .tiler import TileWorker -from einops import rearrange +from einops import rearrange, repeat class VAEAttentionBlock(torch.nn.Module): @@ -119,14 +119,13 @@ class SVDVAEDecoder(torch.nn.Module): self.conv_out = torch.nn.Conv2d(128, 3, kernel_size=3, padding=1) self.time_conv_out = torch.nn.Conv3d(3, 3, kernel_size=(3, 1, 1), padding=(1, 0, 0)) + def forward(self, sample): # 1. pre-process - hidden_states = sample.flatten(0, 1) + hidden_states = rearrange(sample, "C T H W -> T C H W") hidden_states = hidden_states / self.scaling_factor hidden_states = self.conv_in(hidden_states) - time_emb = None - text_emb = None - res_stack = None + time_emb, text_emb, res_stack = None, None, None # 2. blocks for i, block in enumerate(self.blocks): @@ -136,11 +135,70 @@ class SVDVAEDecoder(torch.nn.Module): hidden_states = self.conv_norm_out(hidden_states) hidden_states = self.conv_act(hidden_states) hidden_states = self.conv_out(hidden_states) - hidden_states = rearrange(hidden_states, "T C H W -> 1 C T H W") + hidden_states = rearrange(hidden_states, "T C H W -> C T H W") hidden_states = self.time_conv_out(hidden_states) return hidden_states + + def build_mask(self, data, is_bound): + _, T, H, W = data.shape + t = repeat(torch.arange(T), "T -> T H W", T=T, H=H, W=W) + h = repeat(torch.arange(H), "H -> T H W", T=T, H=H, W=W) + w = repeat(torch.arange(W), "W -> T H W", T=T, H=H, W=W) + border_width = (T + H + W) // 6 + pad = torch.ones_like(t) * border_width + mask = torch.stack([ + pad if is_bound[0] else t + 1, + pad if is_bound[1] else T - t, + pad if is_bound[2] else h + 1, + pad if is_bound[3] else H - h, + pad if is_bound[4] else w + 1, + pad if is_bound[5] else W - w + ]).min(dim=0).values + mask = mask.clip(1, border_width) + mask = (mask / border_width).to(dtype=data.dtype, device=data.device) + mask = rearrange(mask, "T H W -> 1 T H W") + return mask + + + def decode_video( + self, sample, + batch_time=8, batch_height=128, batch_width=128, + stride_time=4, stride_height=32, stride_width=32, + progress_bar=lambda x:x + ): + sample = sample.permute(1, 0, 2, 3) + data_device = sample.device + computation_device = self.conv_in.weight.device + torch_dtype = sample.dtype + _, T, H, W = sample.shape + + weight = torch.zeros((1, T, H*8, W*8), dtype=torch_dtype, device=data_device) + values = torch.zeros((3, T, H*8, W*8), dtype=torch_dtype, device=data_device) + + # Split tasks + tasks = [] + for t in range(0, T, stride_time): + for h in range(0, H, stride_height): + for w in range(0, W, stride_width): + if (t-stride_time >= 0 and t-stride_time+batch_time >= T)\ + or (h-stride_height >= 0 and h-stride_height+batch_height >= H)\ + or (w-stride_width >= 0 and 
+
+
     def state_dict_converter(self):
         return SVDVAEDecoderStateDictConverter()
@@ -238,3 +296,282 @@ class SVDVAEDecoderStateDictConverter:
             state_dict_[name_] = state_dict[name]
 
         return state_dict_
+
+
+    def from_civitai(self, state_dict):
+        rename_dict = {
+            "first_stage_model.decoder.conv_in.bias": "conv_in.bias",
+            "first_stage_model.decoder.conv_in.weight": "conv_in.weight",
+            "first_stage_model.decoder.conv_out.bias": "conv_out.bias",
+            "first_stage_model.decoder.conv_out.time_mix_conv.bias": "time_conv_out.bias",
+            "first_stage_model.decoder.conv_out.time_mix_conv.weight": "time_conv_out.weight",
+            "first_stage_model.decoder.conv_out.weight": "conv_out.weight",
+            "first_stage_model.decoder.mid.attn_1.k.bias": "blocks.2.transformer_blocks.0.to_k.bias",
+            "first_stage_model.decoder.mid.attn_1.k.weight": "blocks.2.transformer_blocks.0.to_k.weight",
+            "first_stage_model.decoder.mid.attn_1.norm.bias": "blocks.2.norm.bias",
+            "first_stage_model.decoder.mid.attn_1.norm.weight": "blocks.2.norm.weight",
+            "first_stage_model.decoder.mid.attn_1.proj_out.bias": "blocks.2.transformer_blocks.0.to_out.bias",
+            "first_stage_model.decoder.mid.attn_1.proj_out.weight": "blocks.2.transformer_blocks.0.to_out.weight",
+            "first_stage_model.decoder.mid.attn_1.q.bias": "blocks.2.transformer_blocks.0.to_q.bias",
+            "first_stage_model.decoder.mid.attn_1.q.weight": "blocks.2.transformer_blocks.0.to_q.weight",
+            "first_stage_model.decoder.mid.attn_1.v.bias": "blocks.2.transformer_blocks.0.to_v.bias",
+            "first_stage_model.decoder.mid.attn_1.v.weight": "blocks.2.transformer_blocks.0.to_v.weight",
+            "first_stage_model.decoder.mid.block_1.conv1.bias": "blocks.0.conv1.bias",
+            "first_stage_model.decoder.mid.block_1.conv1.weight": "blocks.0.conv1.weight",
+            "first_stage_model.decoder.mid.block_1.conv2.bias": "blocks.0.conv2.bias",
+            "first_stage_model.decoder.mid.block_1.conv2.weight": "blocks.0.conv2.weight",
+            "first_stage_model.decoder.mid.block_1.mix_factor": "blocks.1.mix_factor",
+            "first_stage_model.decoder.mid.block_1.norm1.bias": "blocks.0.norm1.bias",
+            "first_stage_model.decoder.mid.block_1.norm1.weight": "blocks.0.norm1.weight",
+            "first_stage_model.decoder.mid.block_1.norm2.bias": "blocks.0.norm2.bias",
+            "first_stage_model.decoder.mid.block_1.norm2.weight": "blocks.0.norm2.weight",
+            "first_stage_model.decoder.mid.block_1.time_stack.in_layers.0.bias": "blocks.1.norm1.bias",
+            "first_stage_model.decoder.mid.block_1.time_stack.in_layers.0.weight": "blocks.1.norm1.weight",
+            "first_stage_model.decoder.mid.block_1.time_stack.in_layers.2.bias": "blocks.1.conv1.bias",
+            "first_stage_model.decoder.mid.block_1.time_stack.in_layers.2.weight": "blocks.1.conv1.weight",
+            "first_stage_model.decoder.mid.block_1.time_stack.out_layers.0.bias": "blocks.1.norm2.bias",
+            "first_stage_model.decoder.mid.block_1.time_stack.out_layers.0.weight": "blocks.1.norm2.weight",
+            "first_stage_model.decoder.mid.block_1.time_stack.out_layers.3.bias": "blocks.1.conv2.bias",
"first_stage_model.decoder.mid.block_1.time_stack.out_layers.3.weight": "blocks.1.conv2.weight", + "first_stage_model.decoder.mid.block_2.conv1.bias": "blocks.3.conv1.bias", + "first_stage_model.decoder.mid.block_2.conv1.weight": "blocks.3.conv1.weight", + "first_stage_model.decoder.mid.block_2.conv2.bias": "blocks.3.conv2.bias", + "first_stage_model.decoder.mid.block_2.conv2.weight": "blocks.3.conv2.weight", + "first_stage_model.decoder.mid.block_2.mix_factor": "blocks.4.mix_factor", + "first_stage_model.decoder.mid.block_2.norm1.bias": "blocks.3.norm1.bias", + "first_stage_model.decoder.mid.block_2.norm1.weight": "blocks.3.norm1.weight", + "first_stage_model.decoder.mid.block_2.norm2.bias": "blocks.3.norm2.bias", + "first_stage_model.decoder.mid.block_2.norm2.weight": "blocks.3.norm2.weight", + "first_stage_model.decoder.mid.block_2.time_stack.in_layers.0.bias": "blocks.4.norm1.bias", + "first_stage_model.decoder.mid.block_2.time_stack.in_layers.0.weight": "blocks.4.norm1.weight", + "first_stage_model.decoder.mid.block_2.time_stack.in_layers.2.bias": "blocks.4.conv1.bias", + "first_stage_model.decoder.mid.block_2.time_stack.in_layers.2.weight": "blocks.4.conv1.weight", + "first_stage_model.decoder.mid.block_2.time_stack.out_layers.0.bias": "blocks.4.norm2.bias", + "first_stage_model.decoder.mid.block_2.time_stack.out_layers.0.weight": "blocks.4.norm2.weight", + "first_stage_model.decoder.mid.block_2.time_stack.out_layers.3.bias": "blocks.4.conv2.bias", + "first_stage_model.decoder.mid.block_2.time_stack.out_layers.3.weight": "blocks.4.conv2.weight", + "first_stage_model.decoder.norm_out.bias": "conv_norm_out.bias", + "first_stage_model.decoder.norm_out.weight": "conv_norm_out.weight", + "first_stage_model.decoder.up.0.block.0.conv1.bias": "blocks.26.conv1.bias", + "first_stage_model.decoder.up.0.block.0.conv1.weight": "blocks.26.conv1.weight", + "first_stage_model.decoder.up.0.block.0.conv2.bias": "blocks.26.conv2.bias", + "first_stage_model.decoder.up.0.block.0.conv2.weight": "blocks.26.conv2.weight", + "first_stage_model.decoder.up.0.block.0.mix_factor": "blocks.27.mix_factor", + "first_stage_model.decoder.up.0.block.0.nin_shortcut.bias": "blocks.26.conv_shortcut.bias", + "first_stage_model.decoder.up.0.block.0.nin_shortcut.weight": "blocks.26.conv_shortcut.weight", + "first_stage_model.decoder.up.0.block.0.norm1.bias": "blocks.26.norm1.bias", + "first_stage_model.decoder.up.0.block.0.norm1.weight": "blocks.26.norm1.weight", + "first_stage_model.decoder.up.0.block.0.norm2.bias": "blocks.26.norm2.bias", + "first_stage_model.decoder.up.0.block.0.norm2.weight": "blocks.26.norm2.weight", + "first_stage_model.decoder.up.0.block.0.time_stack.in_layers.0.bias": "blocks.27.norm1.bias", + "first_stage_model.decoder.up.0.block.0.time_stack.in_layers.0.weight": "blocks.27.norm1.weight", + "first_stage_model.decoder.up.0.block.0.time_stack.in_layers.2.bias": "blocks.27.conv1.bias", + "first_stage_model.decoder.up.0.block.0.time_stack.in_layers.2.weight": "blocks.27.conv1.weight", + "first_stage_model.decoder.up.0.block.0.time_stack.out_layers.0.bias": "blocks.27.norm2.bias", + "first_stage_model.decoder.up.0.block.0.time_stack.out_layers.0.weight": "blocks.27.norm2.weight", + "first_stage_model.decoder.up.0.block.0.time_stack.out_layers.3.bias": "blocks.27.conv2.bias", + "first_stage_model.decoder.up.0.block.0.time_stack.out_layers.3.weight": "blocks.27.conv2.weight", + "first_stage_model.decoder.up.0.block.1.conv1.bias": "blocks.28.conv1.bias", + 
"first_stage_model.decoder.up.0.block.1.conv1.weight": "blocks.28.conv1.weight", + "first_stage_model.decoder.up.0.block.1.conv2.bias": "blocks.28.conv2.bias", + "first_stage_model.decoder.up.0.block.1.conv2.weight": "blocks.28.conv2.weight", + "first_stage_model.decoder.up.0.block.1.mix_factor": "blocks.29.mix_factor", + "first_stage_model.decoder.up.0.block.1.norm1.bias": "blocks.28.norm1.bias", + "first_stage_model.decoder.up.0.block.1.norm1.weight": "blocks.28.norm1.weight", + "first_stage_model.decoder.up.0.block.1.norm2.bias": "blocks.28.norm2.bias", + "first_stage_model.decoder.up.0.block.1.norm2.weight": "blocks.28.norm2.weight", + "first_stage_model.decoder.up.0.block.1.time_stack.in_layers.0.bias": "blocks.29.norm1.bias", + "first_stage_model.decoder.up.0.block.1.time_stack.in_layers.0.weight": "blocks.29.norm1.weight", + "first_stage_model.decoder.up.0.block.1.time_stack.in_layers.2.bias": "blocks.29.conv1.bias", + "first_stage_model.decoder.up.0.block.1.time_stack.in_layers.2.weight": "blocks.29.conv1.weight", + "first_stage_model.decoder.up.0.block.1.time_stack.out_layers.0.bias": "blocks.29.norm2.bias", + "first_stage_model.decoder.up.0.block.1.time_stack.out_layers.0.weight": "blocks.29.norm2.weight", + "first_stage_model.decoder.up.0.block.1.time_stack.out_layers.3.bias": "blocks.29.conv2.bias", + "first_stage_model.decoder.up.0.block.1.time_stack.out_layers.3.weight": "blocks.29.conv2.weight", + "first_stage_model.decoder.up.0.block.2.conv1.bias": "blocks.30.conv1.bias", + "first_stage_model.decoder.up.0.block.2.conv1.weight": "blocks.30.conv1.weight", + "first_stage_model.decoder.up.0.block.2.conv2.bias": "blocks.30.conv2.bias", + "first_stage_model.decoder.up.0.block.2.conv2.weight": "blocks.30.conv2.weight", + "first_stage_model.decoder.up.0.block.2.mix_factor": "blocks.31.mix_factor", + "first_stage_model.decoder.up.0.block.2.norm1.bias": "blocks.30.norm1.bias", + "first_stage_model.decoder.up.0.block.2.norm1.weight": "blocks.30.norm1.weight", + "first_stage_model.decoder.up.0.block.2.norm2.bias": "blocks.30.norm2.bias", + "first_stage_model.decoder.up.0.block.2.norm2.weight": "blocks.30.norm2.weight", + "first_stage_model.decoder.up.0.block.2.time_stack.in_layers.0.bias": "blocks.31.norm1.bias", + "first_stage_model.decoder.up.0.block.2.time_stack.in_layers.0.weight": "blocks.31.norm1.weight", + "first_stage_model.decoder.up.0.block.2.time_stack.in_layers.2.bias": "blocks.31.conv1.bias", + "first_stage_model.decoder.up.0.block.2.time_stack.in_layers.2.weight": "blocks.31.conv1.weight", + "first_stage_model.decoder.up.0.block.2.time_stack.out_layers.0.bias": "blocks.31.norm2.bias", + "first_stage_model.decoder.up.0.block.2.time_stack.out_layers.0.weight": "blocks.31.norm2.weight", + "first_stage_model.decoder.up.0.block.2.time_stack.out_layers.3.bias": "blocks.31.conv2.bias", + "first_stage_model.decoder.up.0.block.2.time_stack.out_layers.3.weight": "blocks.31.conv2.weight", + "first_stage_model.decoder.up.1.block.0.conv1.bias": "blocks.19.conv1.bias", + "first_stage_model.decoder.up.1.block.0.conv1.weight": "blocks.19.conv1.weight", + "first_stage_model.decoder.up.1.block.0.conv2.bias": "blocks.19.conv2.bias", + "first_stage_model.decoder.up.1.block.0.conv2.weight": "blocks.19.conv2.weight", + "first_stage_model.decoder.up.1.block.0.mix_factor": "blocks.20.mix_factor", + "first_stage_model.decoder.up.1.block.0.nin_shortcut.bias": "blocks.19.conv_shortcut.bias", + "first_stage_model.decoder.up.1.block.0.nin_shortcut.weight": "blocks.19.conv_shortcut.weight", + 
"first_stage_model.decoder.up.1.block.0.norm1.bias": "blocks.19.norm1.bias", + "first_stage_model.decoder.up.1.block.0.norm1.weight": "blocks.19.norm1.weight", + "first_stage_model.decoder.up.1.block.0.norm2.bias": "blocks.19.norm2.bias", + "first_stage_model.decoder.up.1.block.0.norm2.weight": "blocks.19.norm2.weight", + "first_stage_model.decoder.up.1.block.0.time_stack.in_layers.0.bias": "blocks.20.norm1.bias", + "first_stage_model.decoder.up.1.block.0.time_stack.in_layers.0.weight": "blocks.20.norm1.weight", + "first_stage_model.decoder.up.1.block.0.time_stack.in_layers.2.bias": "blocks.20.conv1.bias", + "first_stage_model.decoder.up.1.block.0.time_stack.in_layers.2.weight": "blocks.20.conv1.weight", + "first_stage_model.decoder.up.1.block.0.time_stack.out_layers.0.bias": "blocks.20.norm2.bias", + "first_stage_model.decoder.up.1.block.0.time_stack.out_layers.0.weight": "blocks.20.norm2.weight", + "first_stage_model.decoder.up.1.block.0.time_stack.out_layers.3.bias": "blocks.20.conv2.bias", + "first_stage_model.decoder.up.1.block.0.time_stack.out_layers.3.weight": "blocks.20.conv2.weight", + "first_stage_model.decoder.up.1.block.1.conv1.bias": "blocks.21.conv1.bias", + "first_stage_model.decoder.up.1.block.1.conv1.weight": "blocks.21.conv1.weight", + "first_stage_model.decoder.up.1.block.1.conv2.bias": "blocks.21.conv2.bias", + "first_stage_model.decoder.up.1.block.1.conv2.weight": "blocks.21.conv2.weight", + "first_stage_model.decoder.up.1.block.1.mix_factor": "blocks.22.mix_factor", + "first_stage_model.decoder.up.1.block.1.norm1.bias": "blocks.21.norm1.bias", + "first_stage_model.decoder.up.1.block.1.norm1.weight": "blocks.21.norm1.weight", + "first_stage_model.decoder.up.1.block.1.norm2.bias": "blocks.21.norm2.bias", + "first_stage_model.decoder.up.1.block.1.norm2.weight": "blocks.21.norm2.weight", + "first_stage_model.decoder.up.1.block.1.time_stack.in_layers.0.bias": "blocks.22.norm1.bias", + "first_stage_model.decoder.up.1.block.1.time_stack.in_layers.0.weight": "blocks.22.norm1.weight", + "first_stage_model.decoder.up.1.block.1.time_stack.in_layers.2.bias": "blocks.22.conv1.bias", + "first_stage_model.decoder.up.1.block.1.time_stack.in_layers.2.weight": "blocks.22.conv1.weight", + "first_stage_model.decoder.up.1.block.1.time_stack.out_layers.0.bias": "blocks.22.norm2.bias", + "first_stage_model.decoder.up.1.block.1.time_stack.out_layers.0.weight": "blocks.22.norm2.weight", + "first_stage_model.decoder.up.1.block.1.time_stack.out_layers.3.bias": "blocks.22.conv2.bias", + "first_stage_model.decoder.up.1.block.1.time_stack.out_layers.3.weight": "blocks.22.conv2.weight", + "first_stage_model.decoder.up.1.block.2.conv1.bias": "blocks.23.conv1.bias", + "first_stage_model.decoder.up.1.block.2.conv1.weight": "blocks.23.conv1.weight", + "first_stage_model.decoder.up.1.block.2.conv2.bias": "blocks.23.conv2.bias", + "first_stage_model.decoder.up.1.block.2.conv2.weight": "blocks.23.conv2.weight", + "first_stage_model.decoder.up.1.block.2.mix_factor": "blocks.24.mix_factor", + "first_stage_model.decoder.up.1.block.2.norm1.bias": "blocks.23.norm1.bias", + "first_stage_model.decoder.up.1.block.2.norm1.weight": "blocks.23.norm1.weight", + "first_stage_model.decoder.up.1.block.2.norm2.bias": "blocks.23.norm2.bias", + "first_stage_model.decoder.up.1.block.2.norm2.weight": "blocks.23.norm2.weight", + "first_stage_model.decoder.up.1.block.2.time_stack.in_layers.0.bias": "blocks.24.norm1.bias", + "first_stage_model.decoder.up.1.block.2.time_stack.in_layers.0.weight": "blocks.24.norm1.weight", + 
"first_stage_model.decoder.up.1.block.2.time_stack.in_layers.2.bias": "blocks.24.conv1.bias", + "first_stage_model.decoder.up.1.block.2.time_stack.in_layers.2.weight": "blocks.24.conv1.weight", + "first_stage_model.decoder.up.1.block.2.time_stack.out_layers.0.bias": "blocks.24.norm2.bias", + "first_stage_model.decoder.up.1.block.2.time_stack.out_layers.0.weight": "blocks.24.norm2.weight", + "first_stage_model.decoder.up.1.block.2.time_stack.out_layers.3.bias": "blocks.24.conv2.bias", + "first_stage_model.decoder.up.1.block.2.time_stack.out_layers.3.weight": "blocks.24.conv2.weight", + "first_stage_model.decoder.up.1.upsample.conv.bias": "blocks.25.conv.bias", + "first_stage_model.decoder.up.1.upsample.conv.weight": "blocks.25.conv.weight", + "first_stage_model.decoder.up.2.block.0.conv1.bias": "blocks.12.conv1.bias", + "first_stage_model.decoder.up.2.block.0.conv1.weight": "blocks.12.conv1.weight", + "first_stage_model.decoder.up.2.block.0.conv2.bias": "blocks.12.conv2.bias", + "first_stage_model.decoder.up.2.block.0.conv2.weight": "blocks.12.conv2.weight", + "first_stage_model.decoder.up.2.block.0.mix_factor": "blocks.13.mix_factor", + "first_stage_model.decoder.up.2.block.0.norm1.bias": "blocks.12.norm1.bias", + "first_stage_model.decoder.up.2.block.0.norm1.weight": "blocks.12.norm1.weight", + "first_stage_model.decoder.up.2.block.0.norm2.bias": "blocks.12.norm2.bias", + "first_stage_model.decoder.up.2.block.0.norm2.weight": "blocks.12.norm2.weight", + "first_stage_model.decoder.up.2.block.0.time_stack.in_layers.0.bias": "blocks.13.norm1.bias", + "first_stage_model.decoder.up.2.block.0.time_stack.in_layers.0.weight": "blocks.13.norm1.weight", + "first_stage_model.decoder.up.2.block.0.time_stack.in_layers.2.bias": "blocks.13.conv1.bias", + "first_stage_model.decoder.up.2.block.0.time_stack.in_layers.2.weight": "blocks.13.conv1.weight", + "first_stage_model.decoder.up.2.block.0.time_stack.out_layers.0.bias": "blocks.13.norm2.bias", + "first_stage_model.decoder.up.2.block.0.time_stack.out_layers.0.weight": "blocks.13.norm2.weight", + "first_stage_model.decoder.up.2.block.0.time_stack.out_layers.3.bias": "blocks.13.conv2.bias", + "first_stage_model.decoder.up.2.block.0.time_stack.out_layers.3.weight": "blocks.13.conv2.weight", + "first_stage_model.decoder.up.2.block.1.conv1.bias": "blocks.14.conv1.bias", + "first_stage_model.decoder.up.2.block.1.conv1.weight": "blocks.14.conv1.weight", + "first_stage_model.decoder.up.2.block.1.conv2.bias": "blocks.14.conv2.bias", + "first_stage_model.decoder.up.2.block.1.conv2.weight": "blocks.14.conv2.weight", + "first_stage_model.decoder.up.2.block.1.mix_factor": "blocks.15.mix_factor", + "first_stage_model.decoder.up.2.block.1.norm1.bias": "blocks.14.norm1.bias", + "first_stage_model.decoder.up.2.block.1.norm1.weight": "blocks.14.norm1.weight", + "first_stage_model.decoder.up.2.block.1.norm2.bias": "blocks.14.norm2.bias", + "first_stage_model.decoder.up.2.block.1.norm2.weight": "blocks.14.norm2.weight", + "first_stage_model.decoder.up.2.block.1.time_stack.in_layers.0.bias": "blocks.15.norm1.bias", + "first_stage_model.decoder.up.2.block.1.time_stack.in_layers.0.weight": "blocks.15.norm1.weight", + "first_stage_model.decoder.up.2.block.1.time_stack.in_layers.2.bias": "blocks.15.conv1.bias", + "first_stage_model.decoder.up.2.block.1.time_stack.in_layers.2.weight": "blocks.15.conv1.weight", + "first_stage_model.decoder.up.2.block.1.time_stack.out_layers.0.bias": "blocks.15.norm2.bias", + 
"first_stage_model.decoder.up.2.block.1.time_stack.out_layers.0.weight": "blocks.15.norm2.weight", + "first_stage_model.decoder.up.2.block.1.time_stack.out_layers.3.bias": "blocks.15.conv2.bias", + "first_stage_model.decoder.up.2.block.1.time_stack.out_layers.3.weight": "blocks.15.conv2.weight", + "first_stage_model.decoder.up.2.block.2.conv1.bias": "blocks.16.conv1.bias", + "first_stage_model.decoder.up.2.block.2.conv1.weight": "blocks.16.conv1.weight", + "first_stage_model.decoder.up.2.block.2.conv2.bias": "blocks.16.conv2.bias", + "first_stage_model.decoder.up.2.block.2.conv2.weight": "blocks.16.conv2.weight", + "first_stage_model.decoder.up.2.block.2.mix_factor": "blocks.17.mix_factor", + "first_stage_model.decoder.up.2.block.2.norm1.bias": "blocks.16.norm1.bias", + "first_stage_model.decoder.up.2.block.2.norm1.weight": "blocks.16.norm1.weight", + "first_stage_model.decoder.up.2.block.2.norm2.bias": "blocks.16.norm2.bias", + "first_stage_model.decoder.up.2.block.2.norm2.weight": "blocks.16.norm2.weight", + "first_stage_model.decoder.up.2.block.2.time_stack.in_layers.0.bias": "blocks.17.norm1.bias", + "first_stage_model.decoder.up.2.block.2.time_stack.in_layers.0.weight": "blocks.17.norm1.weight", + "first_stage_model.decoder.up.2.block.2.time_stack.in_layers.2.bias": "blocks.17.conv1.bias", + "first_stage_model.decoder.up.2.block.2.time_stack.in_layers.2.weight": "blocks.17.conv1.weight", + "first_stage_model.decoder.up.2.block.2.time_stack.out_layers.0.bias": "blocks.17.norm2.bias", + "first_stage_model.decoder.up.2.block.2.time_stack.out_layers.0.weight": "blocks.17.norm2.weight", + "first_stage_model.decoder.up.2.block.2.time_stack.out_layers.3.bias": "blocks.17.conv2.bias", + "first_stage_model.decoder.up.2.block.2.time_stack.out_layers.3.weight": "blocks.17.conv2.weight", + "first_stage_model.decoder.up.2.upsample.conv.bias": "blocks.18.conv.bias", + "first_stage_model.decoder.up.2.upsample.conv.weight": "blocks.18.conv.weight", + "first_stage_model.decoder.up.3.block.0.conv1.bias": "blocks.5.conv1.bias", + "first_stage_model.decoder.up.3.block.0.conv1.weight": "blocks.5.conv1.weight", + "first_stage_model.decoder.up.3.block.0.conv2.bias": "blocks.5.conv2.bias", + "first_stage_model.decoder.up.3.block.0.conv2.weight": "blocks.5.conv2.weight", + "first_stage_model.decoder.up.3.block.0.mix_factor": "blocks.6.mix_factor", + "first_stage_model.decoder.up.3.block.0.norm1.bias": "blocks.5.norm1.bias", + "first_stage_model.decoder.up.3.block.0.norm1.weight": "blocks.5.norm1.weight", + "first_stage_model.decoder.up.3.block.0.norm2.bias": "blocks.5.norm2.bias", + "first_stage_model.decoder.up.3.block.0.norm2.weight": "blocks.5.norm2.weight", + "first_stage_model.decoder.up.3.block.0.time_stack.in_layers.0.bias": "blocks.6.norm1.bias", + "first_stage_model.decoder.up.3.block.0.time_stack.in_layers.0.weight": "blocks.6.norm1.weight", + "first_stage_model.decoder.up.3.block.0.time_stack.in_layers.2.bias": "blocks.6.conv1.bias", + "first_stage_model.decoder.up.3.block.0.time_stack.in_layers.2.weight": "blocks.6.conv1.weight", + "first_stage_model.decoder.up.3.block.0.time_stack.out_layers.0.bias": "blocks.6.norm2.bias", + "first_stage_model.decoder.up.3.block.0.time_stack.out_layers.0.weight": "blocks.6.norm2.weight", + "first_stage_model.decoder.up.3.block.0.time_stack.out_layers.3.bias": "blocks.6.conv2.bias", + "first_stage_model.decoder.up.3.block.0.time_stack.out_layers.3.weight": "blocks.6.conv2.weight", + "first_stage_model.decoder.up.3.block.1.conv1.bias": "blocks.7.conv1.bias", + 
"first_stage_model.decoder.up.3.block.1.conv1.weight": "blocks.7.conv1.weight", + "first_stage_model.decoder.up.3.block.1.conv2.bias": "blocks.7.conv2.bias", + "first_stage_model.decoder.up.3.block.1.conv2.weight": "blocks.7.conv2.weight", + "first_stage_model.decoder.up.3.block.1.mix_factor": "blocks.8.mix_factor", + "first_stage_model.decoder.up.3.block.1.norm1.bias": "blocks.7.norm1.bias", + "first_stage_model.decoder.up.3.block.1.norm1.weight": "blocks.7.norm1.weight", + "first_stage_model.decoder.up.3.block.1.norm2.bias": "blocks.7.norm2.bias", + "first_stage_model.decoder.up.3.block.1.norm2.weight": "blocks.7.norm2.weight", + "first_stage_model.decoder.up.3.block.1.time_stack.in_layers.0.bias": "blocks.8.norm1.bias", + "first_stage_model.decoder.up.3.block.1.time_stack.in_layers.0.weight": "blocks.8.norm1.weight", + "first_stage_model.decoder.up.3.block.1.time_stack.in_layers.2.bias": "blocks.8.conv1.bias", + "first_stage_model.decoder.up.3.block.1.time_stack.in_layers.2.weight": "blocks.8.conv1.weight", + "first_stage_model.decoder.up.3.block.1.time_stack.out_layers.0.bias": "blocks.8.norm2.bias", + "first_stage_model.decoder.up.3.block.1.time_stack.out_layers.0.weight": "blocks.8.norm2.weight", + "first_stage_model.decoder.up.3.block.1.time_stack.out_layers.3.bias": "blocks.8.conv2.bias", + "first_stage_model.decoder.up.3.block.1.time_stack.out_layers.3.weight": "blocks.8.conv2.weight", + "first_stage_model.decoder.up.3.block.2.conv1.bias": "blocks.9.conv1.bias", + "first_stage_model.decoder.up.3.block.2.conv1.weight": "blocks.9.conv1.weight", + "first_stage_model.decoder.up.3.block.2.conv2.bias": "blocks.9.conv2.bias", + "first_stage_model.decoder.up.3.block.2.conv2.weight": "blocks.9.conv2.weight", + "first_stage_model.decoder.up.3.block.2.mix_factor": "blocks.10.mix_factor", + "first_stage_model.decoder.up.3.block.2.norm1.bias": "blocks.9.norm1.bias", + "first_stage_model.decoder.up.3.block.2.norm1.weight": "blocks.9.norm1.weight", + "first_stage_model.decoder.up.3.block.2.norm2.bias": "blocks.9.norm2.bias", + "first_stage_model.decoder.up.3.block.2.norm2.weight": "blocks.9.norm2.weight", + "first_stage_model.decoder.up.3.block.2.time_stack.in_layers.0.bias": "blocks.10.norm1.bias", + "first_stage_model.decoder.up.3.block.2.time_stack.in_layers.0.weight": "blocks.10.norm1.weight", + "first_stage_model.decoder.up.3.block.2.time_stack.in_layers.2.bias": "blocks.10.conv1.bias", + "first_stage_model.decoder.up.3.block.2.time_stack.in_layers.2.weight": "blocks.10.conv1.weight", + "first_stage_model.decoder.up.3.block.2.time_stack.out_layers.0.bias": "blocks.10.norm2.bias", + "first_stage_model.decoder.up.3.block.2.time_stack.out_layers.0.weight": "blocks.10.norm2.weight", + "first_stage_model.decoder.up.3.block.2.time_stack.out_layers.3.bias": "blocks.10.conv2.bias", + "first_stage_model.decoder.up.3.block.2.time_stack.out_layers.3.weight": "blocks.10.conv2.weight", + "first_stage_model.decoder.up.3.upsample.conv.bias": "blocks.11.conv.bias", + "first_stage_model.decoder.up.3.upsample.conv.weight": "blocks.11.conv.weight", + } + state_dict_ = {} + for name in state_dict: + if name in rename_dict: + param = state_dict[name] + if "blocks.2.transformer_blocks.0" in rename_dict[name]: + param = param.squeeze() + state_dict_[rename_dict[name]] = param + return state_dict_ diff --git a/diffsynth/models/svd_vae_encoder.py b/diffsynth/models/svd_vae_encoder.py new file mode 100644 index 0000000..3e84a59 --- /dev/null +++ b/diffsynth/models/svd_vae_encoder.py @@ -0,0 +1,138 @@ +from 
.sd_vae_encoder import SDVAEEncoderStateDictConverter, SDVAEEncoder + + +class SVDVAEEncoder(SDVAEEncoder): + def __init__(self): + super().__init__() + self.scaling_factor = 0.18215 # SVD's VAE keeps the SD-family scaling factor; 0.13025 is the SDXL value + + def state_dict_converter(self): + return SVDVAEEncoderStateDictConverter() + + +class SVDVAEEncoderStateDictConverter(SDVAEEncoderStateDictConverter): + def __init__(self): + super().__init__() + + def from_diffusers(self, state_dict): + return super().from_diffusers(state_dict) + + def from_civitai(self, state_dict): + rename_dict = { + "conditioner.embedders.3.encoder.encoder.conv_in.bias": "conv_in.bias", + "conditioner.embedders.3.encoder.encoder.conv_in.weight": "conv_in.weight", + "conditioner.embedders.3.encoder.encoder.conv_out.bias": "conv_out.bias", + "conditioner.embedders.3.encoder.encoder.conv_out.weight": "conv_out.weight", + "conditioner.embedders.3.encoder.encoder.down.0.block.0.conv1.bias": "blocks.0.conv1.bias", + "conditioner.embedders.3.encoder.encoder.down.0.block.0.conv1.weight": "blocks.0.conv1.weight", + "conditioner.embedders.3.encoder.encoder.down.0.block.0.conv2.bias": "blocks.0.conv2.bias", + "conditioner.embedders.3.encoder.encoder.down.0.block.0.conv2.weight": "blocks.0.conv2.weight", + "conditioner.embedders.3.encoder.encoder.down.0.block.0.norm1.bias": "blocks.0.norm1.bias", + "conditioner.embedders.3.encoder.encoder.down.0.block.0.norm1.weight": "blocks.0.norm1.weight", + "conditioner.embedders.3.encoder.encoder.down.0.block.0.norm2.bias": "blocks.0.norm2.bias", + "conditioner.embedders.3.encoder.encoder.down.0.block.0.norm2.weight": "blocks.0.norm2.weight", + "conditioner.embedders.3.encoder.encoder.down.0.block.1.conv1.bias": "blocks.1.conv1.bias", + "conditioner.embedders.3.encoder.encoder.down.0.block.1.conv1.weight": "blocks.1.conv1.weight", + "conditioner.embedders.3.encoder.encoder.down.0.block.1.conv2.bias": "blocks.1.conv2.bias", + "conditioner.embedders.3.encoder.encoder.down.0.block.1.conv2.weight": "blocks.1.conv2.weight", + "conditioner.embedders.3.encoder.encoder.down.0.block.1.norm1.bias": "blocks.1.norm1.bias", + "conditioner.embedders.3.encoder.encoder.down.0.block.1.norm1.weight": "blocks.1.norm1.weight", + "conditioner.embedders.3.encoder.encoder.down.0.block.1.norm2.bias": "blocks.1.norm2.bias", + "conditioner.embedders.3.encoder.encoder.down.0.block.1.norm2.weight": "blocks.1.norm2.weight", + "conditioner.embedders.3.encoder.encoder.down.0.downsample.conv.bias": "blocks.2.conv.bias", + "conditioner.embedders.3.encoder.encoder.down.0.downsample.conv.weight": "blocks.2.conv.weight", + "conditioner.embedders.3.encoder.encoder.down.1.block.0.conv1.bias": "blocks.3.conv1.bias", + "conditioner.embedders.3.encoder.encoder.down.1.block.0.conv1.weight": "blocks.3.conv1.weight", + "conditioner.embedders.3.encoder.encoder.down.1.block.0.conv2.bias": "blocks.3.conv2.bias", + "conditioner.embedders.3.encoder.encoder.down.1.block.0.conv2.weight": "blocks.3.conv2.weight", + "conditioner.embedders.3.encoder.encoder.down.1.block.0.nin_shortcut.bias": "blocks.3.conv_shortcut.bias", + "conditioner.embedders.3.encoder.encoder.down.1.block.0.nin_shortcut.weight": "blocks.3.conv_shortcut.weight", + "conditioner.embedders.3.encoder.encoder.down.1.block.0.norm1.bias": "blocks.3.norm1.bias", + "conditioner.embedders.3.encoder.encoder.down.1.block.0.norm1.weight": "blocks.3.norm1.weight", + "conditioner.embedders.3.encoder.encoder.down.1.block.0.norm2.bias": "blocks.3.norm2.bias", + "conditioner.embedders.3.encoder.encoder.down.1.block.0.norm2.weight": "blocks.3.norm2.weight", +
"conditioner.embedders.3.encoder.encoder.down.1.block.1.conv1.bias": "blocks.4.conv1.bias", + "conditioner.embedders.3.encoder.encoder.down.1.block.1.conv1.weight": "blocks.4.conv1.weight", + "conditioner.embedders.3.encoder.encoder.down.1.block.1.conv2.bias": "blocks.4.conv2.bias", + "conditioner.embedders.3.encoder.encoder.down.1.block.1.conv2.weight": "blocks.4.conv2.weight", + "conditioner.embedders.3.encoder.encoder.down.1.block.1.norm1.bias": "blocks.4.norm1.bias", + "conditioner.embedders.3.encoder.encoder.down.1.block.1.norm1.weight": "blocks.4.norm1.weight", + "conditioner.embedders.3.encoder.encoder.down.1.block.1.norm2.bias": "blocks.4.norm2.bias", + "conditioner.embedders.3.encoder.encoder.down.1.block.1.norm2.weight": "blocks.4.norm2.weight", + "conditioner.embedders.3.encoder.encoder.down.1.downsample.conv.bias": "blocks.5.conv.bias", + "conditioner.embedders.3.encoder.encoder.down.1.downsample.conv.weight": "blocks.5.conv.weight", + "conditioner.embedders.3.encoder.encoder.down.2.block.0.conv1.bias": "blocks.6.conv1.bias", + "conditioner.embedders.3.encoder.encoder.down.2.block.0.conv1.weight": "blocks.6.conv1.weight", + "conditioner.embedders.3.encoder.encoder.down.2.block.0.conv2.bias": "blocks.6.conv2.bias", + "conditioner.embedders.3.encoder.encoder.down.2.block.0.conv2.weight": "blocks.6.conv2.weight", + "conditioner.embedders.3.encoder.encoder.down.2.block.0.nin_shortcut.bias": "blocks.6.conv_shortcut.bias", + "conditioner.embedders.3.encoder.encoder.down.2.block.0.nin_shortcut.weight": "blocks.6.conv_shortcut.weight", + "conditioner.embedders.3.encoder.encoder.down.2.block.0.norm1.bias": "blocks.6.norm1.bias", + "conditioner.embedders.3.encoder.encoder.down.2.block.0.norm1.weight": "blocks.6.norm1.weight", + "conditioner.embedders.3.encoder.encoder.down.2.block.0.norm2.bias": "blocks.6.norm2.bias", + "conditioner.embedders.3.encoder.encoder.down.2.block.0.norm2.weight": "blocks.6.norm2.weight", + "conditioner.embedders.3.encoder.encoder.down.2.block.1.conv1.bias": "blocks.7.conv1.bias", + "conditioner.embedders.3.encoder.encoder.down.2.block.1.conv1.weight": "blocks.7.conv1.weight", + "conditioner.embedders.3.encoder.encoder.down.2.block.1.conv2.bias": "blocks.7.conv2.bias", + "conditioner.embedders.3.encoder.encoder.down.2.block.1.conv2.weight": "blocks.7.conv2.weight", + "conditioner.embedders.3.encoder.encoder.down.2.block.1.norm1.bias": "blocks.7.norm1.bias", + "conditioner.embedders.3.encoder.encoder.down.2.block.1.norm1.weight": "blocks.7.norm1.weight", + "conditioner.embedders.3.encoder.encoder.down.2.block.1.norm2.bias": "blocks.7.norm2.bias", + "conditioner.embedders.3.encoder.encoder.down.2.block.1.norm2.weight": "blocks.7.norm2.weight", + "conditioner.embedders.3.encoder.encoder.down.2.downsample.conv.bias": "blocks.8.conv.bias", + "conditioner.embedders.3.encoder.encoder.down.2.downsample.conv.weight": "blocks.8.conv.weight", + "conditioner.embedders.3.encoder.encoder.down.3.block.0.conv1.bias": "blocks.9.conv1.bias", + "conditioner.embedders.3.encoder.encoder.down.3.block.0.conv1.weight": "blocks.9.conv1.weight", + "conditioner.embedders.3.encoder.encoder.down.3.block.0.conv2.bias": "blocks.9.conv2.bias", + "conditioner.embedders.3.encoder.encoder.down.3.block.0.conv2.weight": "blocks.9.conv2.weight", + "conditioner.embedders.3.encoder.encoder.down.3.block.0.norm1.bias": "blocks.9.norm1.bias", + "conditioner.embedders.3.encoder.encoder.down.3.block.0.norm1.weight": "blocks.9.norm1.weight", + 
"conditioner.embedders.3.encoder.encoder.down.3.block.0.norm2.bias": "blocks.9.norm2.bias", + "conditioner.embedders.3.encoder.encoder.down.3.block.0.norm2.weight": "blocks.9.norm2.weight", + "conditioner.embedders.3.encoder.encoder.down.3.block.1.conv1.bias": "blocks.10.conv1.bias", + "conditioner.embedders.3.encoder.encoder.down.3.block.1.conv1.weight": "blocks.10.conv1.weight", + "conditioner.embedders.3.encoder.encoder.down.3.block.1.conv2.bias": "blocks.10.conv2.bias", + "conditioner.embedders.3.encoder.encoder.down.3.block.1.conv2.weight": "blocks.10.conv2.weight", + "conditioner.embedders.3.encoder.encoder.down.3.block.1.norm1.bias": "blocks.10.norm1.bias", + "conditioner.embedders.3.encoder.encoder.down.3.block.1.norm1.weight": "blocks.10.norm1.weight", + "conditioner.embedders.3.encoder.encoder.down.3.block.1.norm2.bias": "blocks.10.norm2.bias", + "conditioner.embedders.3.encoder.encoder.down.3.block.1.norm2.weight": "blocks.10.norm2.weight", + "conditioner.embedders.3.encoder.encoder.mid.attn_1.k.bias": "blocks.12.transformer_blocks.0.to_k.bias", + "conditioner.embedders.3.encoder.encoder.mid.attn_1.k.weight": "blocks.12.transformer_blocks.0.to_k.weight", + "conditioner.embedders.3.encoder.encoder.mid.attn_1.norm.bias": "blocks.12.norm.bias", + "conditioner.embedders.3.encoder.encoder.mid.attn_1.norm.weight": "blocks.12.norm.weight", + "conditioner.embedders.3.encoder.encoder.mid.attn_1.proj_out.bias": "blocks.12.transformer_blocks.0.to_out.bias", + "conditioner.embedders.3.encoder.encoder.mid.attn_1.proj_out.weight": "blocks.12.transformer_blocks.0.to_out.weight", + "conditioner.embedders.3.encoder.encoder.mid.attn_1.q.bias": "blocks.12.transformer_blocks.0.to_q.bias", + "conditioner.embedders.3.encoder.encoder.mid.attn_1.q.weight": "blocks.12.transformer_blocks.0.to_q.weight", + "conditioner.embedders.3.encoder.encoder.mid.attn_1.v.bias": "blocks.12.transformer_blocks.0.to_v.bias", + "conditioner.embedders.3.encoder.encoder.mid.attn_1.v.weight": "blocks.12.transformer_blocks.0.to_v.weight", + "conditioner.embedders.3.encoder.encoder.mid.block_1.conv1.bias": "blocks.11.conv1.bias", + "conditioner.embedders.3.encoder.encoder.mid.block_1.conv1.weight": "blocks.11.conv1.weight", + "conditioner.embedders.3.encoder.encoder.mid.block_1.conv2.bias": "blocks.11.conv2.bias", + "conditioner.embedders.3.encoder.encoder.mid.block_1.conv2.weight": "blocks.11.conv2.weight", + "conditioner.embedders.3.encoder.encoder.mid.block_1.norm1.bias": "blocks.11.norm1.bias", + "conditioner.embedders.3.encoder.encoder.mid.block_1.norm1.weight": "blocks.11.norm1.weight", + "conditioner.embedders.3.encoder.encoder.mid.block_1.norm2.bias": "blocks.11.norm2.bias", + "conditioner.embedders.3.encoder.encoder.mid.block_1.norm2.weight": "blocks.11.norm2.weight", + "conditioner.embedders.3.encoder.encoder.mid.block_2.conv1.bias": "blocks.13.conv1.bias", + "conditioner.embedders.3.encoder.encoder.mid.block_2.conv1.weight": "blocks.13.conv1.weight", + "conditioner.embedders.3.encoder.encoder.mid.block_2.conv2.bias": "blocks.13.conv2.bias", + "conditioner.embedders.3.encoder.encoder.mid.block_2.conv2.weight": "blocks.13.conv2.weight", + "conditioner.embedders.3.encoder.encoder.mid.block_2.norm1.bias": "blocks.13.norm1.bias", + "conditioner.embedders.3.encoder.encoder.mid.block_2.norm1.weight": "blocks.13.norm1.weight", + "conditioner.embedders.3.encoder.encoder.mid.block_2.norm2.bias": "blocks.13.norm2.bias", + "conditioner.embedders.3.encoder.encoder.mid.block_2.norm2.weight": "blocks.13.norm2.weight", + 
"conditioner.embedders.3.encoder.encoder.norm_out.bias": "conv_norm_out.bias", + "conditioner.embedders.3.encoder.encoder.norm_out.weight": "conv_norm_out.weight", + "conditioner.embedders.3.encoder.quant_conv.bias": "quant_conv.bias", + "conditioner.embedders.3.encoder.quant_conv.weight": "quant_conv.weight", + } + state_dict_ = {} + for name in state_dict: + if name in rename_dict: + param = state_dict[name] + if "transformer_blocks" in rename_dict[name]: + param = param.squeeze() + state_dict_[rename_dict[name]] = param + return state_dict_ diff --git a/diffsynth/pipelines/__init__.py b/diffsynth/pipelines/__init__.py index 4f97931..80dda9f 100644 --- a/diffsynth/pipelines/__init__.py +++ b/diffsynth/pipelines/__init__.py @@ -1,3 +1,4 @@ from .stable_diffusion import SDImagePipeline from .stable_diffusion_xl import SDXLImagePipeline from .stable_diffusion_video import SDVideoPipeline, SDVideoPipelineRunner +from .stable_video_diffusion import SVDVideoPipeline diff --git a/diffsynth/pipelines/stable_video_diffusion.py b/diffsynth/pipelines/stable_video_diffusion.py new file mode 100644 index 0000000..2bf5197 --- /dev/null +++ b/diffsynth/pipelines/stable_video_diffusion.py @@ -0,0 +1,289 @@ +from ..models import ModelManager, SVDImageEncoder, SVDUNet, SVDVAEEncoder, SVDVAEDecoder +from ..schedulers import ContinuousODEScheduler +from ..data import save_video +import torch +from tqdm import tqdm +from PIL import Image +import numpy as np +from einops import rearrange, repeat + + + +class SVDVideoPipeline(torch.nn.Module): + + def __init__(self, device="cuda", torch_dtype=torch.float16): + super().__init__() + self.scheduler = ContinuousODEScheduler() + self.device = device + self.torch_dtype = torch_dtype + # models + self.image_encoder: SVDImageEncoder = None + self.unet: SVDUNet = None + self.vae_encoder: SVDVAEEncoder = None + self.vae_decoder: SVDVAEDecoder = None + + + def fetch_main_models(self, model_manager: ModelManager): + self.image_encoder = model_manager.image_encoder + self.unet = model_manager.unet + self.vae_encoder = model_manager.vae_encoder + self.vae_decoder = model_manager.vae_decoder + + + @staticmethod + def from_model_manager(model_manager: ModelManager, **kwargs): + pipe = SVDVideoPipeline(device=model_manager.device, torch_dtype=model_manager.torch_dtype) + pipe.fetch_main_models(model_manager) + return pipe + + + def preprocess_image(self, image): + image = torch.Tensor(np.array(image, dtype=np.float32) * (2 / 255) - 1).permute(2, 0, 1).unsqueeze(0) + return image + + + def decode_image(self, latent, tiled=False, tile_size=64, tile_stride=32): + image = self.vae_decoder(latent.to(self.device), tiled=tiled, tile_size=tile_size, tile_stride=tile_stride)[0] + image = image.cpu().permute(1, 2, 0).numpy() + image = Image.fromarray(((image / 2 + 0.5).clip(0, 1) * 255).astype("uint8")) + return image + + + def encode_image_with_clip(self, image): + image = self.preprocess_image(image).to(device=self.device, dtype=self.torch_dtype) + image = SVDCLIPImageProcessor().resize_with_antialiasing(image, (224, 224)) + image = (image + 1.0) / 2.0 + mean = torch.tensor([0.48145466, 0.4578275, 0.40821073]).reshape(1, 3, 1, 1).to(device=self.device, dtype=self.torch_dtype) + std = torch.tensor([0.26862954, 0.26130258, 0.27577711]).reshape(1, 3, 1, 1).to(device=self.device, dtype=self.torch_dtype) + image = (image - mean) / std + image_emb = self.image_encoder(image) + return image_emb + + + def encode_image_with_vae(self, image, noise_aug_strength): + image = 
self.preprocess_image(image).to(device=self.device, dtype=self.torch_dtype) + noise = torch.randn(image.shape, device="cpu", dtype=self.torch_dtype).to(self.device) + image = image + noise_aug_strength * noise + image_emb = self.vae_encoder(image) / self.vae_encoder.scaling_factor + return image_emb + + + def encode_video_with_vae(self, video): + video = torch.concat([self.preprocess_image(frame) for frame in video], dim=0) + video = rearrange(video, "T C H W -> 1 C T H W") + video = video.to(device=self.device, dtype=self.torch_dtype) + latents = self.vae_encoder.encode_video(video) + latents = rearrange(latents[0], "C T H W -> T C H W") + return latents + + + def tensor2video(self, frames): + frames = rearrange(frames, "C T H W -> T H W C") + frames = ((frames.float() + 1) * 127.5).clip(0, 255).cpu().numpy().astype(np.uint8) + frames = [Image.fromarray(frame) for frame in frames] + return frames + + + def calculate_noise_pred( + self, + latents, + timestep, + add_time_id, + cfg_scales, + image_emb_vae_posi, image_emb_clip_posi, + image_emb_vae_nega, image_emb_clip_nega + ): + latents_input = self.scheduler.scale_model_input(latents, timestep) + + # Positive side + noise_pred_posi = self.unet( + torch.cat([latents_input, image_emb_vae_posi], dim=1), + timestep, image_emb_clip_posi, add_time_id + ) + # Negative side + noise_pred_nega = self.unet( + torch.cat([latents_input, image_emb_vae_nega], dim=1), + timestep, image_emb_clip_nega, add_time_id + ) + + # Classifier-free guidance + noise_pred = noise_pred_nega + cfg_scales * (noise_pred_posi - noise_pred_nega) + + return noise_pred + + + @torch.no_grad() + def __call__( + self, + input_image=None, + input_video=None, + min_cfg_scale=1.0, + max_cfg_scale=3.0, + denoising_strength=1.0, + num_frames=25, + height=576, + width=1024, + fps=7, + motion_bucket_id=127, + noise_aug_strength=0.02, + num_inference_steps=20, + progress_bar_cmd=tqdm, + progress_bar_st=None, + ): + # Prepare scheduler + self.scheduler.set_timesteps(num_inference_steps, denoising_strength=denoising_strength) + + # Prepare latent tensors + noise = torch.randn((num_frames, 4, height//8, width//8), device="cpu", dtype=self.torch_dtype).to(self.device) + if denoising_strength == 1.0: + latents = noise * self.scheduler.init_noise_sigma + else: + latents = self.encode_video_with_vae(input_video) + latents = self.scheduler.add_noise(latents, noise, self.scheduler.timesteps[0]) + + # Encode image + image_emb_clip_posi = self.encode_image_with_clip(input_image) + image_emb_clip_nega = torch.zeros_like(image_emb_clip_posi) + image_emb_vae_posi = repeat(self.encode_image_with_vae(input_image, noise_aug_strength), "B C H W -> (B T) C H W", T=num_frames) + image_emb_vae_nega = torch.zeros_like(image_emb_vae_posi) + + # Prepare classifier-free guidance + cfg_scales = torch.linspace(min_cfg_scale, max_cfg_scale, num_frames) + cfg_scales = cfg_scales.reshape(num_frames, 1, 1, 1).to(device=self.device, dtype=self.torch_dtype) + + # Prepare positional id + add_time_id = torch.tensor([[fps-1, motion_bucket_id, noise_aug_strength]], device=self.device) + + # Denoise + for progress_id, timestep in enumerate(progress_bar_cmd(self.scheduler.timesteps)): + + # Fetch model output + noise_pred = self.calculate_noise_pred( + latents, timestep, add_time_id, cfg_scales, + image_emb_vae_posi, image_emb_clip_posi, image_emb_vae_nega, image_emb_clip_nega + ) + + # Forward Euler + latents = self.scheduler.step(noise_pred, timestep, latents) + + # Update progress bar + if progress_bar_st is not None: + 
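# report the fraction of completed denoising steps to the optional Streamlit progress bar +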
progress_bar_st.progress(progress_id / len(self.scheduler.timesteps)) + + # Decode video + video = self.vae_decoder.decode_video(latents, progress_bar=progress_bar_cmd) + video = self.tensor2video(video) + + return video + + + +class SVDCLIPImageProcessor: + def __init__(self): + pass + + def resize_with_antialiasing(self, input, size, interpolation="bicubic", align_corners=True): + h, w = input.shape[-2:] + factors = (h / size[0], w / size[1]) + + # First, we have to determine sigma + # Taken from skimage: https://github.com/scikit-image/scikit-image/blob/v0.19.2/skimage/transform/_warps.py#L171 + sigmas = ( + max((factors[0] - 1.0) / 2.0, 0.001), + max((factors[1] - 1.0) / 2.0, 0.001), + ) + + # Now the kernel size. Good results are obtained with 3 sigma, but that is slow. Pillow uses 1 sigma + # https://github.com/python-pillow/Pillow/blob/master/src/libImaging/Resample.c#L206 + # but applies the filter in two passes, which gives better results. We use 2 sigmas as a compromise. + ks = int(max(2.0 * 2 * sigmas[0], 3)), int(max(2.0 * 2 * sigmas[1], 3)) + + # Make sure the kernel size is odd + if (ks[0] % 2) == 0: + ks = ks[0] + 1, ks[1] + + if (ks[1] % 2) == 0: + ks = ks[0], ks[1] + 1 + + input = self._gaussian_blur2d(input, ks, sigmas) + + output = torch.nn.functional.interpolate(input, size=size, mode=interpolation, align_corners=align_corners) + return output + + + def _compute_padding(self, kernel_size): + """Compute padding tuple.""" + # 4 or 6 ints: (padding_left, padding_right, padding_top, padding_bottom) + # https://pytorch.org/docs/stable/nn.html#torch.nn.functional.pad + if len(kernel_size) < 2: + raise AssertionError(kernel_size) + computed = [k - 1 for k in kernel_size] + + # for even kernels we need to do asymmetric padding + out_padding = 2 * len(kernel_size) * [0] + + for i in range(len(kernel_size)): + computed_tmp = computed[-(i + 1)] + + pad_front = computed_tmp // 2 + pad_rear = computed_tmp - pad_front + + out_padding[2 * i + 0] = pad_front + out_padding[2 * i + 1] = pad_rear + + return out_padding + + + def _filter2d(self, input, kernel): + # prepare kernel + b, c, h, w = input.shape + tmp_kernel = kernel[:, None, ...].to(device=input.device, dtype=input.dtype) + + tmp_kernel = tmp_kernel.expand(-1, c, -1, -1) + + height, width = tmp_kernel.shape[-2:] + + padding_shape: list[int] = self._compute_padding([height, width]) + input = torch.nn.functional.pad(input, padding_shape, mode="reflect") + + # reshape the kernel and input so that each channel group aligns with its own kernel + tmp_kernel = tmp_kernel.reshape(-1, 1, height, width) + input = input.view(-1, tmp_kernel.size(0), input.size(-2), input.size(-1)) + + # convolve the tensor with the kernel.
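+ # grouped convolution: each kernel filters its own group of input channels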
+ output = torch.nn.functional.conv2d(input, tmp_kernel, groups=tmp_kernel.size(0), padding=0, stride=1) + + out = output.view(b, c, h, w) + return out + + + def _gaussian(self, window_size: int, sigma): + if isinstance(sigma, float): + sigma = torch.tensor([[sigma]]) + + batch_size = sigma.shape[0] + + x = (torch.arange(window_size, device=sigma.device, dtype=sigma.dtype) - window_size // 2).expand(batch_size, -1) + + if window_size % 2 == 0: + x = x + 0.5 + + gauss = torch.exp(-x.pow(2.0) / (2 * sigma.pow(2.0))) + + return gauss / gauss.sum(-1, keepdim=True) + + + def _gaussian_blur2d(self, input, kernel_size, sigma): + if isinstance(sigma, tuple): + sigma = torch.tensor([sigma], dtype=input.dtype) + else: + sigma = sigma.to(dtype=input.dtype) + + ky, kx = int(kernel_size[0]), int(kernel_size[1]) + bs = sigma.shape[0] + kernel_x = self._gaussian(kx, sigma[:, 1].view(bs, 1)) + kernel_y = self._gaussian(ky, sigma[:, 0].view(bs, 1)) + out_x = self._filter2d(input, kernel_x[..., None, :]) + out = self._filter2d(out_x, kernel_y[..., None]) + + return out diff --git a/diffsynth/schedulers/__init__.py b/diffsynth/schedulers/__init__.py index 303fffe..1620e13 100644 --- a/diffsynth/schedulers/__init__.py +++ b/diffsynth/schedulers/__init__.py @@ -1,65 +1,2 @@ -import torch, math - - -class EnhancedDDIMScheduler(): - - def __init__(self, num_train_timesteps=1000, beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"): - self.num_train_timesteps = num_train_timesteps - if beta_schedule == "scaled_linear": - betas = torch.square(torch.linspace(math.sqrt(beta_start), math.sqrt(beta_end), num_train_timesteps, dtype=torch.float32)) - elif beta_schedule == "linear": - betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) - else: - raise NotImplementedError(f"{beta_schedule} is not implemented") - self.alphas_cumprod = torch.cumprod(1.0 - betas, dim=0).tolist() - self.set_timesteps(10) - - - def set_timesteps(self, num_inference_steps, denoising_strength=1.0): - # The timesteps are aligned to 999...0, which is different from other implementations, - # but I think this implementation is more reasonable in theory. 
- max_timestep = max(round(self.num_train_timesteps * denoising_strength) - 1, 0) - num_inference_steps = min(num_inference_steps, max_timestep + 1) - if num_inference_steps == 1: - self.timesteps = [max_timestep] - else: - step_length = max_timestep / (num_inference_steps - 1) - self.timesteps = [round(max_timestep - i*step_length) for i in range(num_inference_steps)] - - - def denoise(self, model_output, sample, alpha_prod_t, alpha_prod_t_prev): - weight_e = math.sqrt(1 - alpha_prod_t_prev) - math.sqrt(alpha_prod_t_prev * (1 - alpha_prod_t) / alpha_prod_t) - weight_x = math.sqrt(alpha_prod_t_prev / alpha_prod_t) - - prev_sample = sample * weight_x + model_output * weight_e - - weight_e = -math.sqrt((1 - alpha_prod_t) / alpha_prod_t) - weight_x = math.sqrt(1 / alpha_prod_t) - - return prev_sample - - - def step(self, model_output, timestep, sample, to_final=False): - alpha_prod_t = self.alphas_cumprod[timestep] - timestep_id = self.timesteps.index(timestep) - if to_final or timestep_id + 1 >= len(self.timesteps): - alpha_prod_t_prev = 1.0 - else: - timestep_prev = self.timesteps[timestep_id + 1] - alpha_prod_t_prev = self.alphas_cumprod[timestep_prev] - - return self.denoise(model_output, sample, alpha_prod_t, alpha_prod_t_prev) - - - def return_to_timestep(self, timestep, sample, sample_stablized): - alpha_prod_t = self.alphas_cumprod[timestep] - noise_pred = (sample - math.sqrt(alpha_prod_t) * sample_stablized) / math.sqrt(1 - alpha_prod_t) - return noise_pred - - - def add_noise(self, original_samples, noise, timestep): - sqrt_alpha_prod = math.sqrt(self.alphas_cumprod[timestep]) - sqrt_one_minus_alpha_prod = math.sqrt(1 - self.alphas_cumprod[timestep]) - noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise - return noisy_samples - +from .ddim import EnhancedDDIMScheduler +from .continuous_ode import ContinuousODEScheduler diff --git a/diffsynth/schedulers/continuous_ode.py b/diffsynth/schedulers/continuous_ode.py new file mode 100644 index 0000000..bbe57f9 --- /dev/null +++ b/diffsynth/schedulers/continuous_ode.py @@ -0,0 +1,52 @@ +import torch, math + + +class ContinuousODEScheduler(): + + def __init__(self, num_inference_steps=100, sigma_max=700.0, sigma_min=0.002, rho=7.0): + self.sigma_max = sigma_max + self.sigma_min = sigma_min + self.rho = rho + self.init_noise_sigma = math.sqrt(sigma_max*sigma_max + 1) + self.set_timesteps(num_inference_steps) + + + def set_timesteps(self, num_inference_steps=100, denoising_strength=1.0): + # Karras-style rho schedule; denoising_strength < 1 starts from an intermediate sigma instead of sigma_max + ramp = torch.linspace(1 - denoising_strength, 1, num_inference_steps) + min_inv_rho = torch.pow(torch.tensor((self.sigma_min,)), (1 / self.rho)) + max_inv_rho = torch.pow(torch.tensor((self.sigma_max,)), (1 / self.rho)) + self.sigmas = torch.pow(max_inv_rho + ramp * (min_inv_rho - max_inv_rho), self.rho) + self.timesteps = torch.log(self.sigmas) * 0.25 # c_noise = ln(sigma) / 4, the noise conditioning used by SVD + + + def step(self, model_output, timestep, sample, to_final=False): + timestep_id = torch.argmin((self.timesteps - timestep).abs()) + sigma = self.sigmas[timestep_id] + # recover the denoised estimate from the model output (EDM-style preconditioning) + estimated_sample = -sigma / (sigma*sigma + 1).sqrt() * model_output + 1 / (sigma*sigma + 1) * sample + if to_final or timestep_id + 1 >= len(self.timesteps): + prev_sample = estimated_sample + else: + dt = self.sigmas[timestep_id + 1] - sigma + derivative = 1 / sigma * (sample - estimated_sample) + prev_sample = sample + derivative * dt + return prev_sample + + + def scale_model_input(self, sample, timestep): + timestep_id = torch.argmin((self.timesteps - timestep).abs()) + sigma = self.sigmas[timestep_id] +
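# EDM-style input preconditioning: the UNet expects the noisy sample scaled by 1 / sqrt(sigma^2 + 1) +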
sample = sample / (sigma*sigma + 1).sqrt() + return sample + + + def return_to_timestep(self, timestep, sample, sample_stablized): + # Inverting a sample back to a given timestep is not supported by this scheduler. + pass + + + def add_noise(self, original_samples, noise, timestep): + timestep_id = torch.argmin((self.timesteps - timestep).abs()) + sigma = self.sigmas[timestep_id] + sample = original_samples + noise * sigma # variance-exploding forward process: x_t = x_0 + sigma * noise + return sample + diff --git a/diffsynth/schedulers/ddim.py b/diffsynth/schedulers/ddim.py new file mode 100644 index 0000000..8bfcee6 --- /dev/null +++ b/diffsynth/schedulers/ddim.py @@ -0,0 +1,60 @@ +import torch, math + + +class EnhancedDDIMScheduler(): + + def __init__(self, num_train_timesteps=1000, beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"): + self.num_train_timesteps = num_train_timesteps + if beta_schedule == "scaled_linear": + betas = torch.square(torch.linspace(math.sqrt(beta_start), math.sqrt(beta_end), num_train_timesteps, dtype=torch.float32)) + elif beta_schedule == "linear": + betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + else: + raise NotImplementedError(f"{beta_schedule} is not implemented") + self.alphas_cumprod = torch.cumprod(1.0 - betas, dim=0).tolist() + self.set_timesteps(10) + + + def set_timesteps(self, num_inference_steps, denoising_strength=1.0): + # The timesteps are aligned to 999...0, which is different from other implementations, + # but I think this implementation is more reasonable in theory. + max_timestep = max(round(self.num_train_timesteps * denoising_strength) - 1, 0) + num_inference_steps = min(num_inference_steps, max_timestep + 1) + if num_inference_steps == 1: + self.timesteps = [max_timestep] + else: + step_length = max_timestep / (num_inference_steps - 1) + self.timesteps = [round(max_timestep - i*step_length) for i in range(num_inference_steps)] + + + def denoise(self, model_output, sample, alpha_prod_t, alpha_prod_t_prev): + weight_e = math.sqrt(1 - alpha_prod_t_prev) - math.sqrt(alpha_prod_t_prev * (1 - alpha_prod_t) / alpha_prod_t) + weight_x = math.sqrt(alpha_prod_t_prev / alpha_prod_t) + prev_sample = sample * weight_x + model_output * weight_e + return prev_sample + + + def step(self, model_output, timestep, sample, to_final=False): + alpha_prod_t = self.alphas_cumprod[timestep] + timestep_id = self.timesteps.index(timestep) + if to_final or timestep_id + 1 >= len(self.timesteps): + alpha_prod_t_prev = 1.0 + else: + timestep_prev = self.timesteps[timestep_id + 1] + alpha_prod_t_prev = self.alphas_cumprod[timestep_prev] + + return self.denoise(model_output, sample, alpha_prod_t, alpha_prod_t_prev) + + + def return_to_timestep(self, timestep, sample, sample_stablized): + alpha_prod_t = self.alphas_cumprod[timestep] + noise_pred = (sample - math.sqrt(alpha_prod_t) * sample_stablized) / math.sqrt(1 - alpha_prod_t) + return noise_pred + + + def add_noise(self, original_samples, noise, timestep): + sqrt_alpha_prod = math.sqrt(self.alphas_cumprod[timestep]) + sqrt_one_minus_alpha_prod = math.sqrt(1 - self.alphas_cumprod[timestep]) + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise + return noisy_samples + diff --git a/examples/svd_text_to_video.py b/examples/svd_text_to_video.py new file mode 100644 index 0000000..b432f1f --- /dev/null +++ b/examples/svd_text_to_video.py @@ -0,0 +1,37 @@ +from diffsynth import save_video, SDXLImagePipeline, ModelManager, SVDVideoPipeline +import torch + + +# Download models +#
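Download the following checkpoints manually and place them at these paths before running this script: +#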
`models/stable_diffusion_xl/sd_xl_base_1.0.safetensors`: [link](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors) +# `models/stable_video_diffusion/svd_xt.safetensors`: [link](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt/resolve/main/svd_xt.safetensors) + + +prompt = "cloud, wind" +torch.manual_seed(0) + +# 1. Text-to-image using SD-XL +model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") +model_manager.load_models(["models/stable_diffusion_xl/sd_xl_base_1.0.safetensors"]) +pipe = SDXLImagePipeline.from_model_manager(model_manager) +image = pipe( + prompt=prompt, + negative_prompt="", + cfg_scale=6, + height=1024, width=1024, num_inference_steps=50, +) +pipe.to("cpu") +torch.cuda.empty_cache() + +# 2. Image-to-video using SVD +model_manager = ModelManager() +model_manager.load_models(["models/stable_video_diffusion/svd_xt.safetensors"]) +pipe = SVDVideoPipeline.from_model_manager(model_manager) +video = pipe( + input_image=image, + num_frames=25, fps=15, height=1024, width=1024, + motion_bucket_id=127, + num_inference_steps=50 +) +save_video(video, "video.mp4", fps=15) diff --git a/models/stable_video_diffusion/Put Stable Video Diffusion checkpoints here.txt b/models/stable_video_diffusion/Put Stable Video Diffusion checkpoints here.txt new file mode 100644 index 0000000..e69de29