mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-03-22 16:50:47 +00:00
update preference models
This commit is contained in:
@@ -0,0 +1,14 @@
|
||||
from .coca_model import CoCa
|
||||
from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
|
||||
from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer, create_loss
|
||||
from .factory import list_models, add_model_config, get_model_config, load_checkpoint
|
||||
from .loss import ClipLoss, DistillClipLoss, CoCaLoss
|
||||
from .model import CLIP, CustomTextCLIP, CLIPTextCfg, CLIPVisionCfg, \
|
||||
convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype
|
||||
from .openai import load_openai_model, list_openai_models
|
||||
from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, \
|
||||
get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained
|
||||
from .push_to_hf_hub import push_pretrained_to_hf_hub, push_to_hf_hub
|
||||
from .tokenizer import SimpleTokenizer
|
||||
from .transform import image_transform, AugmentationCfg
|
||||
from .utils import freeze_batch_norm_2d
|
||||
458
diffsynth/extensions/ImageQualityMetric/open_clip/coca_model.py
Normal file
458
diffsynth/extensions/ImageQualityMetric/open_clip/coca_model.py
Normal file
@@ -0,0 +1,458 @@
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
import numpy as np
|
||||
from dataclasses import dataclass
|
||||
|
||||
from .transformer import (
|
||||
LayerNormFp32,
|
||||
LayerNorm,
|
||||
QuickGELU,
|
||||
MultimodalTransformer,
|
||||
)
|
||||
from .model import CLIPTextCfg, CLIPVisionCfg, _build_vision_tower, _build_text_tower
|
||||
|
||||
# `transformers` is an optional dependency. The names imported here are only
# used by CoCa.generate() / CoCa._generate_beamsearch(), and generate() asserts
# on `_has_transformers` before touching any of them.
try:
    from transformers import (
        BeamSearchScorer,
        LogitsProcessorList,
        TopPLogitsWarper,
        TopKLogitsWarper,
        RepetitionPenaltyLogitsProcessor,
        MinLengthLogitsProcessor,
        MaxLengthCriteria,
        StoppingCriteriaList
    )

    # Maps a `generation_type` name to the logits-warper class used when
    # sampling. "beam_search" has no warper; the string is only a marker.
    GENERATION_TYPES = {
        "top_k": TopKLogitsWarper,
        "top_p": TopPLogitsWarper,
        "beam_search": "beam_search"
    }
    _has_transformers = True
except ImportError:
    # Keep the same keys so error messages listing the valid generation types
    # still work; the warper classes are simply unavailable.
    GENERATION_TYPES = {
        "top_k": None,
        "top_p": None,
        "beam_search": "beam_search"
    }
    _has_transformers = False
|
||||
|
||||
|
||||
@dataclass
class MultimodalCfg(CLIPTextCfg):
    """Configuration for the multimodal text decoder, extending `CLIPTextCfg`.

    `_build_text_decoder_tower` reads `context_length`, `width`, `layers` and
    `ls_init_value` (not declared here, so presumably inherited from
    `CLIPTextCfg` -- confirm) together with `heads` from this class.
    """
    # NOTE(review): not read anywhere in this file; presumably the MLP
    # expansion ratio of the decoder blocks -- confirm against the consumer.
    mlp_ratio: int = 4
    # NOTE(review): not read anywhere in this file; presumably per-head width.
    dim_head: int = 64
    # Number of attention heads passed to MultimodalTransformer.
    heads: int = 8
    # NOTE(review): not read anywhere in this file; looks like an
    # attentional-pooler setting -- confirm.
    n_queries: int = 256
    # NOTE(review): not read anywhere in this file; looks like an
    # attentional-pooler setting -- confirm.
    attn_pooler_heads: int = 8
|
||||
|
||||
|
||||
def _build_text_decoder_tower(
        embed_dim,
        multimodal_cfg,
        quick_gelu: bool = False,
        cast_dtype: Optional[torch.dtype] = None,
):
    """Build the MultimodalTransformer used as CoCa's text decoder.

    Args:
        embed_dim: Forwarded as the transformer's ``output_dim``.
        multimodal_cfg: A ``MultimodalCfg`` or a dict of its fields.
        quick_gelu: Use ``QuickGELU`` instead of ``nn.GELU`` as activation.
        cast_dtype: When fp16 or bf16, ``LayerNormFp32`` is used as the norm
            layer; otherwise ``LayerNorm``.

    Returns:
        The constructed ``MultimodalTransformer``.
    """
    if isinstance(multimodal_cfg, dict):
        multimodal_cfg = MultimodalCfg(**multimodal_cfg)

    if quick_gelu:
        activation = QuickGELU
    else:
        activation = nn.GELU

    low_precision = cast_dtype in (torch.float16, torch.bfloat16)
    normalization = LayerNormFp32 if low_precision else LayerNorm

    return MultimodalTransformer(
        context_length=multimodal_cfg.context_length,
        width=multimodal_cfg.width,
        heads=multimodal_cfg.heads,
        layers=multimodal_cfg.layers,
        ls_init_value=multimodal_cfg.ls_init_value,
        output_dim=embed_dim,
        act_layer=activation,
        norm_layer=normalization,
    )
|
||||
|
||||
|
||||
class CoCa(nn.Module):
    """Model combining a text tower, a vision tower and a multimodal text decoder.

    ``forward`` returns both the pooled image/text features and the decoder
    logits over the vocabulary; ``generate`` produces token sequences for
    images by sampling (top-k / top-p) or beam search.
    """

    def __init__(
            self,
            embed_dim,
            multimodal_cfg: MultimodalCfg,
            text_cfg: CLIPTextCfg,
            vision_cfg: CLIPVisionCfg,
            quick_gelu: bool = False,
            cast_dtype: Optional[torch.dtype] = None,
            pad_id: int = 0,
    ):
        """Build the three sub-networks.

        Args:
            embed_dim: Embedding size passed to the text and vision towers.
            multimodal_cfg: ``MultimodalCfg`` (or dict) for the text decoder.
            text_cfg: ``CLIPTextCfg`` (or dict) for the text tower.
            vision_cfg: ``CLIPVisionCfg`` (or dict) for the vision tower.
            quick_gelu: Forwarded to all three tower builders.
            cast_dtype: Forwarded to all three tower builders.
            pad_id: Default padding token id used by ``generate``.
        """
        super().__init__()
        multimodal_cfg = MultimodalCfg(**multimodal_cfg) if isinstance(multimodal_cfg, dict) else multimodal_cfg
        text_cfg = CLIPTextCfg(**text_cfg) if isinstance(text_cfg, dict) else text_cfg
        vision_cfg = CLIPVisionCfg(**vision_cfg) if isinstance(vision_cfg, dict) else vision_cfg

        self.text = _build_text_tower(
            embed_dim=embed_dim,
            text_cfg=text_cfg,
            quick_gelu=quick_gelu,
            cast_dtype=cast_dtype,
        )

        # The previous conditional on `hf_model_name` selected
        # text_cfg.vocab_size in both branches, so it is read directly.
        vocab_size = text_cfg.vocab_size

        self.visual = _build_vision_tower(
            embed_dim=embed_dim,
            vision_cfg=vision_cfg,
            quick_gelu=quick_gelu,
            cast_dtype=cast_dtype,
        )

        # The decoder's output dimension is the vocabulary size, so its
        # outputs are per-token logits.
        self.text_decoder = _build_text_decoder_tower(
            vocab_size,
            multimodal_cfg=multimodal_cfg,
            quick_gelu=quick_gelu,
            cast_dtype=cast_dtype,
        )

        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
        self.pad_id = pad_id

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        """Forward the gradient-checkpointing switch to all three sub-networks."""
        self.visual.set_grad_checkpointing(enable)
        self.text.set_grad_checkpointing(enable)
        self.text_decoder.set_grad_checkpointing(enable)

    def _encode_image(self, images, normalize=True):
        """Return ``(image_latent, tokens_embs)`` from the vision tower.

        Only the latent is L2-normalized (along the last dim) when
        ``normalize`` is true; the token embeddings are returned untouched.
        """
        image_latent, tokens_embs = self.visual(images)
        image_latent = F.normalize(image_latent, dim=-1) if normalize else image_latent
        return image_latent, tokens_embs

    def _encode_text(self, text, normalize=True, embed_cls=True):
        """Return ``(text_latent, token_emb)`` from the text tower.

        With ``embed_cls`` the last token column is dropped first, to make
        room for the CLS token.
        """
        text = text[:, :-1] if embed_cls else text  # make space for CLS token
        text_latent, token_emb = self.text(text)
        text_latent = F.normalize(text_latent, dim=-1) if normalize else text_latent
        return text_latent, token_emb

    def encode_image(self, images, normalize=True):
        """Return only the pooled image latent."""
        image_latent, _ = self._encode_image(images, normalize=normalize)
        return image_latent

    def encode_text(self, text, normalize=True, embed_cls=True):
        """Return only the pooled text latent."""
        text_latent, _ = self._encode_text(text, normalize=normalize, embed_cls=embed_cls)
        return text_latent

    def forward(self, image, text, embed_cls=True, image_latent=None, image_embs=None):
        """Encode the text (and the image unless cached) and run the decoder.

        Args:
            image: Image batch; only encoded when ``image_latent`` or
                ``image_embs`` is missing.
            text: Token ids.
            embed_cls: Forwarded to ``_encode_text``.
            image_latent, image_embs: Optional pre-computed image encodings,
                which let generation loops avoid re-encoding the image.

        Returns:
            Dict with ``image_features``, ``text_features``, ``logits``,
            ``labels`` and ``logit_scale`` (already exponentiated).
        """
        text_latent, token_embs = self._encode_text(text, embed_cls=embed_cls)
        if image_latent is None or image_embs is None:
            image_latent, image_embs = self._encode_image(image)

        # TODO: add assertion to avoid bugs?
        # Keep the trailing tokens so labels line up with the token embeddings.
        labels = text[:, -token_embs.shape[1]:]

        logits = self.text_decoder(image_embs, token_embs)
        return {
            "image_features": image_latent,
            "text_features": text_latent,
            "logits": logits,
            "labels": labels,
            "logit_scale": self.logit_scale.exp()
        }

    def generate(
        self,
        image,
        text=None,
        seq_len=30,
        max_seq_len=77,
        temperature=1.,
        generation_type="beam_search",
        top_p=0.1,  # keep tokens in the 1 - top_p quantile
        top_k=1,  # keeps the top_k most probable tokens
        pad_token_id=None,
        eos_token_id=None,
        sot_token_id=None,
        num_beams=6,
        num_beam_groups=3,
        min_seq_len=5,
        stopping_criteria=None,
        repetition_penalty=1.0,
        fixed_output_length=False  # if True output.shape == (batch_size, seq_len)
    ):
        """Generate token sequences for ``image``.

        Args:
            image: Image batch; its device is used for all created tensors.
            text: Optional prompt token ids (sampling modes only). Defaults to
                a single start-of-text token per image.
            seq_len: Maximum generated length; must exceed ``min_seq_len``.
            max_seq_len: Only the last ``max_seq_len`` tokens are fed back to
                the model at each sampling step.
            temperature: Divides the filtered logits before the softmax.
            generation_type: ``"beam_search"``, ``"top_p"`` or ``"top_k"``.
            top_p, top_k: Argument of the corresponding logits warper.
            pad_token_id: Defaults to ``self.pad_id``.
            eos_token_id: Defaults to 49407.
            sot_token_id: Defaults to 49406.
            num_beams, num_beam_groups: Beam-search settings.
            min_seq_len: Passed to ``MinLengthLogitsProcessor``.
            stopping_criteria: Optional list of criteria; defaults to a
                ``MaxLengthCriteria`` on ``seq_len``.
            repetition_penalty: Passed to ``RepetitionPenaltyLogitsProcessor``.
            fixed_output_length: Pad the output up to ``seq_len``.

        Returns:
            Tensor of generated token ids.

        Raises:
            AssertionError: If ``transformers`` is missing or
                ``seq_len <= min_seq_len``.
            ValueError: If ``generation_type`` is unknown.
        """
        # taking many ideas and components from HuggingFace GenerationMixin
        # https://huggingface.co/docs/transformers/main/en/main_classes/text_generation
        assert _has_transformers, "Please install transformers for generate functionality. `pip install transformers`."
        assert seq_len > min_seq_len, "seq_len must be larger than min_seq_len"

        with torch.no_grad():
            sot_token_id = 49406 if sot_token_id is None else sot_token_id
            eos_token_id = 49407 if eos_token_id is None else eos_token_id
            pad_token_id = self.pad_id if pad_token_id is None else pad_token_id
            logit_processor = LogitsProcessorList(
                [
                    MinLengthLogitsProcessor(min_seq_len, eos_token_id),
                    RepetitionPenaltyLogitsProcessor(repetition_penalty),
                ]
            )

            if stopping_criteria is None:
                stopping_criteria = [MaxLengthCriteria(max_length=seq_len)]

            stopping_criteria = StoppingCriteriaList(
                stopping_criteria
            )

            device = image.device

            if generation_type == "beam_search":
                output = self._generate_beamsearch(
                    image_inputs=image,
                    pad_token_id=pad_token_id,
                    eos_token_id=eos_token_id,
                    sot_token_id=sot_token_id,
                    num_beams=num_beams,
                    num_beam_groups=num_beam_groups,
                    min_seq_len=min_seq_len,
                    stopping_criteria=stopping_criteria,
                    logit_processor=logit_processor,
                )
                if fixed_output_length and output.shape[1] < seq_len:
                    pad_len = seq_len - output.shape[1]
                    # Pad with the resolved pad_token_id so an explicit
                    # caller-supplied value is honoured (it already falls back
                    # to self.pad_id when none was given).
                    return torch.cat(
                        (output, torch.ones(output.shape[0], pad_len, device=device, dtype=output.dtype) * pad_token_id),
                        dim=1
                    )
                return output

            elif generation_type == "top_p":
                logit_warper = GENERATION_TYPES[generation_type](top_p)
            elif generation_type == "top_k":
                logit_warper = GENERATION_TYPES[generation_type](top_k)
            else:
                raise ValueError(
                    f"generation_type has to be one of "
                    f"{'| ' + ' | '.join(list(GENERATION_TYPES.keys())) + ' |'}."
                )

            # Encode the image once; the cached encodings are reused each step.
            image_latent, image_embs = self._encode_image(image)

            if text is None:
                text = torch.ones((image.shape[0], 1), device=device, dtype=torch.long) * sot_token_id

            was_training = self.training
            num_dims = len(text.shape)

            if num_dims == 1:
                text = text[None, :]

            cur_len = text.shape[1]
            self.eval()
            out = text

            try:
                while True:
                    x = out[:, -max_seq_len:]
                    cur_len = x.shape[1]
                    logits = self(image, x, image_latent=image_latent, image_embs=image_embs, embed_cls=False)["logits"][:, -1]
                    # Rows whose last token is EOS or PAD are finished and only
                    # receive padding from here on.
                    mask = (out[:, -1] == eos_token_id) | (out[:, -1] == pad_token_id)
                    sample = torch.ones((out.shape[0], 1), device=device, dtype=torch.long) * pad_token_id

                    if mask.all():
                        if not fixed_output_length:
                            break
                    else:
                        logits = logits[~mask, :]
                        filtered_logits = logit_processor(x[~mask, :], logits)
                        filtered_logits = logit_warper(x[~mask, :], filtered_logits)
                        probs = F.softmax(filtered_logits / temperature, dim=-1)

                        if (cur_len + 1 == seq_len):
                            # Last slot: force EOS on every unfinished row.
                            sample[~mask, :] = torch.ones((sum(~mask), 1), device=device, dtype=torch.long) * eos_token_id
                        else:
                            sample[~mask, :] = torch.multinomial(probs, 1)

                    out = torch.cat((out, sample), dim=-1)

                    cur_len += 1

                    if stopping_criteria(out, None):
                        break
            finally:
                # Restore the previous train/eval mode even if a step raised.
                self.train(was_training)

            if num_dims == 1:
                out = out.squeeze(0)

            return out

    def _generate_beamsearch(
            self,
            image_inputs,
            pad_token_id=None,
            eos_token_id=None,
            sot_token_id=None,
            num_beams=6,
            num_beam_groups=3,
            min_seq_len=5,
            stopping_criteria=None,
            logit_processor=None,
            logit_warper=None,
    ):
        """Group beam search driven by ``transformers.BeamSearchScorer``.

        Each image is repeated ``num_beams`` times, and every beam starts from
        a single ``sot_token_id``. ``logit_warper`` is accepted but not used
        in this method. ``stopping_criteria`` must expose ``max_length``
        (``generate`` passes a ``StoppingCriteriaList``).

        Returns:
            The ``'sequences'`` entry of the scorer's finalized output.

        Raises:
            ValueError: If the beam-expanded batch size does not match what
                the scorer reports.
        """
        device = image_inputs.device
        batch_size = image_inputs.shape[0]
        image_inputs = torch.repeat_interleave(image_inputs, num_beams, dim=0)
        image_latent, image_embs = self._encode_image(image_inputs)

        input_ids = torch.ones((batch_size * num_beams, 1), device=device, dtype=torch.long)
        input_ids = input_ids * sot_token_id
        beam_scorer = BeamSearchScorer(
            batch_size=batch_size,
            num_beams=num_beams,
            device=device,
            num_beam_groups=num_beam_groups,
        )
        # instantiate logits processors
        logits_processor = (
            LogitsProcessorList([MinLengthLogitsProcessor(min_seq_len, eos_token_id=eos_token_id)])
            if logit_processor is None
            else logit_processor
        )

        batch_size = len(beam_scorer._beam_hyps)
        num_beams = beam_scorer.num_beams
        num_beam_groups = beam_scorer.num_beam_groups
        num_sub_beams = num_beams // num_beam_groups
        batch_beam_size, cur_len = input_ids.shape
        beam_indices = None

        if num_beams * batch_size != batch_beam_size:
            raise ValueError(
                f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}."
            )

        beam_scores = torch.full((batch_size, num_beams), -1e9, dtype=torch.float, device=device)
        # initialise score of first beam of each group with 0 and the rest with -1e9. This ensures that the beams in
        # the same group don't produce same tokens everytime.
        beam_scores[:, ::num_sub_beams] = 0
        beam_scores = beam_scores.view((batch_size * num_beams,))

        while True:

            # predicted tokens in cur_len step
            current_tokens = torch.zeros(batch_size * num_beams, dtype=input_ids.dtype, device=device)

            # indices which will form the beams in the next time step
            reordering_indices = torch.zeros(batch_size * num_beams, dtype=torch.long, device=device)

            # do one decoder step on all beams of all sentences in batch
            model_inputs = prepare_inputs_for_generation(input_ids=input_ids, image_inputs=image_inputs)
            outputs = self(
                model_inputs['images'],
                model_inputs['text'],
                embed_cls=False,
                image_latent=image_latent,
                image_embs=image_embs
            )

            for beam_group_idx in range(num_beam_groups):
                group_start_idx = beam_group_idx * num_sub_beams
                group_end_idx = min(group_start_idx + num_sub_beams, num_beams)
                group_size = group_end_idx - group_start_idx

                # indices of beams of current group among all sentences in batch
                batch_group_indices = []

                for batch_idx in range(batch_size):
                    batch_group_indices.extend(
                        [batch_idx * num_beams + idx for idx in range(group_start_idx, group_end_idx)]
                    )
                group_input_ids = input_ids[batch_group_indices]

                # select outputs of beams of current group only
                next_token_logits = outputs['logits'][batch_group_indices, -1, :]
                vocab_size = next_token_logits.shape[-1]

                next_token_scores_processed = logits_processor(
                    group_input_ids, next_token_logits, current_tokens=current_tokens, beam_group_idx=beam_group_idx
                )
                next_token_scores = next_token_scores_processed + beam_scores[batch_group_indices].unsqueeze(-1)
                next_token_scores = next_token_scores.expand_as(next_token_scores_processed)

                # reshape for beam search
                next_token_scores = next_token_scores.view(batch_size, group_size * vocab_size)

                next_token_scores, next_tokens = torch.topk(
                    next_token_scores, 2 * group_size, dim=1, largest=True, sorted=True
                )

                # Split the flat (beam, token) index back into its two parts.
                next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor")
                next_tokens = next_tokens % vocab_size

                # stateless
                process_beam_indices = sum(beam_indices, ()) if beam_indices is not None else None
                beam_outputs = beam_scorer.process(
                    group_input_ids,
                    next_token_scores,
                    next_tokens,
                    next_indices,
                    pad_token_id=pad_token_id,
                    eos_token_id=eos_token_id,
                    beam_indices=process_beam_indices,
                )
                beam_scores[batch_group_indices] = beam_outputs["next_beam_scores"]
                beam_next_tokens = beam_outputs["next_beam_tokens"]
                beam_idx = beam_outputs["next_beam_indices"]

                input_ids[batch_group_indices] = group_input_ids[beam_idx]
                group_input_ids = torch.cat([group_input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
                current_tokens[batch_group_indices] = group_input_ids[:, -1]

                # (beam_idx // group_size) -> batch_idx
                # (beam_idx % group_size) -> offset of idx inside the group
                reordering_indices[batch_group_indices] = (
                    num_beams * torch.div(beam_idx, group_size, rounding_mode="floor") + group_start_idx + (beam_idx % group_size)
                )

            input_ids = torch.cat([input_ids, current_tokens.unsqueeze(-1)], dim=-1)

            # increase cur_len
            cur_len = cur_len + 1
            if beam_scorer.is_done or stopping_criteria(input_ids, None):
                break

        final_beam_indices = sum(beam_indices, ()) if beam_indices is not None else None
        # next_tokens / next_indices are the values left over from the last
        # processed beam group of the final step.
        sequence_outputs = beam_scorer.finalize(
            input_ids,
            beam_scores,
            next_tokens,
            next_indices,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            max_length=stopping_criteria.max_length,
            beam_indices=final_beam_indices,
        )
        return sequence_outputs['sequences']
|
||||
|
||||
|
||||
def prepare_inputs_for_generation(input_ids, image_inputs, past=None, **kwargs):
    """Assemble the input dict for one decoding step.

    Args:
        input_ids: Token ids generated so far. When ``past`` is truthy only
            the last token column is kept.
        image_inputs: Image batch, returned unchanged under ``"images"``.
        past: Optional cached state, returned under ``"past_key_values"``.
        **kwargs: May carry ``attention_mask`` and/or ``position_ids``.

    Returns:
        Dict with keys ``text``, ``images``, ``past_key_values``,
        ``position_ids`` and ``attention_mask``.
    """
    if past:
        input_ids = input_ids[:, -1].unsqueeze(-1)

    attention_mask = kwargs.get("attention_mask", None)
    position_ids = kwargs.get("position_ids", None)

    if position_ids is None and attention_mask is not None:
        # create position_ids on the fly for batch generation
        position_ids = attention_mask.long().cumsum(-1) - 1
        position_ids.masked_fill_(attention_mask == 0, 1)
    # Otherwise keep what the caller passed (possibly None). Previously an
    # explicit `position_ids` argument was overwritten with None here.

    return {
        "text": input_ids,
        "images": image_inputs,
        "past_key_values": past,
        "position_ids": position_ids,
        "attention_mask": attention_mask,
    }
|
||||
@@ -0,0 +1,2 @@
|
||||
# Default image normalization statistics (three values each, presumably one
# per RGB channel -- confirm). factory.create_model falls back to these when
# the pretrained config provides no 'mean' / 'std'.
OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
|
||||
433
diffsynth/extensions/ImageQualityMetric/open_clip/factory.py
Normal file
433
diffsynth/extensions/ImageQualityMetric/open_clip/factory.py
Normal file
@@ -0,0 +1,433 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
import re
|
||||
from copy import deepcopy
|
||||
from pathlib import Path
|
||||
from turtle import forward
|
||||
from typing import Any, Dict, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
|
||||
from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
|
||||
from .model import CLIP, CustomTextCLIP, convert_weights_to_lp, convert_to_custom_text_state_dict,\
|
||||
resize_pos_embed, get_cast_dtype
|
||||
from .coca_model import CoCa
|
||||
from .loss import ClipLoss, DistillClipLoss, CoCaLoss
|
||||
from .openai import load_openai_model
|
||||
from .pretrained import is_pretrained_cfg, get_pretrained_cfg, download_pretrained, list_pretrained_tags_by_model, download_pretrained_from_hf
|
||||
from .transform import image_transform, AugmentationCfg
|
||||
from .tokenizer import HFTokenizer, SimpleTokenizer
|
||||
|
||||
|
||||
# Model names starting with this prefix are resolved through
# download_pretrained_from_hf / HFTokenizer instead of the local registry.
HF_HUB_PREFIX = 'hf-hub:'
# Directories or individual .json files scanned by _rescan_model_configs().
_MODEL_CONFIG_PATHS = [Path(__file__).parent / "model_configs/"]
_MODEL_CONFIGS = {}  # dictionary (model_name: config) of model architecture configs
|
||||
|
||||
|
||||
def _natural_key(string_):
    """Return a sort key that orders embedded digit runs numerically.

    The lower-cased string is split on runs of digits; digit runs become
    ``int`` and everything else stays ``str``.
    """
    key = []
    for chunk in re.split(r'(\d+)', string_.lower()):
        key.append(int(chunk) if chunk.isdigit() else chunk)
    return key
|
||||
|
||||
|
||||
def _rescan_model_configs():
    """Reload the model-config registry from ``_MODEL_CONFIG_PATHS``.

    Every ``.json`` file found (given directly or inside a listed directory)
    that contains the keys ``embed_dim``, ``vision_cfg`` and ``text_cfg`` is
    stored in ``_MODEL_CONFIGS`` under its file stem. Entries already in the
    registry are kept, and the registry is re-sorted in natural order.
    """
    global _MODEL_CONFIGS

    config_ext = ('.json',)
    config_files = []
    for config_path in _MODEL_CONFIG_PATHS:
        if config_path.is_file() and config_path.suffix in config_ext:
            config_files.append(config_path)
        elif config_path.is_dir():
            for ext in config_ext:
                config_files.extend(config_path.glob(f'*{ext}'))

    for cf in config_files:
        # Read as UTF-8 explicitly (as create_model does for hub configs)
        # rather than relying on the platform's default encoding.
        with open(cf, 'r', encoding='utf-8') as f:
            model_cfg = json.load(f)
        if all(a in model_cfg for a in ('embed_dim', 'vision_cfg', 'text_cfg')):
            _MODEL_CONFIGS[cf.stem] = model_cfg

    _MODEL_CONFIGS = dict(sorted(_MODEL_CONFIGS.items(), key=lambda x: _natural_key(x[0])))
|
||||
|
||||
|
||||
_rescan_model_configs() # initial populate of model config registry
|
||||
|
||||
|
||||
def list_models():
    """Return the names of all registered model architectures, in registry order."""
    return list(_MODEL_CONFIGS)
|
||||
|
||||
|
||||
def add_model_config(path):
    """Register an extra config file or directory and rebuild the registry.

    ``path`` may be a ``Path`` or anything ``Path()`` accepts.
    """
    config_path = path if isinstance(path, Path) else Path(path)
    _MODEL_CONFIG_PATHS.append(config_path)
    _rescan_model_configs()
|
||||
|
||||
|
||||
def get_model_config(model_name):
    """Return a deep copy of the registered config, or ``None`` if unknown.

    A copy is returned so callers can modify it without touching the registry.
    """
    if model_name not in _MODEL_CONFIGS:
        return None
    return deepcopy(_MODEL_CONFIGS[model_name])
|
||||
|
||||
|
||||
def get_tokenizer(model_name, open_clip_bpe_path=None):
    """Return the tokenizer that matches ``model_name``.

    Args:
        model_name: Either ``hf-hub:<id>`` (an ``HFTokenizer`` is built from
            ``<id>``) or the name of a registered model config.
        open_clip_bpe_path: Passed to ``SimpleTokenizer`` when the config
            names no ``hf_tokenizer_name``.

    Raises:
        RuntimeError: If ``model_name`` has no registered config. This used
            to surface as a ``TypeError`` from subscripting ``None``.
    """
    if model_name.startswith(HF_HUB_PREFIX):
        return HFTokenizer(model_name[len(HF_HUB_PREFIX):])

    config = get_model_config(model_name)
    if config is None:
        raise RuntimeError(f'Model config for {model_name} not found.')

    text_cfg = config['text_cfg']
    if 'hf_tokenizer_name' in text_cfg:
        return HFTokenizer(text_cfg['hf_tokenizer_name'])
    return SimpleTokenizer(open_clip_bpe_path)
|
||||
|
||||
|
||||
def load_state_dict(checkpoint_path: str, map_location='cpu'):
    """Load a checkpoint file and return its state dict.

    If the loaded object is a dict with a ``'state_dict'`` entry, that entry
    is used; otherwise the loaded object itself is. When the first key starts
    with ``'module.'``, that prefix is removed from every key that carries it.

    Args:
        checkpoint_path: File passed to ``torch.load``.
        map_location: Forwarded to ``torch.load``.

    Returns:
        The (possibly re-keyed) state dict. An empty one is returned as is.
    """
    # NOTE(security): torch.load unpickles the file; only load checkpoints
    # from trusted sources.
    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
        state_dict = checkpoint['state_dict']
    else:
        state_dict = checkpoint

    prefix = 'module.'
    # A default avoids StopIteration on an empty state dict.
    first_key = next(iter(state_dict), None)
    if first_key is not None and first_key.startswith(prefix):
        # Strip only where the prefix is really present, so keys such as
        # 'modules_x' or unprefixed keys are never truncated.
        state_dict = {
            (k[len(prefix):] if k.startswith(prefix) else k): v
            for k, v in state_dict.items()
        }
    return state_dict
|
||||
|
||||
|
||||
def load_checkpoint(model, checkpoint_path, strict=True):
    """Load the weights stored at ``checkpoint_path`` into ``model``.

    Returns whatever ``model.load_state_dict`` returns (the incompatible keys).
    """
    weights = load_state_dict(checkpoint_path)

    # detect old format and make compatible with new format
    model_has_pos_embed = hasattr(model, 'positional_embedding')
    if 'positional_embedding' in weights and not model_has_pos_embed:
        weights = convert_to_custom_text_state_dict(weights)

    resize_pos_embed(weights, model)
    return model.load_state_dict(weights, strict=strict)
|
||||
|
||||
|
||||
def create_model(
        model_name: str,
        pretrained: Optional[str] = None,
        precision: str = 'fp32',
        device: Union[str, torch.device] = 'cpu',
        jit: bool = False,
        force_quick_gelu: bool = False,
        force_custom_text: bool = False,
        force_patch_dropout: Optional[float] = None,
        force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
        pretrained_image: bool = False,
        pretrained_hf: bool = True,
        cache_dir: Optional[str] = None,
        output_dict: Optional[bool] = None,
        require_pretrained: bool = False,
):
    """Create a CLIP / CustomTextCLIP / CoCa model, optionally with weights.

    Args:
        model_name: Registered config name, or ``hf-hub:<id>`` to fetch the
            checkpoint and ``open_clip_config.json`` from the hub. For
            non-hub names, ``/`` is replaced with ``-``.
        pretrained: ``'openai'`` (any case), a pretrained tag known to
            ``get_pretrained_cfg``, or a path to a checkpoint file.
        precision: Determines the cast dtype; ``'fp16'`` / ``'bf16'`` also
            convert the weights to low precision.
        device: Device the model is moved to.
        jit: Passed to the OpenAI loader; otherwise the model is scripted.
        force_quick_gelu: Set ``quick_gelu`` in the model config.
        force_custom_text: Build ``CustomTextCLIP`` (or ``CoCa``), not ``CLIP``.
        force_patch_dropout: Override ``vision_cfg['patch_dropout']``.
        force_image_size: Override ``vision_cfg['image_size']``.
        pretrained_image: Ask for a pretrained timm image tower.
        pretrained_hf: Stored as ``text_cfg['hf_model_pretrained']`` for
            HF text models.
        cache_dir: Download cache directory.
        output_dict: When truthy, set ``model.output_dict = True`` if the
            model has that attribute.
        require_pretrained: Raise if no weights ended up being loaded.

    Returns:
        The constructed model.

    Raises:
        RuntimeError: If the model config or requested weights are not
            found, or weights were required but not loaded.
        AssertionError: If ``pretrained_image`` is set for a non-timm tower.
    """
    has_hf_hub_prefix = model_name.startswith(HF_HUB_PREFIX)
    if has_hf_hub_prefix:
        model_id = model_name[len(HF_HUB_PREFIX):]
        checkpoint_path = download_pretrained_from_hf(model_id, cache_dir=cache_dir)
        config_path = download_pretrained_from_hf(model_id, filename='open_clip_config.json', cache_dir=cache_dir)

        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)
        pretrained_cfg = config['preprocess_cfg']
        model_cfg = config['model_cfg']
    else:
        model_name = model_name.replace('/', '-')  # for callers using old naming with / in ViT names
        checkpoint_path = None
        pretrained_cfg = {}
        model_cfg = None

    if isinstance(device, str):
        device = torch.device(device)

    if pretrained and pretrained.lower() == 'openai':
        logging.info(f'Loading pretrained {model_name} from OpenAI.')
        model = load_openai_model(
            model_name,
            precision=precision,
            device=device,
            jit=jit,
            cache_dir=cache_dir,
        )

        # to always output dict even if it is clip
        if output_dict and hasattr(model, "output_dict"):
            model.output_dict = True
    else:
        model_cfg = model_cfg or get_model_config(model_name)
        if model_cfg is not None:
            logging.info(f'Loaded {model_name} model config.')
        else:
            logging.error(f'Model config for {model_name} not found; available models {list_models()}.')
            raise RuntimeError(f'Model config for {model_name} not found.')

        if force_quick_gelu:
            # override for use of QuickGELU on non-OpenAI transformer models
            model_cfg["quick_gelu"] = True

        if force_patch_dropout is not None:
            # override the default patch dropout value
            model_cfg["vision_cfg"]["patch_dropout"] = force_patch_dropout

        if force_image_size is not None:
            # override model config's image size
            model_cfg["vision_cfg"]["image_size"] = force_image_size

        if pretrained_image:
            if 'timm_model_name' in model_cfg.get('vision_cfg', {}):
                # pretrained weight loading for timm models set via vision_cfg
                model_cfg['vision_cfg']['timm_model_pretrained'] = True
            else:
                # Raised explicitly (same exception type as the former
                # `assert False`) so the check survives `python -O`.
                raise AssertionError('pretrained image towers currently only supported for timm models')

        cast_dtype = get_cast_dtype(precision)
        is_hf_model = 'hf_model_name' in model_cfg.get('text_cfg', {})
        custom_text = model_cfg.pop('custom_text', False) or force_custom_text or is_hf_model

        if custom_text:
            if is_hf_model:
                model_cfg['text_cfg']['hf_model_pretrained'] = pretrained_hf
            if "coca" in model_name:
                model = CoCa(**model_cfg, cast_dtype=cast_dtype)
            else:
                model = CustomTextCLIP(**model_cfg, cast_dtype=cast_dtype)
        else:
            model = CLIP(**model_cfg, cast_dtype=cast_dtype)

        pretrained_loaded = False
        if pretrained:
            # An explicit `pretrained` takes precedence over a hub checkpoint:
            # a known tag is tried first, then a local file path.
            checkpoint_path = ''
            pretrained_cfg = get_pretrained_cfg(model_name, pretrained)
            if pretrained_cfg:
                checkpoint_path = download_pretrained(pretrained_cfg, cache_dir=cache_dir)
            elif os.path.exists(pretrained):
                checkpoint_path = pretrained

            if checkpoint_path:
                logging.info(f'Loading pretrained {model_name} weights ({pretrained}).')
                load_checkpoint(model, checkpoint_path)
            else:
                # Message fixed: the two sentences were run together and the
                # closing parenthesis was missing.
                error_str = (
                    f'Pretrained weights ({pretrained}) not found for model {model_name}. '
                    f'Available pretrained tags ({list_pretrained_tags_by_model(model_name)}).')
                logging.warning(error_str)
                raise RuntimeError(error_str)
            pretrained_loaded = True
        elif has_hf_hub_prefix:
            logging.info(f'Loading pretrained {model_name} weights ({pretrained}).')
            load_checkpoint(model, checkpoint_path)
            pretrained_loaded = True

        if require_pretrained and not pretrained_loaded:
            # callers of create_model_from_pretrained always expect pretrained weights
            raise RuntimeError(
                f'Pretrained weights were required for (model: {model_name}, pretrained: {pretrained}) but not loaded.')

        model.to(device=device)
        if precision in ("fp16", "bf16"):
            convert_weights_to_lp(model, dtype=torch.bfloat16 if precision == 'bf16' else torch.float16)

        # set image / mean metadata from pretrained_cfg if available, or use default
        model.visual.image_mean = pretrained_cfg.get('mean', None) or OPENAI_DATASET_MEAN
        model.visual.image_std = pretrained_cfg.get('std', None) or OPENAI_DATASET_STD

        # to always output dict even if it is clip
        if output_dict and hasattr(model, "output_dict"):
            model.output_dict = True

        if jit:
            model = torch.jit.script(model)

    return model
|
||||
|
||||
|
||||
def create_loss(args):
    """Pick and construct the loss that matches the training arguments.

    ``args.distill`` selects ``DistillClipLoss``; otherwise a model name
    containing "coca" (case-insensitive) selects ``CoCaLoss``; otherwise
    ``ClipLoss``. All three receive the same distributed settings, and
    ``CoCaLoss`` additionally gets its two loss weights.
    """
    if args.distill:
        loss_cls = DistillClipLoss
        extra_kwargs = {}
    elif "coca" in args.model.lower():
        loss_cls = CoCaLoss
        extra_kwargs = {
            "caption_loss_weight": args.coca_caption_loss_weight,
            "clip_loss_weight": args.coca_contrastive_loss_weight,
        }
    else:
        loss_cls = ClipLoss
        extra_kwargs = {}

    return loss_cls(
        **extra_kwargs,
        local_loss=args.local_loss,
        gather_with_grad=args.gather_with_grad,
        cache_labels=True,
        rank=args.rank,
        world_size=args.world_size,
        use_horovod=args.horovod,
    )
|
||||
|
||||
class MLP(torch.nn.Module):
    """Stack of linear layers mapping ``input_size`` features to one value.

    Layer widths are input_size -> 1024 -> 128 -> 64 -> 16 -> 1, with dropout
    after the first three linear layers and no activation functions between.
    """

    def __init__(self, input_size):
        super().__init__()
        self.input_size = input_size
        nn = torch.nn
        # Order matters: it fixes the `layers.<index>` state-dict keys.
        stack = [
            nn.Linear(self.input_size, 1024),
            nn.Dropout(0.2),
            nn.Linear(1024, 128),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.Dropout(0.1),
            nn.Linear(64, 16),
            nn.Linear(16, 1),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the layer stack to ``x``."""
        out = self.layers(x)
        return out
|
||||
|
||||
# class semantic_head(torch.nn.Module):
|
||||
# def __init__(self, input_size):
|
||||
# super().__init__()
|
||||
# self.input_size = input_size # for ViT-L-14 is 1024
|
||||
# self.seg_head = torch.nn.Sequential(
|
||||
# torch.nn.Linear(input_size, 128),
|
||||
# torch.nn.Dropout(0.2),
|
||||
# torch.nn.Linear(128, 64),
|
||||
# torch.nn.Dropout(0.1),
|
||||
# torch.nn.Linear(64, 16),
|
||||
# torch.nn.Linear(16, 1),
|
||||
# )
|
||||
# self.sigmoid = torch.nn.Sigmoid()
|
||||
|
||||
# def forward(self, x):
|
||||
# return self.sigmoid(self.seg_head(x))
|
||||
|
||||
def create_model_and_transforms(
        model_name: str,
        pretrained: Optional[str] = None,
        precision: str = 'fp32',
        device: Union[str, torch.device] = 'cpu',
        jit: bool = False,
        force_quick_gelu: bool = False,
        force_custom_text: bool = False,
        force_patch_dropout: Optional[float] = None,
        force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
        pretrained_image: bool = False,
        pretrained_hf: bool = True,
        image_mean: Optional[Tuple[float, ...]] = None,
        image_std: Optional[Tuple[float, ...]] = None,
        aug_cfg: Optional[Union[Dict[str, Any], AugmentationCfg]] = None,
        cache_dir: Optional[str] = None,
        light_augmentation = False,
        output_dict: Optional[bool] = None,
        with_score_predictor: bool = False,
        with_region_predictor: bool = False
):
    """Create a model plus its training and validation image transforms.

    The model-related arguments are forwarded unchanged to ``create_model``.

    Args:
        image_mean, image_std: Normalization statistics; when falsy, the
            values stored on ``model.visual`` are used.
        aug_cfg: Accepted for interface compatibility; not used here.
        light_augmentation: Use one validation-style transform (with
            ``resize_longest_max=True``) for both training and validation.
        with_score_predictor: Attach ``model.score_predictor``, an ``MLP``
            whose input size is ``model.visual.proj.size(1)``.
        with_region_predictor: Attach ``model.region_predictor``, a linear
            layer from ``model.visual.proj.size(0)`` features to 1.

    Returns:
        ``(model, preprocess_train, preprocess_val)``.
    """
    model = create_model(
        model_name,
        pretrained,
        precision=precision,
        device=device,
        jit=jit,
        force_quick_gelu=force_quick_gelu,
        force_custom_text=force_custom_text,
        force_patch_dropout=force_patch_dropout,
        force_image_size=force_image_size,
        pretrained_image=pretrained_image,
        pretrained_hf=pretrained_hf,
        cache_dir=cache_dir,
        output_dict=output_dict,
    )

    if not image_mean:
        image_mean = getattr(model.visual, 'image_mean', None)
    if not image_std:
        image_std = getattr(model.visual, 'image_std', None)

    if with_score_predictor:
        proj = model.visual.proj
        model.score_predictor = MLP(proj.size(1)).to(device=device, dtype=proj.dtype)

    if with_region_predictor:
        proj = model.visual.proj
        model.region_predictor = torch.nn.Linear(proj.size(0), 1).to(device=device, dtype=proj.dtype)

    if light_augmentation:
        # One deterministic transform shared by training and validation.
        preprocess_val = image_transform(
            model.visual.image_size,
            is_train=False,
            mean=image_mean,
            std=image_std,
            resize_longest_max=True,
        )
        return model, preprocess_val, preprocess_val

    preprocess_train = image_transform(
        model.visual.image_size,
        is_train=True,
        mean=image_mean,
        std=image_std
    )
    preprocess_val = image_transform(
        model.visual.image_size,
        is_train=False,
        mean=image_mean,
        std=image_std
    )
    return model, preprocess_train, preprocess_val
|
||||
|
||||
|
||||
def create_model_from_pretrained(
        model_name: str,
        pretrained: Optional[str] = None,
        precision: str = 'fp32',
        device: Union[str, torch.device] = 'cpu',
        jit: bool = False,
        force_quick_gelu: bool = False,
        force_custom_text: bool = False,
        force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
        return_transform: bool = True,
        image_mean: Optional[Tuple[float, ...]] = None,
        image_std: Optional[Tuple[float, ...]] = None,
        cache_dir: Optional[str] = None,
):
    """Create a model via ``create_model`` with ``require_pretrained=True``.

    Returns the model alone when ``return_transform`` is false; otherwise
    returns ``(model, preprocess)`` where ``preprocess`` is the
    ``is_train=False`` image transform built for ``model.visual.image_size``.
    ``image_mean`` / ``image_std`` fall back to the attributes of the same
    name on ``model.visual`` (or ``None``) when not supplied.
    """
    model = create_model(
        model_name,
        pretrained,
        precision=precision,
        device=device,
        jit=jit,
        force_quick_gelu=force_quick_gelu,
        force_custom_text=force_custom_text,
        force_image_size=force_image_size,
        cache_dir=cache_dir,
        require_pretrained=True,
    )

    if return_transform:
        visual = model.visual
        # Explicit arguments win; otherwise use whatever the vision tower carries.
        mean = image_mean or getattr(visual, 'image_mean', None)
        std = image_std or getattr(visual, 'image_std', None)
        preprocess = image_transform(
            visual.image_size,
            is_train=False,
            mean=mean,
            std=std,
        )
        return model, preprocess

    return model
|
||||
@@ -0,0 +1,45 @@
|
||||
# HF architecture dict:
# Keyed by the HuggingFace config ``model_type``. ``config_names`` maps the
# generic field names used by this package to the attribute names of that
# architecture; ``pooler`` names the default pooler for it.

# Field names shared by the roberta-style architectures.
# https://huggingface.co/docs/transformers/model_doc/roberta#roberta
# https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig
_ROBERTA_STYLE_CONFIG_NAMES = (
    ("context_length", "max_position_embeddings"),
    ("vocab_size", "vocab_size"),
    ("width", "hidden_size"),
    ("heads", "num_attention_heads"),
    ("layers", "num_hidden_layers"),
    ("layer_attr", "layer"),
    ("token_embeddings_attr", "embeddings"),
)

# https://huggingface.co/docs/transformers/model_doc/mt5#mt5
_MT5_CONFIG_NAMES = (
    # unlimited seqlen
    # https://github.com/google-research/text-to-text-transfer-transformer/issues/273
    # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374
    ("context_length", ""),
    ("vocab_size", "vocab_size"),
    ("width", "d_model"),
    ("heads", "num_heads"),
    ("layers", "num_layers"),
    ("layer_attr", "block"),
    ("token_embeddings_attr", "embed_tokens"),
)

arch_dict = {
    # each entry gets its own dict so the architectures stay independent
    "roberta": {
        "config_names": dict(_ROBERTA_STYLE_CONFIG_NAMES),
        "pooler": "mean_pooler",
    },
    "xlm-roberta": {
        "config_names": dict(_ROBERTA_STYLE_CONFIG_NAMES),
        "pooler": "mean_pooler",
    },
    "mt5": {
        "config_names": dict(_MT5_CONFIG_NAMES),
        "pooler": "mean_pooler",
    },
}
|
||||
176
diffsynth/extensions/ImageQualityMetric/open_clip/hf_model.py
Normal file
176
diffsynth/extensions/ImageQualityMetric/open_clip/hf_model.py
Normal file
@@ -0,0 +1,176 @@
|
||||
""" huggingface model adapter
|
||||
|
||||
Wraps HuggingFace transformers (https://github.com/huggingface/transformers) models for use as a text tower in CLIP model.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch import TensorType
|
||||
|
||||
try:
|
||||
import transformers
|
||||
from transformers import AutoModel, AutoTokenizer, AutoConfig, PretrainedConfig
|
||||
from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, \
|
||||
BaseModelOutputWithPoolingAndCrossAttentions
|
||||
except ImportError as e:
|
||||
transformers = None
|
||||
|
||||
|
||||
class BaseModelOutput:
|
||||
pass
|
||||
|
||||
|
||||
class PretrainedConfig:
|
||||
pass
|
||||
|
||||
from .hf_configs import arch_dict
|
||||
|
||||
|
||||
# utils
# Zero-width match in front of every upper-case letter except at string start.
_CAMEL_BOUNDARY = re.compile(r'(?<!^)(?=[A-Z])')


def _camel2snake(s):
    """Convert a CamelCase identifier to snake_case (``MeanPooler`` -> ``mean_pooler``)."""
    with_underscores = _CAMEL_BOUNDARY.sub('_', s)
    return with_underscores.lower()


# TODO: ?last - for gpt-like models
# Registry of pooler classes, keyed by the snake_case form of the class name.
_POOLERS = {}


def register_pooler(cls):
    """Decorator registering pooler class"""
    registry_key = _camel2snake(cls.__name__)
    _POOLERS[registry_key] = cls
    return cls
|
||||
|
||||
|
||||
@register_pooler
class MeanPooler(nn.Module):
    """Mean pooling"""

    def forward(self, x: BaseModelOutput, attention_mask: TensorType):
        """Average ``x.last_hidden_state`` over dim 1, weighted by ``attention_mask``.

        Positions whose mask value is 0 contribute nothing to the sum; the sum
        is divided by the per-row mask total.
        """
        weights = attention_mask.unsqueeze(-1)
        weighted_states = x.last_hidden_state * weights
        summed = weighted_states.sum(dim=1)
        token_counts = attention_mask.sum(-1, keepdim=True)
        return summed / token_counts
|
||||
|
||||
|
||||
@register_pooler
class MaxPooler(nn.Module):
    """Max pooling"""

    def forward(self, x: BaseModelOutput, attention_mask: TensorType):
        """Return the element-wise maximum of ``x.last_hidden_state`` over dim 1.

        Positions where ``attention_mask`` is 0 are excluded from the maximum.
        """
        # masked_fill needs a boolean mask and fills where it is True, so the
        # mask must select the positions to *exclude* (attention_mask == 0).
        # Passing the raw integer attention mask is both the wrong dtype and
        # the wrong polarity.
        excluded = (attention_mask == 0).unsqueeze(-1)
        masked_output = x.last_hidden_state.masked_fill(excluded, -torch.inf)
        return masked_output.max(1).values
|
||||
|
||||
|
||||
@register_pooler
class ClsPooler(nn.Module):
    """CLS token pooling"""

    def __init__(self, use_pooler_output=True):
        super().__init__()
        # Index along dim 1 of the hidden states that is returned by forward().
        self.cls_token_position = 0
        self.use_pooler_output = use_pooler_output

    def forward(self, x: BaseModelOutput, attention_mask: TensorType):
        """Return ``x.pooler_output`` when usable, else the hidden state at the CLS index.

        ``attention_mask`` is accepted for interface parity with the other
        poolers and is not read.
        """
        has_pooled_output = (
            self.use_pooler_output
            and isinstance(x, (BaseModelOutputWithPooling, BaseModelOutputWithPoolingAndCrossAttentions))
            and (x.pooler_output is not None)
        )
        if not has_pooled_output:
            return x.last_hidden_state[:, self.cls_token_position, :]
        return x.pooler_output
|
||||
|
||||
|
||||
class HFTextEncoder(nn.Module):
    """HuggingFace model adapter

    Wraps a ``transformers`` model, a pooler taken from the ``_POOLERS``
    registry and an output projection so it can serve as a text tower.
    """
    output_tokens: torch.jit.Final[bool]

    def __init__(
            self,
            model_name_or_path: str,
            output_dim: int,
            config: PretrainedConfig = None,
            pooler_type: str = None,
            proj: str = None,
            pretrained: bool = True,
            output_tokens: bool = False,
    ):
        """Build the transformer, pooler and projection.

        Args:
            model_name_or_path: identifier passed to ``AutoConfig`` / ``AutoModel``
                when ``config`` is None.
            output_dim: dimension produced by the projection.
            config: optional ready-made config; when given, the model is built
                from it with ``AutoModel.from_config``.
            pooler_type: key into the pooler registry; defaults to the
                architecture's entry in ``arch_dict``.
            proj: ``'linear'``, ``'mlp'`` or None (None is only valid when the
                model width already equals ``output_dim``).
            pretrained: load weights with ``from_pretrained`` instead of
                building from config (only consulted when ``config`` is None).
            output_tokens: make ``forward`` also return token-level states.

        Raises:
            RuntimeError: if ``transformers`` is not installed.
            ValueError: if no projection can be selected for ``proj``.
        """
        super().__init__()
        self.output_tokens = output_tokens
        self.output_dim = output_dim

        # TODO: find better way to get this information
        uses_transformer_pooler = (pooler_type == "cls_pooler")

        if transformers is None:
            raise RuntimeError("Please `pip install transformers` to use pre-trained HuggingFace models")
        if config is None:
            self.config = AutoConfig.from_pretrained(model_name_or_path)
            create_func, model_args = (AutoModel.from_pretrained, model_name_or_path) if pretrained else (
                AutoModel.from_config, self.config)
            # TODO: do all model configs have this attribute? PretrainedConfig does so yes??
            if hasattr(self.config, "is_encoder_decoder") and self.config.is_encoder_decoder:
                # only the encoder half is kept for encoder-decoder models
                self.transformer = create_func(model_args)
                self.transformer = self.transformer.encoder
            else:
                self.transformer = create_func(model_args, add_pooling_layer=uses_transformer_pooler)
        else:
            self.config = config
            self.transformer = AutoModel.from_config(config)
        if pooler_type is None:  # get default arch pooler
            pooler_type = (arch_dict[self.config.model_type]["pooler"])

        self.pooler = _POOLERS[pooler_type]()

        d_model = getattr(self.config, arch_dict[self.config.model_type]["config_names"]["width"])
        if (d_model == output_dim) and (proj is None):  # do we always need a proj?
            self.proj = nn.Identity()
        elif proj == 'linear':
            self.proj = nn.Linear(d_model, output_dim, bias=False)
        elif proj == 'mlp':
            hidden_size = (d_model + output_dim) // 2
            self.proj = nn.Sequential(
                nn.Linear(d_model, hidden_size, bias=False),
                nn.GELU(),
                nn.Linear(hidden_size, output_dim, bias=False),
            )
        else:
            # Without this branch self.proj stays undefined and forward() fails
            # later with an unrelated-looking AttributeError.
            raise ValueError(
                f"Unsupported projection {proj!r} for model width {d_model} and output_dim {output_dim}; "
                "use 'linear' or 'mlp' (None is only valid when the widths match)")

    def forward(self, x: TensorType):
        """Encode token ids ``x``.

        Returns the projected pooled output, or ``(projected, tokens)`` when
        ``output_tokens`` is set. With a ``ClsPooler`` the CLS position is
        removed from ``tokens``.
        """
        attn_mask = (x != self.config.pad_token_id).long()
        out = self.transformer(input_ids=x, attention_mask=attn_mask)
        pooled_out = self.pooler(out, attn_mask)
        projected = self.proj(pooled_out)

        if not self.output_tokens:
            return projected

        hidden = out.last_hidden_state
        if isinstance(self.pooler, ClsPooler):
            # keep every sequence position except the CLS one
            positions = torch.arange(hidden.shape[1], device=hidden.device)
            tokens = hidden[:, positions != self.pooler.cls_token_position, :]
        else:
            tokens = hidden
        return projected, tokens

    def lock(self, unlocked_layers: int = 0, freeze_layer_norm: bool = True):
        """Freeze transformer parameters, leaving the last ``unlocked_layers`` untouched.

        Parameters whose dotted name contains a ``LayerNorm`` component get
        ``requires_grad = not freeze_layer_norm``; all other affected
        parameters get ``requires_grad = False``.
        """
        if not unlocked_layers:  # full freezing
            for n, p in self.transformer.named_parameters():
                p.requires_grad = (not freeze_layer_norm) if "LayerNorm" in n.split(".") else False
            return

        encoder = self.transformer.encoder if hasattr(self.transformer, 'encoder') else self.transformer
        layer_list = getattr(encoder, arch_dict[self.config.model_type]["config_names"]["layer_attr"])
        print(f"Unlocking {unlocked_layers}/{len(layer_list) + 1} layers of hf model")
        embeddings = getattr(
            self.transformer, arch_dict[self.config.model_type]["config_names"]["token_embeddings_attr"])
        # embeddings count as the first "layer"; drop the trailing ones that stay trainable
        modules = [embeddings, *layer_list][:-unlocked_layers]
        # freeze layers
        for module in modules:
            for n, p in module.named_parameters():
                p.requires_grad = (not freeze_layer_norm) if "LayerNorm" in n.split(".") else False

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        """Turn gradient checkpointing on the wrapped transformer on or off."""
        if enable:
            self.transformer.gradient_checkpointing_enable()
        else:
            self.transformer.gradient_checkpointing_disable()

    def init_parameters(self):
        """No-op hook; nothing is initialised here."""
        pass
|
||||
270
diffsynth/extensions/ImageQualityMetric/open_clip/loss.py
Normal file
270
diffsynth/extensions/ImageQualityMetric/open_clip/loss.py
Normal file
@@ -0,0 +1,270 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.nn import functional as F
|
||||
from torch.nn.utils.rnn import pad_sequence
|
||||
|
||||
try:
|
||||
import torch.distributed.nn
|
||||
from torch import distributed as dist
|
||||
|
||||
has_distributed = True
|
||||
except ImportError:
|
||||
has_distributed = False
|
||||
|
||||
try:
|
||||
import horovod.torch as hvd
|
||||
except ImportError:
|
||||
hvd = None
|
||||
|
||||
|
||||
def gather_features(
        image_features,
        text_features,
        local_loss=False,
        gather_with_grad=False,
        rank=0,
        world_size=1,
        use_horovod=False
):
    """Gather image and text features from every rank and concatenate them on dim 0.

    Uses horovod when ``use_horovod`` is set, otherwise ``torch.distributed``.
    When gathering without gradients and ``local_loss`` is false, the slot
    belonging to ``rank`` is replaced by the local (grad-carrying) tensors.

    Returns:
        ``(all_image_features, all_text_features)``.
    """
    assert has_distributed, 'torch.distributed did not import correctly, please use a PyTorch version with support.'

    if use_horovod:
        assert hvd is not None, 'Please install horovod'
        if gather_with_grad:
            all_image_features = hvd.allgather(image_features)
            all_text_features = hvd.allgather(text_features)
            return all_image_features, all_text_features

        with torch.no_grad():
            all_image_features = hvd.allgather(image_features)
            all_text_features = hvd.allgather(text_features)
        if local_loss:
            return all_image_features, all_text_features

        # ensure grads for local rank when all_* features don't have a gradient
        image_chunks = list(all_image_features.chunk(world_size, dim=0))
        text_chunks = list(all_text_features.chunk(world_size, dim=0))
        image_chunks[rank] = image_features
        text_chunks[rank] = text_features
        return torch.cat(image_chunks, dim=0), torch.cat(text_chunks, dim=0)

    # We gather tensors from all gpus
    if gather_with_grad:
        all_image_features = torch.cat(torch.distributed.nn.all_gather(image_features), dim=0)
        all_text_features = torch.cat(torch.distributed.nn.all_gather(text_features), dim=0)
        return all_image_features, all_text_features

    image_chunks = [torch.zeros_like(image_features) for _ in range(world_size)]
    text_chunks = [torch.zeros_like(text_features) for _ in range(world_size)]
    dist.all_gather(image_chunks, image_features)
    dist.all_gather(text_chunks, text_features)
    if not local_loss:
        # ensure grads for local rank when all_* features don't have a gradient
        image_chunks[rank] = image_features
        text_chunks[rank] = text_features
    return torch.cat(image_chunks, dim=0), torch.cat(text_chunks, dim=0)
|
||||
|
||||
|
||||
class ClipLoss(nn.Module):
    """Symmetric cross-entropy loss over image-to-text and text-to-image logits."""

    def __init__(
            self,
            local_loss=False,
            gather_with_grad=False,
            cache_labels=False,
            rank=0,
            world_size=1,
            use_horovod=False,
    ):
        super().__init__()
        # gathering / distribution settings
        self.local_loss = local_loss
        self.gather_with_grad = gather_with_grad
        self.rank = rank
        self.world_size = world_size
        self.use_horovod = use_horovod

        # cache state
        self.cache_labels = cache_labels
        self.prev_num_logits = 0
        self.labels = {}

    def get_ground_truth(self, device, num_logits) -> torch.Tensor:
        """Return target indices ``0..num_logits-1`` on ``device``.

        The indices are offset by ``num_logits * rank`` when the loss is local
        and ``world_size > 1``. They are stored per device when
        ``cache_labels`` is set and reused while ``num_logits`` is unchanged.
        """
        cache_hit = self.prev_num_logits == num_logits and device in self.labels
        if cache_hit:
            return self.labels[device]

        labels = torch.arange(num_logits, device=device, dtype=torch.long)
        if self.world_size > 1 and self.local_loss:
            labels = labels + num_logits * self.rank
        if self.cache_labels:
            self.labels[device] = labels
            self.prev_num_logits = num_logits
        return labels

    def get_logits(self, image_features, text_features, logit_scale):
        """Return ``(logits_per_image, logits_per_text)`` scaled by ``logit_scale``."""
        if self.world_size <= 1:
            logits_per_image = logit_scale * image_features @ text_features.T
            logits_per_text = logit_scale * text_features @ image_features.T
            return logits_per_image, logits_per_text

        all_image_features, all_text_features = gather_features(
            image_features, text_features,
            self.local_loss, self.gather_with_grad, self.rank, self.world_size, self.use_horovod)

        if self.local_loss:
            # local rows against the gathered columns
            logits_per_image = logit_scale * image_features @ all_text_features.T
            logits_per_text = logit_scale * text_features @ all_image_features.T
        else:
            logits_per_image = logit_scale * all_image_features @ all_text_features.T
            logits_per_text = logits_per_image.T
        return logits_per_image, logits_per_text

    def forward(self, image_features, text_features, logit_scale, output_dict=False):
        """Return the mean of the two cross-entropy terms.

        ``output_dict`` is accepted but not read; a tensor is always returned.
        """
        logits_per_image, logits_per_text = self.get_logits(image_features, text_features, logit_scale)
        targets = self.get_ground_truth(image_features.device, logits_per_image.shape[0])

        image_term = F.cross_entropy(logits_per_image, targets)
        text_term = F.cross_entropy(logits_per_text, targets)
        return (image_term + text_term) / 2
|
||||
|
||||
class PreferenceLoss(nn.Module):
    """Cross-entropy over the images that belong to each prompt."""

    def forward(self, logits_per_image, num_images, labels):
        """Compute the loss.

        Rows of ``logits_per_image`` are split into consecutive groups sized by
        ``num_images``. Group ``i`` keeps only column ``i``. The groups are
        padded to a common length with -999 and scored against ``labels``.
        """
        groups = logits_per_image.split(num_images.tolist())
        per_prompt_logits = []
        for prompt_idx, group in enumerate(groups):
            per_prompt_logits.append(group[:, prompt_idx])

        padded_logits = pad_sequence(per_prompt_logits, batch_first=True, padding_value=-999)
        return F.cross_entropy(padded_logits, labels)
|
||||
|
||||
class HPSLoss(nn.Module):
    """Label-weighted two-way cross-entropy on paired logits."""

    def forward(self, text_logits, labels):
        """Compute the summed loss.

        ``text_logits`` and ``labels`` are each split in two along the last
        dim. The diagonals of the two logit halves form a 2-class logit pair
        per row. The losses for class 0 and class 1 are weighted by the two
        label halves and summed.
        """
        device = text_logits.device
        first_half, second_half = text_logits.chunk(2, dim=-1)
        label_0, label_1 = labels.chunk(2, dim=-1)

        # pick entry (i, i) of each half
        diag = torch.arange(first_half.shape[0], device=device, dtype=torch.long)
        pair_logits = torch.stack([first_half[diag, diag], second_half[diag, diag]], dim=-1)

        target_0 = torch.zeros(pair_logits.shape[0], device=device, dtype=torch.long)
        target_1 = target_0 + 1
        loss_0 = torch.nn.functional.cross_entropy(pair_logits, target_0, reduction="none")
        loss_1 = torch.nn.functional.cross_entropy(pair_logits, target_1, reduction="none")

        weighted = label_0 * loss_0 + label_1 * loss_1

        # absolute_example_weight = 1 / num_per_prompt
        # denominator = absolute_example_weight.sum()
        # weight_per_example = absolute_example_weight / denominator
        # text_loss *= weight_per_example

        return weighted.sum()
|
||||
|
||||
class RankingLoss(nn.Module):
    """Pairwise hinge loss between logit differences and label differences."""

    def forward(self, logits_per_image, num_images, labels, margin = 1.0):
        """Compute the mean hinge loss.

        Logits and labels are split into groups sized by ``num_images``.
        Group ``i`` keeps column ``i`` of its logits. Logits are padded with -1
        and labels with 10. The hinge ``max(0, margin - diff * diff_label)`` is
        averaged over the lower triangle (diagonal included) of each pair
        matrix.
        """
        group_sizes = num_images.tolist()
        logit_groups = logits_per_image.split(group_sizes)
        scores = [group[:, col] for col, group in enumerate(logit_groups)]
        ranks = list(labels.split(group_sizes))
        # ranked_logits = [torch.index_select(paired_logits_list[i], 0, rank) for i, rank in enumerate(label_list)]

        padded_scores = pad_sequence(scores, batch_first=True, padding_value=-1)
        padded_ranks = pad_sequence(ranks, batch_first=True, padding_value=10)

        # regulized_logits = torch.log(torch.sigmoid(paired_logits))

        # entry [b, i, j] is value[b, j] - value[b, i]; the label difference is negated
        score_diff = padded_scores.unsqueeze(1) - padded_scores.unsqueeze(2)
        # diff_label = torch.clamp(padded_labels.unsqueeze(1) - padded_labels.unsqueeze(2), min=-1, max=1)
        rank_diff = - (padded_ranks.unsqueeze(1) - padded_ranks.unsqueeze(2))

        width = score_diff.shape[1]
        upper = torch.triu(torch.ones(width, width), diagonal=1).bool().detach()
        keep = ~upper

        hinge = torch.clamp(margin - torch.mul(score_diff[:, keep], rank_diff[:, keep]), min=0)
        return hinge.mean()
|
||||
|
||||
class CoCaLoss(ClipLoss):
    """Weighted contrastive loss plus a weighted captioning cross-entropy."""

    def __init__(
            self,
            caption_loss_weight,
            clip_loss_weight,
            pad_id=0,  # pad_token for open_clip custom tokenizer
            local_loss=False,
            gather_with_grad=False,
            cache_labels=False,
            rank=0,
            world_size=1,
            use_horovod=False,
    ):
        super().__init__(
            local_loss=local_loss,
            gather_with_grad=gather_with_grad,
            cache_labels=cache_labels,
            rank=rank,
            world_size=world_size,
            use_horovod=use_horovod
        )

        self.clip_loss_weight = clip_loss_weight
        self.caption_loss_weight = caption_loss_weight
        # targets equal to pad_id are ignored by the caption loss
        self.caption_loss = nn.CrossEntropyLoss(ignore_index=pad_id)

    def forward(self, image_features, text_features, logits, labels, logit_scale, output_dict=False):
        """Return the weighted contrastive and caption losses.

        The result is a ``(clip_loss, caption_loss)`` tuple, or a dict with
        keys ``contrastive_loss`` / ``caption_loss`` when ``output_dict`` is
        true.
        """
        contrastive = super().forward(image_features, text_features, logit_scale)
        contrastive = self.clip_loss_weight * contrastive

        # CrossEntropyLoss expects the class dim second, hence the permute
        class_first_logits = logits.permute(0, 2, 1)
        caption = self.caption_loss(class_first_logits, labels)
        caption = caption * self.caption_loss_weight

        if not output_dict:
            return contrastive, caption
        return {"contrastive_loss": contrastive, "caption_loss": caption}
|
||||
|
||||
|
||||
class DistillClipLoss(ClipLoss):
    """Contrastive loss plus a distillation term against teacher logits."""

    def dist_loss(self, teacher_logits, student_logits):
        """Cross-entropy of student log-probabilities under the teacher distribution.

        Softmax is taken over dim 1; the result is averaged over dim 0.
        """
        teacher_probs = teacher_logits.softmax(dim=1)
        student_log_probs = student_logits.log_softmax(dim=1)
        return -(teacher_probs * student_log_probs).sum(dim=1).mean(dim=0)

    def forward(
            self,
            image_features,
            text_features,
            logit_scale,
            dist_image_features,
            dist_text_features,
            dist_logit_scale,
            output_dict=False,
    ):
        """Return the contrastive and distillation losses.

        The result is a ``(contrastive_loss, distill_loss)`` tuple, or a dict
        with those two keys when ``output_dict`` is true. The ``dist_*``
        arguments are the teacher's features and logit scale.
        """
        student_img_logits, student_txt_logits = \
            self.get_logits(image_features, text_features, logit_scale)

        teacher_img_logits, teacher_txt_logits = \
            self.get_logits(dist_image_features, dist_text_features, dist_logit_scale)

        targets = self.get_ground_truth(image_features.device, student_img_logits.shape[0])

        contrastive_loss = (
            F.cross_entropy(student_img_logits, targets) +
            F.cross_entropy(student_txt_logits, targets)
        ) / 2

        distill_loss = (
            self.dist_loss(teacher_img_logits, student_img_logits) +
            self.dist_loss(teacher_txt_logits, student_txt_logits)
        ) / 2

        if not output_dict:
            return contrastive_loss, distill_loss
        return {"contrastive_loss": contrastive_loss, "distill_loss": distill_loss}
|
||||
461
diffsynth/extensions/ImageQualityMetric/open_clip/model.py
Normal file
461
diffsynth/extensions/ImageQualityMetric/open_clip/model.py
Normal file
@@ -0,0 +1,461 @@
|
||||
""" CLIP Model
|
||||
|
||||
Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
|
||||
"""
|
||||
from dataclasses import dataclass
|
||||
import logging
|
||||
import math
|
||||
from typing import Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import nn
|
||||
from torch.utils.checkpoint import checkpoint
|
||||
|
||||
from .hf_model import HFTextEncoder
|
||||
from .modified_resnet import ModifiedResNet
|
||||
from .timm_model import TimmModel
|
||||
from .transformer import LayerNormFp32, LayerNorm, QuickGELU, Attention, VisionTransformer, TextTransformer
|
||||
from .utils import to_2tuple
|
||||
|
||||
|
||||
@dataclass
class CLIPVisionCfg:
    """Configuration values for the vision tower."""
    layers: Union[Tuple[int, int, int, int], int] = 12
    width: int = 768
    head_width: int = 64
    mlp_ratio: float = 4.0
    patch_size: int = 16
    image_size: Union[Tuple[int, int], int] = 224
    # layer scale initial value
    ls_init_value: Optional[float] = None
    # what fraction of patches to dropout during training (0 would mean disabled
    # and no patches dropped) - 0.5 to 0.75 recommended in the paper for optimal results
    patch_dropout: float = 0.
    # whether to use dual patchnorm - would only apply the input layernorm on each
    # patch, as post-layernorm already exist in original clip vit design
    input_patchnorm: bool = False
    # whether to global average pool the last embedding layer, instead of using
    # CLS token (https://arxiv.org/abs/2205.01580)
    global_average_pool: bool = False
    # whether to use attentional pooler in the last embedding layer
    attentional_pool: bool = False
    # n_queries for attentional pooler
    n_queries: int = 256
    # n heads for attentional_pooling
    attn_pooler_heads: int = 8
    # a valid model name overrides layers, width, patch_size
    timm_model_name: str = None
    # use (imagenet) pretrained weights for named model
    timm_model_pretrained: bool = False
    # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '')
    timm_pool: str = 'avg'
    # linear projection for timm model output ('linear', 'mlp', '')
    timm_proj: str = 'linear'
    # enable bias final projection
    timm_proj_bias: bool = False
    # head dropout
    timm_drop: float = 0.
    # backbone stochastic depth
    timm_drop_path: Optional[float] = None
    output_tokens: bool = False
|
||||
|
||||
|
||||
@dataclass
class CLIPTextCfg:
    """Configuration values for the text tower."""
    context_length: int = 77
    vocab_size: int = 49408
    width: int = 512
    heads: int = 8
    layers: int = 12
    # layer scale initial value
    ls_init_value: Optional[float] = None
    hf_model_name: str = None
    hf_tokenizer_name: str = None
    hf_model_pretrained: bool = True
    proj: str = 'mlp'
    pooler_type: str = 'mean_pooler'
    embed_cls: bool = False
    pad_id: int = 0
    output_tokens: bool = False
|
||||
|
||||
|
||||
def get_cast_dtype(precision: str):
    """Map a precision name to a torch dtype.

    ``'bf16'`` gives ``torch.bfloat16``, ``'fp16'`` gives ``torch.float16``,
    and any other value gives ``None``.
    """
    if precision == 'bf16':
        return torch.bfloat16
    if precision == 'fp16':
        return torch.float16
    return None
|
||||
|
||||
|
||||
def _build_vision_tower(
        embed_dim: int,
        vision_cfg: CLIPVisionCfg,
        quick_gelu: bool = False,
        cast_dtype: Optional[torch.dtype] = None
):
    """Instantiate the vision tower described by ``vision_cfg``.

    The tower type is chosen in this order: ``TimmModel`` when
    ``timm_model_name`` is set, ``ModifiedResNet`` when ``layers`` is a
    tuple/list, and ``VisionTransformer`` otherwise. A dict ``vision_cfg`` is
    first converted to ``CLIPVisionCfg``.
    """
    if isinstance(vision_cfg, dict):
        vision_cfg = CLIPVisionCfg(**vision_cfg)

    if vision_cfg.timm_model_name:
        # NOTE: timm models always use native GELU regardless of quick_gelu flag.
        return TimmModel(
            vision_cfg.timm_model_name,
            pretrained=vision_cfg.timm_model_pretrained,
            pool=vision_cfg.timm_pool,
            proj=vision_cfg.timm_proj,
            proj_bias=vision_cfg.timm_proj_bias,
            drop=vision_cfg.timm_drop,
            drop_path=vision_cfg.timm_drop_path,
            embed_dim=embed_dim,
            image_size=vision_cfg.image_size,
        )

    if isinstance(vision_cfg.layers, (tuple, list)):
        resnet_heads = vision_cfg.width * 32 // vision_cfg.head_width
        return ModifiedResNet(
            layers=vision_cfg.layers,
            output_dim=embed_dim,
            heads=resnet_heads,
            image_size=vision_cfg.image_size,
            width=vision_cfg.width,
        )

    # OpenAI models are pretrained w/ QuickGELU but native nn.GELU is both faster and more
    # memory efficient in recent PyTorch releases (>= 1.10).
    activation = QuickGELU if quick_gelu else nn.GELU
    low_precision = cast_dtype in (torch.float16, torch.bfloat16)
    normalization = LayerNormFp32 if low_precision else LayerNorm
    vit_heads = vision_cfg.width // vision_cfg.head_width
    return VisionTransformer(
        image_size=vision_cfg.image_size,
        patch_size=vision_cfg.patch_size,
        width=vision_cfg.width,
        layers=vision_cfg.layers,
        heads=vit_heads,
        mlp_ratio=vision_cfg.mlp_ratio,
        ls_init_value=vision_cfg.ls_init_value,
        patch_dropout=vision_cfg.patch_dropout,
        input_patchnorm=vision_cfg.input_patchnorm,
        global_average_pool=vision_cfg.global_average_pool,
        attentional_pool=vision_cfg.attentional_pool,
        n_queries=vision_cfg.n_queries,
        attn_pooler_heads=vision_cfg.attn_pooler_heads,
        output_tokens=vision_cfg.output_tokens,
        output_dim=embed_dim,
        act_layer=activation,
        norm_layer=normalization,
    )
|
||||
|
||||
|
||||
def _build_text_tower(
        embed_dim: int,
        text_cfg: CLIPTextCfg,
        quick_gelu: bool = False,
        cast_dtype: Optional[torch.dtype] = None,
):
    """Instantiate the text tower described by ``text_cfg``.

    Returns an ``HFTextEncoder`` when ``hf_model_name`` is set, otherwise a
    ``TextTransformer``. A dict ``text_cfg`` is first converted to
    ``CLIPTextCfg``.
    """
    if isinstance(text_cfg, dict):
        text_cfg = CLIPTextCfg(**text_cfg)

    if text_cfg.hf_model_name:
        return HFTextEncoder(
            text_cfg.hf_model_name,
            output_dim=embed_dim,
            proj=text_cfg.proj,
            pooler_type=text_cfg.pooler_type,
            pretrained=text_cfg.hf_model_pretrained,
            output_tokens=text_cfg.output_tokens,
        )

    activation = QuickGELU if quick_gelu else nn.GELU
    low_precision = cast_dtype in (torch.float16, torch.bfloat16)
    normalization = LayerNormFp32 if low_precision else LayerNorm
    return TextTransformer(
        context_length=text_cfg.context_length,
        vocab_size=text_cfg.vocab_size,
        width=text_cfg.width,
        heads=text_cfg.heads,
        layers=text_cfg.layers,
        ls_init_value=text_cfg.ls_init_value,
        output_dim=embed_dim,
        embed_cls=text_cfg.embed_cls,
        output_tokens=text_cfg.output_tokens,
        pad_id=text_cfg.pad_id,
        act_layer=activation,
        norm_layer=normalization,
    )
|
||||
|
||||
|
||||
class CLIP(nn.Module):
    """CLIP model whose text-tower components are held directly on the module."""
    output_dict: torch.jit.Final[bool]

    def __init__(
            self,
            embed_dim: int,
            vision_cfg: CLIPVisionCfg,
            text_cfg: CLIPTextCfg,
            quick_gelu: bool = False,
            cast_dtype: Optional[torch.dtype] = None,
            output_dict: bool = False,
    ):
        super().__init__()
        self.output_dict = output_dict
        self.visual = _build_vision_tower(embed_dim, vision_cfg, quick_gelu, cast_dtype)

        # The text tower is built only to lift its parts onto this module.
        text_tower = _build_text_tower(embed_dim, text_cfg, quick_gelu, cast_dtype)
        self.transformer = text_tower.transformer
        self.vocab_size = text_tower.vocab_size
        self.token_embedding = text_tower.token_embedding
        self.positional_embedding = text_tower.positional_embedding
        self.ln_final = text_tower.ln_final
        self.text_projection = text_tower.text_projection
        self.register_buffer('attn_mask', text_tower.attn_mask, persistent=False)

        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

    def lock_image_tower(self, unlocked_groups=0, freeze_bn_stats=False):
        """Delegate to ``self.visual.lock``."""
        # lock image tower as per LiT - https://arxiv.org/abs/2111.07991
        self.visual.lock(unlocked_groups=unlocked_groups, freeze_bn_stats=freeze_bn_stats)

    def lock_text_tower(self, unlocked_layers: int = 0, freeze_layer_norm: bool = True):
        """Freeze the text side, leaving the last ``unlocked_layers`` blocks untouched.

        With ``unlocked_layers == 0`` the whole transformer, ``ln_final`` and
        ``text_projection`` are frozen as well. Parameters whose dotted name
        contains a ``LayerNorm`` component get
        ``requires_grad = not freeze_layer_norm``.
        """
        self.positional_embedding.requires_grad = False
        if unlocked_layers > 0:
            frozen_modules = [
                self.token_embedding,
                self.transformer.resblocks[:-unlocked_layers],
            ]
        else:
            frozen_modules = [self.token_embedding, self.transformer, self.ln_final]
            self.text_projection.requires_grad = False

        # freeze layers
        for module in frozen_modules:
            for param_name, param in module.named_parameters():
                if "LayerNorm" in param_name.split("."):
                    param.requires_grad = not freeze_layer_norm
                else:
                    param.requires_grad = False

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        """Set gradient checkpointing on both towers."""
        self.visual.set_grad_checkpointing(enable)
        self.transformer.grad_checkpointing = enable

    def encode_image(self, image, normalize: bool = False):
        """Run the vision tower; L2-normalize along the last dim if requested."""
        features = self.visual(image)
        if normalize:
            return F.normalize(features, dim=-1)
        return features

    def encode_text(self, text, normalize: bool = False):
        """Encode token ids ``text``; L2-normalize along the last dim if requested."""
        cast_dtype = self.transformer.get_cast_dtype()

        x = self.token_embedding(text).to(cast_dtype)  # [batch_size, n_ctx, d_model]

        x = x + self.positional_embedding.to(cast_dtype)
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x, attn_mask=self.attn_mask)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_final(x)  # [batch_size, n_ctx, transformer.width]
        # take features from the eot embedding (eot_token is the highest number in each sequence)
        batch_index = torch.arange(x.shape[0])
        eot_index = text.argmax(dim=-1)
        x = x[batch_index, eot_index] @ self.text_projection
        if normalize:
            return F.normalize(x, dim=-1)
        return x

    def forward(self, image, text):
        """Return normalized image/text features and ``exp(logit_scale)``.

        The result is a tuple, or a dict with keys ``image_features``,
        ``text_features`` and ``logit_scale`` when ``output_dict`` is set.
        """
        image_features = self.encode_image(image, normalize=True)
        text_features = self.encode_text(text, normalize=True)
        if not self.output_dict:
            return image_features, text_features, self.logit_scale.exp()
        return {
            "image_features": image_features,
            "text_features": text_features,
            "logit_scale": self.logit_scale.exp()
        }
|
||||
|
||||
|
||||
class CustomTextCLIP(nn.Module):
    """CLIP variant that keeps the text tower as a single ``self.text`` submodule."""
    output_dict: torch.jit.Final[bool]

    def __init__(
            self,
            embed_dim: int,
            vision_cfg: CLIPVisionCfg,
            text_cfg: CLIPTextCfg,
            quick_gelu: bool = False,
            cast_dtype: Optional[torch.dtype] = None,
            output_dict: bool = False,
    ):
        super().__init__()
        self.output_dict = output_dict
        self.visual = _build_vision_tower(embed_dim, vision_cfg, quick_gelu, cast_dtype)
        self.text = _build_text_tower(embed_dim, text_cfg, quick_gelu, cast_dtype)
        initial_scale = np.log(1 / 0.07)
        self.logit_scale = nn.Parameter(torch.ones([]) * initial_scale)

    def lock_image_tower(self, unlocked_groups=0, freeze_bn_stats=False):
        """Delegate to ``self.visual.lock``."""
        # lock image tower as per LiT - https://arxiv.org/abs/2111.07991
        self.visual.lock(unlocked_groups=unlocked_groups, freeze_bn_stats=freeze_bn_stats)

    def lock_text_tower(self, unlocked_layers: int = 0, freeze_layer_norm: bool = True):
        """Delegate to ``self.text.lock``."""
        self.text.lock(unlocked_layers, freeze_layer_norm)

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        """Set gradient checkpointing on both towers."""
        self.visual.set_grad_checkpointing(enable)
        self.text.set_grad_checkpointing(enable)

    def encode_image(self, image, normalize: bool = False):
        """Run the vision tower; L2-normalize along the last dim if requested."""
        features = self.visual(image)
        if normalize:
            return F.normalize(features, dim=-1)
        return features

    def encode_text(self, text, normalize: bool = False):
        """Run the text tower; L2-normalize along the last dim if requested."""
        features = self.text(text)
        if normalize:
            return F.normalize(features, dim=-1)
        return features

    def forward(self, image, text):
        """Return normalized image/text features and ``exp(logit_scale)``.

        The result is a tuple, or a dict with keys ``image_features``,
        ``text_features`` and ``logit_scale`` when ``output_dict`` is set.
        """
        image_features = self.encode_image(image, normalize=True)
        text_features = self.encode_text(text, normalize=True)
        if not self.output_dict:
            return image_features, text_features, self.logit_scale.exp()
        return {
            "image_features": image_features,
            "text_features": text_features,
            "logit_scale": self.logit_scale.exp()
        }
|
||||
|
||||
|
||||
def convert_weights_to_lp(model: nn.Module, dtype=torch.float16):
    """Convert applicable model parameters to low-precision (bf16 or fp16).

    Walks every sub-module via ``model.apply`` and casts, in place:

    * ``weight`` / ``bias`` of ``nn.Conv1d``, ``nn.Conv2d`` and ``nn.Linear``;
    * the projection weights and biases of ``nn.MultiheadAttention`` / ``Attention``;
    * tensor attributes named ``text_projection`` or ``proj``.

    Args:
        model: module whose parameters are cast in place.
        dtype: target dtype (defaults to ``torch.float16``).
    """
    attention_attrs = (
        "in_proj_weight", "q_proj_weight", "k_proj_weight", "v_proj_weight",
        "in_proj_bias", "bias_k", "bias_v",
    )

    def _convert_weights(l):
        if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
            l.weight.data = l.weight.data.to(dtype)
            if l.bias is not None:
                l.bias.data = l.bias.data.to(dtype)

        if isinstance(l, (nn.MultiheadAttention, Attention)):
            for attr in attention_attrs:
                # Not every attention implementation defines all of these
                # attributes; a missing one is treated like an unset (None) one
                # instead of raising AttributeError.
                tensor = getattr(l, attr, None)
                if tensor is not None:
                    tensor.data = tensor.data.to(dtype)

        for name in ("text_projection", "proj"):
            attr = getattr(l, name, None)
            # Only raw tensors/parameters are cast here.  A sub-module that
            # happens to be called `proj` has no `.data`; it is visited on its
            # own by `model.apply` and handled by the branches above.
            if isinstance(attr, torch.Tensor):
                attr.data = attr.data.to(dtype)

    model.apply(_convert_weights)


convert_weights_to_fp16 = convert_weights_to_lp  # backwards compat
|
||||
|
||||
|
||||
# used to maintain checkpoint compatibility
|
||||
def convert_to_custom_text_state_dict(state_dict: dict):
    """Move the text-tower entries of an old-format state dict under the ``text.`` prefix.

    A state dict counts as old-format when it has a top-level ``text_projection``
    key.  In that case a new dict is returned in which every key starting with
    ``text_projection``, ``positional_embedding``, ``token_embedding``,
    ``transformer`` or ``ln_final`` is prefixed with ``text.``; all other keys
    are copied unchanged.  Otherwise the input object itself is returned.
    """
    if 'text_projection' not in state_dict:
        return state_dict

    # old format state_dict, move text tower -> .text
    text_prefixes = (
        'text_projection',
        'positional_embedding',
        'token_embedding',
        'transformer',
        'ln_final',
    )
    return {
        ('text.' + key if key.startswith(text_prefixes) else key): value
        for key, value in state_dict.items()
    }
|
||||
|
||||
|
||||
def build_model_from_openai_state_dict(
        state_dict: dict,
        quick_gelu=True,
        cast_dtype=torch.float16,
):
    """Infer the architecture from an OpenAI CLIP state dict, build a ``CLIP`` and load the weights.

    The vision tower is treated as a ViT when the key ``visual.proj`` is present
    and as a ResNet otherwise; all sizes are read from tensor shapes and key
    names.  The bookkeeping keys ``input_resolution``, ``context_length`` and
    ``vocab_size`` are popped from ``state_dict`` in place (the caller's dict is
    modified) before loading.  Returns the model in eval mode.
    """
    is_vit = "visual.proj" in state_dict

    if is_vit:
        patch_weight_shape = state_dict["visual.conv1.weight"].shape
        vision_width = patch_weight_shape[0]
        vision_layers = sum(
            1 for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight"))
        vision_patch_size = patch_weight_shape[-1]
        # positional embedding has one row per patch plus one extra row
        grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
        image_size = vision_patch_size * grid_size
    else:
        # number of distinct block indices under each of visual.layer1..4
        vision_layers = tuple(
            len({k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}")})
            for b in (1, 2, 3, 4)
        )
        vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
        attnpool_rows = state_dict["visual.attnpool.positional_embedding"].shape[0]
        output_width = round((attnpool_rows - 1) ** 0.5)
        vision_patch_size = None
        assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
        image_size = output_width * 32

    embed_dim = state_dict["text_projection"].shape[1]
    context_length = state_dict["positional_embedding"].shape[0]
    vocab_size = state_dict["token_embedding.weight"].shape[0]
    transformer_width = state_dict["ln_final.weight"].shape[0]
    transformer_heads = transformer_width // 64
    transformer_layers = len({k.split(".")[2] for k in state_dict if k.startswith("transformer.resblocks")})

    vision_cfg = CLIPVisionCfg(
        layers=vision_layers,
        width=vision_width,
        patch_size=vision_patch_size,
        image_size=image_size,
    )
    text_cfg = CLIPTextCfg(
        context_length=context_length,
        vocab_size=vocab_size,
        width=transformer_width,
        heads=transformer_heads,
        layers=transformer_layers,
    )
    model = CLIP(
        embed_dim,
        vision_cfg=vision_cfg,
        text_cfg=text_cfg,
        quick_gelu=quick_gelu,  # OpenAI models were trained with QuickGELU
        cast_dtype=cast_dtype,
    )

    # these entries are metadata, not module weights
    for meta_key in ("input_resolution", "context_length", "vocab_size"):
        state_dict.pop(meta_key, None)

    convert_weights_to_fp16(model)  # OpenAI state dicts are partially converted to float16
    model.load_state_dict(state_dict)
    return model.eval()
|
||||
|
||||
|
||||
def trace_model(model, batch_size=256, device=torch.device('cpu')):
    """Trace ``forward``, ``encode_text`` and ``encode_image`` of ``model`` with dummy inputs.

    The model is switched to eval mode, traced via ``torch.jit.trace_module``
    using an all-ones image batch of shape ``(batch_size, 3, S, S)`` (``S`` is
    ``model.visual.image_size``) and an all-zero int token batch of shape
    ``(batch_size, model.context_length)``.  ``image_size`` is written back onto
    the traced module's ``visual`` attribute before it is returned.
    """
    model.eval()
    image_size = model.visual.image_size

    dummy_images = torch.ones((batch_size, 3, image_size, image_size), device=device)
    dummy_tokens = torch.zeros((batch_size, model.context_length), dtype=torch.int, device=device)
    trace_inputs = {
        "forward": (dummy_images, dummy_tokens),
        "encode_text": (dummy_tokens,),
        "encode_image": (dummy_images,),
    }

    traced = torch.jit.trace_module(model, inputs=trace_inputs)
    traced.visual.image_size = image_size
    return traced
|
||||
|
||||
|
||||
def resize_pos_embed(state_dict, model, interpolation: str = 'bicubic', antialias: bool = True):
    """Rescale the grid of position embeddings when loading from state_dict.

    Reads ``state_dict['visual.positional_embedding']`` and, when its length
    differs from what ``model.visual.grid_size`` requires (grid cells plus one
    extra token), interpolates the grid part to the new size and writes the
    result back into ``state_dict`` in place.  Does nothing when the key is
    missing, the model's ``visual`` has no ``grid_size``, or the lengths already match.
    """
    key = 'visual.positional_embedding'
    old_pos_embed = state_dict.get(key, None)
    if old_pos_embed is None or not hasattr(model.visual, 'grid_size'):
        return

    grid_size = to_2tuple(model.visual.grid_size)
    extra_tokens = 1  # FIXME detect different token configs (ie no class token, or more)
    num_grid_tokens = grid_size[0] * grid_size[1]
    if num_grid_tokens + extra_tokens == old_pos_embed.shape[0]:
        return

    # split off the leading extra token(s); only the grid part is interpolated
    if extra_tokens:
        pos_emb_tok = old_pos_embed[:extra_tokens]
        pos_emb_img = old_pos_embed[extra_tokens:]
    else:
        pos_emb_tok = None
        pos_emb_img = old_pos_embed
    old_side = int(math.sqrt(len(pos_emb_img)))
    old_grid_size = to_2tuple(old_side)

    logging.info('Resizing position embedding grid-size from %s to %s', old_grid_size, grid_size)

    # (tokens, dim) -> (1, dim, H, W) so F.interpolate can resize the grid
    grid = pos_emb_img.reshape(1, old_grid_size[0], old_grid_size[1], -1).permute(0, 3, 1, 2)
    grid = F.interpolate(
        grid,
        size=grid_size,
        mode=interpolation,
        antialias=antialias,
        align_corners=False,
    )
    # back to (tokens, dim)
    pos_emb_img = grid.permute(0, 2, 3, 1).reshape(1, grid_size[0] * grid_size[1], -1)[0]

    if pos_emb_tok is None:
        new_pos_embed = pos_emb_img
    else:
        new_pos_embed = torch.cat([pos_emb_tok, pos_emb_img], dim=0)
    state_dict[key] = new_pos_embed
|
||||
@@ -0,0 +1,17 @@
|
||||
{
|
||||
"embed_dim": 1024,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 32,
|
||||
"width": 1280,
|
||||
"head_width": 80,
|
||||
"patch_size": 14
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 1024,
|
||||
"heads": 16,
|
||||
"layers": 24
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,181 @@
|
||||
from collections import OrderedDict
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
from .utils import freeze_batch_norm_2d
|
||||
|
||||
|
||||
class Bottleneck(nn.Module):
    """Residual bottleneck block: 1x1 conv -> 3x3 conv -> optional avgpool -> 1x1 conv, plus a shortcut.

    The output has ``planes * expansion`` channels.  No convolution is strided;
    when ``stride > 1`` the spatial reduction is done by an ``AvgPool2d`` after
    the second convolution and, on the shortcut path, before its 1x1 convolution.
    """
    expansion = 4

    def __init__(self, inplanes, planes, stride=1):
        super().__init__()
        out_planes = planes * self.expansion

        # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.act1 = nn.ReLU(inplace=True)

        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.act2 = nn.ReLU(inplace=True)

        if stride > 1:
            self.avgpool = nn.AvgPool2d(stride)
        else:
            self.avgpool = nn.Identity()

        self.conv3 = nn.Conv2d(planes, out_planes, 1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)
        self.act3 = nn.ReLU(inplace=True)

        self.stride = stride

        # A projection shortcut is needed whenever the residual changes shape.
        needs_projection = stride > 1 or inplanes != planes * Bottleneck.expansion
        if needs_projection:
            # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
            shortcut_layers = OrderedDict()
            shortcut_layers["-1"] = nn.AvgPool2d(stride)
            shortcut_layers["0"] = nn.Conv2d(inplanes, out_planes, 1, stride=1, bias=False)
            shortcut_layers["1"] = nn.BatchNorm2d(out_planes)
            self.downsample = nn.Sequential(shortcut_layers)
        else:
            self.downsample = None

    def forward(self, x: torch.Tensor):
        """Apply the three conv stages and add the (possibly projected) input before the final ReLU."""
        out = self.conv1(x)
        out = self.act1(self.bn1(out))
        out = self.conv2(out)
        out = self.act2(self.bn2(out))
        out = self.avgpool(out)
        out = self.bn3(self.conv3(out))

        identity = x if self.downsample is None else self.downsample(x)

        out += identity
        return self.act3(out)
|
||||
|
||||
|
||||
class AttentionPool2d(nn.Module):
    """Pool an NCHW feature map to one vector per sample with multi-head attention.

    Spatial positions are flattened into tokens, their mean is prepended as an
    extra token, a learned positional embedding is added, and multi-head
    attention is run over the tokens.  The output at the prepended mean-token
    position is returned, projected to ``output_dim`` (or ``embed_dim`` when
    ``output_dim`` is falsy).
    """

    def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
        super().__init__()
        num_positions = spacial_dim ** 2 + 1  # one per grid cell plus the mean token
        self.positional_embedding = nn.Parameter(torch.randn(num_positions, embed_dim) / embed_dim ** 0.5)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
        self.num_heads = num_heads

    def forward(self, x):
        """Return the attention-pooled features for an NCHW input."""
        batch, channels = x.shape[0], x.shape[1]
        tokens = x.reshape(batch, channels, x.shape[2] * x.shape[3]).permute(2, 0, 1)  # NCHW -> (HW)NC
        mean_token = tokens.mean(dim=0, keepdim=True)
        tokens = torch.cat([mean_token, tokens], dim=0)  # (HW+1)NC
        tokens = tokens + self.positional_embedding[:, None, :].to(tokens.dtype)  # (HW+1)NC

        # q/k/v use separate weights, so their biases are packed into one in_proj_bias
        packed_bias = torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias])
        attended, _ = F.multi_head_attention_forward(
            query=tokens, key=tokens, value=tokens,
            embed_dim_to_check=tokens.shape[-1],
            num_heads=self.num_heads,
            q_proj_weight=self.q_proj.weight,
            k_proj_weight=self.k_proj.weight,
            v_proj_weight=self.v_proj.weight,
            in_proj_weight=None,
            in_proj_bias=packed_bias,
            bias_k=None,
            bias_v=None,
            add_zero_attn=False,
            dropout_p=0.,
            out_proj_weight=self.c_proj.weight,
            out_proj_bias=self.c_proj.bias,
            use_separate_proj_weight=True,
            training=self.training,
            need_weights=False
        )

        # index 0 is the prepended mean token
        return attended[0]
|
||||
|
||||
|
||||
class ModifiedResNet(nn.Module):
    """
    A ResNet variant that differs from torchvision's in three ways:
    - The stem uses 3 convolutions instead of 1, followed by an average pool rather than a max pool.
    - Strided convolutions are anti-aliased: an avgpool is prepended to convolutions with stride > 1.
    - The final pooling layer is a QKV attention (``AttentionPool2d``) instead of an average pool.
    """

    def __init__(self, layers, output_dim, heads, image_size=224, width=64):
        super().__init__()
        self.output_dim = output_dim
        self.image_size = image_size

        # the 3-layer stem
        stem_width = width // 2
        self.conv1 = nn.Conv2d(3, stem_width, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(stem_width)
        self.act1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(stem_width, stem_width, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(stem_width)
        self.act2 = nn.ReLU(inplace=True)
        self.conv3 = nn.Conv2d(stem_width, width, kernel_size=3, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(width)
        self.act3 = nn.ReLU(inplace=True)
        self.avgpool = nn.AvgPool2d(2)

        # residual layers
        self._inplanes = width  # this is a *mutable* variable used during construction
        self.layer1 = self._make_layer(width, layers[0])
        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)

        embed_dim = width * 32  # the ResNet feature dimension
        self.attnpool = AttentionPool2d(image_size // 32, embed_dim, heads, output_dim)

        self.init_parameters()

    def _make_layer(self, planes, blocks, stride=1):
        """Build one stage of ``blocks`` bottlenecks; only the first one receives ``stride``."""
        first_block = Bottleneck(self._inplanes, planes, stride)

        # later blocks consume the expanded output of the previous block
        self._inplanes = planes * Bottleneck.expansion
        remaining = [Bottleneck(self._inplanes, planes) for _ in range(1, blocks)]

        return nn.Sequential(first_block, *remaining)

    def init_parameters(self):
        """Re-initialise the attention-pool projections and zero every ``bn3.weight`` in the stages."""
        if self.attnpool is not None:
            std = self.attnpool.c_proj.in_features ** -0.5
            projections = (
                self.attnpool.q_proj,
                self.attnpool.k_proj,
                self.attnpool.v_proj,
                self.attnpool.c_proj,
            )
            for projection in projections:
                nn.init.normal_(projection.weight, std=std)

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            for name, param in stage.named_parameters():
                if not name.endswith("bn3.weight"):
                    continue
                nn.init.zeros_(param)

    def lock(self, unlocked_groups=0, freeze_bn_stats=False):
        """Disable gradients for every parameter; optionally freeze batch-norm statistics as well."""
        assert unlocked_groups == 0, 'partial locking not currently supported for this model'
        for param in self.parameters():
            param.requires_grad = False
        if freeze_bn_stats:
            freeze_batch_norm_2d(self)

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        """No-op placeholder."""
        # FIXME support for non-transformer
        pass

    def stem(self, x):
        """Run the three stem conv/bn/relu stages followed by the average pool."""
        x = self.conv1(x)
        x = self.act1(self.bn1(x))
        x = self.conv2(x)
        x = self.act2(self.bn2(x))
        x = self.conv3(x)
        x = self.act3(self.bn3(x))
        return self.avgpool(x)

    def forward(self, x):
        """Stem -> four residual stages -> attention pooling."""
        x = self.stem(x)
        x = self.layer2(self.layer1(x))
        x = self.layer4(self.layer3(x))
        return self.attnpool(x)
|
||||
144
diffsynth/extensions/ImageQualityMetric/open_clip/openai.py
Normal file
144
diffsynth/extensions/ImageQualityMetric/open_clip/openai.py
Normal file
@@ -0,0 +1,144 @@
|
||||
""" OpenAI pretrained model functions
|
||||
|
||||
Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
|
||||
"""
|
||||
|
||||
import os
|
||||
import warnings
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import torch
|
||||
|
||||
from .model import build_model_from_openai_state_dict, convert_weights_to_lp, get_cast_dtype
|
||||
from .pretrained import get_pretrained_url, list_pretrained_models_by_tag, download_pretrained_from_url
|
||||
|
||||
__all__ = ["list_openai_models", "load_openai_model"]
|
||||
|
||||
|
||||
def list_openai_models() -> List[str]:
    """Return the names of the models that have weights registered under the 'openai' tag."""
    openai_models = list_pretrained_models_by_tag('openai')
    return openai_models
|
||||
|
||||
|
||||
def load_openai_model(
        name: str,
        precision: Optional[str] = None,
        device: Optional[Union[str, torch.device]] = None,
        jit: bool = True,
        cache_dir: Optional[str] = None,
):
    """Load a CLIP model

    Parameters
    ----------
    name : str
        A model name listed by `list_openai_models()`, or the path to a model checkpoint containing the state_dict
    precision: str
        Model precision, if None defaults to 'fp32' if the device is a CPU device else 'fp16'.
    device : Union[str, torch.device]
        The device to put the loaded model; defaults to "cuda" when available, else "cpu".
    jit : bool
        Whether to load the optimized JIT model (default) or more hackable non-JIT model.
    cache_dir : Optional[str]
        The directory to cache the downloaded model weights

    Returns
    -------
    model : torch.nn.Module
        The CLIP model (only the model is returned; no preprocessing transform).

    Raises
    ------
    RuntimeError
        If `name` is neither a known OpenAI model name nor an existing file.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    if precision is None:
        # Compare the device *type*: a torch.device instance never compares equal
        # to the string 'cpu', so `device == 'cpu'` would pick fp16 for
        # torch.device('cpu').
        precision = 'fp32' if torch.device(device).type == 'cpu' else 'fp16'

    if get_pretrained_url(name, 'openai'):
        model_path = download_pretrained_from_url(get_pretrained_url(name, 'openai'), cache_dir=cache_dir)
    elif os.path.isfile(name):
        model_path = name
    else:
        raise RuntimeError(f"Model {name} not found; available models = {list_openai_models()}")

    try:
        # loading JIT archive
        model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval()
        state_dict = None
    except RuntimeError:
        # loading saved state dict
        if jit:
            warnings.warn(f"File {model_path} is not a JIT archive. Loading as a state dict instead")
            jit = False
        # NOTE(review): torch.load unpickles the file; only use with trusted checkpoints.
        state_dict = torch.load(model_path, map_location="cpu")

    if not jit:
        # Build a non-jit model from the OpenAI jitted model state dict
        cast_dtype = get_cast_dtype(precision)
        try:
            model = build_model_from_openai_state_dict(state_dict or model.state_dict(), cast_dtype=cast_dtype)
        except KeyError:
            # Fallback for checkpoints that nest the weights under "state_dict";
            # the first 7 characters of each key are dropped
            # (presumably a "module." prefix - TODO confirm).
            sd = {k[7:]: v for k, v in state_dict["state_dict"].items()}
            model = build_model_from_openai_state_dict(sd, cast_dtype=cast_dtype)

        # model from OpenAI state dict is in manually cast fp16 mode, must be converted for AMP/fp32/bf16 use
        model = model.to(device)
        if precision.startswith('amp') or precision == 'fp32':
            model.float()
        elif precision == 'bf16':
            convert_weights_to_lp(model, dtype=torch.bfloat16)

        return model

    # patch the device names
    device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
    device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1]

    def patch_device(module):
        """Rewrite every 'cuda*' device constant in the module's graphs to the requested device."""
        try:
            graphs = [module.graph] if hasattr(module, "graph") else []
        except RuntimeError:
            graphs = []

        if hasattr(module, "forward1"):
            graphs.append(module.forward1.graph)

        for graph in graphs:
            for node in graph.findAllNodes("prim::Constant"):
                if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"):
                    node.copyAttributes(device_node)

    model.apply(patch_device)
    patch_device(model.encode_image)
    patch_device(model.encode_text)

    # patch dtype to float32 (typically for CPU)
    if precision == 'fp32':
        float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[])
        float_input = list(float_holder.graph.findNode("aten::to").inputs())[1]
        float_node = float_input.node()

        def patch_float(module):
            """Rewrite dtype constants equal to 5 in `aten::to` calls to the float32 constant."""
            try:
                graphs = [module.graph] if hasattr(module, "graph") else []
            except RuntimeError:
                graphs = []

            if hasattr(module, "forward1"):
                graphs.append(module.forward1.graph)

            for graph in graphs:
                for node in graph.findAllNodes("aten::to"):
                    inputs = list(node.inputs())
                    for i in [1, 2]:  # dtype can be the second or third argument to aten::to()
                        if inputs[i].node()["value"] == 5:
                            inputs[i].node().copyAttributes(float_node)

        model.apply(patch_float)
        patch_float(model.encode_image)
        patch_float(model.encode_text)
        model.float()

    # ensure image_size attr available at consistent location for both jit and non-jit
    model.visual.image_size = model.input_resolution.item()
    return model
|
||||
376
diffsynth/extensions/ImageQualityMetric/open_clip/pretrained.py
Normal file
376
diffsynth/extensions/ImageQualityMetric/open_clip/pretrained.py
Normal file
@@ -0,0 +1,376 @@
|
||||
import hashlib
|
||||
import os
|
||||
import urllib
|
||||
import warnings
|
||||
from functools import partial
|
||||
from typing import Dict, Union
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
from .version import __version__
|
||||
|
||||
try:
|
||||
from huggingface_hub import hf_hub_download
|
||||
hf_hub_download = partial(hf_hub_download, library_name="open_clip", library_version=__version__)
|
||||
_has_hf_hub = True
|
||||
except ImportError:
|
||||
hf_hub_download = None
|
||||
_has_hf_hub = False
|
||||
|
||||
|
||||
def _pcfg(url='', hf_hub='', mean=None, std=None):
    """Bundle the given pretrained-weight settings into a dict with keys url, hf_hub, mean, std."""
    return {
        'url': url,
        'hf_hub': hf_hub,
        'mean': mean,
        'std': std,
    }
|
||||
|
||||
|
||||
# Pretrained-weight tables for the ResNet architectures.
# Each table maps a pretrain tag to a `_pcfg(...)` entry; the first positional
# argument of `_pcfg` is the download URL.
_RN50 = dict(
    openai=_pcfg(
        "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt"),
    yfcc15m=_pcfg(
        "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn50-quickgelu-yfcc15m-455df137.pt"),
    cc12m=_pcfg(
        "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn50-quickgelu-cc12m-f000538c.pt"),
)

# Same entries as _RN50, registered under a separate architecture name.
_RN50_quickgelu = dict(
    openai=_pcfg(
        "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt"),
    yfcc15m=_pcfg(
        "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn50-quickgelu-yfcc15m-455df137.pt"),
    cc12m=_pcfg(
        "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn50-quickgelu-cc12m-f000538c.pt"),
)

_RN101 = dict(
    openai=_pcfg(
        "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt"),
    yfcc15m=_pcfg(
        "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn101-quickgelu-yfcc15m-3e04b30e.pt"),
)

# Same entries as _RN101, registered under a separate architecture name.
_RN101_quickgelu = dict(
    openai=_pcfg(
        "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt"),
    yfcc15m=_pcfg(
        "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn101-quickgelu-yfcc15m-3e04b30e.pt"),
)

_RN50x4 = dict(
    openai=_pcfg(
        "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt"),
)

_RN50x16 = dict(
    openai=_pcfg(
        "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt"),
)

_RN50x64 = dict(
    openai=_pcfg(
        "https://openaipublic.azureedge.net/clip/models/be1cfb55d75a9666199fb2206c106743da0f6468c9d327f3e0d0a543a9919d9c/RN50x64.pt"),
)
|
||||
|
||||
# Pretrained-weight tables for the ViT architectures.
# Entries given only as `hf_hub=...` carry no direct URL (their `url` stays '').
_VITB32 = dict(
    openai=_pcfg(
        "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt"),
    laion400m_e31=_pcfg(
        "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e31-d867053b.pt"),
    laion400m_e32=_pcfg(
        "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e32-46683a32.pt"),
    laion2b_e16=_pcfg(
        "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-laion2b_e16-af8dbd0c.pth"),
    laion2b_s34b_b79k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-laion2B-s34B-b79K/')
)

_VITB32_quickgelu = dict(
    openai=_pcfg(
        "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt"),
    laion400m_e31=_pcfg(
        "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e31-d867053b.pt"),
    laion400m_e32=_pcfg(
        "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e32-46683a32.pt"),
)

_VITB16 = dict(
    openai=_pcfg(
        "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt"),
    laion400m_e31=_pcfg(
        "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16-laion400m_e31-00efa78f.pt"),
    laion400m_e32=_pcfg(
        "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16-laion400m_e32-55e67d44.pt"),
    # Disabled entries (no URL filled in):
    # laion400m_32k=_pcfg(
    #     url="",
    #     mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    # laion400m_64k=_pcfg(
    #     url="",
    #     mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    laion2b_s34b_b88k=_pcfg(hf_hub='laion/CLIP-ViT-B-16-laion2B-s34B-b88K/'),
)

_VITB16_PLUS_240 = dict(
    laion400m_e31=_pcfg(
        "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16_plus_240-laion400m_e31-8fb26589.pt"),
    laion400m_e32=_pcfg(
        "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16_plus_240-laion400m_e32-699c4b84.pt"),
)

_VITL14 = dict(
    openai=_pcfg(
        "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt"),
    laion400m_e31=_pcfg(
        "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_l_14-laion400m_e31-69988bb6.pt"),
    laion400m_e32=_pcfg(
        "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_l_14-laion400m_e32-3d133497.pt"),
    # The only entry in this section that overrides mean/std.
    laion2b_s32b_b82k=_pcfg(
        hf_hub='laion/CLIP-ViT-L-14-laion2B-s32B-b82K/',
        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
)

_VITL14_336 = dict(
    openai=_pcfg(
        "https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt"),
)

_VITH14 = dict(
    laion2b_s32b_b79k=_pcfg(hf_hub='laion/CLIP-ViT-H-14-laion2B-s32B-b79K/'),
)

_VITg14 = dict(
    laion2b_s12b_b42k=_pcfg(hf_hub='laion/CLIP-ViT-g-14-laion2B-s12B-b42K/'),
    laion2b_s34b_b88k=_pcfg(hf_hub='laion/CLIP-ViT-g-14-laion2B-s34B-b88K/'),
)

_VITbigG14 = dict(
    laion2b_s39b_b160k=_pcfg(hf_hub='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k/'),
)
|
||||
|
||||
# Pretrained-weight tables for the RoBERTa-text, ConvNeXt and CoCa architectures.
# Every entry here is given as an `hf_hub` value only.
_robertaViTB32 = dict(
    laion2b_s12b_b32k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-roberta-base-laion2B-s12B-b32k/'),
)

_xlmRobertaBaseViTB32 = dict(
    laion5b_s13b_b90k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-xlm-roberta-base-laion5B-s13B-b90k/'),
)

_xlmRobertaLargeFrozenViTH14 = dict(
    frozen_laion5b_s13b_b90k=_pcfg(hf_hub='laion/CLIP-ViT-H-14-frozen-xlm-roberta-large-laion5B-s13B-b90k/'),
)

_convnext_base = dict(
    laion400m_s13b_b51k=_pcfg(hf_hub='laion/CLIP-convnext_base-laion400M-s13B-b51K/'),
)

_convnext_base_w = dict(
    laion2b_s13b_b82k=_pcfg(hf_hub='laion/CLIP-convnext_base_w-laion2B-s13B-b82K/'),
    laion2b_s13b_b82k_augreg=_pcfg(hf_hub='laion/CLIP-convnext_base_w-laion2B-s13B-b82K-augreg/'),
    laion_aesthetic_s13b_b82k=_pcfg(hf_hub='laion/CLIP-convnext_base_w-laion_aesthetic-s13B-b82K/'),
)

_convnext_base_w_320 = dict(
    laion_aesthetic_s13b_b82k=_pcfg(hf_hub='laion/CLIP-convnext_base_w_320-laion_aesthetic-s13B-b82K/'),
    laion_aesthetic_s13b_b82k_augreg=_pcfg(hf_hub='laion/CLIP-convnext_base_w_320-laion_aesthetic-s13B-b82K-augreg/'),
)

_convnext_large_d = dict(
    laion2b_s26b_b102k_augreg=_pcfg(hf_hub='laion/CLIP-convnext_large_d.laion2B-s26B-b102K-augreg/'),
)

_convnext_large_d_320 = dict(
    laion2b_s29b_b131k_ft=_pcfg(hf_hub='laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft/'),
    laion2b_s29b_b131k_ft_soup=_pcfg(hf_hub='laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup/'),
)

_convnext_xxlarge = dict(
    laion2b_s34b_b82k_augreg=_pcfg(hf_hub='laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg/'),
    laion2b_s34b_b82k_augreg_rewind=_pcfg(hf_hub='laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-rewind/'),
    laion2b_s34b_b82k_augreg_soup=_pcfg(hf_hub='laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup/'),
)

_coca_VITB32 = dict(
    laion2b_s13b_b90k=_pcfg(hf_hub='laion/CoCa-ViT-B-32-laion2B-s13B-b90k/'),
    mscoco_finetuned_laion2b_s13b_b90k=_pcfg(hf_hub='laion/mscoco_finetuned_CoCa-ViT-B-32-laion2B-s13B-b90k/')
)

_coca_VITL14 = dict(
    laion2b_s13b_b90k=_pcfg(hf_hub='laion/CoCa-ViT-L-14-laion2B-s13B-b90k/'),
    mscoco_finetuned_laion2b_s13b_b90k=_pcfg(hf_hub='laion/mscoco_finetuned_CoCa-ViT-L-14-laion2B-s13B-b90k/')
)
|
||||
|
||||
|
||||
# Master registry: architecture name -> {pretrain tag -> _pcfg entry}.
# All lookup helpers in this module read from this mapping.
_PRETRAINED = {
    "RN50": _RN50,
    "RN50-quickgelu": _RN50_quickgelu,
    "RN101": _RN101,
    "RN101-quickgelu": _RN101_quickgelu,
    "RN50x4": _RN50x4,
    "RN50x16": _RN50x16,
    "RN50x64": _RN50x64,
    "ViT-B-32": _VITB32,
    "ViT-B-32-quickgelu": _VITB32_quickgelu,
    "ViT-B-16": _VITB16,
    "ViT-B-16-plus-240": _VITB16_PLUS_240,
    "ViT-L-14": _VITL14,
    "ViT-L-14-336": _VITL14_336,
    "ViT-H-14": _VITH14,
    "ViT-g-14": _VITg14,
    "ViT-bigG-14": _VITbigG14,
    "roberta-ViT-B-32": _robertaViTB32,
    "xlm-roberta-base-ViT-B-32": _xlmRobertaBaseViTB32,
    "xlm-roberta-large-ViT-H-14": _xlmRobertaLargeFrozenViTH14,
    "convnext_base": _convnext_base,
    "convnext_base_w": _convnext_base_w,
    "convnext_base_w_320": _convnext_base_w_320,
    "convnext_large_d": _convnext_large_d,
    "convnext_large_d_320": _convnext_large_d_320,
    "convnext_xxlarge": _convnext_xxlarge,
    "coca_ViT-B-32": _coca_VITB32,
    "coca_ViT-L-14": _coca_VITL14,
}
|
||||
|
||||
|
||||
def _clean_tag(tag: str):
    """Normalize a pretrained tag: lower-case it and turn every '-' into '_'."""
    lowered = tag.lower()
    return lowered.replace('-', '_')
|
||||
|
||||
|
||||
def list_pretrained(as_str: bool = False):
    """ List every registered (model, pretrain tag) combination.

    Each item is a ``(model_name, pretrain_tag)`` tuple by default, or the
    string ``'name:tag'`` if as_str == True.
    """
    entries = []
    for model_name in _PRETRAINED.keys():
        for tag in _PRETRAINED[model_name].keys():
            if as_str:
                entries.append(':'.join([model_name, tag]))
            else:
                entries.append((model_name, tag))
    return entries
|
||||
|
||||
|
||||
def list_pretrained_models_by_tag(tag: str):
    """ Return the names of all models that have an entry for the given pretrain tag. """
    wanted = _clean_tag(tag)
    return [model_name for model_name in _PRETRAINED.keys() if wanted in _PRETRAINED[model_name]]
|
||||
|
||||
|
||||
def list_pretrained_tags_by_model(model: str):
    """ Return all pretrain tags registered for the given model architecture (empty list if unknown). """
    if model not in _PRETRAINED:
        return []
    return list(_PRETRAINED[model].keys())
|
||||
|
||||
|
||||
def is_pretrained_cfg(model: str, tag: str):
    """Tell whether the registry has an entry for ``model`` under the normalized ``tag``."""
    if model in _PRETRAINED:
        return _clean_tag(tag) in _PRETRAINED[model]
    return False
|
||||
|
||||
|
||||
def get_pretrained_cfg(model: str, tag: str):
    """Return the registry entry for ``model`` and the normalized ``tag``, or ``{}`` when there is none."""
    if model in _PRETRAINED:
        tag_table = _PRETRAINED[model]
        return tag_table.get(_clean_tag(tag), {})
    return {}
|
||||
|
||||
|
||||
def get_pretrained_url(model: str, tag: str):
    """Return the ``'url'`` entry of the pretrained config for ``model`` / ``tag``, or ``''`` if there is none."""
    normalized_tag = _clean_tag(tag)
    return get_pretrained_cfg(model, normalized_tag).get('url', '')
|
||||
|
||||
|
||||
def _sha256_hexdigest(path: str, chunk_size: int = 1 << 20) -> str:
    """Return the hex SHA256 digest of the file at ``path``, reading it in fixed-size chunks."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def download_pretrained_from_url(
        url: str,
        cache_dir: Union[str, None] = None,
):
    """Download a checkpoint from ``url`` into ``cache_dir`` and return the local file path.

    Args:
        url: location of the checkpoint. For URLs containing ``'openaipublic'`` the
            second-to-last path component is used as the expected SHA256 prefix; for URLs
            containing ``'mlfoundations'`` the last ``-``-separated part of the file stem is
            used. Other URLs are not checksum-verified.
        cache_dir: target directory; falls back to ``~/.cache/clip`` when empty or ``None``.

    Returns:
        Path of the cached (or freshly downloaded) file.

    Raises:
        RuntimeError: if the target path exists but is not a regular file, or if the
            downloaded file does not match the expected checksum.
    """
    if not cache_dir:
        cache_dir = os.path.expanduser("~/.cache/clip")
    os.makedirs(cache_dir, exist_ok=True)
    filename = os.path.basename(url)

    if 'openaipublic' in url:
        expected_sha256 = url.split("/")[-2]
    elif 'mlfoundations' in url:
        expected_sha256 = os.path.splitext(filename)[0].split("-")[-1]
    else:
        expected_sha256 = ''

    download_target = os.path.join(cache_dir, filename)

    if os.path.exists(download_target) and not os.path.isfile(download_target):
        raise RuntimeError(f"{download_target} exists and is not a regular file")

    if os.path.isfile(download_target):
        # Reuse the cached file unless a checksum is known and it does not match.
        if not expected_sha256 or _sha256_hexdigest(download_target).startswith(expected_sha256):
            return download_target
        warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")

    with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
        # Content-Length is optional; pass total=None so the progress bar runs without
        # a known size instead of crashing on int(None).
        content_length = source.headers.get("Content-Length")
        total = int(content_length) if content_length is not None else None
        with tqdm(total=total, ncols=80, unit='iB', unit_scale=True) as loop:
            while True:
                buffer = source.read(8192)
                if not buffer:
                    break

                output.write(buffer)
                loop.update(len(buffer))

    if expected_sha256 and not _sha256_hexdigest(download_target).startswith(expected_sha256):
        raise RuntimeError("Model has been downloaded but the SHA256 checksum does not match")

    return download_target
|
||||
|
||||
|
||||
def has_hf_hub(necessary=False):
    """Report whether the ``huggingface_hub`` import succeeded.

    Args:
        necessary: when true and the package is missing, raise instead of returning ``False``.

    Raises:
        RuntimeError: if the hub package is required but not installed.
    """
    missing_but_required = necessary and not _has_hf_hub
    if missing_but_required:
        # the caller cannot continue without the HF Hub module
        raise RuntimeError(
            'Hugging Face hub model specified but package not installed. Run `pip install huggingface_hub`.')
    return _has_hf_hub
|
||||
|
||||
|
||||
def download_pretrained_from_hf(
        model_id: str,
        filename: str = 'open_clip_pytorch_model.bin',
        revision=None,
        cache_dir: Union[str, None] = None,
):
    """Fetch ``filename`` from the Hugging Face Hub repo ``model_id`` and return what ``hf_hub_download`` returns.

    Raises:
        RuntimeError: if ``huggingface_hub`` is not installed (via ``has_hf_hub(True)``).
    """
    has_hf_hub(True)
    return hf_hub_download(model_id, filename, revision=revision, cache_dir=cache_dir)
|
||||
|
||||
|
||||
def download_pretrained(
        cfg: Dict,
        force_hf_hub: bool = False,
        cache_dir: Union[str, None] = None,
):
    """Download the checkpoint described by a pretrained config and return its local path.

    Args:
        cfg: pretrained config; its ``'url'`` and ``'hf_hub'`` entries are consulted.
        force_hf_hub: prefer the ``'hf_hub'`` entry even when a ``'url'`` is present.
        cache_dir: forwarded to the underlying download helper.

    Returns:
        The downloaded file path, or ``''`` when ``cfg`` is empty or names no source.
    """
    if not cfg:
        return ''

    url = cfg.get('url', '')
    hub_ref = cfg.get('hf_hub', '')
    if hub_ref and force_hf_hub:
        # use HF hub even if url exists
        url = ''

    if url:
        return download_pretrained_from_url(url, cache_dir=cache_dir)
    if not hub_ref:
        return ''

    has_hf_hub(True)
    # hf_hub entries combine model_id + filename in 'org/model_name/filename.pt' form.
    # A trailing slash ('org/model_name/') leaves the filename empty, in which case the
    # default filename of download_pretrained_from_hf is used.
    model_id, filename = os.path.split(hub_ref)
    hub_kwargs = {'cache_dir': cache_dir}
    if filename:
        hub_kwargs['filename'] = filename
    return download_pretrained_from_hf(model_id, **hub_kwargs)
|
||||
@@ -0,0 +1,243 @@
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import torch
|
||||
|
||||
try:
|
||||
from huggingface_hub import (
|
||||
create_repo,
|
||||
get_hf_file_metadata,
|
||||
hf_hub_download,
|
||||
hf_hub_url,
|
||||
repo_type_and_id_from_hf_id,
|
||||
upload_folder,
|
||||
)
|
||||
from huggingface_hub.utils import EntryNotFoundError
|
||||
_has_hf_hub = True
|
||||
except ImportError:
|
||||
_has_hf_hub = False
|
||||
|
||||
from .factory import create_model_from_pretrained, get_model_config, get_tokenizer
|
||||
from .tokenizer import HFTokenizer
|
||||
|
||||
|
||||
def save_config_for_hf(
        model,
        config_path: str,
        model_config: Optional[dict]
):
    """Write the model config plus preprocessing statistics as JSON to ``config_path``.

    Args:
        model: object exposing ``visual.image_mean`` and ``visual.image_std``.
        config_path: destination file; either a string path or a ``pathlib.Path``.
        model_config: model configuration stored under the ``'model_cfg'`` key.
    """
    preprocess_cfg = {
        'mean': model.visual.image_mean,
        'std': model.visual.image_std,
    }
    hf_config = {
        'model_cfg': model_config,
        'preprocess_cfg': preprocess_cfg,
    }

    # The signature advertises `str`, which has no `.open()`; wrapping in Path makes
    # both str and Path arguments work.
    with Path(config_path).open('w') as f:
        json.dump(hf_config, f, indent=2)
|
||||
|
||||
|
||||
def save_for_hf(
        model,
        tokenizer: HFTokenizer,
        model_config: dict,
        save_directory: str,
        weights_filename='open_clip_pytorch_model.bin',
        config_filename='open_clip_config.json',
):
    """Write weights, tokenizer files and config for ``model`` into ``save_directory``.

    The directory (and any missing parents) is created first. Weights are saved with
    ``torch.save`` from ``model.state_dict()``, the tokenizer through its own
    ``save_pretrained``, and the config through ``save_config_for_hf``.
    """
    out_dir = Path(save_directory)
    out_dir.mkdir(exist_ok=True, parents=True)

    torch.save(model.state_dict(), out_dir / weights_filename)
    tokenizer.save_pretrained(out_dir)
    save_config_for_hf(model, out_dir / config_filename, model_config=model_config)
|
||||
|
||||
|
||||
def push_to_hf_hub(
        model,
        tokenizer,
        model_config: Optional[dict],
        repo_id: str,
        commit_message: str = 'Add model',
        token: Optional[str] = None,
        revision: Optional[str] = None,
        private: bool = False,
        create_pr: bool = False,
        model_card: Optional[dict] = None,
):
    """Save ``model`` to a temporary folder and upload that folder to the Hugging Face Hub.

    A tokenizer that is not an ``HFTokenizer`` is replaced by
    ``HFTokenizer('openai/clip-vit-large-patch14')``. The repo is created if missing, and
    a generated ``README.md`` is added only when the repo does not already have one.

    Returns:
        Whatever ``upload_folder`` returns.
    """
    if not isinstance(tokenizer, HFTokenizer):
        # default CLIP tokenizers use https://huggingface.co/openai/clip-vit-large-patch14
        tokenizer = HFTokenizer('openai/clip-vit-large-patch14')

    # Create the repo if needed, then rebuild the full repo_id from the returned URL
    # (it can differ from the input when the owner was implicit).
    repo_url = create_repo(repo_id, token=token, private=private, exist_ok=True)
    _, repo_owner, repo_name = repo_type_and_id_from_hf_id(repo_url)
    repo_id = f"{repo_owner}/{repo_name}"

    # Probe for an existing README so it is not overwritten.
    try:
        get_hf_file_metadata(hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision))
    except EntryNotFoundError:
        has_readme = False
    else:
        has_readme = True

    with TemporaryDirectory() as tmpdir:
        save_for_hf(
            model,
            tokenizer=tokenizer,
            model_config=model_config,
            save_directory=tmpdir,
        )

        if not has_readme:
            model_card = model_card or {}
            card_title = repo_id.split('/')[-1]
            readme_text = generate_readme(model_card, card_title)
            (Path(tmpdir) / "README.md").write_text(readme_text)

        upload_args = dict(
            repo_id=repo_id,
            folder_path=tmpdir,
            revision=revision,
            create_pr=create_pr,
            commit_message=commit_message,
        )
        return upload_folder(**upload_args)
|
||||
|
||||
|
||||
def push_pretrained_to_hf_hub(
        model_name,
        pretrained: str,
        repo_id: str,
        image_mean: Optional[Tuple[float, ...]] = None,
        image_std: Optional[Tuple[float, ...]] = None,
        commit_message: str = 'Add model',
        token: Optional[str] = None,
        revision: Optional[str] = None,
        private: bool = False,
        create_pr: bool = False,
        model_card: Optional[dict] = None,
):
    """Build a pretrained model by name and push it to the Hugging Face Hub.

    The model comes from ``create_model_from_pretrained``, its config from
    ``get_model_config`` (asserted to be non-empty) and its tokenizer from
    ``get_tokenizer``; everything else is forwarded to ``push_to_hf_hub``.
    """
    model, _preprocess_eval = create_model_from_pretrained(
        model_name,
        pretrained=pretrained,
        image_mean=image_mean,
        image_std=image_std,
    )

    model_config = get_model_config(model_name)
    assert model_config

    push_to_hf_hub(
        model=model,
        tokenizer=get_tokenizer(model_name),
        model_config=model_config,
        repo_id=repo_id,
        commit_message=commit_message,
        token=token,
        revision=revision,
        private=private,
        create_pr=create_pr,
        model_card=model_card,
    )
|
||||
|
||||
|
||||
def generate_readme(model_card: dict, model_name: str):
    """Render a README (YAML front matter followed by markdown sections) for a model card.

    Args:
        model_card: optional keys read here are ``license``, ``details``, ``description``,
            ``usage``, ``comparison`` and ``citation``.
        model_name: name used in the top-level heading.

    Returns:
        The README contents as a single string.
    """
    has_details = 'details' in model_card

    # --- front matter ---
    parts = [
        "---\n",
        "tags:\n- zero-shot-image-classification\n- clip\n",
        "library_tag: open_clip\n",
        f"license: {model_card.get('license', 'mit')}\n",
    ]
    if has_details and 'Dataset' in model_card['details']:
        parts.append('datasets:\n')
        parts.append(f"- {model_card['details']['Dataset'].lower()}\n")
    parts.append("---\n")

    # --- body ---
    parts.append(f"# Model card for {model_name}\n")
    if 'description' in model_card:
        parts.append(f"\n{model_card['description']}\n")

    if has_details:
        parts.append("\n## Model Details\n")
        for key, value in model_card['details'].items():
            if isinstance(value, (list, tuple)):
                parts.append(f"- **{key}:**\n")
                parts.extend(f" - {item}\n" for item in value)
            elif isinstance(value, dict):
                parts.append(f"- **{key}:**\n")
                parts.extend(f" - {sub_key}: {sub_value}\n" for sub_key, sub_value in value.items())
            else:
                parts.append(f"- **{key}:** {value}\n")

    # Free-text sections share one layout: heading, text, trailing newline.
    for field, heading in (('usage', "\n## Model Usage\n"), ('comparison', "\n## Model Comparison\n")):
        if field in model_card:
            parts.append(heading)
            parts.append(model_card[field])
            parts.append('\n')

    if 'citation' in model_card:
        parts.append("\n## Citation\n")
        citation = model_card['citation']
        citations = citation if isinstance(citation, (list, tuple)) else [citation]
        parts.extend(f"```bibtex\n{entry}\n```\n" for entry in citations)

    return ''.join(parts)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: push a named pretrained model to the Hugging Face Hub.
    parser = argparse.ArgumentParser(description="Push to Hugging Face Hub")
    parser.add_argument("--model", type=str, help="Name of the model to use.")
    parser.add_argument(
        "--pretrained",
        type=str,
        help="Use a pretrained CLIP model weights with the specified tag or file path.",
    )
    parser.add_argument(
        "--repo-id",
        type=str,
        help="Destination HF Hub repo-id ie 'organization/model_id'.",
    )
    parser.add_argument(
        '--image-mean',
        type=float,
        nargs='+',
        default=None,
        metavar='MEAN',
        help='Override default image mean value of dataset',
    )
    parser.add_argument(
        '--image-std',
        type=float,
        nargs='+',
        default=None,
        metavar='STD',
        help='Override default image std deviation of of dataset',
    )
    cli = parser.parse_args()

    print(f'Saving model {cli.model} with pretrained weights {cli.pretrained} to Hugging Face Hub at {cli.repo_id}')

    # FIXME add support to pass model_card json / template from file via cmd line

    push_pretrained_to_hf_hub(
        cli.model,
        cli.pretrained,
        cli.repo_id,
        image_mean=cli.image_mean,  # override image mean/std if trained w/ non defaults
        image_std=cli.image_std,
    )

    print(f'{cli.model} saved.')
|
||||
127
diffsynth/extensions/ImageQualityMetric/open_clip/timm_model.py
Normal file
127
diffsynth/extensions/ImageQualityMetric/open_clip/timm_model.py
Normal file
@@ -0,0 +1,127 @@
|
||||
""" timm model adapter
|
||||
|
||||
Wraps timm (https://github.com/rwightman/pytorch-image-models) models for use as a vision tower in CLIP model.
|
||||
"""
|
||||
import logging
|
||||
from collections import OrderedDict
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
try:
|
||||
import timm
|
||||
from timm.models.layers import Mlp, to_2tuple
|
||||
try:
|
||||
# old timm imports < 0.8.1
|
||||
from timm.models.layers.attention_pool2d import RotAttentionPool2d
|
||||
from timm.models.layers.attention_pool2d import AttentionPool2d as AbsAttentionPool2d
|
||||
except ImportError:
|
||||
# new timm imports >= 0.8.1
|
||||
from timm.layers import RotAttentionPool2d
|
||||
from timm.layers import AttentionPool2d as AbsAttentionPool2d
|
||||
except ImportError:
|
||||
timm = None
|
||||
|
||||
from .utils import freeze_batch_norm_2d
|
||||
|
||||
|
||||
class TimmModel(nn.Module):
    """Adapter that wraps a timm model (``trunk``) plus a pooling/projection ``head``.

    # FIXME this adapter is a work in progress, may change in ways that break weight compat
    """

    def __init__(
            self,
            model_name,
            embed_dim,
            image_size=224,
            pool='avg',
            proj='linear',
            proj_bias=False,
            drop=0.,
            drop_path=None,
            pretrained=False,
    ):
        """Create the timm trunk and assemble the head.

        Args:
            model_name: name passed to ``timm.create_model``.
            embed_dim: output width of the head.
            image_size: stored on the module as a 2-tuple.
            pool: ``'abs_attn'`` / ``'rot_attn'`` select an attention-pool head; any other
                non-empty value is passed to the trunk as ``global_pool``.
            proj: ``'linear'``, ``'mlp'`` or empty for no projection layer.
            proj_bias: bias flag for the (final) projection layer.
            drop: dropout rate used in the head.
            drop_path: forwarded to timm as ``drop_path_rate`` when not ``None``.
            pretrained: forwarded to ``timm.create_model``.

        Raises:
            RuntimeError: if timm could not be imported.
        """
        super().__init__()
        if timm is None:
            raise RuntimeError("Please `pip install timm` to use timm models.")

        self.image_size = to_2tuple(image_size)
        create_kwargs = {} if drop_path is None else {'drop_path_rate': drop_path}
        self.trunk = timm.create_model(model_name, pretrained=pretrained, **create_kwargs)

        feat_size = self.trunk.default_cfg.get('pool_size', None)
        feature_ndim = 2 if feat_size else 1
        if pool in ('abs_attn', 'rot_attn'):
            assert feature_ndim == 2
            # attention pooling replaces both the classifier and the default pool
            self.trunk.reset_classifier(0, global_pool='')
        else:
            # override the global pool only if one was requested; else keep the network default
            reset_kwargs = {'global_pool': pool} if pool else {}
            self.trunk.reset_classifier(0, **reset_kwargs)
        width = self.trunk.num_features

        # Key order and names below determine the head's state_dict keys.
        head = OrderedDict()
        if pool == 'abs_attn':
            head['pool'] = AbsAttentionPool2d(width, feat_size=feat_size, out_features=embed_dim)
            width = embed_dim
        elif pool == 'rot_attn':
            head['pool'] = RotAttentionPool2d(width, out_features=embed_dim)
            width = embed_dim
        else:
            assert proj, 'projection layer needed if non-attention pooling is used.'

        # NOTE attention pool ends with a projection layer, so proj should usually be set to '' if such pooling is used
        if proj == 'linear':
            head['drop'] = nn.Dropout(drop)
            head['proj'] = nn.Linear(width, embed_dim, bias=proj_bias)
        elif proj == 'mlp':
            head['mlp'] = Mlp(width, 2 * embed_dim, embed_dim, drop=(drop, 0), bias=(True, proj_bias))

        self.head = nn.Sequential(head)

    def lock(self, unlocked_groups=0, freeze_bn_stats=False):
        """ lock modules
        Args:
            unlocked_groups (int): leave last n layer groups unlocked (default: 0)
            freeze_bn_stats (bool): also pass the locked part of the trunk to ``freeze_batch_norm_2d``
        """
        if not unlocked_groups:
            # lock the full trunk
            for weight in self.trunk.parameters():
                weight.requires_grad = False
            if freeze_bn_stats:
                freeze_batch_norm_2d(self.trunk)
            return

        # NOTE: partial freeze requires latest timm (master) branch and is subject to change
        try:
            # FIXME import here until API stable and in an official release
            from timm.models.helpers import group_parameters, group_modules
        except ImportError:
            raise RuntimeError(
                'Please install latest timm `pip install git+https://github.com/rwightman/pytorch-image-models`')

        matcher = self.trunk.group_matcher()
        grouped_params = group_parameters(self.trunk, matcher)
        last_locked = max(grouped_params.keys()) - unlocked_groups
        for group_idx in range(last_locked + 1):
            for param_name in grouped_params[group_idx]:
                self.trunk.get_parameter(param_name).requires_grad = False

        if freeze_bn_stats:
            module_groups = group_modules(self.trunk, matcher, reverse=True)
            locked_modules = {name for name, group_id in module_groups.items() if group_id <= last_locked}
            freeze_batch_norm_2d(self.trunk, locked_modules)

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        """Toggle gradient checkpointing on the trunk; log a warning if the trunk call fails."""
        try:
            self.trunk.set_grad_checkpointing(enable)
        except Exception:
            logging.warning('grad checkpointing not supported for this timm image tower, continuing without...')

    def forward(self, x):
        """Run ``x`` through the trunk and then the head."""
        return self.head(self.trunk(x))
|
||||
211
diffsynth/extensions/ImageQualityMetric/open_clip/tokenizer.py
Normal file
211
diffsynth/extensions/ImageQualityMetric/open_clip/tokenizer.py
Normal file
@@ -0,0 +1,211 @@
|
||||
""" CLIP tokenizer
|
||||
|
||||
Copied from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
|
||||
"""
|
||||
import gzip
|
||||
import html
|
||||
import os
|
||||
from functools import lru_cache
|
||||
from typing import Union, List
|
||||
|
||||
import ftfy
|
||||
import regex as re
|
||||
import torch
|
||||
|
||||
# https://stackoverflow.com/q/62691279
|
||||
import os
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
|
||||
|
||||
@lru_cache()
def default_bpe():
    """Return the default BPE vocabulary path.

    The path is ``models/QualityMetric/bpe_simple_vocab_16e6.txt.gz`` under the directory
    four levels above the one containing this file.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    root = os.path.abspath(os.path.join(here, '../../../../'))
    return os.path.join(root, 'models', 'QualityMetric', "bpe_simple_vocab_16e6.txt.gz")
|
||||
|
||||
|
||||
@lru_cache()
def bytes_to_unicode():
    """Build a mapping from every byte value (0-255) to a distinct unicode character.

    Bytes in the ranges ``!``-``~``, ``¡``-``¬`` and ``®``-``ÿ`` map to the character with
    the same code point. Every other byte is assigned, in ascending order, the next code
    point starting at 256. This gives BPE a reversible byte <-> character table that avoids
    the whitespace/control characters the bpe code cannot handle.
    """
    direct = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    direct_set = set(direct)
    shifted = [b for b in range(2 ** 8) if b not in direct_set]

    byte_values = direct + shifted
    code_points = direct + [2 ** 8 + offset for offset in range(len(shifted))]
    return dict(zip(byte_values, [chr(cp) for cp in code_points]))
|
||||
|
||||
|
||||
def get_pairs(word):
    """Return the set of adjacent symbol pairs in ``word``.

    ``word`` is a non-empty sequence of symbols (variable-length strings); indexing an
    empty word raises ``IndexError``.
    """
    head, tail = word[0], word[1:]
    # Pair each symbol with its successor: (head, tail[0]), (tail[0], tail[1]), ...
    return set(zip((head, *tail), tail))
|
||||
|
||||
|
||||
def basic_clean(text):
    """Run ``ftfy.fix_text`` on ``text``, unescape HTML entities twice, and strip outer whitespace."""
    fixed = ftfy.fix_text(text)
    unescaped = html.unescape(html.unescape(fixed))
    return unescaped.strip()
|
||||
|
||||
|
||||
def whitespace_clean(text):
    """Collapse every run of whitespace in ``text`` to one space and strip both ends."""
    return re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
|
||||
class SimpleTokenizer(object):
    """Byte-level BPE tokenizer driven by a gzip-compressed merges file."""

    def __init__(self, bpe_path: str = default_bpe(), special_tokens=None):
        """Load the merges file and build the encoder/decoder tables.

        Args:
            bpe_path: path to the gzip-compressed merges file.
            special_tokens: extra special tokens appended after ``<start_of_text>`` and
                ``<end_of_text>``.
        """
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        # Use a context manager so the gzip handle is closed deterministically.
        with gzip.open(bpe_path) as bpe_file:
            merges = bpe_file.read().decode("utf-8").split('\n')
        # Skip the first line and keep a fixed number of merge rules.
        merges = merges[1:49152-256-2+1]
        merges = [tuple(merge.split()) for merge in merges]
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v+'</w>' for v in vocab]
        for merge in merges:
            vocab.append(''.join(merge))
        if not special_tokens:
            special_tokens = ['<start_of_text>', '<end_of_text>']
        else:
            special_tokens = ['<start_of_text>', '<end_of_text>'] + special_tokens
        vocab.extend(special_tokens)
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        # Special tokens are pre-seeded in the cache so bpe() returns them unsplit.
        self.cache = {t:t for t in special_tokens}
        special = "|".join(special_tokens)
        self.pat = re.compile(special + r"""|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)

        self.vocab_size = len(self.encoder)
        self.all_special_ids = [self.encoder[t] for t in special_tokens]

    def bpe(self, token):
        """Apply the merge rules to one token and return its sub-words joined by spaces.

        Results are memoized in ``self.cache``.
        """
        if token in self.cache:
            return self.cache[token]
        # Mark the end of the word on its last symbol.
        word = tuple(token[:-1]) + ( token[-1] + '</w>',)
        pairs = get_pairs(word)

        if not pairs:
            return token+'</w>'

        while True:
            # Pick the adjacent pair with the lowest merge rank; stop when none is mergeable.
            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    # tuple.index found no further occurrence of `first`: copy the rest.
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Clean and lowercase ``text`` and return its list of BPE token ids."""
        bpe_tokens = []
        text = whitespace_clean(basic_clean(text)).lower()
        for token in re.findall(self.pat, text):
            # Map the token's UTF-8 bytes to the printable characters the vocab uses.
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        """Turn token ids back into text; ``</w>`` markers become spaces."""
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
        return text

    def __call__(self, texts: Union[str, List[str]], context_length: int = 77) -> torch.LongTensor:
        """
        Returns the tokenized representation of given input string(s)

        Parameters
        ----------
        texts : Union[str, List[str]]
            An input string or a list of input strings to tokenize
        context_length : int
            The context length to use; all CLIP models use 77 as the context length

        Returns
        -------
        A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
        """
        if isinstance(texts, str):
            texts = [texts]

        sot_token = self.encoder["<start_of_text>"]
        eot_token = self.encoder["<end_of_text>"]
        all_tokens = [[sot_token] + self.encode(text) + [eot_token] for text in texts]
        result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

        for i, tokens in enumerate(all_tokens):
            if len(tokens) > context_length:
                tokens = tokens[:context_length]  # Truncate
                # keep the end-of-text marker as the final token after truncation
                tokens[-1] = eot_token
            result[i, :len(tokens)] = torch.tensor(tokens)

        return result
|
||||
|
||||
|
||||
|
||||
class HFTokenizer:
    """HuggingFace tokenizer wrapper"""

    def __init__(self, tokenizer_name: str):
        """Load the tokenizer named ``tokenizer_name`` via ``AutoTokenizer.from_pretrained``."""
        # imported lazily so `transformers` is only required when this wrapper is used
        from transformers import AutoTokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def save_pretrained(self, dest):
        """Delegate to the wrapped tokenizer's ``save_pretrained``."""
        self.tokenizer.save_pretrained(dest)

    def __call__(self, texts: Union[str, List[str]], context_length: int = 77) -> torch.Tensor:
        """Clean and tokenize ``texts``, padding/truncating to ``context_length``; returns ``input_ids``."""
        # same cleaning as for default tokenizer, except lowercasing
        # adding lower (for case-sensitive tokenizers) will make it more robust but less sensitive to nuance
        batch = [texts] if isinstance(texts, str) else texts
        cleaned = [whitespace_clean(basic_clean(item)) for item in batch]
        encoded = self.tokenizer(
            cleaned,
            return_tensors='pt',
            max_length=context_length,
            padding='max_length',
            truncation=True,
        )
        return encoded.input_ids
|
||||
216
diffsynth/extensions/ImageQualityMetric/open_clip/transform.py
Normal file
216
diffsynth/extensions/ImageQualityMetric/open_clip/transform.py
Normal file
@@ -0,0 +1,216 @@
|
||||
import warnings
|
||||
from dataclasses import dataclass, asdict
|
||||
from typing import Any, Dict, Optional, Sequence, Tuple, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torchvision.transforms.functional as F
|
||||
from functools import partial
|
||||
from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, \
|
||||
CenterCrop
|
||||
|
||||
from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
|
||||
|
||||
|
||||
@dataclass
class AugmentationCfg:
    """Options for the training-time augmentation built by ``image_transform``.

    Fields left as ``None`` are dropped before the options are used.
    """
    # crop scale range handed to RandomResizedCrop
    scale: Tuple[float, float] = (0.9, 1.0)
    # the options below are only consumed by the timm transform path
    ratio: Optional[Tuple[float, float]] = None
    color_jitter: Optional[Union[float, Tuple[float, float, float]]] = None
    interpolation: Optional[str] = None
    re_prob: Optional[float] = None
    re_count: Optional[int] = None
    # switch to timm's create_transform instead of the built-in pipeline
    use_timm: bool = False
|
||||
|
||||
|
||||
class ResizeMaxSize(nn.Module):
    """Resize an image so its longest side equals ``max_size`` and pad it to a square.

    Padding is split evenly between the two sides of each axis (the extra pixel, if any,
    goes to the right/bottom) and filled with ``fill``.
    """

    def __init__(self, max_size, interpolation=InterpolationMode.BICUBIC, fn='max', fill=0):
        """
        Args:
            max_size: target side length; must be an ``int``.
            interpolation: interpolation mode passed to the resize call.
            fn: ``'min'`` or ``'max'``; stored as the matching builtin in ``self.fn``.
            fill: fill value used for the padding.

        Raises:
            TypeError: if ``max_size`` is not an ``int``.
        """
        super().__init__()
        if not isinstance(max_size, int):
            raise TypeError(f"Size should be int. Got {type(max_size)}")
        self.max_size = max_size
        self.interpolation = interpolation
        # Both branches used to select `min`, so the default fn='max' was stored wrongly.
        self.fn = min if fn == 'min' else max
        self.fill = fill

    def forward(self, img):
        """Return ``img`` resized and padded to ``max_size`` x ``max_size``.

        ``img`` is either a tensor whose last two dimensions are (height, width) or an
        object exposing ``size`` as ``(width, height)``.
        """
        if isinstance(img, torch.Tensor):
            # Take the trailing two dims so extra leading dims are accepted as well.
            height, width = img.shape[-2:]
        else:
            width, height = img.size
        # NOTE(review): scaling is always driven by the longest side; self.fn is not
        # consulted because the padding arithmetic below assumes that.
        scale = self.max_size / float(max(height, width))
        new_size = (height, width)
        if scale != 1.0:
            new_size = tuple(round(dim * scale) for dim in (height, width))
            img = F.resize(img, new_size, self.interpolation)
        pad_h = self.max_size - new_size[0]
        pad_w = self.max_size - new_size[1]
        # Pad even when no resize was needed: an image whose longest side already equals
        # max_size can still be non-square.
        if scale != 1.0 or pad_h or pad_w:
            img = F.pad(img, padding=[pad_w//2, pad_h//2, pad_w - pad_w//2, pad_h - pad_h//2], fill=self.fill)
        return img
|
||||
|
||||
|
||||
def _convert_to_rgb_or_rgba(image):
|
||||
if image.mode == 'RGBA':
|
||||
return image
|
||||
else:
|
||||
return image.convert('RGB')
|
||||
|
||||
# def transform_and_split(merged, transform_fn, normalize_fn):
|
||||
# transformed = transform_fn(merged)
|
||||
# crop_img, crop_label = torch.split(transformed, [3,1], dim=0)
|
||||
|
||||
# # crop_img = _convert_to_rgb(crop_img)
|
||||
# crop_img = normalize_fn(ToTensor()(crop_img))
|
||||
# return crop_img, crop_label
|
||||
|
||||
class MaskAwareNormalize(nn.Module):
    """Normalize that leaves a fourth channel untouched.

    When the first dimension of the input has size 4, only the first three entries are
    normalized and the fourth is concatenated back unchanged; any other input is
    normalized as a whole.
    """

    def __init__(self, mean, std):
        super().__init__()
        self.normalize = Normalize(mean=mean, std=std)

    def forward(self, tensor):
        if tensor.shape[0] != 4:
            return self.normalize(tensor)
        colour, extra = tensor[:3], tensor[3:]
        return torch.cat([self.normalize(colour), extra], dim=0)
|
||||
|
||||
def image_transform(
        image_size: int,
        is_train: bool,
        mean: Optional[Tuple[float, ...]] = None,
        std: Optional[Tuple[float, ...]] = None,
        resize_longest_max: bool = False,
        fill_color: int = 0,
        aug_cfg: Optional[Union[Dict[str, Any], AugmentationCfg]] = None,
):
    """Build the preprocessing pipeline for training or evaluation images.

    Args:
        image_size: target size; a square ``(s, s)`` pair is reduced to the int ``s``.
        is_train: build the augmenting training pipeline instead of the eval pipeline.
        mean: normalization mean; falls back to ``OPENAI_DATASET_MEAN``. A scalar is
            repeated three times.
        std: normalization std; falls back to ``OPENAI_DATASET_STD``. A scalar is
            repeated three times.
        resize_longest_max: (eval only) use ``ResizeMaxSize`` instead of resize + center crop.
        fill_color: fill value handed to ``ResizeMaxSize``.
        aug_cfg: augmentation options as a dict or ``AugmentationCfg``.

    Returns:
        The composed transform.
    """
    def _as_triplet(stat, fallback):
        # fall back to the dataset default, then expand a scalar to three values
        stat = stat or fallback
        return stat if isinstance(stat, (list, tuple)) else (stat,) * 3

    mean = _as_triplet(mean, OPENAI_DATASET_MEAN)
    std = _as_triplet(std, OPENAI_DATASET_STD)

    if isinstance(image_size, (list, tuple)) and image_size[0] == image_size[1]:
        # for square size, pass size as int so that Resize() uses aspect preserving shortest edge
        image_size = image_size[0]

    if isinstance(aug_cfg, dict):
        aug_cfg = AugmentationCfg(**aug_cfg)
    else:
        aug_cfg = aug_cfg or AugmentationCfg()
    normalize = MaskAwareNormalize(mean=mean, std=std)

    if not is_train:
        steps = [
            _convert_to_rgb_or_rgba,
            ToTensor(),
        ]
        if resize_longest_max:
            steps.append(ResizeMaxSize(image_size, fill=fill_color))
        else:
            steps.append(Resize(image_size, interpolation=InterpolationMode.BICUBIC))
            steps.append(CenterCrop(image_size))
        steps.append(normalize)
        return Compose(steps)

    # Only options that were explicitly set take part in the training pipeline.
    aug_options = {k: v for k, v in asdict(aug_cfg).items() if v is not None}
    if aug_options.pop('use_timm', False):
        assert False, "not tested for augmentation with mask"
        from timm.data import create_transform  # timm can still be optional
        if isinstance(image_size, (tuple, list)):
            assert len(image_size) >= 2
            input_size = (3,) + image_size[-2:]
        else:
            input_size = (3, image_size, image_size)
        # by default, timm aug randomly alternates bicubic & bilinear for better robustness at inference time
        aug_options.setdefault('interpolation', 'random')
        aug_options.setdefault('color_jitter', None)  # disable by default
        return create_transform(
            input_size=input_size,
            is_training=True,
            hflip=0.,
            mean=mean,
            std=std,
            re_mode='pixel',
            **aug_options,
        )

    random_crop = RandomResizedCrop(
        image_size,
        scale=aug_options.pop('scale'),
        interpolation=InterpolationMode.BICUBIC,
    )
    train_transform = Compose([
        _convert_to_rgb_or_rgba,
        ToTensor(),
        random_crop,
        normalize,
    ])
    if aug_options:
        # anything still left after popping 'scale' is only honoured by the timm path
        warnings.warn(f'Unused augmentation cfg items, specify `use_timm` to use ({list(aug_options.keys())}).')
    return train_transform
|
||||
|
||||
|
||||
# def image_transform_region(
|
||||
# image_size: int,
|
||||
# is_train: bool,
|
||||
# mean: Optional[Tuple[float, ...]] = None,
|
||||
# std: Optional[Tuple[float, ...]] = None,
|
||||
# resize_longest_max: bool = False,
|
||||
# fill_color: int = 0,
|
||||
# aug_cfg: Optional[Union[Dict[str, Any], AugmentationCfg]] = None,
|
||||
# ):
|
||||
# mean = mean or OPENAI_DATASET_MEAN
|
||||
# if not isinstance(mean, (list, tuple)):
|
||||
# mean = (mean,) * 3
|
||||
|
||||
# std = std or OPENAI_DATASET_STD
|
||||
# if not isinstance(std, (list, tuple)):
|
||||
# std = (std,) * 3
|
||||
|
||||
# if isinstance(image_size, (list, tuple)) and image_size[0] == image_size[1]:
|
||||
# # for square size, pass size as int so that Resize() uses aspect preserving shortest edge
|
||||
# image_size = image_size[0]
|
||||
|
||||
# if isinstance(aug_cfg, dict):
|
||||
# aug_cfg = AugmentationCfg(**aug_cfg)
|
||||
# else:
|
||||
# aug_cfg = aug_cfg or AugmentationCfg()
|
||||
# normalize = Normalize(mean=mean, std=std)
|
||||
# if is_train:
|
||||
# aug_cfg_dict = {k: v for k, v in asdict(aug_cfg).items() if v is not None}
|
||||
|
||||
# transform = Compose([
|
||||
# RandomResizedCrop(
|
||||
# image_size,
|
||||
# scale=aug_cfg_dict.pop('scale'),
|
||||
# interpolation=InterpolationMode.BICUBIC,
|
||||
# ),
|
||||
# ])
|
||||
# train_transform = Compose([
|
||||
# partial(transform_and_split, transform_fn=transform,normalize_fn=normalize)
|
||||
# ])
|
||||
# return train_transform
|
||||
# else:
|
||||
# if resize_longest_max:
|
||||
# transform = [
|
||||
# ResizeMaxSize(image_size, fill=fill_color)
|
||||
# ]
|
||||
# val_transform = Compose([
|
||||
# partial(transform_and_split, transform_fn=transform,normalize_fn=normalize),
|
||||
# ])
|
||||
# else:
|
||||
# transform = [
|
||||
# Resize(image_size, interpolation=InterpolationMode.BICUBIC),
|
||||
# CenterCrop(image_size),
|
||||
# ]
|
||||
# val_transform = Compose([
|
||||
# partial(transform_and_split, transform_fn=transform,normalize_fn=normalize),
|
||||
# ])
|
||||
# return val_transform
|
||||
727
diffsynth/extensions/ImageQualityMetric/open_clip/transformer.py
Normal file
727
diffsynth/extensions/ImageQualityMetric/open_clip/transformer.py
Normal file
@@ -0,0 +1,727 @@
|
||||
from collections import OrderedDict
|
||||
import math
|
||||
from typing import Callable, Optional, Sequence, Tuple
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
from torch.utils.checkpoint import checkpoint
|
||||
|
||||
from .utils import to_2tuple
|
||||
|
||||
|
||||
class LayerNormFp32(nn.LayerNorm):
    """LayerNorm variant that normalizes in float32 and returns the input's dtype.

    Meant for reduced-precision inputs such as fp16: the input is upcast to
    float32 for the normalization and the result is cast back afterwards.
    """

    def forward(self, x: torch.Tensor):
        input_dtype = x.dtype
        normed = F.layer_norm(
            x.to(torch.float32),
            self.normalized_shape,
            self.weight,
            self.bias,
            self.eps,
        )
        # Hand the caller back the dtype it passed in.
        return normed.to(input_dtype)
|
||||
|
||||
|
||||
class LayerNorm(nn.LayerNorm):
    """LayerNorm whose output is always cast back to the dtype of its input."""

    def forward(self, x: torch.Tensor):
        input_dtype = x.dtype
        normed = F.layer_norm(
            x,
            self.normalized_shape,
            self.weight,
            self.bias,
            self.eps,
        )
        return normed.to(input_dtype)
|
||||
|
||||
|
||||
class QuickGELU(nn.Module):
    """Sigmoid-based GELU approximation: ``x * sigmoid(1.702 * x)``."""

    # NOTE This is slower than nn.GELU or nn.SiLU and uses more GPU memory
    def forward(self, x: torch.Tensor):
        gate = torch.sigmoid(1.702 * x)
        return x * gate
|
||||
|
||||
|
||||
class LayerScale(nn.Module):
    """Multiply the input by a learnable per-channel vector ``gamma``.

    Args:
        dim: Length of ``gamma`` (broadcast against the last axis of the input).
        init_values: Initial value for every entry of ``gamma``.
        inplace: If True, scale the input tensor in place and return it.
    """

    def __init__(self, dim, init_values=1e-5, inplace=False):
        super().__init__()
        self.inplace = inplace
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x):
        if self.inplace:
            # Mutates and returns the caller's tensor.
            return x.mul_(self.gamma)
        return x * self.gamma
|
||||
|
||||
|
||||
class PatchDropout(nn.Module):
    """Randomly keep a subset of tokens during training.

    https://arxiv.org/abs/2212.00794

    Args:
        prob: Fraction of tokens to drop, in ``[0, 1)``.
        exclude_first_token: If True, the first token along axis 1 (the CLS
            token) is never dropped and is re-attached in front of the kept
            tokens.
    """

    def __init__(self, prob, exclude_first_token=True):
        super().__init__()
        assert 0 <= prob < 1.
        self.prob = prob
        self.exclude_first_token = exclude_first_token  # exclude CLS token

    def forward(self, x):
        """Return ``x`` with a random subset of tokens kept (identity in eval mode or when ``prob == 0``)."""
        if not self.training or self.prob == 0.:
            return x

        if self.exclude_first_token:
            cls_tokens, x = x[:, :1], x[:, 1:]
        else:
            # Unused in this branch; kept so the name is defined with a tensor type on both paths.
            cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1])

        batch = x.size()[0]
        num_tokens = x.size()[1]

        # Build the index tensors on the same device as ``x`` so that indexing does
        # not rely on tensors created on the default device being moved over.
        batch_indices = torch.arange(batch, device=x.device)
        batch_indices = batch_indices[..., None]

        keep_prob = 1 - self.prob
        # Always keep at least one token.
        num_patches_keep = max(1, int(num_tokens * keep_prob))

        # Top-k of random scores selects a random subset of token indices per sample.
        rand = torch.randn(batch, num_tokens, device=x.device)
        patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices

        x = x[batch_indices, patch_indices_keep]

        if self.exclude_first_token:
            x = torch.cat((cls_tokens, x), dim=1)

        return x
|
||||
|
||||
|
||||
class Attention(nn.Module):
    """Multi-head self-attention with optional scaled-cosine logits and per-head output scaling.

    The input to ``forward`` is unpacked as ``(L, N, C)`` where attention is
    computed along the ``L`` axis and ``C == dim``.

    Args:
        dim: Channel width ``C``; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        qkv_bias: Whether the fused q/k/v projection has a bias.
        scaled_cosine: If True, logits are cosine similarities multiplied by a
            learnable, clamped per-head scale instead of scaled dot products.
        scale_heads: If True, each head's output is multiplied by a learnable scalar.
        logit_scale_max: Upper clamp (in log space) for the cosine logit scale.
        attn_drop: Dropout probability on the attention weights.
        proj_drop: Dropout probability on the output projection.
    """

    def __init__(
            self,
            dim,
            num_heads=8,
            qkv_bias=True,
            scaled_cosine=False,
            scale_heads=False,
            logit_scale_max=math.log(1. / 0.01),
            attn_drop=0.,
            proj_drop=0.
    ):
        super().__init__()
        self.scaled_cosine = scaled_cosine
        self.scale_heads = scale_heads
        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim ** -0.5
        self.logit_scale_max = logit_scale_max

        # keeping in_proj in this form (instead of nn.Linear) to match weight scheme of original
        self.in_proj_weight = nn.Parameter(torch.randn((dim * 3, dim)) * self.scale)
        if qkv_bias:
            self.in_proj_bias = nn.Parameter(torch.zeros(dim * 3))
        else:
            self.in_proj_bias = None

        if self.scaled_cosine:
            self.logit_scale = nn.Parameter(torch.log(10 * torch.ones((num_heads, 1, 1))))
        else:
            self.logit_scale = None
        self.attn_drop = nn.Dropout(attn_drop)
        if self.scale_heads:
            self.head_scale = nn.Parameter(torch.ones((num_heads, 1, 1)))
        else:
            self.head_scale = None
        self.out_proj = nn.Linear(dim, dim)
        self.out_drop = nn.Dropout(proj_drop)

    def forward(self, x, attn_mask: Optional[torch.Tensor] = None):
        """Apply self-attention to ``x`` of shape ``(L, N, C)`` and return a tensor of the same shape.

        ``attn_mask`` may be boolean (True entries are masked out) or additive;
        it is added to logits of shape ``(N * num_heads, L, L)``.
        """
        L, N, C = x.shape
        q, k, v = F.linear(x, self.in_proj_weight, self.in_proj_bias).chunk(3, dim=-1)
        # (L, N, C) -> (N * num_heads, L, head_dim)
        q = q.contiguous().view(L, N * self.num_heads, -1).transpose(0, 1)
        k = k.contiguous().view(L, N * self.num_heads, -1).transpose(0, 1)
        v = v.contiguous().view(L, N * self.num_heads, -1).transpose(0, 1)

        if self.logit_scale is not None:
            attn = torch.bmm(F.normalize(q, dim=-1), F.normalize(k, dim=-1).transpose(-1, -2))
            logit_scale = torch.clamp(self.logit_scale, max=self.logit_scale_max).exp()
            attn = attn.view(N, self.num_heads, L, L) * logit_scale
            attn = attn.view(-1, L, L)
        else:
            q = q * self.scale
            attn = torch.bmm(q, k.transpose(-1, -2))

        if attn_mask is not None:
            if attn_mask.dtype == torch.bool:
                # Convert a boolean mask into an additive one (-inf where masked).
                new_attn_mask = torch.zeros_like(attn_mask, dtype=q.dtype)
                new_attn_mask.masked_fill_(attn_mask, float("-inf"))
                attn_mask = new_attn_mask
            attn += attn_mask

        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = torch.bmm(attn, v)  # (N * num_heads, L, head_dim)
        if self.head_scale is not None:
            # Each head holds head_dim channels (not C), so the per-head view must
            # use head_dim; viewing with C fails for any num_heads > 1.
            x = x.view(N, self.num_heads, L, self.head_dim) * self.head_scale
            x = x.view(-1, L, self.head_dim)
        x = x.transpose(0, 1).reshape(L, N, C)
        x = self.out_proj(x)
        x = self.out_drop(x)
        return x
|
||||
|
||||
|
||||
class AttentionalPooler(nn.Module):
    """Pool a token sequence into ``n_queries`` outputs via learned-query cross-attention.

    Args:
        d_model: Width of the learned queries and of the output.
        context_dim: Width of the input tokens (used as keys and values).
        n_head: Number of attention heads.
        n_queries: Number of learned query vectors, i.e. output tokens per sample.
        norm_layer: Factory for the normalization applied to queries and inputs.
    """

    def __init__(
            self,
            d_model: int,
            context_dim: int,
            n_head: int = 8,
            n_queries: int = 256,
            norm_layer: Callable = LayerNorm
    ):
        super().__init__()
        self.query = nn.Parameter(torch.randn(n_queries, d_model))
        self.attn = nn.MultiheadAttention(d_model, n_head, kdim=context_dim, vdim=context_dim)
        self.ln_q = norm_layer(d_model)
        self.ln_k = norm_layer(context_dim)

    def forward(self, x: torch.Tensor):
        """Map ``x`` in NLD layout to pooled tokens in NLD layout."""
        context = self.ln_k(x).permute(1, 0, 2)  # NLD -> LND
        batch_size = context.shape[1]
        # Same normalized queries for every sample in the batch.
        queries = self._repeat(self.ln_q(self.query), batch_size)
        pooled = self.attn(queries, context, context, need_weights=False)[0]
        return pooled.permute(1, 0, 2)  # LND -> NLD

    def _repeat(self, query, N: int):
        """Tile ``query`` of shape (Q, D) to (Q, N, D)."""
        return query.unsqueeze(1).repeat(1, N, 1)
|
||||
|
||||
|
||||
class ResidualAttentionBlock(nn.Module):
    """Pre-norm transformer block: attention and MLP, each wrapped in a residual connection.

    Args:
        d_model: Channel width.
        n_head: Number of attention heads.
        mlp_ratio: Hidden width of the MLP as a multiple of ``d_model``.
        ls_init_value: If not None, LayerScale with this initial value is applied
            to both residual branches; otherwise they are left unscaled.
        act_layer: Factory for the MLP activation.
        norm_layer: Factory for the normalization layers.
        is_cross_attention: If True, an extra norm (``ln_1_kv``) is created for
            the key/value inputs.
    """

    def __init__(
            self,
            d_model: int,
            n_head: int,
            mlp_ratio: float = 4.0,
            ls_init_value: float = None,
            act_layer: Callable = nn.GELU,
            norm_layer: Callable = LayerNorm,
            is_cross_attention: bool = False,
    ):
        super().__init__()
        use_layer_scale = ls_init_value is not None

        # Attention branch.
        self.ln_1 = norm_layer(d_model)
        self.attn = nn.MultiheadAttention(d_model, n_head)
        self.ls_1 = LayerScale(d_model, ls_init_value) if use_layer_scale else nn.Identity()
        if is_cross_attention:
            self.ln_1_kv = norm_layer(d_model)

        # MLP branch.
        self.ln_2 = norm_layer(d_model)
        mlp_width = int(d_model * mlp_ratio)
        mlp_layers = OrderedDict()
        mlp_layers["c_fc"] = nn.Linear(d_model, mlp_width)
        mlp_layers["gelu"] = act_layer()
        mlp_layers["c_proj"] = nn.Linear(mlp_width, d_model)
        self.mlp = nn.Sequential(mlp_layers)
        self.ls_2 = LayerScale(d_model, ls_init_value) if use_layer_scale else nn.Identity()

    def attention(
            self,
            q_x: torch.Tensor,
            k_x: Optional[torch.Tensor] = None,
            v_x: Optional[torch.Tensor] = None,
            attn_mask: Optional[torch.Tensor] = None,
    ):
        """Run multi-head attention; missing keys/values fall back to ``q_x`` (self-attention)."""
        if k_x is None:
            k_x = q_x
        if v_x is None:
            v_x = q_x

        # The mask is cast to the query dtype before being handed to the attention module.
        if attn_mask is not None:
            attn_mask = attn_mask.to(q_x.dtype)

        return self.attn(
            q_x, k_x, v_x, need_weights=False, attn_mask=attn_mask
        )[0]

    def forward(
            self,
            q_x: torch.Tensor,
            k_x: Optional[torch.Tensor] = None,
            v_x: Optional[torch.Tensor] = None,
            attn_mask: Optional[torch.Tensor] = None,
    ):
        """Apply the block to ``q_x``.

        ``k_x`` / ``v_x`` are only used when the block was built with
        ``is_cross_attention=True``; otherwise they are discarded and the block
        performs self-attention on ``q_x``.
        """
        if hasattr(self, "ln_1_kv"):
            k_x = self.ln_1_kv(k_x) if k_x is not None else None
            v_x = self.ln_1_kv(v_x) if v_x is not None else None
        else:
            k_x = None
            v_x = None

        attn_out = self.attention(q_x=self.ln_1(q_x), k_x=k_x, v_x=v_x, attn_mask=attn_mask)
        x = q_x + self.ls_1(attn_out)
        x = x + self.ls_2(self.mlp(self.ln_2(x)))
        return x
|
||||
|
||||
|
||||
class CustomResidualAttentionBlock(nn.Module):
    """Residual block built on this module's ``Attention`` class, with optional extra norms.

    Args:
        d_model: Channel width.
        n_head: Number of attention heads.
        mlp_ratio: Hidden width of the MLP as a multiple of ``d_model``.
        ls_init_value: If not None, LayerScale with this initial value is applied
            to both residual branches.
        act_layer: Factory for the MLP activation.
        norm_layer: Factory for the normalization layers.
        scale_cosine_attn: Forwarded to ``Attention`` as ``scaled_cosine``.
        scale_heads: Forwarded to ``Attention`` as ``scale_heads``.
        scale_attn: If True, normalize the attention output before the residual add.
        scale_fc: If True, insert a norm after the first MLP linear layer.
    """

    def __init__(
            self,
            d_model: int,
            n_head: int,
            mlp_ratio: float = 4.0,
            ls_init_value: float = None,
            act_layer: Callable = nn.GELU,
            norm_layer: Callable = LayerNorm,
            scale_cosine_attn: bool = False,
            scale_heads: bool = False,
            scale_attn: bool = False,
            scale_fc: bool = False,
    ):
        super().__init__()
        use_layer_scale = ls_init_value is not None

        # Attention branch.
        self.ln_1 = norm_layer(d_model)
        self.attn = Attention(
            d_model, n_head,
            scaled_cosine=scale_cosine_attn,
            scale_heads=scale_heads,
        )
        self.ln_attn = norm_layer(d_model) if scale_attn else nn.Identity()
        self.ls_1 = LayerScale(d_model, ls_init_value) if use_layer_scale else nn.Identity()

        # MLP branch.
        self.ln_2 = norm_layer(d_model)
        mlp_width = int(d_model * mlp_ratio)
        mlp_layers = OrderedDict()
        mlp_layers["c_fc"] = nn.Linear(d_model, mlp_width)
        mlp_layers['ln'] = norm_layer(mlp_width) if scale_fc else nn.Identity()
        mlp_layers["gelu"] = act_layer()
        mlp_layers["c_proj"] = nn.Linear(mlp_width, d_model)
        self.mlp = nn.Sequential(mlp_layers)
        self.ls_2 = LayerScale(d_model, ls_init_value) if use_layer_scale else nn.Identity()

    def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
        """Apply the attention and MLP residual branches to ``x``."""
        attn_out = self.attn(self.ln_1(x), attn_mask=attn_mask)
        x = x + self.ls_1(self.ln_attn(attn_out))
        mlp_out = self.mlp(self.ln_2(x))
        x = x + self.ls_2(mlp_out)
        return x
|
||||
|
||||
|
||||
class Transformer(nn.Module):
    """A stack of ``layers`` ResidualAttentionBlock modules applied in sequence.

    Args:
        width: Channel width of every block.
        layers: Number of blocks.
        heads: Number of attention heads per block.
        mlp_ratio: MLP hidden width as a multiple of ``width``.
        ls_init_value: LayerScale initial value, or None to disable LayerScale.
        act_layer: Factory for the MLP activation.
        norm_layer: Factory for the normalization layers.
    """

    def __init__(
            self,
            width: int,
            layers: int,
            heads: int,
            mlp_ratio: float = 4.0,
            ls_init_value: float = None,
            act_layer: Callable = nn.GELU,
            norm_layer: Callable = LayerNorm,
    ):
        super().__init__()
        self.width = width
        self.layers = layers
        # Toggled by the owning model's set_grad_checkpointing().
        self.grad_checkpointing = False

        blocks = []
        for _ in range(layers):
            blocks.append(
                ResidualAttentionBlock(
                    width,
                    heads,
                    mlp_ratio,
                    ls_init_value=ls_init_value,
                    act_layer=act_layer,
                    norm_layer=norm_layer,
                )
            )
        self.resblocks = nn.ModuleList(blocks)

    def get_cast_dtype(self) -> torch.dtype:
        """Return the dtype of the first block's ``mlp.c_fc`` weight."""
        first_block = self.resblocks[0]
        return first_block.mlp.c_fc.weight.dtype

    def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
        """Pass ``x`` through every block, optionally with gradient checkpointing."""
        for block in self.resblocks:
            if self.grad_checkpointing and not torch.jit.is_scripting():
                # TODO: handle kwargs https://github.com/pytorch/pytorch/issues/79887#issuecomment-1161758372
                # Positional arguments: q_x, k_x, v_x, attn_mask.
                x = checkpoint(block, x, None, None, attn_mask)
            else:
                x = block(x, attn_mask=attn_mask)
        return x
|
||||
|
||||
|
||||
class VisionTransformer(nn.Module):
    """Vision Transformer image encoder.

    Images are split into patches, embedded, prefixed with a class token, run
    through a Transformer, pooled, normalized and projected.

    Args:
        image_size: Input image size (int or pair, via ``to_2tuple``).
        patch_size: Patch size (int or pair, via ``to_2tuple``).
        width: Transformer channel width.
        layers: Number of transformer blocks.
        heads: Number of attention heads.
        mlp_ratio: MLP hidden width as a multiple of ``width``.
        ls_init_value: LayerScale initial value, or None to disable LayerScale.
        global_average_pool: If True, pool by averaging all tokens instead of
            taking the first token.
        attentional_pool: If True, apply an AttentionalPooler before pooling.
        n_queries: Number of queries for the attentional pooler.
        attn_pooler_heads: Number of heads for the attentional pooler.
        output_dim: Width of the final embedding.
        patch_dropout: Patch dropout probability; ``0.`` disables it.
        input_patchnorm: If True, layer-normalize flattened patches and embed
            them with a linear layer instead of a convolution.
        act_layer: Factory for the MLP activation.
        norm_layer: Factory for the normalization layers.
        output_tokens: If True, ``forward`` returns ``(pooled, tokens)``.
    """

    output_tokens: torch.jit.Final[bool]

    def __init__(
            self,
            image_size: int,
            patch_size: int,
            width: int,
            layers: int,
            heads: int,
            mlp_ratio: float,
            ls_init_value: float = None,
            global_average_pool: bool = False,
            attentional_pool: bool = False,
            n_queries: int = 256,
            attn_pooler_heads: int = 8,
            output_dim: int = 512,
            patch_dropout: float = 0.,
            input_patchnorm: bool = False,
            act_layer: Callable = nn.GELU,
            norm_layer: Callable = LayerNorm,
            output_tokens: bool = False
    ):
        super().__init__()
        self.output_tokens = output_tokens
        self.image_size = to_2tuple(image_size)
        self.patch_size = to_2tuple(patch_size)
        image_height, image_width = self.image_size
        patch_height, patch_width = self.patch_size
        self.grid_size = (image_height // patch_height, image_width // patch_width)
        self.output_dim = output_dim

        # whether to layernorm each patch, as done in dual patchnorm paper - https://arxiv.org/abs/2302.01327v1
        self.input_patchnorm = input_patchnorm

        if input_patchnorm:
            patch_input_dim = patch_height * patch_width * 3
            self.patchnorm_pre_ln = LayerNorm(patch_input_dim)
            self.conv1 = nn.Linear(patch_input_dim, width)
        else:
            self.patchnorm_pre_ln = nn.Identity()
            self.conv1 = nn.Conv2d(
                in_channels=3,
                out_channels=width,
                kernel_size=patch_size,
                stride=patch_size,
                bias=False,
            )

        # class embeddings and positional embeddings
        scale = width ** -0.5
        num_positions = self.grid_size[0] * self.grid_size[1] + 1  # patches + class token
        self.class_embedding = nn.Parameter(scale * torch.randn(width))
        self.positional_embedding = nn.Parameter(scale * torch.randn(num_positions, width))

        # setting a patch_dropout of 0. would mean it is disabled and this function would be the identity fn
        self.patch_dropout = PatchDropout(patch_dropout) if patch_dropout > 0. else nn.Identity()

        self.ln_pre = norm_layer(width)
        self.transformer = Transformer(
            width,
            layers,
            heads,
            mlp_ratio,
            ls_init_value=ls_init_value,
            act_layer=act_layer,
            norm_layer=norm_layer,
        )

        self.global_average_pool = global_average_pool
        if attentional_pool:
            # The pooler already maps tokens to output_dim, so norm and projection operate at that width.
            self.attn_pool = AttentionalPooler(output_dim, width, n_head=attn_pooler_heads, n_queries=n_queries)
            self.ln_post = norm_layer(output_dim)
            self.proj = nn.Parameter(scale * torch.randn(output_dim, output_dim))
        else:
            self.attn_pool = None
            self.ln_post = norm_layer(width)
            self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

        self.init_parameters()

    def lock(self, unlocked_groups=0, freeze_bn_stats=False):
        """Freeze all parameters, then re-enable gradients for the last ``unlocked_groups`` groups.

        Groups, in order: input embeddings and pre-norm; each transformer block
        except the last; the last block together with ``ln_post``; ``proj``.
        ``freeze_bn_stats`` is accepted but not used here.
        """
        for param in self.parameters():
            param.requires_grad = False

        if unlocked_groups == 0:
            return

        input_group = [
            self.conv1,
            self.class_embedding,
            self.positional_embedding,
            self.ln_pre,
        ]
        final_group = [
            self.transformer.resblocks[-1],
            self.ln_post,
        ]
        groups = [
            input_group,
            *self.transformer.resblocks[:-1],
            final_group,
            self.proj,
        ]

        def _unlock(x):
            # Recurse into lists; enable grads on bare parameters or on a module's parameters.
            if isinstance(x, Sequence):
                for g in x:
                    _unlock(g)
            elif isinstance(x, torch.nn.Parameter):
                x.requires_grad = True
            else:
                for p in x.parameters():
                    p.requires_grad = True

        _unlock(groups[-unlocked_groups:])

    def init_parameters(self):
        """No-op: parameters keep the values they were given at construction."""
        # FIXME OpenAI CLIP did not define an init for the VisualTransformer
        # TODO experiment if default PyTorch init, below, or alternate init is best.

        # nn.init.normal_(self.class_embedding, std=self.scale)
        # nn.init.normal_(self.positional_embedding, std=self.scale)
        #
        # proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
        # attn_std = self.transformer.width ** -0.5
        # fc_std = (2 * self.transformer.width) ** -0.5
        # for block in self.transformer.resblocks:
        #     nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
        #     nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
        #     nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
        #     nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
        #
        # if self.text_projection is not None:
        #     nn.init.normal_(self.text_projection, std=self.scale)
        pass

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        """Enable or disable gradient checkpointing in the underlying transformer."""
        self.transformer.grad_checkpointing = enable

    def _global_pool(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(pooled, tokens)``: token mean and all tokens, or first token and the remaining tokens."""
        if self.global_average_pool:
            return x.mean(dim=1), x
        return x[:, 0], x[:, 1:]

    def forward(self, x: torch.Tensor, skip_pool: bool = False):
        """Encode a batch of images.

        Returns the un-pooled token sequence when ``skip_pool`` is True;
        otherwise the pooled, projected embedding, together with the tokens
        when ``self.output_tokens`` is set.
        """
        # to patches - whether to use dual patchnorm - https://arxiv.org/abs/2302.01327v1
        if self.input_patchnorm:
            # einops - rearrange(x, 'b c (h p1) (w p2) -> b (h w) (c p1 p2)')
            grid_h, grid_w = self.grid_size[0], self.grid_size[1]
            x = x.reshape(x.shape[0], x.shape[1], grid_h, self.patch_size[0], grid_w, self.patch_size[1])
            x = x.permute(0, 2, 4, 1, 3, 5)
            x = x.reshape(x.shape[0], grid_h * grid_w, -1)
            x = self.patchnorm_pre_ln(x)
            x = self.conv1(x)
        else:
            x = self.conv1(x)  # shape = [*, width, grid, grid]
            x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
            x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]

        # class embeddings and positional embeddings
        # Adding to a zeros tensor broadcasts the class embedding to one token per sample.
        cls_tokens = self.class_embedding.to(x.dtype) + torch.zeros(
            x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
        x = torch.cat([cls_tokens, x], dim=1)  # shape = [*, grid ** 2 + 1, width]
        x = x + self.positional_embedding.to(x.dtype)

        # a patch_dropout of 0. would mean it is disabled and this function would do nothing but return what was passed in
        x = self.patch_dropout(x)
        x = self.ln_pre(x)

        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD

        if skip_pool:
            return x

        if self.attn_pool is not None:
            # With attentional pooling the norm is applied to all tokens before pooling.
            x = self.attn_pool(x)
            x = self.ln_post(x)
            pooled, tokens = self._global_pool(x)
        else:
            # Otherwise only the pooled vector is normalized.
            pooled, tokens = self._global_pool(x)
            pooled = self.ln_post(pooled)

        if self.proj is not None:
            pooled = pooled @ self.proj

        if self.output_tokens:
            return pooled, tokens

        return pooled
|
||||
|
||||
|
||||
class TextTransformer(nn.Module):
    """Causal text encoder: token embedding, Transformer, final norm and projection.

    Args:
        context_length: Maximum number of text tokens.
        vocab_size: Size of the token embedding table.
        width: Transformer channel width.
        heads: Number of attention heads.
        layers: Number of transformer blocks.
        ls_init_value: LayerScale initial value, or None to disable LayerScale.
        output_dim: Width of the projected text embedding.
        act_layer: Factory for the MLP activation.
        norm_layer: Factory for the normalization layers.
        embed_cls: If True, a learned CLS embedding is appended to every
            sequence and its output is used as the pooled representation.
        pad_id: Token id treated as padding when building the CLS mask.
        output_tokens: If True, ``forward`` returns ``(pooled, tokens)``.
    """

    output_tokens: torch.jit.Final[bool]

    def __init__(
            self,
            context_length: int = 77,
            vocab_size: int = 49408,
            width: int = 512,
            heads: int = 8,
            layers: int = 12,
            ls_init_value: float = None,
            output_dim: int = 512,
            act_layer: Callable = nn.GELU,
            norm_layer: Callable = LayerNorm,
            embed_cls: bool = False,
            pad_id: int = 0,
            output_tokens: bool = False,
    ):
        super().__init__()
        self.output_tokens = output_tokens
        self.num_pos = self.context_length = context_length
        self.vocab_size = vocab_size
        self.width = width
        self.output_dim = output_dim
        self.heads = heads
        self.pad_id = pad_id

        self.text_projection = nn.Parameter(torch.empty(width, output_dim))

        if embed_cls:
            self.cls_emb = nn.Parameter(torch.empty(width))
            # One extra position for the appended CLS token.
            self.num_pos += 1
        else:
            self.cls_emb = None

        self.token_embedding = nn.Embedding(vocab_size, width)
        self.positional_embedding = nn.Parameter(torch.empty(self.num_pos, width))
        self.transformer = Transformer(
            width=width,
            layers=layers,
            heads=heads,
            ls_init_value=ls_init_value,
            act_layer=act_layer,
            norm_layer=norm_layer,
        )
        self.ln_final = norm_layer(width)

        self.register_buffer('attn_mask', self.build_attention_mask(), persistent=False)

        self.init_parameters()

    def init_parameters(self):
        """Initialize embeddings, transformer weights and the text projection with normal distributions."""
        nn.init.normal_(self.token_embedding.weight, std=0.02)
        nn.init.normal_(self.positional_embedding, std=0.01)
        if self.cls_emb is not None:
            nn.init.normal_(self.cls_emb, std=0.01)

        proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
        attn_std = self.transformer.width ** -0.5
        fc_std = (2 * self.transformer.width) ** -0.5
        for block in self.transformer.resblocks:
            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)

        if self.text_projection is not None:
            nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        """Enable or disable gradient checkpointing in the underlying transformer."""
        self.transformer.grad_checkpointing = enable

    def build_attention_mask(self):
        """Return an additive causal mask of shape ``(num_pos, num_pos)``."""
        # lazily create causal attention mask, with full attention between the tokens
        # pytorch uses additive attention mask; fill with -inf
        mask = torch.empty(self.num_pos, self.num_pos)
        mask.fill_(float("-inf"))
        mask.triu_(1)  # zero out the lower diagonal
        return mask

    def build_cls_mask(self, text, cast_dtype: torch.dtype):
        """Build an additive padding mask derived from ``text != pad_id``, repeated once per attention head."""
        cls_mask = (text != self.pad_id).unsqueeze(1)
        cls_mask = F.pad(cls_mask, (1, 0, cls_mask.shape[2], 0), value=1.0)
        additive_mask = torch.empty(cls_mask.shape, dtype=cast_dtype, device=cls_mask.device)
        additive_mask.fill_(0)
        additive_mask.masked_fill_(~cls_mask, float("-inf"))
        additive_mask = torch.repeat_interleave(additive_mask, self.heads, 0)
        return additive_mask

    def _repeat(self, t, N: int):
        """Reshape ``t`` to ``(1, 1, -1)`` and repeat it ``N`` times along the first axis."""
        return t.reshape(1, 1, -1).repeat(N, 1, 1)

    def forward(self, text):
        """Encode token ids ``text`` of shape ``(batch, seq_len)``.

        Returns the pooled, projected embedding, together with the per-token
        features when ``self.output_tokens`` is set.
        """
        cast_dtype = self.transformer.get_cast_dtype()
        seq_len = text.shape[1]

        x = self.token_embedding(text).to(cast_dtype)  # [batch_size, n_ctx, d_model]
        attn_mask = self.attn_mask
        if self.cls_emb is not None:
            seq_len += 1
            x = torch.cat([x, self._repeat(self.cls_emb, x.shape[0])], dim=1)
            cls_mask = self.build_cls_mask(text, cast_dtype)
            attn_mask = attn_mask[None, :seq_len, :seq_len] + cls_mask[:, :seq_len, :seq_len]
        else:
            # Trim the causal mask to the actual sequence length, mirroring the slicing
            # of the positional embedding below, so inputs shorter than context_length
            # do not reach the attention layers with a mismatched mask. A no-op when
            # seq_len equals the full context length.
            attn_mask = attn_mask[:seq_len, :seq_len]

        x = x + self.positional_embedding[:seq_len].to(cast_dtype)
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x, attn_mask=attn_mask)
        x = x.permute(1, 0, 2)  # LND -> NLD

        # x.shape = [batch_size, n_ctx, transformer.width]
        # take features from the eot embedding (eot_token is the highest number in each sequence)
        if self.cls_emb is not None:
            # The appended CLS token is last; only it is normalized.
            pooled, tokens = x[:, -1], x[:, :-1]
            pooled = self.ln_final(pooled)
        else:
            x = self.ln_final(x)
            pooled, tokens = x[torch.arange(x.shape[0]), text.argmax(dim=-1)], x

        if self.text_projection is not None:
            pooled = pooled @ self.text_projection

        if self.output_tokens:
            return pooled, tokens

        return pooled
|
||||
|
||||
|
||||
class MultimodalTransformer(Transformer):
    """Text decoder that interleaves causal self-attention with cross-attention over image embeddings.

    Extends ``Transformer`` with one cross-attention block per self-attention
    block, a final norm and an output projection.

    Args:
        width: Channel width.
        layers: Number of self-attention blocks (and of cross-attention blocks).
        heads: Number of attention heads.
        context_length: Size of the causal attention mask.
        mlp_ratio: MLP hidden width as a multiple of ``width``.
        ls_init_value: LayerScale initial value, or None to disable LayerScale.
        act_layer: Factory for the MLP activation.
        norm_layer: Factory for the normalization layers.
        output_dim: Width of the projected output.
    """

    def __init__(
            self,
            width: int,
            layers: int,
            heads: int,
            context_length: int = 77,
            mlp_ratio: float = 4.0,
            ls_init_value: float = None,
            act_layer: Callable = nn.GELU,
            norm_layer: Callable = LayerNorm,
            output_dim: int = 512,
    ):

        super().__init__(
            width=width,
            layers=layers,
            heads=heads,
            mlp_ratio=mlp_ratio,
            ls_init_value=ls_init_value,
            act_layer=act_layer,
            norm_layer=norm_layer,
        )
        self.context_length = context_length
        self.cross_attn = nn.ModuleList([
            ResidualAttentionBlock(
                width,
                heads,
                mlp_ratio,
                ls_init_value=ls_init_value,
                act_layer=act_layer,
                norm_layer=norm_layer,
                is_cross_attention=True,
            )
            for _ in range(layers)
        ])

        self.register_buffer('attn_mask', self.build_attention_mask(), persistent=False)

        self.ln_final = norm_layer(width)
        # Allocated uninitialized here; init_parameters() is not called from __init__.
        self.text_projection = nn.Parameter(torch.empty(width, output_dim))

    def init_parameters(self):
        """Initialize self-attention, cross-attention and projection weights with normal distributions.

        This class *is* the transformer (it subclasses ``Transformer``), so the
        width, depth and blocks are read from ``self`` directly rather than
        from a ``self.transformer`` attribute, which does not exist here.
        """
        proj_std = (self.width ** -0.5) * ((2 * self.layers) ** -0.5)
        attn_std = self.width ** -0.5
        fc_std = (2 * self.width) ** -0.5
        for block in self.resblocks:
            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
        for block in self.cross_attn:
            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)

        if self.text_projection is not None:
            nn.init.normal_(self.text_projection, std=self.width ** -0.5)

    def build_attention_mask(self):
        """Return an additive causal mask of shape ``(context_length, context_length)``."""
        # lazily create causal attention mask, with full attention between the tokens
        # pytorch uses additive attention mask; fill with -inf
        mask = torch.empty(self.context_length, self.context_length)
        mask.fill_(float("-inf"))
        mask.triu_(1)  # zero out the lower diagonal
        return mask

    def forward(self, image_embs, text_embs):
        """Decode ``text_embs`` conditioned on ``image_embs`` (both in NLD layout) and return projected outputs in NLD layout."""
        text_embs = text_embs.permute(1, 0, 2)  # NLD -> LND
        image_embs = image_embs.permute(1, 0, 2)  # NLD -> LND
        seq_len = text_embs.shape[0]

        for resblock, cross_attn in zip(self.resblocks, self.cross_attn):
            if self.grad_checkpointing and not torch.jit.is_scripting():
                # TODO: handle kwargs https://github.com/pytorch/pytorch/issues/79887#issuecomment-1161758372
                # Positional arguments: q_x, k_x, v_x, attn_mask.
                text_embs = checkpoint(resblock, text_embs, None, None, self.attn_mask[:seq_len, :seq_len])
                text_embs = checkpoint(cross_attn, text_embs, image_embs, image_embs, None)
            else:
                text_embs = resblock(text_embs, attn_mask=self.attn_mask[:seq_len, :seq_len])
                text_embs = cross_attn(text_embs, k_x=image_embs, v_x=image_embs)

        x = text_embs.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_final(x)

        if self.text_projection is not None:
            x = x @ self.text_projection

        return x

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        """Enable or disable gradient checkpointing for this transformer."""
        self.grad_checkpointing = enable
|
||||
60
diffsynth/extensions/ImageQualityMetric/open_clip/utils.py
Normal file
60
diffsynth/extensions/ImageQualityMetric/open_clip/utils.py
Normal file
@@ -0,0 +1,60 @@
|
||||
from itertools import repeat
|
||||
import collections.abc
|
||||
|
||||
from torch import nn as nn
|
||||
from torchvision.ops.misc import FrozenBatchNorm2d
|
||||
|
||||
|
||||
def freeze_batch_norm_2d(module, module_match=None, name=''):
    """
    Converts all `BatchNorm2d` and `SyncBatchNorm` layers of provided module into `FrozenBatchNorm2d`. If `module` is
    itself an instance of either `BatchNorm2d` or `SyncBatchNorm`, it is converted into `FrozenBatchNorm2d` and
    returned. Otherwise, the module is walked recursively and submodules are converted in place.

    Args:
        module (torch.nn.Module): Any PyTorch module.
        module_match (dict): Dictionary of full module names to freeze (all if empty or None)
        name (str): Full module name (prefix)

    Returns:
        torch.nn.Module: Resulting module

    Inspired by https://github.com/pytorch/pytorch/blob/a5895f85be0f10212791145bfedc0261d364f103/torch/nn/modules/batchnorm.py#L762
    """
    res = module
    # With no filter (None or empty) every batch-norm layer is a match.
    is_match = name in module_match if module_match else True
    if is_match and isinstance(module, (nn.modules.batchnorm.BatchNorm2d, nn.modules.batchnorm.SyncBatchNorm)):
        res = FrozenBatchNorm2d(module.num_features)
        res.num_features = module.num_features
        res.affine = module.affine
        if module.affine:
            res.weight.data = module.weight.data.clone().detach()
            res.bias.data = module.bias.data.clone().detach()
        res.running_mean.data = module.running_mean.data
        res.running_var.data = module.running_var.data
        res.eps = module.eps
    else:
        for child_name, child in module.named_children():
            full_child_name = '.'.join([name, child_name]) if name else child_name
            new_child = freeze_batch_norm_2d(child, module_match, full_child_name)
            # Only re-register children that were actually replaced.
            if new_child is not child:
                res.add_module(child_name, new_child)
    return res
|
||||
|
||||
|
||||
# From PyTorch internals
|
||||
def _ntuple(n):
    """Return a function that expands a non-iterable value into an ``n``-tuple.

    The returned function passes any iterable through unchanged (whatever its
    length) and repeats anything else ``n`` times.
    """
    def parse(x):
        if not isinstance(x, collections.abc.Iterable):
            return tuple(repeat(x, n))
        return x
    return parse


to_1tuple = _ntuple(1)
to_2tuple = _ntuple(2)
to_3tuple = _ntuple(3)
to_4tuple = _ntuple(4)


def to_ntuple(n, x):
    """Expand ``x`` into an ``n``-tuple unless it is already iterable."""
    return _ntuple(n)(x)
|
||||
@@ -0,0 +1 @@
|
||||
# Version string exposed by this module.
__version__ = '2.16.0'
|
||||
Reference in New Issue
Block a user