mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-03-19 06:39:43 +00:00
add new quality metric
This commit is contained in:
@@ -13,8 +13,16 @@ from transformers import BertTokenizer
|
||||
from .vit import VisionTransformer, interpolate_pos_embed
|
||||
|
||||
|
||||
def default_bert():
    """Return the local directory expected to hold the ``bert-base-uncased`` files.

    The result is ``<root>/models/QualityMetric/bert-base-uncased``, where
    ``<root>`` is four directory levels above the folder containing this
    module. The path is only computed here; nothing checks that it exists.

    Returns:
        str: Absolute path of the ``bert-base-uncased`` directory.
    """
    current_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.abspath(os.path.join(current_dir, '../../../../'))
    model_path = os.path.join(project_root, 'models', 'QualityMetric')
    return os.path.join(model_path, "bert-base-uncased")


# Resolved once at import time; init_tokenizer passes it to
# BertTokenizer.from_pretrained.
bert_model_path = default_bert()
|
||||
|
||||
def init_tokenizer():
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
tokenizer = BertTokenizer.from_pretrained(bert_model_path)
|
||||
tokenizer.add_special_tokens({'bos_token':'[DEC]'})
|
||||
tokenizer.add_special_tokens({'additional_special_tokens':['[ENC]']})
|
||||
tokenizer.enc_token_id = tokenizer.additional_special_tokens_ids[0]
|
||||
|
||||
@@ -50,31 +50,30 @@ class MLP(torch.nn.Module):
|
||||
|
||||
|
||||
class AestheticScore:
|
||||
def __init__(self, device: torch.device, model_path: str = MODEL_PATHS.get("aesthetic_predictor")):
|
||||
def __init__(self, device: torch.device, path: str = MODEL_PATHS):
|
||||
"""Initialize the Selector with a model and processor.
|
||||
|
||||
Args:
|
||||
device (torch.device): The device to load the model on.
|
||||
model_path (str): Path to the model weights file.
|
||||
"""
|
||||
self.device = device
|
||||
|
||||
self.aes_model_path = path.get("aesthetic_predictor")
|
||||
# Load the MLP model
|
||||
self.model = MLP(768)
|
||||
try:
|
||||
if model_path.endswith(".safetensors"):
|
||||
state_dict = load_file(model_path)
|
||||
if self.aes_model_path.endswith(".safetensors"):
|
||||
state_dict = load_file(self.aes_model_path)
|
||||
else:
|
||||
state_dict = torch.load(model_path)
|
||||
state_dict = torch.load(self.aes_model_path)
|
||||
self.model.load_state_dict(state_dict)
|
||||
except Exception as e:
|
||||
raise ValueError(f"Error loading model weights from {model_path}: {e}")
|
||||
raise ValueError(f"Error loading model weights from {self.aes_model_path}: {e}")
|
||||
|
||||
self.model.to(device)
|
||||
self.model.eval()
|
||||
|
||||
# Load the CLIP model and processor
|
||||
clip_model_name = MODEL_PATHS.get('clip-large')
|
||||
clip_model_name = path.get('clip-large')
|
||||
self.model2 = AutoModel.from_pretrained(clip_model_name).eval().to(device)
|
||||
self.processor = AutoProcessor.from_pretrained(clip_model_name)
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ from .open_clip import create_model_and_transforms, get_tokenizer
|
||||
from .config import MODEL_PATHS
|
||||
|
||||
class CLIPScore:
|
||||
def __init__(self, device: torch.device):
|
||||
def __init__(self, device: torch.device, path: str = MODEL_PATHS):
|
||||
"""Initialize the CLIPScore with a model and tokenizer.
|
||||
|
||||
Args:
|
||||
@@ -17,7 +17,7 @@ class CLIPScore:
|
||||
self.model, _, self.preprocess_val = create_model_and_transforms(
|
||||
"ViT-H-14",
|
||||
# "laion2B-s32B-b79K",
|
||||
pretrained=MODEL_PATHS.get("open_clip"),
|
||||
pretrained=path.get("open_clip"),
|
||||
precision="amp",
|
||||
device=device,
|
||||
jit=False,
|
||||
|
||||
@@ -2,11 +2,11 @@ import os
|
||||
|
||||
# Directory containing this config module.
current_dir = os.path.dirname(os.path.abspath(__file__))
# Three directory levels above this module.
project_root = os.path.abspath(os.path.join(current_dir, '../../../'))
# Folder under the project root where the quality-metric model files are looked up.
model_path = os.path.join(project_root, 'models', 'QualityMetric')
# Earlier name for the same folder, kept as an alias of model_path.
quality_metric_path = model_path


def get_model_path(model_name):
    """Return the path of ``model_name`` inside the QualityMetric model folder.

    Args:
        model_name (str): File or directory name, optionally with
            sub-directories, relative to ``model_path``.

    Returns:
        str: ``model_path`` joined with ``model_name``. The path is not
        checked for existence.
    """
    return os.path.join(model_path, model_name)
|
||||
|
||||
|
||||
MODEL_PATHS = {
|
||||
@@ -18,6 +18,6 @@ MODEL_PATHS = {
|
||||
"med_config": get_model_path("ImageReward/med_config.json"),
|
||||
"clip": get_model_path("CLIP-ViT-H-14-laion2B-s32B-b79K"),
|
||||
"clip-large": get_model_path("clip-vit-large-patch14"),
|
||||
"mps": get_model_path("MPS_overall_checkpoint/MPS_overall_checkpoint_diffsynth.pth"),
|
||||
"mps": get_model_path("MPS_overall_checkpoint/MPS_overall_checkpoint_diffsynth.safetensors"),
|
||||
"pickscore": get_model_path("PickScore_v1")
|
||||
}
|
||||
@@ -7,7 +7,7 @@ import os
|
||||
from .config import MODEL_PATHS
|
||||
|
||||
class HPScore_v2:
|
||||
def __init__(self, device: torch.device, model_version: str = "v2"):
|
||||
def __init__(self, device: torch.device, path: str = MODEL_PATHS, model_version: str = "v2"):
|
||||
"""Initialize the Selector with a model and tokenizer.
|
||||
|
||||
Args:
|
||||
@@ -17,9 +17,9 @@ class HPScore_v2:
|
||||
self.device = device
|
||||
|
||||
if model_version == "v2":
|
||||
safetensors_path = MODEL_PATHS.get("hpsv2")
|
||||
safetensors_path = path.get("hpsv2")
|
||||
elif model_version == "v21":
|
||||
safetensors_path = MODEL_PATHS.get("hpsv2.1")
|
||||
safetensors_path = path.get("hpsv2.1")
|
||||
else:
|
||||
raise ValueError(f"Unsupported model version: {model_version}. Choose 'v2' or 'v21'.")
|
||||
|
||||
@@ -27,7 +27,7 @@ class HPScore_v2:
|
||||
model, _, self.preprocess_val = create_model_and_transforms(
|
||||
"ViT-H-14",
|
||||
# "laion2B-s32B-b79K",
|
||||
pretrained=MODEL_PATHS.get("open_clip"),
|
||||
pretrained=path.get("open_clip"),
|
||||
precision="amp",
|
||||
device=device,
|
||||
jit=False,
|
||||
|
||||
@@ -188,15 +188,15 @@ class ImageReward(torch.nn.Module):
|
||||
|
||||
|
||||
class ImageRewardScore:
|
||||
def __init__(self, device: Union[str, torch.device]):
|
||||
def __init__(self, device: Union[str, torch.device], path: str = MODEL_PATHS):
|
||||
"""Initialize the Selector with a processor and model.
|
||||
|
||||
Args:
|
||||
device (Union[str, torch.device]): The device to load the model on.
|
||||
"""
|
||||
self.device = device if isinstance(device, torch.device) else torch.device(device)
|
||||
model_path = MODEL_PATHS.get("imagereward")
|
||||
med_config = MODEL_PATHS.get("med_config")
|
||||
model_path = path.get("imagereward")
|
||||
med_config = path.get("med_config")
|
||||
state_dict = load_file(model_path)
|
||||
self.model = ImageReward(device=self.device, med_config=med_config).to(self.device)
|
||||
self.model.load_state_dict(state_dict, strict=False)
|
||||
|
||||
@@ -4,10 +4,10 @@ from PIL import Image
|
||||
from io import BytesIO
|
||||
from tqdm.auto import tqdm
|
||||
from transformers import CLIPFeatureExtractor, CLIPImageProcessor
|
||||
|
||||
from transformers import CLIPConfig
|
||||
from dataclasses import dataclass
|
||||
from transformers import CLIPModel as HFCLIPModel
|
||||
|
||||
from safetensors.torch import load_file
|
||||
from torch import nn, einsum
|
||||
|
||||
from .trainer.models.base_model import BaseModelConfig
|
||||
@@ -18,26 +18,27 @@ from typing import Any, Optional, Tuple, Union, List
|
||||
import torch
|
||||
|
||||
from .trainer.models.cross_modeling import Cross_model
|
||||
from .trainer.models import clip_model
|
||||
import torch.nn.functional as F
|
||||
|
||||
import gc
|
||||
import json
|
||||
from .config import MODEL_PATHS
|
||||
|
||||
class MPScore:
|
||||
def __init__(self, device: Union[str, torch.device], path: dict = MODEL_PATHS, condition: str = 'overall'):
    """Initialize the MPSModel with a processor, tokenizer, and model.

    Args:
        device (Union[str, torch.device]): The device to load the model on.
        path (dict): Mapping from model name to local path. The ``"clip"``
            and ``"mps"`` entries are read. Defaults to ``MODEL_PATHS``.
        condition (str): Stored unchanged as ``self.condition``.
    """
    self.device = device

    # The image processor, the tokenizer and the model backbone are all
    # built from the same "clip" entry.
    processor_name_or_path = path.get("clip")
    self.image_processor = CLIPImageProcessor.from_pretrained(processor_name_or_path)
    self.tokenizer = AutoTokenizer.from_pretrained(processor_name_or_path, trust_remote_code=True)

    # Build the model first, then fill it from the safetensors checkpoint.
    self.model = clip_model.CLIPModel(processor_name_or_path)
    state_dict = load_file(path.get("mps"))
    # NOTE(review): strict=False means checkpoint keys that do not match the
    # model are not reported as errors - confirm this is intended.
    self.model.load_state_dict(state_dict, strict=False)
    # Switch to evaluation mode before use, as the other scorers in this
    # package do. NOTE(review): assumes CLIPModel is a torch.nn.Module.
    self.model.eval()
    self.model.to(device)

    self.condition = condition
|
||||
|
||||
def _calculate_score(self, image: torch.Tensor, prompt: str) -> float:
|
||||
|
||||
Binary file not shown.
@@ -1,22 +0,0 @@
|
||||
{
|
||||
"embed_dim": 512,
|
||||
"quick_gelu": true,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": [
|
||||
3,
|
||||
4,
|
||||
23,
|
||||
3
|
||||
],
|
||||
"width": 64,
|
||||
"patch_size": null
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 512,
|
||||
"heads": 8,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,21 +0,0 @@
|
||||
{
|
||||
"embed_dim": 512,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": [
|
||||
3,
|
||||
4,
|
||||
23,
|
||||
3
|
||||
],
|
||||
"width": 64,
|
||||
"patch_size": null
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 512,
|
||||
"heads": 8,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,22 +0,0 @@
|
||||
{
|
||||
"embed_dim": 1024,
|
||||
"quick_gelu": true,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": [
|
||||
3,
|
||||
4,
|
||||
6,
|
||||
3
|
||||
],
|
||||
"width": 64,
|
||||
"patch_size": null
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 512,
|
||||
"heads": 8,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,21 +0,0 @@
|
||||
{
|
||||
"embed_dim": 1024,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": [
|
||||
3,
|
||||
4,
|
||||
6,
|
||||
3
|
||||
],
|
||||
"width": 64,
|
||||
"patch_size": null
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 512,
|
||||
"heads": 8,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,21 +0,0 @@
|
||||
{
|
||||
"embed_dim": 768,
|
||||
"vision_cfg": {
|
||||
"image_size": 384,
|
||||
"layers": [
|
||||
6,
|
||||
8,
|
||||
18,
|
||||
8
|
||||
],
|
||||
"width": 96,
|
||||
"patch_size": null
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 768,
|
||||
"heads": 12,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,21 +0,0 @@
|
||||
{
|
||||
"embed_dim": 640,
|
||||
"vision_cfg": {
|
||||
"image_size": 288,
|
||||
"layers": [
|
||||
4,
|
||||
6,
|
||||
10,
|
||||
6
|
||||
],
|
||||
"width": 80,
|
||||
"patch_size": null
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 640,
|
||||
"heads": 10,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,21 +0,0 @@
|
||||
{
|
||||
"embed_dim": 1024,
|
||||
"vision_cfg": {
|
||||
"image_size": 448,
|
||||
"layers": [
|
||||
3,
|
||||
15,
|
||||
36,
|
||||
10
|
||||
],
|
||||
"width": 128,
|
||||
"patch_size": null
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 1024,
|
||||
"heads": 16,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"embed_dim": 640,
|
||||
"vision_cfg": {
|
||||
"image_size": 240,
|
||||
"layers": 12,
|
||||
"width": 896,
|
||||
"patch_size": 16
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 640,
|
||||
"heads": 10,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"embed_dim": 640,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 12,
|
||||
"width": 896,
|
||||
"patch_size": 16
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 640,
|
||||
"heads": 10,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"embed_dim": 512,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 12,
|
||||
"width": 768,
|
||||
"patch_size": 16
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 512,
|
||||
"heads": 8,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"embed_dim": 640,
|
||||
"vision_cfg": {
|
||||
"image_size": 256,
|
||||
"layers": 12,
|
||||
"width": 896,
|
||||
"patch_size": 32
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 640,
|
||||
"heads": 10,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,17 +0,0 @@
|
||||
{
|
||||
"embed_dim": 512,
|
||||
"quick_gelu": true,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 12,
|
||||
"width": 768,
|
||||
"patch_size": 32
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 512,
|
||||
"heads": 8,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"embed_dim": 512,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 12,
|
||||
"width": 768,
|
||||
"patch_size": 32
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 512,
|
||||
"heads": 8,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,17 +0,0 @@
|
||||
{
|
||||
"embed_dim": 1024,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 32,
|
||||
"width": 1280,
|
||||
"head_width": 80,
|
||||
"patch_size": 16
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 1024,
|
||||
"heads": 16,
|
||||
"layers": 24
|
||||
}
|
||||
}
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"embed_dim": 768,
|
||||
"vision_cfg": {
|
||||
"image_size": 280,
|
||||
"layers": 24,
|
||||
"width": 1024,
|
||||
"patch_size": 14
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 768,
|
||||
"heads": 12,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"embed_dim": 768,
|
||||
"vision_cfg": {
|
||||
"image_size": 336,
|
||||
"layers": 24,
|
||||
"width": 1024,
|
||||
"patch_size": 14
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 768,
|
||||
"heads": 12,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"embed_dim": 768,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 24,
|
||||
"width": 1024,
|
||||
"patch_size": 14
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 768,
|
||||
"heads": 12,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"embed_dim": 768,
|
||||
"vision_cfg": {
|
||||
"image_size": 320,
|
||||
"layers": 24,
|
||||
"width": 1024,
|
||||
"patch_size": 16
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 768,
|
||||
"heads": 12,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"embed_dim": 768,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 24,
|
||||
"width": 1024,
|
||||
"patch_size": 16
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 768,
|
||||
"heads": 12,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,17 +0,0 @@
|
||||
{
|
||||
"embed_dim": 384,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 12,
|
||||
"width": 512,
|
||||
"patch_size": 16,
|
||||
"ls_init_value": 1e-4
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 384,
|
||||
"heads": 6,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"embed_dim": 512,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 12,
|
||||
"width": 512,
|
||||
"patch_size": 16
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 512,
|
||||
"heads": 8,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"embed_dim": 384,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 12,
|
||||
"width": 512,
|
||||
"patch_size": 32
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 384,
|
||||
"heads": 6,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"embed_dim": 512,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 12,
|
||||
"width": 512,
|
||||
"patch_size": 32
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 512,
|
||||
"heads": 8,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"embed_dim": 256,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 12,
|
||||
"width": 384,
|
||||
"patch_size": 16
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 256,
|
||||
"heads": 4,
|
||||
"layers": 10
|
||||
}
|
||||
}
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"embed_dim": 384,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 12,
|
||||
"width": 384,
|
||||
"patch_size": 16
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 384,
|
||||
"heads": 6,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"embed_dim": 256,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 12,
|
||||
"width": 384,
|
||||
"patch_size": 32
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 256,
|
||||
"heads": 4,
|
||||
"layers": 10
|
||||
}
|
||||
}
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"embed_dim": 384,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 12,
|
||||
"width": 384,
|
||||
"patch_size": 32
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 384,
|
||||
"heads": 6,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,18 +0,0 @@
|
||||
{
|
||||
"embed_dim": 1280,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 48,
|
||||
"width": 1664,
|
||||
"head_width": 104,
|
||||
"mlp_ratio": 4.9231,
|
||||
"patch_size": 14
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 1280,
|
||||
"heads": 20,
|
||||
"layers": 32
|
||||
}
|
||||
}
|
||||
@@ -1,18 +0,0 @@
|
||||
{
|
||||
"embed_dim": 1280,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 56,
|
||||
"width": 1792,
|
||||
"head_width": 112,
|
||||
"mlp_ratio": 8.5715,
|
||||
"patch_size": 14
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 1280,
|
||||
"heads": 20,
|
||||
"layers": 36
|
||||
}
|
||||
}
|
||||
@@ -1,18 +0,0 @@
|
||||
{
|
||||
"embed_dim": 1024,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 40,
|
||||
"width": 1408,
|
||||
"head_width": 88,
|
||||
"mlp_ratio": 4.3637,
|
||||
"patch_size": 14
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 1024,
|
||||
"heads": 16,
|
||||
"layers": 24
|
||||
}
|
||||
}
|
||||
@@ -1,30 +0,0 @@
|
||||
{
|
||||
"embed_dim": 512,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 12,
|
||||
"width": 768,
|
||||
"patch_size": 32,
|
||||
"attentional_pool": true,
|
||||
"attn_pooler_heads": 8,
|
||||
"output_tokens": true
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 76,
|
||||
"vocab_size": 49408,
|
||||
"width": 512,
|
||||
"heads": 8,
|
||||
"layers": 12,
|
||||
"embed_cls": true,
|
||||
"output_tokens": true
|
||||
},
|
||||
"multimodal_cfg": {
|
||||
"context_length": 76,
|
||||
"vocab_size": 49408,
|
||||
"width": 512,
|
||||
"heads": 8,
|
||||
"layers": 12,
|
||||
"attn_pooler_heads": 8
|
||||
},
|
||||
"custom_text": true
|
||||
}
|
||||
@@ -1,30 +0,0 @@
|
||||
{
|
||||
"embed_dim": 768,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 24,
|
||||
"width": 1024,
|
||||
"patch_size": 14,
|
||||
"attentional_pool": true,
|
||||
"attn_pooler_heads": 8,
|
||||
"output_tokens": true
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 76,
|
||||
"vocab_size": 49408,
|
||||
"width": 768,
|
||||
"heads": 12,
|
||||
"layers": 12,
|
||||
"embed_cls": true,
|
||||
"output_tokens": true
|
||||
},
|
||||
"multimodal_cfg": {
|
||||
"context_length": 76,
|
||||
"vocab_size": 49408,
|
||||
"width": 768,
|
||||
"heads": 12,
|
||||
"layers": 12,
|
||||
"attn_pooler_heads": 12
|
||||
},
|
||||
"custom_text": true
|
||||
}
|
||||
@@ -1,31 +0,0 @@
|
||||
{
|
||||
"embed_dim": 512,
|
||||
"multimodal_cfg": {
|
||||
"width": 768,
|
||||
"context_length": 76,
|
||||
"vocab_size": 64000,
|
||||
"mlp_ratio": 4,
|
||||
"layers": 12,
|
||||
"dim_head": 64,
|
||||
"heads": 12,
|
||||
"n_queries": 256,
|
||||
"attn_pooler_heads": 8
|
||||
},
|
||||
"vision_cfg": {
|
||||
"image_size": 288,
|
||||
"layers": 12,
|
||||
"width": 768,
|
||||
"patch_size": 18,
|
||||
"output_tokens": true
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 76,
|
||||
"vocab_size": 64000,
|
||||
"layers": 12,
|
||||
"heads": 12,
|
||||
"width": 768,
|
||||
"embed_cls": true,
|
||||
"output_tokens": true
|
||||
},
|
||||
"custom_text": true
|
||||
}
|
||||
@@ -1,24 +0,0 @@
|
||||
{
|
||||
"embed_dim": 512,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 12,
|
||||
"width": 768,
|
||||
"patch_size": 32,
|
||||
"output_tokens": true
|
||||
},
|
||||
"text_cfg": {
|
||||
"hf_model_name": "roberta-base",
|
||||
"hf_tokenizer_name": "roberta-base",
|
||||
"proj": "linear",
|
||||
"width": 768,
|
||||
"output_tokens": true
|
||||
},
|
||||
"multimodal_cfg": {
|
||||
"context_length": 76,
|
||||
"width": 768,
|
||||
"heads": 8,
|
||||
"layers": 12
|
||||
},
|
||||
"custom_text": true
|
||||
}
|
||||
@@ -1,19 +0,0 @@
|
||||
{
|
||||
"embed_dim": 512,
|
||||
"vision_cfg": {
|
||||
"timm_model_name": "convnext_base",
|
||||
"timm_model_pretrained": false,
|
||||
"timm_pool": "",
|
||||
"timm_proj": "linear",
|
||||
"timm_drop": 0.0,
|
||||
"timm_drop_path": 0.1,
|
||||
"image_size": 224
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 512,
|
||||
"heads": 8,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,19 +0,0 @@
|
||||
{
|
||||
"embed_dim": 640,
|
||||
"vision_cfg": {
|
||||
"timm_model_name": "convnext_base",
|
||||
"timm_model_pretrained": false,
|
||||
"timm_pool": "",
|
||||
"timm_proj": "linear",
|
||||
"timm_drop": 0.0,
|
||||
"timm_drop_path": 0.1,
|
||||
"image_size": 256
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 640,
|
||||
"heads": 10,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,19 +0,0 @@
|
||||
{
|
||||
"embed_dim": 640,
|
||||
"vision_cfg": {
|
||||
"timm_model_name": "convnext_base",
|
||||
"timm_model_pretrained": false,
|
||||
"timm_pool": "",
|
||||
"timm_proj": "linear",
|
||||
"timm_drop": 0.0,
|
||||
"timm_drop_path": 0.1,
|
||||
"image_size": 320
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 640,
|
||||
"heads": 10,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,19 +0,0 @@
|
||||
{
|
||||
"embed_dim": 768,
|
||||
"vision_cfg": {
|
||||
"timm_model_name": "convnext_large",
|
||||
"timm_model_pretrained": false,
|
||||
"timm_pool": "",
|
||||
"timm_proj": "linear",
|
||||
"timm_drop": 0.0,
|
||||
"timm_drop_path": 0.1,
|
||||
"image_size": 224
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 768,
|
||||
"heads": 12,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,19 +0,0 @@
|
||||
{
|
||||
"embed_dim": 768,
|
||||
"vision_cfg": {
|
||||
"timm_model_name": "convnext_large",
|
||||
"timm_model_pretrained": false,
|
||||
"timm_pool": "",
|
||||
"timm_proj": "mlp",
|
||||
"timm_drop": 0.0,
|
||||
"timm_drop_path": 0.1,
|
||||
"image_size": 256
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 768,
|
||||
"heads": 12,
|
||||
"layers": 16
|
||||
}
|
||||
}
|
||||
@@ -1,19 +0,0 @@
|
||||
{
|
||||
"embed_dim": 768,
|
||||
"vision_cfg": {
|
||||
"timm_model_name": "convnext_large",
|
||||
"timm_model_pretrained": false,
|
||||
"timm_pool": "",
|
||||
"timm_proj": "mlp",
|
||||
"timm_drop": 0.0,
|
||||
"timm_drop_path": 0.1,
|
||||
"image_size": 320
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 768,
|
||||
"heads": 12,
|
||||
"layers": 16
|
||||
}
|
||||
}
|
||||
@@ -1,19 +0,0 @@
|
||||
{
|
||||
"embed_dim": 512,
|
||||
"vision_cfg": {
|
||||
"timm_model_name": "convnext_small",
|
||||
"timm_model_pretrained": false,
|
||||
"timm_pool": "",
|
||||
"timm_proj": "linear",
|
||||
"timm_drop": 0.0,
|
||||
"timm_drop_path": 0.1,
|
||||
"image_size": 224
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 512,
|
||||
"heads": 8,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,19 +0,0 @@
|
||||
{
|
||||
"embed_dim": 1024,
|
||||
"vision_cfg": {
|
||||
"timm_model_name": "convnext_tiny",
|
||||
"timm_model_pretrained": false,
|
||||
"timm_pool": "",
|
||||
"timm_proj": "linear",
|
||||
"timm_drop": 0.0,
|
||||
"timm_drop_path": 0.1,
|
||||
"image_size": 224
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 512,
|
||||
"heads": 8,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,19 +0,0 @@
|
||||
{
|
||||
"embed_dim": 1024,
|
||||
"vision_cfg": {
|
||||
"timm_model_name": "convnext_xlarge",
|
||||
"timm_model_pretrained": false,
|
||||
"timm_pool": "",
|
||||
"timm_proj": "linear",
|
||||
"timm_drop": 0.0,
|
||||
"timm_drop_path": 0.1,
|
||||
"image_size": 256
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 1024,
|
||||
"heads": 16,
|
||||
"layers": 20
|
||||
}
|
||||
}
|
||||
@@ -1,19 +0,0 @@
|
||||
{
|
||||
"embed_dim": 1024,
|
||||
"vision_cfg": {
|
||||
"timm_model_name": "convnext_xxlarge",
|
||||
"timm_model_pretrained": false,
|
||||
"timm_pool": "",
|
||||
"timm_proj": "linear",
|
||||
"timm_drop": 0.0,
|
||||
"timm_drop_path": 0.1,
|
||||
"image_size": 256
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 1024,
|
||||
"heads": 16,
|
||||
"layers": 24
|
||||
}
|
||||
}
|
||||
@@ -1,19 +0,0 @@
|
||||
{
|
||||
"embed_dim": 1024,
|
||||
"vision_cfg": {
|
||||
"timm_model_name": "convnext_xxlarge",
|
||||
"timm_model_pretrained": false,
|
||||
"timm_pool": "",
|
||||
"timm_proj": "linear",
|
||||
"timm_drop": 0.0,
|
||||
"timm_drop_path": 0.1,
|
||||
"image_size": 320
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 1024,
|
||||
"heads": 16,
|
||||
"layers": 24
|
||||
}
|
||||
}
|
||||
@@ -1,15 +0,0 @@
|
||||
{
|
||||
"embed_dim": 512,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 12,
|
||||
"width": 768,
|
||||
"patch_size": 32
|
||||
},
|
||||
"text_cfg": {
|
||||
"hf_model_name": "google/mt5-base",
|
||||
"hf_tokenizer_name": "google/mt5-base",
|
||||
"proj": "mlp",
|
||||
"pooler_type": "mean_pooler"
|
||||
}
|
||||
}
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"embed_dim": 1024,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 32,
|
||||
"width": 1280,
|
||||
"head_width": 80,
|
||||
"patch_size": 14
|
||||
},
|
||||
"text_cfg": {
|
||||
"hf_model_name": "google/mt5-xl",
|
||||
"hf_tokenizer_name": "google/mt5-xl",
|
||||
"proj": "mlp",
|
||||
"pooler_type": "mean_pooler"
|
||||
}
|
||||
}
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"embed_dim": 512,
|
||||
"quick_gelu": true,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 12,
|
||||
"width": 768,
|
||||
"patch_size": 32
|
||||
},
|
||||
"text_cfg": {
|
||||
"hf_model_name": "roberta-base",
|
||||
"hf_tokenizer_name": "roberta-base",
|
||||
"proj": "mlp",
|
||||
"pooler_type": "mean_pooler"
|
||||
}
|
||||
}
|
||||
@@ -1,17 +0,0 @@
|
||||
{
|
||||
"embed_dim": 640,
|
||||
"vision_cfg": {
|
||||
"timm_model_name": "swin_base_patch4_window7_224",
|
||||
"timm_model_pretrained": false,
|
||||
"timm_pool": "",
|
||||
"timm_proj": "linear",
|
||||
"image_size": 224
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 640,
|
||||
"heads": 10,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,17 +0,0 @@
|
||||
{
|
||||
"embed_dim": 512,
|
||||
"vision_cfg": {
|
||||
"timm_model_name": "vit_medium_patch16_gap_256",
|
||||
"timm_model_pretrained": false,
|
||||
"timm_pool": "",
|
||||
"timm_proj": "linear",
|
||||
"image_size": 256
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 512,
|
||||
"heads": 8,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,17 +0,0 @@
|
||||
{
|
||||
"embed_dim": 512,
|
||||
"vision_cfg": {
|
||||
"timm_model_name": "vit_relpos_medium_patch16_cls_224",
|
||||
"timm_model_pretrained": false,
|
||||
"timm_pool": "",
|
||||
"timm_proj": "linear",
|
||||
"image_size": 224
|
||||
},
|
||||
"text_cfg": {
|
||||
"context_length": 77,
|
||||
"vocab_size": 49408,
|
||||
"width": 512,
|
||||
"heads": 8,
|
||||
"layers": 12
|
||||
}
|
||||
}
|
||||
@@ -1,15 +0,0 @@
|
||||
{
|
||||
"embed_dim": 512,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 12,
|
||||
"width": 768,
|
||||
"patch_size": 32
|
||||
},
|
||||
"text_cfg": {
|
||||
"hf_model_name": "xlm-roberta-base",
|
||||
"hf_tokenizer_name": "xlm-roberta-base",
|
||||
"proj": "mlp",
|
||||
"pooler_type": "mean_pooler"
|
||||
}
|
||||
}
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"embed_dim": 1024,
|
||||
"vision_cfg": {
|
||||
"image_size": 224,
|
||||
"layers": 32,
|
||||
"width": 1280,
|
||||
"head_width": 80,
|
||||
"patch_size": 14
|
||||
},
|
||||
"text_cfg": {
|
||||
"hf_model_name": "xlm-roberta-large",
|
||||
"hf_tokenizer_name": "xlm-roberta-large",
|
||||
"proj": "mlp",
|
||||
"pooler_type": "mean_pooler"
|
||||
}
|
||||
}
|
||||
@@ -19,7 +19,10 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
|
||||
@lru_cache()
def default_bpe():
    """Return the expected location of the BPE vocabulary archive.

    The result is
    ``<root>/models/QualityMetric/bpe_simple_vocab_16e6.txt.gz``, where
    ``<root>`` is four directory levels above the folder containing this
    module. The value is cached after the first call, and the path is not
    checked for existence.

    Returns:
        str: Absolute path of ``bpe_simple_vocab_16e6.txt.gz``.
    """
    current_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.abspath(os.path.join(current_dir, '../../../../'))
    quality_metric_path = os.path.join(project_root, 'models', 'QualityMetric')
    return os.path.join(quality_metric_path, "bpe_simple_vocab_16e6.txt.gz")
|
||||
|
||||
|
||||
@lru_cache()
|
||||
|
||||
@@ -6,15 +6,15 @@ import os
|
||||
from .config import MODEL_PATHS
|
||||
|
||||
class PickScore:
|
||||
def __init__(self, device: Union[str, torch.device]):
|
||||
def __init__(self, device: Union[str, torch.device], path: str = MODEL_PATHS):
|
||||
"""Initialize the Selector with a processor and model.
|
||||
|
||||
Args:
|
||||
device (Union[str, torch.device]): The device to load the model on.
|
||||
"""
|
||||
self.device = device if isinstance(device, torch.device) else torch.device(device)
|
||||
processor_name_or_path = MODEL_PATHS.get("clip")
|
||||
model_pretrained_name_or_path = MODEL_PATHS.get("pickscore")
|
||||
processor_name_or_path = path.get("clip")
|
||||
model_pretrained_name_or_path = path.get("pickscore")
|
||||
self.processor = AutoProcessor.from_pretrained(processor_name_or_path)
|
||||
self.model = AutoModel.from_pretrained(model_pretrained_name_or_path).eval().to(self.device)
|
||||
|
||||
|
||||
@@ -0,0 +1,3 @@
|
||||
from .base_model import *
|
||||
from .clip_model import *
|
||||
from .cross_modeling import *
|
||||
@@ -4,13 +4,13 @@ from transformers import AutoTokenizer
|
||||
|
||||
from torch import nn, einsum
|
||||
|
||||
from trainer.models.base_model import BaseModelConfig
|
||||
from .base_model import BaseModelConfig
|
||||
|
||||
from transformers import CLIPConfig
|
||||
from typing import Any, Optional, Tuple, Union
|
||||
import torch
|
||||
|
||||
from trainer.models.cross_modeling import Cross_model
|
||||
from .cross_modeling import Cross_model
|
||||
|
||||
import gc
|
||||
|
||||
@@ -91,7 +91,7 @@ class XCLIPModel(HFCLIPModel):
|
||||
|
||||
@dataclass
|
||||
class ClipModelConfig(BaseModelConfig):
|
||||
_target_: str = "trainer.models.clip_model.CLIPModel"
|
||||
_target_: str = "diffsynth.extensions.QualityMetric.trainer.models.clip_model.CLIPModel"
|
||||
pretrained_model_name_or_path: str ="checkpoints/clip-vit-base-patch32"
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user