add new quality metric: configurable model paths, safetensors MPS checkpoint, remove unused open_clip model configs

2026-03-19 06:39:43 +00:00 · 2025-02-17 14:42:20 +08:00
parent 77d0f4d297
commit 991ba162bd
69 changed files with 88 additions and 1461 deletions
--- a/diffsynth/extensions/QualityMetric/BLIP/blip.py
+++ b/diffsynth/extensions/QualityMetric/BLIP/blip.py
@@ -13,8 +13,16 @@ from transformers import BertTokenizer
 from .vit import VisionTransformer, interpolate_pos_embed


+def default_bert():
+    """Return <project_root>/models/QualityMetric/bert-base-uncased, where project_root is four levels above this file's directory."""
+    here = os.path.dirname(os.path.abspath(__file__))
+    root = os.path.abspath(os.path.join(here, '../../../../'))
+    return os.path.join(root, 'models', 'QualityMetric', "bert-base-uncased")
+
+bert_model_path = default_bert()
+
 def init_tokenizer():
-    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+    tokenizer = BertTokenizer.from_pretrained(bert_model_path)
    tokenizer.add_special_tokens({'bos_token':'[DEC]'})
    tokenizer.add_special_tokens({'additional_special_tokens':['[ENC]']})       
    tokenizer.enc_token_id = tokenizer.additional_special_tokens_ids[0]  
--- a/diffsynth/extensions/QualityMetric/aesthetic.py
+++ b/diffsynth/extensions/QualityMetric/aesthetic.py
@@ -50,31 +50,30 @@ class MLP(torch.nn.Module):


 class AestheticScore:
-    def __init__(self, device: torch.device, model_path: str = MODEL_PATHS.get("aesthetic_predictor")):
+    def __init__(self, device: torch.device, path: str = MODEL_PATHS):
        """Initialize the Selector with a model and processor.

        Args:
            device (torch.device): The device to load the model on.
-            model_path (str): Path to the model weights file.
        """
        self.device = device
-
+        self.aes_model_path = path.get("aesthetic_predictor")
        # Load the MLP model
        self.model = MLP(768)
        try:
-            if model_path.endswith(".safetensors"):
-                state_dict = load_file(model_path)
+            if self.aes_model_path.endswith(".safetensors"):
+                state_dict = load_file(self.aes_model_path)
            else:
-                state_dict = torch.load(model_path)
+                state_dict = torch.load(self.aes_model_path)
            self.model.load_state_dict(state_dict)
        except Exception as e:
-            raise ValueError(f"Error loading model weights from {model_path}: {e}")
+            raise ValueError(f"Error loading model weights from {self.aes_model_path}: {e}")

        self.model.to(device)
        self.model.eval()

        # Load the CLIP model and processor
-        clip_model_name = MODEL_PATHS.get('clip-large')
+        clip_model_name = path.get('clip-large')
        self.model2 = AutoModel.from_pretrained(clip_model_name).eval().to(device)
        self.processor = AutoProcessor.from_pretrained(clip_model_name)

--- a/diffsynth/extensions/QualityMetric/clip.py
+++ b/diffsynth/extensions/QualityMetric/clip.py
@@ -5,7 +5,7 @@ from .open_clip import create_model_and_transforms, get_tokenizer
 from .config import MODEL_PATHS

 class CLIPScore:
-    def __init__(self, device: torch.device):
+    def __init__(self, device: torch.device, path: str = MODEL_PATHS):
        """Initialize the CLIPScore with a model and tokenizer.
        
        Args:
@@ -17,7 +17,7 @@ class CLIPScore:
        self.model, _, self.preprocess_val = create_model_and_transforms(
            "ViT-H-14",
            # "laion2B-s32B-b79K",
-            pretrained=MODEL_PATHS.get("open_clip"),
+            pretrained=path.get("open_clip"),
            precision="amp",
            device=device,
            jit=False,
--- a/diffsynth/extensions/QualityMetric/config.py
+++ b/diffsynth/extensions/QualityMetric/config.py
@@ -2,11 +2,11 @@ import os

 current_dir = os.path.dirname(os.path.abspath(__file__))
 project_root = os.path.abspath(os.path.join(current_dir, '../../../'))
-quality_metric_path = os.path.join(project_root, 'models', 'QualityMetric')
+model_path = os.path.join(project_root, 'models', 'QualityMetric')


 def get_model_path(model_name):
-    return os.path.join(quality_metric_path, model_name)
+    return os.path.join(model_path, model_name)  # joins model_name onto <project_root>/models/QualityMetric


 MODEL_PATHS = {
@@ -18,6 +18,6 @@ MODEL_PATHS = {
    "med_config": get_model_path("ImageReward/med_config.json"),
    "clip": get_model_path("CLIP-ViT-H-14-laion2B-s32B-b79K"),
    "clip-large": get_model_path("clip-vit-large-patch14"),
-    "mps": get_model_path("MPS_overall_checkpoint/MPS_overall_checkpoint_diffsynth.pth"),
+    "mps": get_model_path("MPS_overall_checkpoint/MPS_overall_checkpoint_diffsynth.safetensors"),
    "pickscore": get_model_path("PickScore_v1")
 }
--- a/diffsynth/extensions/QualityMetric/hps.py
+++ b/diffsynth/extensions/QualityMetric/hps.py
@@ -7,7 +7,7 @@ import os
 from .config import MODEL_PATHS

 class HPScore_v2:
-    def __init__(self, device: torch.device, model_version: str = "v2"):
+    def __init__(self, device: torch.device, path: str = MODEL_PATHS, model_version: str = "v2"):
        """Initialize the Selector with a model and tokenizer.

        Args:
@@ -17,9 +17,9 @@ class HPScore_v2:
        self.device = device

        if model_version == "v2":
-            safetensors_path = MODEL_PATHS.get("hpsv2")
+            safetensors_path = path.get("hpsv2")
        elif model_version == "v21":
-            safetensors_path = MODEL_PATHS.get("hpsv2.1")
+            safetensors_path = path.get("hpsv2.1")
        else:
            raise ValueError(f"Unsupported model version: {model_version}. Choose 'v2' or 'v21'.")

@@ -27,7 +27,7 @@ class HPScore_v2:
        model, _, self.preprocess_val = create_model_and_transforms(
            "ViT-H-14",
            # "laion2B-s32B-b79K",
-            pretrained=MODEL_PATHS.get("open_clip"),
+            pretrained=path.get("open_clip"),
            precision="amp",
            device=device,
            jit=False,
--- a/diffsynth/extensions/QualityMetric/imagereward.py
+++ b/diffsynth/extensions/QualityMetric/imagereward.py
@@ -188,15 +188,15 @@ class ImageReward(torch.nn.Module):


 class ImageRewardScore:
-    def __init__(self, device: Union[str, torch.device]):
+    def __init__(self, device: Union[str, torch.device], path: str = MODEL_PATHS):
        """Initialize the Selector with a processor and model.

        Args:
            device (Union[str, torch.device]): The device to load the model on.
        """
        self.device = device if isinstance(device, torch.device) else torch.device(device)
-        model_path = MODEL_PATHS.get("imagereward")
-        med_config = MODEL_PATHS.get("med_config")
+        model_path = path.get("imagereward")
+        med_config = path.get("med_config")
        state_dict = load_file(model_path)
        self.model = ImageReward(device=self.device, med_config=med_config).to(self.device)
        self.model.load_state_dict(state_dict, strict=False)
--- a/diffsynth/extensions/QualityMetric/mps.py
+++ b/diffsynth/extensions/QualityMetric/mps.py
@@ -4,10 +4,10 @@ from PIL import Image
 from io import BytesIO
 from tqdm.auto import tqdm
 from transformers import CLIPFeatureExtractor, CLIPImageProcessor
-
+from transformers import CLIPConfig
 from dataclasses import dataclass
 from transformers import CLIPModel as HFCLIPModel
-
+from safetensors.torch import load_file
 from torch import nn, einsum

 from .trainer.models.base_model import BaseModelConfig
@@ -18,26 +18,27 @@ from typing import Any, Optional, Tuple, Union, List
 import torch

 from .trainer.models.cross_modeling import Cross_model
+from .trainer.models import clip_model
 import torch.nn.functional as F
-
 import gc
 import json
 from .config import MODEL_PATHS

 class MPScore:
-    def __init__(self, device: Union[str, torch.device], condition: str = 'overall'):
+    def __init__(self, device: Union[str, torch.device], path: dict = MODEL_PATHS, condition: str = 'overall'):
        """Initialize the MPSModel with a processor, tokenizer, and model.

        Args:
            device (Union[str, torch.device]): The device to load the model on.
+            path (dict): Model locations; the "clip" and "mps" entries are read. Defaults to MODEL_PATHS.
        """
        self.device = device
-        processor_name_or_path = MODEL_PATHS.get("clip")
+        processor_name_or_path = path.get("clip")
        self.image_processor = CLIPImageProcessor.from_pretrained(processor_name_or_path)
        self.tokenizer = AutoTokenizer.from_pretrained(processor_name_or_path, trust_remote_code=True)
-
-        model_ckpt_path = MODEL_PATHS.get("mps")
-        self.model = torch.load(model_ckpt_path).eval().to(device)
+        self.model = clip_model.CLIPModel(processor_name_or_path)
+        self.model.load_state_dict(load_file(path.get("mps")), strict=False)  # NOTE(review): strict=False hides key mismatches; confirm none are unexpected
+        self.model.eval().to(device)  # keep the eval() call that the replaced torch.load(...) line applied
        self.condition = condition

    def _calculate_score(self, image: torch.Tensor, prompt: str) -> float:
--- a/diffsynth/extensions/QualityMetric/open_clip/bpe_simple_vocab_16e6.txt.gz
+++ b/diffsynth/extensions/QualityMetric/open_clip/bpe_simple_vocab_16e6.txt.gz
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/RN101-quickgelu.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/RN101-quickgelu.json
@@ -1,22 +0,0 @@
-{
-    "embed_dim": 512,
-    "quick_gelu": true,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": [
-            3,
-            4,
-            23,
-            3
-        ],
-        "width": 64,
-        "patch_size": null
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 512,
-        "heads": 8,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/RN101.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/RN101.json
@@ -1,21 +0,0 @@
-{
-    "embed_dim": 512,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": [
-            3,
-            4,
-            23,
-            3
-        ],
-        "width": 64,
-        "patch_size": null
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 512,
-        "heads": 8,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/RN50-quickgelu.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/RN50-quickgelu.json
@@ -1,22 +0,0 @@
-{
-    "embed_dim": 1024,
-    "quick_gelu": true,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": [
-            3,
-            4,
-            6,
-            3
-        ],
-        "width": 64,
-        "patch_size": null
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 512,
-        "heads": 8,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/RN50.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/RN50.json
@@ -1,21 +0,0 @@
-{
-    "embed_dim": 1024,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": [
-            3,
-            4,
-            6,
-            3
-        ],
-        "width": 64,
-        "patch_size": null
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 512,
-        "heads": 8,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/RN50x16.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/RN50x16.json
@@ -1,21 +0,0 @@
-{
-    "embed_dim": 768,
-    "vision_cfg": {
-        "image_size": 384,
-        "layers": [
-            6,
-            8,
-            18,
-            8
-        ],
-        "width": 96,
-        "patch_size": null
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 768,
-        "heads": 12,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/RN50x4.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/RN50x4.json
@@ -1,21 +0,0 @@
-{
-    "embed_dim": 640,
-    "vision_cfg": {
-        "image_size": 288,
-        "layers": [
-            4,
-            6,
-            10,
-            6
-        ],
-        "width": 80,
-        "patch_size": null
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 640,
-        "heads": 10,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/RN50x64.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/RN50x64.json
@@ -1,21 +0,0 @@
-{
-    "embed_dim": 1024,
-    "vision_cfg": {
-        "image_size": 448,
-        "layers": [
-            3,
-            15,
-            36,
-            10
-        ],
-        "width": 128,
-        "patch_size": null
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 1024,
-        "heads": 16,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-B-16-plus-240.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-B-16-plus-240.json
@@ -1,16 +0,0 @@
-{
-    "embed_dim": 640,
-    "vision_cfg": {
-        "image_size": 240,
-        "layers": 12,
-        "width": 896,
-        "patch_size": 16
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 640,
-        "heads": 10,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-B-16-plus.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-B-16-plus.json
@@ -1,16 +0,0 @@
-{
-    "embed_dim": 640,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 12,
-        "width": 896,
-        "patch_size": 16
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 640,
-        "heads": 10,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-B-16.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-B-16.json
@@ -1,16 +0,0 @@
-{
-    "embed_dim": 512,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 12,
-        "width": 768,
-        "patch_size": 16
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 512,
-        "heads": 8,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-B-32-plus-256.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-B-32-plus-256.json
@@ -1,16 +0,0 @@
-{
-    "embed_dim": 640,
-    "vision_cfg": {
-        "image_size": 256,
-        "layers": 12,
-        "width": 896,
-        "patch_size": 32
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 640,
-        "heads": 10,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-B-32-quickgelu.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-B-32-quickgelu.json
@@ -1,17 +0,0 @@
-{
-    "embed_dim": 512,
-    "quick_gelu": true,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 12,
-        "width": 768,
-        "patch_size": 32
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 512,
-        "heads": 8,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-B-32.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-B-32.json
@@ -1,16 +0,0 @@
-{
-    "embed_dim": 512,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 12,
-        "width": 768,
-        "patch_size": 32
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 512,
-        "heads": 8,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-H-16.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-H-16.json
@@ -1,17 +0,0 @@
-{
-    "embed_dim": 1024,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 32,
-        "width": 1280,
-        "head_width": 80,
-        "patch_size": 16
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 1024,
-        "heads": 16,
-        "layers": 24
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-L-14-280.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-L-14-280.json
@@ -1,16 +0,0 @@
-{
-    "embed_dim": 768,
-    "vision_cfg": {
-        "image_size": 280,
-        "layers": 24,
-        "width": 1024,
-        "patch_size": 14
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 768,
-        "heads": 12,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-L-14-336.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-L-14-336.json
@@ -1,16 +0,0 @@
-{
-    "embed_dim": 768,
-    "vision_cfg": {
-        "image_size": 336,
-        "layers": 24,
-        "width": 1024,
-        "patch_size": 14
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 768,
-        "heads": 12,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-L-14.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-L-14.json
@@ -1,16 +0,0 @@
-{
-    "embed_dim": 768,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 24,
-        "width": 1024,
-        "patch_size": 14
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 768,
-        "heads": 12,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-L-16-320.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-L-16-320.json
@@ -1,16 +0,0 @@
-{
-    "embed_dim": 768,
-    "vision_cfg": {
-        "image_size": 320,
-        "layers": 24,
-        "width": 1024,
-        "patch_size": 16
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 768,
-        "heads": 12,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-L-16.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-L-16.json
@@ -1,16 +0,0 @@
-{
-    "embed_dim": 768,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 24,
-        "width": 1024,
-        "patch_size": 16
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 768,
-        "heads": 12,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-M-16-alt.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-M-16-alt.json
@@ -1,17 +0,0 @@
-{
-    "embed_dim": 384,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 12,
-        "width": 512,
-        "patch_size": 16,
-        "ls_init_value": 1e-4
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 384,
-        "heads": 6,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-M-16.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-M-16.json
@@ -1,16 +0,0 @@
-{
-    "embed_dim": 512,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 12,
-        "width": 512,
-        "patch_size": 16
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 512,
-        "heads": 8,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-M-32-alt.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-M-32-alt.json
@@ -1,16 +0,0 @@
-{
-    "embed_dim": 384,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 12,
-        "width": 512,
-        "patch_size": 32
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 384,
-        "heads": 6,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-M-32.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-M-32.json
@@ -1,16 +0,0 @@
-{
-    "embed_dim": 512,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 12,
-        "width": 512,
-        "patch_size": 32
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 512,
-        "heads": 8,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-S-16-alt.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-S-16-alt.json
@@ -1,16 +0,0 @@
-{
-    "embed_dim": 256,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 12,
-        "width": 384,
-        "patch_size": 16
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 256,
-        "heads": 4,
-        "layers": 10
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-S-16.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-S-16.json
@@ -1,16 +0,0 @@
-{
-    "embed_dim": 384,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 12,
-        "width": 384,
-        "patch_size": 16
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 384,
-        "heads": 6,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-S-32-alt.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-S-32-alt.json
@@ -1,16 +0,0 @@
-{
-    "embed_dim": 256,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 12,
-        "width": 384,
-        "patch_size": 32
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 256,
-        "heads": 4,
-        "layers": 10
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-S-32.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-S-32.json
@@ -1,16 +0,0 @@
-{
-    "embed_dim": 384,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 12,
-        "width": 384,
-        "patch_size": 32
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 384,
-        "heads": 6,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-bigG-14.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-bigG-14.json
@@ -1,18 +0,0 @@
-{
-    "embed_dim": 1280,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 48,
-        "width": 1664,
-        "head_width": 104,
-        "mlp_ratio": 4.9231,
-        "patch_size": 14
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 1280,
-        "heads": 20,
-        "layers": 32
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-e-14.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-e-14.json
@@ -1,18 +0,0 @@
-{
-    "embed_dim": 1280,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 56,
-        "width": 1792,
-        "head_width": 112,
-        "mlp_ratio": 8.5715,
-        "patch_size": 14
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 1280,
-        "heads": 20,
-        "layers": 36
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-g-14.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/ViT-g-14.json
@@ -1,18 +0,0 @@
-{
-    "embed_dim": 1024,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 40,
-        "width": 1408,
-        "head_width": 88,
-        "mlp_ratio": 4.3637,
-        "patch_size": 14
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 1024,
-        "heads": 16,
-        "layers": 24
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/coca_ViT-B-32.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/coca_ViT-B-32.json
@@ -1,30 +0,0 @@
-{
-    "embed_dim": 512,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 12,
-        "width": 768,
-        "patch_size": 32,
-        "attentional_pool": true,
-        "attn_pooler_heads": 8,
-        "output_tokens": true
-    },
-    "text_cfg": {
-        "context_length": 76,
-        "vocab_size": 49408,
-        "width": 512,
-        "heads": 8,
-        "layers": 12,
-        "embed_cls": true,
-        "output_tokens": true
-    },
-    "multimodal_cfg": {
-        "context_length": 76,
-        "vocab_size": 49408,
-        "width": 512,
-        "heads": 8,
-        "layers": 12,
-        "attn_pooler_heads": 8
-    },
-    "custom_text": true
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/coca_ViT-L-14.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/coca_ViT-L-14.json
@@ -1,30 +0,0 @@
-{
-    "embed_dim": 768,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 24,
-        "width": 1024,
-        "patch_size": 14,
-        "attentional_pool": true,
-        "attn_pooler_heads": 8,
-        "output_tokens": true
-    },
-    "text_cfg": {
-        "context_length": 76,
-        "vocab_size": 49408,
-        "width": 768,
-        "heads": 12,
-        "layers": 12,
-        "embed_cls": true,
-        "output_tokens": true
-    },
-    "multimodal_cfg": {
-        "context_length": 76,
-        "vocab_size": 49408,
-        "width": 768,
-        "heads": 12,
-        "layers": 12,
-        "attn_pooler_heads": 12
-    },
-    "custom_text": true
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/coca_base.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/coca_base.json
@@ -1,31 +0,0 @@
-{
-    "embed_dim": 512,
-    "multimodal_cfg": {
-        "width": 768,
-        "context_length": 76,
-        "vocab_size": 64000,
-        "mlp_ratio": 4,
-        "layers": 12,
-        "dim_head": 64,
-        "heads": 12,
-        "n_queries": 256,
-        "attn_pooler_heads": 8
-    },
-    "vision_cfg": {
-        "image_size": 288,
-        "layers": 12,
-        "width": 768,
-        "patch_size": 18,
-        "output_tokens": true
-    },
-    "text_cfg": {
-        "context_length": 76,
-        "vocab_size": 64000,
-        "layers": 12,
-        "heads": 12,
-        "width": 768,
-        "embed_cls": true,
-        "output_tokens": true
-    },
-    "custom_text": true
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/coca_roberta-ViT-B-32.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/coca_roberta-ViT-B-32.json
@@ -1,24 +0,0 @@
-{
-    "embed_dim": 512,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 12,
-        "width": 768,
-        "patch_size": 32,
-        "output_tokens": true
-    },
-    "text_cfg": {
-        "hf_model_name": "roberta-base",
-        "hf_tokenizer_name": "roberta-base",
-        "proj": "linear",
-        "width": 768,
-        "output_tokens": true
-    },
-    "multimodal_cfg": {
-        "context_length": 76,
-        "width": 768,
-        "heads": 8,
-        "layers": 12
-    },
-    "custom_text": true
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/convnext_base.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/convnext_base.json
@@ -1,19 +0,0 @@
-{
-    "embed_dim": 512,
-    "vision_cfg": {
-        "timm_model_name": "convnext_base",
-        "timm_model_pretrained": false,
-        "timm_pool": "",
-        "timm_proj": "linear",
-        "timm_drop": 0.0,
-        "timm_drop_path": 0.1,
-        "image_size": 224
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 512,
-        "heads": 8,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/convnext_base_w.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/convnext_base_w.json
@@ -1,19 +0,0 @@
-{
-    "embed_dim": 640,
-    "vision_cfg": {
-        "timm_model_name": "convnext_base",
-        "timm_model_pretrained": false,
-        "timm_pool": "",
-        "timm_proj": "linear",
-        "timm_drop": 0.0,
-        "timm_drop_path": 0.1,
-        "image_size": 256
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 640,
-        "heads": 10,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/convnext_base_w_320.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/convnext_base_w_320.json
@@ -1,19 +0,0 @@
-{
-    "embed_dim": 640,
-    "vision_cfg": {
-        "timm_model_name": "convnext_base",
-        "timm_model_pretrained": false,
-        "timm_pool": "",
-        "timm_proj": "linear",
-        "timm_drop": 0.0,
-        "timm_drop_path": 0.1,
-        "image_size": 320
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 640,
-        "heads": 10,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/convnext_large.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/convnext_large.json
@@ -1,19 +0,0 @@
-{
-    "embed_dim": 768,
-    "vision_cfg": {
-        "timm_model_name": "convnext_large",
-        "timm_model_pretrained": false,
-        "timm_pool": "",
-        "timm_proj": "linear",
-        "timm_drop": 0.0,
-        "timm_drop_path": 0.1,
-        "image_size": 224
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 768,
-        "heads": 12,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/convnext_large_d.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/convnext_large_d.json
@@ -1,19 +0,0 @@
-{
-    "embed_dim": 768,
-    "vision_cfg": {
-        "timm_model_name": "convnext_large",
-        "timm_model_pretrained": false,
-        "timm_pool": "",
-        "timm_proj": "mlp",
-        "timm_drop": 0.0,
-        "timm_drop_path": 0.1,
-        "image_size": 256
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 768,
-        "heads": 12,
-        "layers": 16
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/convnext_large_d_320.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/convnext_large_d_320.json
@@ -1,19 +0,0 @@
-{
-    "embed_dim": 768,
-    "vision_cfg": {
-        "timm_model_name": "convnext_large",
-        "timm_model_pretrained": false,
-        "timm_pool": "",
-        "timm_proj": "mlp",
-        "timm_drop": 0.0,
-        "timm_drop_path": 0.1,
-        "image_size": 320
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 768,
-        "heads": 12,
-        "layers": 16
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/convnext_small.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/convnext_small.json
@@ -1,19 +0,0 @@
-{
-    "embed_dim": 512,
-    "vision_cfg": {
-        "timm_model_name": "convnext_small",
-        "timm_model_pretrained": false,
-        "timm_pool": "",
-        "timm_proj": "linear",
-        "timm_drop": 0.0,
-        "timm_drop_path": 0.1,
-        "image_size": 224
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 512,
-        "heads": 8,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/convnext_tiny.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/convnext_tiny.json
@@ -1,19 +0,0 @@
-{
-    "embed_dim": 1024,
-    "vision_cfg": {
-        "timm_model_name": "convnext_tiny",
-        "timm_model_pretrained": false,
-        "timm_pool": "",
-        "timm_proj": "linear",
-        "timm_drop": 0.0,
-        "timm_drop_path": 0.1,
-        "image_size": 224
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 512,
-        "heads": 8,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/convnext_xlarge.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/convnext_xlarge.json
@@ -1,19 +0,0 @@
-{
-    "embed_dim": 1024,
-    "vision_cfg": {
-        "timm_model_name": "convnext_xlarge",
-        "timm_model_pretrained": false,
-        "timm_pool": "",
-        "timm_proj": "linear",
-        "timm_drop": 0.0,
-        "timm_drop_path": 0.1,
-        "image_size": 256
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 1024,
-        "heads": 16,
-        "layers": 20
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/convnext_xxlarge.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/convnext_xxlarge.json
@@ -1,19 +0,0 @@
-{
-    "embed_dim": 1024,
-    "vision_cfg": {
-        "timm_model_name": "convnext_xxlarge",
-        "timm_model_pretrained": false,
-        "timm_pool": "",
-        "timm_proj": "linear",
-        "timm_drop": 0.0,
-        "timm_drop_path": 0.1,
-        "image_size": 256
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 1024,
-        "heads": 16,
-        "layers": 24
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/convnext_xxlarge_320.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/convnext_xxlarge_320.json
@@ -1,19 +0,0 @@
-{
-    "embed_dim": 1024,
-    "vision_cfg": {
-        "timm_model_name": "convnext_xxlarge",
-        "timm_model_pretrained": false,
-        "timm_pool": "",
-        "timm_proj": "linear",
-        "timm_drop": 0.0,
-        "timm_drop_path": 0.1,
-        "image_size": 320
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 1024,
-        "heads": 16,
-        "layers": 24
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/mt5-base-ViT-B-32.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/mt5-base-ViT-B-32.json
@@ -1,15 +0,0 @@
-{
-    "embed_dim": 512,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 12,
-        "width": 768,
-        "patch_size": 32
-    },
-    "text_cfg": {
-        "hf_model_name": "google/mt5-base",
-        "hf_tokenizer_name": "google/mt5-base",
-        "proj": "mlp",
-        "pooler_type": "mean_pooler"
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/mt5-xl-ViT-H-14.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/mt5-xl-ViT-H-14.json
@@ -1,16 +0,0 @@
-{
-    "embed_dim": 1024,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 32,
-        "width": 1280,
-        "head_width": 80,
-        "patch_size": 14
-    },
-    "text_cfg": {
-        "hf_model_name": "google/mt5-xl",
-        "hf_tokenizer_name": "google/mt5-xl",
-        "proj": "mlp",
-        "pooler_type": "mean_pooler"
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/roberta-ViT-B-32.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/roberta-ViT-B-32.json
@@ -1,16 +0,0 @@
-{
-    "embed_dim": 512,
-    "quick_gelu": true,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 12,
-        "width": 768,
-        "patch_size": 32
-    },
-    "text_cfg": {
-        "hf_model_name": "roberta-base",
-        "hf_tokenizer_name": "roberta-base",
-        "proj": "mlp",
-        "pooler_type": "mean_pooler"
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/swin_base_patch4_window7_224.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/swin_base_patch4_window7_224.json
@@ -1,17 +0,0 @@
-{
-    "embed_dim": 640,
-    "vision_cfg": {
-        "timm_model_name": "swin_base_patch4_window7_224",
-        "timm_model_pretrained": false,
-        "timm_pool": "",
-        "timm_proj": "linear",
-        "image_size": 224
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 640,
-        "heads": 10,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/vit_medium_patch16_gap_256.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/vit_medium_patch16_gap_256.json
@@ -1,17 +0,0 @@
-{
-    "embed_dim": 512,
-    "vision_cfg": {
-        "timm_model_name": "vit_medium_patch16_gap_256",
-        "timm_model_pretrained": false,
-        "timm_pool": "",
-        "timm_proj": "linear",
-        "image_size": 256
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 512,
-        "heads": 8,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/vit_relpos_medium_patch16_cls_224.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/vit_relpos_medium_patch16_cls_224.json
@@ -1,17 +0,0 @@
-{
-    "embed_dim": 512,
-    "vision_cfg": {
-        "timm_model_name": "vit_relpos_medium_patch16_cls_224",
-        "timm_model_pretrained": false,
-        "timm_pool": "",
-        "timm_proj": "linear",
-        "image_size": 224
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 512,
-        "heads": 8,
-        "layers": 12
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/xlm-roberta-base-ViT-B-32.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/xlm-roberta-base-ViT-B-32.json
@@ -1,15 +0,0 @@
-{
-    "embed_dim": 512,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 12,
-        "width": 768,
-        "patch_size": 32
-    },
-    "text_cfg": {
-        "hf_model_name": "xlm-roberta-base",
-        "hf_tokenizer_name": "xlm-roberta-base",
-        "proj": "mlp",
-        "pooler_type": "mean_pooler"
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/model_configs/xlm-roberta-large-ViT-H-14.json
+++ b/diffsynth/extensions/QualityMetric/open_clip/model_configs/xlm-roberta-large-ViT-H-14.json
@@ -1,16 +0,0 @@
-{
-    "embed_dim": 1024,
-    "vision_cfg": {
-        "image_size": 224,
-        "layers": 32,
-        "width": 1280,
-        "head_width": 80,
-        "patch_size": 14
-    },
-    "text_cfg": {
-        "hf_model_name": "xlm-roberta-large",
-        "hf_tokenizer_name": "xlm-roberta-large",
-        "proj": "mlp",
-        "pooler_type": "mean_pooler"
-    }
-}
--- a/diffsynth/extensions/QualityMetric/open_clip/tokenizer.py
+++ b/diffsynth/extensions/QualityMetric/open_clip/tokenizer.py
@@ -19,7 +19,10 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"

 @lru_cache()
 def default_bpe():
-    return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    project_root = os.path.abspath(os.path.join(current_dir, '../../../../'))
+    quality_metric_path = os.path.join(project_root, 'models', 'QualityMetric')
+    return os.path.join(quality_metric_path, "bpe_simple_vocab_16e6.txt.gz")


 @lru_cache()
--- a/diffsynth/extensions/QualityMetric/pickscore.py
+++ b/diffsynth/extensions/QualityMetric/pickscore.py
@@ -6,15 +6,15 @@ import os
 from .config import MODEL_PATHS

 class PickScore:
-    def __init__(self, device: Union[str, torch.device]):
+    def __init__(self, device: Union[str, torch.device], path: str = MODEL_PATHS):
        """Initialize the Selector with a processor and model.

        Args:
            device (Union[str, torch.device]): The device to load the model on.
        """
        self.device = device if isinstance(device, torch.device) else torch.device(device)
-        processor_name_or_path = MODEL_PATHS.get("clip")
-        model_pretrained_name_or_path = MODEL_PATHS.get("pickscore")
+        processor_name_or_path = path.get("clip")
+        model_pretrained_name_or_path = path.get("pickscore")
        self.processor = AutoProcessor.from_pretrained(processor_name_or_path)
        self.model = AutoModel.from_pretrained(model_pretrained_name_or_path).eval().to(self.device)

--- a/diffsynth/extensions/QualityMetric/trainer/models/__init__.py
+++ b/diffsynth/extensions/QualityMetric/trainer/models/__init__.py
@@ -0,0 +1,3 @@
+from .base_model import *
+from .clip_model import *
+from .cross_modeling import *
--- a/diffsynth/extensions/QualityMetric/trainer/models/clip_model.py
+++ b/diffsynth/extensions/QualityMetric/trainer/models/clip_model.py
@@ -4,13 +4,13 @@ from transformers import AutoTokenizer

 from torch import nn, einsum

-from trainer.models.base_model import BaseModelConfig
+from .base_model import BaseModelConfig

 from transformers import CLIPConfig
 from typing import Any, Optional, Tuple, Union
 import torch

-from trainer.models.cross_modeling import Cross_model
+from .cross_modeling import Cross_model

 import gc

@@ -91,7 +91,7 @@ class XCLIPModel(HFCLIPModel):

 @dataclass
 class ClipModelConfig(BaseModelConfig):
-    _target_: str = "trainer.models.clip_model.CLIPModel"
+    _target_: str = "diffsynth.extensions.QualityMetric.trainer.models.clip_model.CLIPModel"
    pretrained_model_name_or_path: str ="checkpoints/clip-vit-base-patch32"