mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-04-24 23:26:15 +00:00
fix version issue of transformers (#1412)
This commit is contained in:
@@ -42,6 +42,7 @@ qwen_image_series = [
|
|||||||
"model_hash": "5722b5c873720009de96422993b15682",
|
"model_hash": "5722b5c873720009de96422993b15682",
|
||||||
"model_name": "dinov3_image_encoder",
|
"model_name": "dinov3_image_encoder",
|
||||||
"model_class": "diffsynth.models.dinov3_image_encoder.DINOv3ImageEncoder",
|
"model_class": "diffsynth.models.dinov3_image_encoder.DINOv3ImageEncoder",
|
||||||
|
"state_dict_converter": "diffsynth.utils.state_dict_converters.dino_v3.DINOv3StateDictConverter",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
# Example:
|
# Example:
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
from transformers import DINOv3ViTModel, DINOv3ViTImageProcessor
|
from transformers.models.dinov3_vit.modeling_dinov3_vit import DINOv3ViTModel, DINOv3ViTConfig
|
||||||
from transformers.models.dinov3_vit.modeling_dinov3_vit import DINOv3ViTConfig
|
from transformers import DINOv3ViTImageProcessor
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from ..core.device.npu_compatible_device import get_device_type
|
from ..core.device.npu_compatible_device import get_device_type
|
||||||
@@ -82,7 +82,7 @@ class DINOv3ImageEncoder(DINOv3ViTModel):
|
|||||||
hidden_states = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
|
hidden_states = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
|
||||||
position_embeddings = self.rope_embeddings(pixel_values)
|
position_embeddings = self.rope_embeddings(pixel_values)
|
||||||
|
|
||||||
for i, layer_module in enumerate(self.layer):
|
for i, layer_module in enumerate(self.model.layer):
|
||||||
layer_head_mask = head_mask[i] if head_mask is not None else None
|
layer_head_mask = head_mask[i] if head_mask is not None else None
|
||||||
hidden_states = layer_module(
|
hidden_states = layer_module(
|
||||||
hidden_states,
|
hidden_states,
|
||||||
|
|||||||
@@ -1,11 +1,11 @@
|
|||||||
from transformers.models.siglip.modeling_siglip import SiglipVisionTransformer, SiglipVisionConfig
|
from transformers.models.siglip.modeling_siglip import SiglipVisionModel, SiglipVisionConfig
|
||||||
from transformers import SiglipImageProcessor, Siglip2VisionModel, Siglip2VisionConfig, Siglip2ImageProcessor
|
from transformers import SiglipImageProcessor, Siglip2VisionModel, Siglip2VisionConfig, Siglip2ImageProcessor
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from diffsynth.core.device.npu_compatible_device import get_device_type
|
from diffsynth.core.device.npu_compatible_device import get_device_type
|
||||||
|
|
||||||
|
|
||||||
class Siglip2ImageEncoder(SiglipVisionTransformer):
|
class Siglip2ImageEncoder(SiglipVisionModel):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
config = SiglipVisionConfig(
|
config = SiglipVisionConfig(
|
||||||
attention_dropout = 0.0,
|
attention_dropout = 0.0,
|
||||||
|
|||||||
9
diffsynth/utils/state_dict_converters/dino_v3.py
Normal file
9
diffsynth/utils/state_dict_converters/dino_v3.py
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
def DINOv3StateDictConverter(state_dict):
    """Remap a DINOv3 checkpoint's keys for loading into the wrapped model.

    Keys starting with ``"layer"`` (e.g. ``"layer.0.attention..."``) are
    prefixed with ``"model."`` so they land under the ``model`` submodule;
    all other keys pass through unchanged. Values are forwarded as-is
    (no copies, no tensor modification).

    Args:
        state_dict: mapping of parameter names to tensors/values.

    Returns:
        A new dict with remapped keys; the input dict is not mutated.
    """
    # NOTE(review): startswith("layer") also matches keys such as
    # "layernorm.*" — presumably intentional for this checkpoint layout;
    # confirm against the DINOv3 parameter names before tightening.
    return {
        ("model." + key if key.startswith("layer") else key): value
        for key, value in state_dict.items()
    }
|
||||||
Reference in New Issue
Block a user