fix version issue of transformers (#1412)

This commit is contained in:
Zhongjie Duan
2026-04-24 17:01:08 +08:00
committed by GitHub
parent b1af4af8a9
commit 5b66f223b6
4 changed files with 15 additions and 5 deletions

View File

@@ -42,6 +42,7 @@ qwen_image_series = [
"model_hash": "5722b5c873720009de96422993b15682", "model_hash": "5722b5c873720009de96422993b15682",
"model_name": "dinov3_image_encoder", "model_name": "dinov3_image_encoder",
"model_class": "diffsynth.models.dinov3_image_encoder.DINOv3ImageEncoder", "model_class": "diffsynth.models.dinov3_image_encoder.DINOv3ImageEncoder",
"state_dict_converter": "diffsynth.utils.state_dict_converters.dino_v3.DINOv3StateDictConverter",
}, },
{ {
# Example: # Example:

View File

@@ -1,5 +1,5 @@
from transformers import DINOv3ViTModel, DINOv3ViTImageProcessor from transformers.models.dinov3_vit.modeling_dinov3_vit import DINOv3ViTModel, DINOv3ViTConfig
from transformers.models.dinov3_vit.modeling_dinov3_vit import DINOv3ViTConfig from transformers import DINOv3ViTImageProcessor
import torch import torch
from ..core.device.npu_compatible_device import get_device_type from ..core.device.npu_compatible_device import get_device_type
@@ -82,7 +82,7 @@ class DINOv3ImageEncoder(DINOv3ViTModel):
hidden_states = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos) hidden_states = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
position_embeddings = self.rope_embeddings(pixel_values) position_embeddings = self.rope_embeddings(pixel_values)
for i, layer_module in enumerate(self.layer): for i, layer_module in enumerate(self.model.layer):
layer_head_mask = head_mask[i] if head_mask is not None else None layer_head_mask = head_mask[i] if head_mask is not None else None
hidden_states = layer_module( hidden_states = layer_module(
hidden_states, hidden_states,

View File

@@ -1,11 +1,11 @@
from transformers.models.siglip.modeling_siglip import SiglipVisionTransformer, SiglipVisionConfig from transformers.models.siglip.modeling_siglip import SiglipVisionModel, SiglipVisionConfig
from transformers import SiglipImageProcessor, Siglip2VisionModel, Siglip2VisionConfig, Siglip2ImageProcessor from transformers import SiglipImageProcessor, Siglip2VisionModel, Siglip2VisionConfig, Siglip2ImageProcessor
import torch import torch
from diffsynth.core.device.npu_compatible_device import get_device_type from diffsynth.core.device.npu_compatible_device import get_device_type
class Siglip2ImageEncoder(SiglipVisionTransformer): class Siglip2ImageEncoder(SiglipVisionModel):
def __init__(self): def __init__(self):
config = SiglipVisionConfig( config = SiglipVisionConfig(
attention_dropout = 0.0, attention_dropout = 0.0,

View File

@@ -0,0 +1,9 @@
def DINOv3StateDictConverter(state_dict):
    """Remap a DINOv3 checkpoint's keys for newer transformers layouts.

    Newer ``transformers`` versions nest the DINOv3 ViT transformer layers
    under a ``model.`` attribute, so checkpoint keys that start with
    ``"layer"`` must be prefixed accordingly.

    Args:
        state_dict: Mapping of parameter names to tensors (or any values);
            values are passed through untouched.

    Returns:
        A new dict where every key starting with ``"layer"`` is prefixed
        with ``"model."`` and all other keys are kept unchanged.
        NOTE: the prefix test is ``str.startswith``, so keys like
        ``"layernorm..."`` are also remapped — this matches the original
        converter's behavior.
    """
    return {
        ("model." + key if key.startswith("layer") else key): value
        for key, value in state_dict.items()
    }