mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-03-18 22:08:13 +00:00
support i2L
This commit is contained in:
70
diffsynth/models/siglip2_image_encoder.py
Normal file
70
diffsynth/models/siglip2_image_encoder.py
Normal file
@@ -0,0 +1,70 @@
|
||||
from transformers.models.siglip.modeling_siglip import SiglipVisionTransformer, SiglipVisionConfig
|
||||
from transformers import SiglipImageProcessor
|
||||
import torch
|
||||
|
||||
|
||||
class Siglip2ImageEncoder(SiglipVisionTransformer):
    """SigLIP2 vision tower bundled with its own image preprocessor.

    Wraps ``SiglipVisionTransformer`` with a fixed configuration
    (40 layers, hidden size 1536, 384x384 input, 16x16 patches) and a
    matching ``SiglipImageProcessor`` so a raw image can be encoded with
    a single ``forward`` call.
    """

    def __init__(self):
        # Architecture is pinned to a specific SigLIP2 checkpoint; these
        # values must not drift from the weights loaded into the model.
        vision_config = SiglipVisionConfig(
            attention_dropout=0.0,
            dtype="float32",
            hidden_act="gelu_pytorch_tanh",
            hidden_size=1536,
            image_size=384,
            intermediate_size=6144,
            layer_norm_eps=1e-06,
            model_type="siglip_vision_model",
            num_attention_heads=16,
            num_channels=3,
            num_hidden_layers=40,
            patch_size=16,
            transformers_version="4.56.1",
            _attn_implementation="sdpa",
        )
        super().__init__(vision_config)
        # Preprocessing pipeline: resize to 384x384 (resample=2, i.e.
        # bilinear), rescale by 1/255, then normalize each channel with
        # mean/std 0.5.
        self.processor = SiglipImageProcessor(
            do_convert_rgb=None,
            do_normalize=True,
            do_rescale=True,
            do_resize=True,
            image_mean=[0.5, 0.5, 0.5],
            image_processor_type="SiglipImageProcessor",
            image_std=[0.5, 0.5, 0.5],
            processor_class="SiglipProcessor",
            resample=2,
            rescale_factor=0.00392156862745098,
            size={"height": 384, "width": 384},
        )

    def forward(self, image, torch_dtype=torch.bfloat16, device="cuda"):
        """Encode a single image and return the pooled embedding.

        Args:
            image: one input image, forwarded as-is to the bundled
                ``SiglipImageProcessor`` (wrapped in a 1-element batch).
            torch_dtype: dtype the pixel tensor is cast to before the
                transformer runs (default ``torch.bfloat16``).
            device: device the pixel tensor is moved to (default ``"cuda"``).

        Returns:
            The output of the attention-pooling head over the final
            hidden states, or ``None`` when ``self.use_head`` is False.
        """
        batch = self.processor(images=[image], return_tensors="pt")["pixel_values"]
        batch = batch.to(device=device, dtype=torch_dtype)

        # Patch + position embeddings; positional interpolation is off
        # because inputs are always resized to the native 384x384.
        embedded = self.embeddings(batch, interpolate_pos_encoding=False)

        encoded = self.encoder(
            inputs_embeds=embedded,
            output_attentions=False,
            output_hidden_states=False,
        )

        normed = self.post_layernorm(encoded.last_hidden_state)
        # Pooling head collapses the patch-token sequence into one vector.
        return self.head(normed) if self.use_head else None
|
||||
Reference in New Issue
Block a user