support z-image-omni-base training

This commit is contained in:
Artiprocher
2026-01-05 20:04:00 +08:00
parent 5745c9f200
commit 32449a6aa0
9 changed files with 128 additions and 4 deletions

View File

@@ -90,12 +90,10 @@ class Siglip2ImageEncoder428M(Siglip2VisionModel):
super().__init__(config)
self.processor = Siglip2ImageProcessorFast(
**{
"crop_size": None,
"data_format": "channels_first",
"default_to_square": True,
"device": None,
"disable_grouping": None,
"do_center_crop": None,
"do_convert_rgb": None,
"do_normalize": True,
"do_pad": None,
@@ -120,7 +118,6 @@ class Siglip2ImageEncoder428M(Siglip2VisionModel):
"resample": 2,
"rescale_factor": 0.00392156862745098,
"return_tensors": None,
"size": None
}
)

View File

@@ -626,7 +626,7 @@ class ZImageDiT(nn.Module):
# Pad token
feats_cat = torch.cat(feats, dim=0)
feats_cat[torch.cat(inner_pad_mask)] = pad_token
feats_cat[torch.cat(inner_pad_mask)] = pad_token.to(dtype=feats_cat.dtype, device=feats_cat.device)
feats = list(feats_cat.split(item_seqlens, dim=0))
# RoPE