Support LTX-2 text-to-video (t2v) and image-to-video (i2v)

This commit is contained in:
mi804
2026-02-02 19:53:07 +08:00
parent 1c8a0f8317
commit f4f991d409
20 changed files with 1084 additions and 25 deletions

View File

@@ -1442,6 +1442,10 @@ class LTXModel(torch.nn.Module):
return vx, ax
def forward(self, video_latents, video_positions, video_context, video_timesteps, audio_latents, audio_positions, audio_context, audio_timesteps):
cross_pe_max_pos = None
if self.model_type.is_video_enabled() and self.model_type.is_audio_enabled():
cross_pe_max_pos = max(self.positional_embedding_max_pos[0], self.audio_positional_embedding_max_pos[0])
self._init_preprocessors(cross_pe_max_pos)
video = Modality(video_latents, video_timesteps, video_positions, video_context)
audio = Modality(audio_latents, audio_timesteps, audio_positions, audio_context)
vx, ax = self._forward(video=video, audio=audio, perturbations=None)

View File

@@ -1648,11 +1648,8 @@ class LTX2VideoEncoder(nn.Module):
tile_overlap_in_pixels: Optional[int] = 128,
**kwargs,
) -> torch.Tensor:
device = next(self.parameters()).device
vae_dtype = next(self.parameters()).dtype
if video.ndim == 4:
video = video.unsqueeze(0) # [C, F, H, W] -> [B, C, F, H, W]
video = video.to(device=device, dtype=vae_dtype)
# Choose encoding method based on tiling flag
if tiled:
latents = self.tiled_encode_video(