mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-03-18 22:08:13 +00:00
add audio_vae, audio_vocoder, text_encoder, connector and upsampler for ltx2
This commit is contained in:
1337
diffsynth/models/ltx2_audio_vae.py
Normal file
1337
diffsynth/models/ltx2_audio_vae.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,7 +1,9 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import NamedTuple
|
||||
from typing import NamedTuple, Protocol, Tuple
|
||||
import torch
|
||||
from torch import nn
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class VideoPixelShape(NamedTuple):
|
||||
"""
|
||||
@@ -180,6 +182,13 @@ class LatentState:
|
||||
)
|
||||
|
||||
|
||||
class NormType(Enum):
    """Supported normalization flavors.

    ``GROUP`` selects ``torch.nn.GroupNorm``; ``PIXEL`` selects the
    per-location RMS normalization implemented by ``PixelNorm``.
    """

    GROUP = "group"
    PIXEL = "pixel"
|
||||
|
||||
|
||||
class PixelNorm(nn.Module):
|
||||
"""
|
||||
Per-pixel (per-location) RMS normalization layer.
|
||||
@@ -209,6 +218,25 @@ class PixelNorm(nn.Module):
|
||||
return x / rms
|
||||
|
||||
|
||||
def build_normalization_layer(
    in_channels: int, *, num_groups: int = 32, normtype: NormType = NormType.GROUP
) -> nn.Module:
    """Construct a normalization module of the requested flavor.

    Args:
        in_channels: Channel count the layer normalizes over.
        num_groups: Group count; only used for group normalization.
        normtype: Which normalization flavor to build ("group" or "pixel").

    Returns:
        A ``GroupNorm`` or ``PixelNorm`` module.

    Raises:
        ValueError: If ``normtype`` is not a recognized ``NormType``.
    """
    if normtype == NormType.PIXEL:
        return PixelNorm(dim=1, eps=1e-6)
    elif normtype == NormType.GROUP:
        return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)
    raise ValueError(f"Invalid normalization type: {normtype}")
|
||||
|
||||
|
||||
def rms_norm(x: torch.Tensor, weight: torch.Tensor | None = None, eps: float = 1e-6) -> torch.Tensor:
|
||||
"""Root-mean-square (RMS) normalize `x` over its last dimension.
|
||||
Thin wrapper around `torch.nn.functional.rms_norm` that infers the normalized
|
||||
@@ -251,3 +279,61 @@ def to_denoised(
|
||||
if isinstance(sigma, torch.Tensor):
|
||||
sigma = sigma.to(calc_dtype)
|
||||
return (sample.to(calc_dtype) - velocity.to(calc_dtype) * sigma).to(sample.dtype)
|
||||
|
||||
|
||||
|
||||
class Patchifier(Protocol):
    """
    Protocol for patchifiers that convert latent tensors into patches and assemble them back.
    """

    def patchify(
        self,
        latents: torch.Tensor,
    ) -> torch.Tensor:
        """
        Convert latent tensors into flattened patch tokens.

        Args:
            latents: Latent tensor to patchify.

        Returns:
            Flattened patch tokens tensor.
        """
        # Note: the docstring must be the FIRST statement of the body (PEP 257);
        # previously it followed the `...` placeholder and was a dead expression.
        ...

    def unpatchify(
        self,
        latents: torch.Tensor,
        output_shape: AudioLatentShape | VideoLatentShape,
    ) -> torch.Tensor:
        """
        Converts latent tensors between spatio-temporal formats and flattened sequence representations.

        Args:
            latents: Patch tokens that must be rearranged back into the latent grid constructed by `patchify`.
            output_shape: Shape of the output tensor. Note that output_shape is either AudioLatentShape or
                VideoLatentShape.

        Returns:
            Dense latent tensor restored from the flattened representation.
        """
        ...

    @property
    def patch_size(self) -> Tuple[int, int, int]:
        """
        Returns the patch size as a tuple of (temporal, height, width) dimensions
        """
        ...

    def get_patch_grid_bounds(
        self,
        output_shape: AudioLatentShape | VideoLatentShape,
        device: torch.device | None = None,
    ) -> torch.Tensor:
        """
        Compute metadata describing where each latent patch resides within the
        grid specified by `output_shape`.

        Args:
            output_shape: Target grid layout for the patches.
            device: Target device for the returned tensor.

        Returns:
            Tensor containing patch coordinate metadata such as spatial or temporal intervals.
        """
        ...
|
||||
|
||||
366
diffsynth/models/ltx2_text_encoder.py
Normal file
366
diffsynth/models/ltx2_text_encoder.py
Normal file
@@ -0,0 +1,366 @@
|
||||
import torch
|
||||
from transformers import Gemma3ForConditionalGeneration, Gemma3Config, AutoTokenizer
|
||||
from .ltx2_dit import (LTXRopeType, generate_freq_grid_np, generate_freq_grid_pytorch, precompute_freqs_cis, Attention,
|
||||
FeedForward)
|
||||
from .ltx2_common import rms_norm
|
||||
|
||||
|
||||
class LTX2TextEncoder(Gemma3ForConditionalGeneration):
    """Gemma-3 conditional-generation model with an inlined configuration, used
    as the LTX2 text encoder.

    The configuration below (48-layer / 3840-dim text tower plus a SigLIP
    vision tower) is hard-coded so the model can be instantiated without a
    config file on disk.  NOTE(review): presumably these values mirror the
    released LTX2 Gemma-3 checkpoint — confirm against the shipped weights
    when bumping the transformers version.
    """

    def __init__(self):
        # Full Gemma3 config inlined as a plain dict; no external JSON needed.
        config = Gemma3Config(
            **{
                "architectures": ["Gemma3ForConditionalGeneration"],
                "boi_token_index": 255999,
                "dtype": "bfloat16",
                "eoi_token_index": 256000,
                "eos_token_id": [1, 106],
                "image_token_index": 262144,
                "initializer_range": 0.02,
                "mm_tokens_per_image": 256,
                "model_type": "gemma3",
                # Text tower: 48 layers, alternating 5 sliding-window layers
                # per 1 full-attention layer (see layer_types below).
                "text_config": {
                    "_sliding_window_pattern": 6,
                    "attention_bias": False,
                    "attention_dropout": 0.0,
                    "attn_logit_softcapping": None,
                    "cache_implementation": "hybrid",
                    "dtype": "bfloat16",
                    "final_logit_softcapping": None,
                    "head_dim": 256,
                    "hidden_activation": "gelu_pytorch_tanh",
                    "hidden_size": 3840,
                    "initializer_range": 0.02,
                    "intermediate_size": 15360,
                    "layer_types": [
                        "sliding_attention", "sliding_attention", "sliding_attention", "sliding_attention",
                        "sliding_attention", "full_attention", "sliding_attention", "sliding_attention",
                        "sliding_attention", "sliding_attention", "sliding_attention", "full_attention",
                        "sliding_attention", "sliding_attention", "sliding_attention", "sliding_attention",
                        "sliding_attention", "full_attention", "sliding_attention", "sliding_attention",
                        "sliding_attention", "sliding_attention", "sliding_attention", "full_attention",
                        "sliding_attention", "sliding_attention", "sliding_attention", "sliding_attention",
                        "sliding_attention", "full_attention", "sliding_attention", "sliding_attention",
                        "sliding_attention", "sliding_attention", "sliding_attention", "full_attention",
                        "sliding_attention", "sliding_attention", "sliding_attention", "sliding_attention",
                        "sliding_attention", "full_attention", "sliding_attention", "sliding_attention",
                        "sliding_attention", "sliding_attention", "sliding_attention", "full_attention"
                    ],
                    "max_position_embeddings": 131072,
                    "model_type": "gemma3_text",
                    "num_attention_heads": 16,
                    "num_hidden_layers": 48,
                    "num_key_value_heads": 8,
                    "query_pre_attn_scalar": 256,
                    "rms_norm_eps": 1e-06,
                    "rope_local_base_freq": 10000,
                    "rope_scaling": {
                        "factor": 8.0,
                        "rope_type": "linear"
                    },
                    "rope_theta": 1000000,
                    "sliding_window": 1024,
                    "sliding_window_pattern": 6,
                    "use_bidirectional_attention": False,
                    "use_cache": True,
                    "vocab_size": 262208
                },
                "transformers_version": "4.57.3",
                # SigLIP vision tower config (unused for pure-text encoding,
                # but part of the Gemma3 multimodal architecture).
                "vision_config": {
                    "attention_dropout": 0.0,
                    "dtype": "bfloat16",
                    "hidden_act": "gelu_pytorch_tanh",
                    "hidden_size": 1152,
                    "image_size": 896,
                    "intermediate_size": 4304,
                    "layer_norm_eps": 1e-06,
                    "model_type": "siglip_vision_model",
                    "num_attention_heads": 16,
                    "num_channels": 3,
                    "num_hidden_layers": 27,
                    "patch_size": 14,
                    "vision_use_head": False
                }
            })
        super().__init__(config)
|
||||
|
||||
|
||||
class LTXVGemmaTokenizer:
    """
    Tokenizer wrapper for Gemma models compatible with LTXV processes.

    Thin wrapper around HuggingFace's `AutoTokenizer` that applies the
    padding/truncation settings the LTXV pipeline expects and formats the
    output for downstream consumption.
    """

    def __init__(self, tokenizer_path: str, max_length: int = 1024):
        """
        Load the pretrained tokenizer and apply LTXV-specific settings.

        Args:
            tokenizer_path (str): Path to the pretrained tokenizer files or model directory.
            max_length (int, optional): Max sequence length for encoding. Defaults to 1024.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_path, local_files_only=True, model_max_length=max_length
        )
        # Gemma expects left padding for chat-style prompts; for plain text it doesn't matter much.
        self.tokenizer.padding_side = "left"
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.max_length = max_length

    def tokenize_with_weights(self, text: str, return_word_ids: bool = False) -> dict[str, list[tuple[int, int]]]:
        """
        Tokenize the given text and return token IDs and attention weights.

        Args:
            text (str): The input string to tokenize.
            return_word_ids (bool, optional): If True, includes the token's position (index) in the output tuples.
                If False (default), omits the indices.

        Returns:
            dict[str, list[tuple[int, int]]] OR dict[str, list[tuple[int, int, int]]]:
                A dictionary with a "gemma" key mapping to:
                - a list of (token_id, attention_mask) tuples if return_word_ids is False;
                - a list of (token_id, attention_mask, index) tuples if return_word_ids is True.

        Example:
            >>> tokenizer = LTXVGemmaTokenizer("path/to/tokenizer", max_length=8)
            >>> tokenizer.tokenize_with_weights("hello world")
            {'gemma': [(1234, 1), (5678, 1), (2, 0), ...]}
        """
        encoded = self.tokenizer(
            text.strip(),
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
        )
        token_ids = encoded.input_ids[0]
        mask = encoded.attention_mask[0]
        triples = [(tok, attn, pos) for pos, (tok, attn) in enumerate(zip(token_ids, mask, strict=True))]

        if return_word_ids:
            return {"gemma": triples}
        # Drop the positional index, keeping only (token_id, attention_mask) pairs.
        return {"gemma": [(tok, attn) for tok, attn, _ in triples]}
|
||||
|
||||
|
||||
class GemmaFeaturesExtractorProjLinear(torch.nn.Module):
    """
    Feature-aggregation head for Gemma models.

    A single bias-free linear layer maps a flattened feature tensor of shape
    (batch_size, 3840 * 49) down to a (batch_size, 3840) embedding.

    Attributes:
        aggregate_embed (torch.nn.Linear): The bias-free projection layer.
    """

    def __init__(self) -> None:
        """Set up the fixed 3840 * 49 -> 3840 projection."""
        super().__init__()
        self.aggregate_embed = torch.nn.Linear(3840 * 49, 3840, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Project the flattened features to the embedding space.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, 3840 * 49).

        Returns:
            torch.Tensor: Output tensor of shape (batch_size, 3840).
        """
        projected = self.aggregate_embed(x)
        return projected
|
||||
|
||||
|
||||
class _BasicTransformerBlock1D(torch.nn.Module):
    """Pre-norm 1D transformer block: RMS-normalized self-attention followed by
    an RMS-normalized feed-forward, each wrapped in a residual connection."""

    def __init__(
        self,
        dim: int,
        heads: int,
        dim_head: int,
        rope_type: LTXRopeType = LTXRopeType.INTERLEAVED,
    ):
        super().__init__()
        self.attn1 = Attention(
            query_dim=dim,
            heads=heads,
            dim_head=dim_head,
            rope_type=rope_type,
        )
        self.ff = FeedForward(
            dim,
            dim_out=dim,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        pe: torch.Tensor | None = None,
    ) -> torch.Tensor:
        # Pre-norm design: each sub-layer sees an RMS-normalized copy of its
        # input, while the residual path carries the raw activations.

        # Self-attention with residual connection.
        attn_input = rms_norm(hidden_states).squeeze(1)
        hidden_states = self.attn1(attn_input, mask=attention_mask, pe=pe) + hidden_states
        if hidden_states.ndim == 4:
            hidden_states = hidden_states.squeeze(1)

        # Feed-forward with residual connection.
        hidden_states = self.ff(rms_norm(hidden_states)) + hidden_states
        if hidden_states.ndim == 4:
            hidden_states = hidden_states.squeeze(1)

        return hidden_states
|
||||
|
||||
|
||||
class Embeddings1DConnector(torch.nn.Module):
    """
    Embeddings1DConnector applies a 1D transformer-based processing to sequential embeddings (e.g., for video, audio, or
    other modalities). It supports rotary positional encoding (rope), optional causal temporal positioning, and can
    substitute padded positions with learnable registers. The module is highly configurable for head size, number of
    layers, and register usage.

    Args:
        attention_head_dim (int): Dimension of each attention head (default=128).
        num_attention_heads (int): Number of attention heads (default=30).
        num_layers (int): Number of transformer layers (default=2).
        positional_embedding_theta (float): Scaling factor for position embedding (default=10000.0).
        positional_embedding_max_pos (list[int] | None): Max positions for positional embeddings (default=[4096];
            a None argument falls back to [1]).
        causal_temporal_positioning (bool): If True, uses causal attention (default=False).
        num_learnable_registers (int | None): Number of learnable registers to replace padded tokens. If None, disables
            register replacement. (default=128)
        rope_type (LTXRopeType): The RoPE variant to use (default=LTXRopeType.SPLIT).
        double_precision_rope (bool): Use double precision rope calculation (default=True).
    """

    _supports_gradient_checkpointing = True

    def __init__(
        self,
        attention_head_dim: int = 128,
        num_attention_heads: int = 30,
        num_layers: int = 2,
        positional_embedding_theta: float = 10000.0,
        # NOTE(review): mutable default argument; harmless here because the list
        # is never mutated, but a tuple or None-sentinel would be safer.
        positional_embedding_max_pos: list[int] | None = [4096],
        causal_temporal_positioning: bool = False,
        num_learnable_registers: int | None = 128,
        rope_type: LTXRopeType = LTXRopeType.SPLIT,
        double_precision_rope: bool = True,
    ):
        super().__init__()
        self.num_attention_heads = num_attention_heads
        # Model width is heads * head_dim (3840 with the defaults).
        self.inner_dim = num_attention_heads * attention_head_dim
        self.causal_temporal_positioning = causal_temporal_positioning
        self.positional_embedding_theta = positional_embedding_theta
        # Explicit None falls back to [1]; note this differs from the [4096] default.
        self.positional_embedding_max_pos = (
            positional_embedding_max_pos if positional_embedding_max_pos is not None else [1]
        )
        self.rope_type = rope_type
        self.double_precision_rope = double_precision_rope
        self.transformer_1d_blocks = torch.nn.ModuleList(
            [
                _BasicTransformerBlock1D(
                    dim=self.inner_dim,
                    heads=num_attention_heads,
                    dim_head=attention_head_dim,
                    rope_type=rope_type,
                )
                for _ in range(num_layers)
            ]
        )

        self.num_learnable_registers = num_learnable_registers
        if self.num_learnable_registers:
            # Registers initialized uniformly in [-1, 1); they replace padded
            # token positions in _replace_padded_with_learnable_registers.
            self.learnable_registers = torch.nn.Parameter(
                torch.rand(self.num_learnable_registers, self.inner_dim, dtype=torch.bfloat16) * 2.0 - 1.0
            )

    def _replace_padded_with_learnable_registers(
        self, hidden_states: torch.Tensor, attention_mask: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Pack valid tokens to the front and fill padded positions with tiled
        learnable registers, then return a fully-unmasked attention mask.

        NOTE(review): the boolean indexing below squeezes the mask over the
        batch dimension, which looks like it assumes batch size 1 (or identical
        masks across the batch) — confirm against the callers.
        """
        assert hidden_states.shape[1] % self.num_learnable_registers == 0, (
            f"Hidden states sequence length {hidden_states.shape[1]} must be divisible by num_learnable_registers "
            f"{self.num_learnable_registers}."
        )

        # Tile the register bank to cover the full sequence length.
        num_registers_duplications = hidden_states.shape[1] // self.num_learnable_registers
        learnable_registers = torch.tile(self.learnable_registers, (num_registers_duplications, 1))
        # The incoming mask is additive (large negative = masked); threshold at
        # -9000 to recover a 0/1 validity mask.
        attention_mask_binary = (attention_mask.squeeze(1).squeeze(1).unsqueeze(-1) >= -9000.0).int()

        # Gather valid tokens, then right-pad with zeros back to full length.
        non_zero_hidden_states = hidden_states[:, attention_mask_binary.squeeze().bool(), :]
        non_zero_nums = non_zero_hidden_states.shape[1]
        pad_length = hidden_states.shape[1] - non_zero_nums
        adjusted_hidden_states = torch.nn.functional.pad(non_zero_hidden_states, pad=(0, 0, 0, pad_length), value=0)
        # Flipping the mask aligns the validity pattern with the front-packed
        # tokens (presumably the original mask was left-padded — TODO confirm);
        # masked slots then receive the register embeddings.
        flipped_mask = torch.flip(attention_mask_binary, dims=[1])
        hidden_states = flipped_mask * adjusted_hidden_states + (1 - flipped_mask) * learnable_registers

        # Every position now holds real content, so the additive mask becomes all-zeros (nothing masked).
        attention_mask = torch.full_like(
            attention_mask,
            0.0,
            dtype=attention_mask.dtype,
            device=attention_mask.device,
        )

        return hidden_states, attention_mask

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Forward pass of Embeddings1DConnector.

        Args:
            hidden_states (torch.Tensor): Input tensor of embeddings (shape [batch, seq_len, feature_dim]).
            attention_mask (torch.Tensor|None): Optional mask for valid tokens (shape compatible with hidden_states).

        Returns:
            tuple[torch.Tensor, torch.Tensor]: Processed features and the corresponding (possibly modified) mask.
        """
        if self.num_learnable_registers:
            hidden_states, attention_mask = self._replace_padded_with_learnable_registers(hidden_states, attention_mask)

        # 1D positional grid over sequence positions, shaped [1, 1, seq_len] for the RoPE helpers.
        indices_grid = torch.arange(hidden_states.shape[1], dtype=torch.float32, device=hidden_states.device)
        indices_grid = indices_grid[None, None, :]
        # numpy path gives double-precision frequencies; torch path stays in single precision.
        freq_grid_generator = generate_freq_grid_np if self.double_precision_rope else generate_freq_grid_pytorch
        freqs_cis = precompute_freqs_cis(
            indices_grid=indices_grid,
            dim=self.inner_dim,
            out_dtype=hidden_states.dtype,
            theta=self.positional_embedding_theta,
            max_pos=self.positional_embedding_max_pos,
            num_attention_heads=self.num_attention_heads,
            rope_type=self.rope_type,
            freq_grid_generator=freq_grid_generator,
        )

        for block in self.transformer_1d_blocks:
            hidden_states = block(hidden_states, attention_mask=attention_mask, pe=freqs_cis)

        # Final RMS normalization of the connector output.
        hidden_states = rms_norm(hidden_states)

        return hidden_states, attention_mask
|
||||
|
||||
|
||||
class LTX2TextEncoderPostModules(torch.nn.Module):
    """Post-processing heads applied to LTX2 text-encoder outputs: a linear
    feature aggregator plus separate embedding connectors for the video and
    audio branches."""

    def __init__(self) -> None:
        super().__init__()
        # Linear aggregation of the flattened Gemma features.
        self.feature_extractor_linear = GemmaFeaturesExtractorProjLinear()
        # One connector per modality; both use the default configuration.
        self.embeddings_connector = Embeddings1DConnector()
        self.audio_embeddings_connector = Embeddings1DConnector()
|
||||
313
diffsynth/models/ltx2_upsampler.py
Normal file
313
diffsynth/models/ltx2_upsampler.py
Normal file
@@ -0,0 +1,313 @@
|
||||
import math
|
||||
from typing import Optional, Tuple
|
||||
import torch
|
||||
from einops import rearrange
|
||||
import torch.nn.functional as F
|
||||
from .ltx2_video_vae import LTX2VideoEncoder
|
||||
|
||||
class PixelShuffleND(torch.nn.Module):
    """
    N-dimensional pixel shuffle: moves factors folded into the channel axis out
    into the temporal and/or spatial axes, upsampling the tensor.

    Args:
        dims (int): How many axes to shuffle into.
            - 1: temporal only (frames)
            - 2: spatial only (height, width)
            - 3: spatiotemporal (depth, height, width)
        upscale_factors (tuple[int, int, int], optional): Per-axis upscale
            factors; only the first `dims` entries are consumed.

    The channel dimension is split into (channels, factors...) and the factor
    axes are interleaved into the corresponding output axes.

    Note:
        This operation is equivalent to the patchifier operation in for the models. Consider
        using this class instead.
    """

    def __init__(self, dims: int, upscale_factors: tuple[int, int, int] = (2, 2, 2)):
        super().__init__()
        assert dims in [1, 2, 3], "dims must be 1, 2, or 3"
        self.dims = dims
        self.upscale_factors = upscale_factors

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        factors = self.upscale_factors
        if self.dims == 1:
            # Temporal-only shuffle on a 5D (b, c, f, h, w) tensor.
            return rearrange(
                x,
                "b (c p1) f h w -> b c (f p1) h w",
                p1=factors[0],
            )
        if self.dims == 2:
            # Spatial shuffle on a 4D (b, c, h, w) tensor.
            return rearrange(
                x,
                "b (c p1 p2) h w -> b c (h p1) (w p2)",
                p1=factors[0],
                p2=factors[1],
            )
        if self.dims == 3:
            # Full spatiotemporal shuffle on a 5D (b, c, d, h, w) tensor.
            return rearrange(
                x,
                "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)",
                p1=factors[0],
                p2=factors[1],
                p3=factors[2],
            )
        raise ValueError(f"Unsupported dims: {self.dims}")
|
||||
|
||||
|
||||
class ResBlock(torch.nn.Module):
    """
    Residual block: two conv -> GroupNorm stages with SiLU activations, where
    the skip connection is added just before the final activation.

    Args:
        channels (int): Number of input and output channels.
        mid_channels (Optional[int]): Channel count of the intermediate convolution;
            falls back to `channels` when not given.
        dims (int): Convolution dimensionality — 2 selects Conv2d, otherwise Conv3d. Defaults to 3.
    """

    def __init__(self, channels: int, mid_channels: Optional[int] = None, dims: int = 3):
        super().__init__()
        inner = channels if mid_channels is None else mid_channels

        conv_cls = torch.nn.Conv2d if dims == 2 else torch.nn.Conv3d

        self.conv1 = conv_cls(channels, inner, kernel_size=3, padding=1)
        self.norm1 = torch.nn.GroupNorm(32, inner)
        self.conv2 = conv_cls(inner, channels, kernel_size=3, padding=1)
        self.norm2 = torch.nn.GroupNorm(32, channels)
        self.activation = torch.nn.SiLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        skip = x
        out = self.activation(self.norm1(self.conv1(x)))
        out = self.norm2(self.conv2(out))
        # Residual add happens before the final activation.
        return self.activation(out + skip)
|
||||
|
||||
|
||||
class BlurDownsample(torch.nn.Module):
    """
    Anti-aliased integer-stride downsampling on H,W using a fixed separable
    binomial blur kernel. For dims=2 the blur is applied directly; for dims=3
    the same 2D blur is applied to every frame independently.
    """

    def __init__(self, dims: int, stride: int, kernel_size: int = 5) -> None:
        super().__init__()
        assert dims in (2, 3)
        assert isinstance(stride, int)
        assert stride >= 1
        assert kernel_size >= 3
        assert kernel_size % 2 == 1
        self.dims = dims
        self.stride = stride
        self.kernel_size = kernel_size

        # Binomial coefficients (a row of Pascal's triangle — [1, 4, 6, 4, 1]
        # for kernel_size=5) give a smooth Gaussian-like low-pass filter.
        # The 2D kernel is the normalized outer product of the 1D row,
        # registered as a (1, 1, kernel_size, kernel_size) buffer.
        row = torch.tensor([math.comb(kernel_size - 1, i) for i in range(kernel_size)])
        kernel_2d = row[:, None] @ row[None, :]
        kernel_2d = (kernel_2d / kernel_2d.sum()).float()
        self.register_buffer("kernel", kernel_2d[None, None, :, :])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Stride 1 means no downsampling; skip the blur entirely (identity).
        if self.stride == 1:
            return x

        if self.dims == 2:
            return self._apply_2d(x)

        # dims == 3: fold frames into the batch, blur each, then unfold.
        b, _, f, _, _ = x.shape
        frames = rearrange(x, "b c f h w -> (b f) c h w")
        frames = self._apply_2d(frames)
        h2, w2 = frames.shape[-2:]
        return rearrange(frames, "(b f) c h w -> b c f h w", b=b, f=f, h=h2, w=w2)

    def _apply_2d(self, x2d: torch.Tensor) -> torch.Tensor:
        channels = x2d.shape[1]
        # Depthwise convolution: the identical blur kernel on every channel.
        weight = self.kernel.expand(channels, 1, self.kernel_size, self.kernel_size)
        return F.conv2d(
            x2d, weight=weight, bias=None, stride=self.stride, padding=self.kernel_size // 2, groups=channels
        )
|
||||
|
||||
|
||||
def _rational_for_scale(scale: float) -> Tuple[int, int]:
|
||||
mapping = {0.75: (3, 4), 1.5: (3, 2), 2.0: (2, 1), 4.0: (4, 1)}
|
||||
if float(scale) not in mapping:
|
||||
raise ValueError(f"Unsupported scale {scale}. Choose from {list(mapping.keys())}")
|
||||
return mapping[float(scale)]
|
||||
|
||||
|
||||
class SpatialRationalResampler(torch.nn.Module):
    """
    Learned rational spatial rescaling on H,W: upsample by `num` via a learned
    conv followed by PixelShuffle, then anti-aliased downsample by `den` using
    a fixed blur with stride. The temporal axis is untouched — each frame is
    resampled independently.

    Args:
        mid_channels (`int`): Number of intermediate channels for the convolution layer
        scale (`float`): Spatial scaling factor. Supported values are:
            - 0.75: Downsample by 3/4 (reduce spatial size)
            - 1.5: Upsample by 3/2 (increase spatial size)
            - 2.0: Upsample by 2x (double spatial size)
            - 4.0: Upsample by 4x (quadruple spatial size)
            Any other value will raise a ValueError.
    """

    def __init__(self, mid_channels: int, scale: float):
        super().__init__()
        self.scale = float(scale)
        self.num, self.den = _rational_for_scale(self.scale)
        # Conv expands channels by num^2 so PixelShuffle can fold them into H,W.
        self.conv = torch.nn.Conv2d(mid_channels, (self.num**2) * mid_channels, kernel_size=3, padding=1)
        self.pixel_shuffle = PixelShuffleND(2, upscale_factors=(self.num, self.num))
        self.blur_down = BlurDownsample(dims=2, stride=self.den)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        batch, _, frames, _, _ = x.shape
        # Merge the temporal axis into the batch so the 2D ops see frames independently.
        flat = rearrange(x, "b c f h w -> (b f) c h w")
        flat = self.blur_down(self.pixel_shuffle(self.conv(flat)))
        return rearrange(flat, "(b f) c h w -> b c f h w", b=batch, f=frames)
|
||||
|
||||
|
||||
class LTX2LatentUpsampler(torch.nn.Module):
    """
    Model to upsample VAE latents spatially and/or temporally.

    Args:
        in_channels (`int`): Number of channels in the input latent
        mid_channels (`int`): Number of channels in the middle layers
        num_blocks_per_stage (`int`): Number of ResBlocks to use in each stage (pre/post upsampling)
        dims (`int`): Number of dimensions for convolutions (2 or 3)
        spatial_upsample (`bool`): Whether to spatially upsample the latent
        temporal_upsample (`bool`): Whether to temporally upsample the latent
        spatial_scale (`float`): Scale factor for spatial upsampling
        rational_resampler (`bool`): Whether to use a rational resampler for spatial upsampling

    Raises:
        ValueError: If neither spatial_upsample nor temporal_upsample is set.
    """
    def __init__(
        self,
        in_channels: int = 128,
        mid_channels: int = 1024,
        num_blocks_per_stage: int = 4,
        dims: int = 3,
        spatial_upsample: bool = True,
        temporal_upsample: bool = False,
        spatial_scale: float = 2.0,
        rational_resampler: bool = True,
    ):
        super().__init__()

        self.in_channels = in_channels
        self.mid_channels = mid_channels
        self.num_blocks_per_stage = num_blocks_per_stage
        self.dims = dims
        self.spatial_upsample = spatial_upsample
        self.temporal_upsample = temporal_upsample
        self.spatial_scale = float(spatial_scale)
        self.rational_resampler = rational_resampler

        # 2D convs process frames independently; 3D convs mix the temporal axis.
        conv = torch.nn.Conv2d if dims == 2 else torch.nn.Conv3d

        self.initial_conv = conv(in_channels, mid_channels, kernel_size=3, padding=1)
        self.initial_norm = torch.nn.GroupNorm(32, mid_channels)
        self.initial_activation = torch.nn.SiLU()

        self.res_blocks = torch.nn.ModuleList([ResBlock(mid_channels, dims=dims) for _ in range(num_blocks_per_stage)])

        # Channel expansion factor matches the product of the PixelShuffleND
        # upscale factors: 8 = 2*2*2 (spatiotemporal), 4 = 2*2 (spatial), 2 (temporal).
        if spatial_upsample and temporal_upsample:
            self.upsampler = torch.nn.Sequential(
                torch.nn.Conv3d(mid_channels, 8 * mid_channels, kernel_size=3, padding=1),
                PixelShuffleND(3),
            )
        elif spatial_upsample:
            if rational_resampler:
                # NOTE(review): SpatialRationalResampler.forward expects a 5D
                # (b, c, f, h, w) input, but the dims==2 forward path below
                # passes it a 4D frame batch — confirm this combination is
                # never configured, or guard against it.
                self.upsampler = SpatialRationalResampler(mid_channels=mid_channels, scale=self.spatial_scale)
            else:
                self.upsampler = torch.nn.Sequential(
                    torch.nn.Conv2d(mid_channels, 4 * mid_channels, kernel_size=3, padding=1),
                    PixelShuffleND(2),
                )
        elif temporal_upsample:
            self.upsampler = torch.nn.Sequential(
                torch.nn.Conv3d(mid_channels, 2 * mid_channels, kernel_size=3, padding=1),
                PixelShuffleND(1),
            )
        else:
            raise ValueError("Either spatial_upsample or temporal_upsample must be True")

        self.post_upsample_res_blocks = torch.nn.ModuleList(
            [ResBlock(mid_channels, dims=dims) for _ in range(num_blocks_per_stage)]
        )

        self.final_conv = conv(mid_channels, in_channels, kernel_size=3, padding=1)

    def forward(self, latent: torch.Tensor) -> torch.Tensor:
        """Upsample a latent of shape [B, C, F, H, W] and return the result
        with `in_channels` channels and enlarged spatial/temporal axes."""
        b, _, f, _, _ = latent.shape

        if self.dims == 2:
            # 2D mode: fold frames into the batch and run everything per-frame.
            x = rearrange(latent, "b c f h w -> (b f) c h w")
            x = self.initial_conv(x)
            x = self.initial_norm(x)
            x = self.initial_activation(x)

            for block in self.res_blocks:
                x = block(x)

            x = self.upsampler(x)

            for block in self.post_upsample_res_blocks:
                x = block(x)

            x = self.final_conv(x)
            x = rearrange(x, "(b f) c h w -> b c f h w", b=b, f=f)
        else:
            # 3D mode: convolutions see the temporal axis directly.
            x = self.initial_conv(latent)
            x = self.initial_norm(x)
            x = self.initial_activation(x)

            for block in self.res_blocks:
                x = block(x)

            if self.temporal_upsample:
                x = self.upsampler(x)
                # remove the first frame after upsampling.
                # This is done because the first frame encodes one pixel frame.
                x = x[:, :, 1:, :, :]
            elif isinstance(self.upsampler, SpatialRationalResampler):
                # The rational resampler handles the frame folding itself.
                x = self.upsampler(x)
            else:
                # Plain 2D upsampler: fold frames into the batch around the call.
                x = rearrange(x, "b c f h w -> (b f) c h w")
                x = self.upsampler(x)
                x = rearrange(x, "(b f) c h w -> b c f h w", b=b, f=f)

            for block in self.post_upsample_res_blocks:
                x = block(x)

            x = self.final_conv(x)

        return x
|
||||
|
||||
|
||||
def upsample_video(latent: torch.Tensor, video_encoder: LTX2VideoEncoder, upsampler: "LTX2LatentUpsampler") -> torch.Tensor:
    """
    Upsample a latent with `upsampler`, bracketing the call with the video
    encoder's per-channel de-normalization and re-normalization so the
    upsampler operates in raw latent space.

    Args:
        latent: Input latent tensor of shape [B, C, F, H, W].
        video_encoder: VideoEncoder with per_channel_statistics for normalization.
        upsampler: LTX2LatentUpsampler module to perform upsampling.

    Returns:
        torch.Tensor: Upsampled and re-normalized latent tensor.
    """
    stats = video_encoder.per_channel_statistics
    raw = stats.un_normalize(latent)
    upsampled = upsampler(raw)
    return stats.normalize(upsampled)
|
||||
Reference in New Issue
Block a user