Support LTX-2 text-to-video (t2v) and image-to-video (i2v)

2026-03-22 08:40:47 +00:00 · 2026-02-02 19:53:07 +08:00
parent 1c8a0f8317
commit f4f991d409
20 changed files with 1084 additions and 25 deletions
--- a/diffsynth/utils/data/media_io_ltx2.py
+++ b/diffsynth/utils/data/media_io_ltx2.py
@@ -0,0 +1,149 @@
+
+from fractions import Fraction
+import torch
+import av
+from tqdm import tqdm
+from PIL import Image
+import numpy as np
+from io import BytesIO
+from collections.abc import Generator, Iterator
+
+
+def _resample_audio(
+    container: av.container.Container, audio_stream: av.audio.AudioStream, frame_in: av.AudioFrame
+) -> None:
+    """Resample one audio frame to the encoder's settings, mux it, and flush the encoder.
+
+    Args:
+        container: Output container that receives the encoded packets.
+        audio_stream: Audio stream whose codec context defines the target
+            format, layout and sample rate.
+        frame_in: The audio frame to convert and encode.
+    """
+    codec_ctx = audio_stream.codec_context
+
+    # The encoder's own settings are the conversion target; each one falls
+    # back to a default ("fltp", "stereo", the input rate) when it is unset.
+    resampler = av.audio.resampler.AudioResampler(
+        format=codec_ctx.format or "fltp",
+        layout=codec_ctx.layout or "stereo",
+        rate=codec_ctx.sample_rate or frame_in.sample_rate,
+    )
+
+    # Running sample count, used as the timestamp for frames that lack one.
+    samples_emitted = 0
+    for resampled in resampler.resample(frame_in):
+        if resampled.pts is None:
+            resampled.pts = samples_emitted
+        samples_emitted += resampled.samples
+        # NOTE(review): this stamps the *input* rate onto the resampled frame;
+        # that only matches the resampler's output when the encoder rate
+        # equals the input rate - confirm this is intended.
+        resampled.sample_rate = frame_in.sample_rate
+        container.mux(audio_stream.encode(resampled))
+
+    # Calling encode() with no frame drains whatever the encoder still buffers.
+    for pending_packet in audio_stream.encode():
+        container.mux(pending_packet)
+
+
+def _write_audio(
+    container: av.container.Container, audio_stream: av.audio.AudioStream, samples: torch.Tensor, audio_sample_rate: int
+) -> None:
+    """Convert a stereo waveform tensor to a packed s16 frame and encode it.
+
+    Args:
+        container: Output container that receives the encoded packets.
+        audio_stream: Audio stream used for encoding.
+        samples: Waveform tensor. After the normalisation below it must have
+            exactly two columns (one per channel); a tensor whose first
+            dimension is 2 is transposed into that shape.
+        audio_sample_rate: Sample rate assigned to the frame handed to the
+            resampler.
+
+    Raises:
+        ValueError: If the tensor does not end up with two channels.
+    """
+    # Promote a 1-D tensor to a single column.
+    # NOTE(review): the result has shape (N, 1), which the channel check
+    # below rejects, so mono input still raises - confirm whether mono
+    # was meant to be supported.
+    if samples.ndim == 1:
+        samples = samples.unsqueeze(1)
+
+    # Accept channels-first input by flipping it to (N, 2).
+    if samples.shape[0] == 2 and samples.shape[1] != 2:
+        samples = samples.T
+
+    channel_count = samples.shape[1]
+    if channel_count != 2:
+        raise ValueError(f"Expected samples with 2 channels; got shape {samples.shape}.")
+
+    # Anything that is not already int16 is clipped to [-1, 1] and scaled to
+    # the int16 range; the resampler later converts to the encoder's format.
+    if samples.dtype != torch.int16:
+        samples = (torch.clip(samples, -1.0, 1.0) * 32767.0).to(torch.int16)
+
+    # Flattening the contiguous (N, 2) tensor row by row interleaves the two
+    # channels, which is the layout the packed "s16" format takes.
+    interleaved = samples.contiguous().reshape(1, -1).cpu().numpy()
+    frame_in = av.AudioFrame.from_ndarray(interleaved, format="s16", layout="stereo")
+    frame_in.sample_rate = audio_sample_rate
+
+    _resample_audio(container, audio_stream, frame_in)
+
+
+def _prepare_audio_stream(container: av.container.Container, audio_sample_rate: int) -> av.audio.AudioStream:
+    """
+    Add an AAC audio stream to the container and configure it for stereo output.
+
+    The codec context gets the given sample rate, a "stereo" layout and a
+    time base of one tick per sample.
+    """
+    stream = container.add_stream("aac", rate=audio_sample_rate)
+    codec_ctx = stream.codec_context
+    codec_ctx.sample_rate = audio_sample_rate
+    codec_ctx.layout = "stereo"
+    codec_ctx.time_base = Fraction(1, audio_sample_rate)
+    return stream
+
+def write_video_audio_ltx2(
+    video: list[Image.Image],
+    audio: torch.Tensor | None,
+    output_path: str,
+    fps: int = 24,
+    audio_sample_rate: int | None = 24000,
+) -> None:
+    """Encode PIL frames (and optionally a stereo waveform) into a video file.
+
+    The video is encoded with libx264 in yuv420p; when audio is given it is
+    written to an AAC stream after all video packets have been muxed.
+
+    Args:
+        video: Frames to encode, in order. The first frame's size sets the
+            stream width and height.
+        audio: Waveform tensor to write, or None for a silent file.
+        output_path: Destination path passed to ``av.open``.
+        fps: Frame rate; converted with ``int()`` before use.
+        audio_sample_rate: Sample rate of ``audio``. Required when ``audio``
+            is not None.
+
+    Raises:
+        ValueError: If ``audio`` is provided but ``audio_sample_rate`` is None.
+    """
+    width, height = video[0].size
+
+    # Validate before opening the output so that a bad call neither creates
+    # the file nor leaves a container open.
+    if audio is not None and audio_sample_rate is None:
+        raise ValueError("audio_sample_rate is required when audio is provided")
+
+    # The context manager closes the container even when encoding or muxing
+    # fails part-way through.
+    with av.open(output_path, mode="w") as container:
+        stream = container.add_stream("libx264", rate=int(fps))
+        stream.width = width
+        stream.height = height
+        stream.pix_fmt = "yuv420p"
+
+        # The audio stream is added before any packet is muxed.
+        audio_stream = None
+        if audio is not None:
+            audio_stream = _prepare_audio_stream(container, audio_sample_rate)
+
+        for image in tqdm(video, total=len(video)):
+            for packet in stream.encode(av.VideoFrame.from_image(image)):
+                container.mux(packet)
+
+        # Calling encode() with no frame flushes the video encoder.
+        for packet in stream.encode():
+            container.mux(packet)
+
+        if audio_stream is not None:
+            _write_audio(container, audio_stream, audio, audio_sample_rate)
+
+
+def encode_single_frame(output_file: str, image_array: np.ndarray, crf: float) -> None:
+    """Encode one RGB image as a single-frame H.264 video in an MP4 container.
+
+    Args:
+        output_file: Destination handed to ``av.open``. It is annotated as
+            ``str``, but ``ltx2_preprocess`` in this file passes a ``BytesIO``.
+        image_array: Image indexed as (row, column, ...) and read as "rgb24".
+            An odd height or width loses its last row or column.
+        crf: Quality setting, passed to libx264 as the "crf" option.
+    """
+    with av.open(output_file, "w", format="mp4") as container:
+        encoder_options = {"crf": str(crf), "preset": "veryfast"}
+        stream = container.add_stream("libx264", rate=1, options=encoder_options)
+
+        # Floor both dimensions to an even number for codec compatibility.
+        rows, cols = image_array.shape[0], image_array.shape[1]
+        even_height = rows - rows % 2
+        even_width = cols - cols % 2
+        stream.height = even_height
+        stream.width = even_width
+
+        cropped = image_array[:even_height, :even_width]
+        rgb_frame = av.VideoFrame.from_ndarray(cropped, format="rgb24")
+        av_frame = rgb_frame.reformat(format="yuv420p")
+
+        container.mux(stream.encode(av_frame))
+        # A second encode() call without a frame flushes the encoder.
+        container.mux(stream.encode())
+
+
+def decode_single_frame(video_file: str) -> np.ndarray:
+    """Decode the first frame of the first video stream as an RGB array.
+
+    Args:
+        video_file: Source handed to ``av.open``. It is annotated as ``str``,
+            but ``ltx2_preprocess`` in this file passes a ``BytesIO``.
+
+    Returns:
+        The first decoded frame converted with ``to_ndarray(format="rgb24")``.
+
+    Raises:
+        ValueError: If the input has no video stream or yields no frame.
+    """
+    with av.open(video_file) as container:
+        # Use a default with next() so a missing stream surfaces as a clear
+        # ValueError rather than a bare StopIteration.
+        stream = next((s for s in container.streams if s.type == "video"), None)
+        if stream is None:
+            raise ValueError("Input contains no video stream.")
+
+        frame = next(container.decode(stream), None)
+        if frame is None:
+            raise ValueError("Video stream contains no decodable frame.")
+
+        # Convert while the container is still open.
+        return frame.to_ndarray(format="rgb24")
+
+
+def ltx2_preprocess(image: np.ndarray, crf: float = 33) -> np.ndarray:
+    """Round-trip an image through a single-frame H.264 encode and decode.
+
+    The image is encoded in memory with ``encode_single_frame`` and read back
+    with ``decode_single_frame``, presumably to give it the compression
+    artifacts of video frames - confirm against the caller.
+
+    Args:
+        image: RGB image array.
+        crf: Quality setting for the encode. ``0`` skips the round trip.
+
+    Returns:
+        ``image`` itself when ``crf == 0``; otherwise the decoded frame.
+        Because the encoder floors height and width to even values, the
+        result can be one row or column smaller than the input.
+    """
+    if crf == 0:
+        return image
+
+    # Encode into an in-memory buffer and keep only the resulting bytes.
+    with BytesIO() as encoded:
+        encode_single_frame(encoded, image, crf)
+        video_bytes = encoded.getvalue()
+
+    # Decode from a fresh buffer positioned at the start of those bytes.
+    with BytesIO(video_bytes) as video_file:
+        return decode_single_frame(video_file)