From f4f991d4096d5a3753a7ca7524f54fd0988e413b Mon Sep 17 00:00:00 2001
From: mi804 <1576993271@qq.com>
Date: Mon, 2 Feb 2026 19:53:07 +0800
Subject: [PATCH] support ltx-2 t2v and i2v
---
README.md | 89 ++++++++++++++
README_zh.md | 89 ++++++++++++++
diffsynth/models/ltx2_dit.py | 4 +
diffsynth/models/ltx2_video_vae.py | 3 -
diffsynth/pipelines/ltx2_audio_video.py | 98 +++++++++++++---
.../data/{media_io.py => media_io_ltx2.py} | 42 +++++++
docs/en/Model_Details/LTX-2.md | 109 ++++++++++++++++++
docs/zh/Model_Details/LTX-2.md | 109 ++++++++++++++++++
.../LTX-2-I2AV-DistilledPipeline.py | 69 +++++++++++
.../model_inference/LTX-2-I2AV-OneStage.py | 55 +++++++++
.../model_inference/LTX-2-I2AV-TwoStage.py | 72 ++++++++++++
.../LTX-2-T2AV-DistilledPipeline.py | 4 +-
.../model_inference/LTX-2-T2AV-OneStage.py | 4 +-
.../model_inference/LTX-2-T2AV-TwoStage.py | 4 +-
.../LTX-2-I2AV-DistilledPipeline.py | 70 +++++++++++
.../LTX-2-I2AV-OneStage.py | 56 +++++++++
.../LTX-2-I2AV-TwoStage.py | 72 ++++++++++++
.../LTX-2-T2AV-DistilledPipeline.py | 58 ++++++++++
.../LTX-2-T2AV-OneStage.py | 43 +++++++
.../LTX-2-T2AV-TwoStage.py | 59 ++++++++++
20 files changed, 1084 insertions(+), 25 deletions(-)
rename diffsynth/utils/data/{media_io.py => media_io_ltx2.py} (69%)
create mode 100644 docs/en/Model_Details/LTX-2.md
create mode 100644 docs/zh/Model_Details/LTX-2.md
create mode 100644 examples/ltx2/model_inference/LTX-2-I2AV-DistilledPipeline.py
create mode 100644 examples/ltx2/model_inference/LTX-2-I2AV-OneStage.py
create mode 100644 examples/ltx2/model_inference/LTX-2-I2AV-TwoStage.py
create mode 100644 examples/ltx2/model_inference_low_vram/LTX-2-I2AV-DistilledPipeline.py
create mode 100644 examples/ltx2/model_inference_low_vram/LTX-2-I2AV-OneStage.py
create mode 100644 examples/ltx2/model_inference_low_vram/LTX-2-I2AV-TwoStage.py
create mode 100644 examples/ltx2/model_inference_low_vram/LTX-2-T2AV-DistilledPipeline.py
create mode 100644 examples/ltx2/model_inference_low_vram/LTX-2-T2AV-OneStage.py
create mode 100644 examples/ltx2/model_inference_low_vram/LTX-2-T2AV-TwoStage.py
diff --git a/README.md b/README.md
index 4f3bb97..5e008e0 100644
--- a/README.md
+++ b/README.md
@@ -522,6 +522,95 @@ Example code for FLUX.1 is available at: [/examples/flux/](/examples/flux/)
https://github.com/user-attachments/assets/1d66ae74-3b02-40a9-acc3-ea95fc039314
+#### LTX-2: [/docs/en/Model_Details/LTX-2.md](/docs/en/Model_Details/LTX-2.md)
+
+
+
+Quick Start
+
+Running the following code will quickly load the [Lightricks/LTX-2](https://www.modelscope.cn/models/Lightricks/LTX-2) model for inference. VRAM management is enabled, and the framework automatically adjusts model parameter loading based on available GPU memory. The model can run with as little as 8GB of VRAM.
+
+```python
+import torch
+from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig
+from diffsynth.utils.data.media_io_ltx2 import write_video_audio_ltx2
+
+vram_config = {
+ "offload_dtype": torch.float8_e5m2,
+ "offload_device": "cpu",
+ "onload_dtype": torch.float8_e5m2,
+ "onload_device": "cpu",
+ "preparing_dtype": torch.float8_e5m2,
+ "preparing_device": "cuda",
+ "computation_dtype": torch.bfloat16,
+ "computation_device": "cuda",
+}
+pipe = LTX2AudioVideoPipeline.from_pretrained(
+ torch_dtype=torch.bfloat16,
+ device="cuda",
+ model_configs=[
+ ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config),
+ ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors", **vram_config),
+ ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-spatial-upscaler-x2-1.0.safetensors", **vram_config),
+ ],
+ tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"),
+ stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-distilled-lora-384.safetensors"),
+ vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
+)
+
+prompt = "A girl is very happy, she is speaking: \"I enjoy working with Diffsynth-Studio, it's a perfect framework.\""
+negative_prompt = (
+ "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
+ "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
+ "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, "
+ "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of "
+ "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent "
+ "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny "
+ "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, "
+ "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, "
+ "off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward "
+ "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, "
+ "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
+)
+height, width, num_frames = 512 * 2, 768 * 2, 121
+video, audio = pipe(
+ prompt=prompt,
+ negative_prompt=negative_prompt,
+ seed=43,
+ height=height,
+ width=width,
+ num_frames=num_frames,
+ tiled=True,
+ use_two_stage_pipeline=True,
+)
+write_video_audio_ltx2(
+ video=video,
+ audio=audio,
+ output_path='ltx2_twostage.mp4',
+ fps=24,
+ audio_sample_rate=24000,
+)
+```
+
+
+
+
+
+Examples
+
+Example code for LTX-2 is available at: [/examples/ltx2/](/examples/ltx2/)
+
+| Model ID | Extra Args | Inference | Low-VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
+|-|-|-|-|-|-|-|-|
+|[Lightricks/LTX-2: OneStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-OneStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-OneStage.py)|-|-|-|-|
+|[Lightricks/LTX-2: TwoStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-TwoStage.py)|-|-|-|-|
+|[Lightricks/LTX-2: DistilledPipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-DistilledPipeline.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-DistilledPipeline.py)|-|-|-|-|
+|[Lightricks/LTX-2: OneStagePipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2-I2AV-OneStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-I2AV-OneStage.py)|-|-|-|-|
+|[Lightricks/LTX-2: TwoStagePipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2-I2AV-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-I2AV-TwoStage.py)|-|-|-|-|
+|[Lightricks/LTX-2: DistilledPipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2-I2AV-DistilledPipeline.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-I2AV-DistilledPipeline.py)|-|-|-|-|
+
+
+
#### Wan: [/docs/en/Model_Details/Wan.md](/docs/en/Model_Details/Wan.md)
diff --git a/README_zh.md b/README_zh.md
index a464dab..a1619a5 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -522,6 +522,95 @@ FLUX.1 的示例代码位于:[/examples/flux/](/examples/flux/)
https://github.com/user-attachments/assets/1d66ae74-3b02-40a9-acc3-ea95fc039314
+#### LTX-2: [/docs/zh/Model_Details/LTX-2.md](/docs/zh/Model_Details/LTX-2.md)
+
+
+
+快速开始
+
+运行以下代码可以快速加载 [Lightricks/LTX-2](https://www.modelscope.cn/models/Lightricks/LTX-2) 模型并进行推理。显存管理已启动,框架会自动根据剩余显存控制模型参数的加载,最低 8GB 显存即可运行。
+
+```python
+import torch
+from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig
+from diffsynth.utils.data.media_io_ltx2 import write_video_audio_ltx2
+
+vram_config = {
+ "offload_dtype": torch.float8_e5m2,
+ "offload_device": "cpu",
+ "onload_dtype": torch.float8_e5m2,
+ "onload_device": "cpu",
+ "preparing_dtype": torch.float8_e5m2,
+ "preparing_device": "cuda",
+ "computation_dtype": torch.bfloat16,
+ "computation_device": "cuda",
+}
+pipe = LTX2AudioVideoPipeline.from_pretrained(
+ torch_dtype=torch.bfloat16,
+ device="cuda",
+ model_configs=[
+ ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config),
+ ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors", **vram_config),
+ ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-spatial-upscaler-x2-1.0.safetensors", **vram_config),
+ ],
+ tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"),
+ stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-distilled-lora-384.safetensors"),
+ vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
+)
+
+prompt = "A girl is very happy, she is speaking: \"I enjoy working with Diffsynth-Studio, it's a perfect framework.\""
+negative_prompt = (
+ "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
+ "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
+ "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, "
+ "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of "
+ "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent "
+ "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny "
+ "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, "
+ "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, "
+ "off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward "
+ "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, "
+ "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
+)
+height, width, num_frames = 512 * 2, 768 * 2, 121
+video, audio = pipe(
+ prompt=prompt,
+ negative_prompt=negative_prompt,
+ seed=43,
+ height=height,
+ width=width,
+ num_frames=num_frames,
+ tiled=True,
+ use_two_stage_pipeline=True,
+)
+write_video_audio_ltx2(
+ video=video,
+ audio=audio,
+ output_path='ltx2_twostage.mp4',
+ fps=24,
+ audio_sample_rate=24000,
+)
+```
+
+
+
+
+
+示例代码
+
+LTX-2 的示例代码位于:[/examples/ltx2/](/examples/ltx2/)
+
+|模型 ID|额外参数|推理|低显存推理|全量训练|全量训练后验证|LoRA 训练|LoRA 训练后验证|
+|-|-|-|-|-|-|-|-|
+|[Lightricks/LTX-2: OneStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-OneStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-OneStage.py)|-|-|-|-|
+|[Lightricks/LTX-2: TwoStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-TwoStage.py)|-|-|-|-|
+|[Lightricks/LTX-2: DistilledPipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-DistilledPipeline.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-DistilledPipeline.py)|-|-|-|-|
+|[Lightricks/LTX-2: OneStagePipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2-I2AV-OneStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-I2AV-OneStage.py)|-|-|-|-|
+|[Lightricks/LTX-2: TwoStagePipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2-I2AV-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-I2AV-TwoStage.py)|-|-|-|-|
+|[Lightricks/LTX-2: DistilledPipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2-I2AV-DistilledPipeline.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-I2AV-DistilledPipeline.py)|-|-|-|-|
+
+
+
#### Wan: [/docs/zh/Model_Details/Wan.md](/docs/zh/Model_Details/Wan.md)
diff --git a/diffsynth/models/ltx2_dit.py b/diffsynth/models/ltx2_dit.py
index 43a27f6..5f9f856 100644
--- a/diffsynth/models/ltx2_dit.py
+++ b/diffsynth/models/ltx2_dit.py
@@ -1442,6 +1442,10 @@ class LTXModel(torch.nn.Module):
return vx, ax
def forward(self, video_latents, video_positions, video_context, video_timesteps, audio_latents, audio_positions, audio_context, audio_timesteps):
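+        # Use the larger of the video/audio positional-embedding max positions for the cross-modal preprocessors when both modalities are enabled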
+ cross_pe_max_pos = None
+ if self.model_type.is_video_enabled() and self.model_type.is_audio_enabled():
+ cross_pe_max_pos = max(self.positional_embedding_max_pos[0], self.audio_positional_embedding_max_pos[0])
+ self._init_preprocessors(cross_pe_max_pos)
video = Modality(video_latents, video_timesteps, video_positions, video_context)
audio = Modality(audio_latents, audio_timesteps, audio_positions, audio_context)
vx, ax = self._forward(video=video, audio=audio, perturbations=None)
diff --git a/diffsynth/models/ltx2_video_vae.py b/diffsynth/models/ltx2_video_vae.py
index ebc9483..0c99432 100644
--- a/diffsynth/models/ltx2_video_vae.py
+++ b/diffsynth/models/ltx2_video_vae.py
@@ -1648,11 +1648,8 @@ class LTX2VideoEncoder(nn.Module):
tile_overlap_in_pixels: Optional[int] = 128,
**kwargs,
) -> torch.Tensor:
- device = next(self.parameters()).device
- vae_dtype = next(self.parameters()).dtype
if video.ndim == 4:
video = video.unsqueeze(0) # [C, F, H, W] -> [B, C, F, H, W]
- video = video.to(device=device, dtype=vae_dtype)
# Choose encoding method based on tiling flag
if tiled:
latents = self.tiled_encode_video(
diff --git a/diffsynth/pipelines/ltx2_audio_video.py b/diffsynth/pipelines/ltx2_audio_video.py
index 4a96e36..25f6ecd 100644
--- a/diffsynth/pipelines/ltx2_audio_video.py
+++ b/diffsynth/pipelines/ltx2_audio_video.py
@@ -8,9 +8,7 @@ import numpy as np
from PIL import Image
from tqdm import tqdm
from typing import Optional
-from typing_extensions import Literal
from transformers import AutoImageProcessor, Gemma3Processor
-import einops
from ..core.device.npu_compatible_device import get_device_type
from ..diffusion import FlowMatchScheduler
@@ -23,6 +21,7 @@ from ..models.ltx2_video_vae import LTX2VideoEncoder, LTX2VideoDecoder, VideoLat
from ..models.ltx2_audio_vae import LTX2AudioEncoder, LTX2AudioDecoder, LTX2Vocoder, AudioPatchifier
from ..models.ltx2_upsampler import LTX2LatentUpsampler
from ..models.ltx2_common import VideoLatentShape, AudioLatentShape, VideoPixelShape, get_pixel_coords, VIDEO_SCALE_FACTORS
+from ..utils.data.media_io_ltx2 import ltx2_preprocess
class LTX2AudioVideoPipeline(BasePipeline):
@@ -59,6 +58,7 @@ class LTX2AudioVideoPipeline(BasePipeline):
LTX2AudioVideoUnit_PromptEmbedder(),
LTX2AudioVideoUnit_NoiseInitializer(),
LTX2AudioVideoUnit_InputVideoEmbedder(),
+ LTX2AudioVideoUnit_InputImagesEmbedder(),
]
self.model_fn = model_fn_ltx2
@@ -124,13 +124,22 @@ class LTX2AudioVideoPipeline(BasePipeline):
def stage2_denoise(self, inputs_shared, inputs_posi, inputs_nega, progress_bar_cmd=tqdm):
if inputs_shared["use_two_stage_pipeline"]:
latent = self.video_vae_encoder.per_channel_statistics.un_normalize(inputs_shared["video_latents"])
- self.load_models_to_device(self.in_iteration_models + ('upsampler',))
+            self.load_models_to_device(['upsampler'])
latent = self.upsampler(latent)
latent = self.video_vae_encoder.per_channel_statistics.normalize(latent)
self.scheduler.set_timesteps(special_case="stage2")
inputs_shared.update({k.replace("stage2_", ""): v for k, v in inputs_shared.items() if k.startswith("stage2_")})
- inputs_shared["video_latents"] = self.scheduler.sigmas[0] * inputs_shared["video_noise"] + (1 - self.scheduler.sigmas[0]) * latent
- inputs_shared["audio_latents"] = self.scheduler.sigmas[0] * inputs_shared["audio_noise"] + (1 - self.scheduler.sigmas[0]) * inputs_shared["audio_latents"]
+ denoise_mask_video = 1.0
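+            # If reference images were provided, re-inject their latents into the upsampled stage-2 latent and rebuild the denoise mask before re-noising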
+ if inputs_shared.get("input_images", None) is not None:
+ latent, denoise_mask_video, initial_latents = self.apply_input_images_to_latents(
+ latent, inputs_shared.pop("input_latents"), inputs_shared["input_images_indexes"],
+ inputs_shared["input_images_strength"], latent.clone())
+ inputs_shared.update({"input_latents_video": initial_latents, "denoise_mask_video": denoise_mask_video})
+ inputs_shared["video_latents"] = self.scheduler.sigmas[0] * denoise_mask_video * inputs_shared[
+ "video_noise"] + (1 - self.scheduler.sigmas[0] * denoise_mask_video) * latent
+ inputs_shared["audio_latents"] = self.scheduler.sigmas[0] * inputs_shared["audio_noise"] + (
+ 1 - self.scheduler.sigmas[0]) * inputs_shared["audio_latents"]
+
self.load_models_to_device(self.in_iteration_models)
if not inputs_shared["use_distilled_pipeline"]:
self.load_lora(self.dit, self.stage2_lora_path, alpha=0.8)
@@ -142,7 +151,8 @@ class LTX2AudioVideoPipeline(BasePipeline):
**models, timestep=timestep, progress_id=progress_id
)
inputs_shared["video_latents"] = self.step(self.scheduler, inputs_shared["video_latents"], progress_id=progress_id,
- noise_pred=noise_pred_video, **inputs_shared)
+ noise_pred=noise_pred_video, inpaint_mask=inputs_shared.get("denoise_mask_video", None),
+ input_latents=inputs_shared.get("input_latents_video", None), **inputs_shared)
inputs_shared["audio_latents"] = self.step(self.scheduler, inputs_shared["audio_latents"], progress_id=progress_id,
noise_pred=noise_pred_audio, **inputs_shared)
return inputs_shared
@@ -154,8 +164,10 @@ class LTX2AudioVideoPipeline(BasePipeline):
prompt: str,
negative_prompt: Optional[str] = "",
# Image-to-video
- input_image: Optional[Image.Image] = None,
denoising_strength: float = 1.0,
+ input_images: Optional[list[Image.Image]] = None,
+ input_images_indexes: Optional[list[int]] = None,
+ input_images_strength: Optional[float] = 1.0,
# Randomness
seed: Optional[int] = None,
rand_device: Optional[str] = "cpu",
@@ -191,7 +203,7 @@ class LTX2AudioVideoPipeline(BasePipeline):
"negative_prompt": negative_prompt,
}
inputs_shared = {
- "input_image": input_image,
+ "input_images": input_images, "input_images_indexes": input_images_indexes, "input_images_strength": input_images_strength,
"seed": seed, "rand_device": rand_device,
"height": height, "width": width, "num_frames": num_frames,
"cfg_scale": cfg_scale, "cfg_merge": cfg_merge,
@@ -212,8 +224,8 @@ class LTX2AudioVideoPipeline(BasePipeline):
self.model_fn, cfg_scale, inputs_shared, inputs_posi, inputs_nega,
**models, timestep=timestep, progress_id=progress_id
)
- inputs_shared["video_latents"] = self.step(self.scheduler, inputs_shared["video_latents"], progress_id=progress_id,
- noise_pred=noise_pred_video, **inputs_shared)
+ inputs_shared["video_latents"] = self.step(self.scheduler, inputs_shared["video_latents"], progress_id=progress_id, noise_pred=noise_pred_video,
+ inpaint_mask=inputs_shared.get("denoise_mask_video", None), input_latents=inputs_shared.get("input_latents_video", None), **inputs_shared)
inputs_shared["audio_latents"] = self.step(self.scheduler, inputs_shared["audio_latents"], progress_id=progress_id,
noise_pred=noise_pred_audio, **inputs_shared)
@@ -223,13 +235,25 @@ class LTX2AudioVideoPipeline(BasePipeline):
# Decode
self.load_models_to_device(['video_vae_decoder'])
video = self.video_vae_decoder.decode(inputs_shared["video_latents"], tiled, tile_size_in_pixels,
- tile_overlap_in_pixels, tile_size_in_frames, tile_overlap_in_frames)
+ tile_overlap_in_pixels, tile_size_in_frames, tile_overlap_in_frames)
video = self.vae_output_to_video(video)
self.load_models_to_device(['audio_vae_decoder', 'audio_vocoder'])
decoded_audio = self.audio_vae_decoder(inputs_shared["audio_latents"])
decoded_audio = self.audio_vocoder(decoded_audio).squeeze(0).float()
return video, decoded_audio
+ def apply_input_images_to_latents(self, latents, input_latents, input_indexes, input_strength, initial_latents=None, num_frames=121):
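+        # Write the encoded reference-image latents into the video latents at their mapped frame positions and build a denoise mask (0 = keep the reference latent, 1 = fully denoise)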
+ b, _, f, h, w = latents.shape
+ denoise_mask = torch.ones((b, 1, f, h, w), dtype=latents.dtype, device=latents.device)
+ initial_latents = torch.zeros_like(latents) if initial_latents is None else initial_latents
+ for idx, input_latent in zip(input_indexes, input_latents):
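+            # Map the pixel-frame index to a latent-frame index (8 pixel frames per latent frame; frame 0 maps to latent frame 0), clamped to the latent range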
+ idx = min(max(1 + (idx-1) // 8, 0), f - 1)
+ input_latent = input_latent.to(dtype=latents.dtype, device=latents.device)
+ initial_latents[:, :, idx:idx + input_latent.shape[2], :, :] = input_latent
+ denoise_mask[:, :, idx:idx + input_latent.shape[2], :, :] = 1.0 - input_strength
+ latents = latents * denoise_mask + initial_latents * (1.0 - denoise_mask)
+ return latents, denoise_mask, initial_latents
+
class LTX2AudioVideoUnit_PipelineChecker(PipelineUnit):
def __init__(self):
@@ -246,7 +270,7 @@ class LTX2AudioVideoUnit_PipelineChecker(PipelineUnit):
if not (hasattr(pipe, "stage2_lora_path") and pipe.stage2_lora_path is not None):
raise ValueError("Two-stage pipeline requested, but stage2_lora_path is not set in the pipeline.")
if not (hasattr(pipe, "upsampler") and pipe.upsampler is not None):
- raise ValueError("Two-stage pipeline requested, but upsampler model is not loaded in the pipeline.")
+ raise ValueError("Two-stage pipeline requested, but upsampler model is not loaded in the pipeline.")
return inputs_shared, inputs_posi, inputs_nega
@@ -459,6 +483,44 @@ class LTX2AudioVideoUnit_InputVideoEmbedder(PipelineUnit):
# TODO: implement video-to-video
raise NotImplementedError("Video-to-video not implemented yet.")
+class LTX2AudioVideoUnit_InputImagesEmbedder(PipelineUnit):
+ def __init__(self):
+ super().__init__(
+ input_params=("input_images", "input_images_indexes", "input_images_strength", "video_latents", "height", "width", "num_frames", "tiled", "tile_size_in_pixels", "tile_overlap_in_pixels", "use_two_stage_pipeline"),
+            output_params=("video_latents",),
+            onload_model_names=("video_vae_encoder",),
+ )
+
+ def get_image_latent(self, pipe, input_image, height, width, tiled, tile_size_in_pixels, tile_overlap_in_pixels):
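+        # Compress-decompress the reference image (ltx2_preprocess), normalize it to [-1, 1], and VAE-encode it as a single-frame latent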
+ image = ltx2_preprocess(np.array(input_image.resize((width, height))))
+ image = torch.Tensor(np.array(image, dtype=np.float32)).to(dtype=pipe.torch_dtype, device=pipe.device)
+ image = image / 127.5 - 1.0
+        image = repeat(image, "H W C -> B C F H W", B=1, F=1)
+ latent = pipe.video_vae_encoder.encode(image, tiled, tile_size_in_pixels, tile_overlap_in_pixels).to(pipe.device)
+ return latent
+
+ def process(self, pipe: LTX2AudioVideoPipeline, input_images, input_images_indexes, input_images_strength, video_latents, height, width, num_frames, tiled, tile_size_in_pixels, tile_overlap_in_pixels, use_two_stage_pipeline=False):
+ if input_images is None or len(input_images) == 0:
+ return {"video_latents": video_latents}
+ else:
+ pipe.load_models_to_device(self.onload_model_names)
+ output_dicts = {}
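+            # Stage 1 runs at half resolution when the two-stage pipeline applies the 2x spatial upsampler in stage 2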
+ stage1_height = height // 2 if use_two_stage_pipeline else height
+ stage1_width = width // 2 if use_two_stage_pipeline else width
+ stage1_latents = [
+ self.get_image_latent(pipe, img, stage1_height, stage1_width, tiled, tile_size_in_pixels,
+ tile_overlap_in_pixels) for img in input_images
+ ]
+ video_latents, denoise_mask_video, initial_latents = pipe.apply_input_images_to_latents(video_latents, stage1_latents, input_images_indexes, input_images_strength, num_frames=num_frames)
+ output_dicts.update({"video_latents": video_latents, "denoise_mask_video": denoise_mask_video, "input_latents_video": initial_latents})
+ if use_two_stage_pipeline:
+ stage2_latents = [
+ self.get_image_latent(pipe, img, height, width, tiled, tile_size_in_pixels,
+ tile_overlap_in_pixels) for img in input_images
+ ]
+ output_dicts.update({"stage2_input_latents": stage2_latents})
+ return output_dicts
+
def model_fn_ltx2(
dit: LTXModel,
@@ -471,19 +533,23 @@ def model_fn_ltx2(
audio_positions=None,
audio_patchifier=None,
timestep=None,
+ denoise_mask_video=None,
use_gradient_checkpointing=False,
use_gradient_checkpointing_offload=False,
**kwargs,
):
+ timestep = timestep.float() / 1000.
+
# patchify
b, c_v, f, h, w = video_latents.shape
- _, c_a, _, mel_bins = audio_latents.shape
video_latents = video_patchifier.patchify(video_latents)
- audio_latents = audio_patchifier.patchify(audio_latents)
- #TODO: support gradient checkpointing
- timestep = timestep.float() / 1000.
video_timesteps = timestep.repeat(1, video_latents.shape[1], 1)
+ if denoise_mask_video is not None:
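+        # Scale the per-token timesteps by the denoise mask so that fully conditioned tokens (mask 0) receive timestep 0 and are treated as clean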
+ video_timesteps = video_patchifier.patchify(denoise_mask_video) * video_timesteps
+ _, c_a, _, mel_bins = audio_latents.shape
+ audio_latents = audio_patchifier.patchify(audio_latents)
audio_timesteps = timestep.repeat(1, audio_latents.shape[1], 1)
+ #TODO: support gradient checkpointing in training
vx, ax = dit(
video_latents=video_latents,
video_positions=video_positions,
diff --git a/diffsynth/utils/data/media_io.py b/diffsynth/utils/data/media_io_ltx2.py
similarity index 69%
rename from diffsynth/utils/data/media_io.py
rename to diffsynth/utils/data/media_io_ltx2.py
index 0450186..5526ca9 100644
--- a/diffsynth/utils/data/media_io.py
+++ b/diffsynth/utils/data/media_io_ltx2.py
@@ -4,6 +4,9 @@ import torch
import av
from tqdm import tqdm
from PIL import Image
+import numpy as np
+from io import BytesIO
+from collections.abc import Generator, Iterator
def _resample_audio(
@@ -105,3 +108,42 @@ def write_video_audio_ltx2(
_write_audio(container, audio_stream, audio, audio_sample_rate)
container.close()
+
+
+def encode_single_frame(output_file: str, image_array: np.ndarray, crf: float) -> None:
+ container = av.open(output_file, "w", format="mp4")
+ try:
+ stream = container.add_stream("libx264", rate=1, options={"crf": str(crf), "preset": "veryfast"})
+ # Round to nearest multiple of 2 for compatibility with video codecs
+ height = image_array.shape[0] // 2 * 2
+ width = image_array.shape[1] // 2 * 2
+ image_array = image_array[:height, :width]
+ stream.height = height
+ stream.width = width
+ av_frame = av.VideoFrame.from_ndarray(image_array, format="rgb24").reformat(format="yuv420p")
+ container.mux(stream.encode(av_frame))
+ container.mux(stream.encode())
+ finally:
+ container.close()
+
+
+def decode_single_frame(video_file: str) -> np.ndarray:
+ container = av.open(video_file)
+ try:
+ stream = next(s for s in container.streams if s.type == "video")
+ frame = next(container.decode(stream))
+ finally:
+ container.close()
+ return frame.to_ndarray(format="rgb24")
+
+
+def ltx2_preprocess(image: np.ndarray, crf: float = 33) -> np.ndarray:
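+    # Round-trip the image through a single H.264-encoded frame at the given CRF; crf=0 returns the input unchanged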
+ if crf == 0:
+ return image
+
+ with BytesIO() as output_file:
+ encode_single_frame(output_file, image, crf)
+ video_bytes = output_file.getvalue()
+ with BytesIO(video_bytes) as video_file:
+ image_array = decode_single_frame(video_file)
+ return image_array
diff --git a/docs/en/Model_Details/LTX-2.md b/docs/en/Model_Details/LTX-2.md
new file mode 100644
index 0000000..8652914
--- /dev/null
+++ b/docs/en/Model_Details/LTX-2.md
@@ -0,0 +1,109 @@
+# LTX-2
+
+LTX-2 is a series of audio-video generation models developed by Lightricks.
+
+## Installation
+
+Before using this project for model inference and training, please install DiffSynth-Studio first.
+
+```shell
+git clone https://github.com/modelscope/DiffSynth-Studio.git
+cd DiffSynth-Studio
+pip install -e .
+```
+
+For more information about installation, please refer to [Installation Dependencies](/docs/en/Pipeline_Usage/Setup.md).
+
+## Quick Start
+
+Run the following code to quickly load the [Lightricks/LTX-2](https://www.modelscope.cn/models/Lightricks/LTX-2) model and perform inference. VRAM management is enabled, and the framework automatically adjusts model parameter loading based on the remaining VRAM; the model can run with as little as 8GB of VRAM.
+
+```python
+import torch
+from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig
+from diffsynth.utils.data.media_io_ltx2 import write_video_audio_ltx2
+
+vram_config = {
+ "offload_dtype": torch.float8_e5m2,
+ "offload_device": "cpu",
+ "onload_dtype": torch.float8_e5m2,
+ "onload_device": "cpu",
+ "preparing_dtype": torch.float8_e5m2,
+ "preparing_device": "cuda",
+ "computation_dtype": torch.bfloat16,
+ "computation_device": "cuda",
+}
+pipe = LTX2AudioVideoPipeline.from_pretrained(
+ torch_dtype=torch.bfloat16,
+ device="cuda",
+ model_configs=[
+ ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config),
+ ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors", **vram_config),
+ ],
+ tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"),
+ vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
+)
+prompt = "A girl is very happy, she is speaking: \"I enjoy working with Diffsynth-Studio, it's a perfect framework.\""
+negative_prompt = "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
+height, width, num_frames = 512, 768, 121
+video, audio = pipe(
+ prompt=prompt,
+ negative_prompt=negative_prompt,
+ seed=43,
+ height=height,
+ width=width,
+ num_frames=num_frames,
+ tiled=True,
+)
+write_video_audio_ltx2(
+ video=video,
+ audio=audio,
+ output_path='ltx2_onestage.mp4',
+ fps=24,
+ audio_sample_rate=24000,
+)
+```
+
+## Model Overview
+|Model ID|Additional Parameters|Inference|Low VRAM Inference|Full Training|Validation After Full Training|LoRA Training|Validation After LoRA Training|
+|-|-|-|-|-|-|-|-|
+|[Lightricks/LTX-2: OneStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-OneStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-OneStage.py)|-|-|-|-|
+|[Lightricks/LTX-2: TwoStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-TwoStage.py)|-|-|-|-|
+|[Lightricks/LTX-2: DistilledPipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-DistilledPipeline.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-DistilledPipeline.py)|-|-|-|-|
+|[Lightricks/LTX-2: OneStagePipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2-I2AV-OneStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-I2AV-OneStage.py)|-|-|-|-|
+|[Lightricks/LTX-2: TwoStagePipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2-I2AV-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-I2AV-TwoStage.py)|-|-|-|-|
+|[Lightricks/LTX-2: DistilledPipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2-I2AV-DistilledPipeline.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-I2AV-DistilledPipeline.py)|-|-|-|-|
+
+## Model Inference
+
+Models are loaded through `LTX2AudioVideoPipeline.from_pretrained`, see [Loading Models](/docs/en/Pipeline_Usage/Model_Inference.md#loading-models) for details.
+
+Input parameters for `LTX2AudioVideoPipeline` inference include (a minimal image-to-video call is sketched after this list):
+
+* `prompt`: Prompt describing the content appearing in the video.
+* `negative_prompt`: Negative prompt describing content that should not appear in the video, default value is `""`.
+* `cfg_scale`: Classifier-free guidance parameter, default value is 3.0.
+* `input_images`: List of input images for image-to-video generation.
+* `input_images_indexes`: List of frame indexes at which the input images are placed in the video.
+* `input_images_strength`: Conditioning strength of the input images; 1.0 fully preserves the given frames. Default value is 1.0.
+* `denoising_strength`: Denoising strength, range is 0~1, default value is 1.0.
+* `seed`: Random seed. Default is `None`, which means completely random.
+* `rand_device`: Computing device for generating random Gaussian noise matrix, default is `"cpu"`. When set to `cuda`, different results will be generated on different GPUs.
+* `height`: Video height, must be a multiple of 32 (single-stage) or 64 (two-stage).
+* `width`: Video width, must be a multiple of 32 (single-stage) or 64 (two-stage).
+* `num_frames`: Number of video frames, default value is 121; must be a multiple of 8 plus 1 (i.e., of the form 8k + 1).
+* `num_inference_steps`: Number of inference steps, default value is 40.
+* `tiled`: Whether to enable tiled VAE inference, default is `True`. When enabled, it significantly reduces VRAM usage during the VAE encoding/decoding stages, at the cost of slight numerical error and a small increase in inference time.
+* `tile_size_in_pixels`: Pixel tiling size during VAE encoding/decoding stages, default is 512.
+* `tile_overlap_in_pixels`: Pixel tiling overlap size during VAE encoding/decoding stages, default is 128.
+* `tile_size_in_frames`: Frame tiling size during VAE encoding/decoding stages, default is 128.
+* `tile_overlap_in_frames`: Frame tiling overlap size during VAE encoding/decoding stages, default is 24.
+* `use_two_stage_pipeline`: Whether to use two-stage pipeline, default is `False`.
+* `use_distilled_pipeline`: Whether to use distilled pipeline, default is `False`.
+* `progress_bar_cmd`: Progress bar, default is `tqdm.tqdm`. Can be set to `lambda x:x` to hide the progress bar.
+
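+A minimal image-to-video call using these parameters is sketched below. It assumes `pipe` has been constructed as in the Quick Start above and that `first_frame.jpg` is a local reference image; the path, prompt, and sizes are illustrative, and the complete I2AV scripts are listed in the table above.
+
+```python
+from PIL import Image
+
+height, width, num_frames = 512, 768, 121  # multiples of 32, and 8k + 1 frames
+image = Image.open("first_frame.jpg").convert("RGB").resize((width, height))
+video, audio = pipe(
+    prompt="A girl is very happy, she is speaking.",
+    seed=43,
+    height=height,
+    width=width,
+    num_frames=num_frames,
+    tiled=True,
+    input_images=[image],       # condition on a single reference image
+    input_images_indexes=[0],   # place it at frame 0 (the first frame)
+    input_images_strength=1.0,  # 1.0 keeps the reference frame unchanged
+)
+```
+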
+If VRAM is insufficient, please enable [VRAM Management](/docs/en/Pipeline_Usage/VRAM_management.md). We provide recommended low-VRAM configurations for each model in the example code; see the table in the "Model Overview" section above.
+
+## Model Training
+
+The LTX-2 series models currently do not support training functionality. We will add related support as soon as possible.
diff --git a/docs/zh/Model_Details/LTX-2.md b/docs/zh/Model_Details/LTX-2.md
new file mode 100644
index 0000000..a50f47b
--- /dev/null
+++ b/docs/zh/Model_Details/LTX-2.md
@@ -0,0 +1,109 @@
+# LTX-2
+
+LTX-2 是由 Lightricks 开发的音视频生成模型系列。
+
+## 安装
+
+在使用本项目进行模型推理和训练前,请先安装 DiffSynth-Studio。
+
+```shell
+git clone https://github.com/modelscope/DiffSynth-Studio.git
+cd DiffSynth-Studio
+pip install -e .
+```
+
+更多关于安装的信息,请参考[安装依赖](/docs/zh/Pipeline_Usage/Setup.md)。
+
+## 快速开始
+
+运行以下代码可以快速加载 [Lightricks/LTX-2](https://www.modelscope.cn/models/Lightricks/LTX-2) 模型并进行推理。显存管理已启动,框架会自动根据剩余显存控制模型参数的加载,最低 8GB 显存即可运行。
+
+```python
+import torch
+from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig
+from diffsynth.utils.data.media_io_ltx2 import write_video_audio_ltx2
+
+vram_config = {
+ "offload_dtype": torch.float8_e5m2,
+ "offload_device": "cpu",
+ "onload_dtype": torch.float8_e5m2,
+ "onload_device": "cpu",
+ "preparing_dtype": torch.float8_e5m2,
+ "preparing_device": "cuda",
+ "computation_dtype": torch.bfloat16,
+ "computation_device": "cuda",
+}
+pipe = LTX2AudioVideoPipeline.from_pretrained(
+ torch_dtype=torch.bfloat16,
+ device="cuda",
+ model_configs=[
+ ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config),
+ ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors", **vram_config),
+ ],
+ tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"),
+ vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
+)
+prompt = "A girl is very happy, she is speaking: “I enjoy working with Diffsynth-Studio, it's a perfect framework.”"
+negative_prompt = "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
+height, width, num_frames = 512, 768, 121
+video, audio = pipe(
+ prompt=prompt,
+ negative_prompt=negative_prompt,
+ seed=43,
+ height=height,
+ width=width,
+ num_frames=num_frames,
+ tiled=True,
+)
+write_video_audio_ltx2(
+ video=video,
+ audio=audio,
+ output_path='ltx2_onestage.mp4',
+ fps=24,
+ audio_sample_rate=24000,
+)
+```
+
+## 模型总览
+|模型 ID|额外参数|推理|低显存推理|全量训练|全量训练后验证|LoRA 训练|LoRA 训练后验证|
+|-|-|-|-|-|-|-|-|
+|[Lightricks/LTX-2: OneStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-OneStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-OneStage.py)|-|-|-|-|
+|[Lightricks/LTX-2: TwoStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-TwoStage.py)|-|-|-|-|
+|[Lightricks/LTX-2: DistilledPipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-DistilledPipeline.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-DistilledPipeline.py)|-|-|-|-|
+|[Lightricks/LTX-2: OneStagePipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2-I2AV-OneStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-I2AV-OneStage.py)|-|-|-|-|
+|[Lightricks/LTX-2: TwoStagePipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2-I2AV-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-I2AV-TwoStage.py)|-|-|-|-|
+|[Lightricks/LTX-2: DistilledPipeline-I2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)|`input_images`|[code](/examples/ltx2/model_inference/LTX-2-I2AV-DistilledPipeline.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-I2AV-DistilledPipeline.py)|-|-|-|-|
+
+## 模型推理
+
+模型通过 `LTX2AudioVideoPipeline.from_pretrained` 加载,详见[加载模型](/docs/zh/Pipeline_Usage/Model_Inference.md#加载模型)。
+
+`LTX2AudioVideoPipeline` 推理的输入参数包括(列表后附有一个最小的图生视频调用示例):
+
+* `prompt`: 提示词,描述视频中出现的内容。
+* `negative_prompt`: 负向提示词,描述视频中不应该出现的内容,默认值为 `""`。
+* `cfg_scale`: Classifier-free guidance 的参数,默认值为 3.0。
+* `input_images`: 输入图像列表,用于图生视频。
+* `input_images_indexes`: 输入图像在视频中的帧索引列表。
+* `input_images_strength`: 输入图像的强度,默认值为 1.0。
+* `denoising_strength`: 去噪强度,范围是 0~1,默认值为 1.0。
+* `seed`: 随机种子。默认为 `None`,即完全随机。
+* `rand_device`: 生成随机高斯噪声矩阵的计算设备,默认为 `"cpu"`。当设置为 `cuda` 时,在不同 GPU 上会导致不同的生成结果。
+* `height`: 视频高度,需保证高度为 32 的倍数(单阶段)或 64 的倍数(两阶段)。
+* `width`: 视频宽度,需保证宽度为 32 的倍数(单阶段)或 64 的倍数(两阶段)。
+* `num_frames`: 视频帧数,默认值为 121,需为 8 的倍数加 1(即 8k + 1 的形式)。
+* `num_inference_steps`: 推理次数,默认值为 40。
+* `tiled`: 是否启用 VAE 分块推理,默认为 `True`。设置为 `True` 时可显著减少 VAE 编解码阶段的显存占用,会产生少许误差,以及少量推理时间延长。
+* `tile_size_in_pixels`: VAE 编解码阶段的像素分块大小,默认为 512。
+* `tile_overlap_in_pixels`: VAE 编解码阶段的像素分块重叠大小,默认为 128。
+* `tile_size_in_frames`: VAE 编解码阶段的帧分块大小,默认为 128。
+* `tile_overlap_in_frames`: VAE 编解码阶段的帧分块重叠大小,默认为 24。
+* `use_two_stage_pipeline`: 是否使用两阶段管道,默认为 `False`。
+* `use_distilled_pipeline`: 是否使用蒸馏管道,默认为 `False`。
+* `progress_bar_cmd`: 进度条,默认为 `tqdm.tqdm`。可通过设置为 `lambda x:x` 来屏蔽进度条。
+
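+下面给出一个使用上述参数的最小图生视频调用示例。示例假设 `pipe` 已按前文"快速开始"中的方式构建,且本地存在参考图像 `first_frame.jpg`(路径、提示词与分辨率仅作示意,完整脚本见上表):
+
+```python
+from PIL import Image
+
+height, width, num_frames = 512, 768, 121  # 宽高为 32 的倍数,帧数为 8k + 1
+image = Image.open("first_frame.jpg").convert("RGB").resize((width, height))
+video, audio = pipe(
+    prompt="A girl is very happy, she is speaking.",
+    seed=43,
+    height=height,
+    width=width,
+    num_frames=num_frames,
+    tiled=True,
+    input_images=[image],       # 以单张参考图像作为条件
+    input_images_indexes=[0],   # 放置在第 0 帧(首帧)
+    input_images_strength=1.0,  # 1.0 表示完整保留参考帧
+)
+```
+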
+如果显存不足,请开启[显存管理](/docs/zh/Pipeline_Usage/VRAM_management.md),我们在示例代码中提供了每个模型推荐的低显存配置,详见前文「模型总览」中的表格。
+
+## 模型训练
+
+LTX-2 系列模型目前暂不支持训练功能。我们将尽快添加相关支持。
diff --git a/examples/ltx2/model_inference/LTX-2-I2AV-DistilledPipeline.py b/examples/ltx2/model_inference/LTX-2-I2AV-DistilledPipeline.py
new file mode 100644
index 0000000..b8e0811
--- /dev/null
+++ b/examples/ltx2/model_inference/LTX-2-I2AV-DistilledPipeline.py
@@ -0,0 +1,69 @@
+import torch
+from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig
+from diffsynth.utils.data.media_io_ltx2 import write_video_audio_ltx2
+from PIL import Image
+from modelscope import dataset_snapshot_download
+
+vram_config = {
+ "offload_dtype": torch.bfloat16,
+ "offload_device": "cpu",
+ "onload_dtype": torch.bfloat16,
+ "onload_device": "cuda",
+ "preparing_dtype": torch.bfloat16,
+ "preparing_device": "cuda",
+ "computation_dtype": torch.bfloat16,
+ "computation_device": "cuda",
+}
+pipe = LTX2AudioVideoPipeline.from_pretrained(
+ torch_dtype=torch.bfloat16,
+ device="cuda",
+ model_configs=[
+ ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config),
+ ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-distilled.safetensors", **vram_config),
+ ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-spatial-upscaler-x2-1.0.safetensors", **vram_config),
+ ],
+ tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"),
+)
+
+prompt = "A girl is very happy, she is speaking: “I enjoy working with Diffsynth-Studio, it's a perfect framework.”"
+negative_prompt = (
+ "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
+ "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
+ "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, "
+ "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of "
+ "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent "
+ "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny "
+ "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, "
+ "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, "
+ "off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward "
+ "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, "
+ "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
+)
+height, width, num_frames = 512 * 2, 768 * 2, 121
+dataset_snapshot_download(
+ dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+ local_dir="./",
+ allow_file_pattern=["data/examples/ltx-2/first_frame.jpg"]
+)
+image = Image.open("data/examples/ltx-2/first_frame.jpg").convert("RGB").resize((width, height))
+# first frame
+video, audio = pipe(
+ prompt=prompt,
+ negative_prompt=negative_prompt,
+ seed=43,
+ height=height,
+ width=width,
+ num_frames=num_frames,
+ tiled=True,
+ use_distilled_pipeline=True,
+ input_images=[image],
+ input_images_indexes=[0],
+ input_images_strength=1.0,
+)
+write_video_audio_ltx2(
+ video=video,
+ audio=audio,
+ output_path='ltx2_distilled_i2av_first.mp4',
+ fps=24,
+ audio_sample_rate=24000,
+)
diff --git a/examples/ltx2/model_inference/LTX-2-I2AV-OneStage.py b/examples/ltx2/model_inference/LTX-2-I2AV-OneStage.py
new file mode 100644
index 0000000..1614c1a
--- /dev/null
+++ b/examples/ltx2/model_inference/LTX-2-I2AV-OneStage.py
@@ -0,0 +1,55 @@
+import torch
+from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig
+from diffsynth.utils.data.media_io_ltx2 import write_video_audio_ltx2
+from PIL import Image
+from modelscope import dataset_snapshot_download
+
+vram_config = {
+ "offload_dtype": torch.bfloat16,
+ "offload_device": "cpu",
+ "onload_dtype": torch.bfloat16,
+ "onload_device": "cuda",
+ "preparing_dtype": torch.bfloat16,
+ "preparing_device": "cuda",
+ "computation_dtype": torch.bfloat16,
+ "computation_device": "cuda",
+}
+pipe = LTX2AudioVideoPipeline.from_pretrained(
+ torch_dtype=torch.bfloat16,
+ device="cuda",
+ model_configs=[
+ ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config),
+ ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors", **vram_config),
+ ],
+ tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"),
+)
+prompt = "A girl is very happy, she is speaking: “I enjoy working with Diffsynth-Studio, it's a perfect framework.”"
+negative_prompt = "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
+height, width, num_frames = 512 * 2, 768 * 2, 121
+dataset_snapshot_download(
+ dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+ local_dir="./",
+ allow_file_pattern=["data/examples/ltx-2/first_frame.jpg"]
+)
+image = Image.open("data/examples/ltx-2/first_frame.jpg").convert("RGB").resize((width, height))
+# first frame
+video, audio = pipe(
+ prompt=prompt,
+ negative_prompt=negative_prompt,
+ seed=43,
+ height=height,
+ width=width,
+ num_frames=num_frames,
+ tiled=False,
+ input_images=[image],
+ input_images_indexes=[0],
+ input_images_strength=1.0,
+ num_inference_steps=40,
+)
+write_video_audio_ltx2(
+ video=video,
+ audio=audio,
+ output_path='ltx2_onestage_i2av_first.mp4',
+ fps=24,
+ audio_sample_rate=24000,
+)
diff --git a/examples/ltx2/model_inference/LTX-2-I2AV-TwoStage.py b/examples/ltx2/model_inference/LTX-2-I2AV-TwoStage.py
new file mode 100644
index 0000000..e73ef3d
--- /dev/null
+++ b/examples/ltx2/model_inference/LTX-2-I2AV-TwoStage.py
@@ -0,0 +1,72 @@
+import torch
+from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig
+from diffsynth.utils.data.media_io_ltx2 import write_video_audio_ltx2
+from PIL import Image
+from modelscope import dataset_snapshot_download
+
+vram_config = {
+ "offload_dtype": torch.bfloat16,
+ "offload_device": "cpu",
+ "onload_dtype": torch.bfloat16,
+ "onload_device": "cuda",
+ "preparing_dtype": torch.bfloat16,
+ "preparing_device": "cuda",
+ "computation_dtype": torch.bfloat16,
+ "computation_device": "cuda",
+}
+pipe = LTX2AudioVideoPipeline.from_pretrained(
+ torch_dtype=torch.bfloat16,
+ device="cuda",
+ model_configs=[
+ ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config),
+ ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors", **vram_config),
+ ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-spatial-upscaler-x2-1.0.safetensors", **vram_config),
+ ],
+ tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"),
+ stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-distilled-lora-384.safetensors"),
+)
+
+prompt = "A girl is very happy, she is speaking: “I enjoy working with Diffsynth-Studio, it's a perfect framework.”"
+negative_prompt = (
+ "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
+ "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
+ "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, "
+ "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of "
+ "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent "
+ "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny "
+ "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, "
+ "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, "
+ "off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward "
+ "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, "
+ "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
+)
+height, width, num_frames = 512 * 2, 768 * 2, 121
+dataset_snapshot_download(
+ dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+ local_dir="./",
+ allow_file_pattern=["data/examples/ltx-2/first_frame.jpg"]
+)
+image = Image.open("data/examples/ltx-2/first_frame.jpg").convert("RGB").resize((width, height))
+# first frame
+video, audio = pipe(
+ prompt=prompt,
+ negative_prompt=negative_prompt,
+ seed=42,
+ height=height,
+ width=width,
+ num_frames=num_frames,
+ tiled=True,
+ use_two_stage_pipeline=True,
+ num_inference_steps=40,
+ input_images=[image],
+ input_images_indexes=[0],
+ input_images_strength=1.0,
+)
+write_video_audio_ltx2(
+ video=video,
+ audio=audio,
+ output_path='ltx2_twostage_i2av_first.mp4',
+ fps=24,
+ audio_sample_rate=24000,
+)
diff --git a/examples/ltx2/model_inference/LTX-2-T2AV-DistilledPipeline.py b/examples/ltx2/model_inference/LTX-2-T2AV-DistilledPipeline.py
index 8ee36bf..2b87dd3 100644
--- a/examples/ltx2/model_inference/LTX-2-T2AV-DistilledPipeline.py
+++ b/examples/ltx2/model_inference/LTX-2-T2AV-DistilledPipeline.py
@@ -1,6 +1,6 @@
import torch
from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig
-from diffsynth.utils.data.media_io import write_video_audio_ltx2
+from diffsynth.utils.data.media_io_ltx2 import write_video_audio_ltx2
vram_config = {
"offload_dtype": torch.bfloat16,
@@ -23,7 +23,7 @@ pipe = LTX2AudioVideoPipeline.from_pretrained(
tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"),
)
-prompt = "A girl is speaking: “I enjoy working with Diffsynth-Studio, it's a great tool.”"
+prompt = "A girl is very happy, she is speaking: “I enjoy working with Diffsynth-Studio, it's a perfect framework.”"
negative_prompt = (
"blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
"grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
diff --git a/examples/ltx2/model_inference/LTX-2-T2AV-OneStage.py b/examples/ltx2/model_inference/LTX-2-T2AV-OneStage.py
index 4331392..ade78d0 100644
--- a/examples/ltx2/model_inference/LTX-2-T2AV-OneStage.py
+++ b/examples/ltx2/model_inference/LTX-2-T2AV-OneStage.py
@@ -1,6 +1,6 @@
import torch
from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig
-from diffsynth.utils.data.media_io import write_video_audio_ltx2
+from diffsynth.utils.data.media_io_ltx2 import write_video_audio_ltx2
vram_config = {
"offload_dtype": torch.bfloat16,
@@ -21,7 +21,7 @@ pipe = LTX2AudioVideoPipeline.from_pretrained(
],
tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"),
)
-prompt = "A girl is speaking: “I enjoy working with Diffsynth-Studio, it's a great tool.”"
+prompt = "A girl is very happy, she is speaking: “I enjoy working with Diffsynth-Studio, it's a perfect framework.”"
negative_prompt = "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
height, width, num_frames = 512, 768, 121
video, audio = pipe(
diff --git a/examples/ltx2/model_inference/LTX-2-T2AV-TwoStage.py b/examples/ltx2/model_inference/LTX-2-T2AV-TwoStage.py
index 73e68f0..84bbc0c 100644
--- a/examples/ltx2/model_inference/LTX-2-T2AV-TwoStage.py
+++ b/examples/ltx2/model_inference/LTX-2-T2AV-TwoStage.py
@@ -1,6 +1,6 @@
import torch
from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig
-from diffsynth.utils.data.media_io import write_video_audio_ltx2
+from diffsynth.utils.data.media_io_ltx2 import write_video_audio_ltx2
vram_config = {
"offload_dtype": torch.bfloat16,
@@ -24,7 +24,7 @@ pipe = LTX2AudioVideoPipeline.from_pretrained(
stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-distilled-lora-384.safetensors"),
)
-prompt = "A girl is speaking: “I enjoy working with Diffsynth-Studio, it's a great tool.”"
+prompt = "A girl is very happy, she is speaking: “I enjoy working with Diffsynth-Studio, it's a perfect framework.”"
negative_prompt = (
"blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
"grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
diff --git a/examples/ltx2/model_inference_low_vram/LTX-2-I2AV-DistilledPipeline.py b/examples/ltx2/model_inference_low_vram/LTX-2-I2AV-DistilledPipeline.py
new file mode 100644
index 0000000..7020b40
--- /dev/null
+++ b/examples/ltx2/model_inference_low_vram/LTX-2-I2AV-DistilledPipeline.py
@@ -0,0 +1,70 @@
+import torch
+from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig
+from diffsynth.utils.data.media_io_ltx2 import write_video_audio_ltx2
+from PIL import Image
+from modelscope import dataset_snapshot_download
+
+vram_config = {
+ "offload_dtype": torch.float8_e5m2,
+ "offload_device": "cpu",
+ "onload_dtype": torch.float8_e5m2,
+ "onload_device": "cpu",
+ "preparing_dtype": torch.float8_e5m2,
+ "preparing_device": "cuda",
+ "computation_dtype": torch.bfloat16,
+ "computation_device": "cuda",
+}
+pipe = LTX2AudioVideoPipeline.from_pretrained(
+ torch_dtype=torch.bfloat16,
+ device="cuda",
+ model_configs=[
+ ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config),
+ ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-distilled.safetensors", **vram_config),
+ ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-spatial-upscaler-x2-1.0.safetensors", **vram_config),
+ ],
+ tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"),
+ vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
+)
+
+prompt = "A girl is very happy, she is speaking: “I enjoy working with Diffsynth-Studio, it's a perfect framework.”"
+negative_prompt = (
+ "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
+ "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
+ "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, "
+ "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of "
+ "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent "
+ "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny "
+ "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, "
+ "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, "
+ "off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward "
+ "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, "
+ "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
+)
+height, width, num_frames = 512 * 2, 768 * 2, 121
+dataset_snapshot_download(
+ dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+ local_dir="./",
+ allow_file_pattern=["data/examples/ltx-2/first_frame.jpg"]
+)
+image = Image.open("data/examples/ltx-2/first_frame.jpg").convert("RGB").resize((width, height))
+# Use the downloaded image as the first frame (index 0) of the generated clip.
+video, audio = pipe(
+ prompt=prompt,
+ negative_prompt=negative_prompt,
+ seed=43,
+ height=height,
+ width=width,
+ num_frames=num_frames,
+ tiled=True,
+ use_distilled_pipeline=True,
+ input_images=[image],
+ input_images_indexes=[0],
+ input_images_strength=1.0,
+)
+write_video_audio_ltx2(
+ video=video,
+ audio=audio,
+ output_path='ltx2_distilled_i2av_first.mp4',
+ fps=24,
+ audio_sample_rate=24000,
+)
diff --git a/examples/ltx2/model_inference_low_vram/LTX-2-I2AV-OneStage.py b/examples/ltx2/model_inference_low_vram/LTX-2-I2AV-OneStage.py
new file mode 100644
index 0000000..48ca23b
--- /dev/null
+++ b/examples/ltx2/model_inference_low_vram/LTX-2-I2AV-OneStage.py
@@ -0,0 +1,56 @@
+import torch
+from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig
+from diffsynth.utils.data.media_io_ltx2 import write_video_audio_ltx2
+from PIL import Image
+from modelscope import dataset_snapshot_download
+
+vram_config = {
+ "offload_dtype": torch.float8_e5m2,
+ "offload_device": "cpu",
+ "onload_dtype": torch.float8_e5m2,
+ "onload_device": "cpu",
+ "preparing_dtype": torch.float8_e5m2,
+ "preparing_device": "cuda",
+ "computation_dtype": torch.bfloat16,
+ "computation_device": "cuda",
+}
+pipe = LTX2AudioVideoPipeline.from_pretrained(
+ torch_dtype=torch.bfloat16,
+ device="cuda",
+ model_configs=[
+ ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config),
+ ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors", **vram_config),
+ ],
+ tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"),
+ vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
+)
+prompt = "A girl is very happy, she is speaking: “I enjoy working with Diffsynth-Studio, it's a perfect framework.”"
+negative_prompt = (
+    "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
+    "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
+    "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, "
+    "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of "
+    "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent "
+    "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny "
+    "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, "
+    "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, "
+    "off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward "
+    "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, "
+    "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
+)
+height, width, num_frames = 512 * 2, 768 * 2, 121
+dataset_snapshot_download(
+ dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+ local_dir="./",
+ allow_file_pattern=["data/examples/ltx-2/first_frame.jpg"]
+)
+image = Image.open("data/examples/ltx-2/first_frame.jpg").convert("RGB").resize((width, height))
+# Use the downloaded image as the first frame (index 0) of the generated clip.
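+# One-stage generation: only the dev transformer is loaded (no spatial upscaler),
+# and the video is denoised at the target resolution over 40 inference steps.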
+video, audio = pipe(
+ prompt=prompt,
+ negative_prompt=negative_prompt,
+ seed=43,
+ height=height,
+ width=width,
+ num_frames=num_frames,
+ tiled=False,
+ input_images=[image],
+ input_images_indexes=[0],
+ input_images_strength=1.0,
+ num_inference_steps=40,
+)
+write_video_audio_ltx2(
+ video=video,
+ audio=audio,
+ output_path='ltx2_onestage_i2av_first.mp4',
+ fps=24,
+ audio_sample_rate=24000,
+)
diff --git a/examples/ltx2/model_inference_low_vram/LTX-2-I2AV-TwoStage.py b/examples/ltx2/model_inference_low_vram/LTX-2-I2AV-TwoStage.py
new file mode 100644
index 0000000..5411b8c
--- /dev/null
+++ b/examples/ltx2/model_inference_low_vram/LTX-2-I2AV-TwoStage.py
@@ -0,0 +1,72 @@
+import torch
+from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig
+from diffsynth.utils.data.media_io_ltx2 import write_video_audio_ltx2
+from PIL import Image
+from modelscope import dataset_snapshot_download
+
+vram_config = {
+ "offload_dtype": torch.float8_e5m2,
+ "offload_device": "cpu",
+ "onload_dtype": torch.float8_e5m2,
+ "onload_device": "cpu",
+ "preparing_dtype": torch.float8_e5m2,
+ "preparing_device": "cuda",
+ "computation_dtype": torch.bfloat16,
+ "computation_device": "cuda",
+}
+pipe = LTX2AudioVideoPipeline.from_pretrained(
+ torch_dtype=torch.bfloat16,
+ device="cuda",
+ model_configs=[
+ ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config),
+ ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors", **vram_config),
+ ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-spatial-upscaler-x2-1.0.safetensors", **vram_config),
+ ],
+ tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"),
+ stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-distilled-lora-384.safetensors"),
+ vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
+)
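+# Two-stage generation: stage2_lora_config points to the distilled LoRA used for
+# the second (upscaling) stage.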
+
+prompt = "A girl is very happy, she is speaking: “I enjoy working with Diffsynth-Studio, it's a perfect framework.”"
+negative_prompt = (
+ "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
+ "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
+ "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, "
+ "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of "
+ "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent "
+ "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny "
+ "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, "
+ "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, "
+ "off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward "
+ "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, "
+ "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
+)
+height, width, num_frames = 512 * 2, 768 * 2, 121
+dataset_snapshot_download(
+ dataset_id="DiffSynth-Studio/examples_in_diffsynth",
+ local_dir="./",
+ allow_file_pattern=["data/examples/ltx-2/first_frame.jpg"]
+)
+image = Image.open("data/examples/ltx-2/first_frame.jpg").convert("RGB").resize((width, height))
+# Use the downloaded image as the first frame (index 0) of the generated clip.
+video, audio = pipe(
+ prompt=prompt,
+ negative_prompt=negative_prompt,
+ seed=42,
+ height=height,
+ width=width,
+ num_frames=num_frames,
+ tiled=True,
+ use_two_stage_pipeline=True,
+ num_inference_steps=40,
+ input_images=[image],
+ input_images_indexes=[0],
+ input_images_strength=1.0,
+)
+write_video_audio_ltx2(
+ video=video,
+ audio=audio,
+ output_path='ltx2_twostage_i2av_first.mp4',
+ fps=24,
+ audio_sample_rate=24000,
+)
diff --git a/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-DistilledPipeline.py b/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-DistilledPipeline.py
new file mode 100644
index 0000000..d8b6a5d
--- /dev/null
+++ b/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-DistilledPipeline.py
@@ -0,0 +1,58 @@
+import torch
+from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig
+from diffsynth.utils.data.media_io_ltx2 import write_video_audio_ltx2
+
+vram_config = {
+ "offload_dtype": torch.float8_e5m2,
+ "offload_device": "cpu",
+ "onload_dtype": torch.float8_e5m2,
+ "onload_device": "cpu",
+ "preparing_dtype": torch.float8_e5m2,
+ "preparing_device": "cuda",
+ "computation_dtype": torch.bfloat16,
+ "computation_device": "cuda",
+}
+pipe = LTX2AudioVideoPipeline.from_pretrained(
+ torch_dtype=torch.bfloat16,
+ device="cuda",
+ model_configs=[
+ ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config),
+ ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-distilled.safetensors", **vram_config),
+ ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-spatial-upscaler-x2-1.0.safetensors", **vram_config),
+ ],
+ tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"),
+ vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
+)
+
+prompt = "A girl is very happy, she is speaking: “I enjoy working with Diffsynth-Studio, it's a perfect framework.”"
+negative_prompt = (
+ "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
+ "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
+ "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, "
+ "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of "
+ "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent "
+ "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny "
+ "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, "
+ "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, "
+ "off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward "
+ "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, "
+ "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
+)
+height, width, num_frames = 512 * 2, 768 * 2, 121
+video, audio = pipe(
+ prompt=prompt,
+ negative_prompt=negative_prompt,
+ seed=43,
+ height=height,
+ width=width,
+ num_frames=num_frames,
+ tiled=True,
+ use_distilled_pipeline=True,
+)
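+# Write the generated frames and audio track into a single mp4 (24 fps video, 24 kHz audio).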
+write_video_audio_ltx2(
+ video=video,
+ audio=audio,
+ output_path='ltx2_distilled.mp4',
+ fps=24,
+ audio_sample_rate=24000,
+)
diff --git a/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-OneStage.py b/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-OneStage.py
new file mode 100644
index 0000000..894c417
--- /dev/null
+++ b/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-OneStage.py
@@ -0,0 +1,43 @@
+import torch
+from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig
+from diffsynth.utils.data.media_io_ltx2 import write_video_audio_ltx2
+
+vram_config = {
+ "offload_dtype": torch.float8_e5m2,
+ "offload_device": "cpu",
+ "onload_dtype": torch.float8_e5m2,
+ "onload_device": "cpu",
+ "preparing_dtype": torch.float8_e5m2,
+ "preparing_device": "cuda",
+ "computation_dtype": torch.bfloat16,
+ "computation_device": "cuda",
+}
+pipe = LTX2AudioVideoPipeline.from_pretrained(
+ torch_dtype=torch.bfloat16,
+ device="cuda",
+ model_configs=[
+ ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config),
+ ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors", **vram_config),
+ ],
+ tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"),
+ vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
+)
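+# Note: vram_limit above is the total GPU memory in GiB minus a 0.5 GiB safety
+# margin; torch.cuda.mem_get_info returns (free_bytes, total_bytes).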
+prompt = "A girl is very happy, she is speaking: “I enjoy working with Diffsynth-Studio, it's a perfect framework.”"
+negative_prompt = (
+    "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
+    "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
+    "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, "
+    "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of "
+    "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent "
+    "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny "
+    "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, "
+    "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, "
+    "off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward "
+    "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, "
+    "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
+)
+height, width, num_frames = 512, 768, 121
+video, audio = pipe(
+ prompt=prompt,
+ negative_prompt=negative_prompt,
+ seed=43,
+ height=height,
+ width=width,
+ num_frames=num_frames,
+ tiled=True,
+)
+write_video_audio_ltx2(
+ video=video,
+ audio=audio,
+ output_path='ltx2_onestage.mp4',
+ fps=24,
+ audio_sample_rate=24000,
+)
diff --git a/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-TwoStage.py b/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-TwoStage.py
new file mode 100644
index 0000000..65650d0
--- /dev/null
+++ b/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-TwoStage.py
@@ -0,0 +1,59 @@
+import torch
+from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig
+from diffsynth.utils.data.media_io_ltx2 import write_video_audio_ltx2
+
+vram_config = {
+ "offload_dtype": torch.float8_e5m2,
+ "offload_device": "cpu",
+ "onload_dtype": torch.float8_e5m2,
+ "onload_device": "cpu",
+ "preparing_dtype": torch.float8_e5m2,
+ "preparing_device": "cuda",
+ "computation_dtype": torch.bfloat16,
+ "computation_device": "cuda",
+}
+pipe = LTX2AudioVideoPipeline.from_pretrained(
+ torch_dtype=torch.bfloat16,
+ device="cuda",
+ model_configs=[
+ ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config),
+ ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors", **vram_config),
+ ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-spatial-upscaler-x2-1.0.safetensors", **vram_config),
+ ],
+ tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"),
+ stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-distilled-lora-384.safetensors"),
+ vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
+)
+
+prompt = "A girl is very happy, she is speaking: “I enjoy working with Diffsynth-Studio, it's a perfect framework.”"
+negative_prompt = (
+ "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
+ "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
+ "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, "
+ "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of "
+ "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent "
+ "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny "
+ "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, "
+ "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, "
+ "off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward "
+ "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, "
+ "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
+)
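+# The target resolution is twice the 512x768 base to match the x2 spatial upscaler
+# used by the two-stage pipeline.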
+height, width, num_frames = 512 * 2, 768 * 2, 121
+video, audio = pipe(
+ prompt=prompt,
+ negative_prompt=negative_prompt,
+ seed=43,
+ height=height,
+ width=width,
+ num_frames=num_frames,
+ tiled=True,
+ use_two_stage_pipeline=True,
+)
+write_video_audio_ltx2(
+ video=video,
+ audio=audio,
+ output_path='ltx2_twostage.mp4',
+ fps=24,
+ audio_sample_rate=24000,
+)