Support LTX-2 distilled pipeline

This commit is contained in:
mi804
2026-01-30 17:40:30 +08:00
parent 4f23caa55f
commit 9f07d65ebb
5 changed files with 82 additions and 27 deletions

View File

@@ -129,7 +129,7 @@ class FlowMatchScheduler():
if special_case == "stage2":
sigmas = torch.Tensor([0.909375, 0.725, 0.421875])
elif special_case == "ditilled_stage1":
sigmas = torch.Tensor([0.95, 0.8, 0.5, 0.2])
sigmas = torch.Tensor([1.0, 0.99375, 0.9875, 0.98125, 0.975, 0.909375, 0.725, 0.421875])
else:
dynamic_shift_len = dynamic_shift_len or 4096
sigma_shift = FlowMatchScheduler._calculate_shift_qwen_image(

View File

@@ -61,7 +61,6 @@ class LTX2AudioVideoPipeline(BasePipeline):
LTX2AudioVideoUnit_InputVideoEmbedder(),
]
self.model_fn = model_fn_ltx2
# self.lora_loader = LTX2LoRALoader
@staticmethod
def from_pretrained(
@@ -89,12 +88,12 @@ class LTX2AudioVideoPipeline(BasePipeline):
pipe.video_vae_decoder = model_pool.fetch_model("ltx2_video_vae_decoder")
pipe.audio_vae_decoder = model_pool.fetch_model("ltx2_audio_vae_decoder")
pipe.audio_vocoder = model_pool.fetch_model("ltx2_audio_vocoder")
pipe.upsampler = model_pool.fetch_model("ltx2_latent_upsampler")
# Stage 2
if stage2_lora_config is not None:
stage2_lora_config.download_if_necessary()
pipe.stage2_lora_path = stage2_lora_config.path
pipe.upsampler = model_pool.fetch_model("ltx2_latent_upsampler")
# Optional, currently not used
# pipe.audio_vae_encoder = model_pool.fetch_model("ltx2_audio_vae_encoder")
@@ -122,8 +121,8 @@ class LTX2AudioVideoPipeline(BasePipeline):
noise_pred = noise_pred_posi
return noise_pred
def stage2_denoise(self, cfg_scale, inputs_shared, inputs_posi, inputs_nega, use_two_stage_pipeline=True, progress_bar_cmd=tqdm):
if use_two_stage_pipeline:
def stage2_denoise(self, inputs_shared, inputs_posi, inputs_nega, progress_bar_cmd=tqdm):
if inputs_shared["use_two_stage_pipeline"]:
latent = self.video_vae_encoder.per_channel_statistics.un_normalize(inputs_shared["video_latents"])
self.load_models_to_device(self.in_iteration_models + ('upsampler',))
latent = self.upsampler(latent)
@@ -135,12 +134,13 @@ class LTX2AudioVideoPipeline(BasePipeline):
inputs_shared["audio_latents"] = self.audio_patchifier.patchify(inputs_shared["audio_latents"])
inputs_shared["audio_latents"] = self.scheduler.sigmas[0] * inputs_shared["audio_noise"] + (1 - self.scheduler.sigmas[0]) * inputs_shared["audio_latents"]
self.load_models_to_device(self.in_iteration_models)
self.load_lora(self.dit, self.stage2_lora_path, alpha=0.8)
if not inputs_shared["use_distilled_pipeline"]:
self.load_lora(self.dit, self.stage2_lora_path, alpha=0.8)
models = {name: getattr(self, name) for name in self.in_iteration_models}
for progress_id, timestep in enumerate(progress_bar_cmd(self.scheduler.timesteps)):
timestep = timestep.unsqueeze(0).to(dtype=self.torch_dtype, device=self.device)
noise_pred_video, noise_pred_audio = self.cfg_guided_model_fn(
self.model_fn, cfg_scale, inputs_shared, inputs_posi, inputs_nega,
self.model_fn, 1.0, inputs_shared, inputs_posi, inputs_nega,
**models, timestep=timestep, progress_id=progress_id
)
inputs_shared["video_latents"] = self.step(self.scheduler, inputs_shared["video_latents"], progress_id=progress_id,
@@ -185,7 +185,8 @@ class LTX2AudioVideoPipeline(BasePipeline):
progress_bar_cmd=tqdm,
):
# Scheduler
self.scheduler.set_timesteps(num_inference_steps, denoising_strength=denoising_strength)
self.scheduler.set_timesteps(num_inference_steps, denoising_strength=denoising_strength,
special_case="ditilled_stage1" if use_distilled_pipeline else None)
# self.load_lora(self.dit, self.stage2_lora_path)
# Inputs
inputs_posi = {
@@ -223,7 +224,7 @@ class LTX2AudioVideoPipeline(BasePipeline):
inputs_shared["audio_latents"] = self.audio_patchifier.unpatchify(inputs_shared["audio_latents"], inputs_shared["audio_latent_shape"])
# Denoise Stage 2
inputs_shared = self.stage2_denoise(cfg_scale, inputs_shared, inputs_posi, inputs_nega, use_two_stage_pipeline, progress_bar_cmd)
inputs_shared = self.stage2_denoise(inputs_shared, inputs_posi, inputs_nega, progress_bar_cmd)
# Decode
self.load_models_to_device(['video_vae_decoder'])
@@ -241,11 +242,17 @@ class LTX2AudioVideoUnit_PipelineChecker(PipelineUnit):
super().__init__(take_over=True)
def process(self, pipe: LTX2AudioVideoPipeline, inputs_shared, inputs_posi, inputs_nega):
if inputs_shared.get("use_distilled_pipeline", False):
inputs_shared["use_two_stage_pipeline"] = True
inputs_shared["cfg_scale"] = 1.0
print(f"Distilled pipeline requested, setting use_two_stage_pipeline to True, disable CFG by setting cfg_scale to 1.0.")
if inputs_shared.get("use_two_stage_pipeline", False):
if not (hasattr(pipe, "stage2_lora_path") and pipe.stage2_lora_path is not None):
raise ValueError("Two-stage pipeline requested, but stage2_lora_path is not set in the pipeline.")
# The distilled pipeline is also two-stage, but it does not need the stage-2 LoRA
if not inputs_shared.get("use_distilled_pipeline", False):
if not (hasattr(pipe, "stage2_lora_path") and pipe.stage2_lora_path is not None):
raise ValueError("Two-stage pipeline requested, but stage2_lora_path is not set in the pipeline.")
if not (hasattr(pipe, "upsampler") and pipe.upsampler is not None):
raise ValueError("Two-stage pipeline requested, but upsampler model is not loaded in the pipeline.")
raise ValueError("Two-stage pipeline requested, but upsampler model is not loaded in the pipeline.")
return inputs_shared, inputs_posi, inputs_nega

View File

@@ -0,0 +1,57 @@
# Example: text-to-audio-video generation with the LTX-2 *distilled* pipeline.
# Passing use_distilled_pipeline=True makes the pipeline force the two-stage
# path (latent upsampling) and disable CFG (cfg_scale=1.0), using the distilled
# sigma schedule instead of the stage-2 LoRA.
import torch
from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig
from diffsynth.utils.data.media_io import write_video_audio_ltx2
# VRAM management config shared by all models: keep weights in bfloat16 on the
# CPU and move them to the GPU only while a model is actually computing.
vram_config = {
    "offload_dtype": torch.bfloat16,
    "offload_device": "cpu",
    "onload_dtype": torch.bfloat16,
    "onload_device": "cuda",
    "preparing_dtype": torch.bfloat16,
    "preparing_device": "cuda",
    "computation_dtype": torch.bfloat16,
    "computation_device": "cuda",
}
# Load the Gemma-3 text encoder, the distilled 19B LTX-2 DiT checkpoint, and
# the x2 spatial latent upsampler. Note: no stage2_lora_config here — the
# distilled checkpoint replaces the stage-2 LoRA used by the regular
# two-stage pipeline.
pipe = LTX2AudioVideoPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config),
        ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-distilled.safetensors", **vram_config),
        ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-spatial-upscaler-x2-1.0.safetensors", **vram_config),
    ],
    tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"),
)
prompt = "A girl is speaking: “I enjoy working with Diffsynth-Studio, it's a great tool.”"
# Negative prompt listing common visual/audio artifacts to steer away from.
# NOTE(review): with the distilled pipeline CFG is disabled (cfg_scale forced
# to 1.0), so this likely has no effect — kept for parity with the other
# example scripts; confirm against cfg_guided_model_fn.
negative_prompt = (
    "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
    "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
    "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, "
    "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of "
    "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent "
    "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny "
    "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, "
    "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, "
    "off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward "
    "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, "
    "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
)
# Final output size: 2x the 512x768 base resolution, matching the two-stage
# example (the x2 latent upsampler produces the full-resolution latents).
height, width, num_frames = 512 * 2, 768 * 2, 121
video, audio = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    seed=43,
    height=height,
    width=width,
    num_frames=num_frames,
    tiled=True,  # tiled VAE decoding to bound peak VRAM at this resolution
    use_distilled_pipeline=True,  # distilled schedule + two-stage, CFG off
)
# Mux the generated frames and waveform into a single mp4 (24 fps video,
# 24 kHz audio).
write_video_audio_ltx2(
    video=video,
    audio=audio,
    output_path='ltx2_distilled.mp4',
    fps=24,
    audio_sample_rate=24000,
)

View File

@@ -21,17 +21,13 @@ pipe = LTX2AudioVideoPipeline.from_pretrained(
],
tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"),
)
prompt = """
INT. OVEN DAY. Static camera from inside the oven, looking outward through the slightly fogged glass door. Warm golden light glows around freshly baked cookies. The bakers face fills the frame, eyes wide with focus, his breath fogging the glass as he leans in. Subtle reflections move across the glass as steam rises.
Baker (whispering dramatically): “Today… I achieve perfection.”
He leans even closer, nose nearly touching the glass.
"""
prompt = "A girl is speaking: “I enjoy working with Diffsynth-Studio, it's a great tool.”"
negative_prompt = "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
height, width, num_frames = 512, 768, 121
video, audio = pipe(
prompt=prompt,
negative_prompt=negative_prompt,
seed=10,
seed=43,
height=height,
width=width,
num_frames=num_frames,
@@ -40,7 +36,7 @@ video, audio = pipe(
write_video_audio_ltx2(
video=video,
audio=audio,
output_path='ltx2_onestage_oven.mp4',
output_path='ltx2_onestage.mp4',
fps=24,
audio_sample_rate=24000,
)

View File

@@ -24,11 +24,7 @@ pipe = LTX2AudioVideoPipeline.from_pretrained(
stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-distilled-lora-384.safetensors"),
)
prompt = """
INT. OVEN DAY. Static camera from inside the oven, looking outward through the slightly fogged glass door. Warm golden light glows around freshly baked cookies. The bakers face fills the frame, eyes wide with focus, his breath fogging the glass as he leans in. Subtle reflections move across the glass as steam rises.
Baker (whispering dramatically): “Today… I achieve perfection.”
He leans even closer, nose nearly touching the glass.
"""
prompt = "A girl is speaking: “I enjoy working with Diffsynth-Studio, it's a great tool.”"
negative_prompt = (
"blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
"grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
@@ -46,18 +42,17 @@ height, width, num_frames = 512 * 2, 768 * 2, 121
video, audio = pipe(
prompt=prompt,
negative_prompt=negative_prompt,
seed=0,
seed=43,
height=height,
width=width,
num_frames=num_frames,
tiled=True,
use_two_stage_pipeline=True,
num_inference_steps=40,
)
write_video_audio_ltx2(
video=video,
audio=audio,
output_path='ltx2_twostage_oven.mp4',
output_path='ltx2_twostage.mp4',
fps=24,
audio_sample_rate=24000,
)