Merge pull request #1343 from mi804/ltx2.3_multiref

Ltx2.3 multiref
2026-03-18 22:08:13 +00:00 · 2026-03-10 17:31:05 +08:00
parent f3ebd6f714
commit c927062546
4 changed files with 113 additions and 36 deletions
--- a/diffsynth/diffusion/base_pipeline.py
+++ b/diffsynth/diffusion/base_pipeline.py
@@ -147,6 +147,19 @@ class BasePipeline(torch.nn.Module):
        video = [self.vae_output_to_image(image, pattern="H W C", min_value=min_value, max_value=max_value) for image in vae_output]
        return video

+    def output_audio_format_check(self, audio_output):
+        """Normalize a decoded audio tensor to the standard stereo layout.
+
+        Standard output format: ``[2, T]``, dtype ``torch.float32``.
+
+        Args:
+            audio_output: Tensor shaped ``[C, T]`` or ``[1, C, T]`` where the
+                channel count ``C`` is 1 (mono) or 2 (stereo).
+
+        Returns:
+            A ``[2, T]`` float32 tensor. Mono input is duplicated onto both
+            channels; stereo input is passed through.
+
+        Raises:
+            ValueError: If the input is not one of the accepted layouts
+                (e.g. a batch larger than 1, a 1-D tensor, or more than
+                two channels).
+        """
+        original_shape = tuple(audio_output.shape)
+        # Remove the batch dim only when it is a singleton. A plain squeeze(0)
+        # is a no-op for larger batches and would let a 3-D tensor slip through
+        # the channel check below.
+        if audio_output.ndim == 3 and audio_output.shape[0] == 1:
+            audio_output = audio_output.squeeze(0)
+        if audio_output.ndim != 2 or audio_output.shape[0] not in (1, 2):
+            raise ValueError(
+                "The output audio should be [C, T] or [1, C, T] with C in (1, 2), "
+                f"but got shape {original_shape}."
+            )
+        # Transform mono to stereo by duplicating the single channel.
+        if audio_output.shape[0] == 1:
+            audio_output = audio_output.repeat(2, 1)
+        return audio_output.float()

    def load_models_to_device(self, model_names):
        if self.vram_management_enabled: