From f48662e8637f29147e593e2ff6b6f1d31a1b1b87 Mon Sep 17 00:00:00 2001 From: mi804 <1576993271@qq.com> Date: Thu, 26 Feb 2026 11:10:00 +0800 Subject: [PATCH] update docs --- README.md | 34 +++++++++++- README_zh.md | 34 +++++++++++- diffsynth/configs/model_configs.py | 16 +++++- docs/en/Model_Details/LTX-2.md | 52 ++++++++++++++++-- docs/zh/Model_Details/LTX-2.md | 54 +++++++++++++++++-- .../model_inference/LTX-2-T2AV-OneStage.py | 19 +++++++ .../model_inference/LTX-2-T2AV-TwoStage.py | 22 +++++++- .../LTX-2-T2AV-OneStage.py | 20 +++++++ .../LTX-2-T2AV-TwoStage.py | 22 ++++++++ 9 files changed, 258 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index e055ae7..5fe2017 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,9 @@ We believe that a well-developed open-source code framework can lower the thresh > DiffSynth-Studio has undergone major version updates, and some old features are no longer maintained. If you need to use old features, please switch to the [last historical version](https://github.com/modelscope/DiffSynth-Studio/tree/afd101f3452c9ecae0c87b79adfa2e22d65ffdc3) before the major version update. > Currently, the development personnel of this project are limited, with most of the work handled by [Artiprocher](https://github.com/Artiprocher). Therefore, the progress of new feature development will be relatively slow, and the speed of responding to and resolving issues is limited. We apologize for this and ask developers to understand. -- **February 10, 2026** Added inference and training support for the LTX-2 audio-video generation model. See the documentation for details. +- **February 26, 2026** Added full and lora training support for the LTX-2 audio-video generation model. See the [documentation](/docs/en/Model_Details/LTX-2.md) for details. + +- **February 10, 2026** Added inference support for the LTX-2 audio-video generation model. See the [documentation](/docs/en/Model_Details/LTX-2.md) for details. 
Support for model training will be implemented in the future. - **February 2, 2026** The first document of the Research Tutorial series is now available, guiding you through training a small 0.1B text-to-image model from scratch. For details, see the [documentation](/docs/en/Research_Tutorial/train_from_scratch.md) and [model](https://modelscope.cn/models/DiffSynth-Studio/AAAMyModel). We hope DiffSynth-Studio can evolve into a more powerful training framework for Diffusion models. @@ -557,12 +559,26 @@ vram_config = { "computation_dtype": torch.bfloat16, "computation_device": "cuda", } +""" +Official model repo: https://www.modelscope.cn/models/Lightricks/LTX-2 +Repackaged model repo: https://www.modelscope.cn/models/DiffSynth-Studio/LTX-2-Repackage +For base models of LTX-2, official checkpoint (with model config ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")) +and repackaged checkpoints (with model config ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="*.safetensors")) are both supported. +We have repackaged the official checkpoints in DiffSynth-Studio/LTX-2-Repackage repo to support separate loading of different submodules, +and avoid redundant memory usage when users only want to use part of the model. 
+""" +# use the repackaged modelconfig from "DiffSynth-Studio/LTX-2-Repackage" to avoid redundant model loading pipe = LTX2AudioVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config), - ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors", **vram_config), + ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="transformer.safetensors", **vram_config), + ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="text_encoder_post_modules.safetensors", **vram_config), + ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="video_vae_decoder.safetensors", **vram_config), + ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="audio_vae_decoder.safetensors", **vram_config), + ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="audio_vocoder.safetensors", **vram_config), + ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="video_vae_encoder.safetensors", **vram_config), ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-spatial-upscaler-x2-1.0.safetensors", **vram_config), ], tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"), @@ -570,6 +586,20 @@ pipe = LTX2AudioVideoPipeline.from_pretrained( vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, ) +# use the following modelconfig if you want to initialize model from offical checkpoints from "Lightricks/LTX-2" +# pipe = LTX2AudioVideoPipeline.from_pretrained( +# torch_dtype=torch.bfloat16, +# device="cuda", +# model_configs=[ +# ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config), +# ModelConfig(model_id="Lightricks/LTX-2", 
origin_file_pattern="ltx-2-19b-dev.safetensors", **vram_config), +# ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-spatial-upscaler-x2-1.0.safetensors", **vram_config), +# ], +# tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"), +# stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-distilled-lora-384.safetensors"), +# vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, +# ) + prompt = "A girl is very happy, she is speaking: \"I enjoy working with Diffsynth-Studio, it's a perfect framework.\"" negative_prompt = ( "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, " diff --git a/README_zh.md b/README_zh.md index 519ebe6..74843db 100644 --- a/README_zh.md +++ b/README_zh.md @@ -32,7 +32,9 @@ DiffSynth 目前包括两个开源项目: > DiffSynth-Studio 经历了大版本更新,部分旧功能已停止维护,如需使用旧版功能,请切换到大版本更新前的[最后一个历史版本](https://github.com/modelscope/DiffSynth-Studio/tree/afd101f3452c9ecae0c87b79adfa2e22d65ffdc3)。 > 目前本项目的开发人员有限,大部分工作由 [Artiprocher](https://github.com/Artiprocher) 负责,因此新功能的开发进展会比较缓慢,issue 的回复和解决速度有限,我们对此感到非常抱歉,请各位开发者理解。 -- **2026年2月10日** 新增对[LTX-2](https://www.modelscope.cn/models/Lightricks/LTX-2)音视频生成模型的推理和训练支持,详见[文档](docs/zh/Model_Details/LTX-2.md)。 +- **2026年2月26日** 新增对[LTX-2](https://www.modelscope.cn/models/Lightricks/LTX-2)音视频生成模型全量微调与LoRA训练支持,详见[文档](docs/zh/Model_Details/LTX-2.md)。 + +- **2026年2月10日** 新增对[LTX-2](https://www.modelscope.cn/models/Lightricks/LTX-2)音视频生成模型的推理支持,详见[文档](docs/zh/Model_Details/LTX-2.md),后续将推进模型训练的支持。 - **2026年2月2日** Research Tutorial 的第一篇文档上线,带你从零开始训练一个 0.1B 的小型文生图模型,详见[文档](/docs/zh/Research_Tutorial/train_from_scratch.md)、[模型](https://modelscope.cn/models/DiffSynth-Studio/AAAMyModel),我们希望 DiffSynth-Studio 能够成为一个更强大的 Diffusion 模型训练框架。 @@ -557,12 +559,26 @@ vram_config = { "computation_dtype": torch.bfloat16, "computation_device": "cuda", } +""" +Offical model repo: 
https://www.modelscope.cn/models/Lightricks/LTX-2 +Repackaged model repo: https://www.modelscope.cn/models/DiffSynth-Studio/LTX-2-Repackage +For base models of LTX-2, official checkpoint (with model config ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")) +and repackaged checkpoints (with model config ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="*.safetensors")) are both supported. +We have repackaged the official checkpoints in DiffSynth-Studio/LTX-2-Repackage repo to support separate loading of different submodules, +and avoid redundant memory usage when users only want to use part of the model. +""" +# use the repackaged modelconfig from "DiffSynth-Studio/LTX-2-Repackage" to avoid redundant model loading pipe = LTX2AudioVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config), - ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors", **vram_config), + ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="transformer.safetensors", **vram_config), + ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="text_encoder_post_modules.safetensors", **vram_config), + ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="video_vae_decoder.safetensors", **vram_config), + ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="audio_vae_decoder.safetensors", **vram_config), + ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="audio_vocoder.safetensors", **vram_config), + ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="video_vae_encoder.safetensors", **vram_config), ModelConfig(model_id="Lightricks/LTX-2", 
origin_file_pattern="ltx-2-spatial-upscaler-x2-1.0.safetensors", **vram_config), ], tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"), @@ -570,6 +586,20 @@ pipe = LTX2AudioVideoPipeline.from_pretrained( vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, ) +# use the following modelconfig if you want to initialize model from offical checkpoints from "Lightricks/LTX-2" +# pipe = LTX2AudioVideoPipeline.from_pretrained( +# torch_dtype=torch.bfloat16, +# device="cuda", +# model_configs=[ +# ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config), +# ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors", **vram_config), +# ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-spatial-upscaler-x2-1.0.safetensors", **vram_config), +# ], +# tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"), +# stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-distilled-lora-384.safetensors"), +# vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, +# ) + prompt = "A girl is very happy, she is speaking: \"I enjoy working with Diffsynth-Studio, it's a perfect framework.\"" negative_prompt = ( "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, " diff --git a/diffsynth/configs/model_configs.py b/diffsynth/configs/model_configs.py index dbad638..fbca133 100644 --- a/diffsynth/configs/model_configs.py +++ b/diffsynth/configs/model_configs.py @@ -598,7 +598,14 @@ z_image_series = [ "state_dict_converter": "diffsynth.utils.state_dict_converters.z_image_text_encoder.ZImageTextEncoderStateDictConverter", }, ] - +""" +Offical model repo: https://www.modelscope.cn/models/Lightricks/LTX-2 +Repackaged model repo: https://www.modelscope.cn/models/DiffSynth-Studio/LTX-2-Repackage +For base models of LTX-2, 
offical checkpoint (with model config ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")) +and repackaged checkpoints (with model config ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="*.safetensors")) are both supported. +We have repackeged the official checkpoints in DiffSynth-Studio/LTX-2-Repackage repo to support separate loading of different submodules, +and avoid redundant memory usage when users only want to use part of the model. +""" ltx2_series = [ { # Example: ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors") @@ -608,6 +615,7 @@ ltx2_series = [ "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_dit.LTXModelStateDictConverter", }, { + # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="transformer.safetensors") "model_hash": "c567aaa37d5ed7454c73aa6024458661", "model_name": "ltx2_dit", "model_class": "diffsynth.models.ltx2_dit.LTXModel", @@ -621,6 +629,7 @@ ltx2_series = [ "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoEncoderStateDictConverter", }, { + # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="video_vae_encoder.safetensors") "model_hash": "7f7e904a53260ec0351b05f32153754b", "model_name": "ltx2_video_vae_encoder", "model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoEncoder", @@ -634,6 +643,7 @@ ltx2_series = [ "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoDecoderStateDictConverter", }, { + # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="video_vae_decoder.safetensors") "model_hash": "dc6029ca2825147872b45e35a2dc3a97", "model_name": "ltx2_video_vae_decoder", "model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoDecoder", @@ -647,6 +657,7 @@ ltx2_series = [ "state_dict_converter": 
"diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioDecoderStateDictConverter", }, { + # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="audio_vae_decoder.safetensors") "model_hash": "7d7823dde8f1ea0b50fb07ac329dd4cb", "model_name": "ltx2_audio_vae_decoder", "model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioDecoder", @@ -660,6 +671,7 @@ ltx2_series = [ "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2VocoderStateDictConverter", }, { + # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="audio_vocoder.safetensors") "model_hash": "f471360f6b24bef702ab73133d9f8bb9", "model_name": "ltx2_audio_vocoder", "model_class": "diffsynth.models.ltx2_audio_vae.LTX2Vocoder", @@ -673,6 +685,7 @@ ltx2_series = [ "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioEncoderStateDictConverter", }, { + # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="audio_vae_encoder.safetensors") "model_hash": "29338f3b95e7e312a3460a482e4f4554", "model_name": "ltx2_audio_vae_encoder", "model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioEncoder", @@ -686,6 +699,7 @@ ltx2_series = [ "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_text_encoder.LTX2TextEncoderPostModulesStateDictConverter", }, { + # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="text_encoder_post_modules.safetensors") "model_hash": "981629689c8be92a712ab3c5eb4fc3f6", "model_name": "ltx2_text_encoder_post_modules", "model_class": "diffsynth.models.ltx2_text_encoder.LTX2TextEncoderPostModules", diff --git a/docs/en/Model_Details/LTX-2.md b/docs/en/Model_Details/LTX-2.md index e3f4ca8..68ab351 100644 --- a/docs/en/Model_Details/LTX-2.md +++ b/docs/en/Model_Details/LTX-2.md @@ -33,19 +33,62 @@ vram_config = { "computation_dtype": torch.bfloat16, "computation_device": "cuda", } +""" 
+Official model repo: https://www.modelscope.cn/models/Lightricks/LTX-2 +Repackaged model repo: https://www.modelscope.cn/models/DiffSynth-Studio/LTX-2-Repackage +For base models of LTX-2, official checkpoint (with model config ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")) +and repackaged checkpoints (with model config ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="*.safetensors")) are both supported. +We have repackaged the official checkpoints in DiffSynth-Studio/LTX-2-Repackage repo to support separate loading of different submodules, +and avoid redundant memory usage when users only want to use part of the model. +""" +# use the repackaged modelconfig from "DiffSynth-Studio/LTX-2-Repackage" to avoid redundant model loading pipe = LTX2AudioVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config), - ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors", **vram_config), + ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="transformer.safetensors", **vram_config), + ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="text_encoder_post_modules.safetensors", **vram_config), + ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="video_vae_decoder.safetensors", **vram_config), + ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="audio_vae_decoder.safetensors", **vram_config), + ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="audio_vocoder.safetensors", **vram_config), + ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="video_vae_encoder.safetensors", **vram_config), + ModelConfig(model_id="Lightricks/LTX-2", 
origin_file_pattern="ltx-2-spatial-upscaler-x2-1.0.safetensors", **vram_config), ], tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"), + stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-distilled-lora-384.safetensors"), vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, ) + +# use the following modelconfig if you want to initialize model from offical checkpoints from "Lightricks/LTX-2" +# pipe = LTX2AudioVideoPipeline.from_pretrained( +# torch_dtype=torch.bfloat16, +# device="cuda", +# model_configs=[ +# ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config), +# ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors", **vram_config), +# ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-spatial-upscaler-x2-1.0.safetensors", **vram_config), +# ], +# tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"), +# stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-distilled-lora-384.safetensors"), +# vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, +# ) + prompt = "A girl is very happy, she is speaking: \"I enjoy working with Diffsynth-Studio, it's a perfect framework.\"" -negative_prompt = "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, 
uncanny valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts." -height, width, num_frames = 512, 768, 121 +negative_prompt = ( + "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, " + "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, " + "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, " + "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of " + "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent " + "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny " + "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, " + "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, " + "off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward " + "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, " + "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts." 
+) +height, width, num_frames = 512 * 2, 768 * 2, 121 video, audio = pipe( prompt=prompt, negative_prompt=negative_prompt, @@ -54,11 +97,12 @@ video, audio = pipe( width=width, num_frames=num_frames, tiled=True, + use_two_stage_pipeline=True, ) write_video_audio_ltx2( video=video, audio=audio, - output_path='ltx2_onestage.mp4', + output_path='ltx2_twostage.mp4', fps=24, audio_sample_rate=24000, ) diff --git a/docs/zh/Model_Details/LTX-2.md b/docs/zh/Model_Details/LTX-2.md index 86abbcd..558de9d 100644 --- a/docs/zh/Model_Details/LTX-2.md +++ b/docs/zh/Model_Details/LTX-2.md @@ -33,19 +33,62 @@ vram_config = { "computation_dtype": torch.bfloat16, "computation_device": "cuda", } +""" +Official model repo: https://www.modelscope.cn/models/Lightricks/LTX-2 +Repackaged model repo: https://www.modelscope.cn/models/DiffSynth-Studio/LTX-2-Repackage +For base models of LTX-2, official checkpoint (with model config ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")) +and repackaged checkpoints (with model config ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="*.safetensors")) are both supported. +We have repackaged the official checkpoints in DiffSynth-Studio/LTX-2-Repackage repo to support separate loading of different submodules, +and avoid redundant memory usage when users only want to use part of the model. 
+""" +# use the repackaged modelconfig from "DiffSynth-Studio/LTX-2-Repackage" to avoid redundant model loading pipe = LTX2AudioVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config), - ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors", **vram_config), + ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="transformer.safetensors", **vram_config), + ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="text_encoder_post_modules.safetensors", **vram_config), + ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="video_vae_decoder.safetensors", **vram_config), + ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="audio_vae_decoder.safetensors", **vram_config), + ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="audio_vocoder.safetensors", **vram_config), + ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="video_vae_encoder.safetensors", **vram_config), + ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-spatial-upscaler-x2-1.0.safetensors", **vram_config), ], tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"), + stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-distilled-lora-384.safetensors"), vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, ) -prompt = "A girl is very happy, she is speaking: “I enjoy working with Diffsynth-Studio, it's a perfect framework.”" -negative_prompt = "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, deformed facial features, 
asymmetrical face, missing facial features, extra limbs, disfigured hands, wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts." -height, width, num_frames = 512, 768, 121 + +# use the following modelconfig if you want to initialize model from offical checkpoints from "Lightricks/LTX-2" +# pipe = LTX2AudioVideoPipeline.from_pretrained( +# torch_dtype=torch.bfloat16, +# device="cuda", +# model_configs=[ +# ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config), +# ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors", **vram_config), +# ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-spatial-upscaler-x2-1.0.safetensors", **vram_config), +# ], +# tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"), +# stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-distilled-lora-384.safetensors"), +# vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, +# ) + +prompt = "A girl is very happy, she is speaking: \"I enjoy working with Diffsynth-Studio, it's a perfect framework.\"" +negative_prompt = ( + "blurry, out of focus, overexposed, underexposed, low contrast, 
washed out colors, excessive noise, " + "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, " + "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, " + "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of " + "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent " + "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny " + "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, " + "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, " + "off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward " + "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, " + "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts." 
+) +height, width, num_frames = 512 * 2, 768 * 2, 121 video, audio = pipe( prompt=prompt, negative_prompt=negative_prompt, @@ -54,11 +97,12 @@ video, audio = pipe( width=width, num_frames=num_frames, tiled=True, + use_two_stage_pipeline=True, ) write_video_audio_ltx2( video=video, audio=audio, - output_path='ltx2_onestage.mp4', + output_path='ltx2_twostage.mp4', fps=24, audio_sample_rate=24000, ) diff --git a/examples/ltx2/model_inference/LTX-2-T2AV-OneStage.py b/examples/ltx2/model_inference/LTX-2-T2AV-OneStage.py index 295d73b..9e56209 100644 --- a/examples/ltx2/model_inference/LTX-2-T2AV-OneStage.py +++ b/examples/ltx2/model_inference/LTX-2-T2AV-OneStage.py @@ -12,6 +12,15 @@ vram_config = { "computation_dtype": torch.bfloat16, "computation_device": "cuda", } +""" +Official model repo: https://www.modelscope.cn/models/Lightricks/LTX-2 +Repackaged model repo: https://www.modelscope.cn/models/DiffSynth-Studio/LTX-2-Repackage +For base models of LTX-2, official checkpoint (with model config ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")) +and repackaged checkpoints (with model config ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="*.safetensors")) are both supported. +We have repackaged the official checkpoints in DiffSynth-Studio/LTX-2-Repackage repo to support separate loading of different submodules, +and avoid redundant memory usage when users only want to use part of the model. 
+""" +# use the repackaged modelconfig from "DiffSynth-Studio/LTX-2-Repackage" to avoid redundant model loading pipe = LTX2AudioVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", @@ -25,6 +34,16 @@ pipe = LTX2AudioVideoPipeline.from_pretrained( ], tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"), ) +# use the following modelconfig if you want to initialize model from offical checkpoints from "Lightricks/LTX-2" +# pipe = LTX2AudioVideoPipeline.from_pretrained( +# torch_dtype=torch.bfloat16, +# device="cuda", +# model_configs=[ +# ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config), +# ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors", **vram_config), +# ], +# tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"), +# ) prompt = "A girl is very happy, she is speaking: “I enjoy working with Diffsynth-Studio, it's a perfect framework.”" negative_prompt = "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward pauses, incorrect timing, unnatural 
transitions, inconsistent framing, tilted camera, flat lighting, inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts." height, width, num_frames = 512, 768, 121 diff --git a/examples/ltx2/model_inference/LTX-2-T2AV-TwoStage.py b/examples/ltx2/model_inference/LTX-2-T2AV-TwoStage.py index 9a85a9f..e08ed50 100644 --- a/examples/ltx2/model_inference/LTX-2-T2AV-TwoStage.py +++ b/examples/ltx2/model_inference/LTX-2-T2AV-TwoStage.py @@ -12,6 +12,15 @@ vram_config = { "computation_dtype": torch.bfloat16, "computation_device": "cuda", } +""" +Offical model repo: https://www.modelscope.cn/models/Lightricks/LTX-2 +Repackaged model repo: https://www.modelscope.cn/models/DiffSynth-Studio/LTX-2-Repackage +For base models of LTX-2, offical checkpoint (with model config ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")) +and repackaged checkpoints (with model config ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="*.safetensors")) are both supported. +We have repackeged the official checkpoints in DiffSynth-Studio/LTX-2-Repackage repo to support separate loading of different submodules, +and avoid redundant memory usage when users only want to use part of the model. 
+""" +# use the repackaged modelconfig from "DiffSynth-Studio/LTX-2-Repackage" to avoid redundant model loading pipe = LTX2AudioVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", @@ -28,7 +37,18 @@ pipe = LTX2AudioVideoPipeline.from_pretrained( tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"), stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-distilled-lora-384.safetensors"), ) - +# use the following modelconfig if you want to initialize model from offical checkpoints from "Lightricks/LTX-2" +# pipe = LTX2AudioVideoPipeline.from_pretrained( +# torch_dtype=torch.bfloat16, +# device="cuda", +# model_configs=[ +# ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config), +# ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors", **vram_config), +# ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-spatial-upscaler-x2-1.0.safetensors", **vram_config), +# ], +# tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"), +# stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-distilled-lora-384.safetensors"), +# ) prompt = "A girl is very happy, she is speaking: “I enjoy working with Diffsynth-Studio, it's a perfect framework.”" negative_prompt = ( "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, " diff --git a/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-OneStage.py b/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-OneStage.py index fb08384..c08332f 100644 --- a/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-OneStage.py +++ b/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-OneStage.py @@ -12,6 +12,15 @@ vram_config = { "computation_dtype": torch.bfloat16, "computation_device": "cuda", } +""" +Offical model repo: 
https://www.modelscope.cn/models/Lightricks/LTX-2 +Repackaged model repo: https://www.modelscope.cn/models/DiffSynth-Studio/LTX-2-Repackage +For base models of LTX-2, official checkpoint (with model config ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")) +and repackaged checkpoints (with model config ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="*.safetensors")) are both supported. +We have repackaged the official checkpoints in DiffSynth-Studio/LTX-2-Repackage repo to support separate loading of different submodules, +and avoid redundant memory usage when users only want to use part of the model. +""" +# use the repackaged modelconfig from "DiffSynth-Studio/LTX-2-Repackage" to avoid redundant model loading pipe = LTX2AudioVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", @@ -26,6 +35,17 @@ pipe = LTX2AudioVideoPipeline.from_pretrained( tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"), vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, ) +# use the following modelconfig if you want to initialize model from official checkpoints from "Lightricks/LTX-2" +# pipe = LTX2AudioVideoPipeline.from_pretrained( +# torch_dtype=torch.bfloat16, +# device="cuda", +# model_configs=[ +# ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config), +# ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors", **vram_config), +# ], +# tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"), +# vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, +# ) prompt = "A girl is very happy, she is speaking: “I enjoy working with Diffsynth-Studio, it's a perfect framework.”" negative_prompt = "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, grainy texture, poor 
lighting, flickering, motion blur, distorted proportions, unnatural skin tones, deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts." height, width, num_frames = 512, 768, 121 diff --git a/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-TwoStage.py b/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-TwoStage.py index a10b6ef..98ed966 100644 --- a/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-TwoStage.py +++ b/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-TwoStage.py @@ -12,6 +12,15 @@ vram_config = { "computation_dtype": torch.bfloat16, "computation_device": "cuda", } +""" +Official model repo: https://www.modelscope.cn/models/Lightricks/LTX-2 +Repackaged model repo: https://www.modelscope.cn/models/DiffSynth-Studio/LTX-2-Repackage +For base models of LTX-2, official checkpoint (with model config ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")) +and repackaged checkpoints (with model config ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="*.safetensors")) are both supported. 
+We have repackaged the official checkpoints in DiffSynth-Studio/LTX-2-Repackage repo to support separate loading of different submodules, +and avoid redundant memory usage when users only want to use part of the model. +""" +# use the repackaged modelconfig from "DiffSynth-Studio/LTX-2-Repackage" to avoid redundant model loading pipe = LTX2AudioVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", @@ -29,6 +38,19 @@ pipe = LTX2AudioVideoPipeline.from_pretrained( stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-distilled-lora-384.safetensors"), vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, ) +# use the following modelconfig if you want to initialize model from official checkpoints from "Lightricks/LTX-2" +# pipe = LTX2AudioVideoPipeline.from_pretrained( +# torch_dtype=torch.bfloat16, +# device="cuda", +# model_configs=[ +# ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config), +# ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors", **vram_config), +# ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-spatial-upscaler-x2-1.0.safetensors", **vram_config), +# ], +# tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"), +# stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-distilled-lora-384.safetensors"), +# vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, +# ) prompt = "A girl is very happy, she is speaking: “I enjoy working with Diffsynth-Studio, it's a perfect framework.”" negative_prompt = (