Support LTX-2 gradient_checkpointing

2026-03-18 22:08:13 +00:00 · 2026-02-26 19:19:59 +08:00
parent a87910bc65
commit a18966c300
10 changed files with 36 additions and 33 deletions
--- a/examples/ltx2/model_training/full/LTX-2-T2AV-splited.sh
+++ b/examples/ltx2/model_training/full/LTX-2-T2AV-splited.sh
@@ -6,7 +6,7 @@ accelerate launch examples/ltx2/model_training/train.py \
  --extra_inputs "input_audio" \
  --height 512 \
  --width 768 \
-  --num_frames 49 \
+  --num_frames 121 \
  --dataset_repeat 1 \
  --model_id_with_origin_paths "DiffSynth-Studio/LTX-2-Repackage:text_encoder_post_modules.safetensors,DiffSynth-Studio/LTX-2-Repackage:video_vae_encoder.safetensors,DiffSynth-Studio/LTX-2-Repackage:audio_vae_encoder.safetensors,google/gemma-3-12b-it-qat-q4_0-unquantized:model-*.safetensors" \
  --learning_rate 1e-4 \
@@ -23,7 +23,7 @@ accelerate launch --config_file examples/qwen_image/model_training/full/accelera
  --extra_inputs "input_audio" \
  --height 512 \
  --width 768 \
-  --num_frames 49 \
+  --num_frames 121 \
  --dataset_repeat 100 \
  --model_id_with_origin_paths "DiffSynth-Studio/LTX-2-Repackage:transformer.safetensors" \
  --learning_rate 1e-4 \
--- a/examples/ltx2/model_training/lora/LTX-2-T2AV-noaudio.sh
+++ b/examples/ltx2/model_training/lora/LTX-2-T2AV-noaudio.sh
@@ -24,7 +24,7 @@ accelerate launch examples/ltx2/model_training/train.py \
  --dataset_metadata_path data/example_video_dataset/ltx2_t2av.csv \
  --height 512 \
  --width 768 \
-  --num_frames 49\
+  --num_frames 121\
  --dataset_repeat 1 \
  --model_id_with_origin_paths "DiffSynth-Studio/LTX-2-Repackage:text_encoder_post_modules.safetensors,DiffSynth-Studio/LTX-2-Repackage:video_vae_encoder.safetensors,DiffSynth-Studio/LTX-2-Repackage:audio_vae_encoder.safetensors,google/gemma-3-12b-it-qat-q4_0-unquantized:model-*.safetensors" \
  --learning_rate 1e-4 \
@@ -42,7 +42,7 @@ accelerate launch examples/ltx2/model_training/train.py \
  --dataset_base_path ./models/train/LTX2-T2AV-noaudio_lora-splited-cache \
  --height 512 \
  --width 768 \
-  --num_frames 49\
+  --num_frames 121\
  --dataset_repeat 100 \
  --model_id_with_origin_paths "DiffSynth-Studio/LTX-2-Repackage:transformer.safetensors" \
  --learning_rate 1e-4 \
--- a/examples/ltx2/model_training/lora/LTX-2-T2AV-splited.sh
+++ b/examples/ltx2/model_training/lora/LTX-2-T2AV-splited.sh
@@ -27,7 +27,7 @@ accelerate launch examples/ltx2/model_training/train.py \
  --extra_inputs "input_audio" \
  --height 512 \
  --width 768 \
-  --num_frames 49 \
+  --num_frames 121 \
  --dataset_repeat 1 \
  --model_id_with_origin_paths "DiffSynth-Studio/LTX-2-Repackage:text_encoder_post_modules.safetensors,DiffSynth-Studio/LTX-2-Repackage:video_vae_encoder.safetensors,DiffSynth-Studio/LTX-2-Repackage:audio_vae_encoder.safetensors,google/gemma-3-12b-it-qat-q4_0-unquantized:model-*.safetensors" \
  --learning_rate 1e-4 \
@@ -46,7 +46,7 @@ accelerate launch examples/ltx2/model_training/train.py \
  --extra_inputs "input_audio" \
  --height 512 \
  --width 768 \
-  --num_frames 49 \
+  --num_frames 121 \
  --dataset_repeat 100 \
  --model_id_with_origin_paths "DiffSynth-Studio/LTX-2-Repackage:transformer.safetensors" \
  --learning_rate 1e-4 \
--- a/examples/ltx2/model_training/train.py
+++ b/examples/ltx2/model_training/train.py
@@ -118,10 +118,10 @@ if __name__ == "__main__":
            max_pixels=args.max_pixels,
            height=args.height,
            width=args.width,
-            height_division_factor=16,
-            width_division_factor=16,
+            height_division_factor=32,
+            width_division_factor=32,
            num_frames=args.num_frames,
-            time_division_factor=4,
+            time_division_factor=8,
            time_division_remainder=1,
        ),
        special_operator_map={
--- a/examples/ltx2/model_training/validate_full/LTX-2-T2AV.py
+++ b/examples/ltx2/model_training/validate_full/LTX-2-T2AV.py
@@ -27,7 +27,7 @@ pipe = LTX2AudioVideoPipeline.from_pretrained(
 )
 prompt = "A beautiful sunset over the ocean."
 negative_prompt = "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
-height, width, num_frames = 512, 768, 49
+height, width, num_frames = 512, 768, 121
 video, audio = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
--- a/examples/ltx2/model_training/validate_lora/LTX-2-T2AV.py
+++ b/examples/ltx2/model_training/validate_lora/LTX-2-T2AV.py
@@ -28,7 +28,7 @@ pipe = LTX2AudioVideoPipeline.from_pretrained(
 pipe.load_lora(pipe.dit, "models/train/LTX2-T2AV_lora/epoch-4.safetensors")
 prompt = "A beautiful sunset over the ocean."
 negative_prompt = "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
-height, width, num_frames = 512, 768, 49
+height, width, num_frames = 512, 768, 121
 video, audio = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
--- a/examples/ltx2/model_training/validate_lora/LTX-2-T2AV_noaudio.py
+++ b/examples/ltx2/model_training/validate_lora/LTX-2-T2AV_noaudio.py
@@ -28,7 +28,7 @@ pipe = LTX2AudioVideoPipeline.from_pretrained(
 pipe.load_lora(pipe.dit, "models/train/LTX2-T2AV-noaudio_lora/epoch-4.safetensors")
 prompt = "A beautiful sunset over the ocean."
 negative_prompt = "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
-height, width, num_frames = 512, 768, 49
+height, width, num_frames = 512, 768, 121
 video, audio = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,