* support mova inference

* mova media_io

* add unified audio_video api & fix bug of mono audio input for ltx

* support mova train

* mova docs

* fix bug
This commit is contained in:
Hong Zhang
2026-03-13 13:06:07 +08:00
committed by GitHub
parent 4741542523
commit 681df93a85
37 changed files with 3102 additions and 181 deletions

View File

@@ -0,0 +1,53 @@
import torch
from PIL import Image
from diffsynth.pipelines.mova_audio_video import ModelConfig, MovaAudioVideoPipeline
from diffsynth.utils.data.audio_video import write_video_audio

# MOVA-360p image-to-video example: generate a video together with a
# synchronized audio track from a single input image and a text prompt.

# VRAM management policy shared by every model component: keep weights in
# bfloat16 everywhere, park offloaded weights on the CPU, and run
# preparation/computation on the GPU.
vram_config = {
    "offload_dtype": torch.bfloat16,
    "offload_device": "cpu",
    "onload_dtype": torch.bfloat16,
    "onload_device": "cuda",
    "preparing_dtype": torch.bfloat16,
    "preparing_device": "cuda",
    "computation_dtype": torch.bfloat16,
    "computation_device": "cuda",
}

pipe = MovaAudioVideoPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="openmoss/MOVA-360p", origin_file_pattern="video_dit/diffusion_pytorch_model-*.safetensors", **vram_config),
        ModelConfig(model_id="openmoss/MOVA-360p", origin_file_pattern="video_dit_2/diffusion_pytorch_model-*.safetensors", **vram_config),
        ModelConfig(model_id="openmoss/MOVA-360p", origin_file_pattern="audio_dit/diffusion_pytorch_model.safetensors", **vram_config),
        ModelConfig(model_id="openmoss/MOVA-360p", origin_file_pattern="dual_tower_bridge/diffusion_pytorch_model.safetensors", **vram_config),
        # NOTE(review): the audio VAE is pulled from the 720p repo while the
        # DiT components come from the 360p repo — presumably the audio VAE is
        # shared between variants; confirm against the model card.
        ModelConfig(model_id="openmoss/MOVA-720p", origin_file_pattern="audio_vae/diffusion_pytorch_model.safetensors", **vram_config),
        ModelConfig(model_id="DiffSynth-Studio/Wan-Series-Converted-Safetensors", origin_file_pattern="Wan2.1_VAE.safetensors", **vram_config),
        ModelConfig(model_id="DiffSynth-Studio/Wan-Series-Converted-Safetensors", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.safetensors", **vram_config),
    ],
    tokenizer_config=ModelConfig(model_id="openmoss/MOVA-720p", origin_file_pattern="tokenizer/"),
    # Allow the pipeline to use total GPU memory minus a 2 GiB safety margin
    # (mem_get_info returns (free, total) in bytes; index 1 is the total).
    vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 2,
)

negative_prompt = (
    "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,"
    "整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指"
)
prompt = "Two cute orange cats, wearing boxing gloves, stand on a boxing ring and fight each other."

# Output geometry: 352x640 at 24 fps for 121 frames (~5 seconds).
height, width, num_frames = 352, 640, 121
frame_rate = 24

# Conditioning image, resized to match the requested output resolution.
input_image = Image.open("data/examples/wan/cat_fightning.jpg").resize((width, height)).convert("RGB")

# Image-to-video
video, audio = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=num_frames,
    input_image=input_image,
    num_inference_steps=50,
    seed=0,
    tiled=True,
    frame_rate=frame_rate,
)

# Mux video and audio into a single file. Use `frame_rate` (not a hard-coded
# constant) so the container fps always matches the generation frame rate.
write_video_audio(video, audio, "MOVA-360p-cat.mp4", fps=frame_rate, audio_sample_rate=pipe.audio_vae.sample_rate)

View File

@@ -0,0 +1,53 @@
import torch
from PIL import Image
from diffsynth.utils.data.audio_video import write_video_audio
from diffsynth.pipelines.mova_audio_video import MovaAudioVideoPipeline, ModelConfig

# MOVA-720p image-to-video example: generate a video together with a
# synchronized audio track from a single input image and a text prompt.

# VRAM management policy shared by every model component: keep weights in
# bfloat16 everywhere, park offloaded weights on the CPU, and run
# preparation/computation on the GPU.
vram_config = {
    "offload_dtype": torch.bfloat16,
    "offload_device": "cpu",
    "onload_dtype": torch.bfloat16,
    "onload_device": "cuda",
    "preparing_dtype": torch.bfloat16,
    "preparing_device": "cuda",
    "computation_dtype": torch.bfloat16,
    "computation_device": "cuda",
}

pipe = MovaAudioVideoPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="openmoss/MOVA-720p", origin_file_pattern="video_dit/diffusion_pytorch_model-*.safetensors", **vram_config),
        ModelConfig(model_id="openmoss/MOVA-720p", origin_file_pattern="video_dit_2/diffusion_pytorch_model-*.safetensors", **vram_config),
        ModelConfig(model_id="openmoss/MOVA-720p", origin_file_pattern="audio_dit/diffusion_pytorch_model.safetensors", **vram_config),
        ModelConfig(model_id="openmoss/MOVA-720p", origin_file_pattern="dual_tower_bridge/diffusion_pytorch_model.safetensors", **vram_config),
        ModelConfig(model_id="openmoss/MOVA-720p", origin_file_pattern="audio_vae/diffusion_pytorch_model.safetensors", **vram_config),
        ModelConfig(model_id="DiffSynth-Studio/Wan-Series-Converted-Safetensors", origin_file_pattern="Wan2.1_VAE.safetensors", **vram_config),
        ModelConfig(model_id="DiffSynth-Studio/Wan-Series-Converted-Safetensors", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.safetensors", **vram_config),
    ],
    tokenizer_config=ModelConfig(model_id="openmoss/MOVA-720p", origin_file_pattern="tokenizer/"),
    # Allow the pipeline to use total GPU memory minus a 2 GiB safety margin
    # (mem_get_info returns (free, total) in bytes; index 1 is the total).
    vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 2,
)

negative_prompt = (
    "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,"
    "整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指"
)
prompt = "Two cute orange cats, wearing boxing gloves, stand on a boxing ring and fight each other."

# Output geometry: 720x1280 at 24 fps for 121 frames (~5 seconds).
height, width, num_frames = 720, 1280, 121
frame_rate = 24

# Conditioning image, resized to match the requested output resolution.
input_image = Image.open("data/examples/wan/cat_fightning.jpg").resize((width, height)).convert("RGB")

# Image-to-video
video, audio = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=num_frames,
    input_image=input_image,
    num_inference_steps=50,
    seed=0,
    tiled=True,
    frame_rate=frame_rate,
)

# Mux video and audio into a single file. Use `frame_rate` (not a hard-coded
# constant) so the container fps always matches the generation frame rate.
write_video_audio(video, audio, "MOVA-720p-cat.mp4", fps=frame_rate, audio_sample_rate=pipe.audio_vae.sample_rate)