Files
Hong Zhang 681df93a85 Mova (#1337)
* support mova inference

* mova media_io

* add unified audio_video api & fix bug of mono audio input for ltx

* support mova train

* mova docs

* fix bug
2026-03-13 13:06:07 +08:00

54 lines
2.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import torch
from PIL import Image
from diffsynth.pipelines.mova_audio_video import ModelConfig, MovaAudioVideoPipeline
from diffsynth.utils.data.audio_video import write_video_audio
from diffsynth.utils.data import VideoData
# VRAM management shared by every model component: weights are parked on the
# CPU when idle and moved to the GPU for preparation/compute, bf16 throughout.
_BF16 = torch.bfloat16
vram_config = {
    "offload_dtype": _BF16,
    "offload_device": "cpu",
}
for _stage in ("onload", "preparing", "computation"):
    vram_config[f"{_stage}_dtype"] = _BF16
    vram_config[f"{_stage}_device"] = "cuda"
# Assemble the MOVA audio+video pipeline from its component checkpoints.
# The first entry is a local fine-tuned checkpoint (path suggests a
# high-noise-stage DiT from training — confirm against the training script);
# the remaining weights are fetched from hub repositories. All components
# share the same VRAM offload configuration.
_hub_weights = [
    ("openmoss/MOVA-360p", "video_dit_2/diffusion_pytorch_model-*.safetensors"),
    ("openmoss/MOVA-360p", "audio_dit/diffusion_pytorch_model.safetensors"),
    ("openmoss/MOVA-360p", "dual_tower_bridge/diffusion_pytorch_model.safetensors"),
    ("openmoss/MOVA-720p", "audio_vae/diffusion_pytorch_model.safetensors"),
    ("DiffSynth-Studio/Wan-Series-Converted-Safetensors", "Wan2.1_VAE.safetensors"),
    ("DiffSynth-Studio/Wan-Series-Converted-Safetensors", "models_t5_umt5-xxl-enc-bf16.safetensors"),
]
pipe = MovaAudioVideoPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=(
        [ModelConfig(path="./models/train/MOVA-360p-I2AV_high_noise_full/epoch-4.safetensors", **vram_config)]
        + [
            ModelConfig(model_id=repo, origin_file_pattern=pattern, **vram_config)
            for repo, pattern in _hub_weights
        ]
    ),
    tokenizer_config=ModelConfig(model_id="openmoss/MOVA-720p", origin_file_pattern="tokenizer/"),
)
# Negative prompt: the standard Wan-series Chinese quality/artifact exclusion
# list. The original second fragment had its fullwidth-comma delimiters
# stripped (likely by copy/escape of "ambiguous" Unicode), fusing the terms
# together; delimiters are restored here so each term is a separate token.
negative_prompt = (
    "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,"
    "整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指"
)
# Positive prompt and output geometry.
prompt = "A beautiful sunset over the ocean."
height, width, num_frames = 352, 640, 121  # 360p-class resolution; ~5 s at 24 fps
frame_rate = 24
# First frame of the example video, resized to the target geometry, used as
# the conditioning image for image-to-video generation.
input_image = VideoData("data/example_video_dataset/ltx2/video.mp4", height=height, width=width)[0]
# Image-to-video + audio generation. The pipeline returns the video frames
# and the synthesized audio track as a pair.
generation_kwargs = dict(
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=num_frames,
    input_image=input_image,
    num_inference_steps=50,
    seed=0,            # fixed seed for reproducibility
    tiled=True,        # tiled VAE decoding to reduce peak VRAM
    frame_rate=frame_rate,
)
video, audio = pipe(**generation_kwargs)
# Mux the generated frames and audio into a single MP4. Use frame_rate
# (rather than a hard-coded 24) so the container fps always matches the
# frame rate the pipeline generated at.
write_video_audio(video, audio, "MOVA-360p.mp4", fps=frame_rate, audio_sample_rate=pipe.audio_vae.sample_rate)