# Mirror of https://github.com/modelscope/DiffSynth-Studio.git
# Synced: 2026-03-18 22:08:13 +00:00
# Commit summary: support MOVA inference; MOVA media_io; add unified
# audio/video API and fix a mono-audio-input bug for LTX; support MOVA
# training; MOVA docs; bug fixes.
# File: 54 lines, 2.6 KiB, Python.
import torch
from PIL import Image

from diffsynth.pipelines.mova_audio_video import ModelConfig, MovaAudioVideoPipeline
from diffsynth.utils.data.audio_video import write_video_audio

vram_config = {
|
||
"offload_dtype": torch.bfloat16,
|
||
"offload_device": "cpu",
|
||
"onload_dtype": torch.bfloat16,
|
||
"onload_device": "cuda",
|
||
"preparing_dtype": torch.bfloat16,
|
||
"preparing_device": "cuda",
|
||
"computation_dtype": torch.bfloat16,
|
||
"computation_device": "cuda",
|
||
}
|
||
pipe = MovaAudioVideoPipeline.from_pretrained(
|
||
torch_dtype=torch.bfloat16,
|
||
device="cuda",
|
||
model_configs=[
|
||
ModelConfig(model_id="openmoss/MOVA-360p", origin_file_pattern="video_dit/diffusion_pytorch_model-*.safetensors", **vram_config),
|
||
ModelConfig(model_id="openmoss/MOVA-360p", origin_file_pattern="video_dit_2/diffusion_pytorch_model-*.safetensors", **vram_config),
|
||
ModelConfig(model_id="openmoss/MOVA-360p", origin_file_pattern="audio_dit/diffusion_pytorch_model.safetensors", **vram_config),
|
||
ModelConfig(model_id="openmoss/MOVA-360p", origin_file_pattern="dual_tower_bridge/diffusion_pytorch_model.safetensors", **vram_config),
|
||
ModelConfig(model_id="openmoss/MOVA-720p", origin_file_pattern="audio_vae/diffusion_pytorch_model.safetensors", **vram_config),
|
||
ModelConfig(model_id="DiffSynth-Studio/Wan-Series-Converted-Safetensors", origin_file_pattern="Wan2.1_VAE.safetensors", **vram_config),
|
||
ModelConfig(model_id="DiffSynth-Studio/Wan-Series-Converted-Safetensors", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.safetensors", **vram_config),
|
||
],
|
||
tokenizer_config=ModelConfig(model_id="openmoss/MOVA-720p", origin_file_pattern="tokenizer/"),
|
||
vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 2,
|
||
)
|
||
negative_prompt = (
|
||
"色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,"
|
||
"整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指"
|
||
)
|
||
|
||
prompt = "Two cute orange cats, wearing boxing gloves, stand on a boxing ring and fight each other."
|
||
height, width, num_frames = 352, 640, 121
|
||
frame_rate = 24
|
||
input_image = Image.open("data/examples/wan/cat_fightning.jpg").resize((width, height)).convert("RGB")
|
||
# Image-to-video
|
||
video, audio = pipe(
|
||
prompt=prompt,
|
||
negative_prompt=negative_prompt,
|
||
height=height,
|
||
width=width,
|
||
num_frames=num_frames,
|
||
input_image=input_image,
|
||
num_inference_steps=50,
|
||
seed=0,
|
||
tiled=True,
|
||
frame_rate=frame_rate,
|
||
)
|
||
write_video_audio(video, audio, "MOVA-360p-cat.mp4", fps=24, audio_sample_rate=pipe.audio_vae.sample_rate)
|