mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-03-18 22:08:13 +00:00
release ExVideo
This commit is contained in:
83
examples/ExVideo/ExVideo_svd.py
Normal file
83
examples/ExVideo/ExVideo_svd.py
Normal file
@@ -0,0 +1,83 @@
|
||||
from diffsynth import save_video, ModelManager, SVDVideoPipeline, HunyuanDiTImagePipeline
|
||||
from diffsynth import ModelManager
|
||||
import torch, os
|
||||
|
||||
|
||||
def generate_image():
    """Generate the first frame with the Hunyuan DiT text-to-image pipeline.

    Loads the four Hunyuan DiT checkpoints (CLIP text encoder, mT5 text
    encoder, the DiT model and the SDXL VAE) in float16 on the CUDA device,
    seeds torch with 0, and samples one 1024x1024 image for a fixed prompt.

    Returns:
        The object returned by the pipeline call. The rest of this script
        calls ``.save`` and ``.resize`` on it, so it is presumably a PIL
        image -- confirm against ``HunyuanDiTImagePipeline``.
    """
    # Set the tokenizers parallelism flag before any model is loaded.
    os.environ["TOKENIZERS_PARALLELISM"] = "True"

    # Checkpoint files expected under the local "models" directory.
    hunyuan_dit_checkpoints = [
        "models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin",
        "models/HunyuanDiT/t2i/mt5/pytorch_model.bin",
        "models/HunyuanDiT/t2i/model/pytorch_model_ema.pt",
        "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin",
    ]
    model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
    model_manager.load_models(hunyuan_dit_checkpoints)
    pipe = HunyuanDiTImagePipeline.from_model_manager(model_manager)

    # Fixed seed so that repeated runs sample the same image.
    torch.manual_seed(0)
    return pipe(
        prompt="bonfire, on the stone",
        negative_prompt="错误的眼睛,糟糕的人脸,毁容,糟糕的艺术,变形,多余的肢体,模糊的颜色,模糊,重复,病态,残缺,",
        num_inference_steps=50,
        height=1024,
        width=1024,
    )
|
||||
|
||||
|
||||
def generate_video(image):
    """Generate a 128-frame, 512x512 video from a single input image.

    Loads the Stable Video Diffusion checkpoints in float16 on the CUDA
    device, seeds torch with 1, and runs the SVD video pipeline for
    50 inference steps.

    Args:
        image: First frame of the video. It must provide
            ``resize((width, height))``; presumably a PIL image.

    Returns:
        The object returned by the pipeline call. The caller iterates over
        it frame by frame and passes it to ``save_video``.
    """
    svd_checkpoints = [
        "models/stable_video_diffusion/svd_xt.safetensors",
        "models/stable_video_diffusion/model.fp16.safetensors",
    ]
    model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
    model_manager.load_models(svd_checkpoints)
    pipe = SVDVideoPipeline.from_model_manager(model_manager)

    # Fixed seed so that repeated runs sample the same video.
    torch.manual_seed(1)

    # The input image is resized to the square resolution the video is
    # generated at.
    resolution = 512
    first_frame = image.resize((resolution, resolution))
    return pipe(
        input_image=first_frame,
        num_frames=128,
        fps=30,
        height=resolution,
        width=resolution,
        motion_bucket_id=127,
        num_inference_steps=50,
        min_cfg_scale=2,
        max_cfg_scale=2,
        contrast_enhance_scale=1.2,
    )
|
||||
|
||||
|
||||
def upscale_video(image, video):
    """Re-render a generated video at 1024x1024.

    Loads the Stable Video Diffusion checkpoints in float16 on the CUDA
    device, seeds torch with 2, resizes the reference image and every frame
    of ``video`` to 1024x1024, and runs the SVD video pipeline on them with
    ``denoising_strength=0.5`` for 25 inference steps.

    Args:
        image: Reference first frame; must provide ``resize((w, h))``.
        video: Iterable of frames, each providing ``resize((w, h))``.

    Returns:
        The object returned by the pipeline call (the upscaled video).
    """
    svd_checkpoints = [
        "models/stable_video_diffusion/svd_xt.safetensors",
        "models/stable_video_diffusion/model.fp16.safetensors",
    ]
    model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
    model_manager.load_models(svd_checkpoints)
    pipe = SVDVideoPipeline.from_model_manager(model_manager)

    # Fixed seed so that repeated runs produce the same result.
    torch.manual_seed(2)

    # The reference image and all input frames are resized to the target
    # resolution before being handed to the pipeline.
    target_size = (1024, 1024)
    reference_frame = image.resize(target_size)
    resized_frames = [frame.resize(target_size) for frame in video]
    return pipe(
        input_image=reference_frame,
        input_video=resized_frames,
        denoising_strength=0.5,
        num_frames=128,
        fps=30,
        height=1024,
        width=1024,
        motion_bucket_id=127,
        num_inference_steps=25,
        min_cfg_scale=2,
        max_cfg_scale=2,
        contrast_enhance_scale=1.2,
    )
|
||||
|
||||
|
||||
# Step 1: create the first frame with Hunyuan DiT and keep a copy on disk.
# To start from your own picture instead, replace the `generate_image()` call
# with `image = Image.open("your_image_file.png")` (this requires
# `from PIL import Image`, which this script does not import).
image = generate_image()
image.save("image.png")

# Step 2: generate the video at a resolution of 512.
video = generate_video(image)
save_video(video, "video_512.mp4", fps=30)

# Step 3: upscale the 512 video to a resolution of 1024.
video = upscale_video(image, video)
save_video(video, "video_1024.mp4", fps=30)
|
||||
16
examples/ExVideo/README.md
Normal file
16
examples/ExVideo/README.md
Normal file
@@ -0,0 +1,16 @@
|
||||
# ExVideo
|
||||
|
||||
ExVideo is a post-tuning technique that enhances the capabilities of video generation models. We have extended Stable Video Diffusion to generate long videos of up to 128 frames.
|
||||
|
||||
* [Project Page](https://ecnu-cilab.github.io/ExVideoProjectPage/)
|
||||
* [Source Code](https://github.com/modelscope/DiffSynth-Studio/tree/main/examples/ExVideo)
|
||||
* Technical report
|
||||
* Extended models
|
||||
* [HuggingFace](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1)
|
||||
* [ModelScope](https://modelscope.cn/models/ECNU-CILab/ExVideo-SVD-128f-v1)
|
||||
|
||||
## Example: Text-to-video via extended Stable Video Diffusion
|
||||
|
||||
Generate a video using a text-to-image model and our image-to-video model. See [ExVideo_svd.py](./ExVideo_svd.py).
|
||||
|
||||
https://github.com/modelscope/DiffSynth-Studio/assets/35051019/d97f6aa9-8064-4b5b-9d49-ed6001bb9acc
|
||||
Reference in New Issue
Block a user