mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-03-19 06:48:12 +00:00
load hunyuani2v model
This commit is contained in:
88
examples/HunyuanVideo/hunyuanvideo_i2v.py
Normal file
88
examples/HunyuanVideo/hunyuanvideo_i2v.py
Normal file
@@ -0,0 +1,88 @@
|
||||
import torch
|
||||
from diffsynth import ModelManager, HunyuanVideoPipeline, download_models, save_video
|
||||
from diffsynth.prompters.hunyuan_video_prompter import HunyuanVideoPrompter
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
import torchvision.transforms as transforms
|
||||
|
||||
|
||||
def generate_crop_size_list(base_size=256, patch_size=32, max_ratio=4.0):
|
||||
num_patches = round((base_size / patch_size)**2)
|
||||
assert max_ratio >= 1.0
|
||||
crop_size_list = []
|
||||
wp, hp = num_patches, 1
|
||||
while wp > 0:
|
||||
if max(wp, hp) / min(wp, hp) <= max_ratio:
|
||||
crop_size_list.append((wp * patch_size, hp * patch_size))
|
||||
if (hp + 1) * wp <= num_patches:
|
||||
hp += 1
|
||||
else:
|
||||
wp -= 1
|
||||
return crop_size_list
|
||||
|
||||
|
||||
def get_closest_ratio(height: float, width: float, ratios: list, buckets: list):
|
||||
aspect_ratio = float(height) / float(width)
|
||||
closest_ratio_id = np.abs(ratios - aspect_ratio).argmin()
|
||||
closest_ratio = min(ratios, key=lambda ratio: abs(float(ratio) - aspect_ratio))
|
||||
return buckets[closest_ratio_id], float(closest_ratio)
|
||||
|
||||
|
||||
def prepare_vae_inputs(semantic_images, i2v_resolution="720p"):
|
||||
if i2v_resolution == "720p":
|
||||
bucket_hw_base_size = 960
|
||||
elif i2v_resolution == "540p":
|
||||
bucket_hw_base_size = 720
|
||||
elif i2v_resolution == "360p":
|
||||
bucket_hw_base_size = 480
|
||||
else:
|
||||
raise ValueError(f"i2v_resolution: {i2v_resolution} must be in [360p, 540p, 720p]")
|
||||
origin_size = semantic_images[0].size
|
||||
|
||||
crop_size_list = generate_crop_size_list(bucket_hw_base_size, 32)
|
||||
aspect_ratios = np.array([round(float(h) / float(w), 5) for h, w in crop_size_list])
|
||||
closest_size, closest_ratio = get_closest_ratio(origin_size[1], origin_size[0], aspect_ratios, crop_size_list)
|
||||
ref_image_transform = transforms.Compose([
|
||||
transforms.Resize(closest_size),
|
||||
transforms.CenterCrop(closest_size),
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize([0.5], [0.5])
|
||||
])
|
||||
|
||||
semantic_image_pixel_values = [ref_image_transform(semantic_image) for semantic_image in semantic_images]
|
||||
semantic_image_pixel_values = torch.cat(semantic_image_pixel_values).unsqueeze(0).unsqueeze(2)
|
||||
return semantic_image_pixel_values
|
||||
|
||||
|
||||
model_manager = ModelManager()
|
||||
|
||||
# The other modules are loaded in float16.
|
||||
|
||||
model_manager.load_models(
|
||||
[
|
||||
"models/HunyuanVideoI2V/transformers/mp_rank_00_model_states.pt"
|
||||
],
|
||||
torch_dtype=torch.bfloat16, # you can use torch_dtype=torch.float8_e4m3fn to enable quantization.
|
||||
device="cuda"
|
||||
)
|
||||
|
||||
model_manager.load_models(
|
||||
[
|
||||
"models/HunyuanVideo/text_encoder/model.safetensors",
|
||||
"models/HunyuanVideoI2V/text_encoder_2",
|
||||
'models/HunyuanVideoI2V/vae/pytorch_model.pt'
|
||||
|
||||
],
|
||||
torch_dtype=torch.float16,
|
||||
device="cuda"
|
||||
)
|
||||
# The computation device is "cuda".
|
||||
pipe = HunyuanVideoPipeline.from_model_manager(
|
||||
model_manager,
|
||||
torch_dtype=torch.bfloat16,
|
||||
device="cuda",
|
||||
enable_vram_management=False
|
||||
)
|
||||
# Although you have enough VRAM, we still recommend you to enable offload.
|
||||
pipe.enable_cpu_offload()
|
||||
print()
|
||||
Reference in New Issue
Block a user