mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-03-23 09:28:12 +00:00
hunyuanvideo text encoder
This commit is contained in:
@@ -93,6 +93,7 @@ model_loader_configs = [
|
|||||||
(None, "51aed3d27d482fceb5e0739b03060e8f", ["sd3_dit", "sd3_vae_encoder", "sd3_vae_decoder"], [SD3DiT, SD3VAEEncoder, SD3VAEDecoder], "civitai"),
|
(None, "51aed3d27d482fceb5e0739b03060e8f", ["sd3_dit", "sd3_vae_encoder", "sd3_vae_decoder"], [SD3DiT, SD3VAEEncoder, SD3VAEDecoder], "civitai"),
|
||||||
(None, "98cc34ccc5b54ae0e56bdea8688dcd5a", ["sd3_text_encoder_2"], [SD3TextEncoder2], "civitai"),
|
(None, "98cc34ccc5b54ae0e56bdea8688dcd5a", ["sd3_text_encoder_2"], [SD3TextEncoder2], "civitai"),
|
||||||
(None, "77ff18050dbc23f50382e45d51a779fe", ["sd3_dit", "sd3_vae_encoder", "sd3_vae_decoder"], [SD3DiT, SD3VAEEncoder, SD3VAEDecoder], "civitai"),
|
(None, "77ff18050dbc23f50382e45d51a779fe", ["sd3_dit", "sd3_vae_encoder", "sd3_vae_decoder"], [SD3DiT, SD3VAEEncoder, SD3VAEDecoder], "civitai"),
|
||||||
|
(None, "5da81baee73198a7c19e6d2fe8b5148e", ["sd3_text_encoder_1"], [SD3TextEncoder1], "diffusers"),
|
||||||
]
|
]
|
||||||
huggingface_model_loader_configs = [
|
huggingface_model_loader_configs = [
|
||||||
# These configs are provided for detecting model type automatically.
|
# These configs are provided for detecting model type automatically.
|
||||||
@@ -101,10 +102,11 @@ huggingface_model_loader_configs = [
|
|||||||
("MarianMTModel", "transformers.models.marian.modeling_marian", "translator", None),
|
("MarianMTModel", "transformers.models.marian.modeling_marian", "translator", None),
|
||||||
("BloomForCausalLM", "transformers.models.bloom.modeling_bloom", "beautiful_prompt", None),
|
("BloomForCausalLM", "transformers.models.bloom.modeling_bloom", "beautiful_prompt", None),
|
||||||
("Qwen2ForCausalLM", "transformers.models.qwen2.modeling_qwen2", "qwen_prompt", None),
|
("Qwen2ForCausalLM", "transformers.models.qwen2.modeling_qwen2", "qwen_prompt", None),
|
||||||
("LlamaForCausalLM", "transformers.models.llama.modeling_llama", "omost_prompt", None),
|
# ("LlamaForCausalLM", "transformers.models.llama.modeling_llama", "omost_prompt", None),
|
||||||
("T5EncoderModel", "diffsynth.models.flux_text_encoder", "flux_text_encoder_2", "FluxTextEncoder2"),
|
("T5EncoderModel", "diffsynth.models.flux_text_encoder", "flux_text_encoder_2", "FluxTextEncoder2"),
|
||||||
("CogVideoXTransformer3DModel", "diffsynth.models.cog_dit", "cog_dit", "CogDiT"),
|
("CogVideoXTransformer3DModel", "diffsynth.models.cog_dit", "cog_dit", "CogDiT"),
|
||||||
("SiglipModel", "transformers.models.siglip.modeling_siglip", "siglip_vision_model", "SiglipVisionModel")
|
("SiglipModel", "transformers.models.siglip.modeling_siglip", "siglip_vision_model", "SiglipVisionModel"),
|
||||||
|
("LlamaForCausalLM", "transformers.models.llama.modeling_llama", "hunyuan_video_text_encoder_2", "LlamaModel")
|
||||||
]
|
]
|
||||||
patch_model_loader_configs = [
|
patch_model_loader_configs = [
|
||||||
# These configs are provided for detecting model type automatically.
|
# These configs are provided for detecting model type automatically.
|
||||||
@@ -627,6 +629,22 @@ preset_models_on_modelscope = {
|
|||||||
("AI-ModelScope/stable-diffusion-3.5-large", "text_encoders/clip_g.safetensors", "models/stable_diffusion_3/text_encoders"),
|
("AI-ModelScope/stable-diffusion-3.5-large", "text_encoders/clip_g.safetensors", "models/stable_diffusion_3/text_encoders"),
|
||||||
("AI-ModelScope/stable-diffusion-3.5-large", "text_encoders/t5xxl_fp16.safetensors", "models/stable_diffusion_3/text_encoders"),
|
("AI-ModelScope/stable-diffusion-3.5-large", "text_encoders/t5xxl_fp16.safetensors", "models/stable_diffusion_3/text_encoders"),
|
||||||
],
|
],
|
||||||
|
"HunyuanVideo":{
|
||||||
|
"file_list": [
|
||||||
|
("AI-ModelScope/clip-vit-large-patch14", "model.safetensors", "models/HunyuanVideo/text_encoder"),
|
||||||
|
("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model-00001-of-00004.safetensors", "models/HunyuanVideo/text_encoder_2"),
|
||||||
|
("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model-00002-of-00004.safetensors", "models/HunyuanVideo/text_encoder_2"),
|
||||||
|
("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model-00003-of-00004.safetensors", "models/HunyuanVideo/text_encoder_2"),
|
||||||
|
("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model-00004-of-00004.safetensors", "models/HunyuanVideo/text_encoder_2"),
|
||||||
|
("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "config.json", "models/HunyuanVideo/text_encoder_2"),
|
||||||
|
("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model.safetensors.index.json", "models/HunyuanVideo/text_encoder_2"),
|
||||||
|
|
||||||
|
],
|
||||||
|
"load_path": [
|
||||||
|
"models/HunyuanVideo/text_encoder/model.safetensors",
|
||||||
|
"models/HunyuanVideo/text_encoder_2",
|
||||||
|
],
|
||||||
|
},
|
||||||
}
|
}
|
||||||
Preset_model_id: TypeAlias = Literal[
|
Preset_model_id: TypeAlias = Literal[
|
||||||
"HunyuanDiT",
|
"HunyuanDiT",
|
||||||
@@ -682,4 +700,5 @@ Preset_model_id: TypeAlias = Literal[
|
|||||||
"Annotators:Openpose",
|
"Annotators:Openpose",
|
||||||
"StableDiffusion3.5-large",
|
"StableDiffusion3.5-large",
|
||||||
"StableDiffusion3.5-medium",
|
"StableDiffusion3.5-medium",
|
||||||
|
"HunyuanVideo",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -9,4 +9,5 @@ from .flux_image import FluxImagePipeline
|
|||||||
from .cog_video import CogVideoPipeline
|
from .cog_video import CogVideoPipeline
|
||||||
from .omnigen_image import OmnigenImagePipeline
|
from .omnigen_image import OmnigenImagePipeline
|
||||||
from .pipeline_runner import SDVideoPipelineRunner
|
from .pipeline_runner import SDVideoPipelineRunner
|
||||||
|
from .hunyuan_video import HunyuanVideoPipeline
|
||||||
KolorsImagePipeline = SDXLImagePipeline
|
KolorsImagePipeline = SDXLImagePipeline
|
||||||
|
|||||||
51
diffsynth/pipelines/hunyuan_video.py
Normal file
51
diffsynth/pipelines/hunyuan_video.py
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
from ..models import ModelManager, SD3TextEncoder1
|
||||||
|
from .base import BasePipeline
|
||||||
|
from ..prompters import HunyuanVideoPrompter
|
||||||
|
import torch
|
||||||
|
from transformers import LlamaModel
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
class HunyuanVideoPipeline(BasePipeline):
|
||||||
|
|
||||||
|
def __init__(self, device="cuda", torch_dtype=torch.float16):
    """Create a pipeline whose text encoders are not attached yet.

    Args:
        device: Forwarded unchanged to the base pipeline constructor.
        torch_dtype: Forwarded unchanged to the base pipeline constructor.
    """
    super().__init__(device=device, torch_dtype=torch_dtype)
    # Naming follows DiffSynth's ordering: text_encoder_1 is CLIP and
    # text_encoder_2 is the LLM, which is exactly the reverse of the
    # original HunyuanVideo source code.
    self.text_encoder_1: SD3TextEncoder1 = None
    self.text_encoder_2: LlamaModel = None
    self.prompter = HunyuanVideoPrompter()
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_models(self, model_manager: ModelManager):
    """Fetch both text encoders from ``model_manager`` and share them with the prompter.

    The encoders are looked up under the model names ``"sd3_text_encoder_1"``
    and ``"hunyuan_video_text_encoder_2"``.
    """
    clip_encoder = model_manager.fetch_model("sd3_text_encoder_1")
    llm_encoder = model_manager.fetch_model("hunyuan_video_text_encoder_2")
    self.text_encoder_1 = clip_encoder
    self.text_encoder_2 = llm_encoder
    self.prompter.fetch_models(clip_encoder, llm_encoder)
|
||||||
|
|
||||||
|
@staticmethod
def from_model_manager(model_manager: ModelManager, device=None):
    """Build a pipeline and populate it from ``model_manager``.

    Args:
        model_manager: Source of the models, the dtype and the default device.
        device: Optional override; when ``None`` the manager's device is used.

    Returns:
        A ``HunyuanVideoPipeline`` on which ``fetch_models`` has been called.
    """
    target_device = device if device is not None else model_manager.device
    pipe = HunyuanVideoPipeline(
        device=target_device,
        torch_dtype=model_manager.torch_dtype,
    )
    pipe.fetch_models(model_manager)
    return pipe
|
||||||
|
|
||||||
|
def encode_prompt(self, prompt, positive=True, clip_sequence_length=77, llm_sequence_length=256):
    """Encode ``prompt`` through the prompter on this pipeline's device.

    Args:
        prompt: Prompt passed through to the prompter.
        positive: Passed through to the prompter.
        clip_sequence_length: Passed through to the prompter.
        llm_sequence_length: Passed through to the prompter.

    Returns:
        A dict whose ``"prompt_emb"`` is the first value returned by the
        prompter and whose ``"pooled_prompt_emb"`` is the second.
    """
    embeddings = self.prompter.encode_prompt(
        prompt,
        device=self.device,
        positive=positive,
        clip_sequence_length=clip_sequence_length,
        llm_sequence_length=llm_sequence_length,
    )
    first_emb, second_emb = embeddings
    return dict(prompt_emb=first_emb, pooled_prompt_emb=second_emb)
|
||||||
|
|
||||||
|
@torch.no_grad()
def __call__(
    self,
    prompt,
    negative_prompt="",
    seed=None,
    progress_bar_cmd=tqdm,
    progress_bar_st=None,
):
    """Encode ``prompt`` as a positive prompt and return the embeddings.

    Only prompt encoding happens here: ``negative_prompt``, ``seed``,
    ``progress_bar_cmd`` and ``progress_bar_st`` are accepted but are not
    used by this body.

    Returns:
        The dict produced by ``self.encode_prompt(prompt, positive=True)``.
    """
    return self.encode_prompt(prompt, positive=True)
|
||||||
@@ -7,3 +7,4 @@ from .kolors_prompter import KolorsPrompter
|
|||||||
from .flux_prompter import FluxPrompter
|
from .flux_prompter import FluxPrompter
|
||||||
from .omost import OmostPromter
|
from .omost import OmostPromter
|
||||||
from .cog_prompter import CogPrompter
|
from .cog_prompter import CogPrompter
|
||||||
|
from .hunyuan_video_prompter import HunyuanVideoPrompter
|
||||||
|
|||||||
149
diffsynth/prompters/hunyuan_video_prompter.py
Normal file
149
diffsynth/prompters/hunyuan_video_prompter.py
Normal file
@@ -0,0 +1,149 @@
|
|||||||
|
from .base_prompter import BasePrompter
|
||||||
|
from ..models.sd3_text_encoder import SD3TextEncoder1
|
||||||
|
from transformers import CLIPTokenizer, LlamaTokenizerFast, LlamaModel
|
||||||
|
import os, torch
|
||||||
|
|
||||||
|
# Chat-style wrapper for image prompts; the "{}" placeholder receives the user
# prompt via str.format before the text is sent to the LLM tokenizer.
PROMPT_TEMPLATE_ENCODE = (
    "<|start_header_id|>system<|end_header_id|>\n\nDescribe the image by detailing the color, shape, size, texture, "
    "quantity, text, spatial relationships of the objects and background:<|eot_id|>"
    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>")

# Chat-style wrapper for video prompts; same "{}" placeholder convention.
# NOTE(review): the numbered items are concatenated without separators; this
# looks intentional because crop_start below is tied to the exact text - confirm
# before editing the wording.
PROMPT_TEMPLATE_ENCODE_VIDEO = (
    "<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: "
    "1. The main content and theme of the video."
    "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
    "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
    "4. background environment, light, style and atmosphere."
    "5. camera angles, movements, and transitions used in the video:<|eot_id|>"
    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>")

# Template registry. "crop_start" is the number of leading sequence positions
# that encode_prompt_using_llm slices off the LLM hidden states (and adds to the
# tokenizer's max_length) - presumably the token length of the template prefix;
# TODO confirm against the tokenizer.
PROMPT_TEMPLATE = {
    "dit-llm-encode": {
        "template": PROMPT_TEMPLATE_ENCODE,
        "crop_start": 36,
    },
    "dit-llm-encode-video": {
        "template": PROMPT_TEMPLATE_ENCODE_VIDEO,
        "crop_start": 95,
    },
}

# Default negative prompt text; it is not referenced by the code visible in
# this module.
NEGATIVE_PROMPT = "Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion"
|
||||||
|
|
||||||
|
|
||||||
|
class HunyuanVideoPrompter(BasePrompter):
|
||||||
|
|
||||||
|
def __init__(
    self,
    tokenizer_1_path=None,
    tokenizer_2_path=None,
):
    """Load both tokenizers and select the prompt templates.

    Args:
        tokenizer_1_path: Directory for the CLIP tokenizer. When ``None`` it
            defaults to ``tokenizer_configs/hunyuan_video/tokenizer_1`` under
            the directory two levels above this file.
        tokenizer_2_path: Directory for the Llama tokenizer. When ``None`` it
            defaults to ``tokenizer_configs/hunyuan_video/tokenizer_2`` under
            the same directory.
    """
    # Both defaults hang off the same directory, so resolve it only once.
    package_root = os.path.dirname(os.path.dirname(__file__))
    if tokenizer_1_path is None:
        tokenizer_1_path = os.path.join(
            package_root, "tokenizer_configs/hunyuan_video/tokenizer_1")
    if tokenizer_2_path is None:
        tokenizer_2_path = os.path.join(
            package_root, "tokenizer_configs/hunyuan_video/tokenizer_2")

    super().__init__()

    self.tokenizer_1 = CLIPTokenizer.from_pretrained(tokenizer_1_path)
    self.tokenizer_2 = LlamaTokenizerFast.from_pretrained(tokenizer_2_path, padding_side='right')

    # The encoders are attached later through fetch_models().
    self.text_encoder_1: SD3TextEncoder1 = None
    self.text_encoder_2: LlamaModel = None

    self.prompt_template = PROMPT_TEMPLATE['dit-llm-encode']
    self.prompt_template_video = PROMPT_TEMPLATE['dit-llm-encode-video']
|
||||||
|
|
||||||
|
def fetch_models(self, text_encoder_1: SD3TextEncoder1 = None, text_encoder_2: LlamaModel = None):
    """Store the two text encoders on the prompter (either may be ``None``)."""
    self.text_encoder_1, self.text_encoder_2 = text_encoder_1, text_encoder_2
|
||||||
|
|
||||||
|
def apply_text_to_template(self, text, template):
    """Insert ``text`` into ``template`` using ``str.format``.

    Args:
        text: A prompt string, or a (possibly nested) list of prompt strings.
        template: Format string containing a ``{}`` placeholder.

    Returns:
        The formatted string for a ``str`` input, or a list of formatted
        results mirroring the structure of a ``list`` input.

    Raises:
        TypeError: If ``text`` is neither a ``str`` nor a ``list``.
    """
    assert isinstance(template, str)
    if isinstance(text, list):
        # The template must be forwarded to the recursive call; omitting it
        # made every list input fail with a missing-argument TypeError.
        return [self.apply_text_to_template(item, template) for item in text]
    if isinstance(text, str):
        # The formatted string is what gets sent to the LLM tokenizer.
        return template.format(text)
    raise TypeError(f"Unsupported prompt type: {type(text)}")
|
||||||
|
|
||||||
|
def encode_prompt_using_clip(self, prompt, max_length, device):
    """Tokenize ``prompt`` with ``tokenizer_1`` and run ``text_encoder_1`` on it.

    The tokenizer is asked to pad and truncate to ``max_length`` and the
    resulting ids are moved to ``device`` before encoding.

    Returns:
        Element ``[0]`` of the value returned by ``text_encoder_1``.
    """
    tokenized = self.tokenizer_1(
        prompt,
        return_tensors="pt",
        padding="max_length",
        max_length=max_length,
        truncation=True,
    )
    token_ids = tokenized.input_ids.to(device)
    encoder_output = self.text_encoder_1(input_ids=token_ids)
    return encoder_output[0]
|
||||||
|
|
||||||
|
def encode_prompt_using_llm(self,
                            prompt,
                            max_length,
                            device,
                            crop_start,
                            hidden_state_skip_layer=2,
                            apply_final_norm=False,
                            use_attention_mask=True):
    """Encode ``prompt`` with ``text_encoder_2`` and drop the leading positions.

    Args:
        prompt: Text (or texts) handed to ``tokenizer_2``.
        max_length: Sequence length to keep after cropping; the tokenizer is
            asked for ``max_length + crop_start`` positions.
        device: Device the token ids and attention mask are moved to.
        crop_start: Number of leading sequence positions removed from the
            returned hidden states (no cropping when it is not positive).
        hidden_state_skip_layer: When not ``None``, the hidden state at index
            ``-(hidden_state_skip_layer + 1)`` of ``outputs.hidden_states`` is
            used; when ``None``, ``outputs['last_hidden_state']`` is used.
        apply_final_norm: Apply ``text_encoder_2.norm`` to the selected hidden
            state; only takes effect when ``hidden_state_skip_layer > 0``.
        use_attention_mask: Accepted for interface compatibility. The mask is
            always passed to the encoder and only the hidden states are
            returned, so this flag does not affect the result.

    Returns:
        The selected hidden states with the first ``crop_start`` sequence
        positions removed.
    """
    # Reserve extra room for the prefix that is cropped away below.
    tokenized = self.tokenizer_2(
        prompt,
        return_tensors="pt",
        padding="max_length",
        max_length=max_length + crop_start,
        truncation=True,
    )
    token_ids = tokenized.input_ids.to(device)
    mask = tokenized.attention_mask.to(device)

    wants_intermediate = hidden_state_skip_layer is not None
    outputs = self.text_encoder_2(
        input_ids=token_ids,
        attention_mask=mask,
        output_hidden_states=wants_intermediate,
    )

    if not wants_intermediate:
        hidden = outputs['last_hidden_state']
    else:
        hidden = outputs.hidden_states[-(hidden_state_skip_layer + 1)]
        if apply_final_norm and hidden_state_skip_layer > 0:
            hidden = self.text_encoder_2.norm(hidden)

    # Crop along the sequence axis (second index).
    if crop_start > 0:
        hidden = hidden[:, crop_start:]
    return hidden
|
||||||
|
|
||||||
|
def encode_prompt(self,
                  prompt,
                  positive=True,
                  device="cuda",
                  clip_sequence_length=77,
                  llm_sequence_length=256,
                  data_type='video',
                  use_template=True,
                  hidden_state_skip_layer=2,
                  apply_final_norm=False,
                  use_attention_mask=True):
    """Encode ``prompt`` with both the CLIP and the LLM text encoders.

    Args:
        prompt: Raw prompt; it is first passed through ``self.process_prompt``.
        positive: Forwarded to ``self.process_prompt``.
        device: Device on which tokenized inputs are placed.
        clip_sequence_length: Max token length for the CLIP branch.
        llm_sequence_length: Token length kept for the LLM branch (after the
            template prefix, if any, has been cropped).
        data_type: ``'video'`` selects the video template; any other value
            selects the image template.
        use_template: Wrap the prompt in the selected template before the LLM
            branch. When ``False`` the prompt is sent unwrapped and nothing is
            cropped from the LLM output.
        hidden_state_skip_layer: Forwarded to ``encode_prompt_using_llm``.
        apply_final_norm: Forwarded to ``encode_prompt_using_llm``.
        use_attention_mask: Forwarded to ``encode_prompt_using_llm``.

    Returns:
        ``(prompt_emb, pooled_prompt_emb)`` - the LLM branch result followed
        by the CLIP branch result.
    """
    prompt = self.process_prompt(prompt, positive=positive)

    if use_template:
        template = self.prompt_template_video if data_type == 'video' else self.prompt_template
        prompt_formated = self.apply_text_to_template(prompt, template['template'])
        crop_start = template.get("crop_start", 0)
    else:
        prompt_formated = prompt
        # No template prefix was added, so there is nothing to crop; using the
        # template's offset here would discard real prompt tokens.
        crop_start = 0

    # CLIP always sees the un-templated prompt.
    pooled_prompt_emb = self.encode_prompt_using_clip(prompt, clip_sequence_length, device)

    # LLM sees the (optionally) templated prompt.
    prompt_emb = self.encode_prompt_using_llm(
        prompt_formated, llm_sequence_length, device, crop_start,
        hidden_state_skip_layer, apply_final_norm, use_attention_mask)

    return prompt_emb, pooled_prompt_emb
|
||||||
48895
diffsynth/tokenizer_configs/hunyuan_video/tokenizer_1/merges.txt
Normal file
48895
diffsynth/tokenizer_configs/hunyuan_video/tokenizer_1/merges.txt
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,30 @@
|
|||||||
|
{
|
||||||
|
"bos_token": {
|
||||||
|
"content": "<|startoftext|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": true,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"eos_token": {
|
||||||
|
"content": "<|endoftext|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"pad_token": {
|
||||||
|
"content": "<|endoftext|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"unk_token": {
|
||||||
|
"content": "<|endoftext|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,30 @@
|
|||||||
|
{
|
||||||
|
"add_prefix_space": false,
|
||||||
|
"added_tokens_decoder": {
|
||||||
|
"49406": {
|
||||||
|
"content": "<|startoftext|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": true,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"49407": {
|
||||||
|
"content": "<|endoftext|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"bos_token": "<|startoftext|>",
|
||||||
|
"clean_up_tokenization_spaces": true,
|
||||||
|
"do_lower_case": true,
|
||||||
|
"eos_token": "<|endoftext|>",
|
||||||
|
"errors": "replace",
|
||||||
|
"model_max_length": 77,
|
||||||
|
"pad_token": "<|endoftext|>",
|
||||||
|
"tokenizer_class": "CLIPTokenizer",
|
||||||
|
"unk_token": "<|endoftext|>"
|
||||||
|
}
|
||||||
49410
diffsynth/tokenizer_configs/hunyuan_video/tokenizer_1/vocab.json
Normal file
49410
diffsynth/tokenizer_configs/hunyuan_video/tokenizer_1/vocab.json
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,30 @@
|
|||||||
|
{
|
||||||
|
"bos_token": {
|
||||||
|
"content": "<|begin_of_text|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"eos_token": {
|
||||||
|
"content": "<|end_of_text|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"pad_token": {
|
||||||
|
"content": "<pad>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"unk_token": {
|
||||||
|
"content": "<unk>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
}
|
||||||
|
}
|
||||||
1251020
diffsynth/tokenizer_configs/hunyuan_video/tokenizer_2/tokenizer.json
Normal file
1251020
diffsynth/tokenizer_configs/hunyuan_video/tokenizer_2/tokenizer.json
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
16
examples/video_synthesis/hunyuanvideo.py
Normal file
16
examples/video_synthesis/hunyuanvideo.py
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
from diffsynth import ModelManager, HunyuanVideoPipeline, download_models
import torch


# Download models (automatically)
download_models(["HunyuanVideo"])

# Load models
# The "HunyuanVideo" preset stores its files under models/HunyuanVideo/...,
# so the weights must be loaded from that same location.
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
model_manager.load_models([
    "models/HunyuanVideo/text_encoder/model.safetensors",
    "models/HunyuanVideo/text_encoder_2",
])
pipe = HunyuanVideoPipeline.from_model_manager(model_manager)
prompt = 'A cat walks on the grass, realistic style.'
pipe(prompt)
|
||||||
Reference in New Issue
Block a user