From 0b1704976a87667207c4fd32b1fecb080eb229e2 Mon Sep 17 00:00:00 2001 From: Artiprocher Date: Thu, 27 Jun 2024 19:43:50 +0800 Subject: [PATCH] update examples and downloaders --- diffsynth/models/__init__.py | 157 +++++++++++++++--- diffsynth/models/sd_ipadapter.py | 2 +- diffsynth/pipelines/stable_diffusion_xl.py | 5 + diffsynth/prompts/sdxl_prompter.py | 4 + examples/Diffutoon/diffutoon_toon_shading.py | 14 +- ...utoon_toon_shading_with_editing_signals.py | 18 +- examples/Diffutoon/sd_toon_shading.py | 19 ++- examples/ExVideo/ExVideo_svd_test.py | 25 ++- examples/Ip-Adapter/README.md | 43 ++++- examples/Ip-Adapter/sd_ipadapter.py | 38 +++++ examples/Ip-Adapter/sdxl_ipadapter.py | 55 ++++-- .../sdxl_ipadapter_multi_reference.py | 34 ++++ examples/diffsynth/sd_video_rerender.py | 25 ++- examples/hunyuan_dit/README.md | 8 + .../image_synthesis/sd_prompt_refining.py | 7 +- examples/image_synthesis/sd_text_to_image.py | 18 +- .../image_synthesis/sdxl_text_to_image.py | 6 +- examples/image_synthesis/sdxl_turbo.py | 6 +- examples/video_synthesis/sd_text_to_video.py | 6 +- .../video_synthesis/sdxl_text_to_video.py | 8 +- examples/video_synthesis/svd_text_to_video.py | 11 +- 21 files changed, 409 insertions(+), 100 deletions(-) create mode 100644 examples/Ip-Adapter/sd_ipadapter.py create mode 100644 examples/Ip-Adapter/sdxl_ipadapter_multi_reference.py diff --git a/diffsynth/models/__init__.py b/diffsynth/models/__init__.py index ba2f601..8939add 100644 --- a/diffsynth/models/__init__.py +++ b/diffsynth/models/__init__.py @@ -48,23 +48,129 @@ preset_models_on_huggingface = { ], } preset_models_on_modelscope = { + # Hunyuan DiT "HunyuanDiT": [ ("modelscope/HunyuanDiT", "t2i/clip_text_encoder/pytorch_model.bin", "models/HunyuanDiT/t2i/clip_text_encoder"), ("modelscope/HunyuanDiT", "t2i/mt5/pytorch_model.bin", "models/HunyuanDiT/t2i/mt5"), ("modelscope/HunyuanDiT", "t2i/model/pytorch_model_ema.pt", "models/HunyuanDiT/t2i/model"), ("modelscope/HunyuanDiT", 
"t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin", "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix"), ], + # Stable Video Diffusion "stable-video-diffusion-img2vid-xt": [ ("AI-ModelScope/stable-video-diffusion-img2vid-xt", "svd_xt.safetensors", "models/stable_video_diffusion"), ], + # ExVideo "ExVideo-SVD-128f-v1": [ ("ECNU-CILab/ExVideo-SVD-128f-v1", "model.fp16.safetensors", "models/stable_video_diffusion"), ], + # Stable Diffusion + "StableDiffusion_v15": [ + ("AI-ModelScope/stable-diffusion-v1-5", "v1-5-pruned-emaonly.safetensors", "models/stable_diffusion"), + ], + "DreamShaper_8": [ + ("sd_lora/dreamshaper_8", "dreamshaper_8.safetensors", "models/stable_diffusion"), + ], + "AingDiffusion_v12": [ + ("sd_lora/aingdiffusion_v12", "aingdiffusion_v12.safetensors", "models/stable_diffusion"), + ], + "Flat2DAnimerge_v45Sharp": [ + ("sd_lora/Flat-2D-Animerge", "flat2DAnimerge_v45Sharp.safetensors", "models/stable_diffusion"), + ], + # Textual Inversion + "TextualInversion_VeryBadImageNegative_v1.3": [ + ("sd_lora/verybadimagenegative_v1.3", "verybadimagenegative_v1.3.pt", "models/textual_inversion"), + ], + # Stable Diffusion XL + "StableDiffusionXL_v1": [ + ("AI-ModelScope/stable-diffusion-xl-base-1.0", "sd_xl_base_1.0.safetensors", "models/stable_diffusion_xl"), + ], + "BluePencilXL_v200": [ + ("sd_lora/bluePencilXL_v200", "bluePencilXL_v200.safetensors", "models/stable_diffusion_xl"), + ], + "StableDiffusionXL_Turbo": [ + ("AI-ModelScope/sdxl-turbo", "sd_xl_turbo_1.0_fp16.safetensors", "models/stable_diffusion_xl_turbo"), + ], + # ControlNet + "ControlNet_v11f1p_sd15_depth": [ + ("AI-ModelScope/ControlNet-v1-1", "control_v11f1p_sd15_depth.pth", "models/ControlNet"), + ("sd_lora/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators") + ], + "ControlNet_v11p_sd15_softedge": [ + ("AI-ModelScope/ControlNet-v1-1", "control_v11p_sd15_softedge.pth", "models/ControlNet"), + ("sd_lora/Annotators", "ControlNetHED.pth", "models/Annotators") + ], + 
"ControlNet_v11f1e_sd15_tile": [ + ("AI-ModelScope/ControlNet-v1-1", "control_v11f1e_sd15_tile.pth", "models/ControlNet") + ], + "ControlNet_v11p_sd15_lineart": [ + ("AI-ModelScope/ControlNet-v1-1", "control_v11p_sd15_lineart.pth", "models/ControlNet"), + ("sd_lora/Annotators", "sk_model.pth", "models/Annotators"), + ("sd_lora/Annotators", "sk_model2.pth", "models/Annotators") + ], + # AnimateDiff + "AnimateDiff_v2": [ + ("Shanghai_AI_Laboratory/animatediff", "mm_sd_v15_v2.ckpt", "models/AnimateDiff"), + ], + "AnimateDiff_xl_beta": [ + ("Shanghai_AI_Laboratory/animatediff", "mm_sdxl_v10_beta.ckpt", "models/AnimateDiff"), + ], + # RIFE + "RIFE": [ + ("Damo_XR_Lab/cv_rife_video-frame-interpolation", "flownet.pkl", "models/RIFE"), + ], + # Beautiful Prompt + "BeautifulPrompt": [ + ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"), + ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "generation_config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"), + ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "model.safetensors", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"), + ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "special_tokens_map.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"), + ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "tokenizer.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"), + ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "tokenizer_config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"), + ], + # Translator + "opus-mt-zh-en": [ + ("moxying/opus-mt-zh-en", "config.json", "models/translator/opus-mt-zh-en"), + ("moxying/opus-mt-zh-en", "generation_config.json", "models/translator/opus-mt-zh-en"), + ("moxying/opus-mt-zh-en", "metadata.json", "models/translator/opus-mt-zh-en"), + ("moxying/opus-mt-zh-en", "pytorch_model.bin", "models/translator/opus-mt-zh-en"), + ("moxying/opus-mt-zh-en", "source.spm", 
"models/translator/opus-mt-zh-en"), + ("moxying/opus-mt-zh-en", "target.spm", "models/translator/opus-mt-zh-en"), + ("moxying/opus-mt-zh-en", "tokenizer_config.json", "models/translator/opus-mt-zh-en"), + ("moxying/opus-mt-zh-en", "vocab.json", "models/translator/opus-mt-zh-en"), + ], + # IP-Adapter + "IP-Adapter-SD": [ + ("AI-ModelScope/IP-Adapter", "models/image_encoder/model.safetensors", "models/IpAdapter/stable_diffusion/image_encoder"), + ("AI-ModelScope/IP-Adapter", "models/ip-adapter_sd15.bin", "models/IpAdapter/stable_diffusion"), + ], + "IP-Adapter-SDXL": [ + ("AI-ModelScope/IP-Adapter", "sdxl_models/image_encoder/model.safetensors", "models/IpAdapter/stable_diffusion_xl/image_encoder"), + ("AI-ModelScope/IP-Adapter", "sdxl_models/ip-adapter_sdxl.bin", "models/IpAdapter/stable_diffusion_xl"), + ], } Preset_model_id: TypeAlias = Literal[ "HunyuanDiT", "stable-video-diffusion-img2vid-xt", - "ExVideo-SVD-128f-v1" + "ExVideo-SVD-128f-v1", + "StableDiffusion_v15", + "DreamShaper_8", + "AingDiffusion_v12", + "Flat2DAnimerge_v45Sharp", + "TextualInversion_VeryBadImageNegative_v1.3", + "StableDiffusionXL_v1", + "BluePencilXL_v200", + "StableDiffusionXL_Turbo", + "ControlNet_v11f1p_sd15_depth", + "ControlNet_v11p_sd15_softedge", + "ControlNet_v11f1e_sd15_tile", + "ControlNet_v11p_sd15_lineart", + "AnimateDiff_v2", + "AnimateDiff_xl_beta", + "RIFE", + "BeautifulPrompt", + "opus-mt-zh-en", + "IP-Adapter-SD", + "IP-Adapter-SDXL", ] Preset_model_website: TypeAlias = Literal[ "HuggingFace", @@ -80,6 +186,26 @@ website_to_download_fn = { } +def download_models( + model_id_list: List[Preset_model_id] = [], + downloading_priority: List[Preset_model_website] = ["ModelScope", "HuggingFace"], +): + downloaded_files = [] + for model_id in model_id_list: + for website in downloading_priority: + if model_id in website_to_preset_models[website]: + for model_id, origin_file_path, local_dir in website_to_preset_models[website][model_id]: + # Check if the file is downloaded. 
+ file_to_download = os.path.join(local_dir, os.path.basename(origin_file_path)) + if file_to_download in downloaded_files: + continue + # Download + website_to_download_fn[website](model_id, origin_file_path, local_dir) + if os.path.basename(origin_file_path) in os.listdir(local_dir): + downloaded_files.append(file_to_download) + return downloaded_files + + class ModelManager: def __init__( self, @@ -94,28 +220,19 @@ class ModelManager: self.model = {} self.model_path = {} self.textual_inversion_dict = {} - downloaded_files = self.download_models(model_id_list, downloading_priority) + downloaded_files = download_models(model_id_list, downloading_priority) self.load_models(downloaded_files + file_path_list) - def download_models( + def load_model_from_origin( self, - model_id_list: List[Preset_model_id] = [], - downloading_priority: List[Preset_model_website] = ["ModelScope", "HuggingFace"], + download_from: Preset_model_website = "ModelScope", + model_id = "", + origin_file_path = "", + local_dir = "" ): - downloaded_files = [] - for model_id in model_id_list: - for website in downloading_priority: - if model_id in website_to_preset_models[website]: - for model_id, origin_file_path, local_dir in website_to_preset_models[website][model_id]: - # Check if the file is downloaded. 
- file_to_download = os.path.join(local_dir, os.path.basename(origin_file_path)) - if file_to_download in downloaded_files: - continue - # Download - website_to_download_fn[website](model_id, origin_file_path, local_dir) - if os.path.basename(origin_file_path) in os.listdir(local_dir): - downloaded_files.append(file_to_download) - return downloaded_files + website_to_download_fn[download_from](model_id, origin_file_path, local_dir) + file_to_download = os.path.join(local_dir, os.path.basename(origin_file_path)) + self.load_model(file_to_download) def is_stable_video_diffusion(self, state_dict): param_name = "model.diffusion_model.output_blocks.9.1.time_stack.0.norm_in.weight" @@ -158,7 +275,7 @@ class ModelManager: def is_translator(self, state_dict): param_name = "model.encoder.layers.5.self_attn_layer_norm.weight" - return param_name in state_dict and len(state_dict) == 254 + return param_name in state_dict and len(state_dict) == 258 def is_ipadapter(self, state_dict): return "image_proj" in state_dict and "ip_adapter" in state_dict and state_dict["image_proj"]["proj.weight"].shape == torch.Size([3072, 1024]) diff --git a/diffsynth/models/sd_ipadapter.py b/diffsynth/models/sd_ipadapter.py index 723d53d..e1554f2 100644 --- a/diffsynth/models/sd_ipadapter.py +++ b/diffsynth/models/sd_ipadapter.py @@ -29,7 +29,7 @@ class SDIpAdapter(torch.nn.Module): def set_less_adapter(self): # IP-Adapter for SD v1.5 doesn't support this feature. 
- self.set_full_adapter(self) + self.set_full_adapter() def forward(self, hidden_states, scale=1.0): hidden_states = self.image_proj(hidden_states) diff --git a/diffsynth/pipelines/stable_diffusion_xl.py b/diffsynth/pipelines/stable_diffusion_xl.py index cf43fb3..aa4d73e 100644 --- a/diffsynth/pipelines/stable_diffusion_xl.py +++ b/diffsynth/pipelines/stable_diffusion_xl.py @@ -87,6 +87,7 @@ class SDXLImagePipeline(torch.nn.Module): input_image=None, ipadapter_images=None, ipadapter_scale=1.0, + ipadapter_use_instant_style=False, controlnet_image=None, denoising_strength=1.0, height=1024, @@ -134,6 +135,10 @@ class SDXLImagePipeline(torch.nn.Module): # IP-Adapter if ipadapter_images is not None: + if ipadapter_use_instant_style: + self.ipadapter.set_less_adapter() + else: + self.ipadapter.set_full_adapter() ipadapter_image_encoding = self.ipadapter_image_encoder(ipadapter_images) ipadapter_kwargs_list_posi = self.ipadapter(ipadapter_image_encoding, scale=ipadapter_scale) ipadapter_kwargs_list_nega = self.ipadapter(torch.zeros_like(ipadapter_image_encoding)) diff --git a/diffsynth/prompts/sdxl_prompter.py b/diffsynth/prompts/sdxl_prompter.py index e313db2..9c0a927 100644 --- a/diffsynth/prompts/sdxl_prompter.py +++ b/diffsynth/prompts/sdxl_prompter.py @@ -41,6 +41,10 @@ class SDXLPrompter(Prompter): add_text_embeds, prompt_emb_2 = text_encoder_2(input_ids_2, clip_skip=clip_skip_2) # Merge + if prompt_emb_1.shape[0] != prompt_emb_2.shape[0]: + max_batch_size = min(prompt_emb_1.shape[0], prompt_emb_2.shape[0]) + prompt_emb_1 = prompt_emb_1[: max_batch_size] + prompt_emb_2 = prompt_emb_2[: max_batch_size] prompt_emb = torch.concatenate([prompt_emb_1, prompt_emb_2], dim=-1) # For very long prompt, we only use the first 77 tokens to compute `add_text_embeds`. 
diff --git a/examples/Diffutoon/diffutoon_toon_shading.py b/examples/Diffutoon/diffutoon_toon_shading.py index 60e7733..2406a80 100644 --- a/examples/Diffutoon/diffutoon_toon_shading.py +++ b/examples/Diffutoon/diffutoon_toon_shading.py @@ -1,7 +1,7 @@ -from diffsynth import SDVideoPipelineRunner +from diffsynth import SDVideoPipelineRunner, download_models -# Download models +# Download models (automatically) # `models/stable_diffusion/aingdiffusion_v12.safetensors`: [link](https://civitai.com/api/download/models/229575) # `models/AnimateDiff/mm_sd_v15_v2.ckpt`: [link](https://huggingface.co/guoyww/animatediff/resolve/main/mm_sd_v15_v2.ckpt) # `models/ControlNet/control_v11p_sd15_lineart.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11p_sd15_lineart.pth) @@ -9,7 +9,13 @@ from diffsynth import SDVideoPipelineRunner # `models/Annotators/sk_model.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model.pth) # `models/Annotators/sk_model2.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model2.pth) # `models/textual_inversion/verybadimagenegative_v1.3.pt`: [link](https://civitai.com/api/download/models/25820?type=Model&format=PickleTensor&size=full&fp=fp16) - +download_models([ + "AingDiffusion_v12", + "AnimateDiff_v2", + "ControlNet_v11p_sd15_lineart", + "ControlNet_v11f1e_sd15_tile", + "TextualInversion_VeryBadImageNegative_v1.3" +]) # The original video in the example is https://www.bilibili.com/video/BV1iG411a7sQ/. 
config = { @@ -63,7 +69,7 @@ config = { "end_frame_id": 30 } ], - "output_folder": "data/examples/diffutoon/output", + "output_folder": "output", "fps": 30 }, "pipeline": { diff --git a/examples/Diffutoon/diffutoon_toon_shading_with_editing_signals.py b/examples/Diffutoon/diffutoon_toon_shading_with_editing_signals.py index 428867e..d07236c 100644 --- a/examples/Diffutoon/diffutoon_toon_shading_with_editing_signals.py +++ b/examples/Diffutoon/diffutoon_toon_shading_with_editing_signals.py @@ -1,8 +1,8 @@ -from diffsynth import SDVideoPipelineRunner +from diffsynth import SDVideoPipelineRunner, download_models import os -# Download models +# Download models (automatically) # `models/stable_diffusion/aingdiffusion_v12.safetensors`: [link](https://civitai.com/api/download/models/229575) # `models/AnimateDiff/mm_sd_v15_v2.ckpt`: [link](https://huggingface.co/guoyww/animatediff/resolve/main/mm_sd_v15_v2.ckpt) # `models/ControlNet/control_v11p_sd15_lineart.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11p_sd15_lineart.pth) @@ -14,7 +14,15 @@ import os # `models/Annotators/sk_model.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model.pth) # `models/Annotators/sk_model2.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model2.pth) # `models/textual_inversion/verybadimagenegative_v1.3.pt`: [link](https://civitai.com/api/download/models/25820?type=Model&format=PickleTensor&size=full&fp=fp16) - +download_models([ + "AingDiffusion_v12", + "AnimateDiff_v2", + "ControlNet_v11p_sd15_lineart", + "ControlNet_v11f1e_sd15_tile", + "ControlNet_v11f1p_sd15_depth", + "ControlNet_v11p_sd15_softedge", + "TextualInversion_VeryBadImageNegative_v1.3" +]) # The original video in the example is https://www.bilibili.com/video/BV1zu4y1s7Ec/. 
config_stage_1 = { @@ -67,7 +75,7 @@ config_stage_1 = { "end_frame_id": 30 } ], - "output_folder": "data/examples/diffutoon_edit/color_video", + "output_folder": "output/color_video", "fps": 25 }, "smoother_configs": [ @@ -153,7 +161,7 @@ config_stage_2 = { "end_frame_id": 30 } ], - "output_folder": "data/examples/diffutoon_edit/output", + "output_folder": "output/edited_video", "fps": 30 }, "pipeline": { diff --git a/examples/Diffutoon/sd_toon_shading.py b/examples/Diffutoon/sd_toon_shading.py index 8aadff3..1e2169f 100644 --- a/examples/Diffutoon/sd_toon_shading.py +++ b/examples/Diffutoon/sd_toon_shading.py @@ -1,9 +1,8 @@ -from diffsynth import ModelManager, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video, save_frames -from diffsynth.extensions.RIFE import RIFESmoother +from diffsynth import ModelManager, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video, download_models import torch -# Download models +# Download models (automatically) # `models/stable_diffusion/flat2DAnimerge_v45Sharp.safetensors`: [link](https://civitai.com/api/download/models/266360?type=Model&format=SafeTensor&size=pruned&fp=fp16) # `models/AnimateDiff/mm_sd_v15_v2.ckpt`: [link](https://huggingface.co/guoyww/animatediff/resolve/main/mm_sd_v15_v2.ckpt) # `models/ControlNet/control_v11p_sd15_lineart.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11p_sd15_lineart.pth) @@ -11,8 +10,13 @@ import torch # `models/Annotators/sk_model.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model.pth) # `models/Annotators/sk_model2.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model2.pth) # `models/textual_inversion/verybadimagenegative_v1.3.pt`: [link](https://civitai.com/api/download/models/25820?type=Model&format=PickleTensor&size=full&fp=fp16) -# `models/RIFE/flownet.pkl`: [link](https://drive.google.com/file/d/1APIzVeI-4ZZCEuIRE1m6WYfSCaOsi_7_/view?usp=sharing) - +download_models([ + 
"Flat2DAnimerge_v45Sharp", + "AnimateDiff_v2", + "ControlNet_v11p_sd15_lineart", + "ControlNet_v11f1e_sd15_tile", + "TextualInversion_VeryBadImageNegative_v1.3" +]) # Load models model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") @@ -22,7 +26,6 @@ model_manager.load_models([ "models/AnimateDiff/mm_sd_v15_v2.ckpt", "models/ControlNet/control_v11p_sd15_lineart.pth", "models/ControlNet/control_v11f1e_sd15_tile.pth", - "models/RIFE/flownet.pkl" ]) pipe = SDVideoPipeline.from_model_manager( model_manager, @@ -39,12 +42,11 @@ pipe = SDVideoPipeline.from_model_manager( ) ] ) -smoother = RIFESmoother.from_model_manager(model_manager) # Load video (we only use 60 frames for quick testing) # The original video is here: https://www.bilibili.com/video/BV19w411A7YJ/ video = VideoData( - video_file="data/bilibili_videos/៸៸᳐_⩊_៸៸᳐ 66 微笑调查队🌻/៸៸᳐_⩊_៸៸᳐ 66 微笑调查队🌻 - 1.66 微笑调查队🌻(Av278681824,P1).mp4", + video_file="data/examples/bilibili/BV19w411A7YJ.mp4", height=1024, width=1024) input_video = [video[i] for i in range(40*60, 41*60)] @@ -59,7 +61,6 @@ output_video = pipe( animatediff_batch_size=32, animatediff_stride=16, vram_limit_level=0, ) -output_video = smoother(output_video) # Save video save_video(output_video, "output_video.mp4", fps=60) diff --git a/examples/ExVideo/ExVideo_svd_test.py b/examples/ExVideo/ExVideo_svd_test.py index 0f5e797..29a1910 100644 --- a/examples/ExVideo/ExVideo_svd_test.py +++ b/examples/ExVideo/ExVideo_svd_test.py @@ -1,4 +1,4 @@ -from diffsynth import save_video, ModelManager, SVDVideoPipeline, HunyuanDiTImagePipeline +from diffsynth import save_video, ModelManager, SVDVideoPipeline, HunyuanDiTImagePipeline, download_models from diffsynth import ModelManager import torch, os @@ -31,7 +31,14 @@ import torch, os def generate_image(): # Load models os.environ["TOKENIZERS_PARALLELISM"] = "True" - model_manager = ModelManager(torch_dtype=torch.float16, device="cuda", model_id_list=["HunyuanDiT"]) + download_models(["HunyuanDiT"]) + 
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda", + file_path_list=[ + "models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin", + "models/HunyuanDiT/t2i/mt5/pytorch_model.bin", + "models/HunyuanDiT/t2i/model/pytorch_model_ema.pt", + "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin", + ]) pipe = HunyuanDiTImagePipeline.from_model_manager(model_manager) # Generate an image @@ -47,7 +54,12 @@ def generate_image(): def generate_video(image): # Load models - model_manager = ModelManager(torch_dtype=torch.float16, device="cuda", model_id_list=["stable-video-diffusion-img2vid-xt", "ExVideo-SVD-128f-v1"]) + download_models(["stable-video-diffusion-img2vid-xt", "ExVideo-SVD-128f-v1"]) + model_manager = ModelManager(torch_dtype=torch.float16, device="cuda", + file_path_list=[ + "models/stable_video_diffusion/svd_xt.safetensors", + "models/stable_video_diffusion/model.fp16.safetensors", + ]) pipe = SVDVideoPipeline.from_model_manager(model_manager) # Generate a video @@ -65,7 +77,12 @@ def generate_video(image): def upscale_video(image, video): # Load models - model_manager = ModelManager(torch_dtype=torch.float16, device="cuda", model_id_list=["stable-video-diffusion-img2vid-xt", "ExVideo-SVD-128f-v1"]) + download_models(["stable-video-diffusion-img2vid-xt", "ExVideo-SVD-128f-v1"]) + model_manager = ModelManager(torch_dtype=torch.float16, device="cuda", + file_path_list=[ + "models/stable_video_diffusion/svd_xt.safetensors", + "models/stable_video_diffusion/model.fp16.safetensors", + ]) pipe = SVDVideoPipeline.from_model_manager(model_manager) # Generate a video diff --git a/examples/Ip-Adapter/README.md b/examples/Ip-Adapter/README.md index 2e2ca1d..86f9dab 100644 --- a/examples/Ip-Adapter/README.md +++ b/examples/Ip-Adapter/README.md @@ -1,3 +1,44 @@ # IP-Adapter -The features of IP-Adapter in DiffSynth Studio is not completed. Please wait for us. 
+IP-Adapter is an interesting model, which can adopt the content or style of another image to generate a new image. + +## Example: Content Controlling in Stable Diffusion + +Based on Stable Diffusion, we can transfer the object to another scene. See [`sd_ipadapter.py`](./sd_ipadapter.py). + +|First, we generate a car. The prompt is "masterpiece, best quality, a car".|Next, utilizing IP-Adapter, we move the car to the road. The prompt is "masterpiece, best quality, a car running on the road".| +|-|-| +|![car](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/8530a2f0-f610-4269-a22c-ac6c2f21fc18)|![car_on_the_road](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/b8ccddb2-c423-46d8-bd1a-327fcc074a36)| + +## Example: Content and Style Controlling in Stable Diffusion XL + +The IP-Adapter model based on Stable Diffusion XL is more powerful. You have the option to use the content or style. See [`sdxl_ipadapter.py`](./sdxl_ipadapter.py). + +* Content controlling (original usage of IP-Adapter) + +|First, we generate a rabbit.|Next, enable IP-Adapter and let the rabbit jump.|For comparison, disable IP-Adapter to see the generated image.| +|-|-|-| +|![rabbit](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/4b452634-ec57-414f-897a-f8c50c74a650)|![rabbit_to_jumping_rabbit](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/b93c5495-0b77-4d97-bcd3-3942858288f2)|![rabbit_to_jumping_rabbit_without_ipa](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/52f37195-65b3-4a38-8d9b-73df37311c15)| + + +* Style controlling (InstantStyle) + +|First, we generate a rabbit.|Next, enable InstantStyle and convert the rabbit to a cat.|For comparison, disable IP-Adapter to see the generated image.| +|-|-|-| 
+|![rabbit](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/4b452634-ec57-414f-897a-f8c50c74a650)|![rabbit_to_cat](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/a006b281-f643-4ea9-b0da-712289c96059)|![rabbit_to_cat_without_ipa](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/189bd11e-7a10-4c09-8554-0eebde9150fd)| + +## Example: Image Fusing (Experimental) + +Since IP-Adapter can control the content based on more than one image, we can do something interesting. See [`sdxl_ipadapter_multi_reference.py`](sdxl_ipadapter_multi_reference.py). + +We have two pokemons here: + +|Charizard|Pikachu| +|-|-| +|![](https://media.52poke.com/wiki/7/7e/006Charizard.png)|![](https://media.52poke.com/wiki/0/0d/025Pikachu.png)| + +Fuse! + +|Pikazard ???| +|-| +|![Pikazard](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/807cdb31-94f5-4cc2-a978-3c6a7ffedc5b)| diff --git a/examples/Ip-Adapter/sd_ipadapter.py b/examples/Ip-Adapter/sd_ipadapter.py new file mode 100644 index 0000000..ac18d67 --- /dev/null +++ b/examples/Ip-Adapter/sd_ipadapter.py @@ -0,0 +1,38 @@ +from diffsynth import ModelManager, SDImagePipeline, download_models +import torch + + +# Download models (automatically) +# `models/stable_diffusion/dreamshaper_8.safetensors`: [link](https://civitai.com/api/download/models/128713?type=Model&format=SafeTensor&size=pruned&fp=fp16) +# `models/IpAdapter/stable_diffusion/image_encoder/model.safetensors`: [link](https://huggingface.co/h94/IP-Adapter/resolve/main/models/image_encoder/model.safetensors) +# `models/IpAdapter/stable_diffusion/ip-adapter_sd15.bin`: [link](https://huggingface.co/h94/IP-Adapter/resolve/main/models/ip-adapter_sd15.bin) +# `models/textual_inversion/verybadimagenegative_v1.3.pt`: [link](https://civitai.com/api/download/models/25820?type=Model&format=PickleTensor&size=full&fp=fp16) +download_models(["DreamShaper_8", "IP-Adapter-SD", "TextualInversion_VeryBadImageNegative_v1.3"]) + +# Load models 
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") +model_manager.load_textual_inversions("models/textual_inversion") +model_manager.load_models([ + "models/stable_diffusion/aingdiffusion_v12.safetensors", + "models/IpAdapter/stable_diffusion/image_encoder/model.safetensors", + "models/IpAdapter/stable_diffusion/ip-adapter_sd15.bin" +]) +pipe = SDImagePipeline.from_model_manager(model_manager) + +torch.manual_seed(1) +style_image = pipe( + prompt="masterpiece, best quality, a car", + negative_prompt="verybadimagenegative_v1.3", + cfg_scale=7, clip_skip=2, + height=512, width=512, num_inference_steps=50, +) +style_image.save("car.jpg") + +image = pipe( + prompt="masterpiece, best quality, a car running on the road", + negative_prompt="verybadimagenegative_v1.3", + cfg_scale=7, clip_skip=2, + height=512, width=512, num_inference_steps=50, + ipadapter_images=[style_image], ipadapter_scale=1.0 +) +image.save("car_on_the_road.jpg") diff --git a/examples/Ip-Adapter/sdxl_ipadapter.py b/examples/Ip-Adapter/sdxl_ipadapter.py index 706bef0..29f2da8 100644 --- a/examples/Ip-Adapter/sdxl_ipadapter.py +++ b/examples/Ip-Adapter/sdxl_ipadapter.py @@ -1,36 +1,61 @@ -from diffsynth import ModelManager, SDXLImagePipeline +from diffsynth import ModelManager, SDXLImagePipeline, download_models import torch -# Download models +# Download models (automatically) # `models/stable_diffusion_xl/sd_xl_base_1.0.safetensors`: [link](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors) -# `models/IpAdapter/image_encoder/model.safetensors`: [link](https://huggingface.co/h94/IP-Adapter/resolve/main/sdxl_models/image_encoder/model.safetensors) -# `models/IpAdapter/ip-adapter_sdxl.bin`: [link](https://huggingface.co/h94/IP-Adapter/resolve/main/sdxl_models/ip-adapter_sdxl.safetensors) +# `models/IpAdapter/stable_diffusion_xl/image_encoder/model.safetensors`: 
[link](https://huggingface.co/h94/IP-Adapter/resolve/main/sdxl_models/image_encoder/model.safetensors) +# `models/IpAdapter/stable_diffusion_xl/ip-adapter_sdxl.bin`: [link](https://huggingface.co/h94/IP-Adapter/resolve/main/sdxl_models/ip-adapter_sdxl.safetensors) +download_models(["StableDiffusionXL_v1", "IP-Adapter-SDXL"]) # Load models model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") model_manager.load_models([ "models/stable_diffusion_xl/sd_xl_base_1.0.safetensors", - "models/IpAdapter/image_encoder/model.safetensors", - "models/IpAdapter/ip-adapter_sdxl.bin" + "models/IpAdapter/stable_diffusion_xl/image_encoder/model.safetensors", + "models/IpAdapter/stable_diffusion_xl/ip-adapter_sdxl.bin" ]) pipe = SDXLImagePipeline.from_model_manager(model_manager) -pipe.ipadapter.set_less_adapter() -torch.manual_seed(0) +torch.manual_seed(123456) style_image = pipe( - prompt="Starry Night, blue sky, by van Gogh", - negative_prompt="dark, gray", + prompt="a rabbit in a garden, colorful flowers", + negative_prompt="anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured", cfg_scale=5, - height=1024, width=1024, num_inference_steps=30, + height=1024, width=1024, num_inference_steps=50, ) -style_image.save("style_image.jpg") +style_image.save("rabbit.jpg") image = pipe( prompt="a cat", negative_prompt="", cfg_scale=5, - height=1024, width=1024, num_inference_steps=30, - ipadapter_images=[style_image] + height=1024, width=1024, num_inference_steps=50, + ipadapter_images=[style_image], ipadapter_use_instant_style=True ) -image.save("transferred_image.jpg") +image.save("rabbit_to_cat.jpg") + +image = pipe( + prompt="a rabbit is jumping", + negative_prompt="", + cfg_scale=5, + height=1024, width=1024, num_inference_steps=50, + ipadapter_images=[style_image], ipadapter_use_instant_style=False, ipadapter_scale=0.5 +) +image.save("rabbit_to_jumping_rabbit.jpg") + +image = pipe( + prompt="a cat", + 
negative_prompt="", + cfg_scale=5, + height=1024, width=1024, num_inference_steps=50, +) +image.save("rabbit_to_cat_without_ipa.jpg") + +image = pipe( + prompt="a rabbit is jumping", + negative_prompt="", + cfg_scale=5, + height=1024, width=1024, num_inference_steps=50, +) +image.save("rabbit_to_jumping_rabbit_without_ipa.jpg") \ No newline at end of file diff --git a/examples/Ip-Adapter/sdxl_ipadapter_multi_reference.py b/examples/Ip-Adapter/sdxl_ipadapter_multi_reference.py new file mode 100644 index 0000000..4094a70 --- /dev/null +++ b/examples/Ip-Adapter/sdxl_ipadapter_multi_reference.py @@ -0,0 +1,34 @@ +from diffsynth import ModelManager, SDXLImagePipeline, download_models +import torch, requests +from PIL import Image + + +# Download models (automatically) +# `models/stable_diffusion_xl/bluePencilXL_v200.safetensors`: [link](https://civitai.com/api/download/models/245614?type=Model&format=SafeTensor&size=pruned&fp=fp16) +# `models/IpAdapter/stable_diffusion_xl/image_encoder/model.safetensors`: [link](https://huggingface.co/h94/IP-Adapter/resolve/main/sdxl_models/image_encoder/model.safetensors) +# `models/IpAdapter/stable_diffusion_xl/ip-adapter_sdxl.bin`: [link](https://huggingface.co/h94/IP-Adapter/resolve/main/sdxl_models/ip-adapter_sdxl.safetensors) +download_models(["BluePencilXL_v200", "IP-Adapter-SDXL"]) + +# Load models +model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") +model_manager.load_models([ + "models/stable_diffusion_xl/bluePencilXL_v200.safetensors", + "models/IpAdapter/stable_diffusion_xl/image_encoder/model.safetensors", + "models/IpAdapter/stable_diffusion_xl/ip-adapter_sdxl.bin" +]) +pipe = SDXLImagePipeline.from_model_manager(model_manager) + +image_1 = Image.open(requests.get("https://media.52poke.com/wiki/7/7e/006Charizard.png", stream=True).raw).convert("RGB").resize((1024, 1024)) +image_1.save("Charizard.jpg") +image_2 = Image.open(requests.get("https://media.52poke.com/wiki/0/0d/025Pikachu.png", 
stream=True).raw).convert("RGB").resize((1024, 1024)) +image_2.save("Pikachu.jpg") + +torch.manual_seed(0) +image = pipe( + prompt="a pokemon, maybe Charizard, maybe Pikachu", + negative_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry", + cfg_scale=5, + height=1024, width=1024, num_inference_steps=50, + ipadapter_images=[image_1, image_2], ipadapter_use_instant_style=False, ipadapter_scale=0.7 +) +image.save(f"Pikazard.jpg") diff --git a/examples/diffsynth/sd_video_rerender.py b/examples/diffsynth/sd_video_rerender.py index 0e82442..dc00954 100644 --- a/examples/diffsynth/sd_video_rerender.py +++ b/examples/diffsynth/sd_video_rerender.py @@ -1,24 +1,31 @@ -from diffsynth import ModelManager, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video +from diffsynth import ModelManager, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video, download_models from diffsynth.processors.FastBlend import FastBlendSmoother from diffsynth.processors.PILEditor import ContrastEditor, SharpnessEditor from diffsynth.processors.sequencial_processor import SequencialProcessor import torch -# Download models +# Download models (automatically) # `models/stable_diffusion/dreamshaper_8.safetensors`: [link](https://civitai.com/api/download/models/128713?type=Model&format=SafeTensor&size=pruned&fp=fp16) # `models/ControlNet/control_v11f1p_sd15_depth.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11f1p_sd15_depth.pth) # `models/ControlNet/control_v11p_sd15_softedge.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11p_sd15_softedge.pth) # `models/Annotators/dpt_hybrid-midas-501f0c75.pt`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/dpt_hybrid-midas-501f0c75.pt) # `models/Annotators/ControlNetHED.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/ControlNetHED.pth) +download_models([ + 
"ControlNet_v11f1p_sd15_depth", + "ControlNet_v11p_sd15_softedge", + "DreamShaper_8" +]) # Load models -model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") -model_manager.load_models([ - "models/stable_diffusion/dreamshaper_8.safetensors", - "models/ControlNet/control_v11f1p_sd15_depth.pth", - "models/ControlNet/control_v11p_sd15_softedge.pth" -]) +model_manager = ModelManager( + torch_dtype=torch.float16, device="cuda", + file_path_list=[ + "models/stable_diffusion/dreamshaper_8.safetensors", + "models/ControlNet/control_v11f1p_sd15_depth.pth", + "models/ControlNet/control_v11p_sd15_softedge.pth", + ] +) pipe = SDVideoPipeline.from_model_manager( model_manager, [ @@ -38,7 +45,7 @@ smoother = SequencialProcessor([FastBlendSmoother(), ContrastEditor(rate=1.1), S # Load video # Original video: https://pixabay.com/videos/flow-rocks-water-fluent-stones-159627/ -video = VideoData(video_file="data/pixabay100/159627 (1080p).mp4", height=512, width=768) +video = VideoData(video_file="data/examples/pixabay100/159627 (1080p).mp4", height=512, width=768) input_video = [video[i] for i in range(128)] # Rerender diff --git a/examples/hunyuan_dit/README.md b/examples/hunyuan_dit/README.md index a68ba47..b5b41ba 100644 --- a/examples/hunyuan_dit/README.md +++ b/examples/hunyuan_dit/README.md @@ -20,6 +20,14 @@ models/HunyuanDiT/ └── diffusion_pytorch_model.bin ``` +You can use the following code to download these files: + +```python +from diffsynth import download_models + +download_models(["HunyuanDiT"]) +``` + ## Inference ### Text-to-image with highres-fix diff --git a/examples/image_synthesis/sd_prompt_refining.py b/examples/image_synthesis/sd_prompt_refining.py index bd76804..7d9bae6 100644 --- a/examples/image_synthesis/sd_prompt_refining.py +++ b/examples/image_synthesis/sd_prompt_refining.py @@ -1,16 +1,15 @@ -from diffsynth import ModelManager, SDXLImagePipeline +from diffsynth import ModelManager, SDXLImagePipeline, download_models import torch -# 
Download models +# Download models (automatically) # `models/stable_diffusion_xl/sd_xl_base_1.0.safetensors`: [link](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors) # `models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd/`: [link](https://huggingface.co/alibaba-pai/pai-bloom-1b1-text2prompt-sd) # `models/translator/opus-mt-zh-en/`: [link](https://huggingface.co/Helsinki-NLP/opus-mt-en-zh) - +download_models(["StableDiffusionXL_v1", "BeautifulPrompt", "opus-mt-zh-en"]) # Load models model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") -model_manager.load_textual_inversions("models/textual_inversion") model_manager.load_models([ "models/stable_diffusion_xl/sd_xl_base_1.0.safetensors", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd/model.safetensors", diff --git a/examples/image_synthesis/sd_text_to_image.py b/examples/image_synthesis/sd_text_to_image.py index 5f32bcd..4d553b2 100644 --- a/examples/image_synthesis/sd_text_to_image.py +++ b/examples/image_synthesis/sd_text_to_image.py @@ -1,23 +1,23 @@ -from diffsynth import ModelManager, SDImagePipeline, ControlNetConfigUnit +from diffsynth import ModelManager, SDImagePipeline, ControlNetConfigUnit, download_models import torch -# Download models +# Download models (automatically) # `models/stable_diffusion/aingdiffusion_v12.safetensors`: [link](https://civitai.com/api/download/models/229575?type=Model&format=SafeTensor&size=full&fp=fp16) # `models/ControlNet/control_v11p_sd15_lineart.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11p_sd15_lineart.pth) # `models/ControlNet/control_v11f1e_sd15_tile.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11f1e_sd15_tile.pth) # `models/Annotators/sk_model.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model.pth) # `models/Annotators/sk_model2.pth`: 
[link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model2.pth) +download_models(["AingDiffusion_v12", "ControlNet_v11p_sd15_lineart", "ControlNet_v11f1e_sd15_tile"]) # Load models -model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") -model_manager.load_textual_inversions("models/textual_inversion") -model_manager.load_models([ - "models/stable_diffusion/aingdiffusion_v12.safetensors", - "models/ControlNet/control_v11f1e_sd15_tile.pth", - "models/ControlNet/control_v11p_sd15_lineart.pth" -]) +model_manager = ModelManager(torch_dtype=torch.float16, device="cuda", + file_path_list=[ + "models/stable_diffusion/aingdiffusion_v12.safetensors", + "models/ControlNet/control_v11f1e_sd15_tile.pth", + "models/ControlNet/control_v11p_sd15_lineart.pth" + ]) pipe = SDImagePipeline.from_model_manager( model_manager, [ diff --git a/examples/image_synthesis/sdxl_text_to_image.py b/examples/image_synthesis/sdxl_text_to_image.py index 3d3e362..fcae3ab 100644 --- a/examples/image_synthesis/sdxl_text_to_image.py +++ b/examples/image_synthesis/sdxl_text_to_image.py @@ -1,10 +1,10 @@ -from diffsynth import ModelManager, SDXLImagePipeline +from diffsynth import ModelManager, SDXLImagePipeline, download_models import torch -# Download models +# Download models (automatically) # `models/stable_diffusion_xl/bluePencilXL_v200.safetensors`: [link](https://civitai.com/api/download/models/245614?type=Model&format=SafeTensor&size=pruned&fp=fp16) - +download_models(["BluePencilXL_v200"]) # Load models model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") diff --git a/examples/image_synthesis/sdxl_turbo.py b/examples/image_synthesis/sdxl_turbo.py index 8d40512..c39fb08 100644 --- a/examples/image_synthesis/sdxl_turbo.py +++ b/examples/image_synthesis/sdxl_turbo.py @@ -1,10 +1,10 @@ -from diffsynth import ModelManager, SDXLImagePipeline +from diffsynth import ModelManager, SDXLImagePipeline, download_models import torch -# Download models +# 
Download models (automatically) # `models/stable_diffusion_xl_turbo/sd_xl_turbo_1.0_fp16.safetensors`: [link](https://huggingface.co/stabilityai/sdxl-turbo/resolve/main/sd_xl_turbo_1.0_fp16.safetensors) - +download_models(["StableDiffusionXL_Turbo"]) # Load models model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") diff --git a/examples/video_synthesis/sd_text_to_video.py b/examples/video_synthesis/sd_text_to_video.py index a92d71a..55446e5 100644 --- a/examples/video_synthesis/sd_text_to_video.py +++ b/examples/video_synthesis/sd_text_to_video.py @@ -1,13 +1,13 @@ -from diffsynth import ModelManager, SDImagePipeline, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video, save_frames +from diffsynth import ModelManager, SDImagePipeline, SDVideoPipeline, save_video, download_models from diffsynth.extensions.RIFE import RIFEInterpolater import torch -# Download models +# Download models (automatically) # `models/stable_diffusion/dreamshaper_8.safetensors`: [link](https://civitai.com/api/download/models/128713?type=Model&format=SafeTensor&size=pruned&fp=fp16) # `models/AnimateDiff/mm_sd_v15_v2.ckpt`: [link](https://huggingface.co/guoyww/animatediff/resolve/main/mm_sd_v15_v2.ckpt) # `models/RIFE/flownet.pkl`: [link](https://drive.google.com/file/d/1APIzVeI-4ZZCEuIRE1m6WYfSCaOsi_7_/view?usp=sharing) - +download_models(["DreamShaper_8", "AnimateDiff_v2", "RIFE"]) # Load models model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") diff --git a/examples/video_synthesis/sdxl_text_to_video.py b/examples/video_synthesis/sdxl_text_to_video.py index 96fc7a5..b96498a 100644 --- a/examples/video_synthesis/sdxl_text_to_video.py +++ b/examples/video_synthesis/sdxl_text_to_video.py @@ -1,11 +1,11 @@ -from diffsynth import ModelManager, SDXLVideoPipeline, save_video +from diffsynth import ModelManager, SDXLVideoPipeline, save_video, download_models import torch -# Download models +# Download models (automatically) # 
`models/stable_diffusion_xl/sd_xl_base_1.0.safetensors`: [link](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors) # `models/AnimateDiff/mm_sdxl_v10_beta.ckpt`: [link](https://huggingface.co/guoyww/animatediff/resolve/main/mm_sdxl_v10_beta.ckpt) - +download_models(["StableDiffusionXL_v1", "AnimateDiff_xl_beta"]) model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") model_manager.load_models([ @@ -25,4 +25,4 @@ video = pipe( height=1024, width=1024, num_frames=16, num_inference_steps=100, ) -save_video(video, "video.mp4", fps=16) +save_video(video, "output_video.mp4", fps=16) diff --git a/examples/video_synthesis/svd_text_to_video.py b/examples/video_synthesis/svd_text_to_video.py index b432f1f..ee91c92 100644 --- a/examples/video_synthesis/svd_text_to_video.py +++ b/examples/video_synthesis/svd_text_to_video.py @@ -1,12 +1,12 @@ -from diffsynth import save_video, SDXLImagePipeline, ModelManager, SVDVideoPipeline +from diffsynth import save_video, SDXLImagePipeline, ModelManager, SVDVideoPipeline, download_models from diffsynth import ModelManager import torch -# Download models +# Download models (automatically) # `models/stable_diffusion_xl/sd_xl_base_1.0.safetensors`: [link](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors) # `models/stable_video_diffusion/svd_xt.safetensors`: [link](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt/resolve/main/svd_xt.safetensors) - +download_models(["StableDiffusionXL_v1", "stable-video-diffusion-img2vid-xt"]) prompt = "cloud, wind" torch.manual_seed(0) @@ -21,8 +21,7 @@ image = pipe( cfg_scale=6, height=1024, width=1024, num_inference_steps=50, ) -pipe.to("cpu") -torch.cuda.empty_cache() +model_manager.to("cpu") # 2. 
Image-to-video using SVD model_manager = ModelManager() @@ -34,4 +33,4 @@ video = pipe( motion_bucket_id=127, num_inference_steps=50 ) -save_video(video, "video.mp4", fps=15) +save_video(video, "output_video.mp4", fps=15)