update examples and downloaders

2026-03-18 22:08:13 +00:00 · 2024-06-27 19:43:50 +08:00
parent 0af60b9c73
commit 0b1704976a
21 changed files with 409 additions and 100 deletions
--- a/examples/Diffutoon/diffutoon_toon_shading.py
+++ b/examples/Diffutoon/diffutoon_toon_shading.py
@@ -1,7 +1,7 @@
-from diffsynth import SDVideoPipelineRunner
+from diffsynth import SDVideoPipelineRunner, download_models


-# Download models
+# Download models (automatically)
 # `models/stable_diffusion/aingdiffusion_v12.safetensors`: [link](https://civitai.com/api/download/models/229575)
 # `models/AnimateDiff/mm_sd_v15_v2.ckpt`: [link](https://huggingface.co/guoyww/animatediff/resolve/main/mm_sd_v15_v2.ckpt)
 # `models/ControlNet/control_v11p_sd15_lineart.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11p_sd15_lineart.pth)
@@ -9,7 +9,13 @@ from diffsynth import SDVideoPipelineRunner
 # `models/Annotators/sk_model.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model.pth)
 # `models/Annotators/sk_model2.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model2.pth)
 # `models/textual_inversion/verybadimagenegative_v1.3.pt`: [link](https://civitai.com/api/download/models/25820?type=Model&format=PickleTensor&size=full&fp=fp16)
-
+download_models([
+    "AingDiffusion_v12",
+    "AnimateDiff_v2",
+    "ControlNet_v11p_sd15_lineart",
+    "ControlNet_v11f1e_sd15_tile",
+    "TextualInversion_VeryBadImageNegative_v1.3"
+])
 # The original video in the example is https://www.bilibili.com/video/BV1iG411a7sQ/.

 config = {
@@ -63,7 +69,7 @@ config = {
                "end_frame_id": 30
            }
        ],
-        "output_folder": "data/examples/diffutoon/output",
+        "output_folder": "output",
        "fps": 30
    },
    "pipeline": {
--- a/examples/Diffutoon/diffutoon_toon_shading_with_editing_signals.py
+++ b/examples/Diffutoon/diffutoon_toon_shading_with_editing_signals.py
@@ -1,8 +1,8 @@
-from diffsynth import SDVideoPipelineRunner
+from diffsynth import SDVideoPipelineRunner, download_models
 import os


-# Download models
+# Download models (automatically)
 # `models/stable_diffusion/aingdiffusion_v12.safetensors`: [link](https://civitai.com/api/download/models/229575)
 # `models/AnimateDiff/mm_sd_v15_v2.ckpt`: [link](https://huggingface.co/guoyww/animatediff/resolve/main/mm_sd_v15_v2.ckpt)
 # `models/ControlNet/control_v11p_sd15_lineart.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11p_sd15_lineart.pth)
@@ -14,7 +14,15 @@ import os
 # `models/Annotators/sk_model.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model.pth)
 # `models/Annotators/sk_model2.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model2.pth)
 # `models/textual_inversion/verybadimagenegative_v1.3.pt`: [link](https://civitai.com/api/download/models/25820?type=Model&format=PickleTensor&size=full&fp=fp16)
-
+download_models([
+    "AingDiffusion_v12",
+    "AnimateDiff_v2",
+    "ControlNet_v11p_sd15_lineart",
+    "ControlNet_v11f1e_sd15_tile",
+    "ControlNet_v11f1p_sd15_depth",
+    "ControlNet_v11p_sd15_softedge",
+    "TextualInversion_VeryBadImageNegative_v1.3"
+])
 # The original video in the example is https://www.bilibili.com/video/BV1zu4y1s7Ec/.

 config_stage_1 = {
@@ -67,7 +75,7 @@ config_stage_1 = {
                "end_frame_id": 30
            }
        ],
-        "output_folder": "data/examples/diffutoon_edit/color_video",
+        "output_folder": "output/color_video",
        "fps": 25
    },
    "smoother_configs": [
@@ -153,7 +161,7 @@ config_stage_2 = {
                "end_frame_id": 30
            }
        ],
-        "output_folder": "data/examples/diffutoon_edit/output",
+        "output_folder": "output/edited_video",
        "fps": 30
    },
    "pipeline": {
--- a/examples/Diffutoon/sd_toon_shading.py
+++ b/examples/Diffutoon/sd_toon_shading.py
@@ -1,9 +1,8 @@
-from diffsynth import ModelManager, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video, save_frames
-from diffsynth.extensions.RIFE import RIFESmoother
+from diffsynth import ModelManager, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video, download_models
 import torch


-# Download models
+# Download models (automatically)
 # `models/stable_diffusion/flat2DAnimerge_v45Sharp.safetensors`: [link](https://civitai.com/api/download/models/266360?type=Model&format=SafeTensor&size=pruned&fp=fp16)
 # `models/AnimateDiff/mm_sd_v15_v2.ckpt`: [link](https://huggingface.co/guoyww/animatediff/resolve/main/mm_sd_v15_v2.ckpt)
 # `models/ControlNet/control_v11p_sd15_lineart.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11p_sd15_lineart.pth)
@@ -11,8 +10,13 @@ import torch
 # `models/Annotators/sk_model.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model.pth)
 # `models/Annotators/sk_model2.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model2.pth)
 # `models/textual_inversion/verybadimagenegative_v1.3.pt`: [link](https://civitai.com/api/download/models/25820?type=Model&format=PickleTensor&size=full&fp=fp16)
-# `models/RIFE/flownet.pkl`: [link](https://drive.google.com/file/d/1APIzVeI-4ZZCEuIRE1m6WYfSCaOsi_7_/view?usp=sharing)
-
+download_models([
+    "Flat2DAnimerge_v45Sharp",
+    "AnimateDiff_v2",
+    "ControlNet_v11p_sd15_lineart",
+    "ControlNet_v11f1e_sd15_tile",
+    "TextualInversion_VeryBadImageNegative_v1.3"
+])

 # Load models
 model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
@@ -22,7 +26,6 @@ model_manager.load_models([
    "models/AnimateDiff/mm_sd_v15_v2.ckpt",
    "models/ControlNet/control_v11p_sd15_lineart.pth",
    "models/ControlNet/control_v11f1e_sd15_tile.pth",
-    "models/RIFE/flownet.pkl"
 ])
 pipe = SDVideoPipeline.from_model_manager(
    model_manager,
@@ -39,12 +42,11 @@ pipe = SDVideoPipeline.from_model_manager(
        )
    ]
 )
-smoother = RIFESmoother.from_model_manager(model_manager)

 # Load video (we only use 60 frames for quick testing)
 # The original video is here: https://www.bilibili.com/video/BV19w411A7YJ/
 video = VideoData(
-    video_file="data/bilibili_videos/៸៸᳐_⩊_៸៸᳐ 66 微笑调查队🌻/៸៸᳐_⩊_៸៸᳐ 66 微笑调查队🌻 - 1.66 微笑调查队🌻(Av278681824,P1).mp4",
+    video_file="data/examples/bilibili/BV19w411A7YJ.mp4",
    height=1024, width=1024)
 input_video = [video[i] for i in range(40*60, 41*60)]

@@ -59,7 +61,6 @@ output_video = pipe(
    animatediff_batch_size=32, animatediff_stride=16,
    vram_limit_level=0,
 )
-output_video = smoother(output_video)

 # Save video
 save_video(output_video, "output_video.mp4", fps=60)
--- a/examples/ExVideo/ExVideo_svd_test.py
+++ b/examples/ExVideo/ExVideo_svd_test.py
@@ -1,4 +1,4 @@
-from diffsynth import save_video, ModelManager, SVDVideoPipeline, HunyuanDiTImagePipeline
+from diffsynth import save_video, ModelManager, SVDVideoPipeline, HunyuanDiTImagePipeline, download_models
 from diffsynth import ModelManager
 import torch, os

@@ -31,7 +31,14 @@ import torch, os
 def generate_image():
    # Load models
    os.environ["TOKENIZERS_PARALLELISM"] = "True"
-    model_manager = ModelManager(torch_dtype=torch.float16, device="cuda", model_id_list=["HunyuanDiT"])
+    download_models(["HunyuanDiT"])
+    model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+                                 file_path_list=[
+                                     "models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin",
+                                     "models/HunyuanDiT/t2i/mt5/pytorch_model.bin",
+                                     "models/HunyuanDiT/t2i/model/pytorch_model_ema.pt",
+                                     "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin",
+                                 ])
    pipe = HunyuanDiTImagePipeline.from_model_manager(model_manager)

    # Generate an image
@@ -47,7 +54,12 @@ def generate_image():

 def generate_video(image):
    # Load models
-    model_manager = ModelManager(torch_dtype=torch.float16, device="cuda", model_id_list=["stable-video-diffusion-img2vid-xt", "ExVideo-SVD-128f-v1"])
+    download_models(["stable-video-diffusion-img2vid-xt", "ExVideo-SVD-128f-v1"])
+    model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+                                 file_path_list=[
+                                     "models/stable_video_diffusion/svd_xt.safetensors",
+                                     "models/stable_video_diffusion/model.fp16.safetensors",
+                                 ])
    pipe = SVDVideoPipeline.from_model_manager(model_manager)

    # Generate a video
@@ -65,7 +77,12 @@ def generate_video(image):

 def upscale_video(image, video):
    # Load models
-    model_manager = ModelManager(torch_dtype=torch.float16, device="cuda", model_id_list=["stable-video-diffusion-img2vid-xt", "ExVideo-SVD-128f-v1"])
+    download_models(["stable-video-diffusion-img2vid-xt", "ExVideo-SVD-128f-v1"])
+    model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+                                 file_path_list=[
+                                     "models/stable_video_diffusion/svd_xt.safetensors",
+                                     "models/stable_video_diffusion/model.fp16.safetensors",
+                                 ])
    pipe = SVDVideoPipeline.from_model_manager(model_manager)

    # Generate a video
--- a/examples/Ip-Adapter/README.md
+++ b/examples/Ip-Adapter/README.md
@@ -1,3 +1,44 @@
 # IP-Adapter

-The features of IP-Adapter in DiffSynth Studio is not completed. Please wait for us.
+IP-Adapter is an interesting model, which can adopt the content or style of another image to generate a new image.
+
+## Example: Content Controlling in Stable Diffusion
+
+Based on Stable Diffusion, we can transfer the object to another scene. See [`sd_ipadapter.py`](./sd_ipadapter.py).
+
+|First, we generate a car. The prompt is "masterpiece, best quality, a car".|Next, utilizing IP-Adapter, we move the car to the road. The prompt is "masterpiece, best quality, a car running on the road".|
+|-|-|
+|![car](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/8530a2f0-f610-4269-a22c-ac6c2f21fc18)|![car_on_the_road](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/b8ccddb2-c423-46d8-bd1a-327fcc074a36)|
+
+## Example: Content and Style Controlling in Stable Diffusion XL
+
+The IP-Adapter model based on Stable Diffusion XL is more powerful. You have the option to use the content or style. See [`sdxl_ipadapter.py`](./sdxl_ipadapter.py).
+
+* Content controlling (original usage of IP-Adapter)
+
+|First, we generate a rabbit.|Next, enable IP-Adapter and let the rabbit jump.|For comparison, disable IP-Adapter to see the generated image.|
+|-|-|-|
+|![rabbit](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/4b452634-ec57-414f-897a-f8c50c74a650)|![rabbit_to_jumping_rabbit](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/b93c5495-0b77-4d97-bcd3-3942858288f2)|![rabbit_to_jumping_rabbit_without_ipa](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/52f37195-65b3-4a38-8d9b-73df37311c15)|
+
+
+* Style controlling (InstantStyle)
+
+|First, we generate a rabbit.|Next, enable InstantStyle and convert the rabbit to a cat.|For comparison, disable IP-Adapter to see the generated image.|
+|-|-|-|
+|![rabbit](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/4b452634-ec57-414f-897a-f8c50c74a650)|![rabbit_to_cat](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/a006b281-f643-4ea9-b0da-712289c96059)|![rabbit_to_cat_without_ipa](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/189bd11e-7a10-4c09-8554-0eebde9150fd)|
+
+## Example: Image Fusing (Experimental)
+
+Since IP-Adapter can control the content based on more than one image, we can do something interesting. See [`sdxl_ipadapter_multi_reference.py`](sdxl_ipadapter_multi_reference.py).
+
+We have two Pokémon here:
+
+|Charizard|Pikachu|
+|-|-|
+|![](https://media.52poke.com/wiki/7/7e/006Charizard.png)|![](https://media.52poke.com/wiki/0/0d/025Pikachu.png)|
+
+Fuse!
+
+|Pikazard ???|
+|-|
+|![Pikazard](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/807cdb31-94f5-4cc2-a978-3c6a7ffedc5b)|
--- a/examples/Ip-Adapter/sd_ipadapter.py
+++ b/examples/Ip-Adapter/sd_ipadapter.py
@@ -0,0 +1,38 @@
+from diffsynth import ModelManager, SDImagePipeline, download_models
+import torch
+
+
+# Download models (automatically)
+# `models/stable_diffusion/dreamshaper_8.safetensors`: [link](https://civitai.com/api/download/models/128713?type=Model&format=SafeTensor&size=pruned&fp=fp16)
+# `models/IpAdapter/stable_diffusion/image_encoder/model.safetensors`: [link](https://huggingface.co/h94/IP-Adapter/resolve/main/models/image_encoder/model.safetensors)
+# `models/IpAdapter/stable_diffusion/ip-adapter_sd15.bin`: [link](https://huggingface.co/h94/IP-Adapter/resolve/main/models/ip-adapter_sd15.bin)
+# `models/textual_inversion/verybadimagenegative_v1.3.pt`: [link](https://civitai.com/api/download/models/25820?type=Model&format=PickleTensor&size=full&fp=fp16)
+download_models(["DreamShaper_8", "IP-Adapter-SD", "TextualInversion_VeryBadImageNegative_v1.3"])
+
+# Load models (the base checkpoint is the DreamShaper 8 file listed above)
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
+model_manager.load_textual_inversions("models/textual_inversion")
+model_manager.load_models([
+    "models/stable_diffusion/dreamshaper_8.safetensors",
+    "models/IpAdapter/stable_diffusion/image_encoder/model.safetensors",
+    "models/IpAdapter/stable_diffusion/ip-adapter_sd15.bin"
+])
+pipe = SDImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(1)
+style_image = pipe(  # first pass: plain text-to-image, no IP-Adapter input
+    prompt="masterpiece, best quality, a car",
+    negative_prompt="verybadimagenegative_v1.3",
+    cfg_scale=7, clip_skip=2,
+    height=512, width=512, num_inference_steps=50,
+)
+style_image.save("car.jpg")  # the same image is passed below as the IP-Adapter reference
+
+image = pipe(  # second pass: same pipeline, now conditioned on the generated car image
+    prompt="masterpiece, best quality, a car running on the road",
+    negative_prompt="verybadimagenegative_v1.3",
+    cfg_scale=7, clip_skip=2,
+    height=512, width=512, num_inference_steps=50,
+    ipadapter_images=[style_image], ipadapter_scale=1.0
+)
+image.save("car_on_the_road.jpg")
--- a/examples/Ip-Adapter/sdxl_ipadapter.py
+++ b/examples/Ip-Adapter/sdxl_ipadapter.py
@@ -1,36 +1,61 @@
-from diffsynth import ModelManager, SDXLImagePipeline
+from diffsynth import ModelManager, SDXLImagePipeline, download_models
 import torch


-# Download models
+# Download models (automatically)
 # `models/stable_diffusion_xl/sd_xl_base_1.0.safetensors`: [link](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors)
-# `models/IpAdapter/image_encoder/model.safetensors`: [link](https://huggingface.co/h94/IP-Adapter/resolve/main/sdxl_models/image_encoder/model.safetensors)
-# `models/IpAdapter/ip-adapter_sdxl.bin`: [link](https://huggingface.co/h94/IP-Adapter/resolve/main/sdxl_models/ip-adapter_sdxl.safetensors)
+# `models/IpAdapter/stable_diffusion_xl/image_encoder/model.safetensors`: [link](https://huggingface.co/h94/IP-Adapter/resolve/main/sdxl_models/image_encoder/model.safetensors)
+# `models/IpAdapter/stable_diffusion_xl/ip-adapter_sdxl.bin`: [link](https://huggingface.co/h94/IP-Adapter/resolve/main/sdxl_models/ip-adapter_sdxl.safetensors)
+download_models(["StableDiffusionXL_v1", "IP-Adapter-SDXL"])

 # Load models
 model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
 model_manager.load_models([
    "models/stable_diffusion_xl/sd_xl_base_1.0.safetensors",
-    "models/IpAdapter/image_encoder/model.safetensors",
-    "models/IpAdapter/ip-adapter_sdxl.bin"
+    "models/IpAdapter/stable_diffusion_xl/image_encoder/model.safetensors",
+    "models/IpAdapter/stable_diffusion_xl/ip-adapter_sdxl.bin"
 ])
 pipe = SDXLImagePipeline.from_model_manager(model_manager)
-pipe.ipadapter.set_less_adapter()

-torch.manual_seed(0)
+torch.manual_seed(123456)
 style_image = pipe(
-    prompt="Starry Night, blue sky, by van Gogh",
-    negative_prompt="dark, gray",
+    prompt="a rabbit in a garden, colorful flowers",
+    negative_prompt="anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured",
    cfg_scale=5,
-    height=1024, width=1024, num_inference_steps=30,
+    height=1024, width=1024, num_inference_steps=50,
 )
-style_image.save("style_image.jpg")
+style_image.save("rabbit.jpg")

 image = pipe(
    prompt="a cat",
    negative_prompt="",
    cfg_scale=5,
-    height=1024, width=1024, num_inference_steps=30,
-    ipadapter_images=[style_image]
+    height=1024, width=1024, num_inference_steps=50,
+    ipadapter_images=[style_image], ipadapter_use_instant_style=True
 )
-image.save("transferred_image.jpg")
+image.save("rabbit_to_cat.jpg")
+
+image = pipe(
+    prompt="a rabbit is jumping",
+    negative_prompt="",
+    cfg_scale=5,
+    height=1024, width=1024, num_inference_steps=50,
+    ipadapter_images=[style_image], ipadapter_use_instant_style=False, ipadapter_scale=0.5
+)
+image.save("rabbit_to_jumping_rabbit.jpg")
+
+image = pipe(
+    prompt="a cat",
+    negative_prompt="",
+    cfg_scale=5,
+    height=1024, width=1024, num_inference_steps=50,
+)
+image.save("rabbit_to_cat_without_ipa.jpg")
+
+image = pipe(
+    prompt="a rabbit is jumping",
+    negative_prompt="",
+    cfg_scale=5,
+    height=1024, width=1024, num_inference_steps=50,
+)
+image.save("rabbit_to_jumping_rabbit_without_ipa.jpg")
--- a/examples/Ip-Adapter/sdxl_ipadapter_multi_reference.py
+++ b/examples/Ip-Adapter/sdxl_ipadapter_multi_reference.py
@@ -0,0 +1,34 @@
+from diffsynth import ModelManager, SDXLImagePipeline, download_models
+import torch, requests
+from PIL import Image
+
+
+# Download models (automatically)
+# `models/stable_diffusion_xl/bluePencilXL_v200.safetensors`: [link](https://civitai.com/api/download/models/245614?type=Model&format=SafeTensor&size=pruned&fp=fp16)
+# `models/IpAdapter/stable_diffusion_xl/image_encoder/model.safetensors`: [link](https://huggingface.co/h94/IP-Adapter/resolve/main/sdxl_models/image_encoder/model.safetensors)
+# `models/IpAdapter/stable_diffusion_xl/ip-adapter_sdxl.bin`: [link](https://huggingface.co/h94/IP-Adapter/resolve/main/sdxl_models/ip-adapter_sdxl.safetensors)
+download_models(["BluePencilXL_v200", "IP-Adapter-SDXL"])
+
+# Load models
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
+model_manager.load_models([
+    "models/stable_diffusion_xl/bluePencilXL_v200.safetensors",
+    "models/IpAdapter/stable_diffusion_xl/image_encoder/model.safetensors",
+    "models/IpAdapter/stable_diffusion_xl/ip-adapter_sdxl.bin"
+])
+pipe = SDXLImagePipeline.from_model_manager(model_manager)
+
+image_1 = Image.open(requests.get("https://media.52poke.com/wiki/7/7e/006Charizard.png", stream=True, timeout=30).raw).convert("RGB").resize((1024, 1024))
+image_1.save("Charizard.jpg")  # local copy of the first reference image (RGB, 1024x1024)
+image_2 = Image.open(requests.get("https://media.52poke.com/wiki/0/0d/025Pikachu.png", stream=True, timeout=30).raw).convert("RGB").resize((1024, 1024))
+image_2.save("Pikachu.jpg")  # local copy of the second reference image (RGB, 1024x1024)
+
+torch.manual_seed(0)
+image = pipe(  # both reference images are passed to the IP-Adapter in a single call
+    prompt="a pokemon, maybe Charizard, maybe Pikachu",
+    negative_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
+    cfg_scale=5,
+    height=1024, width=1024, num_inference_steps=50,
+    ipadapter_images=[image_1, image_2], ipadapter_use_instant_style=False, ipadapter_scale=0.7
+)
+image.save("Pikazard.jpg")
--- a/examples/diffsynth/sd_video_rerender.py
+++ b/examples/diffsynth/sd_video_rerender.py
@@ -1,24 +1,31 @@
-from diffsynth import ModelManager, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video
+from diffsynth import ModelManager, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video, download_models
 from diffsynth.processors.FastBlend import FastBlendSmoother
 from diffsynth.processors.PILEditor import ContrastEditor, SharpnessEditor
 from diffsynth.processors.sequencial_processor import SequencialProcessor
 import torch


-# Download models
+# Download models (automatically)
 # `models/stable_diffusion/dreamshaper_8.safetensors`: [link](https://civitai.com/api/download/models/128713?type=Model&format=SafeTensor&size=pruned&fp=fp16)
 # `models/ControlNet/control_v11f1p_sd15_depth.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11f1p_sd15_depth.pth)
 # `models/ControlNet/control_v11p_sd15_softedge.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11p_sd15_softedge.pth)
 # `models/Annotators/dpt_hybrid-midas-501f0c75.pt`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/dpt_hybrid-midas-501f0c75.pt)
 # `models/Annotators/ControlNetHED.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/ControlNetHED.pth)
+download_models([
+    "ControlNet_v11f1p_sd15_depth",
+    "ControlNet_v11p_sd15_softedge",
+    "DreamShaper_8"
+])

 # Load models
-model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
-model_manager.load_models([
-    "models/stable_diffusion/dreamshaper_8.safetensors",
-    "models/ControlNet/control_v11f1p_sd15_depth.pth",
-    "models/ControlNet/control_v11p_sd15_softedge.pth"
-])
+model_manager = ModelManager(
+    torch_dtype=torch.float16, device="cuda",
+    file_path_list=[
+        "models/stable_diffusion/dreamshaper_8.safetensors",
+        "models/ControlNet/control_v11f1p_sd15_depth.pth",
+        "models/ControlNet/control_v11p_sd15_softedge.pth",
+    ]
+)
 pipe = SDVideoPipeline.from_model_manager(
    model_manager,
    [
@@ -38,7 +45,7 @@ smoother = SequencialProcessor([FastBlendSmoother(), ContrastEditor(rate=1.1), S

 # Load video
 # Original video: https://pixabay.com/videos/flow-rocks-water-fluent-stones-159627/
-video = VideoData(video_file="data/pixabay100/159627 (1080p).mp4", height=512, width=768)
+video = VideoData(video_file="data/examples/pixabay100/159627 (1080p).mp4", height=512, width=768)
 input_video = [video[i] for i in range(128)]

 # Rerender
--- a/examples/hunyuan_dit/README.md
+++ b/examples/hunyuan_dit/README.md
@@ -20,6 +20,14 @@ models/HunyuanDiT/
        └── diffusion_pytorch_model.bin
 ```

+You can use the following code to download these files:
+
+```python
+from diffsynth import download_models
+
+download_models(["HunyuanDiT"])
+```
+
 ## Inference

 ### Text-to-image with highres-fix
--- a/examples/image_synthesis/sd_prompt_refining.py
+++ b/examples/image_synthesis/sd_prompt_refining.py
@@ -1,16 +1,15 @@
-from diffsynth import ModelManager, SDXLImagePipeline
+from diffsynth import ModelManager, SDXLImagePipeline, download_models
 import torch


-# Download models
+# Download models (automatically)
 # `models/stable_diffusion_xl/sd_xl_base_1.0.safetensors`: [link](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors)
 # `models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd/`: [link](https://huggingface.co/alibaba-pai/pai-bloom-1b1-text2prompt-sd)
 # `models/translator/opus-mt-zh-en/`: [link](https://huggingface.co/Helsinki-NLP/opus-mt-en-zh)
-
+download_models(["StableDiffusionXL_v1", "BeautifulPrompt", "opus-mt-zh-en"])

 # Load models
 model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
-model_manager.load_textual_inversions("models/textual_inversion")
 model_manager.load_models([
    "models/stable_diffusion_xl/sd_xl_base_1.0.safetensors",
    "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd/model.safetensors",
--- a/examples/image_synthesis/sd_text_to_image.py
+++ b/examples/image_synthesis/sd_text_to_image.py
@@ -1,23 +1,23 @@
-from diffsynth import ModelManager, SDImagePipeline, ControlNetConfigUnit
+from diffsynth import ModelManager, SDImagePipeline, ControlNetConfigUnit, download_models
 import torch


-# Download models
+# Download models (automatically)
 # `models/stable_diffusion/aingdiffusion_v12.safetensors`: [link](https://civitai.com/api/download/models/229575?type=Model&format=SafeTensor&size=full&fp=fp16)
 # `models/ControlNet/control_v11p_sd15_lineart.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11p_sd15_lineart.pth)
 # `models/ControlNet/control_v11f1e_sd15_tile.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11f1e_sd15_tile.pth)
 # `models/Annotators/sk_model.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model.pth)
 # `models/Annotators/sk_model2.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model2.pth)
+download_models(["AingDiffusion_v12", "ControlNet_v11p_sd15_lineart", "ControlNet_v11f1e_sd15_tile"])


 # Load models
-model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
-model_manager.load_textual_inversions("models/textual_inversion")
-model_manager.load_models([
-    "models/stable_diffusion/aingdiffusion_v12.safetensors",
-    "models/ControlNet/control_v11f1e_sd15_tile.pth",
-    "models/ControlNet/control_v11p_sd15_lineart.pth"
-])
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+                             file_path_list=[
+                                 "models/stable_diffusion/aingdiffusion_v12.safetensors",
+                                 "models/ControlNet/control_v11f1e_sd15_tile.pth",
+                                 "models/ControlNet/control_v11p_sd15_lineart.pth"
+                             ])
 pipe = SDImagePipeline.from_model_manager(
    model_manager,
    [
--- a/examples/image_synthesis/sdxl_text_to_image.py
+++ b/examples/image_synthesis/sdxl_text_to_image.py
@@ -1,10 +1,10 @@
-from diffsynth import ModelManager, SDXLImagePipeline
+from diffsynth import ModelManager, SDXLImagePipeline, download_models
 import torch


-# Download models
+# Download models (automatically)
 # `models/stable_diffusion_xl/bluePencilXL_v200.safetensors`: [link](https://civitai.com/api/download/models/245614?type=Model&format=SafeTensor&size=pruned&fp=fp16)
-
+download_models(["BluePencilXL_v200"])

 # Load models
 model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
--- a/examples/image_synthesis/sdxl_turbo.py
+++ b/examples/image_synthesis/sdxl_turbo.py
@@ -1,10 +1,10 @@
-from diffsynth import ModelManager, SDXLImagePipeline
+from diffsynth import ModelManager, SDXLImagePipeline, download_models
 import torch


-# Download models
+# Download models (automatically)
 # `models/stable_diffusion_xl_turbo/sd_xl_turbo_1.0_fp16.safetensors`: [link](https://huggingface.co/stabilityai/sdxl-turbo/resolve/main/sd_xl_turbo_1.0_fp16.safetensors)
-
+download_models(["StableDiffusionXL_Turbo"])

 # Load models
 model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
--- a/examples/video_synthesis/sd_text_to_video.py
+++ b/examples/video_synthesis/sd_text_to_video.py
@@ -1,13 +1,13 @@
-from diffsynth import ModelManager, SDImagePipeline, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video, save_frames
+from diffsynth import ModelManager, SDImagePipeline, SDVideoPipeline, save_video, download_models
 from diffsynth.extensions.RIFE import RIFEInterpolater
 import torch


-# Download models
+# Download models (automatically)
 # `models/stable_diffusion/dreamshaper_8.safetensors`: [link](https://civitai.com/api/download/models/128713?type=Model&format=SafeTensor&size=pruned&fp=fp16)
 # `models/AnimateDiff/mm_sd_v15_v2.ckpt`: [link](https://huggingface.co/guoyww/animatediff/resolve/main/mm_sd_v15_v2.ckpt)
 # `models/RIFE/flownet.pkl`: [link](https://drive.google.com/file/d/1APIzVeI-4ZZCEuIRE1m6WYfSCaOsi_7_/view?usp=sharing)
-
+download_models(["DreamShaper_8", "AnimateDiff_v2", "RIFE"])

 # Load models
 model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
--- a/examples/video_synthesis/sdxl_text_to_video.py
+++ b/examples/video_synthesis/sdxl_text_to_video.py
@@ -1,11 +1,11 @@
-from diffsynth import ModelManager, SDXLVideoPipeline, save_video
+from diffsynth import ModelManager, SDXLVideoPipeline, save_video, download_models
 import torch


-# Download models
+# Download models (automatically)
 # `models/stable_diffusion_xl/sd_xl_base_1.0.safetensors`: [link](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors)
 # `models/AnimateDiff/mm_sdxl_v10_beta.ckpt`: [link](https://huggingface.co/guoyww/animatediff/resolve/main/mm_sdxl_v10_beta.ckpt)
-
+download_models(["StableDiffusionXL_v1", "AnimateDiff_xl_beta"])

 model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
 model_manager.load_models([
@@ -25,4 +25,4 @@ video = pipe(
    height=1024, width=1024, num_frames=16,
    num_inference_steps=100,
 )
-save_video(video, "video.mp4", fps=16)
+save_video(video, "output_video.mp4", fps=16)
--- a/examples/video_synthesis/svd_text_to_video.py
+++ b/examples/video_synthesis/svd_text_to_video.py
@@ -1,12 +1,12 @@
-from diffsynth import save_video, SDXLImagePipeline, ModelManager, SVDVideoPipeline
+from diffsynth import save_video, SDXLImagePipeline, ModelManager, SVDVideoPipeline, download_models
 from diffsynth import ModelManager
 import torch


-# Download models
+# Download models (automatically)
 # `models/stable_diffusion_xl/sd_xl_base_1.0.safetensors`: [link](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors)
 # `models/stable_video_diffusion/svd_xt.safetensors`: [link](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt/resolve/main/svd_xt.safetensors)
-
+download_models(["StableDiffusionXL_v1", "stable-video-diffusion-img2vid-xt"])

 prompt = "cloud, wind"
 torch.manual_seed(0)
@@ -21,8 +21,7 @@ image = pipe(
    cfg_scale=6,
    height=1024, width=1024, num_inference_steps=50,
 )
-pipe.to("cpu")
-torch.cuda.empty_cache()
+model_manager.to("cpu")

 # 2. Image-to-video using SVD
 model_manager = ModelManager()
@@ -34,4 +33,4 @@ video = pipe(
    motion_bucket_id=127,
    num_inference_steps=50
 )
-save_video(video, "video.mp4", fps=15)
+save_video(video, "output_video.mp4", fps=15)