mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-03-19 14:58:12 +00:00
Compare commits
67 Commits
examples-u
...
doc
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b13963382c | ||
|
|
e11cf9e318 | ||
|
|
4f93be2f5a | ||
|
|
c9c6be2201 | ||
|
|
2b07df1c7a | ||
|
|
31161317e3 | ||
|
|
787813ab98 | ||
|
|
9fef3ee281 | ||
|
|
b205513041 | ||
|
|
97dd398f17 | ||
|
|
85ac23c0c3 | ||
|
|
b4073d2643 | ||
|
|
9583f16a43 | ||
|
|
b0633ac8bb | ||
|
|
9166a6742c | ||
|
|
10cfa6d711 | ||
|
|
b78ffbe09e | ||
|
|
64af33fe33 | ||
|
|
1180f450ca | ||
|
|
99726e02de | ||
|
|
e0c09ed53d | ||
|
|
250ebf5c72 | ||
|
|
47a2f86f7b | ||
|
|
e2d9710d86 | ||
|
|
384ea0dc69 | ||
|
|
e0ef3eea60 | ||
|
|
ac67acd235 | ||
|
|
fe68a3d1bb | ||
|
|
deff4512f7 | ||
|
|
29efb1c828 | ||
|
|
e833a31909 | ||
|
|
b626d2aad7 | ||
|
|
0bc89f973e | ||
|
|
3eeaa1cd32 | ||
|
|
4b25495921 | ||
|
|
ac2b187b9f | ||
|
|
eece711313 | ||
|
|
4e1cea64ad | ||
|
|
1a2ce26d37 | ||
|
|
b17a0297a2 | ||
|
|
9af2d08a33 | ||
|
|
b2df73d033 | ||
|
|
3514eba956 | ||
|
|
fb0e5d1f38 | ||
|
|
b43cc35dd9 | ||
|
|
34ca18a217 | ||
|
|
550d780cd6 | ||
|
|
ded2882e87 | ||
|
|
f6e676cdf9 | ||
|
|
157ba2e426 | ||
|
|
1a004ffe81 | ||
|
|
70c4ff4121 | ||
|
|
883d26abb4 | ||
|
|
105d4ffbc2 | ||
|
|
24b78148b8 | ||
|
|
793062e141 | ||
|
|
98f07f2435 | ||
|
|
ca4b9c8bf4 | ||
|
|
a2ab597eb0 | ||
|
|
950fb486d6 | ||
|
|
28b4a5313e | ||
|
|
d9d37568a7 | ||
|
|
55f1a10255 | ||
|
|
677ecbf1d2 | ||
|
|
5a06ac5e31 | ||
|
|
41f58e2d41 | ||
|
|
7f6e35fe35 |
20
docs/Makefile
Normal file
20
docs/Makefile
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
# Minimal makefile for Sphinx documentation
|
||||||
|
#
|
||||||
|
|
||||||
|
# You can set these variables from the command line, and also
|
||||||
|
# from the environment for the first two.
|
||||||
|
SPHINXOPTS ?=
|
||||||
|
SPHINXBUILD ?= sphinx-build
|
||||||
|
SOURCEDIR = source
|
||||||
|
BUILDDIR = build
|
||||||
|
|
||||||
|
# Put it first so that "make" without argument is like "make help".
|
||||||
|
help:
|
||||||
|
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||||
|
|
||||||
|
.PHONY: help Makefile
|
||||||
|
|
||||||
|
# Catch-all target: route all unknown targets to Sphinx using the new
|
||||||
|
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
||||||
|
%: Makefile
|
||||||
|
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||||
35
docs/make.bat
Normal file
35
docs/make.bat
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
@ECHO OFF
|
||||||
|
|
||||||
|
pushd %~dp0
|
||||||
|
|
||||||
|
REM Command file for Sphinx documentation
|
||||||
|
|
||||||
|
if "%SPHINXBUILD%" == "" (
|
||||||
|
set SPHINXBUILD=sphinx-build
|
||||||
|
)
|
||||||
|
set SOURCEDIR=source
|
||||||
|
set BUILDDIR=build
|
||||||
|
|
||||||
|
%SPHINXBUILD% >NUL 2>NUL
|
||||||
|
if errorlevel 9009 (
|
||||||
|
echo.
|
||||||
|
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
|
||||||
|
echo.installed, then set the SPHINXBUILD environment variable to point
|
||||||
|
echo.to the full path of the 'sphinx-build' executable. Alternatively you
|
||||||
|
echo.may add the Sphinx directory to PATH.
|
||||||
|
echo.
|
||||||
|
echo.If you don't have Sphinx installed, grab it from
|
||||||
|
echo.https://www.sphinx-doc.org/
|
||||||
|
exit /b 1
|
||||||
|
)
|
||||||
|
|
||||||
|
if "%1" == "" goto help
|
||||||
|
|
||||||
|
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||||
|
goto end
|
||||||
|
|
||||||
|
:help
|
||||||
|
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||||
|
|
||||||
|
:end
|
||||||
|
popd
|
||||||
27
docs/source/.readthedocs.yaml
Normal file
27
docs/source/.readthedocs.yaml
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
# .readthedocs.yaml
|
||||||
|
# Read the Docs configuration file
|
||||||
|
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
|
||||||
|
|
||||||
|
# Required
|
||||||
|
version: 2
|
||||||
|
|
||||||
|
# Set the version of Python and other tools you might need
|
||||||
|
build:
|
||||||
|
os: ubuntu-22.04
|
||||||
|
tools:
|
||||||
|
python: "3.11"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Build documentation in the docs/ directory with Sphinx
|
||||||
|
sphinx:
|
||||||
|
configuration: docs/source/conf.py
|
||||||
|
|
||||||
|
python:
|
||||||
|
install:
|
||||||
|
- requirements: docs/source/requirement.txt
|
||||||
|
# We recommend specifying your dependencies to enable reproducible builds:
|
||||||
|
# https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
|
||||||
|
# python:
|
||||||
|
# install:
|
||||||
|
# - requirements: docs/requirements.txt
|
||||||
49
docs/source/conf.py
Normal file
49
docs/source/conf.py
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
# Configuration file for the Sphinx documentation builder.
|
||||||
|
#
|
||||||
|
# For the full list of built-in configuration values, see the documentation:
|
||||||
|
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
||||||
|
|
||||||
|
# -- Project information -----------------------------------------------------
|
||||||
|
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
sys.path.insert(0, os.path.abspath('../../diffsynth'))
|
||||||
|
|
||||||
|
project = 'DiffSynth-Studio'
|
||||||
|
copyright = '2024, ModelScope'
|
||||||
|
author = 'ModelScope'
|
||||||
|
release = '0.1.0'
|
||||||
|
|
||||||
|
|
||||||
|
# -- General configuration ---------------------------------------------------
|
||||||
|
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
|
||||||
|
|
||||||
|
extensions = [
|
||||||
|
'sphinx.ext.autodoc',
|
||||||
|
'sphinx.ext.napoleon',
|
||||||
|
'sphinx.ext.doctest',
|
||||||
|
'sphinx.ext.intersphinx',
|
||||||
|
'sphinx.ext.todo',
|
||||||
|
'sphinx.ext.coverage',
|
||||||
|
'sphinx.ext.imgmath',
|
||||||
|
'sphinx.ext.viewcode',
|
||||||
|
'recommonmark',
|
||||||
|
'sphinx_markdown_tables'
|
||||||
|
]
|
||||||
|
|
||||||
|
templates_path = ['_templates']
|
||||||
|
exclude_patterns = []
|
||||||
|
|
||||||
|
|
||||||
|
source_suffix = ['.rst', '.md']
|
||||||
|
# -- Options for HTML output -------------------------------------------------
|
||||||
|
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
|
||||||
|
|
||||||
|
html_theme = 'sphinx_rtd_theme'
|
||||||
|
html_static_path = ['_static']
|
||||||
|
# multi-language docs
|
||||||
|
language = 'zh_CN'
|
||||||
|
locale_dirs = ['../locales/'] # path is example but recommended.
|
||||||
|
gettext_compact = False # optional.
|
||||||
|
gettext_uuid = True # optional.
|
||||||
133
docs/source/creating/AdaptersForImageSynthesis.md
Normal file
133
docs/source/creating/AdaptersForImageSynthesis.md
Normal file
@@ -0,0 +1,133 @@
|
|||||||
|
# ControlNet、LoRA、IP-Adapter——精准控制技术
|
||||||
|
|
||||||
|
在文生图模型的基础上,还可以使用各种 Adapter 架构的模型对生成过程进行控制。
|
||||||
|
|
||||||
|
接下来的例子会用到很多模型,我们先把它们下载好。
|
||||||
|
|
||||||
|
* 一个广受好评的 Stable Diffusion XL 架构动漫风格模型
|
||||||
|
* 一个支持多种控制模式的 ControlNet 模型
|
||||||
|
* 一个 Stable Diffusion XL 模型的 LoRA 模型
|
||||||
|
* 一个 IP-Adapter 模型及其对应的图像编码器
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import download_models
|
||||||
|
|
||||||
|
download_models([
|
||||||
|
"BluePencilXL_v200",
|
||||||
|
"ControlNet_union_sdxl_promax",
|
||||||
|
"SDXL_lora_zyd232_ChineseInkStyle_SDXL_v1_0",
|
||||||
|
"IP-Adapter-SDXL"
|
||||||
|
])
|
||||||
|
```
|
||||||
|
|
||||||
|
用基础文生图功能生成一张图
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDXLImagePipeline
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
|
||||||
|
model_manager.load_models(["models/stable_diffusion_xl/bluePencilXL_v200.safetensors"])
|
||||||
|
pipe = SDXLImagePipeline.from_model_manager(model_manager)
|
||||||
|
torch.manual_seed(1)
|
||||||
|
image = pipe(
|
||||||
|
prompt="masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait,",
|
||||||
|
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
|
||||||
|
cfg_scale=6, num_inference_steps=60,
|
||||||
|
)
|
||||||
|
image.save("image.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
接下来,我们让这位水下翩翩起舞的少女变成火系魔法师!启用 ControlNet 保持画面结构的同时,修改提示词。
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDXLImagePipeline, ControlNetConfigUnit
|
||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
|
||||||
|
model_manager.load_models([
|
||||||
|
"models/stable_diffusion_xl/bluePencilXL_v200.safetensors",
|
||||||
|
"models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors"
|
||||||
|
])
|
||||||
|
pipe = SDXLImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
|
||||||
|
ControlNetConfigUnit("depth", "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors", scale=1)
|
||||||
|
])
|
||||||
|
torch.manual_seed(2)
|
||||||
|
image = pipe(
|
||||||
|
prompt="masterpiece, best quality, solo, long hair, wavy hair, pink hair, red eyes, red dress, medium breasts, dress, fire ball, fire background, floating hair, refraction, portrait,",
|
||||||
|
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw, white background",
|
||||||
|
cfg_scale=6, num_inference_steps=60,
|
||||||
|
controlnet_image=Image.open("image.jpg")
|
||||||
|
)
|
||||||
|
image.save("image_controlnet.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
很酷对不对?还有更酷的,加个 LoRA,让画面更贴近手绘漫画的扁平风格。这个 LoRA 需要一定的触发词才能生效,这在原作者的模型页面有提到,记得在提示词的开头加上触发词哦。
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDXLImagePipeline, ControlNetConfigUnit
|
||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
|
||||||
|
model_manager.load_models([
|
||||||
|
"models/stable_diffusion_xl/bluePencilXL_v200.safetensors",
|
||||||
|
"models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors"
|
||||||
|
])
|
||||||
|
model_manager.load_lora("models/lora/zyd232_ChineseInkStyle_SDXL_v1_0.safetensors", lora_alpha=1.0)
|
||||||
|
pipe = SDXLImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
|
||||||
|
ControlNetConfigUnit("depth", "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors", scale=1.0)
|
||||||
|
])
|
||||||
|
torch.manual_seed(3)
|
||||||
|
image = pipe(
|
||||||
|
prompt="zydink, ink sketch, flat anime, masterpiece, best quality, solo, long hair, wavy hair, pink hair, red eyes, red dress, medium breasts, dress, fire ball, fire background, floating hair, refraction, portrait,",
|
||||||
|
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw, white background",
|
||||||
|
cfg_scale=6, num_inference_steps=60,
|
||||||
|
controlnet_image=Image.open("image.jpg")
|
||||||
|
)
|
||||||
|
image.save("image_lora.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
还没结束呢!找一张水墨风的中国画作为风格引导,启动 IP-Adapter,让古典艺术和现代美学碰撞!
|
||||||
|
|
||||||
|
|就用这张图作为风格引导吧||
|
||||||
|
|-|-|
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDXLImagePipeline, ControlNetConfigUnit
|
||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
|
||||||
|
model_manager.load_models([
|
||||||
|
"models/stable_diffusion_xl/bluePencilXL_v200.safetensors",
|
||||||
|
"models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors",
|
||||||
|
"models/IpAdapter/stable_diffusion_xl/ip-adapter_sdxl.bin",
|
||||||
|
"models/IpAdapter/stable_diffusion_xl/image_encoder/model.safetensors",
|
||||||
|
])
|
||||||
|
model_manager.load_lora("models/lora/zyd232_ChineseInkStyle_SDXL_v1_0.safetensors", lora_alpha=1.0)
|
||||||
|
pipe = SDXLImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
|
||||||
|
ControlNetConfigUnit("depth", "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors", scale=1.0)
|
||||||
|
])
|
||||||
|
torch.manual_seed(2)
|
||||||
|
image = pipe(
|
||||||
|
prompt="zydink, ink sketch, flat anime, masterpiece, best quality, solo, long hair, wavy hair, pink hair, red eyes, red dress, medium breasts, dress, fire ball, fire background, floating hair, refraction, portrait,",
|
||||||
|
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw, white background",
|
||||||
|
cfg_scale=6, num_inference_steps=60,
|
||||||
|
controlnet_image=Image.open("image.jpg"),
|
||||||
|
ipadapter_images=[Image.open("ink_style.jpg")],
|
||||||
|
ipadapter_use_instant_style=True, ipadapter_scale=0.5
|
||||||
|
)
|
||||||
|
image.save("image_ipadapter.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
用 Diffusion 生成图像的乐趣在于,各种生态模型的组合,可以实现各种奇思妙想。
|
||||||
65
docs/source/creating/BasicImageSynthesis.md
Normal file
65
docs/source/creating/BasicImageSynthesis.md
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
# 文生图、图生图、高分辨率修复——初识绚丽的 Diffusion
|
||||||
|
|
||||||
|
加载文生图模型,这里我们使用一个 Civiai 上一个动漫风格的模型作为例子。
|
||||||
|
|
||||||
|
```python
|
||||||
|
import torch
|
||||||
|
from diffsynth import ModelManager, SDImagePipeline, download_models
|
||||||
|
|
||||||
|
download_models(["AingDiffusion_v12"])
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
|
||||||
|
model_manager.load_models(["models/stable_diffusion/aingdiffusion_v12.safetensors"])
|
||||||
|
pipe = SDImagePipeline.from_model_manager(model_manager)
|
||||||
|
```
|
||||||
|
|
||||||
|
生成一张图小试身手。
|
||||||
|
|
||||||
|
```python
|
||||||
|
torch.manual_seed(0)
|
||||||
|
image = pipe(
|
||||||
|
prompt="masterpiece, best quality, a girl with long silver hair",
|
||||||
|
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
|
||||||
|
height=512, width=512, num_inference_steps=80,
|
||||||
|
)
|
||||||
|
image.save("image.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|
嗯,一个可爱的小姐姐。
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
用图生图功能把她的头发变成红色,只需要添加 `input_image` 和 `denoising_strength` 两个参数。其中 `denoising_strength` 用于控制加噪声的强度,为 0 时生成的图与输入的图完全一致,为 1 时完全随机生成图。
|
||||||
|
|
||||||
|
```python
|
||||||
|
torch.manual_seed(1)
|
||||||
|
image_edited = pipe(
|
||||||
|
prompt="masterpiece, best quality, a girl with long red hair",
|
||||||
|
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
|
||||||
|
height=512, width=512, num_inference_steps=80,
|
||||||
|
input_image=image, denoising_strength=0.6,
|
||||||
|
)
|
||||||
|
image_edited.save("image_edited.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|
嗯,一个红色头发的可爱小姐姐。
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
由于模型本身是在 512*512 分辨率下训练的,所以图片看起来有点模糊,不过我们可以利用模型自身的能力润色这张图,为其填充细节。具体来说,就是提高分辨率后进行图生图。
|
||||||
|
|
||||||
|
```python
|
||||||
|
torch.manual_seed(2)
|
||||||
|
image_highres = pipe(
|
||||||
|
prompt="masterpiece, best quality, a girl with long red hair",
|
||||||
|
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
|
||||||
|
height=1024, width=1024, num_inference_steps=80,
|
||||||
|
input_image=image_edited.resize((1024, 1024)), denoising_strength=0.6,
|
||||||
|
)
|
||||||
|
image_highres.save("image_highres.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|
嗯,一个清晰的红色头发可爱小姐姐。
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
值得注意的是,图生图和高分辨率修复功能是全局支持的,目前我们所有的图像生成流水线都可以这样使用。
|
||||||
108
docs/source/creating/ModelQuantization.md
Normal file
108
docs/source/creating/ModelQuantization.md
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
# 量化、卸载——显存优化的技术
|
||||||
|
|
||||||
|
Flux.1 的发布让文生图开源社区再次活跃起来,但是其12B的参数量限制了显存低于24GB设备的运行。Diffsynth对Flux支持了量化(quantization)和卸载(offload)这两种优化显存的技术,降低了使用Flux的硬件设备门槛,本篇文章将介绍它们的原理和使用方式。
|
||||||
|
|
||||||
|
|
||||||
|
## 量化
|
||||||
|
|
||||||
|
模型量化指的是一种将高精度数据类型映射成低精度数据类型的技术,从而以损失少量精度为代价降低计算的时间和空间消耗。Flux.1 默认使用的数据类型为bfloat16,即每个参数占用16 bit(2 byte),我们使用torch支持的float8_e4m3fn加载模型,就能以更低的显存(节约10GB左右显存)消耗生成和原先几乎相同质量的图片。
|
||||||
|
|
||||||
|
```python
|
||||||
|
import torch
|
||||||
|
from diffsynth import download_models, ModelManager, FluxImagePipeline
|
||||||
|
|
||||||
|
download_models(["FLUX.1-dev"])
|
||||||
|
|
||||||
|
model_manager = ModelManager(
|
||||||
|
torch_dtype=torch.bfloat16,
|
||||||
|
)
|
||||||
|
model_manager.load_models([
|
||||||
|
"models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
|
||||||
|
"models/FLUX/FLUX.1-dev/text_encoder_2",
|
||||||
|
"models/FLUX/FLUX.1-dev/ae.safetensors",
|
||||||
|
])
|
||||||
|
model_manager.load_models(
|
||||||
|
["models/FLUX/FLUX.1-dev/flux1-dev.safetensors"],
|
||||||
|
torch_dtype=torch.float8_e4m3fn # Load the DiT model in FP8 format.
|
||||||
|
)
|
||||||
|
|
||||||
|
pipe = FluxImagePipeline.from_model_manager(model_manager, device="cuda")
|
||||||
|
pipe.dit.quantize()
|
||||||
|
|
||||||
|
prompt = "CG, masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait. The girl's flowing silver hair shimmers with every color of the rainbow and cascades down, merging with the floating flora around her."
|
||||||
|
negative_prompt = "worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,"
|
||||||
|
|
||||||
|
torch.manual_seed(9)
|
||||||
|
image = pipe(
|
||||||
|
prompt=prompt,
|
||||||
|
num_inference_steps=50, embedded_guidance=3.5
|
||||||
|
)
|
||||||
|
image.save("image_1024.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<figure style="display: inline-block; margin-right: 20px;">
|
||||||
|
<img src="https://github.com/user-attachments/assets/d4c1699c-447b-4a5b-b453-4aa4d5ac066f" alt="图片1" width="300">
|
||||||
|
<figcaption>float8_e4m3fn</figcaption>
|
||||||
|
</figure>
|
||||||
|
<figure style="display: inline-block;">
|
||||||
|
<img src="https://github.com/user-attachments/assets/51b8854d-fafa-4d11-b1c6-8004bbd792e7" alt="图片2" width="300">
|
||||||
|
<figcaption>bfloat16</figcaption>
|
||||||
|
</figure>
|
||||||
|
</div>
|
||||||
|
<br>
|
||||||
|
|
||||||
|
Diffsynth还支持ControlNet的量化,只需要在加载模型时指定数据类型为 ```torch.float8_e4m3fn```, 并且在生成图片前调用对应ControlNet模型的```quantize()```方法即可:
|
||||||
|
```python
|
||||||
|
model_manager.load_models(
|
||||||
|
["models/ControlNet/jasperai/Flux.1-dev-Controlnet-Upscaler/diffusion_pytorch_model.safetensors"],
|
||||||
|
torch_dtype=torch.float8_e4m3fn
|
||||||
|
)
|
||||||
|
pipe = FluxImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
|
||||||
|
ControlNetConfigUnit(
|
||||||
|
processor_id="tile",
|
||||||
|
model_path="models/ControlNet/jasperai/Flux.1-dev-Controlnet-Upscaler/diffusion_pytorch_model.safetensors",
|
||||||
|
scale=0.7
|
||||||
|
),
|
||||||
|
],device="cuda")
|
||||||
|
for model in pipe.controlnet.models:
|
||||||
|
model.quantize()
|
||||||
|
```
|
||||||
|
|
||||||
|
除了推理阶段,Diffsynth也支持在Lora训练阶段使用模型量化,只需要在训练参数中额外添加`--quantize "float8_e4m3fn"`。
|
||||||
|
|
||||||
|
## 卸载
|
||||||
|
|
||||||
|
模型卸载技术的思想很简单,只在需要模型进行计算的时候才将模型加载到GPU显存上,使用完毕后将模型卸载至CPU内存中,牺牲模型加载和卸载的时间换取显存消耗。除了本体外,文生图模型的pipeline通常还包括VAE、Text Encoder等模型,在生成图片时会依次调用它们。使用卸载技术可以将显存需求降低至它们之中最大的模型。
|
||||||
|
Diffsynth支持对所有文生图模型使用卸载技术,要启用模型卸载,需要指定模型被加载至CPU上,pipeline运行在GPU上,再调用`enable_cpu_offload()`启用模型卸载,以Flux为例:
|
||||||
|
|
||||||
|
```python
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
|
||||||
|
model_manager.load_models([
|
||||||
|
"models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
|
||||||
|
"models/FLUX/FLUX.1-dev/text_encoder_2",
|
||||||
|
"models/FLUX/FLUX.1-dev/ae.safetensors",
|
||||||
|
"models/FLUX/FLUX.1-dev/flux1-dev.safetensors"
|
||||||
|
])
|
||||||
|
pipe = FluxImagePipeline.from_model_manager(model_manager,device="cuda")
|
||||||
|
pipe.enable_cpu_offload()
|
||||||
|
```
|
||||||
|
|
||||||
|
## 总结
|
||||||
|
|
||||||
|
模型量化和卸载都能有效降低显存消耗,并且互相兼容。模型卸载不会降低生成的图像质量,并且额外消耗的时间不多(受模型大小和设备通讯影响,通常每张图不超过3秒),因此在显存不足时优先推荐使用模型卸载。模型量化会损失部分图像质量,但在float8下质量差别不大。两种显存优化技术同时使用,可以将运行Flux的显存消耗从37GB降低至15GB。
|
||||||
|
|
||||||
|
## 支持量化的模型
|
||||||
|
### Flux
|
||||||
|
|
||||||
|
* https://modelscope.cn/models/AI-ModelScope/FLUX.1-dev
|
||||||
|
* https://modelscope.cn/models/AI-ModelScope/FLUX.1-schnell
|
||||||
|
### ControlNets
|
||||||
|
|
||||||
|
* https://modelscope.cn/models/InstantX/FLUX.1-dev-Controlnet-Union-alpha
|
||||||
|
* https://modelscope.cn/models/jasperai/Flux.1-dev-Controlnet-Depth
|
||||||
|
* https://modelscope.cn/models/jasperai/Flux.1-dev-Controlnet-Surface-Normals
|
||||||
|
* https://modelscope.cn/models/jasperai/Flux.1-dev-Controlnet-Upscaler
|
||||||
|
* https://modelscope.cn/models/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha
|
||||||
|
* https://modelscope.cn/models/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta
|
||||||
425
docs/source/creating/MultiControlnet.md
Normal file
425
docs/source/creating/MultiControlnet.md
Normal file
@@ -0,0 +1,425 @@
|
|||||||
|
# 猫猫、少女、FLUX、ControlNet——多 ControlNet 模型的灵活运用
|
||||||
|
|
||||||
|
文生图模型 FLUX 发布之后,开源社区为其适配了用于控制生成内容的模型——ControlNet,DiffSynth-Studio 为这些模型提供了支持,我们支持任意多个 ControlNet 模型的组合调用,即使这些模型的结构不同。本篇文章将展示这些 ControlNet 模型的灵活用法。
|
||||||
|
|
||||||
|
## Canny/Depth/Normal: 点对点结构控制
|
||||||
|
|
||||||
|
结构控制是 ControlNet 模型最基础的能力,通过使用 Canny 提取出边缘信息,或者使用深度图和法线贴图,都可以用于表示图像的结构,进而作为图像生成过程中的控制信息。
|
||||||
|
|
||||||
|
例如,我们生成一只猫猫,然后使用支持多控制条件的模型 InstantX/FLUX.1-dev-Controlnet-Union-alpha,同时启用 Canny 和 Depth 控制,让环境变为黄昏。
|
||||||
|
|
||||||
|
模型链接:https://modelscope.cn/models/InstantX/FLUX.1-dev-Controlnet-Union-alpha
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, FluxImagePipeline, ControlNetConfigUnit, download_models, download_customized_models
|
||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
download_models(["Annotators:Depth"])
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.bfloat16, model_id_list=["FLUX.1-dev", "InstantX/FLUX.1-dev-Controlnet-Union-alpha"])
|
||||||
|
pipe = FluxImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
|
||||||
|
ControlNetConfigUnit(
|
||||||
|
processor_id="canny",
|
||||||
|
model_path="models/ControlNet/InstantX/FLUX.1-dev-Controlnet-Union-alpha/diffusion_pytorch_model.safetensors",
|
||||||
|
scale=0.3
|
||||||
|
),
|
||||||
|
ControlNetConfigUnit(
|
||||||
|
processor_id="depth",
|
||||||
|
model_path="models/ControlNet/InstantX/FLUX.1-dev-Controlnet-Union-alpha/diffusion_pytorch_model.safetensors",
|
||||||
|
scale=0.3
|
||||||
|
),
|
||||||
|
])
|
||||||
|
image_1 = pipe(
|
||||||
|
prompt="a cat is running",
|
||||||
|
height=1024, width=1024,
|
||||||
|
seed=4
|
||||||
|
)
|
||||||
|
image_1.save("image_5.jpg")
|
||||||
|
image_2 = pipe(
|
||||||
|
prompt="sunshine, a cat is running",
|
||||||
|
controlnet_image=image_1,
|
||||||
|
height=1024, width=1024,
|
||||||
|
seed=5
|
||||||
|
)
|
||||||
|
image_2.save("image_6.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<img src="https://github.com/user-attachments/assets/19d2abc4-36ae-4163-a8da-df5732d1a737" alt="图片1" width="300" style="margin-right: 20px;">
|
||||||
|
<img src="https://github.com/user-attachments/assets/28378271-3782-484c-bd51-3d3311dd85c6" alt="图片2" width="300">
|
||||||
|
</div>
|
||||||
|
<br>
|
||||||
|
|
||||||
|
|
||||||
|
ControlNet 对于结构的控制力度是可以调节的,例如在下面这里例子中,我们把小姐姐从夏天移动到冬天时,适当调低 ControlNet 的控制力度,模型就会根据画面内容作出调整,为小姐姐换上温暖的衣服。
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, FluxImagePipeline, ControlNetConfigUnit, download_models, download_customized_models
|
||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
download_models(["Annotators:Depth"])
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.bfloat16, model_id_list=["FLUX.1-dev", "InstantX/FLUX.1-dev-Controlnet-Union-alpha"])
|
||||||
|
pipe = FluxImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
|
||||||
|
ControlNetConfigUnit(
|
||||||
|
processor_id="canny",
|
||||||
|
model_path="models/ControlNet/InstantX/FLUX.1-dev-Controlnet-Union-alpha/diffusion_pytorch_model.safetensors",
|
||||||
|
scale=0.3
|
||||||
|
),
|
||||||
|
ControlNetConfigUnit(
|
||||||
|
processor_id="depth",
|
||||||
|
model_path="models/ControlNet/InstantX/FLUX.1-dev-Controlnet-Union-alpha/diffusion_pytorch_model.safetensors",
|
||||||
|
scale=0.3
|
||||||
|
),
|
||||||
|
])
|
||||||
|
image_1 = pipe(
|
||||||
|
prompt="a beautiful Asian girl, full body, red dress, summer",
|
||||||
|
height=1024, width=1024,
|
||||||
|
seed=6
|
||||||
|
)
|
||||||
|
image_1.save("image_7.jpg")
|
||||||
|
image_2 = pipe(
|
||||||
|
prompt="a beautiful Asian girl, full body, red dress, winter",
|
||||||
|
controlnet_image=image_1,
|
||||||
|
height=1024, width=1024,
|
||||||
|
seed=7
|
||||||
|
)
|
||||||
|
image_2.save("image_8.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<img src="https://github.com/user-attachments/assets/a7b8555b-bfd9-4e92-aa77-16bca81b07e3" alt="图片1" width="300" style="margin-right: 20px;">
|
||||||
|
<img src="https://github.com/user-attachments/assets/a1bab36b-6cce-4f29-8233-4cb824b524a8" alt="图片2" width="300">
|
||||||
|
</div>
|
||||||
|
<br>
|
||||||
|
|
||||||
|
## Upscaler/Tile/Blur: 高清图像生成
|
||||||
|
|
||||||
|
支持高清化的 ControlNet 模型有很多,例如
|
||||||
|
|
||||||
|
模型链接: https://modelscope.cn/models/jasperai/Flux.1-dev-Controlnet-Upscaler, https://modelscope.cn/models/InstantX/FLUX.1-dev-Controlnet-Union-alpha, https://modelscope.cn/models/Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro
|
||||||
|
|
||||||
|
这些模型可以把模糊的、含噪点的低质量图像处理成清晰的图像。在 DiffSynth-Studio 中,框架原生支持的高分辨率分块处理技术可以突破模型的分辨率限制,实现 2048 甚至更高分辨率的图像生成,进一步放大了这些模型的能力。在下面的例子中,我们可以看到高清放大到 2048 分辨率的图片中,猫猫的毛发纤毫毕现,人物的皮肤纹理精致逼真。
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, FluxImagePipeline, ControlNetConfigUnit, download_models, download_customized_models
|
||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.bfloat16, model_id_list=["FLUX.1-dev", "jasperai/Flux.1-dev-Controlnet-Upscaler"])
|
||||||
|
pipe = FluxImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
|
||||||
|
ControlNetConfigUnit(
|
||||||
|
processor_id="tile",
|
||||||
|
model_path="models/ControlNet/jasperai/Flux.1-dev-Controlnet-Upscaler/diffusion_pytorch_model.safetensors",
|
||||||
|
scale=0.7
|
||||||
|
),
|
||||||
|
])
|
||||||
|
image_1 = pipe(
|
||||||
|
prompt="a photo of a cat, highly detailed",
|
||||||
|
height=768, width=768,
|
||||||
|
seed=0
|
||||||
|
)
|
||||||
|
image_1.save("image_1.jpg")
|
||||||
|
image_2 = pipe(
|
||||||
|
prompt="a photo of a cat, highly detailed",
|
||||||
|
controlnet_image=image_1.resize((2048, 2048)),
|
||||||
|
input_image=image_1.resize((2048, 2048)), denoising_strength=0.99,
|
||||||
|
height=2048, width=2048, tiled=True,
|
||||||
|
seed=1
|
||||||
|
)
|
||||||
|
image_2.save("image_2.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<img src="https://github.com/user-attachments/assets/9038158a-118c-4ad7-ab01-22865f6a06fc" alt="图片1" width="300" style="margin-right: 20px;">
|
||||||
|
<img src="https://github.com/user-attachments/assets/88583a33-cd74-4cb9-8fd4-c6e14c0ada0c" alt="图片2" width="300">
|
||||||
|
</div>
|
||||||
|
<br>
|
||||||
|
|
||||||
|
```python
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.bfloat16, model_id_list=["FLUX.1-dev", "jasperai/Flux.1-dev-Controlnet-Upscaler"])
|
||||||
|
pipe = FluxImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
|
||||||
|
ControlNetConfigUnit(
|
||||||
|
processor_id="tile",
|
||||||
|
model_path="models/ControlNet/jasperai/Flux.1-dev-Controlnet-Upscaler/diffusion_pytorch_model.safetensors",
|
||||||
|
scale=0.7
|
||||||
|
),
|
||||||
|
])
|
||||||
|
image_1 = pipe(
|
||||||
|
prompt="a beautiful Chinese girl, delicate skin texture",
|
||||||
|
height=768, width=768,
|
||||||
|
seed=2
|
||||||
|
)
|
||||||
|
image_1.save("image_3.jpg")
|
||||||
|
image_2 = pipe(
|
||||||
|
prompt="a beautiful Chinese girl, delicate skin texture",
|
||||||
|
controlnet_image=image_1.resize((2048, 2048)),
|
||||||
|
input_image=image_1.resize((2048, 2048)), denoising_strength=0.99,
|
||||||
|
height=2048, width=2048, tiled=True,
|
||||||
|
seed=3
|
||||||
|
)
|
||||||
|
image_2.save("image_4.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<img src="https://github.com/user-attachments/assets/13061ecf-bb57-448a-82c6-7e4655c9cd85" alt="图片1" width="300" style="margin-right: 20px;">
|
||||||
|
<img src="https://github.com/user-attachments/assets/0b7ae80f-de58-4d1d-a49c-ad17e7631bdc" alt="图片2" width="300"">
|
||||||
|
</div>
|
||||||
|
<br>
|
||||||
|
|
||||||
|
## Inpaint: 局部重绘
|
||||||
|
|
||||||
|
Inpaint 模型可以对图像中的特定区域进行重绘,比如,我们可以给猫猫戴上墨镜。
|
||||||
|
|
||||||
|
模型链接: https://modelscope.cn/models/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, FluxImagePipeline, ControlNetConfigUnit, download_models, download_customized_models
|
||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.bfloat16, model_id_list=["FLUX.1-dev", "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta"])
|
||||||
|
pipe = FluxImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
|
||||||
|
ControlNetConfigUnit(
|
||||||
|
processor_id="inpaint",
|
||||||
|
model_path="models/ControlNet/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta/diffusion_pytorch_model.safetensors",
|
||||||
|
scale=0.9
|
||||||
|
),
|
||||||
|
])
|
||||||
|
image_1 = pipe(
|
||||||
|
prompt="a cat sitting on a chair",
|
||||||
|
height=1024, width=1024,
|
||||||
|
seed=8
|
||||||
|
)
|
||||||
|
image_1.save("image_9.jpg")
|
||||||
|
mask = np.zeros((1024, 1024, 3), dtype=np.uint8)
|
||||||
|
mask[100:350, 350: -300] = 255
|
||||||
|
mask = Image.fromarray(mask)
|
||||||
|
mask.save("mask_9.jpg")
|
||||||
|
image_2 = pipe(
|
||||||
|
prompt="a cat sitting on a chair, wearing sunglasses",
|
||||||
|
controlnet_image=image_1, controlnet_inpaint_mask=mask,
|
||||||
|
height=1024, width=1024,
|
||||||
|
seed=9
|
||||||
|
)
|
||||||
|
image_2.save("image_10.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<img src="https://github.com/user-attachments/assets/babddad0-2d67-4624-b77a-c953250ebdab" alt="图片1" width="200" style="margin-right: 10px;">
|
||||||
|
<img src="https://github.com/user-attachments/assets/d5bc2878-1817-457a-bdfa-200f955233d3" alt="图片2" width="200" style="margin-right: 10px;">
|
||||||
|
<img src="https://github.com/user-attachments/assets/e3197f2c-190b-4522-83ab-a2e0451b39f6" alt="图片2" width="200">
|
||||||
|
</div>
|
||||||
|
<br>
|
||||||
|
|
||||||
|
但是我们注意到,猫猫的头部动作发生了变化,如果我们想要保留原来的结构特征,可以使用 canny、depth、normal 模型,DiffSynth-Studio 为不同结构的 ControlNet 提供了无缝的兼容支持。配合一个 normal ControlNet,我们可以保证局部重绘时画面结构不变。
|
||||||
|
|
||||||
|
模型链接:https://modelscope.cn/models/jasperai/Flux.1-dev-Controlnet-Surface-Normals
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, FluxImagePipeline, ControlNetConfigUnit, download_models, download_customized_models
|
||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.bfloat16, model_id_list=[
|
||||||
|
"FLUX.1-dev",
|
||||||
|
"jasperai/Flux.1-dev-Controlnet-Surface-Normals",
|
||||||
|
"alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta"
|
||||||
|
])
|
||||||
|
pipe = FluxImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
|
||||||
|
ControlNetConfigUnit(
|
||||||
|
processor_id="inpaint",
|
||||||
|
model_path="models/ControlNet/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta/diffusion_pytorch_model.safetensors",
|
||||||
|
scale=0.9
|
||||||
|
),
|
||||||
|
ControlNetConfigUnit(
|
||||||
|
processor_id="normal",
|
||||||
|
model_path="models/ControlNet/jasperai/Flux.1-dev-Controlnet-Surface-Normals/diffusion_pytorch_model.safetensors",
|
||||||
|
scale=0.6
|
||||||
|
),
|
||||||
|
])
|
||||||
|
image_1 = pipe(
|
||||||
|
prompt="a beautiful Asian woman looking at the sky, wearing a blue t-shirt.",
|
||||||
|
height=1024, width=1024,
|
||||||
|
seed=10
|
||||||
|
)
|
||||||
|
image_1.save("image_11.jpg")
|
||||||
|
mask = np.zeros((1024, 1024, 3), dtype=np.uint8)
|
||||||
|
mask[-400:, 10:-40] = 255
|
||||||
|
mask = Image.fromarray(mask)
|
||||||
|
mask.save("mask_11.jpg")
|
||||||
|
image_2 = pipe(
|
||||||
|
prompt="a beautiful Asian woman looking at the sky, wearing a yellow t-shirt.",
|
||||||
|
controlnet_image=image_1, controlnet_inpaint_mask=mask,
|
||||||
|
height=1024, width=1024,
|
||||||
|
seed=11
|
||||||
|
)
|
||||||
|
image_2.save("image_12.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<img src="https://github.com/user-attachments/assets/c028e6fc-5125-4cba-b35a-b6211c2e6600" alt="图片1" width="200" style="margin-right: 10px;">
|
||||||
|
<img src="https://github.com/user-attachments/assets/1928ee9a-7594-4c6e-9c71-5bd0b043d8f4" alt="图片2" width="200" style="margin-right: 10px;">
|
||||||
|
<img src="https://github.com/user-attachments/assets/97b3b9e1-f821-405e-971b-9e1c31a209aa" alt="图片2" width="200">
|
||||||
|
</div>
|
||||||
|
<br>
|
||||||
|
|
||||||
|
## MultiControlNet+MultiDiffusion: 精细的高阶控制
|
||||||
|
|
||||||
|
DiffSynth-Studio 不仅支持多个不同结构的 ControlNet 同时生效,还支持使用不同提示词分区控制图中内容,还支持超高分辨率大图的分块处理,这让我们能够作出极为精细的高阶控制。接下来,我们展示一张精美图片的创作过程。
|
||||||
|
|
||||||
|
首先使用提示词“a beautiful Asian woman and a cat on a bed. The woman wears a dress”生成一只猫猫和一位少女。
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
然后,启用 Inpaint ControlNet 和 Canny ControlNet
|
||||||
|
|
||||||
|
模型链接: https://modelscope.cn/models/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta, https://modelscope.cn/models/InstantX/FLUX.1-dev-Controlnet-Union-alpha
|
||||||
|
|
||||||
|
分两个区域进行控制:
|
||||||
|
|
||||||
|
|Prompt: an orange cat, highly detailed|Prompt: a girl wearing a red camisole|
|
||||||
|
|:-:|:-:|
|
||||||
|
|||
|
||||||
|
|
||||||
|
生成的结果:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
背景有点模糊,我们使用去模糊 LoRA,进行图生图
|
||||||
|
|
||||||
|
模型链接:https://modelscope.cn/models/LiblibAI/FLUX.1-dev-LoRA-AntiBlur
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
整个画面清晰多了,接下来使用高清化模型,把分辨率增加到 4096*4096!
|
||||||
|
|
||||||
|
模型链接:https://modelscope.cn/models/jasperai/Flux.1-dev-Controlnet-Upscaler
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
放大来看看
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
这一系列例子可以用以下代码“一条龙”式地生成:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, FluxImagePipeline, ControlNetConfigUnit, download_models, download_customized_models
|
||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
download_models(["Annotators:Depth", "Annotators:Normal"])
|
||||||
|
download_customized_models(
|
||||||
|
model_id="LiblibAI/FLUX.1-dev-LoRA-AntiBlur",
|
||||||
|
origin_file_path="FLUX-dev-lora-AntiBlur.safetensors",
|
||||||
|
local_dir="models/lora"
|
||||||
|
)
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.bfloat16, model_id_list=[
|
||||||
|
"FLUX.1-dev",
|
||||||
|
"InstantX/FLUX.1-dev-Controlnet-Union-alpha",
|
||||||
|
"alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta",
|
||||||
|
"jasperai/Flux.1-dev-Controlnet-Upscaler",
|
||||||
|
])
|
||||||
|
pipe = FluxImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
|
||||||
|
ControlNetConfigUnit(
|
||||||
|
processor_id="inpaint",
|
||||||
|
model_path="models/ControlNet/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta/diffusion_pytorch_model.safetensors",
|
||||||
|
scale=0.9
|
||||||
|
),
|
||||||
|
ControlNetConfigUnit(
|
||||||
|
processor_id="canny",
|
||||||
|
model_path="models/ControlNet/InstantX/FLUX.1-dev-Controlnet-Union-alpha/diffusion_pytorch_model.safetensors",
|
||||||
|
scale=0.5
|
||||||
|
),
|
||||||
|
])
|
||||||
|
|
||||||
|
image_1 = pipe(
|
||||||
|
prompt="a beautiful Asian woman and a cat on a bed. The woman wears a dress.",
|
||||||
|
height=1024, width=1024,
|
||||||
|
seed=100
|
||||||
|
)
|
||||||
|
image_1.save("image_13.jpg")
|
||||||
|
|
||||||
|
mask_global = np.zeros((1024, 1024, 3), dtype=np.uint8)
|
||||||
|
mask_global = Image.fromarray(mask_global)
|
||||||
|
mask_global.save("mask_13_global.jpg")
|
||||||
|
|
||||||
|
mask_1 = np.zeros((1024, 1024, 3), dtype=np.uint8)
|
||||||
|
mask_1[300:-100, 30: 450] = 255
|
||||||
|
mask_1 = Image.fromarray(mask_1)
|
||||||
|
mask_1.save("mask_13_1.jpg")
|
||||||
|
|
||||||
|
mask_2 = np.zeros((1024, 1024, 3), dtype=np.uint8)
|
||||||
|
mask_2[500:-100, -400:] = 255
|
||||||
|
mask_2[-200:-100, -500:-400] = 255
|
||||||
|
mask_2 = Image.fromarray(mask_2)
|
||||||
|
mask_2.save("mask_13_2.jpg")
|
||||||
|
|
||||||
|
image_2 = pipe(
|
||||||
|
prompt="a beautiful Asian woman and a cat on a bed. The woman wears a dress.",
|
||||||
|
controlnet_image=image_1, controlnet_inpaint_mask=mask_global,
|
||||||
|
local_prompts=["an orange cat, highly detailed", "a girl wearing a red camisole"], masks=[mask_1, mask_2], mask_scales=[10.0, 10.0],
|
||||||
|
height=1024, width=1024,
|
||||||
|
seed=101
|
||||||
|
)
|
||||||
|
image_2.save("image_14.jpg")
|
||||||
|
|
||||||
|
model_manager.load_lora("models/lora/FLUX-dev-lora-AntiBlur.safetensors", lora_alpha=2)
|
||||||
|
image_3 = pipe(
|
||||||
|
prompt="a beautiful Asian woman wearing a red camisole and an orange cat on a bed. clear background.",
|
||||||
|
negative_prompt="blur, blurry",
|
||||||
|
input_image=image_2, denoising_strength=0.7,
|
||||||
|
height=1024, width=1024,
|
||||||
|
cfg_scale=2.0, num_inference_steps=50,
|
||||||
|
seed=102
|
||||||
|
)
|
||||||
|
image_3.save("image_15.jpg")
|
||||||
|
|
||||||
|
pipe = FluxImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
|
||||||
|
ControlNetConfigUnit(
|
||||||
|
processor_id="tile",
|
||||||
|
model_path="models/ControlNet/jasperai/Flux.1-dev-Controlnet-Upscaler/diffusion_pytorch_model.safetensors",
|
||||||
|
scale=0.7
|
||||||
|
),
|
||||||
|
])
|
||||||
|
image_4 = pipe(
|
||||||
|
prompt="a beautiful Asian woman wearing a red camisole and an orange cat on a bed. highly detailed, delicate skin texture, clear background.",
|
||||||
|
controlnet_image=image_3.resize((2048, 2048)),
|
||||||
|
input_image=image_3.resize((2048, 2048)), denoising_strength=0.99,
|
||||||
|
height=2048, width=2048, tiled=True,
|
||||||
|
seed=103
|
||||||
|
)
|
||||||
|
image_4.save("image_16.jpg")
|
||||||
|
|
||||||
|
image_5 = pipe(
|
||||||
|
prompt="a beautiful Asian woman wearing a red camisole and an orange cat on a bed. highly detailed, delicate skin texture, clear background.",
|
||||||
|
controlnet_image=image_4.resize((4096, 4096)),
|
||||||
|
input_image=image_4.resize((4096, 4096)), denoising_strength=0.99,
|
||||||
|
height=4096, width=4096, tiled=True,
|
||||||
|
seed=104
|
||||||
|
)
|
||||||
|
image_5.save("image_17.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|
DiffSynth-Studio 和 ControlNet 的强大潜力已经展现在你的眼前了,快去体验 AIGC 技术的乐趣吧!
|
||||||
|
|
||||||
|
## 已支持的 FLUX ControlNet 列表
|
||||||
|
|
||||||
|
* https://modelscope.cn/models/InstantX/FLUX.1-dev-Controlnet-Union-alpha
|
||||||
|
* https://modelscope.cn/models/jasperai/Flux.1-dev-Controlnet-Depth
|
||||||
|
* https://modelscope.cn/models/jasperai/Flux.1-dev-Controlnet-Surface-Normals
|
||||||
|
* https://modelscope.cn/models/jasperai/Flux.1-dev-Controlnet-Upscaler
|
||||||
|
* https://modelscope.cn/models/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha
|
||||||
|
* https://modelscope.cn/models/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta
|
||||||
|
* https://modelscope.cn/models/Shakker-Labs/FLUX.1-dev-ControlNet-Depth
|
||||||
|
* https://modelscope.cn/models/Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro
|
||||||
78
docs/source/creating/PromptRefine.md
Normal file
78
docs/source/creating/PromptRefine.md
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
# 翻译、润色——提示词的魔法
|
||||||
|
|
||||||
|
在生成图像时,我们需要编写提示词,用来描述图像的内容。提示词会直接影响生成的效果,但提示词的编写也是一门学问,好的提示词可以生成具有高度美感的图像,我们提供了一系列模型来帮助用户处理提示词。
|
||||||
|
|
||||||
|
## 翻译
|
||||||
|
|
||||||
|
目前大多数文生图模型都是只支持英文提示词的,对于非英文母语的用户,使用起来有些困难,我们可以使用开源的翻译模型把提示词翻译成英文。在下面这个例子中,我们以“一个女孩”为提示词,使用模型 opus-mt-zh-en(可在 [HuggingFace](https://huggingface.co/Helsinki-NLP/opus-mt-zh-en) 或 [ModelScope](https://modelscope.cn/models/moxying/opus-mt-zh-en) 下载)进行翻译。
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDXLImagePipeline, Translator
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model_manager = ModelManager(
|
||||||
|
torch_dtype=torch.float16, device="cuda",
|
||||||
|
model_id_list=["BluePencilXL_v200", "opus-mt-zh-en"]
|
||||||
|
)
|
||||||
|
pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[Translator])
|
||||||
|
|
||||||
|
torch.manual_seed(0)
|
||||||
|
prompt = "一个女孩"
|
||||||
|
image = pipe(
|
||||||
|
prompt=prompt, negative_prompt="",
|
||||||
|
height=1024, width=1024, num_inference_steps=30
|
||||||
|
)
|
||||||
|
image.save("image_1.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
## 润色
|
||||||
|
|
||||||
|
详细的提示词可以生成细节更丰富的图像,我们可以使用提示词润色模型 BeautifulPrompt(可在 [HuggingFace](https://huggingface.co/alibaba-pai/pai-bloom-1b1-text2prompt-sd) 或 [ModelScope](https://modelscope.cn/models/AI-ModelScope/pai-bloom-1b1-text2prompt-sd) 下载)对简单的提示词进行润色,这个模型能够让整体画面风格更加华丽。
|
||||||
|
|
||||||
|
这个模块可以和翻译模块同时启用,但请注意顺序,先翻译,后润色。
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDXLImagePipeline, Translator, BeautifulPrompt
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model_manager = ModelManager(
|
||||||
|
torch_dtype=torch.float16, device="cuda",
|
||||||
|
model_id_list=["BluePencilXL_v200", "opus-mt-zh-en", "BeautifulPrompt"]
|
||||||
|
)
|
||||||
|
pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[Translator, BeautifulPrompt])
|
||||||
|
|
||||||
|
torch.manual_seed(0)
|
||||||
|
prompt = "一个女孩"
|
||||||
|
image = pipe(
|
||||||
|
prompt=prompt, negative_prompt="",
|
||||||
|
height=1024, width=1024, num_inference_steps=30
|
||||||
|
)
|
||||||
|
image.save("image_2.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
我们还内置了一个通义千问模型,这个模型可以一步到位地完成提示词的翻译和润色工作。
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDXLImagePipeline, QwenPrompt
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model_manager = ModelManager(
|
||||||
|
torch_dtype=torch.float16, device="cuda",
|
||||||
|
model_id_list=["BluePencilXL_v200", "QwenPrompt"]
|
||||||
|
)
|
||||||
|
pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[QwenPrompt])
|
||||||
|
|
||||||
|
torch.manual_seed(0)
|
||||||
|
prompt = "一个女孩"
|
||||||
|
image = pipe(
|
||||||
|
prompt=prompt, negative_prompt="",
|
||||||
|
height=1024, width=1024, num_inference_steps=30
|
||||||
|
)
|
||||||
|
image.save("image_3.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|

|
||||||
95
docs/source/creating/ToonShading.md
Normal file
95
docs/source/creating/ToonShading.md
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
# 当图像模型遇见 AnimateDiff——模型组合技术
|
||||||
|
|
||||||
|
我们已经领略到了 Stable Diffusion 模型及其生态模型的强大图像生成能力,现在我们引入一个新的模块:AnimateDiff,这样一来就可以把图像模型的能力迁移到视频中。在本篇文章中,我们为您展示基于 DiffSynth-Studio 搭建的动漫风格视频渲染方案:Diffutoon。
|
||||||
|
|
||||||
|
## 下载模型
|
||||||
|
|
||||||
|
接下来的例子会用到很多模型,我们先把它们下载好。
|
||||||
|
|
||||||
|
* 一个动漫风格的 Stable Diffusion 架构模型
|
||||||
|
* 两个 ControlNet 模型
|
||||||
|
* 一个 Textual Inversion 模型
|
||||||
|
* 一个 AnimateDiff 模型
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import download_models
|
||||||
|
|
||||||
|
download_models([
|
||||||
|
"AingDiffusion_v12",
|
||||||
|
"AnimateDiff_v2",
|
||||||
|
"ControlNet_v11p_sd15_lineart",
|
||||||
|
"ControlNet_v11f1e_sd15_tile",
|
||||||
|
"TextualInversion_VeryBadImageNegative_v1.3"
|
||||||
|
])
|
||||||
|
```
|
||||||
|
|
||||||
|
## 下载视频
|
||||||
|
|
||||||
|
你可以随意选择任何你喜欢的视频,我们使用[这个视频](https://www.bilibili.com/video/BV1iG411a7sQ)作为演示,你可以通过以下命令下载这个视频文件,但请注意,在没有获得视频原作者的商用版权时,请不要将其用作商业用途。
|
||||||
|
|
||||||
|
```
|
||||||
|
modelscope download --dataset Artiprocher/examples_in_diffsynth data/examples/diffutoon/input_video.mp4 --local_dir ./
|
||||||
|
```
|
||||||
|
|
||||||
|
## 生成动漫
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video
|
||||||
|
import torch
|
||||||
|
|
||||||
|
# Load models
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
|
||||||
|
model_manager.load_models([
|
||||||
|
"models/stable_diffusion/aingdiffusion_v12.safetensors",
|
||||||
|
"models/AnimateDiff/mm_sd_v15_v2.ckpt",
|
||||||
|
"models/ControlNet/control_v11p_sd15_lineart.pth",
|
||||||
|
"models/ControlNet/control_v11f1e_sd15_tile.pth",
|
||||||
|
])
|
||||||
|
|
||||||
|
# Build pipeline
|
||||||
|
pipe = SDVideoPipeline.from_model_manager(
|
||||||
|
model_manager,
|
||||||
|
[
|
||||||
|
ControlNetConfigUnit(
|
||||||
|
processor_id="tile",
|
||||||
|
model_path="models/ControlNet/control_v11f1e_sd15_tile.pth",
|
||||||
|
scale=0.5
|
||||||
|
),
|
||||||
|
ControlNetConfigUnit(
|
||||||
|
processor_id="lineart",
|
||||||
|
model_path="models/ControlNet/control_v11p_sd15_lineart.pth",
|
||||||
|
scale=0.5
|
||||||
|
)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
pipe.prompter.load_textual_inversions(["models/textual_inversion/verybadimagenegative_v1.3.pt"])
|
||||||
|
|
||||||
|
# Load video
|
||||||
|
video = VideoData(
|
||||||
|
video_file="data/examples/diffutoon/input_video.mp4",
|
||||||
|
height=1536, width=1536
|
||||||
|
)
|
||||||
|
input_video = [video[i] for i in range(30)]
|
||||||
|
|
||||||
|
# Generate
|
||||||
|
torch.manual_seed(0)
|
||||||
|
output_video = pipe(
|
||||||
|
prompt="best quality, perfect anime illustration, light, a girl is dancing, smile, solo",
|
||||||
|
negative_prompt="verybadimagenegative_v1.3",
|
||||||
|
cfg_scale=7, clip_skip=2,
|
||||||
|
input_frames=input_video, denoising_strength=1.0,
|
||||||
|
controlnet_frames=input_video, num_frames=len(input_video),
|
||||||
|
num_inference_steps=10, height=1536, width=1536,
|
||||||
|
animatediff_batch_size=16, animatediff_stride=8,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Save video
|
||||||
|
save_video(output_video, "output_video.mp4", fps=30)
|
||||||
|
```
|
||||||
|
|
||||||
|
## 效果展示
|
||||||
|
|
||||||
|
<video width="512" height="256" controls>
|
||||||
|
<source src="https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/b54c05c5-d747-4709-be5e-b39af82404dd" type="video/mp4">
|
||||||
|
您的浏览器不支持Video标签。
|
||||||
|
</video>
|
||||||
101
docs/source/finetune/overview.md
Normal file
101
docs/source/finetune/overview.md
Normal file
@@ -0,0 +1,101 @@
|
|||||||
|
# 训练框架
|
||||||
|
|
||||||
|
我们实现了一个用于文本到图像扩散模型的训练框架,使用户能够轻松地使用我们的框架训练 LoRA 模型。我们提供的脚本具有以下特点:
|
||||||
|
|
||||||
|
* **功能全面**:我们的训练框架支持多GPU和多机器配置,便于使用 DeepSpeed 加速,并包括梯度检查点优化,适用于内存需求较大的模型。
|
||||||
|
* **代码简洁**:我们避免了大块复杂的代码。通用模块实现于 `diffsynth/trainers/text_to_image.py` 中,而模型特定的训练脚本仅包含与模型架构相关的最少代码,便于学术研究人员使用。
|
||||||
|
* **模块化设计**:基于通用的 Pytorch-Lightning 框架,我们的训练框架在功能上是解耦的,允许开发者通过修改我们的脚本轻松引入额外的训练技术,以满足他们的需求。
|
||||||
|
|
||||||
|
LoRA 微调的图像示例。提示词为 "一只小狗蹦蹦跳跳,周围是姹紫嫣红的鲜花,远处是山脉"(针对中文模型)或 "a dog is jumping, flowers around the dog, the background is mountains and clouds"(针对英文模型)。
|
||||||
|
|
||||||
|
||<div style="width:150px">FLUX.1-dev</div>|<div style="width:150px">Kolors</div>|<div style="width:150px">Stable Diffusion 3</div>|<div style="width:150px">Hunyuan-DiT</div>|
|
||||||
|
|-|:-:|:-:|:-:|:-:|
|
||||||
|
|Without LoRA|<img src="https://github.com/user-attachments/assets/df62cef6-d54f-4e3d-a602-5dd290079d49" width="150" alt="image_without_lora">|<img src="https://github.com/modelscope/DiffSynth-Studio/assets/35051019/9d79ed7a-e8cf-4d98-800a-f182809db318" width="150" alt="image_without_lora">|<img src="https://github.com/modelscope/DiffSynth-Studio/assets/35051019/ddb834a5-6366-412b-93dc-6d957230d66e" width="150" alt="image_without_lora">|<img src="https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/1aa21de5-a992-4b66-b14f-caa44e08876e" width="150" alt="image_without_lora">|
|
||||||
|
|With LoRA|<img src="https://github.com/user-attachments/assets/4fd39890-0291-4d19-8a88-d70d0ae18533" width="150" alt="image_with_lora">|<img src="https://github.com/modelscope/DiffSynth-Studio/assets/35051019/02f62323-6ee5-4788-97a1-549732dbe4f0" width="150" alt="image_with_lora">|<img src="https://github.com/modelscope/DiffSynth-Studio/assets/35051019/8e7b2888-d874-4da4-a75b-11b6b214b9bf" width="150" alt="image_with_lora">|<img src="https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/83a0a41a-691f-4610-8e7b-d8e17c50a282" width="150" alt="image_with_lora">|
|
||||||
|
|
||||||
|
|
||||||
|
## 安装额外包
|
||||||
|
|
||||||
|
```
|
||||||
|
pip install peft lightning
|
||||||
|
```
|
||||||
|
|
||||||
|
## 准备数据集
|
||||||
|
|
||||||
|
我们提供了一个[示例数据集](https://modelscope.cn/datasets/buptwq/lora-stable-diffusion-finetune/files)。你需要将训练数据集按照如下形式组织:
|
||||||
|
|
||||||
|
```
|
||||||
|
data/dog/
|
||||||
|
└── train
|
||||||
|
├── 00.jpg
|
||||||
|
├── 01.jpg
|
||||||
|
├── 02.jpg
|
||||||
|
├── 03.jpg
|
||||||
|
├── 04.jpg
|
||||||
|
└── metadata.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
`metadata.csv`:
|
||||||
|
|
||||||
|
```
|
||||||
|
file_name,text
|
||||||
|
00.jpg,a dog
|
||||||
|
01.jpg,a dog
|
||||||
|
02.jpg,a dog
|
||||||
|
03.jpg,a dog
|
||||||
|
04.jpg,a dog
|
||||||
|
```
|
||||||
|
|
||||||
|
请注意,如果模型是中文模型(例如,Hunyuan-DiT 和 Kolors),我们建议在数据集中使用中文文本。例如:
|
||||||
|
|
||||||
|
```
|
||||||
|
file_name,text
|
||||||
|
00.jpg,一只小狗
|
||||||
|
01.jpg,一只小狗
|
||||||
|
02.jpg,一只小狗
|
||||||
|
03.jpg,一只小狗
|
||||||
|
04.jpg,一只小狗
|
||||||
|
```
|
||||||
|
|
||||||
|
## 训练 LoRA 模型
|
||||||
|
|
||||||
|
通用参数选项:
|
||||||
|
|
||||||
|
```
|
||||||
|
--lora_target_modules LORA_TARGET_MODULES
|
||||||
|
LoRA 模块所在的层。
|
||||||
|
--dataset_path DATASET_PATH
|
||||||
|
数据集的路径。
|
||||||
|
--output_path OUTPUT_PATH
|
||||||
|
模型保存路径。
|
||||||
|
--steps_per_epoch STEPS_PER_EPOCH
|
||||||
|
每个周期的步数。
|
||||||
|
--height HEIGHT 图像高度。
|
||||||
|
--width WIDTH 图像宽度。
|
||||||
|
--center_crop 是否将输入图像中心裁剪到指定分辨率。如果未设置,图像将被随机裁剪。图像会在裁剪前先调整到指定分辨率。
|
||||||
|
--random_flip 是否随机水平翻转图像。
|
||||||
|
--batch_size BATCH_SIZE
|
||||||
|
训练数据加载器的批量大小(每设备)。
|
||||||
|
--dataloader_num_workers DATALOADER_NUM_WORKERS
|
||||||
|
数据加载使用的子进程数量。0 表示数据将在主进程中加载。
|
||||||
|
--precision {32,16,16-mixed}
|
||||||
|
训练精度。
|
||||||
|
--learning_rate LEARNING_RATE
|
||||||
|
学习率。
|
||||||
|
--lora_rank LORA_RANK
|
||||||
|
LoRA 更新矩阵的维度。
|
||||||
|
--lora_alpha LORA_ALPHA
|
||||||
|
LoRA 更新矩阵的权重。
|
||||||
|
--use_gradient_checkpointing
|
||||||
|
是否使用梯度检查点。
|
||||||
|
--accumulate_grad_batches ACCUMULATE_GRAD_BATCHES
|
||||||
|
梯度累积的批次数量。
|
||||||
|
--training_strategy {auto,deepspeed_stage_1,deepspeed_stage_2,deepspeed_stage_3}
|
||||||
|
训练策略。
|
||||||
|
--max_epochs MAX_EPOCHS
|
||||||
|
训练轮数。
|
||||||
|
--modelscope_model_id MODELSCOPE_MODEL_ID
|
||||||
|
ModelScope 上的模型 ID (https://www.modelscope.cn/)。如果提供模型 ID,模型将自动上传到 ModelScope。
|
||||||
|
--modelscope_access_token MODELSCOPE_ACCESS_TOKEN
|
||||||
|
在 ModelScope (https://www.modelscope.cn/) 上获取访问密钥。您需要此密钥将模型上传到 ModelScope。
|
||||||
|
```
|
||||||
77
docs/source/finetune/train_flux_lora.md
Normal file
77
docs/source/finetune/train_flux_lora.md
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
# 训练 FLUX LoRA
|
||||||
|
|
||||||
|
以下文件将会被用于构建 FLUX 模型。 你可以从[huggingface](https://huggingface.co/black-forest-labs/FLUX.1-dev)或[modelscope](https://www.modelscope.cn/models/ai-modelscope/flux.1-dev)下载,也可以使用以下代码下载这些文件:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import download_models
|
||||||
|
|
||||||
|
download_models(["FLUX.1-dev"])
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
models/FLUX/
|
||||||
|
└── FLUX.1-dev
|
||||||
|
├── ae.safetensors
|
||||||
|
├── flux1-dev.safetensors
|
||||||
|
├── text_encoder
|
||||||
|
│ └── model.safetensors
|
||||||
|
└── text_encoder_2
|
||||||
|
├── config.json
|
||||||
|
├── model-00001-of-00002.safetensors
|
||||||
|
├── model-00002-of-00002.safetensors
|
||||||
|
└── model.safetensors.index.json
|
||||||
|
```
|
||||||
|
|
||||||
|
使用以下命令启动训练任务(需要39G显存):
|
||||||
|
|
||||||
|
```
|
||||||
|
CUDA_VISIBLE_DEVICES="0" python examples/train/flux/train_flux_lora.py \
|
||||||
|
--pretrained_text_encoder_path models/FLUX/FLUX.1-dev/text_encoder/model.safetensors \
|
||||||
|
--pretrained_text_encoder_2_path models/FLUX/FLUX.1-dev/text_encoder_2 \
|
||||||
|
--pretrained_dit_path models/FLUX/FLUX.1-dev/flux1-dev.safetensors \
|
||||||
|
--pretrained_vae_path models/FLUX/FLUX.1-dev/ae.safetensors \
|
||||||
|
--dataset_path data/dog \
|
||||||
|
--output_path ./models \
|
||||||
|
--max_epochs 1 \
|
||||||
|
--steps_per_epoch 100 \
|
||||||
|
--height 1024 \
|
||||||
|
--width 1024 \
|
||||||
|
--center_crop \
|
||||||
|
--precision "bf16" \
|
||||||
|
--learning_rate 1e-4 \
|
||||||
|
--lora_rank 16 \
|
||||||
|
--lora_alpha 16 \
|
||||||
|
--use_gradient_checkpointing \
|
||||||
|
--align_to_opensource_format
|
||||||
|
```
|
||||||
|
|
||||||
|
通过添加参数 `--quantize "float8_e4m3fn"`,你可以节省大约 10G 的显存。
|
||||||
|
|
||||||
|
**`--align_to_opensource_format` 表示此脚本将以开源格式导出 LoRA 权重。此格式可以在 DiffSynth-Studio 和其他代码库中加载。**
|
||||||
|
|
||||||
|
有关参数的更多信息,请使用 `python examples/train/flux/train_flux_lora.py -h` 查看详细信息。
|
||||||
|
|
||||||
|
训练完成后,使用 `model_manager.load_lora` 来加载 LoRA 以进行推理。
|
||||||
|
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, FluxImagePipeline
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cuda",
|
||||||
|
file_path_list=[
|
||||||
|
"models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
|
||||||
|
"models/FLUX/FLUX.1-dev/text_encoder_2",
|
||||||
|
"models/FLUX/FLUX.1-dev/ae.safetensors",
|
||||||
|
"models/FLUX/FLUX.1-dev/flux1-dev.safetensors"
|
||||||
|
])
|
||||||
|
model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
|
||||||
|
pipe = FluxImagePipeline.from_model_manager(model_manager)
|
||||||
|
|
||||||
|
torch.manual_seed(0)
|
||||||
|
image = pipe(
|
||||||
|
prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
|
||||||
|
num_inference_steps=30, embedded_guidance=3.5
|
||||||
|
)
|
||||||
|
image.save("image_with_lora.jpg")
|
||||||
|
```
|
||||||
72
docs/source/finetune/train_hunyuan_dit_lora.md
Normal file
72
docs/source/finetune/train_hunyuan_dit_lora.md
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
# 训练 Hunyuan-DiT LoRA
|
||||||
|
|
||||||
|
构建 Hunyuan DiT 需要四个文件。你可以从 [HuggingFace](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT) 或 [ModelScope](https://www.modelscope.cn/models/modelscope/HunyuanDiT/summary) 下载这些文件。你可以使用以下代码下载这些文件:
|
||||||
|
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import download_models
|
||||||
|
|
||||||
|
download_models(["HunyuanDiT"])
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
models/HunyuanDiT/
|
||||||
|
├── Put Hunyuan DiT checkpoints here.txt
|
||||||
|
└── t2i
|
||||||
|
├── clip_text_encoder
|
||||||
|
│ └── pytorch_model.bin
|
||||||
|
├── model
|
||||||
|
│ └── pytorch_model_ema.pt
|
||||||
|
├── mt5
|
||||||
|
│ └── pytorch_model.bin
|
||||||
|
└── sdxl-vae-fp16-fix
|
||||||
|
└── diffusion_pytorch_model.bin
|
||||||
|
```
|
||||||
|
|
||||||
|
使用以下命令启动训练任务:
|
||||||
|
|
||||||
|
```
|
||||||
|
CUDA_VISIBLE_DEVICES="0" python examples/train/hunyuan_dit/train_hunyuan_dit_lora.py \
|
||||||
|
--pretrained_path models/HunyuanDiT/t2i \
|
||||||
|
--dataset_path data/dog \
|
||||||
|
--output_path ./models \
|
||||||
|
--max_epochs 1 \
|
||||||
|
--steps_per_epoch 500 \
|
||||||
|
--height 1024 \
|
||||||
|
--width 1024 \
|
||||||
|
--center_crop \
|
||||||
|
--precision "16-mixed" \
|
||||||
|
--learning_rate 1e-4 \
|
||||||
|
--lora_rank 4 \
|
||||||
|
--lora_alpha 4 \
|
||||||
|
--use_gradient_checkpointing
|
||||||
|
```
|
||||||
|
|
||||||
|
有关参数的更多信息,请使用 `python examples/train/hunyuan_dit/train_hunyuan_dit_lora.py -h` 查看详细信息。
|
||||||
|
|
||||||
|
训练完成后,使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
|
||||||
|
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, HunyuanDiTImagePipeline
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
|
||||||
|
file_path_list=[
|
||||||
|
"models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin",
|
||||||
|
"models/HunyuanDiT/t2i/model/pytorch_model_ema.pt",
|
||||||
|
"models/HunyuanDiT/t2i/mt5/pytorch_model.bin",
|
||||||
|
"models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin"
|
||||||
|
])
|
||||||
|
model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
|
||||||
|
pipe = HunyuanDiTImagePipeline.from_model_manager(model_manager)
|
||||||
|
|
||||||
|
torch.manual_seed(0)
|
||||||
|
image = pipe(
|
||||||
|
prompt="一只小狗蹦蹦跳跳,周围是姹紫嫣红的鲜花,远处是山脉",
|
||||||
|
negative_prompt="",
|
||||||
|
cfg_scale=7.5,
|
||||||
|
num_inference_steps=100, width=1024, height=1024,
|
||||||
|
)
|
||||||
|
image.save("image_with_lora.jpg")
|
||||||
|
```
|
||||||
78
docs/source/finetune/train_kolors_lora.md
Normal file
78
docs/source/finetune/train_kolors_lora.md
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
# 训练 Kolors LoRA
|
||||||
|
|
||||||
|
以下文件将用于构建 Kolors。你可以从 [HuggingFace](https://huggingface.co/Kwai-Kolors/Kolors) 或 [ModelScope](https://modelscope.cn/models/Kwai-Kolors/Kolors) 下载 Kolors。由于精度溢出问题,我们需要下载额外的 VAE 模型(从 [HuggingFace](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) 或 [ModelScope](https://modelscope.cn/models/AI-ModelScope/sdxl-vae-fp16-fix))。你可以使用以下代码下载这些文件:
|
||||||
|
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import download_models
|
||||||
|
|
||||||
|
download_models(["Kolors", "SDXL-vae-fp16-fix"])
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
models
|
||||||
|
├── kolors
|
||||||
|
│ └── Kolors
|
||||||
|
│ ├── text_encoder
|
||||||
|
│ │ ├── config.json
|
||||||
|
│ │ ├── pytorch_model-00001-of-00007.bin
|
||||||
|
│ │ ├── pytorch_model-00002-of-00007.bin
|
||||||
|
│ │ ├── pytorch_model-00003-of-00007.bin
|
||||||
|
│ │ ├── pytorch_model-00004-of-00007.bin
|
||||||
|
│ │ ├── pytorch_model-00005-of-00007.bin
|
||||||
|
│ │ ├── pytorch_model-00006-of-00007.bin
|
||||||
|
│ │ ├── pytorch_model-00007-of-00007.bin
|
||||||
|
│ │ └── pytorch_model.bin.index.json
|
||||||
|
│ ├── unet
|
||||||
|
│ │ └── diffusion_pytorch_model.safetensors
|
||||||
|
│ └── vae
|
||||||
|
│ └── diffusion_pytorch_model.safetensors
|
||||||
|
└── sdxl-vae-fp16-fix
|
||||||
|
└── diffusion_pytorch_model.safetensors
|
||||||
|
```
|
||||||
|
|
||||||
|
使用下面的命令启动训练任务:
|
||||||
|
|
||||||
|
```
|
||||||
|
CUDA_VISIBLE_DEVICES="0" python examples/train/kolors/train_kolors_lora.py \
|
||||||
|
--pretrained_unet_path models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors \
|
||||||
|
--pretrained_text_encoder_path models/kolors/Kolors/text_encoder \
|
||||||
|
--pretrained_fp16_vae_path models/sdxl-vae-fp16-fix/diffusion_pytorch_model.safetensors \
|
||||||
|
--dataset_path data/dog \
|
||||||
|
--output_path ./models \
|
||||||
|
--max_epochs 1 \
|
||||||
|
--steps_per_epoch 500 \
|
||||||
|
--height 1024 \
|
||||||
|
--width 1024 \
|
||||||
|
--center_crop \
|
||||||
|
--precision "16-mixed" \
|
||||||
|
--learning_rate 1e-4 \
|
||||||
|
--lora_rank 4 \
|
||||||
|
--lora_alpha 4 \
|
||||||
|
--use_gradient_checkpointing
|
||||||
|
```
|
||||||
|
|
||||||
|
有关参数的更多信息,请使用 `python examples/train/kolors/train_kolors_lora.py -h` 查看详细信息。
|
||||||
|
|
||||||
|
训练完成后,使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
```python
from diffsynth import ModelManager, SDXLImagePipeline
import torch

model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
                             file_path_list=[
                                 "models/kolors/Kolors/text_encoder",
                                 "models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors",
                                 "models/kolors/Kolors/vae/diffusion_pytorch_model.safetensors"
                             ])
model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
pipe = SDXLImagePipeline.from_model_manager(model_manager)

torch.manual_seed(0)
image = pipe(
    prompt="一只小狗蹦蹦跳跳,周围是姹紫嫣红的鲜花,远处是山脉",
    negative_prompt="",
    cfg_scale=7.5,
    num_inference_steps=100, width=1024, height=1024,
)
image.save("image_with_lora.jpg")
```
|
||||||
59
docs/source/finetune/train_sd3_lora.md
Normal file
59
docs/source/finetune/train_sd3_lora.md
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
# 训练 Stable Diffusion 3 LoRA
|
||||||
|
|
||||||
|
训练脚本只需要一个文件。你可以使用 [`sd3_medium_incl_clips.safetensors`](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips.safetensors)(没有 T5 Encoder)或 [`sd3_medium_incl_clips_t5xxlfp16.safetensors`](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips_t5xxlfp16.safetensors)(有 T5 Encoder)。请使用以下代码下载这些文件:
|
||||||
|
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import download_models
|
||||||
|
|
||||||
|
download_models(["StableDiffusion3", "StableDiffusion3_without_T5"])
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
models/stable_diffusion_3/
|
||||||
|
├── Put Stable Diffusion 3 checkpoints here.txt
|
||||||
|
├── sd3_medium_incl_clips.safetensors
|
||||||
|
└── sd3_medium_incl_clips_t5xxlfp16.safetensors
|
||||||
|
```
|
||||||
|
|
||||||
|
使用下面的命令启动训练任务:
|
||||||
|
|
||||||
|
```
|
||||||
|
CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_3/train_sd3_lora.py \
|
||||||
|
--pretrained_path models/stable_diffusion_3/sd3_medium_incl_clips.safetensors \
|
||||||
|
--dataset_path data/dog \
|
||||||
|
--output_path ./models \
|
||||||
|
--max_epochs 1 \
|
||||||
|
--steps_per_epoch 500 \
|
||||||
|
--height 1024 \
|
||||||
|
--width 1024 \
|
||||||
|
--center_crop \
|
||||||
|
--precision "16-mixed" \
|
||||||
|
--learning_rate 1e-4 \
|
||||||
|
--lora_rank 4 \
|
||||||
|
--lora_alpha 4 \
|
||||||
|
--use_gradient_checkpointing
|
||||||
|
```
|
||||||
|
|
||||||
|
有关参数的更多信息,请使用 `python examples/train/stable_diffusion_3/train_sd3_lora.py -h` 查看详细信息。
|
||||||
|
|
||||||
|
训练完成后,使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SD3ImagePipeline
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
|
||||||
|
file_path_list=["models/stable_diffusion_3/sd3_medium_incl_clips.safetensors"])
|
||||||
|
model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
|
||||||
|
pipe = SD3ImagePipeline.from_model_manager(model_manager)
|
||||||
|
|
||||||
|
torch.manual_seed(0)
|
||||||
|
image = pipe(
|
||||||
|
prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
|
||||||
|
negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
|
||||||
|
cfg_scale=7.5,
|
||||||
|
num_inference_steps=100, width=1024, height=1024,
|
||||||
|
)
|
||||||
|
image.save("image_with_lora.jpg")
|
||||||
|
```
|
||||||
59
docs/source/finetune/train_sd_lora.md
Normal file
59
docs/source/finetune/train_sd_lora.md
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
# 训练 Stable Diffusion LoRA
|
||||||
|
|
||||||
|
训练脚本只需要一个文件。我们支持 [CivitAI](https://civitai.com/) 中的主流检查点。默认情况下,我们使用基础的 Stable Diffusion v1.5。你可以从 [HuggingFace](https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors) 或 [ModelScope](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-v1-5/resolve/master/v1-5-pruned-emaonly.safetensors) 下载。你可以使用以下代码下载这个文件:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import download_models
|
||||||
|
|
||||||
|
download_models(["StableDiffusion_v15"])
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
models/stable_diffusion
|
||||||
|
├── Put Stable Diffusion checkpoints here.txt
|
||||||
|
└── v1-5-pruned-emaonly.safetensors
|
||||||
|
```
|
||||||
|
|
||||||
|
使用以下命令启动训练任务:
|
||||||
|
|
||||||
|
```
|
||||||
|
CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion/train_sd_lora.py \
|
||||||
|
--pretrained_path models/stable_diffusion/v1-5-pruned-emaonly.safetensors \
|
||||||
|
--dataset_path data/dog \
|
||||||
|
--output_path ./models \
|
||||||
|
--max_epochs 1 \
|
||||||
|
--steps_per_epoch 500 \
|
||||||
|
--height 512 \
|
||||||
|
--width 512 \
|
||||||
|
--center_crop \
|
||||||
|
--precision "16-mixed" \
|
||||||
|
--learning_rate 1e-4 \
|
||||||
|
--lora_rank 4 \
|
||||||
|
--lora_alpha 4 \
|
||||||
|
--use_gradient_checkpointing
|
||||||
|
```
|
||||||
|
|
||||||
|
有关参数的更多信息,请使用 `python examples/train/stable_diffusion/train_sd_lora.py -h` 查看详细信息。
|
||||||
|
|
||||||
|
训练完成后,使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDImagePipeline
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
|
||||||
|
file_path_list=["models/stable_diffusion/v1-5-pruned-emaonly.safetensors"])
|
||||||
|
model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
|
||||||
|
pipe = SDImagePipeline.from_model_manager(model_manager)
|
||||||
|
|
||||||
|
torch.manual_seed(0)
|
||||||
|
image = pipe(
|
||||||
|
prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
|
||||||
|
negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
|
||||||
|
cfg_scale=7.5,
|
||||||
|
num_inference_steps=100, width=512, height=512,
|
||||||
|
)
|
||||||
|
image.save("image_with_lora.jpg")
|
||||||
|
```
|
||||||
57
docs/source/finetune/train_sdxl_lora.md
Normal file
57
docs/source/finetune/train_sdxl_lora.md
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
# 训练 Stable Diffusion XL LoRA
|
||||||
|
|
||||||
|
训练脚本只需要一个文件。我们支持 [CivitAI](https://civitai.com/) 中的主流检查点。默认情况下,我们使用基础的 Stable Diffusion XL。你可以从 [HuggingFace](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors) 或 [ModelScope](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-xl-base-1.0/resolve/master/sd_xl_base_1.0.safetensors) 下载。也可以使用以下代码下载这个文件:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import download_models
|
||||||
|
|
||||||
|
download_models(["StableDiffusionXL_v1"])
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
models/stable_diffusion_xl
|
||||||
|
├── Put Stable Diffusion XL checkpoints here.txt
|
||||||
|
└── sd_xl_base_1.0.safetensors
|
||||||
|
```
|
||||||
|
|
||||||
|
我们观察到 Stable Diffusion XL 在 float16 精度下会出现数值精度溢出,因此我们建议用户使用 float32 精度训练,使用以下命令启动训练任务:
|
||||||
|
|
||||||
|
```
|
||||||
|
CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_xl/train_sdxl_lora.py \
|
||||||
|
--pretrained_path models/stable_diffusion_xl/sd_xl_base_1.0.safetensors \
|
||||||
|
--dataset_path data/dog \
|
||||||
|
--output_path ./models \
|
||||||
|
--max_epochs 1 \
|
||||||
|
--steps_per_epoch 500 \
|
||||||
|
--height 1024 \
|
||||||
|
--width 1024 \
|
||||||
|
--center_crop \
|
||||||
|
--precision "32" \
|
||||||
|
--learning_rate 1e-4 \
|
||||||
|
--lora_rank 4 \
|
||||||
|
--lora_alpha 4 \
|
||||||
|
--use_gradient_checkpointing
|
||||||
|
```
|
||||||
|
|
||||||
|
有关参数的更多信息,请使用 `python examples/train/stable_diffusion_xl/train_sdxl_lora.py -h` 查看详细信息。
|
||||||
|
|
||||||
|
训练完成后,使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDXLImagePipeline
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
|
||||||
|
file_path_list=["models/stable_diffusion_xl/sd_xl_base_1.0.safetensors"])
|
||||||
|
model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
|
||||||
|
pipe = SDXLImagePipeline.from_model_manager(model_manager)
|
||||||
|
|
||||||
|
torch.manual_seed(0)
|
||||||
|
image = pipe(
|
||||||
|
prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
|
||||||
|
negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
|
||||||
|
cfg_scale=7.5,
|
||||||
|
num_inference_steps=100, width=1024, height=1024,
|
||||||
|
)
|
||||||
|
image.save("image_with_lora.jpg")
|
||||||
|
```
|
||||||
66
docs/source/index.rst
Normal file
66
docs/source/index.rst
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
.. DiffSynth-Studio documentation master file, created by
|
||||||
|
sphinx-quickstart on Thu Sep 5 16:39:24 2024.
|
||||||
|
You can adapt this file completely to your liking, but it should at least
|
||||||
|
contain the root `toctree` directive.
|
||||||
|
|
||||||
|
DiffSynth-Studio 文档
|
||||||
|
==============================
|
||||||
|
|
||||||
|
欢迎来到 Diffusion 的魔法世界,这里是 DiffSynth-Studio,一个开源的 Diffusion 引擎,我们希望通过这样一个开源项目,构建统一、互联、创新的 Diffusion 模型生态!
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
:caption: 简介
|
||||||
|
|
||||||
|
introduction/introduction.md
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
:caption: 快速开始
|
||||||
|
|
||||||
|
tutorial/ASimpleExample.md
|
||||||
|
tutorial/Installation.md
|
||||||
|
tutorial/DownloadModels.md
|
||||||
|
tutorial/Models.md
|
||||||
|
tutorial/Pipelines.md
|
||||||
|
tutorial/Extensions.md
|
||||||
|
tutorial/Schedulers.md
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
:caption: 开启创作之旅
|
||||||
|
|
||||||
|
creating/BasicImageSynthesis.md
|
||||||
|
creating/AdaptersForImageSynthesis.md
|
||||||
|
creating/MultiControlnet.md
|
||||||
|
creating/ToonShading.md
|
||||||
|
creating/PromptRefine.md
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
:caption: 模型列表
|
||||||
|
|
||||||
|
model/StableDiffusion.md
|
||||||
|
model/StableDiffusionXL.md
|
||||||
|
model/ControlNet.md
|
||||||
|
model/AnimateDiff.md
|
||||||
|
model/IPAdapter.md
|
||||||
|
model/HunyuanDiT.md
|
||||||
|
model/Kolors.md
|
||||||
|
model/StableDiffusion3.md
|
||||||
|
model/StableVideoDiffusion.md
|
||||||
|
model/ExVideo.md
|
||||||
|
model/FLUX.md
|
||||||
|
model/CogVideo.md
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
:caption: 微调
|
||||||
|
|
||||||
|
finetune/overview.md
|
||||||
|
finetune/train_flux_lora.md
|
||||||
|
finetune/train_kolors_lora.md
|
||||||
|
finetune/train_sd3_lora.md
|
||||||
|
finetune/train_hunyuan_dit_lora.md
|
||||||
|
finetune/train_sdxl_lora.md
|
||||||
|
finetune/train_sd_lora.md
|
||||||
77
docs/source/introduction/introduction.md
Normal file
77
docs/source/introduction/introduction.md
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
# 欢迎来到 Diffusion 的魔法世界
|
||||||
|
|
||||||
|
欢迎来到 Diffusion 的魔法世界,这里是 DiffSynth-Studio,一个开源的 Diffusion 引擎,我们希望通过这样一个开源项目,构建统一、互联、创新的 Diffusion 模型生态!
|
||||||
|
|
||||||
|
## 统一
|
||||||
|
|
||||||
|
目前的开源 Diffusion 模型结构五花八门,以文生图模型为例,有 Stable Diffusion、Kolors、FLUX 等。
|
||||||
|
|
||||||
|
|<div style="width:150px">FLUX</div>|<div style="width:150px">Stable Diffusion 3</div>|<div style="width:150px">Kolors</div> |<div style="width:150px">Hunyuan-DiT</div>|<div style="width:150px">Stable Diffusion</div>|<div style="width:150px">Stable Diffusion XL</div>|
|
||||||
|
|:-:|:-:|:-:|:-:|:-:|:-:|
|
||||||
|
| <img src="https://github.com/user-attachments/assets/984561e9-553d-4952-9443-79ce144f379f" width="150" /> | <img src="https://github.com/modelscope/DiffSynth-Studio/assets/35051019/4df346db-6f91-420a-b4c1-26e205376098" width="150" /> | <img src="https://github.com/modelscope/DiffSynth-Studio/assets/35051019/53ef6f41-da11-4701-8665-9f64392607bf" width="150" /> | <img src="https://github.com/modelscope/DiffSynth-Studio/assets/35051019/60b022c8-df3f-4541-95ab-bf39f2fa8bb5" width="150" /> | <img src="https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/6fc84611-8da6-4a1f-8fee-9a34eba3b4a5" width="150" /> | <img src="https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/67687748-e738-438c-aee5-96096f09ac90" width="150" /> |
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
我们设计了统一的框架,实现了通用的增强模块,例如提示词分区控制技术。
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<video width="512" height="256" controls>
|
||||||
|
<source src="https://github.com/user-attachments/assets/59613157-de51-4109-99b3-97cbffd88076" type="video/mp4">
|
||||||
|
您的浏览器不支持Video标签。
|
||||||
|
</video>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
以及一站式的训练脚本。
|
||||||
|
|
||||||
|
||<div style="width:150px">FLUX.1-dev</div>|<div style="width:150px">Kolors</div>|<div style="width:150px">Stable Diffusion 3</div>|<div style="width:150px">Hunyuan-DiT</div>|
|
||||||
|
|-|:-:|:-:|:-:|:-:|
|
||||||
|
|Without LoRA|<img src="https://github.com/user-attachments/assets/df62cef6-d54f-4e3d-a602-5dd290079d49" width="150" alt="image_without_lora">|<img src="https://github.com/modelscope/DiffSynth-Studio/assets/35051019/9d79ed7a-e8cf-4d98-800a-f182809db318" width="150" alt="image_without_lora">|<img src="https://github.com/modelscope/DiffSynth-Studio/assets/35051019/ddb834a5-6366-412b-93dc-6d957230d66e" width="150" alt="image_without_lora">|<img src="https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/1aa21de5-a992-4b66-b14f-caa44e08876e" width="150" alt="image_without_lora">|
|
||||||
|
|With LoRA|<img src="https://github.com/user-attachments/assets/4fd39890-0291-4d19-8a88-d70d0ae18533" width="150" alt="image_with_lora">|<img src="https://github.com/modelscope/DiffSynth-Studio/assets/35051019/02f62323-6ee5-4788-97a1-549732dbe4f0" width="150" alt="image_with_lora">|<img src="https://github.com/modelscope/DiffSynth-Studio/assets/35051019/8e7b2888-d874-4da4-a75b-11b6b214b9bf" width="150" alt="image_with_lora">|<img src="https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/83a0a41a-691f-4610-8e7b-d8e17c50a282" width="150" alt="image_with_lora">|
|
||||||
|
|
||||||
|
|
||||||
|
## 互联
|
||||||
|
|
||||||
|
与语言模型不同,Diffusion 模型存在生态模型,包括 LoRA、ControlNet、IP-Adapter 等,这些模型由不同的开发者开发、训练、开源,我们为这些模型提供了一站式的推理支持。例如基于 Stable Diffusion XL,你可以随意使用这些相关的生态模型组装出丰富的功能。
|
||||||
|
|
||||||
|
|<div style="width:150px">底模生成</div>|使用 ControlNet 保持画面结构重新生成|继续叠加 LoRA 使画面更扁平|叠加 IP-Adapter 转换为水墨风格|
|
||||||
|
|:-:|:-:|:-:|:-:|
|
||||||
|
|<img src="https://github.com/user-attachments/assets/cc094e8f-ff6a-4f9e-ba05-7a5c2e0e609f" width="150" >|<img src="https://github.com/user-attachments/assets/d50d173e-e81a-4d7e-93e3-b2787d69953e" width="150" >|<img src="https://github.com/user-attachments/assets/c599b2f8-8351-4be5-a6ae-8380889cb9d8" width="150" >|<img src="https://github.com/user-attachments/assets/e5924aef-03b0-4462-811f-a60e2523fd7f" width="150" >|
|
||||||
|
|
||||||
|
|
||||||
|
你甚至可以继续叠加 AnimateDiff 构建视频转绘方案。
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<video width="512" height="256" controls>
|
||||||
|
<source src="https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/b54c05c5-d747-4709-be5e-b39af82404dd" type="video/mp4">
|
||||||
|
您的浏览器不支持Video标签。
|
||||||
|
</video>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
## 创新
|
||||||
|
|
||||||
|
DiffSynth-Studio 集成了多个开源模型,这是属于开源社区的奇迹。我们致力于用强工程基础驱动算法上的创新,目前我们公开了多项创新性生成技术。
|
||||||
|
|
||||||
|
* ExVideo: 视频生成模型的扩展训练技术
|
||||||
|
* 项目页面: [https://ecnu-cilab.github.io/ExVideoProjectPage/](https://ecnu-cilab.github.io/ExVideoProjectPage/)
|
||||||
|
* 技术报告: [https://arxiv.org/abs/2406.14130](https://arxiv.org/abs/2406.14130)
|
||||||
|
* 模型 (ExVideo-CogVideoX)
|
||||||
|
* HuggingFace: [https://huggingface.co/ECNU-CILab/ExVideo-CogVideoX-LoRA-129f-v1](https://huggingface.co/ECNU-CILab/ExVideo-CogVideoX-LoRA-129f-v1)
|
||||||
|
* ModelScope: [https://modelscope.cn/models/ECNU-CILab/ExVideo-CogVideoX-LoRA-129f-v1](https://modelscope.cn/models/ECNU-CILab/ExVideo-CogVideoX-LoRA-129f-v1)
|
||||||
|
* 模型 (ExVideo-SVD)
|
||||||
|
* HuggingFace: [https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1)
|
||||||
|
    * ModelScope: [https://modelscope.cn/models/ECNU-CILab/ExVideo-SVD-128f-v1](https://modelscope.cn/models/ECNU-CILab/ExVideo-SVD-128f-v1)
|
||||||
|
* Diffutoon: 动漫风格视频渲染方案
|
||||||
|
* 项目页面: [https://ecnu-cilab.github.io/DiffutoonProjectPage/](https://ecnu-cilab.github.io/DiffutoonProjectPage/)
|
||||||
|
* 技术报告: [https://arxiv.org/abs/2401.16224](https://arxiv.org/abs/2401.16224)
|
||||||
|
* 样例代码: [https://github.com/modelscope/DiffSynth-Studio/tree/main/examples/Diffutoon](https://github.com/modelscope/DiffSynth-Studio/tree/main/examples/Diffutoon)
|
||||||
|
* FastBlend: 视频去闪烁算法
|
||||||
|
* 独立仓库: [https://github.com/Artiprocher/sd-webui-fastblend](https://github.com/Artiprocher/sd-webui-fastblend)
|
||||||
|
* 视频演示
|
||||||
|
* [https://www.bilibili.com/video/BV1d94y1W7PE](https://www.bilibili.com/video/BV1d94y1W7PE)
|
||||||
|
* [https://www.bilibili.com/video/BV1Lw411m71p](https://www.bilibili.com/video/BV1Lw411m71p)
|
||||||
|
* [https://www.bilibili.com/video/BV1RB4y1Z7LF](https://www.bilibili.com/video/BV1RB4y1Z7LF)
|
||||||
|
* 技术报告: [https://arxiv.org/abs/2311.09265](https://arxiv.org/abs/2311.09265)
|
||||||
|
* DiffSynth: DiffSynth-Studio 的前身
|
||||||
|
* 项目页面: [https://ecnu-cilab.github.io/DiffSynth.github.io/](https://ecnu-cilab.github.io/DiffSynth.github.io/)
|
||||||
|
* 早期代码: [https://github.com/alibaba/EasyNLP/tree/master/diffusion/DiffSynth](https://github.com/alibaba/EasyNLP/tree/master/diffusion/DiffSynth)
|
||||||
|
* 技术报告: [https://arxiv.org/abs/2308.03463](https://arxiv.org/abs/2308.03463)
|
||||||
69
docs/source/model/AnimateDiff.md
Normal file
69
docs/source/model/AnimateDiff.md
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
# AnimateDiff
|
||||||
|
|
||||||
|
## 相关链接
|
||||||
|
|
||||||
|
* 论文:
|
||||||
|
* [AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning](https://arxiv.org/abs/2307.04725)
|
||||||
|
* 模型
|
||||||
|
* AnimateDiff
|
||||||
|
* [HuggingFace](https://huggingface.co/guoyww/animatediff)
|
||||||
|
* [ModelScope](https://www.modelscope.cn/models/Shanghai_AI_Laboratory/animatediff)
|
||||||
|
|
||||||
|
## 模型介绍
|
||||||
|
|
||||||
|
AnimateDiff 是一种文生图模型的扩展方法,可以将文生图模型扩展为动画生成器,而无需对文生图模型做任何微调。扩展的基本思路是从大型视频数据集中学习到运动先验知识并保存到运动模块中,使用时将运动模块插入文生图模型即可。以下为其生成的视频效果:
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<video width="256" height="256" controls>
|
||||||
|
<source src="https://github.com/user-attachments/assets/d5c22c05-ddb3-4b05-982a-1e65dd19b1ef" type="video/mp4">
|
||||||
|
您的浏览器不支持Video标签。
|
||||||
|
</video>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
AnimateDiff 的训练主要分为三个阶段,分别对应了三个可训练的模块:Domain Adapter,Motion Module 和 MotionLoRA,如下图所示。
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
第一阶段中主要训练 Domain Adapter。由于公开可用的视频训练数据集的质量远低于图像数据集的质量,直接从这种数据集上训练 Motion Module 可能会降低其视频生成质量。视频和图像数据集质量的差距被称为域差距。为了减小这一差距对 Motion Module 的影响,作者提出使用 Domain Adapter 来单独拟合这些域差距。Domain Adapter 具体通过LoRA来实现,即在文生图模型中的 Self/Cross-Attention 层中插入 LoRA 模块。以 Query Projection 为例,插入 LoRA 后的输出如下公式所示。其中,$\alpha$ 为 Domain Adapter 权重。在推理的时候,设置 $\alpha=0$ 以去除 Domain Adapter 的影响。
|
||||||
|
|
||||||
|
$$
|
||||||
|
Q=\mathcal{W}^Q z+\text { AdapterLayer }(z)=\mathcal{W}^Q z+\alpha \cdot A B^T z
|
||||||
|
$$
|
||||||
|
|
||||||
|
第二阶段主要训练 Motion Module,这一模块主要目的是学习视频的运动先验信息。如上图所示, Motion Module 主要结构为 Temporal Transformer,由输入输出映射层和若干个 Self-Attention 组成。将 Motion Module 插入文生图模型后,模型的输入维度为:$b\times c\times f \times h \times w$。在数据到达文生图模型的原始模块(上图白色)时,将帧数 $f$ 融合到 $b$ 维度上,即可完成正常计算。当数据到达 Motion Module 时,为了完成 Temporal Attention,又将 $h$ 和 $w$ 融合到 $b$ 维度上,数据维度变为: $\{b\cdot h\cdot w\} \times f \times c$。
|
||||||
|
|
||||||
|
尽管第二阶段训练的 Motion Module 学习了通用的运动先验知识,但仍然需要有效地将其适应到特定运动模式,比如相机缩放、平移等。因此,第三阶段主要针对个性化运动训练对应的 MotionLoRA。MotionLoRA 主要是通过在 Motion Module 的 Attention 中加入 LoRA 实现的。实验证明,20 ~ 50 个参考视频、2000 个 step 就能学习到对应的运动能力。同时,多个 MotionLoRA 的运动效果是可以组合的。
|
||||||
|
|
||||||
|
## 代码样例
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDXLVideoPipeline, save_video, download_models
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
# Download models (automatically)
|
||||||
|
# `models/stable_diffusion_xl/sd_xl_base_1.0.safetensors`: [link](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors)
|
||||||
|
# `models/AnimateDiff/mm_sdxl_v10_beta.ckpt`: [link](https://huggingface.co/guoyww/animatediff/resolve/main/mm_sdxl_v10_beta.ckpt)
|
||||||
|
download_models(["StableDiffusionXL_v1", "AnimateDiff_xl_beta"])
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
|
||||||
|
model_manager.load_models([
|
||||||
|
"models/stable_diffusion_xl/sd_xl_base_1.0.safetensors",
|
||||||
|
"models/AnimateDiff/mm_sdxl_v10_beta.ckpt"
|
||||||
|
])
|
||||||
|
pipe = SDXLVideoPipeline.from_model_manager(model_manager)
|
||||||
|
|
||||||
|
prompt = "A panda standing on a surfboard in the ocean in sunset, 4k, high resolution.Realistic, Cinematic, high resolution"
|
||||||
|
negative_prompt = ""
|
||||||
|
|
||||||
|
torch.manual_seed(0)
|
||||||
|
video = pipe(
|
||||||
|
prompt=prompt,
|
||||||
|
negative_prompt=negative_prompt,
|
||||||
|
cfg_scale=8.5,
|
||||||
|
height=1024, width=1024, num_frames=16,
|
||||||
|
num_inference_steps=100,
|
||||||
|
)
|
||||||
|
save_video(video, "output_video.mp4", fps=16)
|
||||||
|
```
|
||||||
43
docs/source/model/CogVideo.md
Normal file
43
docs/source/model/CogVideo.md
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
# CogVideoX
|
||||||
|
|
||||||
|
## 相关链接
|
||||||
|
|
||||||
|
* 论文:[CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer](https://arxiv.org/abs/2408.06072)
|
||||||
|
* 模型
|
||||||
|
* CogVideoX-5B
|
||||||
|
* [HuggingFace](https://huggingface.co/THUDM/CogVideoX-5b)
|
||||||
|
* [ModelScope](https://modelscope.cn/models/ZhipuAI/CogVideoX-5b)
|
||||||
|
|
||||||
|
## 模型介绍
|
||||||
|
|
||||||
|
CogVideoX 是由智谱团队训练并开源的视频生成模型,模型结构分为 Text Encoder、VAE、DiT。
|
||||||
|
|
||||||
|
* Text Encoder 模型为 T5,与 Stable Diffusion 3 以及 FLUX 一致。
|
||||||
|
* VAE 部分为 3D 的 Causal VAE,将 8x8x4 的区域压缩成一个 Embedding。其中视频的第一帧单独处理,后续的每 4 帧合并为一组 Embedding。
|
||||||
|
* DiT 部分采用了与 Stable Diffusion 3 类似的结构,对视频进行 patch 化之后由连续的多个 transformer 模块处理。
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
CogVideoX-5B 模型可以生成长达 49 帧视频,FPS 为 8,效果如下:
|
||||||
|
|
||||||
|
<video width="512" height="256" controls>
|
||||||
|
    <source src="an astronaut riding a horse on Mars." type="video/mp4"> <!-- TODO: src 应为视频文件的 URL,此处误填了生成提示词,需补充正确链接 -->
|
||||||
|
您的浏览器不支持Video标签。
|
||||||
|
</video>
|
||||||
|
|
||||||
|
## 代码样例
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, save_video, CogVideoPipeline
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.bfloat16, model_id_list=["CogVideoX-5B"])
|
||||||
|
pipe = CogVideoPipeline.from_model_manager(model_manager)
|
||||||
|
video = pipe(
|
||||||
|
prompt="a dog",
|
||||||
|
height=480, width=720,
|
||||||
|
cfg_scale=7.0, num_inference_steps=200
|
||||||
|
)
|
||||||
|
save_video(video, "video.mp4", fps=8, quality=5)
|
||||||
|
```
|
||||||
74
docs/source/model/ControlNet.md
Normal file
74
docs/source/model/ControlNet.md
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
# ControlNet
|
||||||
|
|
||||||
|
## 相关链接
|
||||||
|
|
||||||
|
* 论文:
|
||||||
|
* [Adding Conditional Control to Text-to-Image Diffusion Models](https://arxiv.org/abs/2302.05543)
|
||||||
|
* 模型
|
||||||
|
* ControlNet-Union-SDXL
|
||||||
|
* [HuggingFace](https://huggingface.co/xinsir/controlnet-union-sdxl-1.0)
|
||||||
|
* [ModelScope](https://modelscope.cn/models/AI-ModelScope/controlnet-union-sdxl-1.0)
|
||||||
|
* ControlNet-V11-SD15
|
||||||
|
* [HuggingFace](https://huggingface.co/lllyasviel/ControlNet-v1-1)
|
||||||
|
* [ModelScope](https://modelscope.cn/models/AI-ModelScope/ControlNet-v1-1)
|
||||||
|
|
||||||
|
## 模型介绍
|
||||||
|
ControlNet 是一种辅助性的模型架构,它能够与已经训练好的 Diffusion 模型相结合。通过给模型额外添加可训练的 ControlNet 模块,我们得以在图像生成过程中施加额外的控制条件。比如,我们可以加入深度图、语义图和人体关键点等额外条件,控制生成图像的画面结构和布局。值得注意的是,针对不同的 Diffusion 模型, ControlNet 的具体结构可能会有所差异。
|
||||||
|
|
||||||
|
### ControlNet-V11-SD15
|
||||||
|
ControlNet V1.1 是基于 Stable Diffusion V1.5 (SD15) 的 ControlNet 更新版本,包含 Canny, Depth, Segmentation, Inpaint, Lineart 等控制条件对应的模型。
|
||||||
|
|
||||||
|
ControlNet 原论文是针对 SD15 设计的模型结构,如下图所示。(a) 部分结构为已训练完成的 Stable Diffusion (SD) 模型,模型输入为文本 Prompt $c_t$ 与去噪时间步长 $t$。(b) 部分结构为 ControlNet,主要包括若干个零初始化的卷积层 (zero convolution) 和 SD UNet Encoder 的可训练副本,模型输入为额外的控制条件 $c_f$。
|
||||||
|
|
||||||
|
zero convolution 为 $1\times1$ 的卷积层,其权重和偏置都被初始化为0。因此,在 ControlNet 被训练之前,所有 zero convolution 模块的输出都为0,保证了 ControlNet 的输出也为0,从而不会改变 SD 模型的输出。注意,zero convolution 的权重和偏置初始化为0并不会导致其梯度也为0,因此这些卷积层是能被训练的。
|
||||||
|
|
||||||
|
ControlNet 中的可训练副本采用与 SD UNet Encoder Blocks 相同的结构,并以其预训练好的权重作为初始化。而 SD 模型本身的所有参数都处于冻结状态。在训练过程中,只有 ControlNet 的参数会进行更新。因此,我们既能通过 ControlNet 对额外的控制条件进行学习训练,又不会破坏 SD 模型本身的能力。
|
||||||
|
|
||||||
|
给定 SD 模型参数 $\Theta$, ControlNet 参数 $\Theta_{\mathrm{c}}$, 两个 zero convolution 模块 $\Theta_{\mathrm{z1}}$ 和 $\Theta_{\mathrm{z2}}$, 模型的输出如下。
|
||||||
|
|
||||||
|
$$
|
||||||
|
\boldsymbol{y}_{\mathrm{c}}=\mathcal{F}(\boldsymbol{x} ; \Theta)+\mathcal{Z}\left(\mathcal{F}\left(\boldsymbol{x}+\mathcal{Z}\left(\boldsymbol{c} ; \Theta_{\mathrm{z} 1}\right) ; \Theta_{\mathrm{c}}\right) ; \Theta_{\mathrm{z2}}\right)
|
||||||
|
$$
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
ControlNet 生成图像示例如下所示:
|
||||||
|

|
||||||
|
|
||||||
|
## ControlNet++
|
||||||
|
ControlNet++ 是针对 Stable Diffusion XL (SDXL) 模型设计的 ControlNet 结构,对应上文提到的 ControlNet-Union-SDXL 模型。这一模型能同时支持10多种控制条件,包括 Pose,Depth,Canny,Lineart 等。
|
||||||
|
|
||||||
|
模型结构如下图所示。相比于 ControlNet ,这一模型扩充了 Condition Encoder 的卷积通道数量,同时增加了两个新模块,分别是 Condition Transformer 和 Control Encoder。Condition Transformer 用于组合不同的图像条件特征,而 Control Encoder 则用于编码控制条件的类型。
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
|
||||||
|
## 代码样例
|
||||||
|
|
||||||
|
以下代码为 ControlNet-Union-SDXL 模型的使用样例,其中使用的 [image.jpg](https://github.com/user-attachments/assets/cc094e8f-ff6a-4f9e-ba05-7a5c2e0e609f) 为 SDXL 生成的图像,详见[精准控制技术文档](https://diffsynth-studio.readthedocs.io/zh-cn/latest/creating/AdaptersForImageSynthesis.html)
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDXLImagePipeline, ControlNetConfigUnit
|
||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
|
||||||
|
model_manager.load_models([
|
||||||
|
"models/stable_diffusion_xl/bluePencilXL_v200.safetensors",
|
||||||
|
"models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors"
|
||||||
|
])
|
||||||
|
pipe = SDXLImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
|
||||||
|
ControlNetConfigUnit("depth", "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors", scale=1)
|
||||||
|
])
|
||||||
|
torch.manual_seed(2)
|
||||||
|
image = pipe(
|
||||||
|
prompt="masterpiece, best quality, solo, long hair, wavy hair, pink hair, red eyes, red dress, medium breasts, dress, fire ball, fire background, floating hair, refraction, portrait,",
|
||||||
|
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw, white background",
|
||||||
|
cfg_scale=6, num_inference_steps=60,
|
||||||
|
controlnet_image=Image.open("image.jpg")
|
||||||
|
)
|
||||||
|
image.save("image_controlnet.jpg")
|
||||||
|
```
|
||||||
|
生成效果:
|
||||||
|
|
||||||
|

|
||||||
97
docs/source/model/ExVideo.md
Normal file
97
docs/source/model/ExVideo.md
Normal file
@@ -0,0 +1,97 @@
|
|||||||
|
# ExVideo
|
||||||
|
|
||||||
|
## 相关链接
|
||||||
|
|
||||||
|
* 论文:[ExVideo: Extending Video Diffusion Models via Parameter-Efficient Post-Tuning](https://arxiv.org/abs/2406.14130)
|
||||||
|
* 模型
|
||||||
|
* ExVideo-CogVideoX
|
||||||
|
* [HuggingFace](https://huggingface.co/ECNU-CILab/ExVideo-CogVideoX-LoRA-129f-v1)
|
||||||
|
* [ModelScope](https://modelscope.cn/models/ECNU-CILab/ExVideo-CogVideoX-LoRA-129f-v1)
|
||||||
|
* ExVideo-SVD
|
||||||
|
* [HuggingFace](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1)
|
||||||
|
* [ModelScope](https://modelscope.cn/models/ECNU-CILab/ExVideo-SVD-128f-v1)
|
||||||
|
|
||||||
|
## 模型介绍
|
||||||
|
|
||||||
|
ExVideo 是一种视频生成模型的后训练(post-training)方法,旨在增强视频生成模型的能力,使其能够生成更长的视频。目前,ExVideo 已经发布了两个版本,分别将 Stable Video Diffusion 扩展到 128 帧、将 CogVideoX-5B 扩展到 129 帧。
|
||||||
|
|
||||||
|
在基于 Stable Video Diffusion 的 ExVideo 扩展模块中,静态的位置编码被替换为了可训练的参数矩阵,并在时序模块中添加了额外的单位卷积(Identity 3D Convolution),在保留预训练模型本身能力的前提下,使其能够捕获更长时间尺度上的信息,从而生成更长视频。而在基于 CogVideoX-5B 的 ExVideo 扩展模块中,由于模型基础架构为 DiT,为保证计算效率,扩展模块采用 LoRA 的形式构建。
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
为了在有限的计算资源上实现长视频的训练,ExVideo 做了很多工程优化,包括:
|
||||||
|
|
||||||
|
* Parameter freezing:冻结除了扩展模块以外的所有参数
|
||||||
|
* Mixed precision:扩展模块部分以全精度维护,其他部分以 BFloat16 精度维护
|
||||||
|
* Gradient checkpointing:在前向传播时丢弃中间变量,并反向传播时重新计算
|
||||||
|
* Flash attention:在所有注意力机制上启用加速过的注意力实现
|
||||||
|
* Shard optimizer states and gradients:基于 DeepSpeed 把部分参数分拆到多个 GPU 上
|
||||||
|
|
||||||
|
Stable Video Diffusion + ExVideo 的生成效果:
|
||||||
|
|
||||||
|
<video width="512" height="256" controls>
|
||||||
|
<source src="https://github.com/modelscope/DiffSynth-Studio/assets/35051019/d97f6aa9-8064-4b5b-9d49-ed6001bb9acc" type="video/mp4">
|
||||||
|
您的浏览器不支持Video标签。
|
||||||
|
</video>
|
||||||
|
|
||||||
|
CogVideoX-5B + ExVideo 的生成效果:
|
||||||
|
|
||||||
|
<video width="512" height="256" controls>
|
||||||
|
<source src="https://github.com/user-attachments/assets/321ee04b-8c17-479e-8a95-8cbcf21f8d7e" type="video/mp4">
|
||||||
|
您的浏览器不支持Video标签。
|
||||||
|
</video>
|
||||||
|
|
||||||
|
## 代码样例
|
||||||
|
|
||||||
|
ExVideo-SVD
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import save_video, ModelManager, SVDVideoPipeline
|
||||||
|
import torch, requests
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
|
||||||
|
# Load models
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
|
||||||
|
model_id_list=["stable-video-diffusion-img2vid-xt", "ExVideo-SVD-128f-v1"])
|
||||||
|
pipe = SVDVideoPipeline.from_model_manager(model_manager)
|
||||||
|
|
||||||
|
# Generate a video
|
||||||
|
torch.manual_seed(0)
|
||||||
|
image = Image.open(requests.get("https://www.modelscope.cn/api/v1/studio/ECNU-CILab/ExVideo-SVD-128f-v1/repo?Revision=master&FilePath=images%2F0.png", stream=True).raw)
|
||||||
|
image.save("image.png")
|
||||||
|
video = pipe(
|
||||||
|
input_image=image.resize((512, 512)),
|
||||||
|
num_frames=128, fps=30, height=512, width=512,
|
||||||
|
motion_bucket_id=127,
|
||||||
|
num_inference_steps=50,
|
||||||
|
min_cfg_scale=2, max_cfg_scale=2, contrast_enhance_scale=1.2
|
||||||
|
)
|
||||||
|
save_video(video, "video.mp4", fps=30)
|
||||||
|
```
|
||||||
|
|
||||||
|
ExVideo-CogVideoX
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, CogVideoPipeline, save_video, download_models
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
download_models(["CogVideoX-5B", "ExVideo-CogVideoX-LoRA-129f-v1"])
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.bfloat16)
|
||||||
|
model_manager.load_models([
|
||||||
|
"models/CogVideo/CogVideoX-5b/text_encoder",
|
||||||
|
"models/CogVideo/CogVideoX-5b/transformer",
|
||||||
|
"models/CogVideo/CogVideoX-5b/vae/diffusion_pytorch_model.safetensors",
|
||||||
|
])
|
||||||
|
model_manager.load_lora("models/lora/ExVideo-CogVideoX-LoRA-129f-v1.safetensors")
|
||||||
|
pipe = CogVideoPipeline.from_model_manager(model_manager)
|
||||||
|
|
||||||
|
torch.manual_seed(6)
|
||||||
|
video = pipe(
|
||||||
|
prompt="an astronaut riding a horse on Mars.",
|
||||||
|
height=480, width=720, num_frames=129,
|
||||||
|
cfg_scale=7.0, num_inference_steps=100,
|
||||||
|
)
|
||||||
|
save_video(video, "video_with_lora.mp4", fps=8, quality=5)
|
||||||
|
```
|
||||||
53
docs/source/model/FLUX.md
Normal file
53
docs/source/model/FLUX.md
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
# FLUX
|
||||||
|
|
||||||
|
## 相关链接
|
||||||
|
|
||||||
|
* 技术报告:https://blackforestlabs.ai/announcing-black-forest-labs/
|
||||||
|
* 模型
|
||||||
|
* FLUX.1-dev
|
||||||
|
* [HuggingFace](https://huggingface.co/black-forest-labs/FLUX.1-dev)
|
||||||
|
* [ModelScope](https://modelscope.cn/models/AI-ModelScope/FLUX.1-dev)
|
||||||
|
|
||||||
|
* 项目页面: https://github.com/black-forest-labs/flux
|
||||||
|
|
||||||
|
## 模型介绍
|
||||||
|
|
||||||
|
FLUX.1 是由 The Black Forest Team 发布的一系列文生图模型,该模型在图像细节、提示一致性、风格多样性和文本到图像合成的场景复杂性方面定义了新的最先进技术。FLUX.1 提供了三个变体:FLUX.1 [pro]、FLUX.1 [dev] 和 FLUX.1 [schnell],我们在这里用到的是从 FLUX.1 [pro] 蒸馏出来的用于非商业应用的开放权重的 FLUX.1 [dev]。
|
||||||
|
FLUX.1 模型均基于多模态和并行扩散 Transformer 块的混合架构,并可缩放至 12B 参数。通过建立流匹配来改进以前最先进的扩散模型,流匹配是一种通用且概念上简单的训练生成模型的方法,其中包括作为特殊情况的扩散。此外,通过结合旋转位置嵌入和并行注意层来提高模型性能并提高硬件效率。
|
||||||
|
|
||||||
|
FLUX.1 定义了图像合成领域的最新技术,FLUX.1 [pro] 和 [dev] 在以下各个方面超越了 Midjourney v6.0、DALL·E 3 (HD) 和 SD3-Ultra 等流行模型:视觉质量、提示跟随、尺寸/宽高比可变性、版式和输出多样性。 FLUX.1 [schnell] 是迄今为止最先进的少步数(few-step)模型,其性能不仅优于同类竞争对手,而且还优于 Midjourney v6.0 和 DALL·E 3 (HD) 等强大的非蒸馏模型。FLUX.1 经过专门微调,以保留预训练的整个输出多样性。与当前最先进的技术相比,它们提供了极大改进的可能性,如下所示:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
Flux 的生成效果:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
## 代码样例
|
||||||
|
|
||||||
|
```python
|
||||||
|
import torch
|
||||||
|
from diffsynth import ModelManager, FluxImagePipeline, download_models
|
||||||
|
|
||||||
|
|
||||||
|
download_models(["FLUX.1-dev"])
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cuda")
|
||||||
|
model_manager.load_models([
|
||||||
|
"models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
|
||||||
|
"models/FLUX/FLUX.1-dev/text_encoder_2",
|
||||||
|
"models/FLUX/FLUX.1-dev/ae.safetensors",
|
||||||
|
"models/FLUX/FLUX.1-dev/flux1-dev.safetensors"
|
||||||
|
])
|
||||||
|
pipe = FluxImagePipeline.from_model_manager(model_manager)
|
||||||
|
|
||||||
|
prompt = "CG. Full body. A captivating fantasy magic woman portrait in the deep sea. The woman, with blue spaghetti strap silk dress, swims in the sea. Her flowing silver hair shimmers with every color of the rainbow and cascades down, merging with the floating flora around her. Smooth, delicate and fair skin."
|
||||||
|
negative_prompt = "dark, worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, dim, fuzzy, depth of Field, nsfw,"
|
||||||
|
|
||||||
|
# Disable classifier-free guidance (consistent with the original implementation of FLUX.1)
|
||||||
|
torch.manual_seed(6)
|
||||||
|
image = pipe(
|
||||||
|
prompt=prompt,
|
||||||
|
num_inference_steps=30, embedded_guidance=3.5
|
||||||
|
)
|
||||||
|
image.save("image_1024.jpg")
|
||||||
|
```
|
||||||
58
docs/source/model/HunyuanDiT.md
Normal file
58
docs/source/model/HunyuanDiT.md
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
# Hunyuan-DiT
|
||||||
|
|
||||||
|
## 相关链接
|
||||||
|
|
||||||
|
* 论文:[Hunyuan-DiT : A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding](https://arxiv.org/pdf/2405.08748)
|
||||||
|
* 模型
|
||||||
|
* HunyuanDiT
|
||||||
|
* [HuggingFace](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT)
|
||||||
|
* [ModelScope](https://modelscope.cn/models/modelscope/HunyuanDiT)
|
||||||
|
* HunyuanDiT-v1.1
|
||||||
|
* [HuggingFace](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.1)
|
||||||
|
* HunyuanDiT-v1.2
|
||||||
|
* [HuggingFace](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2)
|
||||||
|
* Distillation
|
||||||
|
* [HuggingFace](https://huggingface.co/Tencent-Hunyuan/Distillation)
|
||||||
|
* Distillation-v1.1
|
||||||
|
* [HuggingFace](https://huggingface.co/Tencent-Hunyuan/Distillation-v1.1)
|
||||||
|
* Distillation-v1.2
|
||||||
|
* [HuggingFace](https://huggingface.co/Tencent-Hunyuan/Distillation-v1.2)
|
||||||
|
* 项目页面: https://dit.hunyuan.tencent.com/
|
||||||
|
|
||||||
|
## 模型介绍
|
||||||
|
|
||||||
|
Hunyuan-DiT是一种基于传统DiT架构的扩散模型, 为了加强模型对中文的细粒度(fine-grained)理解能力, Hunyuan-DiT对Transformer在多个方面进行了改进. 在类别条件(class-conditional)的 DiT 中使用的自适应层归一化(Adaptive Layer Norm)在强制执行细粒度文本条件方面表现不好, 为此Hunyuan-DiT采用了与Stable Diffusion 相似的交叉注意力机制. Hunyuan-DiT接受VAE潜在空间的向量作为输入, 将它分割成小块后经过线性层得到后续用于transformer块的标记. 在每个Hunyuan-DiT Block中包含三个模块, 自注意力(self-attention), 交叉注意力(cross-attention), 和前馈网络(feed-forward network, FFN).
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
为了加强训练的稳定性, Hunyuan-DiT采用了QK-Norm, 在注意力层计算QKV前加入层归一化, 并且在decoder block的skip module后加入层归一化避免损失爆炸(loss explosion).
|
||||||
|
|
||||||
|
Hunyuan-DiT的生成效果:
|
||||||
|

|
||||||
|
|
||||||
|
## 代码样例
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, HunyuanDiTImagePipeline, download_models
|
||||||
|
import torch
|
||||||
|
|
||||||
|
download_models(["HunyuanDiT"])
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
|
||||||
|
model_manager.load_models([
|
||||||
|
"models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin",
|
||||||
|
"models/HunyuanDiT/t2i/mt5/pytorch_model.bin",
|
||||||
|
"models/HunyuanDiT/t2i/model/pytorch_model_ema.pt",
|
||||||
|
"models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin"
|
||||||
|
])
|
||||||
|
pipe = HunyuanDiTImagePipeline.from_model_manager(model_manager)
|
||||||
|
|
||||||
|
prompt = "一幅细致的油画描绘了一只年轻獾轻轻嗅着一朵明亮的黄色玫瑰时错综复杂的皮毛。背景是一棵大树干的粗糙纹理,獾的爪子轻轻地挖进树皮。在柔和的背景中,一个宁静的瀑布倾泻而下,它的水在绿色植物中闪烁着蓝色。"
|
||||||
|
|
||||||
|
torch.manual_seed(0)
|
||||||
|
image = pipe(
|
||||||
|
prompt=prompt,
|
||||||
|
num_inference_steps=50, height=1024, width=1024,
|
||||||
|
)
|
||||||
|
image.save("image_1024.png")
|
||||||
|
```
|
||||||
62
docs/source/model/IPAdapter.md
Normal file
62
docs/source/model/IPAdapter.md
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
# IP-Adapter
|
||||||
|
|
||||||
|
## 相关链接
|
||||||
|
|
||||||
|
* 论文:
|
||||||
|
* [IP-Adapter: Text Compatible Image Prompt Adapter for Text-to-Image Diffusion Models](https://arxiv.org/abs/2308.06721)
|
||||||
|
* 模型
|
||||||
|
* IP-Adapter-SDXL
|
||||||
|
* [HuggingFace](https://huggingface.co/h94/IP-Adapter)
|
||||||
|
* [ModelScope](https://www.modelscope.cn/models/AI-ModelScope/IP-Adapter)
|
||||||
|
|
||||||
|
## 模型介绍
|
||||||
|
|
||||||
|
IP-Adapter 与 ControlNet 技术类似,是一种通过添加辅助性模型架构,为模型添加额外的控制条件的方法。与 ControlNet 类似的是,IP-Adapter 的额外控制条件也是图像输入,不同的是,IP-Adapter 的额外控制条件是通过 Cross-Attention 的方式加入到原始模型中的。
|
||||||
|
|
||||||
|
IP-Adapter 模型结构如下图所示。不考虑最上层的图像控制条件时,文本特征的信息通过 Cross-Attention 结构被加入到 Denoising U-Net 中,这就是典型的文生图 Pipeline。IP-Adapter 参考这一条件控制的范式,加入了图像控制条件。 对于一个控制图像,首先使用 Image Encoder 提取图像特征,然后使用投影网络将其映射为一个长度为 $N$ 的特征序列。在这个时候,图像特征与文本特征的特征形式已经相近,作者便使用同样的 Cross-Attention 结构来融合这一图像特征到 U-Net 中。 Image Encoder 采用经过预训练的 CLIP 模型,投影网络由一个线性层和层归一化组成,投影后的图像特征序列长度取 $N=4$。
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
|
||||||
|
为了不破坏文生图基础模型的文本控制能力,IP-Adapter 采用了文图解耦的 Cross-Attention 结构,即冻结原本的文本 Cross-Attention,加入额外的图像 Cross-Attention 结构。解耦的 Cross-Attention 公式如下所示,其中 $K$ 和 $V$ 是文本的 Key 和 Value 向量, $K^{\prime}$ 和 $V^{\prime}$ 是图像的 Key 和 Value 向量。由于两个 Attention 的 Query 向量是一样的,只需要添加两个映射矩阵 $W_{K^{\prime}}$ 和 $W_{V^{\prime}}$ 作为可学习参数,这两个参数分别从 $W_{K}$ 和 $W_{V}$ 初始化而来。
|
||||||
|
|
||||||
|
$$
|
||||||
|
\mathbf{Z}^{\text {new }}=\operatorname{Softmax}\left(\frac{\mathbf{Q} \mathbf{K}^{\top}}{\sqrt{d}}\right) \mathbf{V}+\operatorname{Softmax}\left(\frac{\mathbf{Q}\left(\mathbf{K}^{\prime}\right)^{\top}}{\sqrt{d}}\right) \mathbf{V}^{\prime}
|
||||||
|
$$
|
||||||
|
|
||||||
|
综上所述,IP-Adapter 只有投影网络和部分 Cross-Attention 参数是可学习的,一共只有 22M 可学习参数量。
|
||||||
|
|
||||||
|
## 代码样例
|
||||||
|
|
||||||
|
以下代码为 IP-Adapter-SDXL 模型的使用样例,我们使用[皮卡丘](https://github.com/user-attachments/assets/4b750148-0238-4c3c-b58c-355dc7fde8f8)作为图像控制条件,生成超人的图像如下:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
``` python
|
||||||
|
from diffsynth import ModelManager, SDXLImagePipeline, download_models
|
||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
download_models(["BluePencilXL_v200", "IP-Adapter-SDXL"])
|
||||||
|
|
||||||
|
# Load models
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
|
||||||
|
model_manager.load_models([
|
||||||
|
"models/stable_diffusion_xl/bluePencilXL_v200.safetensors",
|
||||||
|
"models/IpAdapter/stable_diffusion_xl/image_encoder/model.safetensors",
|
||||||
|
"models/IpAdapter/stable_diffusion_xl/ip-adapter_sdxl.bin"
|
||||||
|
])
|
||||||
|
pipe = SDXLImagePipeline.from_model_manager(model_manager)
|
||||||
|
|
||||||
|
image_pikachu = Image.open('Pikachu.png').convert("RGB").resize((1024, 1024))
|
||||||
|
|
||||||
|
torch.manual_seed(1)
|
||||||
|
print("Generating image...")
|
||||||
|
image = pipe(
|
||||||
|
prompt="A super man",
|
||||||
|
negative_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
|
||||||
|
cfg_scale=5,
|
||||||
|
height=1024, width=1024, num_inference_steps=50,
|
||||||
|
ipadapter_images=[image_pikachu], ipadapter_use_instant_style=False
|
||||||
|
)
|
||||||
|
image.save(f"PikaSuperMan.jpg")
|
||||||
|
```
|
||||||
47
docs/source/model/Kolors.md
Normal file
47
docs/source/model/Kolors.md
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
# Kolors
|
||||||
|
|
||||||
|
## 相关链接
|
||||||
|
|
||||||
|
* 论文:[Kolors: Effective Training of Diffusion Model for Photorealistic Text-to-Image Synthesis](https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/Kolors_paper.pdf)
|
||||||
|
* 模型
|
||||||
|
* Kolors
|
||||||
|
* [HuggingFace](https://huggingface.co/Kwai-Kolors/Kolors)
|
||||||
|
* [ModelScope](https://modelscope.cn/models/Kwai-Kolors/Kolors)
|
||||||
|
|
||||||
|
* 项目页面: https://kwai-kolors.github.io/
|
||||||
|
|
||||||
|
## 模型介绍
|
||||||
|
|
||||||
|
Kolors是一种用于文本生成图像的潜在扩散模型, 使用了General Language Model(GLM)作为文本编码器, 增强了它的中英文理解能力. Kolors有两个训练阶段, 包括概念学习阶段(使用广泛的知识)和质量提升阶段(使用精心整理的高美学数据), 并且在质量提升阶段使用1100步的调度器添加噪声, 以达到更低的信噪比. 这些改动使得即使Kolors以U-Net作为骨干模型, 也能达到好的效果.
|
||||||
|

|
||||||
|
|
||||||
|
Kolors 的生成效果:
|
||||||
|

|
||||||
|
|
||||||
|
|
||||||
|
## 代码样例
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDXLImagePipeline, download_models
|
||||||
|
import torch
|
||||||
|
|
||||||
|
download_models(["Kolors"])
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
|
||||||
|
file_path_list=[
|
||||||
|
"models/kolors/Kolors/text_encoder",
|
||||||
|
"models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors",
|
||||||
|
"models/kolors/Kolors/vae/diffusion_pytorch_model.safetensors"
|
||||||
|
])
|
||||||
|
pipe = SDXLImagePipeline.from_model_manager(model_manager)
|
||||||
|
|
||||||
|
prompt = '一张瓢虫的照片,微距,变焦,高质量,电影,拿着一个牌子,写着"Kolors"'
|
||||||
|
|
||||||
|
torch.manual_seed(7)
|
||||||
|
image = pipe(
|
||||||
|
prompt=prompt,
|
||||||
|
num_inference_steps=50,
|
||||||
|
cfg_scale=4,
|
||||||
|
)
|
||||||
|
image.save(f"image_1024.jpg")
|
||||||
|
|
||||||
|
```
|
||||||
66
docs/source/model/StableDiffusion.md
Normal file
66
docs/source/model/StableDiffusion.md
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
# Stable Diffusion
|
||||||
|
|
||||||
|
## 相关链接
|
||||||
|
|
||||||
|
* 论文:[High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752)
|
||||||
|
* 模型
|
||||||
|
* stable-diffusion-v1-1
|
||||||
|
* [HuggingFace](https://huggingface.co/CompVis/stable-diffusion-v1-1)
|
||||||
|
* stable-diffusion-v1-2
|
||||||
|
* [HuggingFace](https://huggingface.co/CompVis/stable-diffusion-v1-2)
|
||||||
|
* stable-diffusion-v1-3
|
||||||
|
* [HuggingFace](https://huggingface.co/CompVis/stable-diffusion-v1-3)
|
||||||
|
* stable-diffusion-v1-4
|
||||||
|
* [HuggingFace](https://huggingface.co/CompVis/stable-diffusion-v1-4)
|
||||||
|
* [ModelScope](https://modelscope.cn/models/AI-ModelScope/stable-diffusion-v1-4)
|
||||||
|
* stable-diffusion-v1-5
|
||||||
|
* [HuggingFace](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5)
|
||||||
|
* [ModelScope](https://modelscope.cn/models/AI-ModelScope/stable-diffusion-v1-5)
|
||||||
|
|
||||||
|
这里仅提供 Stable Diffusion 官方开源的模型。由于Stable Diffusion 是一个完全免费开源的模型,且能让用户使用消费级显卡实现快速文生图,因此有非常多基于 Stable Diffusion 训练的优秀模型涌现出来,DiffSynth 支持主流开源社区的 Stable Diffusion 模型的训练与推理。
|
||||||
|
|
||||||
|
## 模型介绍
|
||||||
|
|
||||||
|
Stable Diffusion 是一种基于扩散模型的文本到图像生成技术,它最初由 [Stability AI](https://stability.ai/) 和 [LAION](https://laion.ai/) 基于 [LAION-5B](https://laion.ai/blog/laion-5b/) 的子集,对 512*512 的图像训练了一个 latent diffusion model,使用 CLIP ViT-L/14 文本编码器编码文本作为模型的提示。
|
||||||
|
|
||||||
|
扩散模型(DMs)在图像数据及其他领域达到了最先进的合成效果,但是由于直接在像素空间进行加噪和去噪过程,训练和推理时需要大量计算资源,为了在有限的计算资源下训练扩散模型,同时保留其质量和灵活性,Stable Diffusion 在预训练自动编码器的潜在空间 (Latent Space) 中训练扩散模型。
|
||||||
|
|
||||||
|
与之前的工作相比,在这种在潜空间表示上训练扩散模型达到了低复杂性和空间下采样之间的近乎最佳平衡,大大提升了视觉保真度。通过将交叉注意力层引入模型架构,扩散模型被转变为功能强大的灵活生成器,可以用于文本或边界框等一般条件输入,并通过卷积方式实现高分辨率合成。
|
||||||
|
|
||||||
|
Stable Diffusion 在各种任务上表现出极具竞争力的性能,包括无条件图像生成、图像修复和超分辨率,同时相较于基于像素的扩散模型显著降低了计算需求。
|
||||||
|
|
||||||
|
Stable Diffusion 的模型结构如下图所示,通过交叉注意力来实现条件控制。
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
|
||||||
|
## 代码样例
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDImagePipeline, download_models
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
# Download models (automatically)
|
||||||
|
|
||||||
|
# `models/stable_diffusion/aingdiffusion_v12.safetensors`: [link](https://civitai.com/api/download/models/229575?type=Model&format=SafeTensor&size=full&fp=fp16)
|
||||||
|
|
||||||
|
download_models(["AingDiffusion_v12"])
|
||||||
|
|
||||||
|
# Load models
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
|
||||||
|
model_manager.load_models(["models/stable_diffusion/aingdiffusion_v12.safetensors"])
|
||||||
|
pipe = SDImagePipeline.from_model_manager(model_manager)
|
||||||
|
|
||||||
|
prompt = "masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait,"
|
||||||
|
negative_prompt = "worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,"
|
||||||
|
|
||||||
|
torch.manual_seed(0)
|
||||||
|
image = pipe(
|
||||||
|
prompt=prompt,
|
||||||
|
negative_prompt=negative_prompt,
|
||||||
|
cfg_scale=6,
|
||||||
|
height=512, width=512, num_inference_steps=60,
|
||||||
|
)
|
||||||
|
image.save("1024.jpg")
|
||||||
|
```
|
||||||
55
docs/source/model/StableDiffusion3.md
Normal file
55
docs/source/model/StableDiffusion3.md
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
# Stable Diffusion 3
|
||||||
|
|
||||||
|
## 相关链接
|
||||||
|
|
||||||
|
* 论文:[Scaling Rectified Flow Transformers for High-Resolution Image Synthesis](https://arxiv.org/pdf/2403.03206)
|
||||||
|
* 模型
|
||||||
|
* stable-diffusion-3-medium
|
||||||
|
* [HuggingFace](https://huggingface.co/stabilityai/stable-diffusion-3-medium)
|
||||||
|
* [ModelScope](https://modelscope.cn/models/AI-ModelScope/stable-diffusion-3-medium)
|
||||||
|
|
||||||
|
* 项目页面: https://stability.ai/news/stable-diffusion-3-medium
|
||||||
|
|
||||||
|
## 模型介绍
|
||||||
|
|
||||||
|
Stable Diffusion 3(SD3)是 Stability AI 的文生图开源模型,在基于人类偏好的评估中,Stable Diffusion 3 在文字生成图像的性能上超过了目前最先进的系统,包括 DALL·E 3、Midjourney v6 和 Ideogram v1,并在文字内容生成,复杂提示理解和指令遵循方面的性能有显著提升。SD3 采用了全新的多模态扩散变压器(MMDiT)架构,使用不同的权重集来处理图像和语言表示,提高了模型的文本理解和拼写能力。
|
||||||
|
|
||||||
|
最大的 SD3 模型拥有 80 亿参数,可以装入拥有 24GB VRAM 的 RTX 4090 中,使用 50 次采样步骤生成一张 1024x1024 分辨率的图像仅需 34 秒。此外,还发布了多种版本的,参数范围从8亿到80亿的 Stable Diffusion 3。
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
MMDiT 架构使用三种不同的文本嵌入器(两个 CLIP 模型和 T5)来编码文本表示,并使用改进的自动编码模型来编码图像,然后将两种模态的序列拼接起来进行注意力操作。相比传统的文本生成图像网络,这种架构在视觉保真度和文本对齐度的训练过程中表现更佳。通过该方法,信息可以在图像和文本之间流动,进而提高生成内容的整体理解能力和视觉设计,同时其设计也容易扩展到视频等多种模态的应用。
|
||||||
|
|
||||||
|
此外,SD3 引入了改进的校正流(RF)公式,使得在训练过程中,数据和噪声可以沿着更直的线性轨迹连接,从而减少了采样步骤。通过对采样计划的重加权,尤其是在中间部分,提升了模型的预测任务性能。与其他 60 种扩散轨迹(例如 LDM、EDM 和 ADM)相比,重加权的 RF 变体在性能上具有更优越的表现。
|
||||||
|
|
||||||
|
在文本编码方面,尽管在推理过程中将拥有 4.7B 参数的 T5 文本编码器排除在外减少了内存需求并略微影响性能,但这对视觉美学无大影响,只是稍微降低了提示文本的遵循性。为了充分发挥文本生成能力,尤其是在处理复杂提示文本的场景中,建议保留 T5 文本编码器。
|
||||||
|
|
||||||
|
Stable Diffusion 3 的生成效果:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
|
||||||
|
## 代码样例
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SD3ImagePipeline, download_models
|
||||||
|
import torch
|
||||||
|
|
||||||
|
download_models(["StableDiffusion3_without_T5"])
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
|
||||||
|
file_path_list=["models/stable_diffusion_3/sd3_medium_incl_clips.safetensors"])
|
||||||
|
pipe = SD3ImagePipeline.from_model_manager(model_manager)
|
||||||
|
|
||||||
|
|
||||||
|
prompt = "masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait,"
|
||||||
|
negative_prompt = "worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,"
|
||||||
|
|
||||||
|
torch.manual_seed(7)
|
||||||
|
image = pipe(
|
||||||
|
prompt=prompt,
|
||||||
|
negative_prompt=negative_prompt,
|
||||||
|
cfg_scale=7.5,
|
||||||
|
num_inference_steps=100, width=1024, height=1024,
|
||||||
|
)
|
||||||
|
image.save("image_1024.jpg")
|
||||||
|
```
|
||||||
49
docs/source/model/StableDiffusionXL.md
Normal file
49
docs/source/model/StableDiffusionXL.md
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
# Stable Diffusion XL
|
||||||
|
|
||||||
|
## 相关链接
|
||||||
|
|
||||||
|
* 论文:[SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis](https://arxiv.org/abs/2307.01952)
|
||||||
|
* 模型
|
||||||
|
* stable-diffusion-xl-base-1.0
|
||||||
|
* [HuggingFace](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
|
||||||
|
* [ModelScope](https://modelscope.cn/models/AI-ModelScope/stable-diffusion-xl-base-1.0)
|
||||||
|
|
||||||
|
|
||||||
|
## 模型介绍
|
||||||
|
|
||||||
|
Stable Diffusion XL 与之前版本的 Stable Diffusion 相比,将 UNet 主干网络增大了三倍,SDXL 使用了两个文本编码器:([OpenCLIP-ViT/G](https://github.com/mlfoundations/open_clip) 和 [CLIP-ViT/L](https://github.com/openai/CLIP/tree/main)),因此在 UNet 中增加了更多的注意力模块和更大的交叉注意力上下文。我们设计了多种新颖的条件方案,并在多种宽高比上训练SDXL。同时 SDXL 引入了一个精细化模型 ,在后处理阶段来提高SDXL生成样本的逼真度。
|
||||||
|
|
||||||
|
SDXL 的模型结构如下:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
|
||||||
|
## 代码样例
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDXLImagePipeline, download_models
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
# Download models (automatically)
|
||||||
|
# `models/stable_diffusion_xl/bluePencilXL_v200.safetensors`: [link](https://civitai.com/api/download/models/245614?type=Model&format=SafeTensor&size=pruned&fp=fp16)
|
||||||
|
download_models(["BluePencilXL_v200"])
|
||||||
|
|
||||||
|
# Load models
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
|
||||||
|
model_manager.load_models(["models/stable_diffusion_xl/bluePencilXL_v200.safetensors"])
|
||||||
|
pipe = SDXLImagePipeline.from_model_manager(model_manager)
|
||||||
|
|
||||||
|
prompt = "masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait,"
|
||||||
|
negative_prompt = "worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,"
|
||||||
|
|
||||||
|
torch.manual_seed(0)
|
||||||
|
image = pipe(
|
||||||
|
prompt=prompt,
|
||||||
|
negative_prompt=negative_prompt,
|
||||||
|
cfg_scale=6,
|
||||||
|
height=1024, width=1024, num_inference_steps=60,
|
||||||
|
)
|
||||||
|
image.save("1024.jpg")
|
||||||
|
|
||||||
|
```
|
||||||
50
docs/source/model/StableVideoDiffusion.md
Normal file
50
docs/source/model/StableVideoDiffusion.md
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
# Stable Video Diffusion
|
||||||
|
|
||||||
|
## 相关链接
|
||||||
|
|
||||||
|
* 论文:[Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets](https://arxiv.org/abs/2311.15127)
|
||||||
|
* 模型
|
||||||
|
* Stable Video Diffusion v1
|
||||||
|
* [HuggingFace](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid)
|
||||||
|
* [ModelScope](https://modelscope.cn/models/AI-ModelScope/stable-video-diffusion-img2vid)
|
||||||
|
* Stable Video Diffusion v1-xt
|
||||||
|
* [HuggingFace](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt)
|
||||||
|
* [ModelScope](https://modelscope.cn/models/AI-ModelScope/stable-video-diffusion-img2vid-xt)
|
||||||
|
* Stable Video Diffusion v1.1-xt
|
||||||
|
* [HuggingFace](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt-1-1)
|
||||||
|
* [ModelScope](https://modelscope.cn/models/cjc1887415157/stable-video-diffusion-img2vid-xt-1-1)
|
||||||
|
|
||||||
|
## 模型介绍
|
||||||
|
|
||||||
|
Stable Video Diffusion 模型是 StabilityAI 训练并开源的图生视频模型,该模型与 Stable Diffusion 模型类似,采用三段式的模型架构。
|
||||||
|
|
||||||
|
* Image Encoder 采用了 CLIP 模型中的 ViT 部分,用于将输入的图像转化为 Embedding。
|
||||||
|
* VAE 分为 Encoder 和 Decoder 部分,Encoder 部分与 Stable Diffusion v1.x 完全相同,仅在图像层面对视频进行逐帧压缩;Decoder 部分在 Stable Diffusion v1.x VAE Decoder 的基础上增加了 3D 的卷积层并进一步进行了训练,用于消除逐帧处理过程中的闪烁问题。
|
||||||
|
* UNet 部分同时将 Image Encoder 和 VAE Encoder 的输出作为输入,用于在 Latent Space 中进行迭代去噪。
|
||||||
|
|
||||||
|
Stable Video Diffusion 模型可以把输入的图像作为视频第一帧,并生成后续的 24 帧。但值得注意的是,虽然理论上可以继续分段生成更长视频,但分段之间缺乏连续性,因此我们不建议用这个模型分段生成较长视频。
|
||||||
|
|
||||||
|
Stable Video Diffusion 的生成效果:
|
||||||
|
|
||||||
|
<video width="512" height="256" controls>
|
||||||
|
<source src="https://github.com/user-attachments/assets/2696b50c-96b8-48fd-a30e-7f69c3c6839c" type="video/mp4">
|
||||||
|
您的浏览器不支持Video标签。
|
||||||
|
</video>
|
||||||
|
|
||||||
|
## 代码样例
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import save_video, ModelManager, SVDVideoPipeline
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
|
||||||
|
model_manager = ModelManager(model_id_list=["stable-video-diffusion-img2vid-xt"])
|
||||||
|
pipe = SVDVideoPipeline.from_model_manager(model_manager)
|
||||||
|
video = pipe(
|
||||||
|
input_image=Image.open("your_input_image.png").resize((1024, 576)),
|
||||||
|
num_frames=25, fps=15, height=576, width=1024,
|
||||||
|
motion_bucket_id=127,
|
||||||
|
num_inference_steps=50
|
||||||
|
)
|
||||||
|
save_video(video, "output_video.mp4", fps=15, quality=5)
|
||||||
|
```
|
||||||
4
docs/source/requirement.txt
Normal file
4
docs/source/requirement.txt
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
recommonmark
|
||||||
|
sphinx_rtd_theme
|
||||||
|
myst-parser
|
||||||
|
sphinx-markdown-tables
|
||||||
68
docs/source/tutorial/ASimpleExample.md
Normal file
68
docs/source/tutorial/ASimpleExample.md
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
# 快速开始
|
||||||
|
|
||||||
|
在这篇文档中,我们通过一段代码为你介绍如何快速上手使用 DiffSynth-Studio 进行创作。
|
||||||
|
|
||||||
|
## 安装
|
||||||
|
|
||||||
|
使用以下命令从 GitHub 克隆并安装 DiffSynth-Studio。更多信息请参考[安装](./Installation.md)。
|
||||||
|
|
||||||
|
```shell
|
||||||
|
git clone https://github.com/modelscope/DiffSynth-Studio.git
|
||||||
|
cd DiffSynth-Studio
|
||||||
|
pip install -e .
|
||||||
|
```
|
||||||
|
|
||||||
|
## 一键运行!
|
||||||
|
|
||||||
|
通过运行以下代码,我们将会下载模型、加载模型、生成图像。
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDXLImagePipeline
|
||||||
|
|
||||||
|
model_manager = ModelManager(device="cuda", model_id_list=["StableDiffusionXL_v1"])
|
||||||
|
pipe = SDXLImagePipeline.from_model_manager(model_manager)
|
||||||
|
|
||||||
|
image = pipe(
|
||||||
|
prompt="Diffuse light particles in the universe",
|
||||||
|
height=576, width=1024, seed=0
|
||||||
|
)
|
||||||
|
image.save("image.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
从这个例子中,我们可以看到,DiffSynth 中有两个关键模块:`ModelManager` 和 `Pipeline`,接下来我们详细介绍。
|
||||||
|
|
||||||
|
## 下载和加载模型
|
||||||
|
|
||||||
|
`ModelManager` 负责下载和加载模型,通过以下代码可以直接一步完成。
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDXLImagePipeline
|
||||||
|
|
||||||
|
model_manager = ModelManager(device="cuda", model_id_list=["StableDiffusionXL_v1"])
|
||||||
|
```
|
||||||
|
|
||||||
|
当然,我们也支持分步完成,以下代码和上述代码的行为是等价的。
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import download_models, ModelManager
|
||||||
|
|
||||||
|
download_models(["StableDiffusionXL_v1"])
|
||||||
|
model_manager = ModelManager(device="cuda")
|
||||||
|
model_manager.load_models(["models/stable_diffusion_xl/sd_xl_base_1.0.safetensors"])
|
||||||
|
```
|
||||||
|
|
||||||
|
下载模型时,我们支持从 [ModelScope](https://www.modelscope.cn/) 和 [HuggingFace](https://huggingface.co/) 下载模型,也支持下载非预置的模型,关于模型下载的更多信息请参考[模型下载](./DownloadModels.md)。
|
||||||
|
|
||||||
|
加载模型时,你可以把所有想要加载的模型路径放入其中。对于 `.safetensors` 等格式的模型权重文件,`ModelManager` 在加载后会自动判断模型类型;对于文件夹格式的模型,`ModelManager` 会尝试解析其中的 `config.json` 文件并尝试调用 `transformers` 等第三方库中的对应模块。关于 DiffSynth-Studio 支持的模型,请参考[支持的模型](./Models.md)。
|
||||||
|
|
||||||
|
## 构建 Pipeline
|
||||||
|
|
||||||
|
DiffSynth-Studio 提供了多个推理 `Pipeline`,这些 `Pipeline` 可以直接通过 `ModelManager` 获取所需的模型并初始化。例如,Kolors(可图)模型的文生图 `Pipeline` 可以这样构建:
|
||||||
|
|
||||||
|
```python
|
||||||
|
pipe = SDXLImagePipeline.from_model_manager(model_manager)
|
||||||
|
```
|
||||||
|
|
||||||
|
更多用于图像生成和视频生成的 `Pipeline` 详见[推理流水线](./Pipelines.md)。
|
||||||
34
docs/source/tutorial/DownloadModels.md
Normal file
34
docs/source/tutorial/DownloadModels.md
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
# 下载模型
|
||||||
|
|
||||||
|
我们在 DiffSynth-Studio 中预置了一些主流 Diffusion 模型的下载链接,你可以下载并使用这些模型。
|
||||||
|
|
||||||
|
## 下载预置模型
|
||||||
|
|
||||||
|
你可以直接使用 `download_models` 函数下载预置的模型文件,其中模型 ID 可参考 [config file](https://github.com/modelscope/DiffSynth-Studio/blob/main/diffsynth/configs/model_config.py)。
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import download_models
|
||||||
|
|
||||||
|
download_models(["FLUX.1-dev"])
|
||||||
|
```
|
||||||
|
|
||||||
|
对于 VSCode 用户,激活 Pylance 或其他 Python 语言服务后,在代码中输入 `""` 即可显示支持的所有模型 ID。
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
## 下载非预置模型
|
||||||
|
|
||||||
|
你可以选择 [ModelScope](https://modelscope.cn/models) 和 [HuggingFace](https://huggingface.co/models) 两个下载源中的模型。当然,你也可以通过浏览器等工具选择手动下载自己所需的模型。
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import download_customized_models
|
||||||
|
|
||||||
|
download_customized_models(
|
||||||
|
model_id="Kwai-Kolors/Kolors",
|
||||||
|
origin_file_path="vae/diffusion_pytorch_model.fp16.bin",
|
||||||
|
local_dir="models/kolors/Kolors/vae",
|
||||||
|
downloading_priority=["ModelScope", "HuggingFace"]
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
在这段代码中,我们将会按照下载的优先级,优先从 `ModelScope` 下载,在 ID 为 `Kwai-Kolors/Kolors` 的[模型库](https://modelscope.cn/models/Kwai-Kolors/Kolors)中,把文件 `vae/diffusion_pytorch_model.fp16.bin` 下载到本地的路径 `models/kolors/Kolors/vae` 中。
|
||||||
49
docs/source/tutorial/Extensions.md
Normal file
49
docs/source/tutorial/Extensions.md
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
# 扩展功能
|
||||||
|
|
||||||
|
本文档介绍了一些在 DiffSynth 实现的 Diffusion 模型之外的相关技术,这些模型在图像和视频处理方面具有显著的应用潜力。
|
||||||
|
|
||||||
|
- **[RIFE](https://github.com/hzwer/ECCV2022-RIFE)**:RIFE 是一个基于实时中间流估计的帧插值方法。采用 IFNet 结构的模型,能够以很快的速度端到端估计中间流。RIFE 不依赖于预训练的光流模型,能够支持任意时间步的帧插值,通过时间编码输入进行处理。
|
||||||
|
|
||||||
|
在这段代码中,我们用 RIFE 模型把视频的帧数提升到原来的两倍。
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import VideoData, ModelManager, save_video
|
||||||
|
from diffsynth.extensions.RIFE import RIFEInterpolater
|
||||||
|
|
||||||
|
model_manager = ModelManager(model_id_list=["RIFE"])
|
||||||
|
rife = RIFEInterpolater.from_model_manager(model_manager)
|
||||||
|
video = VideoData("input_video.mp4", height=512, width=768).raw_data()
|
||||||
|
video = rife.interpolate(video)
|
||||||
|
save_video(video, "output_video.mp4", fps=60)
|
||||||
|
```
|
||||||
|
|
||||||
|
- **[ESRGAN](https://github.com/xinntao/ESRGAN)**: ESRGAN 是一个图像超分辨率模型,能够实现四倍的分辨率提升。该方法通过优化网络架构、对抗损失和感知损失,显著提升了生成图像的真实感。
|
||||||
|
|
||||||
|
在这段代码中,我们用 ESRGAN 模型把图像分辨率提升到原来的四倍。
|
||||||
|
|
||||||
|
```python
|
||||||
|
from PIL import Image
|
||||||
|
from diffsynth import ModelManager
|
||||||
|
from diffsynth.extensions.ESRGAN import ESRGAN
|
||||||
|
|
||||||
|
model_manager = ModelManager(model_id_list=["ESRGAN_x4"])
|
||||||
|
esrgan = ESRGAN.from_model_manager(model_manager)
|
||||||
|
image = Image.open("input_image.jpg")
|
||||||
|
image = esrgan.upscale(image)
|
||||||
|
image.save("output_image.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|
- **[FastBlend](https://arxiv.org/abs/2311.09265)**: FastBlend 不依赖模型的视频去闪烁算法,在使用图像生成模型逐帧处理过的视频(风格视频)中,通常会出现闪烁问题,FastBlend 则可以根据原视频(引导视频)中的运动特征,消除风格视频中的闪烁。
|
||||||
|
|
||||||
|
在这段代码中,我们用 FastBlend 把风格视频中的闪烁效果删除。
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import VideoData, save_video
|
||||||
|
from diffsynth.extensions.FastBlend import FastBlendSmoother
|
||||||
|
|
||||||
|
fastblend = FastBlendSmoother()
|
||||||
|
guide_video = VideoData("guide_video.mp4", height=512, width=768).raw_data()
|
||||||
|
style_video = VideoData("style_video.mp4", height=512, width=768).raw_data()
|
||||||
|
output_video = fastblend(style_video, original_frames=guide_video)
|
||||||
|
save_video(output_video, "output_video.mp4", fps=30)
|
||||||
|
```
|
||||||
26
docs/source/tutorial/Installation.md
Normal file
26
docs/source/tutorial/Installation.md
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
# 安装
|
||||||
|
|
||||||
|
目前,DiffSynth-Studio 支持从 GitHub 克隆安装或使用 pip 安装,我们建议用户从 GitHub 克隆安装,从而体验最新的功能。
|
||||||
|
|
||||||
|
## 从源码下载
|
||||||
|
|
||||||
|
1. 克隆源码仓库:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone https://github.com/modelscope/DiffSynth-Studio.git
|
||||||
|
```
|
||||||
|
|
||||||
|
2. 进入项目目录并安装:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd DiffSynth-Studio
|
||||||
|
pip install -e .
|
||||||
|
```
|
||||||
|
|
||||||
|
## 使用 PyPI 下载
|
||||||
|
|
||||||
|
直接通过 PyPI 安装(功能更新存在延后,不建议使用这种方式):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install diffsynth
|
||||||
|
```
|
||||||
18
docs/source/tutorial/Models.md
Normal file
18
docs/source/tutorial/Models.md
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
# 模型
|
||||||
|
|
||||||
|
目前为止,DiffSynth Studio 支持的模型如下所示:
|
||||||
|
|
||||||
|
* [CogVideoX](https://huggingface.co/THUDM/CogVideoX-5b)
|
||||||
|
* [FLUX](https://huggingface.co/black-forest-labs/FLUX.1-dev)
|
||||||
|
* [ExVideo](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1)
|
||||||
|
* [Kolors](https://huggingface.co/Kwai-Kolors/Kolors)
|
||||||
|
* [Stable Diffusion 3](https://huggingface.co/stabilityai/stable-diffusion-3-medium)
|
||||||
|
* [Stable Video Diffusion](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt)
|
||||||
|
* [Hunyuan-DiT](https://github.com/Tencent/HunyuanDiT)
|
||||||
|
* [RIFE](https://github.com/hzwer/ECCV2022-RIFE)
|
||||||
|
* [ESRGAN](https://github.com/xinntao/ESRGAN)
|
||||||
|
* [Ip-Adapter](https://github.com/tencent-ailab/IP-Adapter)
|
||||||
|
* [AnimateDiff](https://github.com/guoyww/animatediff/)
|
||||||
|
* [ControlNet](https://github.com/lllyasviel/ControlNet)
|
||||||
|
* [Stable Diffusion XL](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
|
||||||
|
* [Stable Diffusion](https://huggingface.co/runwayml/stable-diffusion-v1-5)
|
||||||
22
docs/source/tutorial/Pipelines.md
Normal file
22
docs/source/tutorial/Pipelines.md
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
# 流水线
|
||||||
|
|
||||||
|
DiffSynth-Studio 中包括多个流水线,分为图像生成和视频生成两类。
|
||||||
|
|
||||||
|
## 图像生成流水线
|
||||||
|
|
||||||
|
| Pipeline | Models |
|
||||||
|
|----------------------------|----------------------------------------------------------------|
|
||||||
|
| SDImagePipeline | text_encoder: SDTextEncoder<br>unet: SDUNet<br>vae_decoder: SDVAEDecoder<br>vae_encoder: SDVAEEncoder<br>controlnet: MultiControlNetManager<br>ipadapter_image_encoder: IpAdapterCLIPImageEmbedder<br>ipadapter: SDIpAdapter |
|
||||||
|
| SDXLImagePipeline | text_encoder: SDXLTextEncoder<br>text_encoder_2: SDXLTextEncoder2<br>text_encoder_kolors: ChatGLMModel<br>unet: SDXLUNet<br>vae_decoder: SDXLVAEDecoder<br>vae_encoder: SDXLVAEEncoder<br>controlnet: MultiControlNetManager<br>ipadapter_image_encoder: IpAdapterXLCLIPImageEmbedder<br>ipadapter: SDXLIpAdapter |
|
||||||
|
| SD3ImagePipeline | text_encoder_1: SD3TextEncoder1<br>text_encoder_2: SD3TextEncoder2<br>text_encoder_3: SD3TextEncoder3<br>dit: SD3DiT<br>vae_decoder: SD3VAEDecoder<br>vae_encoder: SD3VAEEncoder |
|
||||||
|
| HunyuanDiTImagePipeline | text_encoder: HunyuanDiTCLIPTextEncoder<br>text_encoder_t5: HunyuanDiTT5TextEncoder<br>dit: HunyuanDiT<br>vae_decoder: SDVAEDecoder<br>vae_encoder: SDVAEEncoder |
|
||||||
|
| FluxImagePipeline | text_encoder_1: FluxTextEncoder1<br>text_encoder_2: FluxTextEncoder2<br>dit: FluxDiT<br>vae_decoder: FluxVAEDecoder<br>vae_encoder: FluxVAEEncoder |
|
||||||
|
|
||||||
|
## 视频生成流水线
|
||||||
|
|
||||||
|
| Pipeline | Models |
|
||||||
|
|----------------------------|----------------------------------------------------------------|
|
||||||
|
| SDVideoPipeline | text_encoder: SDTextEncoder<br>unet: SDUNet<br>vae_decoder: SDVAEDecoder<br>vae_encoder: SDVAEEncoder<br>controlnet: MultiControlNetManager<br>ipadapter_image_encoder: IpAdapterCLIPImageEmbedder<br>ipadapter: SDIpAdapter<br>motion_modules: SDMotionModel |
|
||||||
|
| SDXLVideoPipeline | text_encoder: SDXLTextEncoder<br>text_encoder_2: SDXLTextEncoder2<br>text_encoder_kolors: ChatGLMModel<br>unet: SDXLUNet<br>vae_decoder: SDXLVAEDecoder<br>vae_encoder: SDXLVAEEncoder<br>ipadapter_image_encoder: IpAdapterXLCLIPImageEmbedder<br>ipadapter: SDXLIpAdapter<br>motion_modules: SDXLMotionModel |
|
||||||
|
| SVDVideoPipeline | image_encoder: SVDImageEncoder<br>unet: SVDUNet<br>vae_encoder: SVDVAEEncoder<br>vae_decoder: SVDVAEDecoder |
|
||||||
|
| CogVideoPipeline | text_encoder: FluxTextEncoder2<br>dit: CogDiT<br>vae_encoder: CogVAEEncoder<br>vae_decoder: CogVAEDecoder |
|
||||||
11
docs/source/tutorial/Schedulers.md
Normal file
11
docs/source/tutorial/Schedulers.md
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
# 调度器
|
||||||
|
|
||||||
|
调度器(Scheduler)控制模型的整个去噪(或采样)过程。在加载 Pipeline 时,DiffSynth 会自动选择最适合当前 Pipeline 的调度器,**无需额外配置**。
|
||||||
|
|
||||||
|
我们支持的调度器包括:
|
||||||
|
|
||||||
|
- **EnhancedDDIMScheduler**:扩展了去噪扩散概率模型(DDPM)中的去噪过程,引入了非马尔可夫指导。
|
||||||
|
|
||||||
|
- **FlowMatchScheduler**:实现了 [Stable Diffusion 3](https://arxiv.org/abs/2403.03206) 中提出的流匹配(Flow Matching)采样方法。
|
||||||
|
|
||||||
|
- **ContinuousODEScheduler**:基于常微分方程(ODE)的调度器。
|
||||||
25
docs/source_en/.readthedocs.yaml
Normal file
25
docs/source_en/.readthedocs.yaml
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
# .readthedocs.yaml
|
||||||
|
# Read the Docs configuration file
|
||||||
|
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
|
||||||
|
|
||||||
|
# Required
|
||||||
|
version: 2
|
||||||
|
|
||||||
|
# Set the version of Python and other tools you might need
|
||||||
|
build:
|
||||||
|
os: ubuntu-22.04
|
||||||
|
tools:
|
||||||
|
python: "3.11"
|
||||||
|
|
||||||
|
python:
|
||||||
|
install:
|
||||||
|
- requirements: docs/source_en/requirement.txt
|
||||||
|
# Build documentation in the docs/ directory with Sphinx
|
||||||
|
sphinx:
|
||||||
|
configuration: docs/source_en/conf.py
|
||||||
|
|
||||||
|
# We recommend specifying your dependencies to enable reproducible builds:
|
||||||
|
# https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
|
||||||
|
# python:
|
||||||
|
# install:
|
||||||
|
# - requirements: docs/requirements.txt
|
||||||
50
docs/source_en/conf.py
Normal file
50
docs/source_en/conf.py
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
# Configuration file for the Sphinx documentation builder.
|
||||||
|
#
|
||||||
|
# For the full list of built-in configuration values, see the documentation:
|
||||||
|
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
||||||
|
|
||||||
|
# -- Project information -----------------------------------------------------
|
||||||
|
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
|
||||||
|
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
sys.path.insert(0, os.path.abspath('../../diffsynth'))
|
||||||
|
|
||||||
|
project = 'DiffSynth-Studio'
|
||||||
|
copyright = '2024, ModelScope'
|
||||||
|
author = 'ModelScope'
|
||||||
|
release = '0.1.0'
|
||||||
|
|
||||||
|
|
||||||
|
# -- General configuration ---------------------------------------------------
|
||||||
|
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
|
||||||
|
|
||||||
|
extensions = [
|
||||||
|
'sphinx.ext.autodoc',
|
||||||
|
'sphinx.ext.napoleon',
|
||||||
|
'sphinx.ext.doctest',
|
||||||
|
'sphinx.ext.intersphinx',
|
||||||
|
'sphinx.ext.todo',
|
||||||
|
'sphinx.ext.coverage',
|
||||||
|
'sphinx.ext.imgmath',
|
||||||
|
'sphinx.ext.viewcode',
|
||||||
|
'recommonmark',
|
||||||
|
'sphinx_markdown_tables'
|
||||||
|
]
|
||||||
|
|
||||||
|
templates_path = ['_templates']
|
||||||
|
exclude_patterns = []
|
||||||
|
|
||||||
|
|
||||||
|
source_suffix = ['.rst', '.md']
|
||||||
|
# -- Options for HTML output -------------------------------------------------
|
||||||
|
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
|
||||||
|
|
||||||
|
html_theme = 'sphinx_rtd_theme'
|
||||||
|
html_static_path = ['_static']
|
||||||
|
# multi-language docs
|
||||||
|
language = 'en'
|
||||||
|
locale_dirs = ['../locales/'] # path is example but recommended.
|
||||||
|
gettext_compact = False # optional.
|
||||||
|
gettext_uuid = True # optional.
|
||||||
135
docs/source_en/creating/AdaptersForImageSynthesis.md
Normal file
135
docs/source_en/creating/AdaptersForImageSynthesis.md
Normal file
@@ -0,0 +1,135 @@
|
|||||||
|
# ControlNet, LoRA, IP-Adapter — Precision Control Technology
|
||||||
|
|
||||||
|
Based on the text-to-image model, various adapter-based models can be used to control the generation process.
|
||||||
|
|
||||||
|
Let's download the models we'll be using in the upcoming examples:
|
||||||
|
|
||||||
|
* A highly praised Stable Diffusion XL architecture anime-style model
|
||||||
|
* A ControlNet model that supports multiple control modes
|
||||||
|
* A LoRA model for the Stable Diffusion XL model
|
||||||
|
* An IP-Adapter model and its corresponding image encoder
|
||||||
|
|
||||||
|
Please note that the model names below are download identifiers and should be used exactly as written.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import download_models
|
||||||
|
|
||||||
|
download_models([
|
||||||
|
"BluePencilXL_v200",
|
||||||
|
"ControlNet_union_sdxl_promax",
|
||||||
|
"SDXL_lora_zyd232_ChineseInkStyle_SDXL_v1_0",
|
||||||
|
|
||||||
|
"IP-Adapter-SDXL"
|
||||||
|
])
|
||||||
|
```
|
||||||
|
|
||||||
|
Using basic text-to-image functionality to generate a picture.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDXLImagePipeline
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
|
||||||
|
model_manager.load_models(["models/stable_diffusion_xl/bluePencilXL_v200.safetensors"])
pipe = SDXLImagePipeline.from_model_manager(model_manager)
|
||||||
|
torch.manual_seed(1)
|
||||||
|
image = pipe(
|
||||||
|
prompt="masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait,",
|
||||||
|
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
|
||||||
|
cfg_scale=6, num_inference_steps=60,
|
||||||
|
)
|
||||||
|
image.save("image.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
Next, let's transform this graceful underwater dancer into a fire mage! We'll activate the ControlNet to maintain the structure of the image while modifying the prompt.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDXLImagePipeline, ControlNetConfigUnit
|
||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
|
||||||
|
model_manager.load_models([
|
||||||
|
"models/stable_diffusion_xl/bluePencilXL_v200.safetensors",
|
||||||
|
"models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors"
|
||||||
|
])
|
||||||
|
pipe = SDXLImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
|
||||||
|
ControlNetConfigUnit("depth", "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors", scale=1)
|
||||||
|
])
|
||||||
|
torch.manual_seed(2)
|
||||||
|
image = pipe(
|
||||||
|
prompt="masterpiece, best quality, solo, long hair, wavy hair, pink hair, red eyes, red dress, medium breasts, dress, fire ball, fire background, floating hair, refraction, portrait,",
|
||||||
|
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw, white background",
|
||||||
|
cfg_scale=6, num_inference_steps=60,
|
||||||
|
controlnet_image=Image.open("image.jpg")
|
||||||
|
)
|
||||||
|
image.save("image_controlnet.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
Isn't that cool? There's more! Add a LoRA to make the image closer to the flat style of hand-drawn comics. This LoRA requires certain trigger words to take effect, which is mentioned on the original author's model page. Remember to add the trigger words at the beginning of the prompt.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDXLImagePipeline, ControlNetConfigUnit
|
||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
|
||||||
|
model_manager.load_models([
|
||||||
|
"models/stable_diffusion_xl/bluePencilXL_v200.safetensors",
|
||||||
|
"models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors"
|
||||||
|
])
|
||||||
|
model_manager.load_lora("models/lora/zyd232_ChineseInkStyle_SDXL_v1_0.safetensors", lora_alpha=1.0)
|
||||||
|
pipe = SDXLImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
|
||||||
|
ControlNetConfigUnit("depth", "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors", scale=1.0)
|
||||||
|
])
|
||||||
|
torch.manual_seed(3)
|
||||||
|
image = pipe(
|
||||||
|
prompt="zydink, ink sketch, flat anime, masterpiece, best quality, solo, long hair, wavy hair, pink hair, red eyes, red dress, medium breasts, dress, fire ball, fire background, floating hair, refraction, portrait,",
|
||||||
|
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw, white background",
|
||||||
|
cfg_scale=6, num_inference_steps=60,
|
||||||
|
controlnet_image=Image.open("image.jpg")
|
||||||
|
)
|
||||||
|
image.save("image_lora.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
Not done yet! Find a Chinese painting with ink-wash style as a style guide, activate the IP-Adapter, and let classical art collide with modern aesthetics!
|
||||||
|
|
||||||
|
| Let's use this image as a style guide. ||
|
||||||
|
|-|-|
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDXLImagePipeline, ControlNetConfigUnit
|
||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
|
||||||
|
model_manager.load_models([
|
||||||
|
"models/stable_diffusion_xl/bluePencilXL_v200.safetensors",
|
||||||
|
"models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors",
|
||||||
|
"models/IpAdapter/stable_diffusion_xl/ip-adapter_sdxl.bin",
|
||||||
|
"models/IpAdapter/stable_diffusion_xl/image_encoder/model.safetensors",
|
||||||
|
])
|
||||||
|
model_manager.load_lora("models/lora/zyd232_ChineseInkStyle_SDXL_v1_0.safetensors", lora_alpha=1.0)
|
||||||
|
pipe = SDXLImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
|
||||||
|
ControlNetConfigUnit("depth", "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors", scale=1.0)
|
||||||
|
])
|
||||||
|
torch.manual_seed(2)
|
||||||
|
image = pipe(
|
||||||
|
prompt="zydink, ink sketch, flat anime, masterpiece, best quality, solo, long hair, wavy hair, pink hair, red eyes, red dress, medium breasts, dress, fire ball, fire background, floating hair, refraction, portrait,",
|
||||||
|
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw, white background",
|
||||||
|
cfg_scale=6, num_inference_steps=60,
|
||||||
|
controlnet_image=Image.open("image.jpg"),
|
||||||
|
ipadapter_images=[Image.open("ink_style.jpg")],
|
||||||
|
ipadapter_use_instant_style=True, ipadapter_scale=0.5
|
||||||
|
)
|
||||||
|
image.save("image_ipadapter.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
The joy of generating images with Diffusion lies in the combination of various ecosystem models, which can realize all kinds of creative ideas.
|
||||||
64
docs/source_en/creating/BasicImageSynthesis.md
Normal file
64
docs/source_en/creating/BasicImageSynthesis.md
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
# Text-to-Image, Image-to-Image, and High-Resolution Restoration - First Encounter with the Dazzling Diffusion.
|
||||||
|
|
||||||
|
Load the text-to-image model, here we use an anime-style model from Civitai as an example.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import torch
|
||||||
|
from diffsynth import ModelManager, SDImagePipeline, download_models
|
||||||
|
|
||||||
|
download_models(["AingDiffusion_v12"])
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
|
||||||
|
model_manager.load_models(["models/stable_diffusion/aingdiffusion_v12.safetensors"])
|
||||||
|
pipe = SDImagePipeline.from_model_manager(model_manager)
|
||||||
|
```
|
||||||
|
|
||||||
|
Generate a picture to give it a try.
|
||||||
|
|
||||||
|
```python
|
||||||
|
torch.manual_seed(0)
|
||||||
|
image = pipe(
|
||||||
|
prompt="masterpiece, best quality, a girl with long silver hair",
|
||||||
|
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
|
||||||
|
height=512, width=512, num_inference_steps=80,
|
||||||
|
)
|
||||||
|
image.save("image.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|
Ah, a lovely young lady.
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
Use the image-to-image feature to turn her hair red, simply by adding `input_image` and `denoising_strength` as parameters. The `denoising_strength` controls the intensity of the noise added, when set to 0, the generated image will be identical to the input image, and when set to 1, it will be completely randomly generated.
|
||||||
|
|
||||||
|
```python
|
||||||
|
torch.manual_seed(1)
|
||||||
|
image_edited = pipe(
|
||||||
|
prompt="masterpiece, best quality, a girl with long red hair",
|
||||||
|
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
|
||||||
|
height=512, width=512, num_inference_steps=80,
|
||||||
|
input_image=image, denoising_strength=0.6,
|
||||||
|
)
|
||||||
|
image_edited.save("image_edited.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|
Ah, a cute girl with red hair.
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
Since the model itself was trained at a resolution of 512*512, the image appears a bit blurry. However, we can utilize the model's own capabilities to refine the image and add details. Specifically, this involves increasing the resolution and then using image-to-image generation.
|
||||||
|
```python
|
||||||
|
torch.manual_seed(2)
|
||||||
|
image_highres = pipe(
|
||||||
|
prompt="masterpiece, best quality, a girl with long red hair",
|
||||||
|
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
|
||||||
|
height=1024, width=1024, num_inference_steps=80,
|
||||||
|
input_image=image_edited.resize((1024, 1024)), denoising_strength=0.6,
|
||||||
|
)
|
||||||
|
image_highres.save("image_highres.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|
Ah, a clear and lovely girl with red hair.
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
It's worth noting that the image-to-image and high-resolution restoration features are globally supported, and currently, all of our image generation pipelines can be used in this way.
|
||||||
77
docs/source_en/creating/PromptRefine.md
Normal file
77
docs/source_en/creating/PromptRefine.md
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
# Translation and Polishing — The Magic of Prompt Words
|
||||||
|
|
||||||
|
When generating images, we need to write prompt words to describe the content of the image. Prompt words directly affect the outcome of the generation, but crafting them is also an art. Good prompt words can produce images with a high degree of aesthetic appeal. We offer a range of models to help users handle prompt words effectively.
|
||||||
|
|
||||||
|
## Translation
|
||||||
|
|
||||||
|
Most text-to-image models currently only support English prompt words, which can be challenging for users who are not native English speakers. To address this, we can use open-source translation models to translate the prompt words into English. In the following example, we take "一个女孩" (a girl) as the prompt word and use the model opus-mt-zh-en (which can be downloaded from [HuggingFace](https://huggingface.co/Helsinki-NLP/opus-mt-zh-en) or [ModelScope](https://modelscope.cn/models/moxying/opus-mt-zh-en)) for translation.
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDXLImagePipeline, Translator
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model_manager = ModelManager(
|
||||||
|
torch_dtype=torch.float16, device="cuda",
|
||||||
|
model_id_list=["BluePencilXL_v200", "opus-mt-zh-en"]
|
||||||
|
)
|
||||||
|
pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[Translator])
|
||||||
|
|
||||||
|
torch.manual_seed(0)
|
||||||
|
prompt = "一个女孩"
|
||||||
|
image = pipe(
|
||||||
|
prompt=prompt, negative_prompt="",
|
||||||
|
height=1024, width=1024, num_inference_steps=30
|
||||||
|
)
|
||||||
|
image.save("image_1.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
## Polishing
|
||||||
|
|
||||||
|
Detailed prompt words can generate images with richer details. We can use a prompt polishing model like BeautifulPrompt (which can be downloaded from [HuggingFace](https://huggingface.co/alibaba-pai/pai-bloom-1b1-text2prompt-sd) or [ModelScope](https://www.modelscope.cn/models/AI-ModelScope/pai-bloom-1b1-text2prompt-sd)) to embellish simple prompt words. This model can make the overall picture style more gorgeous.
|
||||||
|
|
||||||
|
This module can be activated simultaneously with the translation module, but please pay attention to the order: translate first, then polish.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDXLImagePipeline, Translator, BeautifulPrompt
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model_manager = ModelManager(
|
||||||
|
torch_dtype=torch.float16, device="cuda",
|
||||||
|
model_id_list=["BluePencilXL_v200", "opus-mt-zh-en", "BeautifulPrompt"]
|
||||||
|
)
|
||||||
|
pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[Translator, BeautifulPrompt])
|
||||||
|
|
||||||
|
torch.manual_seed(0)
|
||||||
|
prompt = "一个女孩"
|
||||||
|
image = pipe(
|
||||||
|
prompt=prompt, negative_prompt="",
|
||||||
|
height=1024, width=1024, num_inference_steps=30
|
||||||
|
)
|
||||||
|
image.save("image_2.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
We have also integrated a Tongyi Qwen model that can seamlessly complete the translation and polishing of prompt words in one step.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDXLImagePipeline, QwenPrompt
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model_manager = ModelManager(
|
||||||
|
torch_dtype=torch.float16, device="cuda",
|
||||||
|
model_id_list=["BluePencilXL_v200", "QwenPrompt"]
|
||||||
|
)
|
||||||
|
pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[QwenPrompt])
|
||||||
|
|
||||||
|
torch.manual_seed(0)
|
||||||
|
prompt = "一个女孩"
|
||||||
|
image = pipe(
|
||||||
|
prompt=prompt, negative_prompt="",
|
||||||
|
height=1024, width=1024, num_inference_steps=30
|
||||||
|
)
|
||||||
|
image.save("image_3.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|

|
||||||
95
docs/source_en/creating/ToonShading.md
Normal file
95
docs/source_en/creating/ToonShading.md
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
# When Image Models Meet AnimateDiff—Model Combination Technology
|
||||||
|
|
||||||
|
We have already witnessed the powerful image generation capabilities of the Stable Diffusion model and its ecosystem models. Now, we introduce a new module: AnimateDiff, which allows us to transfer the capabilities of image models to videos. In this article, we showcase an anime-style video rendering solution built on DiffSynth-Studio: Diffutoon.
|
||||||
|
|
||||||
|
## Download Models
|
||||||
|
|
||||||
|
The following examples will use many models, so let's download them first.
|
||||||
|
|
||||||
|
* An anime-style Stable Diffusion architecture model
|
||||||
|
* Two ControlNet models
|
||||||
|
* A Textual Inversion model
|
||||||
|
* An AnimateDiff model
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import download_models
|
||||||
|
|
||||||
|
download_models([
|
||||||
|
"AingDiffusion_v12",
|
||||||
|
"AnimateDiff_v2",
|
||||||
|
"ControlNet_v11p_sd15_lineart",
|
||||||
|
"ControlNet_v11f1e_sd15_tile",
|
||||||
|
"TextualInversion_VeryBadImageNegative_v1.3"
|
||||||
|
])
|
||||||
|
```
|
||||||
|
|
||||||
|
## Download Video
|
||||||
|
|
||||||
|
You can choose any video you like. We use [this video](https://www.bilibili.com/video/BV1iG411a7sQ) as a demonstration. You can download this video file with the following command, but please note, do not use it for commercial purposes without obtaining the commercial copyright from the original video creator.
|
||||||
|
|
||||||
|
```
|
||||||
|
modelscope download --dataset Artiprocher/examples_in_diffsynth data/examples/diffutoon/input_video.mp4 --local_dir ./
|
||||||
|
```
|
||||||
|
|
||||||
|
## Generate Anime
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video
|
||||||
|
import torch
|
||||||
|
|
||||||
|
# Load models
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
|
||||||
|
model_manager.load_models([
|
||||||
|
"models/stable_diffusion/aingdiffusion_v12.safetensors",
|
||||||
|
"models/AnimateDiff/mm_sd_v15_v2.ckpt",
|
||||||
|
"models/ControlNet/control_v11p_sd15_lineart.pth",
|
||||||
|
"models/ControlNet/control_v11f1e_sd15_tile.pth",
|
||||||
|
])
|
||||||
|
|
||||||
|
# Build pipeline
|
||||||
|
pipe = SDVideoPipeline.from_model_manager(
|
||||||
|
model_manager,
|
||||||
|
[
|
||||||
|
ControlNetConfigUnit(
|
||||||
|
processor_id="tile",
|
||||||
|
model_path="models/ControlNet/control_v11f1e_sd15_tile.pth",
|
||||||
|
scale=0.5
|
||||||
|
),
|
||||||
|
ControlNetConfigUnit(
|
||||||
|
processor_id="lineart",
|
||||||
|
model_path="models/ControlNet/control_v11p_sd15_lineart.pth",
|
||||||
|
scale=0.5
|
||||||
|
)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
pipe.prompter.load_textual_inversions(["models/textual_inversion/verybadimagenegative_v1.3.pt"])
|
||||||
|
|
||||||
|
# Load video
|
||||||
|
video = VideoData(
|
||||||
|
video_file="data/examples/diffutoon/input_video.mp4",
|
||||||
|
height=1536, width=1536
|
||||||
|
)
|
||||||
|
input_video = [video[i] for i in range(30)]
|
||||||
|
|
||||||
|
# Generate
|
||||||
|
torch.manual_seed(0)
|
||||||
|
output_video = pipe(
|
||||||
|
prompt="best quality, perfect anime illustration, light, a girl is dancing, smile, solo",
|
||||||
|
negative_prompt="verybadimagenegative_v1.3",
|
||||||
|
cfg_scale=7, clip_skip=2,
|
||||||
|
input_frames=input_video, denoising_strength=1.0,
|
||||||
|
controlnet_frames=input_video, num_frames=len(input_video),
|
||||||
|
num_inference_steps=10, height=1536, width=1536,
|
||||||
|
animatediff_batch_size=16, animatediff_stride=8,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Save video
|
||||||
|
save_video(output_video, "output_video.mp4", fps=30)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Effect Display
|
||||||
|
|
||||||
|
<video width="512" height="256" controls>
|
||||||
|
<source src="https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/b54c05c5-d747-4709-be5e-b39af82404dd" type="video/mp4">
|
||||||
|
Your browser does not support the Video tag.
|
||||||
|
</video>
|
||||||
98
docs/source_en/finetune/overview.md
Normal file
98
docs/source_en/finetune/overview.md
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
# Training Framework
|
||||||
|
|
||||||
|
We have implemented a training framework for text-to-image diffusion models, allowing users to effortlessly train LoRA models with our framework. Our provided scripts come with the following features:
|
||||||
|
|
||||||
|
* **Comprehensive Functionality**: Our training framework supports multi-GPU and multi-node configurations, is optimized for acceleration with DeepSpeed, and includes gradient checkpointing to accommodate models with higher memory requirements.
|
||||||
|
* **Succinct Code**: We have avoided large, complex code blocks. The general module is implemented in `diffsynth/trainers/text_to_image.py`, while model-specific training scripts contain only the minimal code necessary for the model architecture, facilitating ease of use for academic researchers.
|
||||||
|
* **Modular Design**: Built on the versatile PyTorch-Lightning framework, our training framework is decoupled in functionality, enabling developers to easily incorporate additional training techniques by modifying our scripts to suit their specific needs.
|
||||||
|
|
||||||
|
Examples of images fine-tuned with LoRA. Prompts are "一只小狗蹦蹦跳跳,周围是姹紫嫣红的鲜花,远处是山脉" (for Chinese models) or "a dog is jumping, flowers around the dog, the background is mountains and clouds" (for English models).
|
||||||
|
|
||||||
|
||FLUX.1-dev|Kolors|Stable Diffusion 3|Hunyuan-DiT|
|
||||||
|
|-|-|-|-|-|
|
||||||
|
|Without LoRA|||||
|
||||||
|
|With LoRA|||||
|
||||||
|
|
||||||
|
## Install Additional Packages
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install peft lightning
|
||||||
|
```
|
||||||
|
|
||||||
|
## Prepare the Dataset
|
||||||
|
|
||||||
|
We provide an [example dataset](https://modelscope.cn/datasets/buptwq/lora-stable-diffusion-finetune/files). You need to organize your training dataset in the following structure:
|
||||||
|
|
||||||
|
```
|
||||||
|
data/dog/
|
||||||
|
└── train
|
||||||
|
├── 00.jpg
|
||||||
|
├── 01.jpg
|
||||||
|
├── 02.jpg
|
||||||
|
├── 03.jpg
|
||||||
|
├── 04.jpg
|
||||||
|
└── metadata.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
`metadata.csv`:
|
||||||
|
|
||||||
|
```
|
||||||
|
file_name,text
|
||||||
|
00.jpg,a dog
|
||||||
|
01.jpg,a dog
|
||||||
|
02.jpg,a dog
|
||||||
|
03.jpg,a dog
|
||||||
|
04.jpg,a dog
|
||||||
|
```
|
||||||
|
|
||||||
|
Please note that if the model is a Chinese model (e.g., Hunyuan-DiT and Kolors), we recommend using Chinese text in the dataset. For example:
|
||||||
|
|
||||||
|
```
|
||||||
|
file_name,text
|
||||||
|
00.jpg,一只小狗
|
||||||
|
01.jpg,一只小狗
|
||||||
|
02.jpg,一只小狗
|
||||||
|
03.jpg,一只小狗
|
||||||
|
04.jpg,一只小狗
|
||||||
|
```
|
||||||
|
|
||||||
|
## Train LoRA Model
|
||||||
|
|
||||||
|
General parameter options:
|
||||||
|
|
||||||
|
```
|
||||||
|
--lora_target_modules LORA_TARGET_MODULES
|
||||||
|
Layers where the LoRA modules are located.
|
||||||
|
--dataset_path DATASET_PATH
|
||||||
|
Path to the dataset.
|
||||||
|
--output_path OUTPUT_PATH
|
||||||
|
Path where the model will be saved.
|
||||||
|
--steps_per_epoch STEPS_PER_EPOCH
|
||||||
|
Number of steps per epoch.
|
||||||
|
--height HEIGHT The height of the image.
|
||||||
|
--width WIDTH The width of the image.
|
||||||
|
--center_crop Whether to center crop the input image to the specified resolution. If not set, the image will be randomly cropped. The image will be resized to the specified resolution before cropping.
|
||||||
|
--random_flip Whether to randomly horizontally flip the image.
|
||||||
|
--batch_size BATCH_SIZE
|
||||||
|
Batch size for the training data loader (per device).
|
||||||
|
--dataloader_num_workers DATALOADER_NUM_WORKERS
|
||||||
|
The number of subprocesses used for data loading. A value of 0 means the data will be loaded in the main process.
|
||||||
|
--precision {32,16,16-mixed}
|
||||||
|
The precision for training.
|
||||||
|
--learning_rate LEARNING_RATE
|
||||||
|
The learning rate.
|
||||||
|
--lora_rank LORA_RANK
|
||||||
|
The dimension of the LoRA update matrix.
|
||||||
|
--lora_alpha LORA_ALPHA
|
||||||
|
The weight of the LoRA update matrix.
|
||||||
|
--use_gradient_checkpointing
|
||||||
|
Whether to use gradient checkpointing.
|
||||||
|
--accumulate_grad_batches ACCUMULATE_GRAD_BATCHES
|
||||||
|
The number of batches for gradient accumulation.
|
||||||
|
--training_strategy {auto,deepspeed_stage_1,deepspeed_stage_2,deepspeed_stage_3}
|
||||||
|
The training strategy.
|
||||||
|
--max_epochs MAX_EPOCHS
|
||||||
|
The number of training epochs.
|
||||||
|
--modelscope_model_id MODELSCOPE_MODEL_ID
|
||||||
|
The model ID on ModelScope (https://www.modelscope.cn/). If the model ID is provided, the model will be automatically uploaded to ModelScope.
|
||||||
|
```
|
||||||
70
docs/source_en/finetune/train_flux_lora.md
Normal file
70
docs/source_en/finetune/train_flux_lora.md
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
# Training FLUX LoRA
|
||||||
|
|
||||||
|
The following files will be used to build the FLUX model. You can download them from [huggingface](https://huggingface.co/black-forest-labs/FLUX.1-dev) or [modelscope](https://www.modelscope.cn/models/ai-modelscope/flux.1-dev), or you can use the following code to download these files:
|
||||||
|
```python
|
||||||
|
from diffsynth import download_models
|
||||||
|
|
||||||
|
download_models(["FLUX.1-dev"])
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
models/FLUX/
|
||||||
|
└── FLUX.1-dev
|
||||||
|
├── ae.safetensors
|
||||||
|
├── flux1-dev.safetensors
|
||||||
|
├── text_encoder
|
||||||
|
│ └── model.safetensors
|
||||||
|
└── text_encoder_2
|
||||||
|
├── config.json
|
||||||
|
├── model-00001-of-00002.safetensors
|
||||||
|
├── model-00002-of-00002.safetensors
|
||||||
|
└── model.safetensors.index.json
|
||||||
|
```
|
||||||
|
|
||||||
|
Start the training task with the following command:
|
||||||
|
|
||||||
|
```
|
||||||
|
CUDA_VISIBLE_DEVICES="0" python examples/train/flux/train_flux_lora.py \
|
||||||
|
--pretrained_text_encoder_path models/FLUX/FLUX.1-dev/text_encoder/model.safetensors \
|
||||||
|
--pretrained_text_encoder_2_path models/FLUX/FLUX.1-dev/text_encoder_2 \
|
||||||
|
--pretrained_dit_path models/FLUX/FLUX.1-dev/flux1-dev.safetensors \
|
||||||
|
--pretrained_vae_path models/FLUX/FLUX.1-dev/ae.safetensors \
|
||||||
|
--dataset_path data/dog \
|
||||||
|
--output_path ./models \
|
||||||
|
--max_epochs 1 \
|
||||||
|
--steps_per_epoch 500 \
|
||||||
|
--height 1024 \
|
||||||
|
--width 1024 \
|
||||||
|
--center_crop \
|
||||||
|
--precision "bf16" \
|
||||||
|
--learning_rate 1e-4 \
|
||||||
|
--lora_rank 4 \
|
||||||
|
--lora_alpha 4 \
|
||||||
|
--use_gradient_checkpointing
|
||||||
|
```
|
||||||
|
|
||||||
|
For more information on the parameters, please use `python examples/train/flux/train_flux_lora.py -h` to view detailed information.
|
||||||
|
|
||||||
|
After the training is complete, use `model_manager.load_lora` to load the LoRA for inference.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, FluxImagePipeline
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
|
||||||
|
file_path_list=[
|
||||||
|
"models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
|
||||||
|
"models/FLUX/FLUX.1-dev/text_encoder_2",
|
||||||
|
"models/FLUX/FLUX.1-dev/ae.safetensors",
|
||||||
|
"models/FLUX/FLUX.1-dev/flux1-dev.safetensors"
|
||||||
|
])
|
||||||
|
model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
|
||||||
|
pipe = FluxImagePipeline.from_model_manager(model_manager)
|
||||||
|
|
||||||
|
torch.manual_seed(0)
|
||||||
|
image = pipe(
|
||||||
|
prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
|
||||||
|
num_inference_steps=30, embedded_guidance=3.5
|
||||||
|
)
|
||||||
|
image.save("image_with_lora.jpg")
|
||||||
|
```
|
||||||
72
docs/source_en/finetune/train_hunyuan_dit_lora.md
Normal file
72
docs/source_en/finetune/train_hunyuan_dit_lora.md
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
# Training Hunyuan-DiT LoRA
|
||||||
|
|
||||||
|
Building the Hunyuan DiT model requires four files. You can download these files from [HuggingFace](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT) or [ModelScope](https://www.modelscope.cn/models/modelscope/HunyuanDiT/summary). You can use the following code to download these files:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import download_models
|
||||||
|
|
||||||
|
download_models(["HunyuanDiT"])
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
models/HunyuanDiT/
|
||||||
|
├── Put Hunyuan DiT checkpoints here.txt
|
||||||
|
└── t2i
|
||||||
|
├── clip_text_encoder
|
||||||
|
│ └── pytorch_model.bin
|
||||||
|
├── model
|
||||||
|
│ └── pytorch_model_ema.pt
|
||||||
|
├── mt5
|
||||||
|
│ └── pytorch_model.bin
|
||||||
|
└── sdxl-vae-fp16-fix
|
||||||
|
└── diffusion_pytorch_model.bin
|
||||||
|
```
|
||||||
|
|
||||||
|
Use the following command to start the training task:
|
||||||
|
|
||||||
|
```
|
||||||
|
CUDA_VISIBLE_DEVICES="0" python examples/train/hunyuan_dit/train_hunyuan_dit_lora.py \
|
||||||
|
--pretrained_path models/HunyuanDiT/t2i \
|
||||||
|
--dataset_path data/dog \
|
||||||
|
--output_path ./models \
|
||||||
|
--max_epochs 1 \
|
||||||
|
--steps_per_epoch 500 \
|
||||||
|
--height 1024 \
|
||||||
|
--width 1024 \
|
||||||
|
--center_crop \
|
||||||
|
--precision "16-mixed" \
|
||||||
|
--learning_rate 1e-4 \
|
||||||
|
--lora_rank 4 \
|
||||||
|
--lora_alpha 4 \
|
||||||
|
--use_gradient_checkpointing
|
||||||
|
```
|
||||||
|
|
||||||
|
For more information about the parameters, please use `python examples/train/hunyuan_dit/train_hunyuan_dit_lora.py -h` to view detailed information.
|
||||||
|
|
||||||
|
After the training is complete, use `model_manager.load_lora` to load the LoRA for inference.
|
||||||
|
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, HunyuanDiTImagePipeline
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
|
||||||
|
file_path_list=[
|
||||||
|
"models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin",
|
||||||
|
"models/HunyuanDiT/t2i/model/pytorch_model_ema.pt",
|
||||||
|
"models/HunyuanDiT/t2i/mt5/pytorch_model.bin",
|
||||||
|
"models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin"
|
||||||
|
])
|
||||||
|
model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
|
||||||
|
pipe = HunyuanDiTImagePipeline.from_model_manager(model_manager)
|
||||||
|
|
||||||
|
torch.manual_seed(0)
|
||||||
|
image = pipe(
|
||||||
|
prompt="A little puppy hops and jumps playfully, surrounded by a profusion of colorful flowers, with a mountain range visible in the distance.",
|
||||||
|
negative_prompt="",
|
||||||
|
cfg_scale=7.5,
|
||||||
|
num_inference_steps=100, width=1024, height=1024,
|
||||||
|
)
|
||||||
|
image.save("image_with_lora.jpg")
|
||||||
|
```
|
||||||
77
docs/source_en/finetune/train_kolors_lora.md
Normal file
77
docs/source_en/finetune/train_kolors_lora.md
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
# Training Kolors LoRA
|
||||||
|
The following files will be used to build Kolors. You can download Kolors from [HuggingFace](https://huggingface.co/Kwai-Kolors/Kolors) or [ModelScope](https://modelscope.cn/models/Kwai-Kolors/Kolors). Due to precision overflow issues, we need to download an additional VAE model (from [HuggingFace](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) or [ModelScope](https://modelscope.cn/models/AI-ModelScope/sdxl-vae-fp16-fix)). You can use the following code to download these files:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import download_models
|
||||||
|
|
||||||
|
download_models(["Kolors", "SDXL-vae-fp16-fix"])
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
models
|
||||||
|
├── kolors
|
||||||
|
│ └── Kolors
|
||||||
|
│ ├── text_encoder
|
||||||
|
│ │ ├── config.json
|
||||||
|
│ │ ├── pytorch_model-00001-of-00007.bin
|
||||||
|
│ │ ├── pytorch_model-00002-of-00007.bin
|
||||||
|
│ │ ├── pytorch_model-00003-of-00007.bin
|
||||||
|
│ │ ├── pytorch_model-00004-of-00007.bin
|
||||||
|
│ │ ├── pytorch_model-00005-of-00007.bin
|
||||||
|
│ │ ├── pytorch_model-00006-of-00007.bin
|
||||||
|
│ │ ├── pytorch_model-00007-of-00007.bin
|
||||||
|
│ │ └── pytorch_model.bin.index.json
|
||||||
|
│ ├── unet
|
||||||
|
│ │ └── diffusion_pytorch_model.safetensors
|
||||||
|
│ └── vae
|
||||||
|
│ └── diffusion_pytorch_model.safetensors
|
||||||
|
└── sdxl-vae-fp16-fix
|
||||||
|
└── diffusion_pytorch_model.safetensors
|
||||||
|
```
|
||||||
|
|
||||||
|
Use the following command to start the training task:
|
||||||
|
|
||||||
|
```
|
||||||
|
CUDA_VISIBLE_DEVICES="0" python examples/train/kolors/train_kolors_lora.py \
|
||||||
|
--pretrained_unet_path models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors \
|
||||||
|
--pretrained_text_encoder_path models/kolors/Kolors/text_encoder \
|
||||||
|
--pretrained_fp16_vae_path models/sdxl-vae-fp16-fix/diffusion_pytorch_model.safetensors \
|
||||||
|
--dataset_path data/dog \
|
||||||
|
--output_path ./models \
|
||||||
|
--max_epochs 1 \
|
||||||
|
--steps_per_epoch 500 \
|
||||||
|
--height 1024 \
|
||||||
|
--width 1024 \
|
||||||
|
--center_crop \
|
||||||
|
--precision "16-mixed" \
|
||||||
|
--learning_rate 1e-4 \
|
||||||
|
--lora_rank 4 \
|
||||||
|
--lora_alpha 4 \
|
||||||
|
--use_gradient_checkpointing
|
||||||
|
```
|
||||||
|
|
||||||
|
For more information on the parameters, please use `python examples/train/kolors/train_kolors_lora.py -h` to view detailed information.
|
||||||
|
|
||||||
|
After the training is complete, use `model_manager.load_lora` to load the LoRA for inference.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
```python
from diffsynth import ModelManager, SDXLImagePipeline
import torch

model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
                             file_path_list=[
                                 "models/kolors/Kolors/text_encoder",
                                 "models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors",
                                 "models/kolors/Kolors/vae/diffusion_pytorch_model.safetensors"
                             ])
model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
pipe = SDXLImagePipeline.from_model_manager(model_manager)
|
||||||
|
|
||||||
|
torch.manual_seed(0)
|
||||||
|
image = pipe(
|
||||||
|
prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
|
||||||
|
negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
|
||||||
|
cfg_scale=7.5,
|
||||||
|
num_inference_steps=100, width=1024, height=1024,
|
||||||
|
)
|
||||||
|
image.save("image_with_lora.jpg")
|
||||||
|
```
|
||||||
57
docs/source_en/finetune/train_sd3_lora.md
Normal file
57
docs/source_en/finetune/train_sd3_lora.md
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
# Training Stable Diffusion 3 LoRA
|
||||||
|
|
||||||
|
The training script only requires one file. You can use [`sd3_medium_incl_clips.safetensors`](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips.safetensors) (without T5 Encoder) or [`sd3_medium_incl_clips_t5xxlfp16.safetensors`](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips_t5xxlfp16.safetensors) (with T5 Encoder). Please use the following code to download these files:
|
||||||
|
```python
|
||||||
|
from diffsynth import download_models
|
||||||
|
|
||||||
|
download_models(["StableDiffusion3", "StableDiffusion3_without_T5"])
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
models/stable_diffusion_3/
|
||||||
|
├── Put Stable Diffusion 3 checkpoints here.txt
|
||||||
|
├── sd3_medium_incl_clips.safetensors
|
||||||
|
└── sd3_medium_incl_clips_t5xxlfp16.safetensors
|
||||||
|
```
|
||||||
|
|
||||||
|
Use the following command to start the training task:
|
||||||
|
|
||||||
|
```
|
||||||
|
CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_3/train_sd3_lora.py \
|
||||||
|
--pretrained_path models/stable_diffusion_3/sd3_medium_incl_clips.safetensors \
|
||||||
|
--dataset_path data/dog \
|
||||||
|
--output_path ./models \
|
||||||
|
--max_epochs 1 \
|
||||||
|
--steps_per_epoch 500 \
|
||||||
|
--height 1024 \
|
||||||
|
--width 1024 \
|
||||||
|
--center_crop \
|
||||||
|
--precision "16-mixed" \
|
||||||
|
--learning_rate 1e-4 \
|
||||||
|
--lora_rank 4 \
|
||||||
|
--lora_alpha 4 \
|
||||||
|
--use_gradient_checkpointing
|
||||||
|
```
|
||||||
|
|
||||||
|
For more information on the parameters, please use `python examples/train/stable_diffusion_3/train_sd3_lora.py -h` to view detailed information.
|
||||||
|
|
||||||
|
After training is completed, use `model_manager.load_lora` to load LoRA for inference.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SD3ImagePipeline
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
|
||||||
|
file_path_list=["models/stable_diffusion_3/sd3_medium_incl_clips.safetensors"])
|
||||||
|
model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
|
||||||
|
pipe = SD3ImagePipeline.from_model_manager(model_manager)
|
||||||
|
|
||||||
|
torch.manual_seed(0)
|
||||||
|
image = pipe(
|
||||||
|
prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
|
||||||
|
negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
|
||||||
|
cfg_scale=7.5,
|
||||||
|
num_inference_steps=100, width=1024, height=1024,
|
||||||
|
)
|
||||||
|
image.save("image_with_lora.jpg")
|
||||||
|
```
|
||||||
58
docs/source_en/finetune/train_sd_lora.md
Normal file
58
docs/source_en/finetune/train_sd_lora.md
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
# Training Stable Diffusion LoRA
|
||||||
|
|
||||||
|
The training script only requires one file. We support mainstream checkpoints on [CivitAI](https://civitai.com/). By default, we use the basic Stable Diffusion v1.5. You can download it from [HuggingFace](https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors) or [ModelScope](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-v1-5/resolve/master/v1-5-pruned-emaonly.safetensors). You can use the following code to download this file:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import download_models
|
||||||
|
|
||||||
|
download_models(["StableDiffusion_v15"])
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
models/stable_diffusion
|
||||||
|
├── Put Stable Diffusion checkpoints here.txt
|
||||||
|
└── v1-5-pruned-emaonly.safetensors
|
||||||
|
```
|
||||||
|
|
||||||
|
Start the training task with the following command:
|
||||||
|
|
||||||
|
```
|
||||||
|
CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion/train_sd_lora.py \
|
||||||
|
--pretrained_path models/stable_diffusion/v1-5-pruned-emaonly.safetensors \
|
||||||
|
--dataset_path data/dog \
|
||||||
|
--output_path ./models \
|
||||||
|
--max_epochs 1 \
|
||||||
|
--steps_per_epoch 500 \
|
||||||
|
--height 512 \
|
||||||
|
--width 512 \
|
||||||
|
--center_crop \
|
||||||
|
--precision "16-mixed" \
|
||||||
|
--learning_rate 1e-4 \
|
||||||
|
--lora_rank 4 \
|
||||||
|
--lora_alpha 4 \
|
||||||
|
--use_gradient_checkpointing
|
||||||
|
```
|
||||||
|
|
||||||
|
For more information about the parameters, please use `python examples/train/stable_diffusion/train_sd_lora.py -h` to view detailed information.
|
||||||
|
|
||||||
|
After training is complete, use `model_manager.load_lora` to load LoRA for inference.
|
||||||
|
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDImagePipeline
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
|
||||||
|
file_path_list=["models/stable_diffusion/v1-5-pruned-emaonly.safetensors"])
|
||||||
|
model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
|
||||||
|
pipe = SDImagePipeline.from_model_manager(model_manager)
|
||||||
|
|
||||||
|
torch.manual_seed(0)
|
||||||
|
image = pipe(
|
||||||
|
prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
|
||||||
|
negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
|
||||||
|
cfg_scale=7.5,
|
||||||
|
num_inference_steps=100, width=512, height=512,
|
||||||
|
)
|
||||||
|
image.save("image_with_lora.jpg")
|
||||||
|
```
|
||||||
57
docs/source_en/finetune/train_sdxl_lora.md
Normal file
57
docs/source_en/finetune/train_sdxl_lora.md
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
# Training Stable Diffusion XL LoRA
|
||||||
|
|
||||||
|
|
||||||
|
The training script only requires one file. We support mainstream checkpoints on [CivitAI](https://civitai.com/). By default, we use the basic Stable Diffusion XL. You can download it from [HuggingFace](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors) or [ModelScope](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-xl-base-1.0/resolve/master/sd_xl_base_1.0.safetensors). You can also use the following code to download this file:
|
||||||
|
```python
|
||||||
|
from diffsynth import download_models
|
||||||
|
|
||||||
|
download_models(["StableDiffusionXL_v1"])
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
models/stable_diffusion_xl
|
||||||
|
├── Put Stable Diffusion XL checkpoints here.txt
|
||||||
|
└── sd_xl_base_1.0.safetensors
|
||||||
|
```
|
||||||
|
|
||||||
|
We have observed that Stable Diffusion XL may experience numerical precision overflows when using float16 precision, so we recommend that users train with float32 precision. To start the training task, use the following command:
|
||||||
|
```
|
||||||
|
CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_xl/train_sdxl_lora.py \
|
||||||
|
--pretrained_path models/stable_diffusion_xl/sd_xl_base_1.0.safetensors \
|
||||||
|
--dataset_path data/dog \
|
||||||
|
--output_path ./models \
|
||||||
|
--max_epochs 1 \
|
||||||
|
--steps_per_epoch 500 \
|
||||||
|
--height 1024 \
|
||||||
|
--width 1024 \
|
||||||
|
--center_crop \
|
||||||
|
--precision "32" \
|
||||||
|
--learning_rate 1e-4 \
|
||||||
|
--lora_rank 4 \
|
||||||
|
--lora_alpha 4 \
|
||||||
|
--use_gradient_checkpointing
|
||||||
|
```
|
||||||
|
|
||||||
|
For more information about the parameters, please use `python examples/train/stable_diffusion_xl/train_sdxl_lora.py -h` to view detailed information.
|
||||||
|
|
||||||
|
After training is complete, use `model_manager.load_lora` to load LoRA for inference.
|
||||||
|
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import ModelManager, SDXLImagePipeline
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
|
||||||
|
file_path_list=["models/stable_diffusion_xl/sd_xl_base_1.0.safetensors"])
|
||||||
|
model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
|
||||||
|
pipe = SDXLImagePipeline.from_model_manager(model_manager)
|
||||||
|
|
||||||
|
torch.manual_seed(0)
|
||||||
|
image = pipe(
|
||||||
|
prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
|
||||||
|
negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
|
||||||
|
cfg_scale=7.5,
|
||||||
|
num_inference_steps=100, width=1024, height=1024,
|
||||||
|
)
|
||||||
|
image.save("image_with_lora.jpg")
|
||||||
|
```
|
||||||
63
docs/source_en/index.rst
Normal file
63
docs/source_en/index.rst
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
.. DiffSynth-Studio documentation master file, created by
|
||||||
|
sphinx-quickstart on Thu Sep 5 16:39:24 2024.
|
||||||
|
You can adapt this file completely to your liking, but it should at least
|
||||||
|
contain the root `toctree` directive.
|
||||||
|
|
||||||
|
DiffSynth-Studio documentation
|
||||||
|
==============================
|
||||||
|
|
||||||
|
Add your content using ``reStructuredText`` syntax. See the
|
||||||
|
`reStructuredText <https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html>`_
|
||||||
|
documentation for details.
|
||||||
|
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
:caption: Quick Start
|
||||||
|
|
||||||
|
tutorial/ASimpleExample.md
|
||||||
|
tutorial/Installation.md
|
||||||
|
tutorial/DownloadModels.md
|
||||||
|
tutorial/Models.md
|
||||||
|
tutorial/Pipelines.md
|
||||||
|
tutorial/PromptProcessing.md
|
||||||
|
tutorial/Extensions.md
|
||||||
|
tutorial/Schedulers.md
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
:caption: Embarking on a Creative Journey
|
||||||
|
|
||||||
|
creating/BasicImageSynthesis.md
|
||||||
|
creating/AdaptersForImageSynthesis.md
|
||||||
|
creating/ToonShading.md
|
||||||
|
creating/PromptRefine.md
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
:caption: Model Lists
|
||||||
|
|
||||||
|
model/StableDiffusion.md
|
||||||
|
model/StableDiffusionXL.md
|
||||||
|
model/ControlNet.md
|
||||||
|
model/AnimateDiff.md
|
||||||
|
model/IPAdapter.md
|
||||||
|
model/HunyuanDiT.md
|
||||||
|
model/Kolors.md
|
||||||
|
model/StableDiffusion3.md
|
||||||
|
model/StableVideoDiffusion.md
|
||||||
|
model/ExVideo.md
|
||||||
|
model/FLUX.md
|
||||||
|
model/CogVideo.md
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
:caption: Finetuning
|
||||||
|
|
||||||
|
finetune/overview.md
|
||||||
|
finetune/train_flux_lora.md
|
||||||
|
finetune/train_kolors_lora.md
|
||||||
|
finetune/train_sd3_lora.md
|
||||||
|
finetune/train_hunyuan_dit_lora.md
|
||||||
|
finetune/train_sdxl_lora.md
|
||||||
|
finetune/train_sd_lora.md
|
||||||
4
docs/source_en/requirement.txt
Normal file
4
docs/source_en/requirement.txt
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
recommonmark
|
||||||
|
sphinx_rtd_theme
|
||||||
|
myst-parser
|
||||||
|
sphinx-markdown-tables
|
||||||
85
docs/source_en/tutorial/ASimpleExample.md
Normal file
85
docs/source_en/tutorial/ASimpleExample.md
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
# Quick Start
|
||||||
|
|
||||||
|
In this document, we introduce how to quickly get started with DiffSynth-Studio for creation through a piece of code.
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
Use the following command to clone and install DiffSynth-Studio from GitHub. For more information, please refer to [Installation](./Installation.md).
|
||||||
|
|
||||||
|
```shell
|
||||||
|
git clone https://github.com/modelscope/DiffSynth-Studio.git
|
||||||
|
cd DiffSynth-Studio
|
||||||
|
pip install -e .
|
||||||
|
```
|
||||||
|
|
||||||
|
## One-click Run!
|
||||||
|
|
||||||
|
By running the following code, we will download the model, load the model, and generate an image.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import torch
|
||||||
|
from diffsynth import ModelManager, FluxImagePipeline
|
||||||
|
|
||||||
|
model_manager = ModelManager(
|
||||||
|
torch_dtype=torch.bfloat16,
|
||||||
|
device="cuda",
|
||||||
|
model_id_list=["FLUX.1-dev"]
|
||||||
|
)
|
||||||
|
pipe = FluxImagePipeline.from_model_manager(model_manager)
|
||||||
|
|
||||||
|
torch.manual_seed(0)
|
||||||
|
image = pipe(
|
||||||
|
prompt="In a forest, a wooden plank sign reading DiffSynth",
|
||||||
|
height=576, width=1024,
|
||||||
|
)
|
||||||
|
image.save("image.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
From this example, we can see that there are two key modules in DiffSynth: `ModelManager` and `Pipeline`. We will introduce them in detail next.
|
||||||
|
|
||||||
|
## Downloading and Loading Models
|
||||||
|
|
||||||
|
`ModelManager` is responsible for downloading and loading models, which can be done in one step with the following code.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import torch
|
||||||
|
from diffsynth import ModelManager
|
||||||
|
|
||||||
|
model_manager = ModelManager(
|
||||||
|
torch_dtype=torch.bfloat16,
|
||||||
|
device="cuda",
|
||||||
|
model_id_list=["FLUX.1-dev"]
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
Of course, we also support completing this step by step, and the following code is equivalent to the above.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import torch
|
||||||
|
from diffsynth import download_models, ModelManager
|
||||||
|
|
||||||
|
download_models(["FLUX.1-dev"])
|
||||||
|
model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cuda")
|
||||||
|
model_manager.load_models([
|
||||||
|
"models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
|
||||||
|
"models/FLUX/FLUX.1-dev/text_encoder_2",
|
||||||
|
"models/FLUX/FLUX.1-dev/ae.safetensors",
|
||||||
|
"models/FLUX/FLUX.1-dev/flux1-dev.safetensors"
|
||||||
|
])
|
||||||
|
```
|
||||||
|
|
||||||
|
When downloading models, we support downloading from [ModelScope](https://www.modelscope.cn/) and [HuggingFace](https://huggingface.co/), and we also support downloading non-preset models. For more information about model downloading, please refer to [Model Download](./DownloadModels.md).
|
||||||
|
|
||||||
|
When loading models, you can put all the model paths you want to load into it. For model weight files in formats such as `.safetensors`, `ModelManager` will automatically determine the model type after loading; for folder format models, `ModelManager` will try to parse the `config.json` file within and try to call the corresponding module in third-party libraries such as `transformers`. For models supported by DiffSynth-Studio, please refer to [Supported Models](./Models.md).
|
||||||
|
|
||||||
|
## Building Pipeline
|
||||||
|
|
||||||
|
DiffSynth-Studio provides multiple inference `Pipeline`s, which can be directly obtained through `ModelManager` to get the required models and initialize. For example, the text-to-image `Pipeline` for the FLUX.1-dev model can be constructed as follows:
|
||||||
|
|
||||||
|
```python
|
||||||
|
pipe = FluxImagePipeline.from_model_manager(model_manager)
|
||||||
|
```
|
||||||
|
|
||||||
|
For more `Pipeline`s used for image generation and video generation, see [Inference Pipelines](./Pipelines.md).
|
||||||
34
docs/source_en/tutorial/DownloadModels.md
Normal file
34
docs/source_en/tutorial/DownloadModels.md
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
# Download Models
|
||||||
|
|
||||||
|
We have preset some mainstream Diffusion model download links in DiffSynth-Studio, which you can download and use.
|
||||||
|
|
||||||
|
## Download Preset Models
|
||||||
|
|
||||||
|
You can directly use the `download_models` function to download the preset model files, where the model ID can refer to the [config file](/diffsynth/configs/model_config.py).
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import download_models
|
||||||
|
|
||||||
|
download_models(["FLUX.1-dev"])
|
||||||
|
```
|
||||||
|
|
||||||
|
For VSCode users, after activating Pylance or other Python language services, typing `""` in the code will display all supported model IDs.
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
## Download Non-Preset Models
|
||||||
|
|
||||||
|
You can select models from two download sources: [ModelScope](https://modelscope.cn/models) and [HuggingFace](https://huggingface.co/models). Of course, you can also manually download the models you need through browsers or other tools.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import download_customized_models
|
||||||
|
|
||||||
|
download_customized_models(
|
||||||
|
model_id="Kwai-Kolors/Kolors",
|
||||||
|
origin_file_path="vae/diffusion_pytorch_model.fp16.bin",
|
||||||
|
local_dir="models/kolors/Kolors/vae",
|
||||||
|
downloading_priority=["ModelScope", "HuggingFace"]
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
In this code snippet, we will prioritize downloading from `ModelScope` according to the download priority, and download the file `vae/diffusion_pytorch_model.fp16.bin` from the model repository with ID `Kwai-Kolors/Kolors` in the [model library](https://modelscope.cn/models/Kwai-Kolors/Kolors) to the local path `models/kolors/Kolors/vae`.
|
||||||
49
docs/source_en/tutorial/Extensions.md
Normal file
49
docs/source_en/tutorial/Extensions.md
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
# Extension Features
|
||||||
|
|
||||||
|
This document introduces some technologies related to the Diffusion models implemented in DiffSynth, which have significant application potential in image and video processing.
|
||||||
|
|
||||||
|
- **[RIFE](https://github.com/hzwer/ECCV2022-RIFE)**: RIFE is a frame interpolation method based on real-time intermediate flow estimation. It uses a model with an IFNet structure that can quickly estimate intermediate flows end-to-end. RIFE does not rely on pre-trained optical flow models and supports frame interpolation at arbitrary time steps, processing through time-encoded inputs.
|
||||||
|
|
||||||
|
In this code snippet, we use the RIFE model to double the frame rate of a video.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import VideoData, ModelManager, save_video
|
||||||
|
from diffsynth.extensions.RIFE import RIFEInterpolater
|
||||||
|
|
||||||
|
model_manager = ModelManager(model_id_list=["RIFE"])
|
||||||
|
rife = RIFEInterpolater.from_model_manager(model_manager)
|
||||||
|
video = VideoData("input_video.mp4", height=512, width=768).raw_data()
|
||||||
|
video = rife.interpolate(video)
|
||||||
|
save_video(video, "output_video.mp4", fps=60)
|
||||||
|
```
|
||||||
|
|
||||||
|
- **[ESRGAN](https://github.com/xinntao/ESRGAN)**: ESRGAN is an image super-resolution model that can achieve a fourfold increase in resolution. This method significantly enhances the realism of generated images by optimizing network architecture, adversarial loss, and perceptual loss.
|
||||||
|
|
||||||
|
In this code snippet, we use the ESRGAN model to quadruple the resolution of an image.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from PIL import Image
|
||||||
|
from diffsynth import ModelManager
|
||||||
|
from diffsynth.extensions.ESRGAN import ESRGAN
|
||||||
|
|
||||||
|
model_manager = ModelManager(model_id_list=["ESRGAN_x4"])
|
||||||
|
esrgan = ESRGAN.from_model_manager(model_manager)
|
||||||
|
image = Image.open("input_image.jpg")
|
||||||
|
image = esrgan.upscale(image)
|
||||||
|
image.save("output_image.jpg")
|
||||||
|
```
|
||||||
|
|
||||||
|
- **[FastBlend](https://arxiv.org/abs/2311.09265)**: FastBlend is a model-free video de-flickering algorithm. Flicker often occurs in style videos processed frame by frame using image generation models. FastBlend can eliminate flicker in style videos based on the motion features in the original video (guide video).
|
||||||
|
|
||||||
|
In this code snippet, we use FastBlend to remove the flicker effect from a style video.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from diffsynth import VideoData, save_video
|
||||||
|
from diffsynth.extensions.FastBlend import FastBlendSmoother
|
||||||
|
|
||||||
|
fastblend = FastBlendSmoother()
|
||||||
|
guide_video = VideoData("guide_video.mp4", height=512, width=768).raw_data()
|
||||||
|
style_video = VideoData("style_video.mp4", height=512, width=768).raw_data()
|
||||||
|
output_video = fastblend(style_video, original_frames=guide_video)
|
||||||
|
save_video(output_video, "output_video.mp4", fps=30)
|
||||||
|
```
|
||||||
26
docs/source_en/tutorial/Installation.md
Normal file
26
docs/source_en/tutorial/Installation.md
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
# Installation
|
||||||
|
|
||||||
|
Currently, DiffSynth-Studio supports installation via cloning from GitHub or using pip. We recommend users to clone from GitHub to experience the latest features.
|
||||||
|
|
||||||
|
## From Source
|
||||||
|
|
||||||
|
1. Clone the source repository:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone https://github.com/modelscope/DiffSynth-Studio.git
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Navigate to the project directory and install:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd DiffSynth-Studio
|
||||||
|
pip install -e .
|
||||||
|
```
|
||||||
|
|
||||||
|
## From PyPI
|
||||||
|
|
||||||
|
Install directly via PyPI:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install diffsynth
|
||||||
|
```
|
||||||
18
docs/source_en/tutorial/Models.md
Normal file
18
docs/source_en/tutorial/Models.md
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
# Models
|
||||||
|
|
||||||
|
So far, the models supported by DiffSynth Studio are as follows:
|
||||||
|
|
||||||
|
* [CogVideoX](https://huggingface.co/THUDM/CogVideoX-5b)
|
||||||
|
* [FLUX](https://huggingface.co/black-forest-labs/FLUX.1-dev)
|
||||||
|
* [ExVideo](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1)
|
||||||
|
* [Kolors](https://huggingface.co/Kwai-Kolors/Kolors)
|
||||||
|
* [Stable Diffusion 3](https://huggingface.co/stabilityai/stable-diffusion-3-medium)
|
||||||
|
* [Stable Video Diffusion](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt)
|
||||||
|
* [Hunyuan-DiT](https://github.com/Tencent/HunyuanDiT)
|
||||||
|
* [RIFE](https://github.com/hzwer/ECCV2022-RIFE)
|
||||||
|
* [ESRGAN](https://github.com/xinntao/ESRGAN)
|
||||||
|
* [Ip-Adapter](https://github.com/tencent-ailab/IP-Adapter)
|
||||||
|
* [AnimateDiff](https://github.com/guoyww/animatediff/)
|
||||||
|
* [ControlNet](https://github.com/lllyasviel/ControlNet)
|
||||||
|
* [Stable Diffusion XL](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
|
||||||
|
* [Stable Diffusion](https://huggingface.co/runwayml/stable-diffusion-v1-5)
|
||||||
22
docs/source_en/tutorial/Pipelines.md
Normal file
22
docs/source_en/tutorial/Pipelines.md
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
# Pipelines
|
||||||
|
|
||||||
|
DiffSynth-Studio includes multiple pipelines, categorized into two types: image generation and video generation.
|
||||||
|
|
||||||
|
## Image Pipelines
|
||||||
|
|
||||||
|
| Pipeline | Models |
|
||||||
|
|----------------------------|----------------------------------------------------------------|
|
||||||
|
| SDImagePipeline | text_encoder: SDTextEncoder<br>unet: SDUNet<br>vae_decoder: SDVAEDecoder<br>vae_encoder: SDVAEEncoder<br>controlnet: MultiControlNetManager<br>ipadapter_image_encoder: IpAdapterCLIPImageEmbedder<br>ipadapter: SDIpAdapter |
|
||||||
|
| SDXLImagePipeline | text_encoder: SDXLTextEncoder<br>text_encoder_2: SDXLTextEncoder2<br>text_encoder_kolors: ChatGLMModel<br>unet: SDXLUNet<br>vae_decoder: SDXLVAEDecoder<br>vae_encoder: SDXLVAEEncoder<br>controlnet: MultiControlNetManager<br>ipadapter_image_encoder: IpAdapterXLCLIPImageEmbedder<br>ipadapter: SDXLIpAdapter |
|
||||||
|
| SD3ImagePipeline | text_encoder_1: SD3TextEncoder1<br>text_encoder_2: SD3TextEncoder2<br>text_encoder_3: SD3TextEncoder3<br>dit: SD3DiT<br>vae_decoder: SD3VAEDecoder<br>vae_encoder: SD3VAEEncoder |
|
||||||
|
| HunyuanDiTImagePipeline | text_encoder: HunyuanDiTCLIPTextEncoder<br>text_encoder_t5: HunyuanDiTT5TextEncoder<br>dit: HunyuanDiT<br>vae_decoder: SDVAEDecoder<br>vae_encoder: SDVAEEncoder |
|
||||||
|
| FluxImagePipeline | text_encoder_1: FluxTextEncoder1<br>text_encoder_2: FluxTextEncoder2<br>dit: FluxDiT<br>vae_decoder: FluxVAEDecoder<br>vae_encoder: FluxVAEEncoder |
|
||||||
|
|
||||||
|
## Video Pipelines
|
||||||
|
|
||||||
|
| Pipeline | Models |
|
||||||
|
|----------------------------|----------------------------------------------------------------|
|
||||||
|
| SDVideoPipeline | text_encoder: SDTextEncoder<br>unet: SDUNet<br>vae_decoder: SDVAEDecoder<br>vae_encoder: SDVAEEncoder<br>controlnet: MultiControlNetManager<br>ipadapter_image_encoder: IpAdapterCLIPImageEmbedder<br>ipadapter: SDIpAdapter<br>motion_modules: SDMotionModel |
|
||||||
|
| SDXLVideoPipeline | text_encoder: SDXLTextEncoder<br>text_encoder_2: SDXLTextEncoder2<br>text_encoder_kolors: ChatGLMModel<br>unet: SDXLUNet<br>vae_decoder: SDXLVAEDecoder<br>vae_encoder: SDXLVAEEncoder<br>ipadapter_image_encoder: IpAdapterXLCLIPImageEmbedder<br>ipadapter: SDXLIpAdapter<br>motion_modules: SDXLMotionModel |
|
||||||
|
| SVDVideoPipeline | image_encoder: SVDImageEncoder<br>unet: SVDUNet<br>vae_encoder: SVDVAEEncoder<br>vae_decoder: SVDVAEDecoder |
|
||||||
|
| CogVideoPipeline | text_encoder: FluxTextEncoder2<br>dit: CogDiT<br>vae_encoder: CogVAEEncoder<br>vae_decoder: CogVAEDecoder |
|
||||||
35
docs/source_en/tutorial/PromptProcessing.md
Normal file
35
docs/source_en/tutorial/PromptProcessing.md
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
# Prompt Processing
|
||||||
|
|
||||||
|
DiffSynth includes prompt processing functionality, which is divided into:
|
||||||
|
|
||||||
|
- **Prompt Refiners (`prompt_refiner_classes`)**: Includes prompt refinement, prompt translation from Chinese to English, and both refinement and translation of prompts. Available parameters are as follows:
|
||||||
|
|
||||||
|
- **English Prompt Refinement**: 'BeautifulPrompt', using the model [pai-bloom-1b1-text2prompt-sd](https://modelscope.cn/models/AI-ModelScope/pai-bloom-1b1-text2prompt-sd).
|
||||||
|
|
||||||
|
- **Prompt Translation from Chinese to English**: 'Translator', using the model [opus-mt-zh-en](https://modelscope.cn/models/moxying/opus-mt-zh-en).
|
||||||
|
|
||||||
|
- **Prompt Translation and Refinement**: 'QwenPrompt', using the model [Qwen2-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2-1.5B-Instruct).
|
||||||
|
|
||||||
|
- **Prompt Extenders (`prompt_extender_classes`)**: Based on Omost's prompt partition control expansion. Available parameter is:
|
||||||
|
|
||||||
|
- **Prompt Partition Expansion**: 'OmostPromter'.
|
||||||
|
|
||||||
|
## Usage Instructions
|
||||||
|
|
||||||
|
### Prompt Refiners
|
||||||
|
|
||||||
|
When loading the model pipeline, you can specify the desired prompt refiner functionality using the `prompt_refiner_classes` parameter. For example code, refer to [sd_prompt_refining.py](examples/image_synthesis/sd_prompt_refining.py).
|
||||||
|
|
||||||
|
Available `prompt_refiner_classes` parameters include: Translator, BeautifulPrompt, QwenPrompt.
|
||||||
|
|
||||||
|
```python
|
||||||
|
pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[Translator, BeautifulPrompt])
|
||||||
|
```
|
||||||
|
|
||||||
|
### Prompt Extenders
|
||||||
|
|
||||||
|
When loading the model pipeline, you can specify the desired prompt extender using the `prompt_extender_classes` parameter. For example code, refer to [omost_flux_text_to_image.py](examples/image_synthesis/omost_flux_text_to_image.py).
|
||||||
|
|
||||||
|
```python
|
||||||
|
pipe = FluxImagePipeline.from_model_manager(model_manager, prompt_extender_classes=[OmostPromter])
|
||||||
|
```
|
||||||
11
docs/source_en/tutorial/Schedulers.md
Normal file
11
docs/source_en/tutorial/Schedulers.md
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
# Schedulers
|
||||||
|
|
||||||
|
Schedulers control the entire denoising (or sampling) process of the model. When loading the Pipeline, DiffSynth automatically selects the most suitable schedulers for the current Pipeline, **requiring no additional configuration**.
|
||||||
|
|
||||||
|
The supported schedulers are:
|
||||||
|
|
||||||
|
- **EnhancedDDIMScheduler**: Extends the denoising process introduced in Denoising Diffusion Probabilistic Models (DDPM) with the non-Markovian sampling of DDIM.
|
||||||
|
|
||||||
|
- **FlowMatchScheduler**: Implements the flow matching sampling method introduced in [Stable Diffusion 3](https://arxiv.org/abs/2403.03206).
|
||||||
|
|
||||||
|
- **ContinuousODEScheduler**: A scheduler based on Ordinary Differential Equations (ODE).
|
||||||
26
readthedocs.yaml
Normal file
26
readthedocs.yaml
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
# .readthedocs.yaml
|
||||||
|
# Read the Docs configuration file
|
||||||
|
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
|
||||||
|
|
||||||
|
# Required
|
||||||
|
version: 2
|
||||||
|
|
||||||
|
# Set the version of Python and other tools you might need
|
||||||
|
build:
|
||||||
|
os: ubuntu-22.04
|
||||||
|
tools:
|
||||||
|
python: "3.11"
|
||||||
|
|
||||||
|
python:
|
||||||
|
install:
|
||||||
|
- requirements: docs/requirements.txt
|
||||||
|
|
||||||
|
# Build documentation in the docs/ directory with Sphinx
|
||||||
|
sphinx:
|
||||||
|
configuration: docs/source/conf.py
|
||||||
|
|
||||||
|
# We recommend specifying your dependencies to enable reproducible builds:
|
||||||
|
# https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
|
||||||
|
# python:
|
||||||
|
# install:
|
||||||
|
# - requirements: docs/requirements.txt
|
||||||
Reference in New Issue
Block a user