add docs (#201)

add docs.
2026-04-08 08:58:20 +00:00 · 2024-09-10 18:46:24 +08:00
parent 0b9e673fa2
commit 7f6e35fe35
32 changed files with 1622 additions and 0 deletions
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+# (every goal is forwarded to sphinx-build's "make mode", see the rules below)
+
+# You can set these variables from the command line, and also
+# from the environment for the first two (they are assigned with ?=).
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+# Makefile is phony so the catch-all rule below is never used to remake it.
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--- a/docs/make.bat
+++ b/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation.
+REM Runs from this script's directory (pushd above); SPHINXBUILD defaults to sphinx-build.
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+REM Probe sphinx-build quietly; errorlevel 9009+ means "not found" (NOTE: that exit skips popd).
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.https://www.sphinx-doc.org/
+	exit /b 1
+)
+
+if "%1" == "" goto help
+REM Forward the first argument to sphinx-build -M as the target to build.
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
--- a/docs/source/.readthedocs.yaml
+++ b/docs/source/.readthedocs.yaml
@@ -0,0 +1,27 @@
+# .readthedocs.yaml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required: version of the configuration file format
+version: 2
+
+# Set the OS image and the Python version used for the build
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.11"
+
+
+# Build the documentation with Sphinx, using the conf.py that sits next to
+# the sources in docs/source
+sphinx:
+  configuration: docs/source/conf.py
+
+# Install the packages listed in the requirements file before building
+python:
+  install:
+    - requirements: docs/source/requirement.txt
+
+# We recommend pinning the versions in that requirements file to enable
+# reproducible builds:
+# https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
--- a/docs/source/GetStarted/A_simple_example.md
+++ b/docs/source/GetStarted/A_simple_example.md
@@ -0,0 +1,87 @@
+
+# 基于Flux的文生图示例
+
+以下是如何使用FLUX.1模型进行文生图任务的示例。该脚本提供了一个简单的设置，用于从文本描述生成图像。包括下载必要的模型、配置pipeline，以及在启用和禁用 classifier-free guidance 的情况下生成图像。
+
+其他 DiffSynth 支持的模型详见 [Models.md](Models.md)
+
+## 准备
+
+首先，确保已下载并配置了必要的模型：
+
+```python
+import torch
+from diffsynth import ModelManager, FluxImagePipeline, download_models
+
+# Download the FLUX.1-dev model files
+download_models(["FLUX.1-dev"])
+```
+
+下载模型的用法详见 [Download_models.md](Download_models.md)
+
+## 加载模型
+
+使用您的设备和数据类型初始化模型管理器
+
+```python
+model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cuda")
+model_manager.load_models([
+    "models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
+    "models/FLUX/FLUX.1-dev/text_encoder_2",
+    "models/FLUX/FLUX.1-dev/ae.safetensors",
+    "models/FLUX/FLUX.1-dev/flux1-dev.safetensors"
+])
+```
+
+模型加载的用法详见 [ModelManager.md](ModelManager.md)
+
+## 创建 Pipeline
+
+从加载的模型管理器中创建FluxImagePipeline实例：
+
+```python
+pipe = FluxImagePipeline.from_model_manager(model_manager)
+```
+
+Pipeline 的用法详见 [Pipelines.md](Pipelines.md)
+
+## 文生图
+
+使用简短的提示语生成图像。以下是启用和禁用 classifier-free guidance 的图像生成示例。
+
+### 基础文生图
+
+```python
+prompt = "A cute little turtle"
+negative_prompt = ""
+
+torch.manual_seed(6)
+image = pipe(
+    prompt=prompt,
+    num_inference_steps=30, embedded_guidance=3.5
+)
+image.save("image_1024.jpg")
+```
+
+### 使用 Classifier-Free Guidance 生成
+```python
+torch.manual_seed(6)
+image = pipe(
+    prompt=prompt, negative_prompt=negative_prompt,
+    num_inference_steps=30, cfg_scale=2.0, embedded_guidance=3.5
+)
+image.save("image_1024_cfg.jpg")
+```
+
+### 高分辨率修复
+
+```python
+torch.manual_seed(7)
+image = pipe(
+    prompt=prompt,
+    num_inference_steps=30, embedded_guidance=3.5,
+    input_image=image.resize((2048, 2048)), height=2048, width=2048, denoising_strength=0.6, tiled=True
+)
+image.save("image_2048_highres.jpg")
+```
+
--- a/docs/source/GetStarted/Download_models.md
+++ b/docs/source/GetStarted/Download_models.md
@@ -0,0 +1,20 @@
+# 下载模型
+
+下载预设模型，模型ID可参考 [config file](https://github.com/modelscope/DiffSynth-Studio/blob/main/diffsynth/configs/model_config.py)。
+
+```python
+from diffsynth import download_models
+
+download_models(["FLUX.1-dev", "Kolors"])
+```
+
+下载非预设模型，可以选择 [ModelScope](https://modelscope.cn/models) 和 [HuggingFace](https://huggingface.co/models) 两个下载源中的模型。
+
+```python
+from diffsynth.models.downloader import download_from_huggingface, download_from_modelscope
+
+# From Modelscope (recommended)
+download_from_modelscope("Kwai-Kolors/Kolors", "vae/diffusion_pytorch_model.fp16.bin", "models/kolors/Kolors/vae")
+# From Huggingface
+download_from_huggingface("Kwai-Kolors/Kolors", "vae/diffusion_pytorch_model.fp16.safetensors", "models/kolors/Kolors/vae")
+```
--- a/docs/source/GetStarted/Extensions.md
+++ b/docs/source/GetStarted/Extensions.md
@@ -0,0 +1,11 @@
+# Extensions
+
+本文档介绍了一些在 DiffSynth 实现的 Diffusion 模型之外的相关技术，这些模型在图像和视频处理方面具有显著的应用潜力。
+
+- **[RIFE](https://github.com/hzwer/ECCV2022-RIFE)**：RIFE（实时中间流估计算法）是一个基于实时中间流估计的视频帧插值（VFI）方法。RIFE采用了一种名为IFNet的神经网络，能够以更快的速度端到端估计中间流。为确保IFNet的稳定训练并提升整体性能，设计了一种特权蒸馏方案。RIFE不依赖于预训练的光流模型，能够支持任意时间步的帧插值，通过时间编码输入进行处理。
+
+- **[ESRGAN](https://github.com/xinntao/ESRGAN)**: ESRGAN（增强型超分辨率生成对抗网络）是对 SRGAN 的一种改进方法，旨在提升单幅图像超分辨率的视觉质量。该方法通过优化SRGAN的三个关键组件——网络架构、对抗损失和感知损失，显著提升了生成图像的真实感。
+
+- **[FastBlend](https://arxiv.org/abs/2311.09265)**: FastBlend是一个用来平滑视频的无模型工具包，与 Diffusion 模型结合打造了强大的视频处理流程。该工具能够有效消除视频中的闪烁现象，对关键帧序列插值，并且可以基于单一图像处理完整视频。
+
+
--- a/docs/source/GetStarted/Fine-Tuning.md
+++ b/docs/source/GetStarted/Fine-Tuning.md
@@ -0,0 +1,431 @@
+# 微调
+
+我们实现了一个用于文本到图像扩散模型的训练框架，使用户能够轻松地使用我们的框架训练 LoRA 模型。我们提供的脚本具有以下特点：
+
+* **全面功能与用户友好性**：我们的训练框架支持多GPU和多机器配置，便于使用 DeepSpeed 加速，并包括梯度检查点优化，适用于内存需求较大的模型。
+* **代码简洁与研究者可及性**：我们避免了大块复杂的代码。通用模块实现于 `diffsynth/trainers/text_to_image.py` 中，而模型特定的训练脚本仅包含与模型架构相关的最少代码，便于研究人员使用。
+* **模块化设计与开发者灵活性**：基于通用的 Pytorch-Lightning 框架，我们的训练框架在功能上是解耦的，允许开发者通过修改我们的脚本轻松引入额外的训练技术，以满足他们的需求。
+
+LoRA 微调的图像示例。提示词为 "一只小狗蹦蹦跳跳，周围是姹紫嫣红的鲜花，远处是山脉"（针对中文模型）或 "a dog is jumping, flowers around the dog, the background is mountains and clouds"（针对英文模型）。
+
+||Kolors|Stable Diffusion 3|Hunyuan-DiT|
+|-|-|-|-|
+|Without LoRA|![image_without_lora](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/9d79ed7a-e8cf-4d98-800a-f182809db318)|![image_without_lora](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/ddb834a5-6366-412b-93dc-6d957230d66e)|![image_without_lora](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/1aa21de5-a992-4b66-b14f-caa44e08876e)|
+|With LoRA|![image_with_lora](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/02f62323-6ee5-4788-97a1-549732dbe4f0)|![image_with_lora](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/8e7b2888-d874-4da4-a75b-11b6b214b9bf)|![image_with_lora](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/83a0a41a-691f-4610-8e7b-d8e17c50a282)|
+
+## 下载需要的包
+
+```bash
+pip install peft lightning
+```
+
+## 准备你的数据
+
+我们提供了一个 [示例数据集](https://modelscope.cn/datasets/buptwq/lora-stable-diffusion-finetune/files)。你需要将训练数据集按照如下形式组织：
+
+
+```
+data/dog/
+└── train
+    ├── 00.jpg
+    ├── 01.jpg
+    ├── 02.jpg
+    ├── 03.jpg
+    ├── 04.jpg
+    └── metadata.csv
+```
+
+`metadata.csv`:
+
+```
+file_name,text
+00.jpg,a dog
+01.jpg,a dog
+02.jpg,a dog
+03.jpg,a dog
+04.jpg,a dog
+```
+
+请注意，如果模型是中文模型（例如，Hunyuan-DiT 和 Kolors），我们建议在数据集中使用中文文本。例如：
+
+```
+file_name,text
+00.jpg,一只小狗
+01.jpg,一只小狗
+02.jpg,一只小狗
+03.jpg,一只小狗
+04.jpg,一只小狗
+```
+
+## 训练 LoRA 模型
+
+参数选项：
+
+```
+  --lora_target_modules LORA_TARGET_MODULES
+                        LoRA 模块所在的层。
+  --dataset_path DATASET_PATH
+                        数据集的路径。
+  --output_path OUTPUT_PATH
+                        模型保存路径。
+  --steps_per_epoch STEPS_PER_EPOCH
+                        每个周期的步数。
+  --height HEIGHT       图像高度。
+  --width WIDTH         图像宽度。
+  --center_crop         是否将输入图像中心裁剪到指定分辨率。如果未设置，图像将被随机裁剪。图像会在裁剪前先调整到指定分辨率。
+  --random_flip         是否随机水平翻转图像。
+  --batch_size BATCH_SIZE
+                        训练数据加载器的批量大小（每设备）。
+  --dataloader_num_workers DATALOADER_NUM_WORKERS
+                        数据加载使用的子进程数量。0 表示数据将在主进程中加载。
+  --precision {32,16,16-mixed}
+                        训练精度。
+  --learning_rate LEARNING_RATE
+                        学习率。
+  --lora_rank LORA_RANK
+                        LoRA 更新矩阵的维度。
+  --lora_alpha LORA_ALPHA
+                        LoRA 更新矩阵的权重。
+  --use_gradient_checkpointing
+                        是否使用梯度检查点。
+  --accumulate_grad_batches ACCUMULATE_GRAD_BATCHES
+                        梯度累积的批次数量。
+  --training_strategy {auto,deepspeed_stage_1,deepspeed_stage_2,deepspeed_stage_3}
+                        训练策略。
+  --max_epochs MAX_EPOCHS
+                        训练周期数。
+  --modelscope_model_id MODELSCOPE_MODEL_ID
+                        ModelScope 上的模型 ID (https://www.modelscope.cn/)。如果提供模型 ID，模型将自动上传到 ModelScope。
+
+```
+
+### Kolors
+
+以下文件将用于构建 Kolors。你可以从 [HuggingFace](https://huggingface.co/Kwai-Kolors/Kolors) 或 [ModelScope](https://modelscope.cn/models/Kwai-Kolors/Kolors) 下载 Kolors。由于精度溢出问题，我们需要下载额外的 VAE 模型（从 [HuggingFace](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) 或 [ModelScope](https://modelscope.cn/models/AI-ModelScope/sdxl-vae-fp16-fix)）。你可以使用以下代码下载这些文件：
+
+
+```python
+from diffsynth import download_models
+
+download_models(["Kolors", "SDXL-vae-fp16-fix"])
+```
+
+```
+models
+├── kolors
+│   └── Kolors
+│       ├── text_encoder
+│       │   ├── config.json
+│       │   ├── pytorch_model-00001-of-00007.bin
+│       │   ├── pytorch_model-00002-of-00007.bin
+│       │   ├── pytorch_model-00003-of-00007.bin
+│       │   ├── pytorch_model-00004-of-00007.bin
+│       │   ├── pytorch_model-00005-of-00007.bin
+│       │   ├── pytorch_model-00006-of-00007.bin
+│       │   ├── pytorch_model-00007-of-00007.bin
+│       │   └── pytorch_model.bin.index.json
+│       ├── unet
+│       │   └── diffusion_pytorch_model.safetensors
+│       └── vae
+│           └── diffusion_pytorch_model.safetensors
+└── sdxl-vae-fp16-fix
+    └── diffusion_pytorch_model.safetensors
+```
+
+使用下面的命令启动训练任务：
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/kolors/train_kolors_lora.py \
+  --pretrained_unet_path models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors \
+  --pretrained_text_encoder_path models/kolors/Kolors/text_encoder \
+  --pretrained_fp16_vae_path models/sdxl-vae-fp16-fix/diffusion_pytorch_model.safetensors \
+  --dataset_path data/dog \
+  --output_path ./models \
+  --max_epochs 1 \
+  --steps_per_epoch 500 \
+  --height 1024 \
+  --width 1024 \
+  --center_crop \
+  --precision "16-mixed" \
+  --learning_rate 1e-4 \
+  --lora_rank 4 \
+  --lora_alpha 4 \
+  --use_gradient_checkpointing
+```
+
+有关参数的更多信息，请使用 `python examples/train/kolors/train_kolors_lora.py -h` 查看详细信息。
+
+训练完成后，使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
+
+
+
+```python
+from diffsynth import ModelManager, SD3ImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+                             file_path_list=["models/stable_diffusion_3/sd3_medium_incl_clips.safetensors"])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = SD3ImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+    prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds", 
+    negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
+    cfg_scale=7.5,
+    num_inference_steps=100, width=1024, height=1024,
+)
+image.save("image_with_lora.jpg")
+```
+
+
+### Stable Diffusion 3
+
+训练脚本只需要一个文件。你可以使用 [`sd3_medium_incl_clips.safetensors`](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips.safetensors)（没有 T5 Encoder）或 [`sd3_medium_incl_clips_t5xxlfp16.safetensors`](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips_t5xxlfp16.safetensors)（有 T5 Encoder）。请使用以下代码下载这些文件：
+
+
+```python
+from diffsynth import download_models
+
+download_models(["StableDiffusion3", "StableDiffusion3_without_T5"])
+```
+
+```
+models/stable_diffusion_3/
+├── Put Stable Diffusion 3 checkpoints here.txt
+├── sd3_medium_incl_clips.safetensors
+└── sd3_medium_incl_clips_t5xxlfp16.safetensors
+```
+
+使用下面的命令启动训练任务：
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_3/train_sd3_lora.py \
+  --pretrained_path models/stable_diffusion_3/sd3_medium_incl_clips.safetensors \
+  --dataset_path data/dog \
+  --output_path ./models \
+  --max_epochs 1 \
+  --steps_per_epoch 500 \
+  --height 1024 \
+  --width 1024 \
+  --center_crop \
+  --precision "16-mixed" \
+  --learning_rate 1e-4 \
+  --lora_rank 4 \
+  --lora_alpha 4 \
+  --use_gradient_checkpointing
+```
+
+有关参数的更多信息，请使用 `python examples/train/stable_diffusion_3/train_sd3_lora.py -h` 查看详细信息。
+
+训练完成后，使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
+
+```python
+from diffsynth import ModelManager, SD3ImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+                             file_path_list=["models/stable_diffusion_3/sd3_medium_incl_clips.safetensors"])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = SD3ImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+    prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds", 
+    negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
+    cfg_scale=7.5,
+    num_inference_steps=100, width=1024, height=1024,
+)
+image.save("image_with_lora.jpg")
+```
+
+### Hunyuan-DiT
+
+构建 Hunyuan DiT 需要四个文件。你可以从 [HuggingFace](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT) 或 [ModelScope](https://www.modelscope.cn/models/modelscope/HunyuanDiT/summary) 下载这些文件。你可以使用以下代码下载这些文件：
+
+
+```python
+from diffsynth import download_models
+
+download_models(["HunyuanDiT"])
+```
+
+```
+models/HunyuanDiT/
+├── Put Hunyuan DiT checkpoints here.txt
+└── t2i
+    ├── clip_text_encoder
+    │   └── pytorch_model.bin
+    ├── model
+    │   └── pytorch_model_ema.pt
+    ├── mt5
+    │   └── pytorch_model.bin
+    └── sdxl-vae-fp16-fix
+        └── diffusion_pytorch_model.bin
+```
+
+使用下面的命令启动训练任务：
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/hunyuan_dit/train_hunyuan_dit_lora.py \
+  --pretrained_path models/HunyuanDiT/t2i \
+  --dataset_path data/dog \
+  --output_path ./models \
+  --max_epochs 1 \
+  --steps_per_epoch 500 \
+  --height 1024 \
+  --width 1024 \
+  --center_crop \
+  --precision "16-mixed" \
+  --learning_rate 1e-4 \
+  --lora_rank 4 \
+  --lora_alpha 4 \
+  --use_gradient_checkpointing
+```
+
+有关参数的更多信息，请使用 `python examples/train/hunyuan_dit/train_hunyuan_dit_lora.py -h` 查看详细信息。
+
+训练完成后，使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
+
+
+```python
+from diffsynth import ModelManager, HunyuanDiTImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+                             file_path_list=[
+                                 "models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin",
+                                 "models/HunyuanDiT/t2i/model/pytorch_model_ema.pt",
+                                 "models/HunyuanDiT/t2i/mt5/pytorch_model.bin",
+                                 "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin"
+                             ])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = HunyuanDiTImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+    prompt="一只小狗蹦蹦跳跳，周围是姹紫嫣红的鲜花，远处是山脉", 
+    negative_prompt="",
+    cfg_scale=7.5,
+    num_inference_steps=100, width=1024, height=1024,
+)
+image.save("image_with_lora.jpg")
+```
+
+### Stable Diffusion
+
+训练脚本只需要一个文件。我们支持 [CivitAI](https://civitai.com/) 中的主流检查点。默认情况下，我们使用基础的 Stable Diffusion v1.5。你可以从 [HuggingFace](https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors) 或 [ModelScope](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-v1-5/resolve/master/v1-5-pruned-emaonly.safetensors) 下载。你可以使用以下代码下载这个文件：
+
+```python
+from diffsynth import download_models
+
+download_models(["StableDiffusion_v15"])
+```
+
+```
+models/stable_diffusion
+├── Put Stable Diffusion checkpoints here.txt
+└── v1-5-pruned-emaonly.safetensors
+```
+
+使用下面的命令启动训练任务：
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion/train_sd_lora.py \
+  --pretrained_path models/stable_diffusion/v1-5-pruned-emaonly.safetensors \
+  --dataset_path data/dog \
+  --output_path ./models \
+  --max_epochs 1 \
+  --steps_per_epoch 500 \
+  --height 512 \
+  --width 512 \
+  --center_crop \
+  --precision "16-mixed" \
+  --learning_rate 1e-4 \
+  --lora_rank 4 \
+  --lora_alpha 4 \
+  --use_gradient_checkpointing
+```
+
+有关参数的更多信息，请使用 `python examples/train/stable_diffusion/train_sd_lora.py -h` 查看详细信息。
+
+训练完成后，使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
+
+
+
+```python
+from diffsynth import ModelManager, SDImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+                             file_path_list=["models/stable_diffusion/v1-5-pruned-emaonly.safetensors"])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = SDImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+    prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds", 
+    negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
+    cfg_scale=7.5,
+    num_inference_steps=100, width=512, height=512,
+)
+image.save("image_with_lora.jpg")
+```
+
+### Stable Diffusion XL
+
+训练脚本只需要一个文件。我们支持 [CivitAI](https://civitai.com/) 中的主流检查点。默认情况下，我们使用基础的 Stable Diffusion XL。你可以从 [HuggingFace](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors) 或 [ModelScope](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-xl-base-1.0/resolve/master/sd_xl_base_1.0.safetensors) 下载。也可以使用以下代码下载这个文件：
+
+```python
+from diffsynth import download_models
+
+download_models(["StableDiffusionXL_v1"])
+```
+
+```
+models/stable_diffusion_xl
+├── Put Stable Diffusion XL checkpoints here.txt
+└── sd_xl_base_1.0.safetensors
+```
+
+我们观察到 Stable Diffusion XL 不是 float16 安全的，因此建议用户使用 float32。
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_xl/train_sdxl_lora.py \
+  --pretrained_path models/stable_diffusion_xl/sd_xl_base_1.0.safetensors \
+  --dataset_path data/dog \
+  --output_path ./models \
+  --max_epochs 1 \
+  --steps_per_epoch 500 \
+  --height 1024 \
+  --width 1024 \
+  --center_crop \
+  --precision "32" \
+  --learning_rate 1e-4 \
+  --lora_rank 4 \
+  --lora_alpha 4 \
+  --use_gradient_checkpointing
+```
+
+有关参数的更多信息，请使用 `python examples/train/stable_diffusion_xl/train_sdxl_lora.py -h` 查看详细信息。
+
+训练完成后，使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
+
+```python
+from diffsynth import ModelManager, SDXLImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+                             file_path_list=["models/stable_diffusion_xl/sd_xl_base_1.0.safetensors"])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = SDXLImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+    prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds", 
+    negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
+    cfg_scale=7.5,
+    num_inference_steps=100, width=1024, height=1024,
+)
+image.save("image_with_lora.jpg")
+```
--- a/docs/source/GetStarted/Installation.md
+++ b/docs/source/GetStarted/Installation.md
@@ -0,0 +1,24 @@
+# 安装
+
+## 从源码下载
+
+1. 克隆源码仓库：
+
+    ```bash
+    git clone https://github.com/modelscope/DiffSynth-Studio.git
+    ```
+
+2. 进入项目目录并安装：
+
+    ```bash
+    cd DiffSynth-Studio
+    pip install -e .
+    ```
+
+## 使用 PyPI 下载
+
+直接通过 PyPI 安装：
+
+```bash
+pip install diffsynth
+```
--- a/docs/source/GetStarted/Models.md
+++ b/docs/source/GetStarted/Models.md
@@ -0,0 +1,17 @@
+# 模型
+
+目前为止，DiffSynth Studio 支持的模型如下所示：
+
+* [FLUX](https://huggingface.co/black-forest-labs/FLUX.1-dev)
+* [ExVideo](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1)
+* [Kolors](https://huggingface.co/Kwai-Kolors/Kolors)
+* [Stable Diffusion 3](https://huggingface.co/stabilityai/stable-diffusion-3-medium)
+* [Stable Video Diffusion](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt)
+* [Hunyuan-DiT](https://github.com/Tencent/HunyuanDiT)
+* [RIFE](https://github.com/hzwer/ECCV2022-RIFE)
+* [ESRGAN](https://github.com/xinntao/ESRGAN)
+* [Ip-Adapter](https://github.com/tencent-ailab/IP-Adapter)
+* [AnimateDiff](https://github.com/guoyww/animatediff/)
+* [ControlNet](https://github.com/lllyasviel/ControlNet)
+* [Stable Diffusion XL](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+* [Stable Diffusion](https://huggingface.co/runwayml/stable-diffusion-v1-5)
--- a/docs/source/GetStarted/Pipelines.md
+++ b/docs/source/GetStarted/Pipelines.md
@@ -0,0 +1,27 @@
+# Pipelines
+
+The following tables list the currently available pipelines and the models each pipeline uses.
+
+## Image Pipelines
+
+Pipelines for generating images from text descriptions. Each pipeline relies on specific encoder and decoder models.
+
+| Pipeline                   | Models                                                     |
+|----------------------------|----------------------------------------------------------------|
+| HunyuanDiTImagePipeline     | text_encoder: HunyuanDiTCLIPTextEncoder<br>text_encoder_t5: HunyuanDiTT5TextEncoder<br>dit: HunyuanDiT<br>vae_decoder: SDVAEDecoder<br>vae_encoder: SDVAEEncoder |
+| SDImagePipeline             | text_encoder: SDTextEncoder<br>unet: SDUNet<br>vae_decoder: SDVAEDecoder<br>vae_encoder: SDVAEEncoder<br>controlnet: MultiControlNetManager<br>ipadapter_image_encoder: IpAdapterCLIPImageEmbedder<br>ipadapter: SDIpAdapter |
+| SD3ImagePipeline            | text_encoder_1: SD3TextEncoder1<br>text_encoder_2: SD3TextEncoder2<br>text_encoder_3: SD3TextEncoder3<br>dit: SD3DiT<br>vae_decoder: SD3VAEDecoder<br>vae_encoder: SD3VAEEncoder |
+| SDXLImagePipeline           | text_encoder: SDXLTextEncoder<br>text_encoder_2: SDXLTextEncoder2<br>text_encoder_kolors: ChatGLMModel<br>unet: SDXLUNet<br>vae_decoder: SDXLVAEDecoder<br>vae_encoder: SDXLVAEEncoder<br>controlnet: MultiControlNetManager<br>ipadapter_image_encoder: IpAdapterXLCLIPImageEmbedder<br>ipadapter: SDXLIpAdapter |
+
+## Video Pipelines
+
+Pipelines for generating videos from text descriptions. In addition to the models required for image generation, they include models for handling motion modules.
+
+| Pipeline                   | Models                                                     |
+|----------------------------|----------------------------------------------------------------|
+| SDVideoPipeline            | text_encoder: SDTextEncoder<br>unet: SDUNet<br>vae_decoder: SDVAEDecoder<br>vae_encoder: SDVAEEncoder<br>controlnet: MultiControlNetManager<br>ipadapter_image_encoder: IpAdapterCLIPImageEmbedder<br>ipadapter: SDIpAdapter<br>motion_modules: SDMotionModel |
+| SDXLVideoPipeline          | text_encoder: SDXLTextEncoder<br>text_encoder_2: SDXLTextEncoder2<br>text_encoder_kolors: ChatGLMModel<br>unet: SDXLUNet<br>vae_decoder: SDXLVAEDecoder<br>vae_encoder: SDXLVAEEncoder<br>ipadapter_image_encoder: IpAdapterXLCLIPImageEmbedder<br>ipadapter: SDXLIpAdapter<br>motion_modules: SDXLMotionModel |
+| SVDVideoPipeline           | image_encoder: SVDImageEncoder<br>unet: SVDUNet<br>vae_encoder: SVDVAEEncoder<br>vae_decoder: SVDVAEDecoder |
+
+
+
--- a/docs/source/GetStarted/PromptProcessing.md
+++ b/docs/source/GetStarted/PromptProcessing.md
@@ -0,0 +1,37 @@
+# 提示词（Prompt）处理
+
+DiffSynth 内置了提示词处理功能，分为：
+
+- **提示词润色器（`prompt_refiner_classes`）**：包括提示词润色、提示词中译英、提示词同时润色与中译英，可选参数如下：
+
+    - **英文提示词润色**：'BeautifulPrompt'，使用到的是[pai-bloom-1b1-text2prompt-sd](https://modelscope.cn/models/AI-ModelScope/pai-bloom-1b1-text2prompt-sd)。
+
+    - **提示词中译英**：'Translator'，使用到的是[opus-mt-zh-en](https://modelscope.cn/models/moxying/opus-mt-zh-en)。
+
+    - **提示词中译英并润色**：'QwenPrompt'，使用到的是[Qwen2-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2-1.5B-Instruct)。
+
+- **提示词扩展器（`prompt_extender_classes`）**：基于Omost的提示词分区控制扩写，可选参数如下：
+
+    - **提示词分区扩写**：'OmostPromter'。
+
+
+## 使用说明
+
+### 提示词润色器
+
+在加载模型 Pipeline 时，可以通过参数 `prompt_refiner_classes` 指定所需的提示词润色器功能。有关示例代码，请参考 [sd_prompt_refining.py](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/image_synthesis/sd_prompt_refining.py)。
+
+可选的 `prompt_refiner_classes` 参数包括：Translator、BeautifulPrompt、QwenPrompt。
+
+```python
+pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[Translator, BeautifulPrompt])
+```
+
+### 提示词扩展器
+
+在加载模型 Pipeline 时，可以通过参数 `prompt_extender_classes` 指定所需的提示词扩展器。有关示例代码，请参考 [omost_flux_text_to_image.py](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/image_synthesis/omost_flux_text_to_image.py)。
+
+```python
+pipe = FluxImagePipeline.from_model_manager(model_manager, prompt_extender_classes=[OmostPromter])
+```
+
--- a/docs/source/GetStarted/Schedulers.md
+++ b/docs/source/GetStarted/Schedulers.md
@@ -0,0 +1,11 @@
+# Schedulers
+
+Scheduler 控制模型的整个去噪（或采样）过程。在加载 Pipeline 时，DiffSynth 会自动选择最适合当前 Pipeline 的调度器，无需额外配置。
+
+我们支持的调度器包括：
+
+- **EnhancedDDIMScheduler**：扩展了去噪扩散概率模型（DDPM）中的去噪过程，引入了非马尔可夫指导。
+
+- **FlowMatchScheduler**：实现了 [Stable Diffusion 3](https://arxiv.org/abs/2403.03206) 中提出的流匹配（Flow Matching）采样方法。
+
+- **ContinuousODEScheduler**：基于常微分方程（ODE）的调度器。
--- a/docs/source/GetStarted/WebUI.md
+++ b/docs/source/GetStarted/WebUI.md
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -0,0 +1,49 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# For the full list of built-in configuration values, see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Project information -----------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+
+import os
+import sys
+sys.path.insert(0, os.path.abspath('../../diffsynth'))
+
+project = 'DiffSynth-Studio'
+copyright = '2024, ModelScope'
+author = 'ModelScope'
+release = '0.1.0'
+
+
+# -- General configuration ---------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
+
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx.ext.napoleon',
+    'sphinx.ext.doctest',
+    'sphinx.ext.intersphinx',
+    'sphinx.ext.todo',
+    'sphinx.ext.coverage',
+    'sphinx.ext.imgmath',
+    'sphinx.ext.viewcode',
+    'recommonmark',
+    'sphinx_markdown_tables'
+]
+
+templates_path = ['_templates']
+exclude_patterns = []
+
+
+source_suffix = ['.rst', '.md']
+# -- Options for HTML output -------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
+
+html_theme = 'sphinx_rtd_theme'
+html_static_path = ['_static']
+# multi-language docs
+language = 'zh_CN'
+locale_dirs = ['../locales/']   # path is example but recommended.
+gettext_compact = False  # optional.
+gettext_uuid = True  # optional.
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -0,0 +1,33 @@
+.. DiffSynth-Studio documentation master file, created by
+   sphinx-quickstart on Thu Sep  5 16:39:24 2024.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+DiffSynth-Studio 文档
+==============================
+
+Add your content using ``reStructuredText`` syntax. See the
+`reStructuredText <https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html>`_
+documentation for details.
+
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Contents:
+
+   GetStarted/A_simple_example.md
+   GetStarted/Download_models.md
+   GetStarted/ModelManager.md
+   GetStarted/Models.md
+   GetStarted/Pipelines.md
+   GetStarted/PromptProcessing.md
+   GetStarted/Schedulers.md
+   GetStarted/Fine-Tuning.md
+   GetStarted/Extensions.md
+   GetStarted/WebUI.md
+
+
+
+.. toctree::
+   :maxdepth: 1
+   :caption: API Docs
--- a/docs/source/requirement.txt
+++ b/docs/source/requirement.txt
@@ -0,0 +1,4 @@
+recommonmark
+sphinx_rtd_theme
+myst-parser
+sphinx-markdown-tables
--- a/docs/source_en/.readthedocs.yaml
+++ b/docs/source_en/.readthedocs.yaml
@@ -0,0 +1,25 @@
+# .readthedocs.yaml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required: version of the configuration file format
+version: 2
+
+# Set the OS image and the Python version used for the build
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.11"
+
+# Install the packages listed in the requirements file before building
+python:
+  install:
+    - requirements: docs/source_en/requirement.txt
+
+# Build the documentation under docs/source_en with Sphinx
+sphinx:
+  configuration: docs/source_en/conf.py
+
+# We recommend pinning the versions in that requirements file to enable
+# reproducible builds:
+# https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
--- a/docs/source_en/GetStarted/A_simple_example.md
+++ b/docs/source_en/GetStarted/A_simple_example.md
@@ -0,0 +1,82 @@
+
+# A Simple Example: Text-to-Image Synthesis with Flux
+
+The following example shows how to use the FLUX.1 model for text-to-image tasks. The script provides a simple setup for generating images from text descriptions. It covers downloading the necessary models, configuring the pipeline, and generating images with and without classifier-free guidance.
+
+For other models supported by DiffSynth, see [Models.md](Models.md).
+
+## Setup
+
+First, ensure you have the necessary models downloaded and configured:
+
+```python
+import torch
+from diffsynth import ModelManager, FluxImagePipeline, download_models
+
+# Download the FLUX.1-dev model files
+download_models(["FLUX.1-dev"])
+```
+
+For instructions on downloading models, see [Download_models.md](Download_models.md).
+
+## Loading Models
+Initialize the model manager with your device and data type:
+
+```python
+model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cuda")
+model_manager.load_models([
+    "models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
+    "models/FLUX/FLUX.1-dev/text_encoder_2",
+    "models/FLUX/FLUX.1-dev/ae.safetensors",
+    "models/FLUX/FLUX.1-dev/flux1-dev.safetensors"
+])
+```
+
+For instructions on loading models, see [ModelManager.md](ModelManager.md).
+
+## Creating the Pipeline
+Create an instance of the FluxImagePipeline from the loaded model manager:
+
+
+```python
+pipe = FluxImagePipeline.from_model_manager(model_manager)
+```
+
+For instructions on using the Pipeline, see [Pipelines.md](Pipelines.md).
+## Text-to-Image Synthesis
+Generate an image using a short prompt. Below are examples of generating images with and without classifier-free guidance.
+
+### Basic Generation
+```python
+prompt = "A cute little turtle"
+negative_prompt = ""
+
+torch.manual_seed(6)
+image = pipe(
+    prompt=prompt,
+    num_inference_steps=30, embedded_guidance=3.5
+)
+image.save("image_1024.jpg")
+```
+
+### Generation with Classifier-Free Guidance
+```python
+torch.manual_seed(6)
+image = pipe(
+    prompt=prompt, negative_prompt=negative_prompt,
+    num_inference_steps=30, cfg_scale=2.0, embedded_guidance=3.5
+)
+image.save("image_1024_cfg.jpg")
+```
+
+### High-Resolution Fix
+```python
+torch.manual_seed(7)
+image = pipe(
+    prompt=prompt,
+    num_inference_steps=30, embedded_guidance=3.5,
+    input_image=image.resize((2048, 2048)), height=2048, width=2048, denoising_strength=0.6, tiled=True
+)
+image.save("image_2048_highres.jpg")
+```
+
--- a/docs/source_en/GetStarted/Download_models.md
+++ b/docs/source_en/GetStarted/Download_models.md
@@ -0,0 +1,20 @@
+# Download Models
+
+Download the pre-set models. Model IDs can be found in the [config file](/diffsynth/configs/model_config.py).
+
+```python
+from diffsynth import download_models
+
+download_models(["FLUX.1-dev", "Kolors"])
+```
+
+To download non-pre-set models, you can choose models from either the [ModelScope](https://modelscope.cn/models) or [HuggingFace](https://huggingface.co/models) sources.
+
+```python
+from diffsynth.models.downloader import download_from_huggingface, download_from_modelscope
+
+# From Modelscope (recommended)
+download_from_modelscope("Kwai-Kolors/Kolors", "vae/diffusion_pytorch_model.fp16.bin", "models/kolors/Kolors/vae")
+# From Huggingface
+download_from_huggingface("Kwai-Kolors/Kolors", "vae/diffusion_pytorch_model.fp16.safetensors", "models/kolors/Kolors/vae")
+```
--- a/docs/source_en/GetStarted/Extensions.md
+++ b/docs/source_en/GetStarted/Extensions.md
@@ -0,0 +1,10 @@
+# Extensions
+
+This document introduces some relevant techniques beyond the diffusion models implemented in DiffSynth, which have significant application potential in image and video processing.
+
+- **[RIFE](https://github.com/hzwer/ECCV2022-RIFE)**: RIFE (Real-Time Intermediate Flow Estimation) is a video frame interpolation (VFI) method based on real-time intermediate flow estimation. It includes an end-to-end efficient intermediate flow estimation network called IFNet, as well as an optical flow supervision framework based on privileged distillation. RIFE supports inserting frames at any moment between two frames, achieving state-of-the-art performance across multiple datasets without relying on any pre-trained models.
+
+- **[ESRGAN](https://github.com/xinntao/ESRGAN)**: ESRGAN (Enhanced Super Resolution Generative Adversarial Network) is an improved method based on SRGAN, aimed at enhancing the visual quality of single image super-resolution. This approach significantly improves the realism of generated images by optimizing three key components of SRGAN: network architecture, adversarial loss, and perceptual loss.
+
+- **[FastBlend](https://arxiv.org/abs/2311.09265)**: FastBlend is a model-free toolkit designed for smoothing videos, integrated with Diffusion models to create a powerful video processing workflow. This tool effectively eliminates flickering in videos, performs interpolation on keyframe sequences, and can process complete videos based on a single image.
+
--- a/docs/source_en/GetStarted/Fine-tuning.md
+++ b/docs/source_en/GetStarted/Fine-tuning.md
@@ -0,0 +1,426 @@
+# Fine-Tuning
+
+We have implemented a training framework for text-to-image Diffusion models, enabling users to easily train LoRA models using our framework. Our provided scripts come with the following advantages:
+
+* **Comprehensive Functionality & User-Friendliness**: Our training framework supports multi-GPU and multi-machine setups, facilitates the use of DeepSpeed for acceleration, and includes gradient checkpointing optimizations for models with excessive memory demands.
+* **Code Conciseness & Researcher Accessibility**: We avoid large blocks of complicated code. General-purpose modules are implemented in `diffsynth/trainers/text_to_image.py`, while model-specific training scripts contain only minimal code pertinent to the model architecture, making it researcher-friendly.
+* **Modular Design & Developer Flexibility**: Built on the universal Pytorch-Lightning framework, our training framework is decoupled in terms of functionality, allowing developers to easily introduce additional training techniques by modifying our scripts to suit their needs.
+
+Image Examples of fine-tuned LoRA. The prompt is "一只小狗蹦蹦跳跳，周围是姹紫嫣红的鲜花，远处是山脉" (for Chinese models) or "a dog is jumping, flowers around the dog, the background is mountains and clouds" (for English models).
+
+||Kolors|Stable Diffusion 3|Hunyuan-DiT|
+|-|-|-|-|
+|Without LoRA|![image_without_lora](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/9d79ed7a-e8cf-4d98-800a-f182809db318)|![image_without_lora](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/ddb834a5-6366-412b-93dc-6d957230d66e)|![image_without_lora](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/1aa21de5-a992-4b66-b14f-caa44e08876e)|
+|With LoRA|![image_with_lora](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/02f62323-6ee5-4788-97a1-549732dbe4f0)|![image_with_lora](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/8e7b2888-d874-4da4-a75b-11b6b214b9bf)|![image_with_lora](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/83a0a41a-691f-4610-8e7b-d8e17c50a282)|
+
+## Install additional packages
+
+```bash
+pip install peft lightning
+```
+
+## Prepare your dataset
+
+We provide an example dataset [here](https://modelscope.cn/datasets/buptwq/lora-stable-diffusion-finetune/files). You need to manage the training images as follows:
+
+```
+data/dog/
+└── train
+    ├── 00.jpg
+    ├── 01.jpg
+    ├── 02.jpg
+    ├── 03.jpg
+    ├── 04.jpg
+    └── metadata.csv
+```
+
+`metadata.csv`:
+
+```
+file_name,text
+00.jpg,a dog
+01.jpg,a dog
+02.jpg,a dog
+03.jpg,a dog
+04.jpg,a dog
+```
+
+Note that if the model is a Chinese model (for example, Hunyuan-DiT and Kolors), we recommend using Chinese text in the dataset. For example:
+
+```
+file_name,text
+00.jpg,一只小狗
+01.jpg,一只小狗
+02.jpg,一只小狗
+03.jpg,一只小狗
+04.jpg,一只小狗
+```
+
+## Train a LoRA model
+
+General options:
+
+```
+  --lora_target_modules LORA_TARGET_MODULES
+                        Layers with LoRA modules.
+  --dataset_path DATASET_PATH
+                        The path of the Dataset.
+  --output_path OUTPUT_PATH
+                        Path to save the model.
+  --steps_per_epoch STEPS_PER_EPOCH
+                        Number of steps per epoch.
+  --height HEIGHT       Image height.
+  --width WIDTH         Image width.
+  --center_crop         Whether to center crop the input images to the resolution. If not set, the images will be randomly cropped. The images will be resized to the resolution first before cropping.
+  --random_flip         Whether to randomly flip images horizontally
+  --batch_size BATCH_SIZE
+                        Batch size (per device) for the training dataloader.
+  --dataloader_num_workers DATALOADER_NUM_WORKERS
+                        Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process.
+  --precision {32,16,16-mixed}
+                        Training precision
+  --learning_rate LEARNING_RATE
+                        Learning rate.
+  --lora_rank LORA_RANK
+                        The dimension of the LoRA update matrices.
+  --lora_alpha LORA_ALPHA
+                        The weight of the LoRA update matrices.
+  --use_gradient_checkpointing
+                        Whether to use gradient checkpointing.
+  --accumulate_grad_batches ACCUMULATE_GRAD_BATCHES
+                        The number of batches in gradient accumulation.
+  --training_strategy {auto,deepspeed_stage_1,deepspeed_stage_2,deepspeed_stage_3}
+                        Training strategy
+  --max_epochs MAX_EPOCHS
+                        Number of epochs.
+  --modelscope_model_id MODELSCOPE_MODEL_ID
+                        Model ID on ModelScope (https://www.modelscope.cn/). The model will be uploaded to ModelScope automatically if you provide a Model ID.
+  --modelscope_access_token MODELSCOPE_ACCESS_TOKEN
+                        Access key on ModelScope (https://www.modelscope.cn/). Required if you want to upload the model to ModelScope.
+```
+
+### Kolors
+
+The following files will be used for constructing Kolors. You can download Kolors from [huggingface](https://huggingface.co/Kwai-Kolors/Kolors) or [modelscope](https://modelscope.cn/models/Kwai-Kolors/Kolors). Due to precision overflow issues, we need to download an additional VAE model (from [huggingface](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) or [modelscope](https://modelscope.cn/models/AI-ModelScope/sdxl-vae-fp16-fix)). You can use the following code to download these files:
+
+```python
+from diffsynth import download_models
+
+download_models(["Kolors", "SDXL-vae-fp16-fix"])
+```
+
+```
+models
+├── kolors
+│   └── Kolors
+│       ├── text_encoder
+│       │   ├── config.json
+│       │   ├── pytorch_model-00001-of-00007.bin
+│       │   ├── pytorch_model-00002-of-00007.bin
+│       │   ├── pytorch_model-00003-of-00007.bin
+│       │   ├── pytorch_model-00004-of-00007.bin
+│       │   ├── pytorch_model-00005-of-00007.bin
+│       │   ├── pytorch_model-00006-of-00007.bin
+│       │   ├── pytorch_model-00007-of-00007.bin
+│       │   └── pytorch_model.bin.index.json
+│       ├── unet
+│       │   └── diffusion_pytorch_model.safetensors
+│       └── vae
+│           └── diffusion_pytorch_model.safetensors
+└── sdxl-vae-fp16-fix
+    └── diffusion_pytorch_model.safetensors
+```
+
+Launch the training task using the following command:
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/kolors/train_kolors_lora.py \
+  --pretrained_unet_path models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors \
+  --pretrained_text_encoder_path models/kolors/Kolors/text_encoder \
+  --pretrained_fp16_vae_path models/sdxl-vae-fp16-fix/diffusion_pytorch_model.safetensors \
+  --dataset_path data/dog \
+  --output_path ./models \
+  --max_epochs 1 \
+  --steps_per_epoch 500 \
+  --height 1024 \
+  --width 1024 \
+  --center_crop \
+  --precision "16-mixed" \
+  --learning_rate 1e-4 \
+  --lora_rank 4 \
+  --lora_alpha 4 \
+  --use_gradient_checkpointing
+```
+
+For more information about the parameters, please use `python examples/train/kolors/train_kolors_lora.py -h` to see the details.
+
+After training, use `model_manager.load_lora` to load the LoRA for inference.
+
+```python
+from diffsynth import ModelManager, SDXLImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+                             file_path_list=[
+                                 "models/kolors/Kolors/text_encoder",
+                                 "models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors",
+                                 "models/sdxl-vae-fp16-fix/diffusion_pytorch_model.safetensors"
+                             ])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = SDXLImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+    prompt="一只小狗蹦蹦跳跳，周围是姹紫嫣红的鲜花，远处是山脉", 
+    negative_prompt="",
+    cfg_scale=7.5,
+    num_inference_steps=100, width=1024, height=1024,
+)
+image.save("image_with_lora.jpg")
+```
+
+### Stable Diffusion 3
+
+Only one file is required in the training script. You can use [`sd3_medium_incl_clips.safetensors`](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips.safetensors) (without T5 encoder) or [`sd3_medium_incl_clips_t5xxlfp16.safetensors`](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips_t5xxlfp16.safetensors) (with T5 encoder). Please use the following code to download these files:
+
+```python
+from diffsynth import download_models
+
+download_models(["StableDiffusion3", "StableDiffusion3_without_T5"])
+```
+
+```
+models/stable_diffusion_3/
+├── Put Stable Diffusion 3 checkpoints here.txt
+├── sd3_medium_incl_clips.safetensors
+└── sd3_medium_incl_clips_t5xxlfp16.safetensors
+```
+
+Launch the training task using the following command:
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_3/train_sd3_lora.py \
+  --pretrained_path models/stable_diffusion_3/sd3_medium_incl_clips.safetensors \
+  --dataset_path data/dog \
+  --output_path ./models \
+  --max_epochs 1 \
+  --steps_per_epoch 500 \
+  --height 1024 \
+  --width 1024 \
+  --center_crop \
+  --precision "16-mixed" \
+  --learning_rate 1e-4 \
+  --lora_rank 4 \
+  --lora_alpha 4 \
+  --use_gradient_checkpointing
+```
+
+For more information about the parameters, please use `python examples/train/stable_diffusion_3/train_sd3_lora.py -h` to see the details.
+
+After training, use `model_manager.load_lora` to load the LoRA for inference.
+
+```python
+from diffsynth import ModelManager, SD3ImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+                             file_path_list=["models/stable_diffusion_3/sd3_medium_incl_clips.safetensors"])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = SD3ImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+    prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds", 
+    negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
+    cfg_scale=7.5,
+    num_inference_steps=100, width=1024, height=1024,
+)
+image.save("image_with_lora.jpg")
+```
+
+### Hunyuan-DiT
+
+Four files will be used for constructing Hunyuan DiT. You can download them from [huggingface](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT) or [modelscope](https://www.modelscope.cn/models/modelscope/HunyuanDiT/summary). You can use the following code to download these files:
+
+```python
+from diffsynth import download_models
+
+download_models(["HunyuanDiT"])
+```
+
+```
+models/HunyuanDiT/
+├── Put Hunyuan DiT checkpoints here.txt
+└── t2i
+    ├── clip_text_encoder
+    │   └── pytorch_model.bin
+    ├── model
+    │   └── pytorch_model_ema.pt
+    ├── mt5
+    │   └── pytorch_model.bin
+    └── sdxl-vae-fp16-fix
+        └── diffusion_pytorch_model.bin
+```
+
+Launch the training task using the following command:
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/hunyuan_dit/train_hunyuan_dit_lora.py \
+  --pretrained_path models/HunyuanDiT/t2i \
+  --dataset_path data/dog \
+  --output_path ./models \
+  --max_epochs 1 \
+  --steps_per_epoch 500 \
+  --height 1024 \
+  --width 1024 \
+  --center_crop \
+  --precision "16-mixed" \
+  --learning_rate 1e-4 \
+  --lora_rank 4 \
+  --lora_alpha 4 \
+  --use_gradient_checkpointing
+```
+
+For more information about the parameters, please use `python examples/train/hunyuan_dit/train_hunyuan_dit_lora.py -h` to see the details.
+
+After training, use `model_manager.load_lora` to load the LoRA for inference.
+
+```python
+from diffsynth import ModelManager, HunyuanDiTImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+                             file_path_list=[
+                                 "models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin",
+                                 "models/HunyuanDiT/t2i/model/pytorch_model_ema.pt",
+                                 "models/HunyuanDiT/t2i/mt5/pytorch_model.bin",
+                                 "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin"
+                             ])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = HunyuanDiTImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+    prompt="一只小狗蹦蹦跳跳，周围是姹紫嫣红的鲜花，远处是山脉", 
+    negative_prompt="",
+    cfg_scale=7.5,
+    num_inference_steps=100, width=1024, height=1024,
+)
+image.save("image_with_lora.jpg")
+```
+
+### Stable Diffusion
+
+Only one file is required in the training script. We support the mainstream checkpoints in [CivitAI](https://civitai.com/). By default, we use the base Stable Diffusion v1.5. You can download it from [huggingface](https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors) or [modelscope](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-v1-5/resolve/master/v1-5-pruned-emaonly.safetensors). You can use the following code to download this file:
+
+```python
+from diffsynth import download_models
+
+download_models(["StableDiffusion_v15"])
+```
+
+```
+models/stable_diffusion
+├── Put Stable Diffusion checkpoints here.txt
+└── v1-5-pruned-emaonly.safetensors
+```
+
+Launch the training task using the following command:
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion/train_sd_lora.py \
+  --pretrained_path models/stable_diffusion/v1-5-pruned-emaonly.safetensors \
+  --dataset_path data/dog \
+  --output_path ./models \
+  --max_epochs 1 \
+  --steps_per_epoch 500 \
+  --height 512 \
+  --width 512 \
+  --center_crop \
+  --precision "16-mixed" \
+  --learning_rate 1e-4 \
+  --lora_rank 4 \
+  --lora_alpha 4 \
+  --use_gradient_checkpointing
+```
+
+For more information about the parameters, please use `python examples/train/stable_diffusion/train_sd_lora.py -h` to see the details.
+
+After training, use `model_manager.load_lora` to load the LoRA for inference.
+
+```python
+from diffsynth import ModelManager, SDImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+                             file_path_list=["models/stable_diffusion/v1-5-pruned-emaonly.safetensors"])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = SDImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+    prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds", 
+    negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
+    cfg_scale=7.5,
+    num_inference_steps=100, width=512, height=512,
+)
+image.save("image_with_lora.jpg")
+```
+
+### Stable Diffusion XL
+
+Only one file is required in the training script. We support the mainstream checkpoints in [CivitAI](https://civitai.com/). By default, we use the base Stable Diffusion XL. You can download it from [huggingface](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors) or [modelscope](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-xl-base-1.0/resolve/master/sd_xl_base_1.0.safetensors). You can use the following code to download this file:
+
+```python
+from diffsynth import download_models
+
+download_models(["StableDiffusionXL_v1"])
+```
+
+```
+models/stable_diffusion_xl
+├── Put Stable Diffusion XL checkpoints here.txt
+└── sd_xl_base_1.0.safetensors
+```
+
+We observed that Stable Diffusion XL is not float16-safe, so we recommend that users train with float32.
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_xl/train_sdxl_lora.py \
+  --pretrained_path models/stable_diffusion_xl/sd_xl_base_1.0.safetensors \
+  --dataset_path data/dog \
+  --output_path ./models \
+  --max_epochs 1 \
+  --steps_per_epoch 500 \
+  --height 1024 \
+  --width 1024 \
+  --center_crop \
+  --precision "32" \
+  --learning_rate 1e-4 \
+  --lora_rank 4 \
+  --lora_alpha 4 \
+  --use_gradient_checkpointing
+```
+
+For more information about the parameters, please use `python examples/train/stable_diffusion_xl/train_sdxl_lora.py -h` to see the details.
+
+After training, use `model_manager.load_lora` to load the LoRA for inference.
+
+```python
+from diffsynth import ModelManager, SDXLImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+                             file_path_list=["models/stable_diffusion_xl/sd_xl_base_1.0.safetensors"])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = SDXLImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+    prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds", 
+    negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
+    cfg_scale=7.5,
+    num_inference_steps=100, width=1024, height=1024,
+)
+image.save("image_with_lora.jpg")
+```
--- a/docs/source_en/GetStarted/Installation.md
+++ b/docs/source_en/GetStarted/Installation.md
@@ -0,0 +1,24 @@
+# Installation
+
+## From Source
+
+1. Clone the source repository:
+
+    ```bash
+    git clone https://github.com/modelscope/DiffSynth-Studio.git
+    ```
+
+2. Navigate to the project directory and install:
+
+    ```bash
+    cd DiffSynth-Studio
+    pip install -e .
+    ```
+
+## From PyPI
+
+Install directly via PyPI:
+
+```bash
+pip install diffsynth
+```
--- a/docs/source_en/GetStarted/ModelManager.md
+++ b/docs/source_en/GetStarted/ModelManager.md
--- a/docs/source_en/GetStarted/Models.md
+++ b/docs/source_en/GetStarted/Models.md
@@ -0,0 +1,17 @@
+# Models
+
+DiffSynth Studio currently supports the following models:
+
+* [FLUX](https://huggingface.co/black-forest-labs/FLUX.1-dev)
+* [ExVideo](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1)
+* [Kolors](https://huggingface.co/Kwai-Kolors/Kolors)
+* [Stable Diffusion 3](https://huggingface.co/stabilityai/stable-diffusion-3-medium)
+* [Stable Video Diffusion](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt)
+* [Hunyuan-DiT](https://github.com/Tencent/HunyuanDiT)
+* [RIFE](https://github.com/hzwer/ECCV2022-RIFE)
+* [ESRGAN](https://github.com/xinntao/ESRGAN)
+* [Ip-Adapter](https://github.com/tencent-ailab/IP-Adapter)
+* [AnimateDiff](https://github.com/guoyww/animatediff/)
+* [ControlNet](https://github.com/lllyasviel/ControlNet)
+* [Stable Diffusion XL](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+* [Stable Diffusion](https://huggingface.co/runwayml/stable-diffusion-v1-5)
--- a/docs/source_en/GetStarted/Pipelines.md
+++ b/docs/source_en/GetStarted/Pipelines.md
@@ -0,0 +1,27 @@
+# Pipelines
+
+The following tables list our pipelines and the models supported by each pipeline.
+
+## Image Pipelines
+
+Pipelines for generating images from text descriptions. Each pipeline relies on specific encoder and decoder models.
+
+| Pipeline                   | Models                                                     |
+|----------------------------|----------------------------------------------------------------|
+| HunyuanDiTImagePipeline     | text_encoder: HunyuanDiTCLIPTextEncoder<br>text_encoder_t5: HunyuanDiTT5TextEncoder<br>dit: HunyuanDiT<br>vae_decoder: SDVAEDecoder<br>vae_encoder: SDVAEEncoder |
+| SDImagePipeline             | text_encoder: SDTextEncoder<br>unet: SDUNet<br>vae_decoder: SDVAEDecoder<br>vae_encoder: SDVAEEncoder<br>controlnet: MultiControlNetManager<br>ipadapter_image_encoder: IpAdapterCLIPImageEmbedder<br>ipadapter: SDIpAdapter |
+| SD3ImagePipeline            | text_encoder_1: SD3TextEncoder1<br>text_encoder_2: SD3TextEncoder2<br>text_encoder_3: SD3TextEncoder3<br>dit: SD3DiT<br>vae_decoder: SD3VAEDecoder<br>vae_encoder: SD3VAEEncoder |
+| SDXLImagePipeline           | text_encoder: SDXLTextEncoder<br>text_encoder_2: SDXLTextEncoder2<br>text_encoder_kolors: ChatGLMModel<br>unet: SDXLUNet<br>vae_decoder: SDXLVAEDecoder<br>vae_encoder: SDXLVAEEncoder<br>controlnet: MultiControlNetManager<br>ipadapter_image_encoder: IpAdapterXLCLIPImageEmbedder<br>ipadapter: SDXLIpAdapter |
+
+## Video Pipelines
+
+Pipelines for generating videos from text descriptions. In addition to the models required for image generation, they include models for handling motion modules.
+
+| Pipeline                   | Models                                                     |
+|----------------------------|----------------------------------------------------------------|
+| SDVideoPipeline            | text_encoder: SDTextEncoder<br>unet: SDUNet<br>vae_decoder: SDVAEDecoder<br>vae_encoder: SDVAEEncoder<br>controlnet: MultiControlNetManager<br>ipadapter_image_encoder: IpAdapterCLIPImageEmbedder<br>ipadapter: SDIpAdapter<br>motion_modules: SDMotionModel |
+| SDXLVideoPipeline          | text_encoder: SDXLTextEncoder<br>text_encoder_2: SDXLTextEncoder2<br>text_encoder_kolors: ChatGLMModel<br>unet: SDXLUNet<br>vae_decoder: SDXLVAEDecoder<br>vae_encoder: SDXLVAEEncoder<br>ipadapter_image_encoder: IpAdapterXLCLIPImageEmbedder<br>ipadapter: SDXLIpAdapter<br>motion_modules: SDXLMotionModel |
+| SVDVideoPipeline           | image_encoder: SVDImageEncoder<br>unet: SVDUNet<br>vae_encoder: SVDVAEEncoder<br>vae_decoder: SVDVAEDecoder |
+
+
+
--- a/docs/source_en/GetStarted/PromptProcessing.md
+++ b/docs/source_en/GetStarted/PromptProcessing.md
@@ -0,0 +1,35 @@
+# Prompt Processing
+
+DiffSynth includes prompt processing functionality, which is divided into:
+
+- **Prompt Refiners (`prompt_refiner_classes`)**: Includes prompt refinement, prompt translation from Chinese to English, and both refinement and translation of prompts. Available parameters are as follows:
+
+    - **English Prompt Refinement**: 'BeautifulPrompt', using the model [pai-bloom-1b1-text2prompt-sd](https://modelscope.cn/models/AI-ModelScope/pai-bloom-1b1-text2prompt-sd).
+
+    - **Prompt Translation from Chinese to English**: 'Translator', using the model [opus-mt-zh-en](https://modelscope.cn/models/moxying/opus-mt-zh-en).
+
+    - **Prompt Translation and Refinement**: 'QwenPrompt', using the model [Qwen2-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2-1.5B-Instruct).
+
+- **Prompt Extenders (`prompt_extender_classes`)**: Based on Omost's prompt partition control expansion. The available parameter is:
+
+    - **Prompt Partition Expansion**: 'OmostPromter'.
+
+## Usage Instructions
+
+### Prompt Refiners
+
+When loading the model pipeline, you can specify the desired prompt refiner functionality using the `prompt_refiner_classes` parameter. For example code, refer to [sd_prompt_refining.py](examples/image_synthesis/sd_prompt_refining.py).
+
+Available `prompt_refiner_classes` parameters include: Translator, BeautifulPrompt, QwenPrompt.
+
+```python
+pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[Translator, BeautifulPrompt])
+```
+
+### Prompt Extenders
+
+When loading the model pipeline, you can specify the desired prompt extender using the `prompt_extender_classes` parameter. For example code, refer to [omost_flux_text_to_image.py](examples/image_synthesis/omost_flux_text_to_image.py).
+
+```python
+pipe = FluxImagePipeline.from_model_manager(model_manager, prompt_extender_classes=[OmostPromter])
+```
--- a/docs/source_en/GetStarted/Schedulers.md
+++ b/docs/source_en/GetStarted/Schedulers.md
@@ -0,0 +1,11 @@
+# Schedulers
+
+Schedulers control the entire denoising (or sampling) process of the model. When loading the Pipeline, DiffSynth automatically selects the most suitable schedulers for the current Pipeline, requiring no additional configuration.
+
+The supported schedulers are:
+
+- **EnhancedDDIMScheduler**: Extends the denoising process introduced in the Denoising Diffusion Probabilistic Models (DDPM) with non-Markovian guidance.
+
+- **FlowMatchScheduler**: Implements the flow matching sampling method introduced in Stable Diffusion 3.
+
+- **ContinuousODEScheduler**: A scheduler based on Ordinary Differential Equations (ODE).
--- a/docs/source_en/GetStarted/WebUI.md
+++ b/docs/source_en/GetStarted/WebUI.md
--- a/docs/source_en/conf.py
+++ b/docs/source_en/conf.py
@@ -0,0 +1,50 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# For the full list of built-in configuration values, see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Project information -----------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+
+
+import os
+import sys
+sys.path.insert(0, os.path.abspath('../../diffsynth'))
+
+project = 'DiffSynth-Studio'
+copyright = '2024, ModelScope'
+author = 'ModelScope'
+release = '0.1.0'
+
+
+# -- General configuration ---------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
+
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx.ext.napoleon',
+    'sphinx.ext.doctest',
+    'sphinx.ext.intersphinx',
+    'sphinx.ext.todo',
+    'sphinx.ext.coverage',
+    'sphinx.ext.imgmath',
+    'sphinx.ext.viewcode',
+    'recommonmark',
+    'sphinx_markdown_tables'
+]
+
+templates_path = ['_templates']
+exclude_patterns = []
+
+
+source_suffix = ['.rst', '.md']
+# -- Options for HTML output -------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
+
+html_theme = 'sphinx_rtd_theme'
+html_static_path = ['_static']
+# multi-language docs
+language = 'en'
+locale_dirs = ['../locales/']   # path is example but recommended.
+gettext_compact = False  # optional.
+gettext_uuid = True  # optional.
--- a/docs/source_en/index.rst
+++ b/docs/source_en/index.rst
@@ -0,0 +1,32 @@
+.. DiffSynth-Studio documentation master file, created by
+   sphinx-quickstart on Thu Sep  5 16:39:24 2024.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+DiffSynth-Studio documentation
+==============================
+
+Add your content using ``reStructuredText`` syntax. See the
+`reStructuredText <https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html>`_
+documentation for details.
+
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Contents:
+
+   GetStarted/A_simple_example.md
+   GetStarted/Download_models.md
+   GetStarted/ModelManager.md
+   GetStarted/Models.md
+   GetStarted/Pipelines.md
+   GetStarted/PromptProcessing.md
+   GetStarted/Schedulers.md
+   GetStarted/Fine-tuning.md
+   GetStarted/Extensions.md
+   GetStarted/WebUI.md
+
+
+.. toctree::
+   :maxdepth: 1
+   :caption: API Docs
--- a/docs/source_en/requirement.txt
+++ b/docs/source_en/requirement.txt
@@ -0,0 +1,4 @@
+recommonmark
+sphinx_rtd_theme
+myst-parser
+sphinx-markdown-tables
--- a/readthedocs.yaml
+++ b/readthedocs.yaml
@@ -0,0 +1,26 @@
+# .readthedocs.yaml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Set the version of Python and other tools you might need
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.11"
+
+python:
+  install:
+    - requirements: docs/requirements.txt
+
+# Build documentation in the docs/ directory with Sphinx
+sphinx:
+  configuration: docs/source/conf.py
+
+# We recommend specifying your dependencies to enable reproducible builds:
+# https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
+# python:
+#   install:
+#   - requirements: docs/requirements.txt