From 24b78148b823cc6df0b5f3ec1b869d402b52652a Mon Sep 17 00:00:00 2001
From: yrk111222 <2493404415@qq.com>
Date: Fri, 18 Oct 2024 11:36:48 +0800
Subject: [PATCH] Add files via upload
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
电脑更换,完成到D:\translate\DiffSynth-Studio\docs\source_en\finetune,该写第四个文档
---
docs/Makefile | 40 +-
docs/source/conf.py | 96 +-
.../creating/AdaptersForImageSynthesis.md | 266 +++---
docs/source/creating/BasicImageSynthesis.md | 130 +--
docs/source/creating/PromptRefine.md | 156 ++--
docs/source/creating/ToonShading.md | 190 ++--
docs/source/finetune/overview.md | 196 ++--
docs/source/finetune/train_flux_lora.md | 142 +--
.../source/finetune/train_hunyuan_dit_lora.md | 144 +--
docs/source/finetune/train_kolors_lora.md | 156 ++--
docs/source/finetune/train_sd3_lora.md | 118 +--
docs/source/finetune/train_sd_lora.md | 118 +--
docs/source/finetune/train_sdxl_lora.md | 114 +--
docs/source/index.rst | 88 +-
docs/source/requirement.txt | 6 +-
docs/source/tutorial/ASimpleExample.md | 170 ++--
docs/source/tutorial/DownloadModels.md | 68 +-
docs/source/tutorial/Extensions.md | 98 +-
docs/source/tutorial/Installation.md | 50 +-
docs/source/tutorial/Models.md | 36 +-
docs/source/tutorial/Pipelines.md | 44 +-
docs/source/tutorial/PromptProcessing.md | 74 +-
docs/source/tutorial/Schedulers.md | 22 +-
docs/source_en/GetStarted/A_simple_example.md | 164 ++--
docs/source_en/GetStarted/Download_models.md | 40 +-
docs/source_en/GetStarted/Extensions.md | 20 +-
docs/source_en/GetStarted/Fine-tuning.md | 852 +++++++++---------
docs/source_en/GetStarted/Installation.md | 46 +-
docs/source_en/GetStarted/Models.md | 34 +-
docs/source_en/GetStarted/Pipelines.md | 54 +-
docs/source_en/GetStarted/PromptProcessing.md | 70 +-
docs/source_en/GetStarted/Schedulers.md | 20 +-
docs/source_en/conf.py | 98 +-
.../creating/AdaptersForImageSynthesis.md | 135 +++
.../source_en/creating/BasicImageSynthesis.md | 64 ++
docs/source_en/creating/PromptRefine.md | 77 ++
docs/source_en/creating/ToonShading.md | 95 ++
docs/source_en/finetune/overview.md | 102 +++
docs/source_en/finetune/train_flux_lora.md | 70 ++
.../finetune/train_hunyuan_dit_lora.md | 72 ++
docs/source_en/finetune/train_kolors_lora.md | 78 ++
docs/source_en/finetune/train_sd3_lora.md | 59 ++
docs/source_en/finetune/train_sd_lora.md | 59 ++
docs/source_en/finetune/train_sdxl_lora.md | 57 ++
docs/source_en/index.rst | 64 +-
docs/source_en/requirement.txt | 6 +-
46 files changed, 2863 insertions(+), 1995 deletions(-)
create mode 100644 docs/source_en/creating/AdaptersForImageSynthesis.md
create mode 100644 docs/source_en/creating/BasicImageSynthesis.md
create mode 100644 docs/source_en/creating/PromptRefine.md
create mode 100644 docs/source_en/creating/ToonShading.md
create mode 100644 docs/source_en/finetune/overview.md
create mode 100644 docs/source_en/finetune/train_flux_lora.md
create mode 100644 docs/source_en/finetune/train_hunyuan_dit_lora.md
create mode 100644 docs/source_en/finetune/train_kolors_lora.md
create mode 100644 docs/source_en/finetune/train_sd3_lora.md
create mode 100644 docs/source_en/finetune/train_sd_lora.md
create mode 100644 docs/source_en/finetune/train_sdxl_lora.md
diff --git a/docs/Makefile b/docs/Makefile
index d0c3cbf..26b9422 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -1,20 +1,20 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line, and also
-# from the environment for the first two.
-SPHINXOPTS ?=
-SPHINXBUILD ?= sphinx-build
-SOURCEDIR = source
-BUILDDIR = build
-
-# Put it first so that "make" without argument is like "make help".
-help:
- @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
- @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS ?=
+SPHINXBUILD ?= sphinx-build
+SOURCEDIR = source
+BUILDDIR = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 233a1fd..bde92b4 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -1,49 +1,49 @@
-# Configuration file for the Sphinx documentation builder.
-#
-# For the full list of built-in configuration values, see the documentation:
-# https://www.sphinx-doc.org/en/master/usage/configuration.html
-
-# -- Project information -----------------------------------------------------
-# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
-
-import os
-import sys
-sys.path.insert(0, os.path.abspath('../../diffsynth'))
-
-project = 'DiffSynth-Studio'
-copyright = '2024, ModelScope'
-author = 'ModelScope'
-release = '0.1.0'
-
-
-# -- General configuration ---------------------------------------------------
-# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
-
-extensions = [
- 'sphinx.ext.autodoc',
- 'sphinx.ext.napoleon',
- 'sphinx.ext.doctest',
- 'sphinx.ext.intersphinx',
- 'sphinx.ext.todo',
- 'sphinx.ext.coverage',
- 'sphinx.ext.imgmath',
- 'sphinx.ext.viewcode',
- 'recommonmark',
- 'sphinx_markdown_tables'
-]
-
-templates_path = ['_templates']
-exclude_patterns = []
-
-
-source_suffix = ['.rst', '.md']
-# -- Options for HTML output -------------------------------------------------
-# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
-
-html_theme = 'sphinx_rtd_theme'
-html_static_path = ['_static']
-# multi-language docs
-language = 'zh_CN'
-locale_dirs = ['../locales/'] # path is example but recommended.
-gettext_compact = False # optional.
+# Configuration file for the Sphinx documentation builder.
+#
+# For the full list of built-in configuration values, see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Project information -----------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+
+import os
+import sys
+sys.path.insert(0, os.path.abspath('../../diffsynth'))
+
+project = 'DiffSynth-Studio'
+copyright = '2024, ModelScope'
+author = 'ModelScope'
+release = '0.1.0'
+
+
+# -- General configuration ---------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
+
+extensions = [
+ 'sphinx.ext.autodoc',
+ 'sphinx.ext.napoleon',
+ 'sphinx.ext.doctest',
+ 'sphinx.ext.intersphinx',
+ 'sphinx.ext.todo',
+ 'sphinx.ext.coverage',
+ 'sphinx.ext.imgmath',
+ 'sphinx.ext.viewcode',
+ 'recommonmark',
+ 'sphinx_markdown_tables'
+]
+
+templates_path = ['_templates']
+exclude_patterns = []
+
+
+source_suffix = ['.rst', '.md']
+# -- Options for HTML output -------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
+
+html_theme = 'sphinx_rtd_theme'
+html_static_path = ['_static']
+# multi-language docs
+language = 'zh_CN'
+locale_dirs = ['../locales/'] # path is example but recommended.
+gettext_compact = False # optional.
gettext_uuid = True # optional.
\ No newline at end of file
diff --git a/docs/source/creating/AdaptersForImageSynthesis.md b/docs/source/creating/AdaptersForImageSynthesis.md
index 45dfd48..e242b8d 100644
--- a/docs/source/creating/AdaptersForImageSynthesis.md
+++ b/docs/source/creating/AdaptersForImageSynthesis.md
@@ -1,133 +1,133 @@
-# ControlNet、LoRA、IP-Adapter——精准控制技术
-
-在文生图模型的基础上,还可以使用各种 Adapter 架构的模型对生成过程进行控制。
-
-接下来的例子会用到很多模型,我们先把它们下载好。
-
-* 一个广受好评的 Stable Diffusion XL 架构动漫风格模型
-* 一个支持多种控制模式的 ControlNet 模型
-* 一个 Stable Diffusion XL 模型的 LoRA 模型
-* 一个 IP-Adapter 模型及其对应的图像编码器
-
-```python
-from diffsynth import download_models
-
-download_models([
- "BluePencilXL_v200",
- "ControlNet_union_sdxl_promax",
- "SDXL_lora_zyd232_ChineseInkStyle_SDXL_v1_0",
- "IP-Adapter-SDXL"
-])
-```
-
-用基础文生图功能生成一张图
-
-```python
-from diffsynth import ModelManager, SDXLImagePipeline
-import torch
-
-model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
-model_manager.load_models(["models/stable_diffusion_xl/bluePencilXL_v200.safetensors"])
-pipe = SDXLImagePipeline.from_model_manager(model_manager)
-torch.manual_seed(1)
-image = pipe(
- prompt="masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait,",
- negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
- cfg_scale=6, num_inference_steps=60,
-)
-image.save("image.jpg")
-```
-
-
-
-接下来,我们让这位水下翩翩起舞的少女变成火系魔法师!启用 ControlNet 保持画面结构的同时,修改提示词。
-
-```python
-from diffsynth import ModelManager, SDXLImagePipeline, ControlNetConfigUnit
-import torch
-from PIL import Image
-
-model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
-model_manager.load_models([
- "models/stable_diffusion_xl/bluePencilXL_v200.safetensors",
- "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors"
-])
-pipe = SDXLImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
- ControlNetConfigUnit("depth", "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors", scale=1)
-])
-torch.manual_seed(2)
-image = pipe(
- prompt="masterpiece, best quality, solo, long hair, wavy hair, pink hair, red eyes, red dress, medium breasts, dress, fire ball, fire background, floating hair, refraction, portrait,",
- negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw, white background",
- cfg_scale=6, num_inference_steps=60,
- controlnet_image=Image.open("image.jpg")
-)
-image.save("image_controlnet.jpg")
-```
-
-
-
-很酷对不对?还有更酷的,加个 LoRA,让画面更贴近手绘漫画的扁平风格。这个 LoRA 需要一定的触发词才能生效,这在原作者的模型页面有提到,记得在提示词的开头加上触发词哦。
-
-```python
-from diffsynth import ModelManager, SDXLImagePipeline, ControlNetConfigUnit
-import torch
-from PIL import Image
-
-model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
-model_manager.load_models([
- "models/stable_diffusion_xl/bluePencilXL_v200.safetensors",
- "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors"
-])
-model_manager.load_lora("models/lora/zyd232_ChineseInkStyle_SDXL_v1_0.safetensors", lora_alpha=1.0)
-pipe = SDXLImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
- ControlNetConfigUnit("depth", "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors", scale=1.0)
-])
-torch.manual_seed(3)
-image = pipe(
- prompt="zydink, ink sketch, flat anime, masterpiece, best quality, solo, long hair, wavy hair, pink hair, red eyes, red dress, medium breasts, dress, fire ball, fire background, floating hair, refraction, portrait,",
- negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw, white background",
- cfg_scale=6, num_inference_steps=60,
- controlnet_image=Image.open("image.jpg")
-)
-image.save("image_lora.jpg")
-```
-
-
-
-还没结束呢!找一张水墨风的中国画作为风格引导,启动 IP-Adapter,让古典艺术和现代美学碰撞!
-
-|就用这张图作为风格引导吧||
-|-|-|
-
-```python
-from diffsynth import ModelManager, SDXLImagePipeline, ControlNetConfigUnit
-import torch
-from PIL import Image
-
-model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
-model_manager.load_models([
- "models/stable_diffusion_xl/bluePencilXL_v200.safetensors",
- "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors",
- "models/IpAdapter/stable_diffusion_xl/ip-adapter_sdxl.bin",
- "models/IpAdapter/stable_diffusion_xl/image_encoder/model.safetensors",
-])
-model_manager.load_lora("models/lora/zyd232_ChineseInkStyle_SDXL_v1_0.safetensors", lora_alpha=1.0)
-pipe = SDXLImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
- ControlNetConfigUnit("depth", "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors", scale=1.0)
-])
-torch.manual_seed(2)
-image = pipe(
- prompt="zydink, ink sketch, flat anime, masterpiece, best quality, solo, long hair, wavy hair, pink hair, red eyes, red dress, medium breasts, dress, fire ball, fire background, floating hair, refraction, portrait,",
- negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw, white background",
- cfg_scale=6, num_inference_steps=60,
- controlnet_image=Image.open("image.jpg"),
- ipadapter_images=[Image.open("ink_style.jpg")],
- ipadapter_use_instant_style=True, ipadapter_scale=0.5
-)
-image.save("image_ipadapter.jpg")
-```
-
-
-
-用 Diffusion 生成图像的乐趣在于,各种生态模型的组合,可以实现各种奇思妙想。
+# ControlNet、LoRA、IP-Adapter——精准控制技术
+
+在文生图模型的基础上,还可以使用各种 Adapter 架构的模型对生成过程进行控制。
+
+接下来的例子会用到很多模型,我们先把它们下载好。
+
+* 一个广受好评的 Stable Diffusion XL 架构动漫风格模型
+* 一个支持多种控制模式的 ControlNet 模型
+* 一个 Stable Diffusion XL 模型的 LoRA 模型
+* 一个 IP-Adapter 模型及其对应的图像编码器
+
+```python
+from diffsynth import download_models
+
+download_models([
+ "BluePencilXL_v200",
+ "ControlNet_union_sdxl_promax",
+ "SDXL_lora_zyd232_ChineseInkStyle_SDXL_v1_0",
+ "IP-Adapter-SDXL"
+])
+```
+
+用基础文生图功能生成一张图
+
+```python
+from diffsynth import ModelManager, SDXLImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
+model_manager.load_models(["models/stable_diffusion_xl/bluePencilXL_v200.safetensors"])
+pipe = SDXLImagePipeline.from_model_manager(model_manager)
+torch.manual_seed(1)
+image = pipe(
+ prompt="masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait,",
+ negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
+ cfg_scale=6, num_inference_steps=60,
+)
+image.save("image.jpg")
+```
+
+
+
+接下来,我们让这位水下翩翩起舞的少女变成火系魔法师!启用 ControlNet 保持画面结构的同时,修改提示词。
+
+```python
+from diffsynth import ModelManager, SDXLImagePipeline, ControlNetConfigUnit
+import torch
+from PIL import Image
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
+model_manager.load_models([
+ "models/stable_diffusion_xl/bluePencilXL_v200.safetensors",
+ "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors"
+])
+pipe = SDXLImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
+ ControlNetConfigUnit("depth", "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors", scale=1)
+])
+torch.manual_seed(2)
+image = pipe(
+ prompt="masterpiece, best quality, solo, long hair, wavy hair, pink hair, red eyes, red dress, medium breasts, dress, fire ball, fire background, floating hair, refraction, portrait,",
+ negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw, white background",
+ cfg_scale=6, num_inference_steps=60,
+ controlnet_image=Image.open("image.jpg")
+)
+image.save("image_controlnet.jpg")
+```
+
+
+
+很酷对不对?还有更酷的,加个 LoRA,让画面更贴近手绘漫画的扁平风格。这个 LoRA 需要一定的触发词才能生效,这在原作者的模型页面有提到,记得在提示词的开头加上触发词哦。
+
+```python
+from diffsynth import ModelManager, SDXLImagePipeline, ControlNetConfigUnit
+import torch
+from PIL import Image
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
+model_manager.load_models([
+ "models/stable_diffusion_xl/bluePencilXL_v200.safetensors",
+ "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors"
+])
+model_manager.load_lora("models/lora/zyd232_ChineseInkStyle_SDXL_v1_0.safetensors", lora_alpha=1.0)
+pipe = SDXLImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
+ ControlNetConfigUnit("depth", "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors", scale=1.0)
+])
+torch.manual_seed(3)
+image = pipe(
+ prompt="zydink, ink sketch, flat anime, masterpiece, best quality, solo, long hair, wavy hair, pink hair, red eyes, red dress, medium breasts, dress, fire ball, fire background, floating hair, refraction, portrait,",
+ negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw, white background",
+ cfg_scale=6, num_inference_steps=60,
+ controlnet_image=Image.open("image.jpg")
+)
+image.save("image_lora.jpg")
+```
+
+
+
+还没结束呢!找一张水墨风的中国画作为风格引导,启动 IP-Adapter,让古典艺术和现代美学碰撞!
+
+|就用这张图作为风格引导吧||
+|-|-|
+
+```python
+from diffsynth import ModelManager, SDXLImagePipeline, ControlNetConfigUnit
+import torch
+from PIL import Image
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
+model_manager.load_models([
+ "models/stable_diffusion_xl/bluePencilXL_v200.safetensors",
+ "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors",
+ "models/IpAdapter/stable_diffusion_xl/ip-adapter_sdxl.bin",
+ "models/IpAdapter/stable_diffusion_xl/image_encoder/model.safetensors",
+])
+model_manager.load_lora("models/lora/zyd232_ChineseInkStyle_SDXL_v1_0.safetensors", lora_alpha=1.0)
+pipe = SDXLImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
+ ControlNetConfigUnit("depth", "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors", scale=1.0)
+])
+torch.manual_seed(2)
+image = pipe(
+ prompt="zydink, ink sketch, flat anime, masterpiece, best quality, solo, long hair, wavy hair, pink hair, red eyes, red dress, medium breasts, dress, fire ball, fire background, floating hair, refraction, portrait,",
+ negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw, white background",
+ cfg_scale=6, num_inference_steps=60,
+ controlnet_image=Image.open("image.jpg"),
+ ipadapter_images=[Image.open("ink_style.jpg")],
+ ipadapter_use_instant_style=True, ipadapter_scale=0.5
+)
+image.save("image_ipadapter.jpg")
+```
+
+
+
+用 Diffusion 生成图像的乐趣在于,各种生态模型的组合,可以实现各种奇思妙想。
diff --git a/docs/source/creating/BasicImageSynthesis.md b/docs/source/creating/BasicImageSynthesis.md
index 7d3864a..7814662 100644
--- a/docs/source/creating/BasicImageSynthesis.md
+++ b/docs/source/creating/BasicImageSynthesis.md
@@ -1,65 +1,65 @@
-# 文生图、图生图、高分辨率修复——初识绚丽的 Diffusion
-
-加载文生图模型,这里我们使用一个 Civiai 上一个动漫风格的模型作为例子。
-
-```python
-import torch
-from diffsynth import ModelManager, SDImagePipeline, download_models
-
-download_models(["AingDiffusion_v12"])
-model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
-model_manager.load_models(["models/stable_diffusion/aingdiffusion_v12.safetensors"])
-pipe = SDImagePipeline.from_model_manager(model_manager)
-```
-
-生成一张图小试身手。
-
-```python
-torch.manual_seed(0)
-image = pipe(
- prompt="masterpiece, best quality, a girl with long silver hair",
- negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
- height=512, width=512, num_inference_steps=80,
-)
-image.save("image.jpg")
-```
-
-嗯,一个可爱的小姐姐。
-
-
-
-用图生图功能把她的头发变成红色,只需要添加 `input_image` 和 `denoising_strength` 两个参数。其中 `denoising_strength` 用于控制加噪声的强度,为 0 时生成的图与输入的图完全一致,为 1 时完全随机生成图。
-
-```python
-torch.manual_seed(1)
-image_edited = pipe(
- prompt="masterpiece, best quality, a girl with long red hair",
- negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
- height=512, width=512, num_inference_steps=80,
- input_image=image, denoising_strength=0.6,
-)
-image_edited.save("image_edited.jpg")
-```
-
-嗯,一个红色头发的可爱小姐姐。
-
-
-
-由于模型本身是在 512*512 分辨率下训练的,所以图片看起来有点模糊,不过我们可以利用模型自身的能力润色这张图,为其填充细节。具体来说,就是提高分辨率后进行图生图。
-
-```python
-torch.manual_seed(2)
-image_highres = pipe(
- prompt="masterpiece, best quality, a girl with long red hair",
- negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
- height=1024, width=1024, num_inference_steps=80,
- input_image=image_edited.resize((1024, 1024)), denoising_strength=0.6,
-)
-image_highres.save("image_highres.jpg")
-```
-
-嗯,一个清晰的红色头发可爱小姐姐。
-
-
-
-值得注意的是,图生图和高分辨率修复功能是全局支持的,目前我们所有的图像生成流水线都可以这样使用。
+# 文生图、图生图、高分辨率修复——初识绚丽的 Diffusion
+
+加载文生图模型,这里我们使用 Civitai 上的一个动漫风格模型作为例子。
+
+```python
+import torch
+from diffsynth import ModelManager, SDImagePipeline, download_models
+
+download_models(["AingDiffusion_v12"])
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
+model_manager.load_models(["models/stable_diffusion/aingdiffusion_v12.safetensors"])
+pipe = SDImagePipeline.from_model_manager(model_manager)
+```
+
+生成一张图小试身手。
+
+```python
+torch.manual_seed(0)
+image = pipe(
+ prompt="masterpiece, best quality, a girl with long silver hair",
+ negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
+ height=512, width=512, num_inference_steps=80,
+)
+image.save("image.jpg")
+```
+
+嗯,一个可爱的小姐姐。
+
+
+
+用图生图功能把她的头发变成红色,只需要添加 `input_image` 和 `denoising_strength` 两个参数。其中 `denoising_strength` 用于控制加噪声的强度,为 0 时生成的图与输入的图完全一致,为 1 时完全随机生成图。
+
+```python
+torch.manual_seed(1)
+image_edited = pipe(
+ prompt="masterpiece, best quality, a girl with long red hair",
+ negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
+ height=512, width=512, num_inference_steps=80,
+ input_image=image, denoising_strength=0.6,
+)
+image_edited.save("image_edited.jpg")
+```
+
+嗯,一个红色头发的可爱小姐姐。
+
+
+
+由于模型本身是在 512*512 分辨率下训练的,所以图片看起来有点模糊,不过我们可以利用模型自身的能力润色这张图,为其填充细节。具体来说,就是提高分辨率后进行图生图。
+
+```python
+torch.manual_seed(2)
+image_highres = pipe(
+ prompt="masterpiece, best quality, a girl with long red hair",
+ negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
+ height=1024, width=1024, num_inference_steps=80,
+ input_image=image_edited.resize((1024, 1024)), denoising_strength=0.6,
+)
+image_highres.save("image_highres.jpg")
+```
+
+嗯,一个清晰的红色头发可爱小姐姐。
+
+
+
+值得注意的是,图生图和高分辨率修复功能是全局支持的,目前我们所有的图像生成流水线都可以这样使用。
diff --git a/docs/source/creating/PromptRefine.md b/docs/source/creating/PromptRefine.md
index c113140..69e413e 100644
--- a/docs/source/creating/PromptRefine.md
+++ b/docs/source/creating/PromptRefine.md
@@ -1,78 +1,78 @@
-# 翻译、润色——提示词的魔法
-
-在生成图像时,我们需要编写提示词,用来描述图像的内容。提示词会直接影响生成的效果,但提示词的编写也是一门学问,好的提示词可以生成具有高度美感的图像,我们提供了一系列模型来帮助用户处理提示词。
-
-## 翻译
-
-目前大多数文生图模型都是只支持英文提示词的,对于非英文母语的用户,使用起来有些困难,我们可以使用开源的翻译模型把提示词翻译成英文。在下面这个例子中,我们以“一个女孩”为提示词,使用模型 opus-mt-zh-en(可在 [HuggingFace](https://huggingface.co/Helsinki-NLP/opus-mt-zh-en) 或 [ModelScope](https://modelscope.cn/models/moxying/opus-mt-zh-en) 下载)进行翻译。
-
-```python
-from diffsynth import ModelManager, SDXLImagePipeline, Translator
-import torch
-
-model_manager = ModelManager(
- torch_dtype=torch.float16, device="cuda",
- model_id_list=["BluePencilXL_v200", "opus-mt-zh-en"]
-)
-pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[Translator])
-
-torch.manual_seed(0)
-prompt = "一个女孩"
-image = pipe(
- prompt=prompt, negative_prompt="",
- height=1024, width=1024, num_inference_steps=30
-)
-image.save("image_1.jpg")
-```
-
-
-
-## 润色
-
-详细的提示词可以生成细节更丰富的图像,我们可以使用提示词润色模型 BeautifulPrompt(可在 [HuggingFace](https://huggingface.co/alibaba-pai/pai-bloom-1b1-text2prompt-sd) 或 [ModelScope](https://modelscope.cn/models/AI-ModelScope/pai-bloom-1b1-text2prompt-sd) 下载)对简单的提示词进行润色,这个模型能够让整体画面风格更加华丽。
-
-这个模块可以和翻译模块同时启用,但请注意顺序,先翻译,后润色。
-
-```python
-from diffsynth import ModelManager, SDXLImagePipeline, Translator, BeautifulPrompt
-import torch
-
-model_manager = ModelManager(
- torch_dtype=torch.float16, device="cuda",
- model_id_list=["BluePencilXL_v200", "opus-mt-zh-en", "BeautifulPrompt"]
-)
-pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[Translator, BeautifulPrompt])
-
-torch.manual_seed(0)
-prompt = "一个女孩"
-image = pipe(
- prompt=prompt, negative_prompt="",
- height=1024, width=1024, num_inference_steps=30
-)
-image.save("image_2.jpg")
-```
-
-
-
-我们还内置了一个通义千问模型,这个模型可以一步到位地完成提示词的翻译和润色工作。
-
-```python
-from diffsynth import ModelManager, SDXLImagePipeline, QwenPrompt
-import torch
-
-model_manager = ModelManager(
- torch_dtype=torch.float16, device="cuda",
- model_id_list=["BluePencilXL_v200", "QwenPrompt"]
-)
-pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[QwenPrompt])
-
-torch.manual_seed(0)
-prompt = "一个女孩"
-image = pipe(
- prompt=prompt, negative_prompt="",
- height=1024, width=1024, num_inference_steps=30
-)
-image.save("image_3.jpg")
-```
-
-
+# 翻译、润色——提示词的魔法
+
+在生成图像时,我们需要编写提示词,用来描述图像的内容。提示词会直接影响生成的效果,但提示词的编写也是一门学问,好的提示词可以生成具有高度美感的图像,我们提供了一系列模型来帮助用户处理提示词。
+
+## 翻译
+
+目前大多数文生图模型都是只支持英文提示词的,对于非英文母语的用户,使用起来有些困难,我们可以使用开源的翻译模型把提示词翻译成英文。在下面这个例子中,我们以“一个女孩”为提示词,使用模型 opus-mt-zh-en(可在 [HuggingFace](https://huggingface.co/Helsinki-NLP/opus-mt-zh-en) 或 [ModelScope](https://modelscope.cn/models/moxying/opus-mt-zh-en) 下载)进行翻译。
+
+```python
+from diffsynth import ModelManager, SDXLImagePipeline, Translator
+import torch
+
+model_manager = ModelManager(
+ torch_dtype=torch.float16, device="cuda",
+ model_id_list=["BluePencilXL_v200", "opus-mt-zh-en"]
+)
+pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[Translator])
+
+torch.manual_seed(0)
+prompt = "一个女孩"
+image = pipe(
+ prompt=prompt, negative_prompt="",
+ height=1024, width=1024, num_inference_steps=30
+)
+image.save("image_1.jpg")
+```
+
+
+
+## 润色
+
+详细的提示词可以生成细节更丰富的图像,我们可以使用提示词润色模型 BeautifulPrompt(可在 [HuggingFace](https://huggingface.co/alibaba-pai/pai-bloom-1b1-text2prompt-sd) 或 [ModelScope](https://modelscope.cn/models/AI-ModelScope/pai-bloom-1b1-text2prompt-sd) 下载)对简单的提示词进行润色,这个模型能够让整体画面风格更加华丽。
+
+这个模块可以和翻译模块同时启用,但请注意顺序,先翻译,后润色。
+
+```python
+from diffsynth import ModelManager, SDXLImagePipeline, Translator, BeautifulPrompt
+import torch
+
+model_manager = ModelManager(
+ torch_dtype=torch.float16, device="cuda",
+ model_id_list=["BluePencilXL_v200", "opus-mt-zh-en", "BeautifulPrompt"]
+)
+pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[Translator, BeautifulPrompt])
+
+torch.manual_seed(0)
+prompt = "一个女孩"
+image = pipe(
+ prompt=prompt, negative_prompt="",
+ height=1024, width=1024, num_inference_steps=30
+)
+image.save("image_2.jpg")
+```
+
+
+
+我们还内置了一个通义千问模型,这个模型可以一步到位地完成提示词的翻译和润色工作。
+
+```python
+from diffsynth import ModelManager, SDXLImagePipeline, QwenPrompt
+import torch
+
+model_manager = ModelManager(
+ torch_dtype=torch.float16, device="cuda",
+ model_id_list=["BluePencilXL_v200", "QwenPrompt"]
+)
+pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[QwenPrompt])
+
+torch.manual_seed(0)
+prompt = "一个女孩"
+image = pipe(
+ prompt=prompt, negative_prompt="",
+ height=1024, width=1024, num_inference_steps=30
+)
+image.save("image_3.jpg")
+```
+
+
diff --git a/docs/source/creating/ToonShading.md b/docs/source/creating/ToonShading.md
index 729ef18..98e7bc0 100644
--- a/docs/source/creating/ToonShading.md
+++ b/docs/source/creating/ToonShading.md
@@ -1,95 +1,95 @@
-# 当图像模型遇见 AnimateDiff——模型组合技术
-
-我们已经领略到了 Stable Diffusion 模型及其生态模型的强大图像生成能力,现在我们引入一个新的模块:AnimateDiff,这样一来就可以把图像模型的能力迁移到视频中。在本篇文章中,我们为您展示基于 DiffSynth-Studio 搭建的动漫风格视频渲染方案:Diffutoon。
-
-## 下载模型
-
-接下来的例子会用到很多模型,我们先把它们下载好。
-
-* 一个动漫风格的 Stable Diffusion 架构模型
-* 两个 ControlNet 模型
-* 一个 Textual Inversion 模型
-* 一个 AnimateDiff 模型
-
-```python
-from diffsynth import download_models
-
-download_models([
- "AingDiffusion_v12",
- "AnimateDiff_v2",
- "ControlNet_v11p_sd15_lineart",
- "ControlNet_v11f1e_sd15_tile",
- "TextualInversion_VeryBadImageNegative_v1.3"
-])
-```
-
-## 下载视频
-
-你可以随意选择任何你喜欢的视频,我们使用[这个视频](https://www.bilibili.com/video/BV1iG411a7sQ)作为演示,你可以通过以下命令下载这个视频文件,但请注意,在没有获得视频原作者的商用版权时,请不要将其用作商业用途。
-
-```
-modelscope download --dataset Artiprocher/examples_in_diffsynth data/examples/diffutoon/input_video.mp4 --local_dir ./
-```
-
-## 生成动漫
-
-```python
-from diffsynth import ModelManager, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video
-import torch
-
-# Load models
-model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
-model_manager.load_models([
- "models/stable_diffusion/aingdiffusion_v12.safetensors",
- "models/AnimateDiff/mm_sd_v15_v2.ckpt",
- "models/ControlNet/control_v11p_sd15_lineart.pth",
- "models/ControlNet/control_v11f1e_sd15_tile.pth",
-])
-
-# Build pipeline
-pipe = SDVideoPipeline.from_model_manager(
- model_manager,
- [
- ControlNetConfigUnit(
- processor_id="tile",
- model_path="models/ControlNet/control_v11f1e_sd15_tile.pth",
- scale=0.5
- ),
- ControlNetConfigUnit(
- processor_id="lineart",
- model_path="models/ControlNet/control_v11p_sd15_lineart.pth",
- scale=0.5
- )
- ]
-)
-pipe.prompter.load_textual_inversions(["models/textual_inversion/verybadimagenegative_v1.3.pt"])
-
-# Load video
-video = VideoData(
- video_file="data/examples/diffutoon/input_video.mp4",
- height=1536, width=1536
-)
-input_video = [video[i] for i in range(30)]
-
-# Generate
-torch.manual_seed(0)
-output_video = pipe(
- prompt="best quality, perfect anime illustration, light, a girl is dancing, smile, solo",
- negative_prompt="verybadimagenegative_v1.3",
- cfg_scale=7, clip_skip=2,
- input_frames=input_video, denoising_strength=1.0,
- controlnet_frames=input_video, num_frames=len(input_video),
- num_inference_steps=10, height=1536, width=1536,
- animatediff_batch_size=16, animatediff_stride=8,
-)
-
-# Save video
-save_video(output_video, "output_video.mp4", fps=30)
-```
-
-## 效果展示
-
-
+# 当图像模型遇见 AnimateDiff——模型组合技术
+
+我们已经领略到了 Stable Diffusion 模型及其生态模型的强大图像生成能力,现在我们引入一个新的模块:AnimateDiff,这样一来就可以把图像模型的能力迁移到视频中。在本篇文章中,我们为您展示基于 DiffSynth-Studio 搭建的动漫风格视频渲染方案:Diffutoon。
+
+## 下载模型
+
+接下来的例子会用到很多模型,我们先把它们下载好。
+
+* 一个动漫风格的 Stable Diffusion 架构模型
+* 两个 ControlNet 模型
+* 一个 Textual Inversion 模型
+* 一个 AnimateDiff 模型
+
+```python
+from diffsynth import download_models
+
+download_models([
+ "AingDiffusion_v12",
+ "AnimateDiff_v2",
+ "ControlNet_v11p_sd15_lineart",
+ "ControlNet_v11f1e_sd15_tile",
+ "TextualInversion_VeryBadImageNegative_v1.3"
+])
+```
+
+## 下载视频
+
+你可以随意选择任何你喜欢的视频,我们使用[这个视频](https://www.bilibili.com/video/BV1iG411a7sQ)作为演示,你可以通过以下命令下载这个视频文件,但请注意,在未获得视频原作者的商用授权时,请不要将其用作商业用途。
+
+```
+modelscope download --dataset Artiprocher/examples_in_diffsynth data/examples/diffutoon/input_video.mp4 --local_dir ./
+```
+
+## 生成动漫
+
+```python
+from diffsynth import ModelManager, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video
+import torch
+
+# Load models
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
+model_manager.load_models([
+ "models/stable_diffusion/aingdiffusion_v12.safetensors",
+ "models/AnimateDiff/mm_sd_v15_v2.ckpt",
+ "models/ControlNet/control_v11p_sd15_lineart.pth",
+ "models/ControlNet/control_v11f1e_sd15_tile.pth",
+])
+
+# Build pipeline
+pipe = SDVideoPipeline.from_model_manager(
+ model_manager,
+ [
+ ControlNetConfigUnit(
+ processor_id="tile",
+ model_path="models/ControlNet/control_v11f1e_sd15_tile.pth",
+ scale=0.5
+ ),
+ ControlNetConfigUnit(
+ processor_id="lineart",
+ model_path="models/ControlNet/control_v11p_sd15_lineart.pth",
+ scale=0.5
+ )
+ ]
+)
+pipe.prompter.load_textual_inversions(["models/textual_inversion/verybadimagenegative_v1.3.pt"])
+
+# Load video
+video = VideoData(
+ video_file="data/examples/diffutoon/input_video.mp4",
+ height=1536, width=1536
+)
+input_video = [video[i] for i in range(30)]
+
+# Generate
+torch.manual_seed(0)
+output_video = pipe(
+ prompt="best quality, perfect anime illustration, light, a girl is dancing, smile, solo",
+ negative_prompt="verybadimagenegative_v1.3",
+ cfg_scale=7, clip_skip=2,
+ input_frames=input_video, denoising_strength=1.0,
+ controlnet_frames=input_video, num_frames=len(input_video),
+ num_inference_steps=10, height=1536, width=1536,
+ animatediff_batch_size=16, animatediff_stride=8,
+)
+
+# Save video
+save_video(output_video, "output_video.mp4", fps=30)
+```
+
+## 效果展示
+
+
diff --git a/docs/source/finetune/overview.md b/docs/source/finetune/overview.md
index ded131c..7d55ced 100644
--- a/docs/source/finetune/overview.md
+++ b/docs/source/finetune/overview.md
@@ -1,98 +1,98 @@
-# 训练框架
-
-我们实现了一个用于文本到图像扩散模型的训练框架,使用户能够轻松地使用我们的框架训练 LoRA 模型。我们提供的脚本具有以下特点:
-
-* **功能全面**:我们的训练框架支持多GPU和多机器配置,便于使用 DeepSpeed 加速,并包括梯度检查点优化,适用于内存需求较大的模型。
-* **代码简洁**:我们避免了大块复杂的代码。通用模块实现于 `diffsynth/trainers/text_to_image.py` 中,而模型特定的训练脚本仅包含与模型架构相关的最少代码,便于学术研究人员使用。
-* **模块化设计**:基于通用的 Pytorch-Lightning 框架,我们的训练框架在功能上是解耦的,允许开发者通过修改我们的脚本轻松引入额外的训练技术,以满足他们的需求。
-
-LoRA 微调的图像示例。提示词为 "一只小狗蹦蹦跳跳,周围是姹紫嫣红的鲜花,远处是山脉"(针对中文模型)或 "a dog is jumping, flowers around the dog, the background is mountains and clouds"(针对英文模型)。
-
-||FLUX.1-dev|Kolors|Stable Diffusion 3|Hunyuan-DiT|
-|-|-|-|-|-|
-|Without LoRA|||||
-|With LoRA|||||
-
-## 安装额外包
-
-```
-pip install peft lightning
-```
-
-## 准备数据集
-
-我们提供了一个[示例数据集](https://modelscope.cn/datasets/buptwq/lora-stable-diffusion-finetune/files)。你需要将训练数据集按照如下形式组织:
-
-```
-data/dog/
-└── train
- ├── 00.jpg
- ├── 01.jpg
- ├── 02.jpg
- ├── 03.jpg
- ├── 04.jpg
- └── metadata.csv
-```
-
-`metadata.csv`:
-
-```
-file_name,text
-00.jpg,a dog
-01.jpg,a dog
-02.jpg,a dog
-03.jpg,a dog
-04.jpg,a dog
-```
-
-请注意,如果模型是中文模型(例如,Hunyuan-DiT 和 Kolors),我们建议在数据集中使用中文文本。例如:
-
-```
-file_name,text
-00.jpg,一只小狗
-01.jpg,一只小狗
-02.jpg,一只小狗
-03.jpg,一只小狗
-04.jpg,一只小狗
-```
-
-## 训练 LoRA 模型
-
-通用参数选项:
-
-```
- --lora_target_modules LORA_TARGET_MODULES
- LoRA 模块所在的层。
- --dataset_path DATASET_PATH
- 数据集的路径。
- --output_path OUTPUT_PATH
- 模型保存路径。
- --steps_per_epoch STEPS_PER_EPOCH
- 每个周期的步数。
- --height HEIGHT 图像高度。
- --width WIDTH 图像宽度。
- --center_crop 是否将输入图像中心裁剪到指定分辨率。如果未设置,图像将被随机裁剪。图像会在裁剪前先调整到指定分辨率。
- --random_flip 是否随机水平翻转图像。
- --batch_size BATCH_SIZE
- 训练数据加载器的批量大小(每设备)。
- --dataloader_num_workers DATALOADER_NUM_WORKERS
- 数据加载使用的子进程数量。0 表示数据将在主进程中加载。
- --precision {32,16,16-mixed}
- 训练精度。
- --learning_rate LEARNING_RATE
- 学习率。
- --lora_rank LORA_RANK
- LoRA 更新矩阵的维度。
- --lora_alpha LORA_ALPHA
- LoRA 更新矩阵的权重。
- --use_gradient_checkpointing
- 是否使用梯度检查点。
- --accumulate_grad_batches ACCUMULATE_GRAD_BATCHES
- 梯度累积的批次数量。
- --training_strategy {auto,deepspeed_stage_1,deepspeed_stage_2,deepspeed_stage_3}
- 训练策略。
- --max_epochs MAX_EPOCHS
- 训练轮数。
- --modelscope_model_id MODELSCOPE_MODEL_ID
- ModelScope 上的模型 ID (https://www.modelscope.cn/)。如果提供模型 ID,模型将自动上传到 ModelScope。
-```
+# 训练框架
+
+我们实现了一个用于文本到图像扩散模型的训练框架,使用户能够轻松地使用我们的框架训练 LoRA 模型。我们提供的脚本具有以下特点:
+
+* **功能全面**:我们的训练框架支持多GPU和多机器配置,便于使用 DeepSpeed 加速,并包括梯度检查点优化,适用于内存需求较大的模型。
+* **代码简洁**:我们避免了大块复杂的代码。通用模块实现于 `diffsynth/trainers/text_to_image.py` 中,而模型特定的训练脚本仅包含与模型架构相关的最少代码,便于学术研究人员使用。
+* **模块化设计**:基于通用的 Pytorch-Lightning 框架,我们的训练框架在功能上是解耦的,允许开发者通过修改我们的脚本轻松引入额外的训练技术,以满足他们的需求。
+
+LoRA 微调的图像示例。提示词为 "一只小狗蹦蹦跳跳,周围是姹紫嫣红的鲜花,远处是山脉"(针对中文模型)或 "a dog is jumping, flowers around the dog, the background is mountains and clouds"(针对英文模型)。
+
+||FLUX.1-dev|Kolors|Stable Diffusion 3|Hunyuan-DiT|
+|-|-|-|-|-|
+|Without LoRA|||||
+|With LoRA|||||
+
+## 安装额外包
+
+```
+pip install peft lightning
+```
+
+## 准备数据集
+
+我们提供了一个[示例数据集](https://modelscope.cn/datasets/buptwq/lora-stable-diffusion-finetune/files)。你需要将训练数据集按照如下形式组织:
+
+```
+data/dog/
+└── train
+ ├── 00.jpg
+ ├── 01.jpg
+ ├── 02.jpg
+ ├── 03.jpg
+ ├── 04.jpg
+ └── metadata.csv
+```
+
+`metadata.csv`:
+
+```
+file_name,text
+00.jpg,a dog
+01.jpg,a dog
+02.jpg,a dog
+03.jpg,a dog
+04.jpg,a dog
+```
+
+请注意,如果模型是中文模型(例如,Hunyuan-DiT 和 Kolors),我们建议在数据集中使用中文文本。例如:
+
+```
+file_name,text
+00.jpg,一只小狗
+01.jpg,一只小狗
+02.jpg,一只小狗
+03.jpg,一只小狗
+04.jpg,一只小狗
+```
+
+## 训练 LoRA 模型
+
+通用参数选项:
+
+```
+ --lora_target_modules LORA_TARGET_MODULES
+ LoRA 模块所在的层。
+ --dataset_path DATASET_PATH
+ 数据集的路径。
+ --output_path OUTPUT_PATH
+ 模型保存路径。
+ --steps_per_epoch STEPS_PER_EPOCH
+ 每个周期的步数。
+ --height HEIGHT 图像高度。
+ --width WIDTH 图像宽度。
+ --center_crop 是否将输入图像中心裁剪到指定分辨率。如果未设置,图像将被随机裁剪。图像会在裁剪前先调整到指定分辨率。
+ --random_flip 是否随机水平翻转图像。
+ --batch_size BATCH_SIZE
+ 训练数据加载器的批量大小(每设备)。
+ --dataloader_num_workers DATALOADER_NUM_WORKERS
+ 数据加载使用的子进程数量。0 表示数据将在主进程中加载。
+ --precision {32,16,16-mixed}
+ 训练精度。
+ --learning_rate LEARNING_RATE
+ 学习率。
+ --lora_rank LORA_RANK
+ LoRA 更新矩阵的维度。
+ --lora_alpha LORA_ALPHA
+ LoRA 更新矩阵的权重。
+ --use_gradient_checkpointing
+ 是否使用梯度检查点。
+ --accumulate_grad_batches ACCUMULATE_GRAD_BATCHES
+ 梯度累积的批次数量。
+ --training_strategy {auto,deepspeed_stage_1,deepspeed_stage_2,deepspeed_stage_3}
+ 训练策略。
+ --max_epochs MAX_EPOCHS
+ 训练轮数。
+ --modelscope_model_id MODELSCOPE_MODEL_ID
+ ModelScope 上的模型 ID (https://www.modelscope.cn/)。如果提供模型 ID,模型将自动上传到 ModelScope。
+```
diff --git a/docs/source/finetune/train_flux_lora.md b/docs/source/finetune/train_flux_lora.md
index 89ae5cf..9410a66 100644
--- a/docs/source/finetune/train_flux_lora.md
+++ b/docs/source/finetune/train_flux_lora.md
@@ -1,71 +1,71 @@
-# 训练 FLUX LoRA
-
-以下文件将会被用于构建 FLUX 模型。 你可以从[huggingface](https://huggingface.co/black-forest-labs/FLUX.1-dev)或[modelscope](https://www.modelscope.cn/models/ai-modelscope/flux.1-dev)下载,也可以使用以下代码下载这些文件:
-
-```python
-from diffsynth import download_models
-
-download_models(["FLUX.1-dev"])
-```
-
-```
-models/FLUX/
-└── FLUX.1-dev
- ├── ae.safetensors
- ├── flux1-dev.safetensors
- ├── text_encoder
- │ └── model.safetensors
- └── text_encoder_2
- ├── config.json
- ├── model-00001-of-00002.safetensors
- ├── model-00002-of-00002.safetensors
- └── model.safetensors.index.json
-```
-
-使用以下命令启动训练任务:
-
-```
-CUDA_VISIBLE_DEVICES="0" python examples/train/flux/train_flux_lora.py \
- --pretrained_text_encoder_path models/FLUX/FLUX.1-dev/text_encoder/model.safetensors \
- --pretrained_text_encoder_2_path models/FLUX/FLUX.1-dev/text_encoder_2 \
- --pretrained_dit_path models/FLUX/FLUX.1-dev/flux1-dev.safetensors \
- --pretrained_vae_path models/FLUX/FLUX.1-dev/ae.safetensors \
- --dataset_path data/dog \
- --output_path ./models \
- --max_epochs 1 \
- --steps_per_epoch 500 \
- --height 1024 \
- --width 1024 \
- --center_crop \
- --precision "bf16" \
- --learning_rate 1e-4 \
- --lora_rank 4 \
- --lora_alpha 4 \
- --use_gradient_checkpointing
-```
-
-有关参数的更多信息,请使用 `python examples/train/flux/train_flux_lora.py -h` 查看详细信息。
-
-训练完成后,使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
-
-```python
-from diffsynth import ModelManager, FluxImagePipeline
-import torch
-
-model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
- file_path_list=[
- "models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
- "models/FLUX/FLUX.1-dev/text_encoder_2",
- "models/FLUX/FLUX.1-dev/ae.safetensors",
- "models/FLUX/FLUX.1-dev/flux1-dev.safetensors"
- ])
-model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
-pipe = SDXLImagePipeline.from_model_manager(model_manager)
-
-torch.manual_seed(0)
-image = pipe(
- prompt=prompt,
- num_inference_steps=30, embedded_guidance=3.5
-)
-image.save("image_with_lora.jpg")
-```
+# 训练 FLUX LoRA
+
+以下文件将会被用于构建 FLUX 模型。 你可以从[huggingface](https://huggingface.co/black-forest-labs/FLUX.1-dev)或[modelscope](https://www.modelscope.cn/models/ai-modelscope/flux.1-dev)下载,也可以使用以下代码下载这些文件:
+
+```python
+from diffsynth import download_models
+
+download_models(["FLUX.1-dev"])
+```
+
+```
+models/FLUX/
+└── FLUX.1-dev
+ ├── ae.safetensors
+ ├── flux1-dev.safetensors
+ ├── text_encoder
+ │ └── model.safetensors
+ └── text_encoder_2
+ ├── config.json
+ ├── model-00001-of-00002.safetensors
+ ├── model-00002-of-00002.safetensors
+ └── model.safetensors.index.json
+```
+
+使用以下命令启动训练任务:
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/flux/train_flux_lora.py \
+ --pretrained_text_encoder_path models/FLUX/FLUX.1-dev/text_encoder/model.safetensors \
+ --pretrained_text_encoder_2_path models/FLUX/FLUX.1-dev/text_encoder_2 \
+ --pretrained_dit_path models/FLUX/FLUX.1-dev/flux1-dev.safetensors \
+ --pretrained_vae_path models/FLUX/FLUX.1-dev/ae.safetensors \
+ --dataset_path data/dog \
+ --output_path ./models \
+ --max_epochs 1 \
+ --steps_per_epoch 500 \
+ --height 1024 \
+ --width 1024 \
+ --center_crop \
+ --precision "bf16" \
+ --learning_rate 1e-4 \
+ --lora_rank 4 \
+ --lora_alpha 4 \
+ --use_gradient_checkpointing
+```
+
+有关参数的更多信息,请使用 `python examples/train/flux/train_flux_lora.py -h` 查看详细信息。
+
+训练完成后,使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
+
+```python
+from diffsynth import ModelManager, FluxImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+ file_path_list=[
+ "models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
+ "models/FLUX/FLUX.1-dev/ae.safetensors",
+ "models/FLUX/FLUX.1-dev/flux1-dev.safetensors"
+ ])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = FluxImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+ prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
+ num_inference_steps=30, embedded_guidance=3.5
+)
+image.save("image_with_lora.jpg")
+```
diff --git a/docs/source/finetune/train_hunyuan_dit_lora.md b/docs/source/finetune/train_hunyuan_dit_lora.md
index cbd050c..4b29657 100644
--- a/docs/source/finetune/train_hunyuan_dit_lora.md
+++ b/docs/source/finetune/train_hunyuan_dit_lora.md
@@ -1,72 +1,72 @@
-# 训练 Hunyuan-DiT LoRA
-
-构建 Hunyuan DiT 需要四个文件。你可以从 [HuggingFace](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT) 或 [ModelScope](https://www.modelscope.cn/models/modelscope/HunyuanDiT/summary) 下载这些文件。你可以使用以下代码下载这些文件:
-
-
-```python
-from diffsynth import download_models
-
-download_models(["HunyuanDiT"])
-```
-
-```
-models/HunyuanDiT/
-├── Put Hunyuan DiT checkpoints here.txt
-└── t2i
- ├── clip_text_encoder
- │ └── pytorch_model.bin
- ├── model
- │ └── pytorch_model_ema.pt
- ├── mt5
- │ └── pytorch_model.bin
- └── sdxl-vae-fp16-fix
- └── diffusion_pytorch_model.bin
-```
-
-使用以下命令启动训练任务:
-
-```
-CUDA_VISIBLE_DEVICES="0" python examples/train/hunyuan_dit/train_hunyuan_dit_lora.py \
- --pretrained_path models/HunyuanDiT/t2i \
- --dataset_path data/dog \
- --output_path ./models \
- --max_epochs 1 \
- --steps_per_epoch 500 \
- --height 1024 \
- --width 1024 \
- --center_crop \
- --precision "16-mixed" \
- --learning_rate 1e-4 \
- --lora_rank 4 \
- --lora_alpha 4 \
- --use_gradient_checkpointing
-```
-
-有关参数的更多信息,请使用 `python examples/train/hunyuan_dit/train_hunyuan_dit_lora.py -h` 查看详细信息。
-
-训练完成后,使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
-
-
-```python
-from diffsynth import ModelManager, HunyuanDiTImagePipeline
-import torch
-
-model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
- file_path_list=[
- "models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin",
- "models/HunyuanDiT/t2i/model/pytorch_model_ema.pt",
- "models/HunyuanDiT/t2i/mt5/pytorch_model.bin",
- "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin"
- ])
-model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
-pipe = HunyuanDiTImagePipeline.from_model_manager(model_manager)
-
-torch.manual_seed(0)
-image = pipe(
- prompt="一只小狗蹦蹦跳跳,周围是姹紫嫣红的鲜花,远处是山脉",
- negative_prompt="",
- cfg_scale=7.5,
- num_inference_steps=100, width=1024, height=1024,
-)
-image.save("image_with_lora.jpg")
-```
+# 训练 Hunyuan-DiT LoRA
+
+构建 Hunyuan DiT 需要四个文件。你可以从 [HuggingFace](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT) 或 [ModelScope](https://www.modelscope.cn/models/modelscope/HunyuanDiT/summary) 下载这些文件。也可以使用以下代码下载这些文件:
+
+
+```python
+from diffsynth import download_models
+
+download_models(["HunyuanDiT"])
+```
+
+```
+models/HunyuanDiT/
+├── Put Hunyuan DiT checkpoints here.txt
+└── t2i
+ ├── clip_text_encoder
+ │ └── pytorch_model.bin
+ ├── model
+ │ └── pytorch_model_ema.pt
+ ├── mt5
+ │ └── pytorch_model.bin
+ └── sdxl-vae-fp16-fix
+ └── diffusion_pytorch_model.bin
+```
+
+使用以下命令启动训练任务:
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/hunyuan_dit/train_hunyuan_dit_lora.py \
+ --pretrained_path models/HunyuanDiT/t2i \
+ --dataset_path data/dog \
+ --output_path ./models \
+ --max_epochs 1 \
+ --steps_per_epoch 500 \
+ --height 1024 \
+ --width 1024 \
+ --center_crop \
+ --precision "16-mixed" \
+ --learning_rate 1e-4 \
+ --lora_rank 4 \
+ --lora_alpha 4 \
+ --use_gradient_checkpointing
+```
+
+有关参数的更多信息,请使用 `python examples/train/hunyuan_dit/train_hunyuan_dit_lora.py -h` 查看详细信息。
+
+训练完成后,使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
+
+
+```python
+from diffsynth import ModelManager, HunyuanDiTImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+ file_path_list=[
+ "models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin",
+ "models/HunyuanDiT/t2i/model/pytorch_model_ema.pt",
+ "models/HunyuanDiT/t2i/mt5/pytorch_model.bin",
+ "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin"
+ ])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = HunyuanDiTImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+ prompt="一只小狗蹦蹦跳跳,周围是姹紫嫣红的鲜花,远处是山脉",
+ negative_prompt="",
+ cfg_scale=7.5,
+ num_inference_steps=100, width=1024, height=1024,
+)
+image.save("image_with_lora.jpg")
+```
diff --git a/docs/source/finetune/train_kolors_lora.md b/docs/source/finetune/train_kolors_lora.md
index d7bab00..dae9d5c 100644
--- a/docs/source/finetune/train_kolors_lora.md
+++ b/docs/source/finetune/train_kolors_lora.md
@@ -1,78 +1,78 @@
-# 训练 Kolors LoRA
-
-以下文件将用于构建 Kolors。你可以从 [HuggingFace](https://huggingface.co/Kwai-Kolors/Kolors) 或 [ModelScope](https://modelscope.cn/models/Kwai-Kolors/Kolors) 下载 Kolors。由于精度溢出问题,我们需要下载额外的 VAE 模型(从 [HuggingFace](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) 或 [ModelScope](https://modelscope.cn/models/AI-ModelScope/sdxl-vae-fp16-fix))。你可以使用以下代码下载这些文件:
-
-
-```python
-from diffsynth import download_models
-
-download_models(["Kolors", "SDXL-vae-fp16-fix"])
-```
-
-```
-models
-├── kolors
-│ └── Kolors
-│ ├── text_encoder
-│ │ ├── config.json
-│ │ ├── pytorch_model-00001-of-00007.bin
-│ │ ├── pytorch_model-00002-of-00007.bin
-│ │ ├── pytorch_model-00003-of-00007.bin
-│ │ ├── pytorch_model-00004-of-00007.bin
-│ │ ├── pytorch_model-00005-of-00007.bin
-│ │ ├── pytorch_model-00006-of-00007.bin
-│ │ ├── pytorch_model-00007-of-00007.bin
-│ │ └── pytorch_model.bin.index.json
-│ ├── unet
-│ │ └── diffusion_pytorch_model.safetensors
-│ └── vae
-│ └── diffusion_pytorch_model.safetensors
-└── sdxl-vae-fp16-fix
- └── diffusion_pytorch_model.safetensors
-```
-
-使用下面的命令启动训练任务:
-
-```
-CUDA_VISIBLE_DEVICES="0" python examples/train/kolors/train_kolors_lora.py \
- --pretrained_unet_path models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors \
- --pretrained_text_encoder_path models/kolors/Kolors/text_encoder \
- --pretrained_fp16_vae_path models/sdxl-vae-fp16-fix/diffusion_pytorch_model.safetensors \
- --dataset_path data/dog \
- --output_path ./models \
- --max_epochs 1 \
- --steps_per_epoch 500 \
- --height 1024 \
- --width 1024 \
- --center_crop \
- --precision "16-mixed" \
- --learning_rate 1e-4 \
- --lora_rank 4 \
- --lora_alpha 4 \
- --use_gradient_checkpointing
-```
-
-有关参数的更多信息,请使用 `python examples/train/kolors/train_kolors_lora.py -h` 查看详细信息。
-
-训练完成后,使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
-
-
-
-```python
-from diffsynth import ModelManager, SD3ImagePipeline
-import torch
-
-model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
- file_path_list=["models/stable_diffusion_3/sd3_medium_incl_clips.safetensors"])
-model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
-pipe = SD3ImagePipeline.from_model_manager(model_manager)
-
-torch.manual_seed(0)
-image = pipe(
- prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
- negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
- cfg_scale=7.5,
- num_inference_steps=100, width=1024, height=1024,
-)
-image.save("image_with_lora.jpg")
-```
+# 训练 Kolors LoRA
+
+以下文件将用于构建 Kolors。你可以从 [HuggingFace](https://huggingface.co/Kwai-Kolors/Kolors) 或 [ModelScope](https://modelscope.cn/models/Kwai-Kolors/Kolors) 下载 Kolors。由于精度溢出问题,我们需要下载额外的 VAE 模型(从 [HuggingFace](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) 或 [ModelScope](https://modelscope.cn/models/AI-ModelScope/sdxl-vae-fp16-fix))。你可以使用以下代码下载这些文件:
+
+
+```python
+from diffsynth import download_models
+
+download_models(["Kolors", "SDXL-vae-fp16-fix"])
+```
+
+```
+models
+├── kolors
+│ └── Kolors
+│ ├── text_encoder
+│ │ ├── config.json
+│ │ ├── pytorch_model-00001-of-00007.bin
+│ │ ├── pytorch_model-00002-of-00007.bin
+│ │ ├── pytorch_model-00003-of-00007.bin
+│ │ ├── pytorch_model-00004-of-00007.bin
+│ │ ├── pytorch_model-00005-of-00007.bin
+│ │ ├── pytorch_model-00006-of-00007.bin
+│ │ ├── pytorch_model-00007-of-00007.bin
+│ │ └── pytorch_model.bin.index.json
+│ ├── unet
+│ │ └── diffusion_pytorch_model.safetensors
+│ └── vae
+│ └── diffusion_pytorch_model.safetensors
+└── sdxl-vae-fp16-fix
+ └── diffusion_pytorch_model.safetensors
+```
+
+使用下面的命令启动训练任务:
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/kolors/train_kolors_lora.py \
+ --pretrained_unet_path models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors \
+ --pretrained_text_encoder_path models/kolors/Kolors/text_encoder \
+ --pretrained_fp16_vae_path models/sdxl-vae-fp16-fix/diffusion_pytorch_model.safetensors \
+ --dataset_path data/dog \
+ --output_path ./models \
+ --max_epochs 1 \
+ --steps_per_epoch 500 \
+ --height 1024 \
+ --width 1024 \
+ --center_crop \
+ --precision "16-mixed" \
+ --learning_rate 1e-4 \
+ --lora_rank 4 \
+ --lora_alpha 4 \
+ --use_gradient_checkpointing
+```
+
+有关参数的更多信息,请使用 `python examples/train/kolors/train_kolors_lora.py -h` 查看详细信息。
+
+训练完成后,使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
+
+
+
+```python
+from diffsynth import ModelManager, SDXLImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+ file_path_list=["models/kolors/Kolors/text_encoder", "models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors", "models/kolors/Kolors/vae/diffusion_pytorch_model.safetensors"])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = SDXLImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+ prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
+ negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
+ cfg_scale=7.5,
+ num_inference_steps=100, width=1024, height=1024,
+)
+image.save("image_with_lora.jpg")
+```
diff --git a/docs/source/finetune/train_sd3_lora.md b/docs/source/finetune/train_sd3_lora.md
index bb6f383..e370175 100644
--- a/docs/source/finetune/train_sd3_lora.md
+++ b/docs/source/finetune/train_sd3_lora.md
@@ -1,59 +1,59 @@
-# 训练 Stable Diffusion 3 LoRA
-
-训练脚本只需要一个文件。你可以使用 [`sd3_medium_incl_clips.safetensors`](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips.safetensors)(没有 T5 Encoder)或 [`sd3_medium_incl_clips_t5xxlfp16.safetensors`](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips_t5xxlfp16.safetensors)(有 T5 Encoder)。请使用以下代码下载这些文件:
-
-
-```python
-from diffsynth import download_models
-
-download_models(["StableDiffusion3", "StableDiffusion3_without_T5"])
-```
-
-```
-models/stable_diffusion_3/
-├── Put Stable Diffusion 3 checkpoints here.txt
-├── sd3_medium_incl_clips.safetensors
-└── sd3_medium_incl_clips_t5xxlfp16.safetensors
-```
-
-使用下面的命令启动训练任务:
-
-```
-CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_3/train_sd3_lora.py \
- --pretrained_path models/stable_diffusion_3/sd3_medium_incl_clips.safetensors \
- --dataset_path data/dog \
- --output_path ./models \
- --max_epochs 1 \
- --steps_per_epoch 500 \
- --height 1024 \
- --width 1024 \
- --center_crop \
- --precision "16-mixed" \
- --learning_rate 1e-4 \
- --lora_rank 4 \
- --lora_alpha 4 \
- --use_gradient_checkpointing
-```
-
-有关参数的更多信息,请使用 `python examples/train/stable_diffusion_3/train_sd3_lora.py -h` 查看详细信息。
-
-训练完成后,使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
-
-```python
-from diffsynth import ModelManager, SD3ImagePipeline
-import torch
-
-model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
- file_path_list=["models/stable_diffusion_3/sd3_medium_incl_clips.safetensors"])
-model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
-pipe = SD3ImagePipeline.from_model_manager(model_manager)
-
-torch.manual_seed(0)
-image = pipe(
- prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
- negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
- cfg_scale=7.5,
- num_inference_steps=100, width=1024, height=1024,
-)
-image.save("image_with_lora.jpg")
-```
+# 训练 Stable Diffusion 3 LoRA
+
+训练脚本只需要一个文件。你可以使用 [`sd3_medium_incl_clips.safetensors`](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips.safetensors)(没有 T5 Encoder)或 [`sd3_medium_incl_clips_t5xxlfp16.safetensors`](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips_t5xxlfp16.safetensors)(有 T5 Encoder)。请使用以下代码下载这些文件:
+
+
+```python
+from diffsynth import download_models
+
+download_models(["StableDiffusion3", "StableDiffusion3_without_T5"])
+```
+
+```
+models/stable_diffusion_3/
+├── Put Stable Diffusion 3 checkpoints here.txt
+├── sd3_medium_incl_clips.safetensors
+└── sd3_medium_incl_clips_t5xxlfp16.safetensors
+```
+
+使用下面的命令启动训练任务:
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_3/train_sd3_lora.py \
+ --pretrained_path models/stable_diffusion_3/sd3_medium_incl_clips.safetensors \
+ --dataset_path data/dog \
+ --output_path ./models \
+ --max_epochs 1 \
+ --steps_per_epoch 500 \
+ --height 1024 \
+ --width 1024 \
+ --center_crop \
+ --precision "16-mixed" \
+ --learning_rate 1e-4 \
+ --lora_rank 4 \
+ --lora_alpha 4 \
+ --use_gradient_checkpointing
+```
+
+有关参数的更多信息,请使用 `python examples/train/stable_diffusion_3/train_sd3_lora.py -h` 查看详细信息。
+
+训练完成后,使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
+
+```python
+from diffsynth import ModelManager, SD3ImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+ file_path_list=["models/stable_diffusion_3/sd3_medium_incl_clips.safetensors"])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = SD3ImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+ prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
+ negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
+ cfg_scale=7.5,
+ num_inference_steps=100, width=1024, height=1024,
+)
+image.save("image_with_lora.jpg")
+```
diff --git a/docs/source/finetune/train_sd_lora.md b/docs/source/finetune/train_sd_lora.md
index dc792c7..e3d1abb 100644
--- a/docs/source/finetune/train_sd_lora.md
+++ b/docs/source/finetune/train_sd_lora.md
@@ -1,59 +1,59 @@
-# 训练 Stable Diffusion LoRA
-
-训练脚本只需要一个文件。我们支持 [CivitAI](https://civitai.com/) 中的主流检查点。默认情况下,我们使用基础的 Stable Diffusion v1.5。你可以从 [HuggingFace](https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors) 或 [ModelScope](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-v1-5/resolve/master/v1-5-pruned-emaonly.safetensors) 下载。你可以使用以下代码下载这个文件:
-
-```python
-from diffsynth import download_models
-
-download_models(["StableDiffusion_v15"])
-```
-
-```
-models/stable_diffusion
-├── Put Stable Diffusion checkpoints here.txt
-└── v1-5-pruned-emaonly.safetensors
-```
-
-使用以下命令启动训练任务:
-
-```
-CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion/train_sd_lora.py \
- --pretrained_path models/stable_diffusion/v1-5-pruned-emaonly.safetensors \
- --dataset_path data/dog \
- --output_path ./models \
- --max_epochs 1 \
- --steps_per_epoch 500 \
- --height 512 \
- --width 512 \
- --center_crop \
- --precision "16-mixed" \
- --learning_rate 1e-4 \
- --lora_rank 4 \
- --lora_alpha 4 \
- --use_gradient_checkpointing
-```
-
-有关参数的更多信息,请使用 `python examples/train/stable_diffusion/train_sd_lora.py -h` 查看详细信息。
-
-训练完成后,使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
-
-
-
-```python
-from diffsynth import ModelManager, SDImagePipeline
-import torch
-
-model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
- file_path_list=["models/stable_diffusion/v1-5-pruned-emaonly.safetensors"])
-model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
-pipe = SDImagePipeline.from_model_manager(model_manager)
-
-torch.manual_seed(0)
-image = pipe(
- prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
- negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
- cfg_scale=7.5,
- num_inference_steps=100, width=512, height=512,
-)
-image.save("image_with_lora.jpg")
-```
+# 训练 Stable Diffusion LoRA
+
+训练脚本只需要一个文件。我们支持 [CivitAI](https://civitai.com/) 中的主流检查点。默认情况下,我们使用基础的 Stable Diffusion v1.5。你可以从 [HuggingFace](https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors) 或 [ModelScope](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-v1-5/resolve/master/v1-5-pruned-emaonly.safetensors) 下载。也可以使用以下代码下载这个文件:
+
+```python
+from diffsynth import download_models
+
+download_models(["StableDiffusion_v15"])
+```
+
+```
+models/stable_diffusion
+├── Put Stable Diffusion checkpoints here.txt
+└── v1-5-pruned-emaonly.safetensors
+```
+
+使用以下命令启动训练任务:
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion/train_sd_lora.py \
+ --pretrained_path models/stable_diffusion/v1-5-pruned-emaonly.safetensors \
+ --dataset_path data/dog \
+ --output_path ./models \
+ --max_epochs 1 \
+ --steps_per_epoch 500 \
+ --height 512 \
+ --width 512 \
+ --center_crop \
+ --precision "16-mixed" \
+ --learning_rate 1e-4 \
+ --lora_rank 4 \
+ --lora_alpha 4 \
+ --use_gradient_checkpointing
+```
+
+有关参数的更多信息,请使用 `python examples/train/stable_diffusion/train_sd_lora.py -h` 查看详细信息。
+
+训练完成后,使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
+
+
+
+```python
+from diffsynth import ModelManager, SDImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+ file_path_list=["models/stable_diffusion/v1-5-pruned-emaonly.safetensors"])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = SDImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+ prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
+ negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
+ cfg_scale=7.5,
+ num_inference_steps=100, width=512, height=512,
+)
+image.save("image_with_lora.jpg")
+```
diff --git a/docs/source/finetune/train_sdxl_lora.md b/docs/source/finetune/train_sdxl_lora.md
index e51f092..0b0b746 100644
--- a/docs/source/finetune/train_sdxl_lora.md
+++ b/docs/source/finetune/train_sdxl_lora.md
@@ -1,57 +1,57 @@
-# 训练 Stable Diffusion XL LoRA
-
-训练脚本只需要一个文件。我们支持 [CivitAI](https://civitai.com/) 中的主流检查点。默认情况下,我们使用基础的 Stable Diffusion XL。你可以从 [HuggingFace](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors) 或 [ModelScope](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-xl-base-1.0/resolve/master/sd_xl_base_1.0.safetensors) 下载。也可以使用以下代码下载这个文件:
-
-```python
-from diffsynth import download_models
-
-download_models(["StableDiffusionXL_v1"])
-```
-
-```
-models/stable_diffusion_xl
-├── Put Stable Diffusion XL checkpoints here.txt
-└── sd_xl_base_1.0.safetensors
-```
-
-我们观察到 Stable Diffusion XL 在 float16 精度下会出现数值精度溢出,因此我们建议用户使用 float32 精度训练,使用以下命令启动训练任务:
-
-```
-CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_xl/train_sdxl_lora.py \
- --pretrained_path models/stable_diffusion_xl/sd_xl_base_1.0.safetensors \
- --dataset_path data/dog \
- --output_path ./models \
- --max_epochs 1 \
- --steps_per_epoch 500 \
- --height 1024 \
- --width 1024 \
- --center_crop \
- --precision "32" \
- --learning_rate 1e-4 \
- --lora_rank 4 \
- --lora_alpha 4 \
- --use_gradient_checkpointing
-```
-
-有关参数的更多信息,请使用 `python examples/train/stable_diffusion_xl/train_sdxl_lora.py -h` 查看详细信息。
-
-训练完成后,使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
-
-```python
-from diffsynth import ModelManager, SDXLImagePipeline
-import torch
-
-model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
- file_path_list=["models/stable_diffusion_xl/sd_xl_base_1.0.safetensors"])
-model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
-pipe = SDXLImagePipeline.from_model_manager(model_manager)
-
-torch.manual_seed(0)
-image = pipe(
- prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
- negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
- cfg_scale=7.5,
- num_inference_steps=100, width=1024, height=1024,
-)
-image.save("image_with_lora.jpg")
-```
+# 训练 Stable Diffusion XL LoRA
+
+训练脚本只需要一个文件。我们支持 [CivitAI](https://civitai.com/) 中的主流检查点。默认情况下,我们使用基础的 Stable Diffusion XL。你可以从 [HuggingFace](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors) 或 [ModelScope](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-xl-base-1.0/resolve/master/sd_xl_base_1.0.safetensors) 下载。也可以使用以下代码下载这个文件:
+
+```python
+from diffsynth import download_models
+
+download_models(["StableDiffusionXL_v1"])
+```
+
+```
+models/stable_diffusion_xl
+├── Put Stable Diffusion XL checkpoints here.txt
+└── sd_xl_base_1.0.safetensors
+```
+
+我们观察到 Stable Diffusion XL 在 float16 精度下会出现数值精度溢出,因此我们建议用户使用 float32 精度训练,使用以下命令启动训练任务:
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_xl/train_sdxl_lora.py \
+ --pretrained_path models/stable_diffusion_xl/sd_xl_base_1.0.safetensors \
+ --dataset_path data/dog \
+ --output_path ./models \
+ --max_epochs 1 \
+ --steps_per_epoch 500 \
+ --height 1024 \
+ --width 1024 \
+ --center_crop \
+ --precision "32" \
+ --learning_rate 1e-4 \
+ --lora_rank 4 \
+ --lora_alpha 4 \
+ --use_gradient_checkpointing
+```
+
+有关参数的更多信息,请使用 `python examples/train/stable_diffusion_xl/train_sdxl_lora.py -h` 查看详细信息。
+
+训练完成后,使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
+
+```python
+from diffsynth import ModelManager, SDXLImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+ file_path_list=["models/stable_diffusion_xl/sd_xl_base_1.0.safetensors"])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = SDXLImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+ prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
+ negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
+ cfg_scale=7.5,
+ num_inference_steps=100, width=1024, height=1024,
+)
+image.save("image_with_lora.jpg")
+```
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 51f2f80..fe5a33a 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,44 +1,44 @@
-.. DiffSynth-Studio documentation master file, created by
- sphinx-quickstart on Thu Sep 5 16:39:24 2024.
- You can adapt this file completely to your liking, but it should at least
- contain the root `toctree` directive.
-
-DiffSynth-Studio 文档
-==============================
-
-欢迎来到 DiffSynth-Studio,我们旨在构建 Diffusion 模型的开源互联生态,在这里,你可以体验到 AIGC(AI Generated Content)技术魔法般的魅力!
-
-.. toctree::
- :maxdepth: 1
- :caption: 快速开始
-
- tutorial/ASimpleExample.md
- tutorial/Installation.md
- tutorial/DownloadModels.md
- tutorial/Models.md
- tutorial/Pipelines.md
- tutorial/PromptProcessing.md
- tutorial/Extensions.md
- tutorial/Schedulers.md
-
-.. toctree::
- :maxdepth: 1
- :caption: 开启创作之旅
-
- creating/BasicImageSynthesis.md
- creating/AdaptersForImageSynthesis.md
- creating/ToonShading.md
- creating/PromptRefine.md
-
-.. toctree::
- :maxdepth: 1
- :caption: 微调
-
- finetune/overview.md
- finetune/train_flux_lora.md
- finetune/train_kolors_lora.md
- finetune/train_sd3_lora.md
- finetune/train_hunyuan_dit_lora.md
- finetune/train_sdxl_lora.md
- finetune/train_sd_lora.md
-
+.. DiffSynth-Studio documentation master file, created by
+ sphinx-quickstart on Thu Sep 5 16:39:24 2024.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+DiffSynth-Studio 文档
+==============================
+
+欢迎来到 DiffSynth-Studio,我们旨在构建 Diffusion 模型的开源互联生态,在这里,你可以体验到 AIGC(AI Generated Content)技术魔法般的魅力!
+
+.. toctree::
+ :maxdepth: 1
+ :caption: 快速开始
+
+ tutorial/ASimpleExample.md
+ tutorial/Installation.md
+ tutorial/DownloadModels.md
+ tutorial/Models.md
+ tutorial/Pipelines.md
+ tutorial/PromptProcessing.md
+ tutorial/Extensions.md
+ tutorial/Schedulers.md
+
+.. toctree::
+ :maxdepth: 1
+ :caption: 开启创作之旅
+
+ creating/BasicImageSynthesis.md
+ creating/AdaptersForImageSynthesis.md
+ creating/ToonShading.md
+ creating/PromptRefine.md
+
+.. toctree::
+ :maxdepth: 1
+ :caption: 微调
+
+ finetune/overview.md
+ finetune/train_flux_lora.md
+ finetune/train_kolors_lora.md
+ finetune/train_sd3_lora.md
+ finetune/train_hunyuan_dit_lora.md
+ finetune/train_sdxl_lora.md
+ finetune/train_sd_lora.md
+
diff --git a/docs/source/requirement.txt b/docs/source/requirement.txt
index fa5b901..6f7f63b 100644
--- a/docs/source/requirement.txt
+++ b/docs/source/requirement.txt
@@ -1,4 +1,4 @@
-recommonmark
-sphinx_rtd_theme
-myst-parser
+recommonmark
+sphinx_rtd_theme
+myst-parser
sphinx-markdown-tables
\ No newline at end of file
diff --git a/docs/source/tutorial/ASimpleExample.md b/docs/source/tutorial/ASimpleExample.md
index 7d312a4..8d80852 100644
--- a/docs/source/tutorial/ASimpleExample.md
+++ b/docs/source/tutorial/ASimpleExample.md
@@ -1,85 +1,85 @@
-# 快速开始
-
-在这篇文档中,我们通过一段代码为你介绍如何快速上手使用 DiffSynth-Studio 进行创作。
-
-## 安装
-
-使用以下命令从 GitHub 克隆并安装 DiffSynth-Studio。更多信息请参考[安装](./Installation.md)。
-
-```shell
-git clone https://github.com/modelscope/DiffSynth-Studio.git
-cd DiffSynth-Studio
-pip install -e .
-```
-
-## 一键运行!
-
-通过运行以下代码,我们将会下载模型、加载模型、生成图像。
-
-```python
-import torch
-from diffsynth import ModelManager, FluxImagePipeline
-
-model_manager = ModelManager(
- torch_dtype=torch.bfloat16,
- device="cuda",
- model_id_list=["FLUX.1-dev"]
-)
-pipe = FluxImagePipeline.from_model_manager(model_manager)
-
-torch.manual_seed(0)
-image = pipe(
- prompt="In a forest, a wooden plank sign reading DiffSynth",
- height=576, width=1024,
-)
-image.save("image.jpg")
-```
-
-
-
-从这个例子中,我们可以看到,DiffSynth 中有两个关键模块:`ModelManager` 和 `Pipeline`,接下来我们详细介绍。
-
-## 下载和加载模型
-
-`ModelManager` 负责下载和加载模型,通过以下代码可以直接一步完成。
-
-```python
-import torch
-from diffsynth import ModelManager
-
-model_manager = ModelManager(
- torch_dtype=torch.bfloat16,
- device="cuda",
- model_id_list=["FLUX.1-dev"]
-)
-```
-
-当然,我们也支持分步完成,以下代码和上述代码的行为是等价的。
-
-```python
-import torch
-from diffsynth import download_models, ModelManager
-
-download_models(["FLUX.1-dev"])
-model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cuda")
-model_manager.load_models([
- "models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
- "models/FLUX/FLUX.1-dev/text_encoder_2",
- "models/FLUX/FLUX.1-dev/ae.safetensors",
- "models/FLUX/FLUX.1-dev/flux1-dev.safetensors"
-])
-```
-
-下载模型时,我们支持从 [ModelScope](https://www.modelscope.cn/) 和 [HuggingFace](https://huggingface.co/) 下载模型,也支持下载非预置的模型,关于模型下载的更多信息请参考[模型下载](./DownloadModels.md)。
-
-加载模型时,你可以把所有想要加载的模型路径放入其中。对于 `.safetensors` 等格式的模型权重文件,`ModelManager` 在加载后会自动判断模型类型;对于文件夹格式的模型,`ModelManager` 会尝试解析其中的 `config.json` 文件并尝试调用 `transformers` 等第三方库中的对应模块。关于 DiffSynth-Studio 支持的模型,请参考[支持的模型](./Models.md)。
-
-## 构建 Pipeline
-
-DiffSynth-Studio 提供了多个推理 `Pipeline`,这些 `Pipeline` 可以直接通过 `ModelManager` 获取所需的模型并初始化。例如,FLUX.1-dev 模型的文生图 `Pipeline` 可以这样构建:
-
-```python
-pipe = FluxImagePipeline.from_model_manager(model_manager)
-```
-
-更多用于图像生成和视频生成的 `Pipeline` 详见[推理流水线](./Pipelines.md)。
+# 快速开始
+
+在这篇文档中,我们通过一段代码为你介绍如何快速上手使用 DiffSynth-Studio 进行创作。
+
+## 安装
+
+使用以下命令从 GitHub 克隆并安装 DiffSynth-Studio。更多信息请参考[安装](./Installation.md)。
+
+```shell
+git clone https://github.com/modelscope/DiffSynth-Studio.git
+cd DiffSynth-Studio
+pip install -e .
+```
+
+## 一键运行!
+
+通过运行以下代码,我们将会下载模型、加载模型、生成图像。
+
+```python
+import torch
+from diffsynth import ModelManager, FluxImagePipeline
+
+model_manager = ModelManager(
+ torch_dtype=torch.bfloat16,
+ device="cuda",
+ model_id_list=["FLUX.1-dev"]
+)
+pipe = FluxImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+ prompt="In a forest, a wooden plank sign reading DiffSynth",
+ height=576, width=1024,
+)
+image.save("image.jpg")
+```
+
+
+
+从这个例子中,我们可以看到,DiffSynth 中有两个关键模块:`ModelManager` 和 `Pipeline`,接下来我们详细介绍。
+
+## 下载和加载模型
+
+`ModelManager` 负责下载和加载模型,通过以下代码可以直接一步完成。
+
+```python
+import torch
+from diffsynth import ModelManager
+
+model_manager = ModelManager(
+ torch_dtype=torch.bfloat16,
+ device="cuda",
+ model_id_list=["FLUX.1-dev"]
+)
+```
+
+当然,我们也支持分步完成,以下代码和上述代码的行为是等价的。
+
+```python
+import torch
+from diffsynth import download_models, ModelManager
+
+download_models(["FLUX.1-dev"])
+model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cuda")
+model_manager.load_models([
+ "models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
+ "models/FLUX/FLUX.1-dev/ae.safetensors",
+ "models/FLUX/FLUX.1-dev/flux1-dev.safetensors"
+])
+```
+
+下载模型时,我们支持从 [ModelScope](https://www.modelscope.cn/) 和 [HuggingFace](https://huggingface.co/) 下载模型,也支持下载非预置的模型,关于模型下载的更多信息请参考[模型下载](./DownloadModels.md)。
+
+加载模型时,你可以把所有想要加载的模型路径放入其中。对于 `.safetensors` 等格式的模型权重文件,`ModelManager` 在加载后会自动判断模型类型;对于文件夹格式的模型,`ModelManager` 会尝试解析其中的 `config.json` 文件并尝试调用 `transformers` 等第三方库中的对应模块。关于 DiffSynth-Studio 支持的模型,请参考[支持的模型](./Models.md)。
+
+## 构建 Pipeline
+
+DiffSynth-Studio 提供了多个推理 `Pipeline`,这些 `Pipeline` 可以直接通过 `ModelManager` 获取所需的模型并初始化。例如,FLUX.1-dev 模型的文生图 `Pipeline` 可以这样构建:
+
+```python
+pipe = FluxImagePipeline.from_model_manager(model_manager)
+```
+
+更多用于图像生成和视频生成的 `Pipeline` 详见[推理流水线](./Pipelines.md)。
diff --git a/docs/source/tutorial/DownloadModels.md b/docs/source/tutorial/DownloadModels.md
index 0922cc6..7ed9173 100644
--- a/docs/source/tutorial/DownloadModels.md
+++ b/docs/source/tutorial/DownloadModels.md
@@ -1,34 +1,34 @@
-# 下载模型
-
-我们在 DiffSynth-Studio 中预置了一些主流 Diffusion 模型的下载链接,你可以下载并使用这些模型。
-
-## 下载预置模型
-
-你可以直接使用 `download_models` 函数下载预置的模型文件,其中模型 ID 可参考 [config file](/diffsynth/configs/model_config.py)。
-
-```python
-from diffsynth import download_models
-
-download_models(["FLUX.1-dev"])
-```
-
-对于 VSCode 用户,激活 Pylance 或其他 Python 语言服务后,在代码中输入 `""` 即可显示支持的所有模型 ID。
-
-
-
-## 下载非预置模型
-
-你可以选择 [ModelScope](https://modelscope.cn/models) 和 [HuggingFace](https://huggingface.co/models) 两个下载源中的模型。当然,你也可以通过浏览器等工具选择手动下载自己所需的模型。
-
-```python
-from diffsynth import download_customized_models
-
-download_customized_models(
- model_id="Kwai-Kolors/Kolors",
- origin_file_path="vae/diffusion_pytorch_model.fp16.bin",
- local_dir="models/kolors/Kolors/vae",
- downloading_priority=["ModelScope", "HuggingFace"]
-)
-```
-
-在这段代码中,我们将会按照下载的优先级,优先从 `ModelScope` 下载,在 ID 为 `Kwai-Kolors/Kolors` 的[模型库](https://modelscope.cn/models/Kwai-Kolors/Kolors)中,把文件 `vae/diffusion_pytorch_model.fp16.bin` 下载到本地的路径 `models/kolors/Kolors/vae` 中。
+# 下载模型
+
+我们在 DiffSynth-Studio 中预置了一些主流 Diffusion 模型的下载链接,你可以下载并使用这些模型。
+
+## 下载预置模型
+
+你可以直接使用 `download_models` 函数下载预置的模型文件,其中模型 ID 可参考 [config file](/diffsynth/configs/model_config.py)。
+
+```python
+from diffsynth import download_models
+
+download_models(["FLUX.1-dev"])
+```
+
+对于 VSCode 用户,激活 Pylance 或其他 Python 语言服务后,在代码中输入 `""` 即可显示支持的所有模型 ID。
+
+
+
+## 下载非预置模型
+
+你可以选择 [ModelScope](https://modelscope.cn/models) 和 [HuggingFace](https://huggingface.co/models) 两个下载源中的模型。当然,你也可以通过浏览器等工具选择手动下载自己所需的模型。
+
+```python
+from diffsynth import download_customized_models
+
+download_customized_models(
+ model_id="Kwai-Kolors/Kolors",
+ origin_file_path="vae/diffusion_pytorch_model.fp16.bin",
+ local_dir="models/kolors/Kolors/vae",
+ downloading_priority=["ModelScope", "HuggingFace"]
+)
+```
+
+在这段代码中,我们将会按照下载的优先级,优先从 `ModelScope` 下载,在 ID 为 `Kwai-Kolors/Kolors` 的[模型库](https://modelscope.cn/models/Kwai-Kolors/Kolors)中,把文件 `vae/diffusion_pytorch_model.fp16.bin` 下载到本地的路径 `models/kolors/Kolors/vae` 中。
diff --git a/docs/source/tutorial/Extensions.md b/docs/source/tutorial/Extensions.md
index 0c2ad99..a38b061 100644
--- a/docs/source/tutorial/Extensions.md
+++ b/docs/source/tutorial/Extensions.md
@@ -1,49 +1,49 @@
-# 扩展功能
-
-本文档介绍了一些在 DiffSynth 实现的 Diffusion 模型之外的相关技术,这些模型在图像和视频处理方面具有显著的应用潜力。
-
-- **[RIFE](https://github.com/hzwer/ECCV2022-RIFE)**:RIFE 是一个基于实时中间流估计的帧插值方法。采用 IFNet 结构的模型,能够以很快的速度端到端估计中间流。RIFE 不依赖于预训练的光流模型,能够支持任意时间步的帧插值,通过时间编码输入进行处理。
-
- 在这段代码中,我们用 RIFE 模型把视频的帧数提升到原来的两倍。
-
- ```python
- from diffsynth import VideoData, ModelManager, save_video
- from diffsynth.extensions.RIFE import RIFEInterpolater
-
- model_manager = ModelManager(model_id_list=["RIFE"])
- rife = RIFEInterpolater.from_model_manager(model_manager)
- video = VideoData("input_video.mp4", height=512, width=768).raw_data()
- video = rife.interpolate(video)
- save_video(video, "output_video.mp4", fps=60)
- ```
-
-- **[ESRGAN](https://github.com/xinntao/ESRGAN)**: ESRGAN 是一个图像超分辨率模型,能够实现四倍的分辨率提升。该方法通过优化网络架构、对抗损失和感知损失,显著提升了生成图像的真实感。
-
- 在这段代码中,我们用 ESRGAN 模型把图像分辨率提升到原来的四倍。
-
- ```python
- from PIL import Image
- from diffsynth import ModelManager
- from diffsynth.extensions.ESRGAN import ESRGAN
-
- model_manager = ModelManager(model_id_list=["ESRGAN_x4"])
- rife = ESRGAN.from_model_manager(model_manager)
- image = Image.open("input_image.jpg")
- image = rife.upscale(image)
- image.save("output_image.jpg")
- ```
-
-- **[FastBlend](https://arxiv.org/abs/2311.09265)**: FastBlend 不依赖模型的视频去闪烁算法,在使用图像生成模型逐帧处理过的视频(风格视频)中,通常会出现闪烁问题,FastBlend 则可以根据原视频(引导视频)中的运动特征,消除风格视频中的闪烁。
-
- 在这段代码中,我们用 FastBlend 把风格视频中的闪烁效果删除。
-
- ```python
- from diffsynth import VideoData, save_video
- from diffsynth.extensions.FastBlend import FastBlendSmoother
-
- fastblend = FastBlendSmoother()
- guide_video = VideoData("guide_video.mp4", height=512, width=768).raw_data()
- style_video = VideoData("style_video.mp4", height=512, width=768).raw_data()
- output_video = fastblend(style_video, original_frames=guide_video)
- save_video(output_video, "output_video.mp4", fps=30)
- ```
+# 扩展功能
+
+本文档介绍了一些在 DiffSynth 实现的 Diffusion 模型之外的相关技术,这些模型在图像和视频处理方面具有显著的应用潜力。
+
+- **[RIFE](https://github.com/hzwer/ECCV2022-RIFE)**:RIFE 是一个基于实时中间流估计的帧插值方法。采用 IFNet 结构的模型,能够以很快的速度端到端估计中间流。RIFE 不依赖于预训练的光流模型,能够支持任意时间步的帧插值,通过时间编码输入进行处理。
+
+ 在这段代码中,我们用 RIFE 模型把视频的帧数提升到原来的两倍。
+
+ ```python
+ from diffsynth import VideoData, ModelManager, save_video
+ from diffsynth.extensions.RIFE import RIFEInterpolater
+
+ model_manager = ModelManager(model_id_list=["RIFE"])
+ rife = RIFEInterpolater.from_model_manager(model_manager)
+ video = VideoData("input_video.mp4", height=512, width=768).raw_data()
+ video = rife.interpolate(video)
+ save_video(video, "output_video.mp4", fps=60)
+ ```
+
+- **[ESRGAN](https://github.com/xinntao/ESRGAN)**: ESRGAN 是一个图像超分辨率模型,能够实现四倍的分辨率提升。该方法通过优化网络架构、对抗损失和感知损失,显著提升了生成图像的真实感。
+
+ 在这段代码中,我们用 ESRGAN 模型把图像分辨率提升到原来的四倍。
+
+ ```python
+ from PIL import Image
+ from diffsynth import ModelManager
+ from diffsynth.extensions.ESRGAN import ESRGAN
+
+ model_manager = ModelManager(model_id_list=["ESRGAN_x4"])
+ rife = ESRGAN.from_model_manager(model_manager)
+ image = Image.open("input_image.jpg")
+ image = rife.upscale(image)
+ image.save("output_image.jpg")
+ ```
+
+- **[FastBlend](https://arxiv.org/abs/2311.09265)**: FastBlend 是一个不依赖模型的视频去闪烁算法。在使用图像生成模型逐帧处理过的视频(风格视频)中,通常会出现闪烁问题,FastBlend 则可以根据原视频(引导视频)中的运动特征,消除风格视频中的闪烁。
+
+ 在这段代码中,我们用 FastBlend 把风格视频中的闪烁效果删除。
+
+ ```python
+ from diffsynth import VideoData, save_video
+ from diffsynth.extensions.FastBlend import FastBlendSmoother
+
+ fastblend = FastBlendSmoother()
+ guide_video = VideoData("guide_video.mp4", height=512, width=768).raw_data()
+ style_video = VideoData("style_video.mp4", height=512, width=768).raw_data()
+ output_video = fastblend(style_video, original_frames=guide_video)
+ save_video(output_video, "output_video.mp4", fps=30)
+ ```
diff --git a/docs/source/tutorial/Installation.md b/docs/source/tutorial/Installation.md
index 4c9f4e2..3831cd2 100644
--- a/docs/source/tutorial/Installation.md
+++ b/docs/source/tutorial/Installation.md
@@ -1,26 +1,26 @@
-# 安装
-
-目前,DiffSynth-Studio 支持从 GitHub 克隆安装或使用 pip 安装,我们建议用户从 GitHub 克隆安装,从而体验最新的功能。
-
-## 从源码下载
-
-1. 克隆源码仓库:
-
- ```bash
- git clone https://github.com/modelscope/DiffSynth-Studio.git
- ```
-
-2. 进入项目目录并安装:
-
- ```bash
- cd DiffSynth-Studio
- pip install -e .
- ```
-
-## 使用 PyPI 下载
-
-直接通过 PyPI 安装(功能更新存在延后):
-
-```bash
-pip install diffsynth
+# 安装
+
+目前,DiffSynth-Studio 支持从 GitHub 克隆安装或使用 pip 安装,我们建议用户从 GitHub 克隆安装,从而体验最新的功能。
+
+## 从源码下载
+
+1. 克隆源码仓库:
+
+ ```bash
+ git clone https://github.com/modelscope/DiffSynth-Studio.git
+ ```
+
+2. 进入项目目录并安装:
+
+ ```bash
+ cd DiffSynth-Studio
+ pip install -e .
+ ```
+
+## 使用 PyPI 下载
+
+直接通过 PyPI 安装(功能更新存在延后):
+
+```bash
+pip install diffsynth
```
\ No newline at end of file
diff --git a/docs/source/tutorial/Models.md b/docs/source/tutorial/Models.md
index ff8a479..d1a7ed0 100644
--- a/docs/source/tutorial/Models.md
+++ b/docs/source/tutorial/Models.md
@@ -1,18 +1,18 @@
-# 模型
-
-目前为止,DiffSynth Studio 支持的模型如下所示:
-
-* [CogVideoX](https://huggingface.co/THUDM/CogVideoX-5b)
-* [FLUX](https://huggingface.co/black-forest-labs/FLUX.1-dev)
-* [ExVideo](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1)
-* [Kolors](https://huggingface.co/Kwai-Kolors/Kolors)
-* [Stable Diffusion 3](https://huggingface.co/stabilityai/stable-diffusion-3-medium)
-* [Stable Video Diffusion](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt)
-* [Hunyuan-DiT](https://github.com/Tencent/HunyuanDiT)
-* [RIFE](https://github.com/hzwer/ECCV2022-RIFE)
-* [ESRGAN](https://github.com/xinntao/ESRGAN)
-* [Ip-Adapter](https://github.com/tencent-ailab/IP-Adapter)
-* [AnimateDiff](https://github.com/guoyww/animatediff/)
-* [ControlNet](https://github.com/lllyasviel/ControlNet)
-* [Stable Diffusion XL](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
-* [Stable Diffusion](https://huggingface.co/runwayml/stable-diffusion-v1-5)
+# 模型
+
+目前为止,DiffSynth-Studio 支持的模型如下所示:
+
+* [CogVideoX](https://huggingface.co/THUDM/CogVideoX-5b)
+* [FLUX](https://huggingface.co/black-forest-labs/FLUX.1-dev)
+* [ExVideo](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1)
+* [Kolors](https://huggingface.co/Kwai-Kolors/Kolors)
+* [Stable Diffusion 3](https://huggingface.co/stabilityai/stable-diffusion-3-medium)
+* [Stable Video Diffusion](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt)
+* [Hunyuan-DiT](https://github.com/Tencent/HunyuanDiT)
+* [RIFE](https://github.com/hzwer/ECCV2022-RIFE)
+* [ESRGAN](https://github.com/xinntao/ESRGAN)
+* [Ip-Adapter](https://github.com/tencent-ailab/IP-Adapter)
+* [AnimateDiff](https://github.com/guoyww/animatediff/)
+* [ControlNet](https://github.com/lllyasviel/ControlNet)
+* [Stable Diffusion XL](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+* [Stable Diffusion](https://huggingface.co/runwayml/stable-diffusion-v1-5)
diff --git a/docs/source/tutorial/Pipelines.md b/docs/source/tutorial/Pipelines.md
index 21810a1..e58acad 100644
--- a/docs/source/tutorial/Pipelines.md
+++ b/docs/source/tutorial/Pipelines.md
@@ -1,22 +1,22 @@
-# 流水线
-
-DiffSynth-Studio 中包括多个流水线,分为图像生成和视频生成两类。
-
-## 图像生成流水线
-
-| Pipeline | Models |
-|----------------------------|----------------------------------------------------------------|
-| SDImagePipeline | text_encoder: SDTextEncoder<br>unet: SDUNet<br>vae_decoder: SDVAEDecoder<br>vae_encoder: SDVAEEncoder<br>controlnet: MultiControlNetManager<br>ipadapter_image_encoder: IpAdapterCLIPImageEmbedder<br>ipadapter: SDIpAdapter |
-| SDXLImagePipeline | text_encoder: SDXLTextEncoder<br>text_encoder_2: SDXLTextEncoder2<br>text_encoder_kolors: ChatGLMModel<br>unet: SDXLUNet<br>vae_decoder: SDXLVAEDecoder<br>vae_encoder: SDXLVAEEncoder<br>controlnet: MultiControlNetManager<br>ipadapter_image_encoder: IpAdapterXLCLIPImageEmbedder<br>ipadapter: SDXLIpAdapter |
-| SD3ImagePipeline | text_encoder_1: SD3TextEncoder1<br>text_encoder_2: SD3TextEncoder2<br>text_encoder_3: SD3TextEncoder3<br>dit: SD3DiT<br>vae_decoder: SD3VAEDecoder<br>vae_encoder: SD3VAEEncoder |
-| HunyuanDiTImagePipeline | text_encoder: HunyuanDiTCLIPTextEncoder<br>text_encoder_t5: HunyuanDiTT5TextEncoder<br>dit: HunyuanDiT<br>vae_decoder: SDVAEDecoder<br>vae_encoder: SDVAEEncoder |
-| FluxImagePipeline | text_encoder_1: FluxTextEncoder1<br>text_encoder_2: FluxTextEncoder2<br>dit: FluxDiT<br>vae_decoder: FluxVAEDecoder<br>vae_encoder: FluxVAEEncoder |
-
-## 视频生成流水线
-
-| Pipeline | Models |
-|----------------------------|----------------------------------------------------------------|
-| SDVideoPipeline | text_encoder: SDTextEncoder<br>unet: SDUNet<br>vae_decoder: SDVAEDecoder<br>vae_encoder: SDVAEEncoder<br>controlnet: MultiControlNetManager<br>ipadapter_image_encoder: IpAdapterCLIPImageEmbedder<br>ipadapter: SDIpAdapter<br>motion_modules: SDMotionModel |
-| SDXLVideoPipeline | text_encoder: SDXLTextEncoder<br>text_encoder_2: SDXLTextEncoder2<br>text_encoder_kolors: ChatGLMModel<br>unet: SDXLUNet<br>vae_decoder: SDXLVAEDecoder<br>vae_encoder: SDXLVAEEncoder<br>ipadapter_image_encoder: IpAdapterXLCLIPImageEmbedder<br>ipadapter: SDXLIpAdapter<br>motion_modules: SDXLMotionModel |
-| SVDVideoPipeline | image_encoder: SVDImageEncoder<br>unet: SVDUNet<br>vae_encoder: SVDVAEEncoder<br>vae_decoder: SVDVAEDecoder |
-| CogVideoPipeline | text_encoder: FluxTextEncoder2<br>dit: CogDiT<br>vae_encoder: CogVAEEncoder<br>vae_decoder: CogVAEDecoder |
+# 流水线
+
+DiffSynth-Studio 中包括多个流水线,分为图像生成和视频生成两类。
+
+## 图像生成流水线
+
+| Pipeline | Models |
+|----------------------------|----------------------------------------------------------------|
+| SDImagePipeline | text_encoder: SDTextEncoder<br>unet: SDUNet<br>vae_decoder: SDVAEDecoder<br>vae_encoder: SDVAEEncoder<br>controlnet: MultiControlNetManager<br>ipadapter_image_encoder: IpAdapterCLIPImageEmbedder<br>ipadapter: SDIpAdapter |
+| SDXLImagePipeline | text_encoder: SDXLTextEncoder<br>text_encoder_2: SDXLTextEncoder2<br>text_encoder_kolors: ChatGLMModel<br>unet: SDXLUNet<br>vae_decoder: SDXLVAEDecoder<br>vae_encoder: SDXLVAEEncoder<br>controlnet: MultiControlNetManager<br>ipadapter_image_encoder: IpAdapterXLCLIPImageEmbedder<br>ipadapter: SDXLIpAdapter |
+| SD3ImagePipeline | text_encoder_1: SD3TextEncoder1<br>text_encoder_2: SD3TextEncoder2<br>text_encoder_3: SD3TextEncoder3<br>dit: SD3DiT<br>vae_decoder: SD3VAEDecoder<br>vae_encoder: SD3VAEEncoder |
+| HunyuanDiTImagePipeline | text_encoder: HunyuanDiTCLIPTextEncoder<br>text_encoder_t5: HunyuanDiTT5TextEncoder<br>dit: HunyuanDiT<br>vae_decoder: SDVAEDecoder<br>vae_encoder: SDVAEEncoder |
+| FluxImagePipeline | text_encoder_1: FluxTextEncoder1<br>text_encoder_2: FluxTextEncoder2<br>dit: FluxDiT<br>vae_decoder: FluxVAEDecoder<br>vae_encoder: FluxVAEEncoder |
+
+## 视频生成流水线
+
+| Pipeline | Models |
+|----------------------------|----------------------------------------------------------------|
+| SDVideoPipeline | text_encoder: SDTextEncoder<br>unet: SDUNet<br>vae_decoder: SDVAEDecoder<br>vae_encoder: SDVAEEncoder<br>controlnet: MultiControlNetManager<br>ipadapter_image_encoder: IpAdapterCLIPImageEmbedder<br>ipadapter: SDIpAdapter<br>motion_modules: SDMotionModel |
+| SDXLVideoPipeline | text_encoder: SDXLTextEncoder<br>text_encoder_2: SDXLTextEncoder2<br>text_encoder_kolors: ChatGLMModel<br>unet: SDXLUNet<br>vae_decoder: SDXLVAEDecoder<br>vae_encoder: SDXLVAEEncoder<br>ipadapter_image_encoder: IpAdapterXLCLIPImageEmbedder<br>ipadapter: SDXLIpAdapter<br>motion_modules: SDXLMotionModel |
+| SVDVideoPipeline | image_encoder: SVDImageEncoder<br>unet: SVDUNet<br>vae_encoder: SVDVAEEncoder<br>vae_decoder: SVDVAEDecoder |
+| CogVideoPipeline | text_encoder: FluxTextEncoder2<br>dit: CogDiT<br>vae_encoder: CogVAEEncoder<br>vae_decoder: CogVAEDecoder |
diff --git a/docs/source/tutorial/PromptProcessing.md b/docs/source/tutorial/PromptProcessing.md
index 539aa5d..7356ba3 100644
--- a/docs/source/tutorial/PromptProcessing.md
+++ b/docs/source/tutorial/PromptProcessing.md
@@ -1,37 +1,37 @@
-# 提示词处理
-
-DiffSynth 内置了提示词处理功能,分为:
-
-- **提示词润色器(`prompt_refiner_classes`)**:包括提示词润色、提示词中译英、提示词同时润色与中译英,可选参数如下:
-
- - **英文提示词润色**:'BeautifulPrompt',使用到的是[pai-bloom-1b1-text2prompt-sd](https://modelscope.cn/models/AI-ModelScope/pai-bloom-1b1-text2prompt-sd)。
-
- - **提示词中译英**:'Translator',使用到的是[opus-mt-zh-e](https://modelscope.cn/models/moxying/opus-mt-zh-en)。
-
- - **提示词中译英并润色**:'QwenPrompt',使用到的是[Qwen2-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2-1.5B-Instruct)。
-
-- **提示词扩展器(`prompt_extender_classes`)**:基于Omost的提示词分区控制扩写,可选参数如下:
-
- - **提示词分区扩写**:'OmostPromter'。
-
-
-## 使用说明
-
-### 提示词润色器
-
-在加载模型 Pipeline 时,可以通过参数 `prompt_refiner_classes` 指定所需的提示词润色器功能。有关示例代码,请参考 [sd_prompt_refining.py](examples/image_synthesis/sd_prompt_refining.py)。
-
-可选的 `prompt_refiner_classes` 参数包括:Translator、BeautifulPrompt、QwenPrompt。
-
-```python
-pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[Translator, BeautifulPrompt])
-```
-
-### 提示词扩展器
-
-在加载模型 Pipeline 时,可以通过参数 `prompt_extender_classes` 指定所需的提示词扩展器。有关示例代码,请参考 [omost_flux_text_to_image.py](examples/image_synthesis/omost_flux_text_to_image.py)。
-
-```python
-pipe = FluxImagePipeline.from_model_manager(model_manager, prompt_extender_classes=[OmostPromter])
-```
-
+# 提示词处理
+
+DiffSynth 内置了提示词处理功能,分为:
+
+- **提示词润色器(`prompt_refiner_classes`)**:包括提示词润色、提示词中译英、提示词同时润色与中译英,可选参数如下:
+
+ - **英文提示词润色**:'BeautifulPrompt',使用到的是[pai-bloom-1b1-text2prompt-sd](https://modelscope.cn/models/AI-ModelScope/pai-bloom-1b1-text2prompt-sd)。
+
+ - **提示词中译英**:'Translator',使用到的是[opus-mt-zh-en](https://modelscope.cn/models/moxying/opus-mt-zh-en)。
+
+ - **提示词中译英并润色**:'QwenPrompt',使用到的是[Qwen2-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2-1.5B-Instruct)。
+
+- **提示词扩展器(`prompt_extender_classes`)**:基于Omost的提示词分区控制扩写,可选参数如下:
+
+ - **提示词分区扩写**:'OmostPromter'。
+
+
+## 使用说明
+
+### 提示词润色器
+
+在加载模型 Pipeline 时,可以通过参数 `prompt_refiner_classes` 指定所需的提示词润色器功能。有关示例代码,请参考 [sd_prompt_refining.py](examples/image_synthesis/sd_prompt_refining.py)。
+
+可选的 `prompt_refiner_classes` 参数包括:Translator、BeautifulPrompt、QwenPrompt。
+
+```python
+pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[Translator, BeautifulPrompt])
+```
+
+### 提示词扩展器
+
+在加载模型 Pipeline 时,可以通过参数 `prompt_extender_classes` 指定所需的提示词扩展器。有关示例代码,请参考 [omost_flux_text_to_image.py](examples/image_synthesis/omost_flux_text_to_image.py)。
+
+```python
+pipe = FluxImagePipeline.from_model_manager(model_manager, prompt_extender_classes=[OmostPromter])
+```
+
diff --git a/docs/source/tutorial/Schedulers.md b/docs/source/tutorial/Schedulers.md
index 07635db..a2adfd9 100644
--- a/docs/source/tutorial/Schedulers.md
+++ b/docs/source/tutorial/Schedulers.md
@@ -1,11 +1,11 @@
-# 调度器
-
-调度器(Scheduler)控制模型的整个去噪(或采样)过程。在加载 Pipeline 时,DiffSynth 会自动选择最适合当前 Pipeline 的调度器,**无需额外配置**。
-
-我们支持的调度器包括:
-
-- **EnhancedDDIMScheduler**:扩展了去噪扩散概率模型(DDPM)中的去噪过程,引入了非马尔可夫指导。
-
-- **FlowMatchScheduler**:实现了 [Stable Diffusion 3](https://arxiv.org/abs/2403.03206) 中提出的流量匹配采样方法。
-
-- **ContinuousODEScheduler**:基于常微分方程(ODE)的调度器。
+# 调度器
+
+调度器(Scheduler)控制模型的整个去噪(或采样)过程。在加载 Pipeline 时,DiffSynth 会自动选择最适合当前 Pipeline 的调度器,**无需额外配置**。
+
+我们支持的调度器包括:
+
+- **EnhancedDDIMScheduler**:扩展了去噪扩散概率模型(DDPM)中的去噪过程,引入了非马尔可夫指导。
+
+- **FlowMatchScheduler**:实现了 [Stable Diffusion 3](https://arxiv.org/abs/2403.03206) 中提出的流匹配(Flow Matching)采样方法。
+
+- **ContinuousODEScheduler**:基于常微分方程(ODE)的调度器。
diff --git a/docs/source_en/GetStarted/A_simple_example.md b/docs/source_en/GetStarted/A_simple_example.md
index 9d28911..1eb91d8 100644
--- a/docs/source_en/GetStarted/A_simple_example.md
+++ b/docs/source_en/GetStarted/A_simple_example.md
@@ -1,82 +1,82 @@
-
-# A Simple Example: Text-to-Image Synthesis with Flux
-
-The following example shows how to use the FLUX.1 model for text-to-image tasks. The script provides a simple setup for generating images from text descriptions. It covers downloading the necessary models, configuring the pipeline, and generating images with and without classifier-free guidance.
-
-For other models supported by DiffSynth, see [Models.md](Models.md).
-
-## Setup
-
-First, ensure you have the necessary models downloaded and configured:
-
-```python
-import torch
-from diffsynth import ModelManager, FluxImagePipeline, download_models
-
-# Download the FLUX.1-dev model files
-download_models(["FLUX.1-dev"])
-```
-
-For instructions on downloading models, see [Download_models.md](Download_models.md).
-
-## Loading Models
-Initialize the model manager with your device and data type:
-
-```python
-model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cuda")
-model_manager.load_models([
- "models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
- "models/FLUX/FLUX.1-dev/text_encoder_2",
- "models/FLUX/FLUX.1-dev/ae.safetensors",
- "models/FLUX/FLUX.1-dev/flux1-dev.safetensors"
-])
-```
-
-For instructions on loading models, see [ModelManager.md](ModelManager.md).
-
-## Creating the Pipeline
-Create an instance of the FluxImagePipeline from the loaded model manager:
-
-
-```python
-pipe = FluxImagePipeline.from_model_manager(model_manager)
-```
-
-For instructions on using the Pipeline, see [Pipeline.md](Pipeline.md).
-## Text-to-Image Synthesis
-Generate an image using a short prompt. Below are examples of generating images with and without classifier-free guidance.
-
-### Basic Generation
-```python
-prompt = "A cute little turtle"
-negative_prompt = ""
-
-torch.manual_seed(6)
-image = pipe(
- prompt=prompt,
- num_inference_steps=30, embedded_guidance=3.5
-)
-image.save("image_1024.jpg")
-```
-
-### Generation with Classifier-Free Guidance
-```python
-torch.manual_seed(6)
-image = pipe(
- prompt=prompt, negative_prompt=negative_prompt,
- num_inference_steps=30, cfg_scale=2.0, embedded_guidance=3.5
-)
-image.save("image_1024_cfg.jpg")
-```
-
-### High-Resolution Fix
-```python
-torch.manual_seed(7)
-image = pipe(
- prompt=prompt,
- num_inference_steps=30, embedded_guidance=3.5,
- input_image=image.resize((2048, 2048)), height=2048, width=2048, denoising_strength=0.6, tiled=True
-)
-image.save("image_2048_highres.jpg")
-```
-
+
+# A Simple Example: Text-to-Image Synthesis with Flux
+
+The following example shows how to use the FLUX.1 model for text-to-image tasks. The script provides a simple setup for generating images from text descriptions. It covers downloading the necessary models, configuring the pipeline, and generating images with and without classifier-free guidance.
+
+For other models supported by DiffSynth, see [Models.md](Models.md).
+
+## Setup
+
+First, ensure you have the necessary models downloaded and configured:
+
+```python
+import torch
+from diffsynth import ModelManager, FluxImagePipeline, download_models
+
+# Download the FLUX.1-dev model files
+download_models(["FLUX.1-dev"])
+```
+
+For instructions on downloading models, see [Download_models.md](Download_models.md).
+
+## Loading Models
+Initialize the model manager with your device and data type:
+
+```python
+model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cuda")
+model_manager.load_models([
+ "models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
+ "models/FLUX/FLUX.1-dev/ae.safetensors",
+ "models/FLUX/FLUX.1-dev/flux1-dev.safetensors"
+])
+```
+
+For instructions on loading models, see [ModelManager.md](ModelManager.md).
+
+## Creating the Pipeline
+Create an instance of the FluxImagePipeline from the loaded model manager:
+
+
+```python
+pipe = FluxImagePipeline.from_model_manager(model_manager)
+```
+
+For instructions on using the Pipeline, see [Pipelines.md](Pipelines.md).
+## Text-to-Image Synthesis
+Generate an image using a short prompt. Below are examples of generating images with and without classifier-free guidance.
+
+### Basic Generation
+```python
+prompt = "A cute little turtle"
+negative_prompt = ""
+
+torch.manual_seed(6)
+image = pipe(
+ prompt=prompt,
+ num_inference_steps=30, embedded_guidance=3.5
+)
+image.save("image_1024.jpg")
+```
+
+### Generation with Classifier-Free Guidance
+```python
+torch.manual_seed(6)
+image = pipe(
+ prompt=prompt, negative_prompt=negative_prompt,
+ num_inference_steps=30, cfg_scale=2.0, embedded_guidance=3.5
+)
+image.save("image_1024_cfg.jpg")
+```
+
+### High-Resolution Fix
+```python
+torch.manual_seed(7)
+image = pipe(
+ prompt=prompt,
+ num_inference_steps=30, embedded_guidance=3.5,
+ input_image=image.resize((2048, 2048)), height=2048, width=2048, denoising_strength=0.6, tiled=True
+)
+image.save("image_2048_highres.jpg")
+```
+
diff --git a/docs/source_en/GetStarted/Download_models.md b/docs/source_en/GetStarted/Download_models.md
index 50b3bf1..e52160f 100644
--- a/docs/source_en/GetStarted/Download_models.md
+++ b/docs/source_en/GetStarted/Download_models.md
@@ -1,20 +1,20 @@
-# Download Models
-
-Download the pre-set models. Model IDs can be found in [config file](/diffsynth/configs/model_config.py).
-
-```python
-from diffsynth import download_models
-
-download_models(["FLUX.1-dev", "Kolors"])
-```
-
-To download non-pre-set models, you can choose models from either the [ModelScope](https://modelscope.cn/models) or [HuggingFace](https://huggingface.co/models) sources.
-
-```python
-from diffsynth.models.downloader import download_from_huggingface, download_from_modelscope
-
-# From Modelscope (recommended)
-download_from_modelscope("Kwai-Kolors/Kolors", "vae/diffusion_pytorch_model.fp16.bin", "models/kolors/Kolors/vae")
-# From Huggingface
-download_from_huggingface("Kwai-Kolors/Kolors", "vae/diffusion_pytorch_model.fp16.safetensors", "models/kolors/Kolors/vae")
-```
+# Download Models
+
+Download the pre-set models. Model IDs can be found in [config file](/diffsynth/configs/model_config.py).
+
+```python
+from diffsynth import download_models
+
+download_models(["FLUX.1-dev", "Kolors"])
+```
+
+To download non-pre-set models, you can choose models from either the [ModelScope](https://modelscope.cn/models) or [HuggingFace](https://huggingface.co/models) sources.
+
+```python
+from diffsynth.models.downloader import download_from_huggingface, download_from_modelscope
+
+# From Modelscope (recommended)
+download_from_modelscope("Kwai-Kolors/Kolors", "vae/diffusion_pytorch_model.fp16.bin", "models/kolors/Kolors/vae")
+# From Huggingface
+download_from_huggingface("Kwai-Kolors/Kolors", "vae/diffusion_pytorch_model.fp16.safetensors", "models/kolors/Kolors/vae")
+```
diff --git a/docs/source_en/GetStarted/Extensions.md b/docs/source_en/GetStarted/Extensions.md
index 037e8d5..97cbda8 100644
--- a/docs/source_en/GetStarted/Extensions.md
+++ b/docs/source_en/GetStarted/Extensions.md
@@ -1,10 +1,10 @@
-# Extensions
-
-This document introduces some relevant techniques beyond the diffusion models implemented in DiffSynth, which have significant application potential in image and video processing.
-
-- **[RIFE](https://github.com/hzwer/ECCV2022-RIFE)**: FIRE (Real-Time Intermediate Flow Estimation Algorithm) is a frame interpolation (VFI) method based on real-time intermediate flow estimation. It includes an end-to-end efficient intermediate flow estimation network called IFNet, as well as an optical flow supervision framework based on privileged distillation. RIFE supports inserting frames at any moment between two frames, achieving state-of-the-art performance across multiple datasets without relying on any pre-trained models.
-
-- **[ESRGAN](https://github.com/xinntao/ESRGAN)**: ESRGAN (Enhanced Super Resolution Generative Adversarial Network) is an improved method based on SRGAN, aimed at enhancing the visual quality of single image super-resolution. This approach significantly improves the realism of generated images by optimizing three key components of SRGAN: network architecture, adversarial loss, and perceptual loss.
-
-- **[FastBlend](https://arxiv.org/abs/2311.09265)**: FastBlend is a model-free toolkit designed for smoothing videos, integrated with Diffusion models to create a powerful video processing workflow. This tool effectively eliminates flickering in videos, performs interpolation on keyframe sequences, and can process complete videos based on a single image.
-
+# Extensions
+
+This document introduces some relevant techniques beyond the diffusion models implemented in DiffSynth, which have significant application potential in image and video processing.
+
+- **[RIFE](https://github.com/hzwer/ECCV2022-RIFE)**: RIFE (Real-Time Intermediate Flow Estimation) is a video frame interpolation (VFI) method based on real-time intermediate flow estimation. It includes an end-to-end efficient intermediate flow estimation network called IFNet, as well as an optical flow supervision framework based on privileged distillation. RIFE supports inserting frames at any moment between two frames, achieving state-of-the-art performance across multiple datasets without relying on any pre-trained models.
+
+- **[ESRGAN](https://github.com/xinntao/ESRGAN)**: ESRGAN (Enhanced Super Resolution Generative Adversarial Network) is an improved method based on SRGAN, aimed at enhancing the visual quality of single image super-resolution. This approach significantly improves the realism of generated images by optimizing three key components of SRGAN: network architecture, adversarial loss, and perceptual loss.
+
+- **[FastBlend](https://arxiv.org/abs/2311.09265)**: FastBlend is a model-free toolkit designed for smoothing videos, integrated with Diffusion models to create a powerful video processing workflow. This tool effectively eliminates flickering in videos, performs interpolation on keyframe sequences, and can process complete videos based on a single image.
+
diff --git a/docs/source_en/GetStarted/Fine-tuning.md b/docs/source_en/GetStarted/Fine-tuning.md
index fbcb7f9..2076bdc 100644
--- a/docs/source_en/GetStarted/Fine-tuning.md
+++ b/docs/source_en/GetStarted/Fine-tuning.md
@@ -1,426 +1,426 @@
-# Fine-Tuning
-
-We have implemented a training framework for text-to-image Diffusion models, enabling users to easily train LoRA models using our framework. Our provided scripts come with the following advantages:
-
-* **Comprehensive Functionality & User-Friendliness**: Our training framework supports multi-GPU and multi-machine setups, facilitates the use of DeepSpeed for acceleration, and includes gradient checkpointing optimizations for models with excessive memory demands.
-* **Code Conciseness & Researcher Accessibility**: We avoid large blocks of complicated code. General-purpose modules are implemented in `diffsynth/trainers/text_to_image.py`, while model-specific training scripts contain only minimal code pertinent to the model architecture, making it researcher-friendly.
-* **Modular Design & Developer Flexibility**: Built on the universal Pytorch-Lightning framework, our training framework is decoupled in terms of functionality, allowing developers to easily introduce additional training techniques by modifying our scripts to suit their needs.
-
-Image Examples of fine-tuned LoRA. The prompt is "一只小狗蹦蹦跳跳,周围是姹紫嫣红的鲜花,远处是山脉" (for Chinese models) or "a dog is jumping, flowers around the dog, the background is mountains and clouds" (for English models).
-
-||Kolors|Stable Diffusion 3|Hunyuan-DiT|
-|-|-|-|-|
-|Without LoRA||||
-|With LoRA||||
-
-## Install additional packages
-
-```bash
-pip install peft lightning
-```
-
-## Prepare your dataset
-
-We provide an example dataset [here](https://modelscope.cn/datasets/buptwq/lora-stable-diffusion-finetune/files). You need to manage the training images as follows:
-
-```
-data/dog/
-└── train
- ├── 00.jpg
- ├── 01.jpg
- ├── 02.jpg
- ├── 03.jpg
- ├── 04.jpg
- └── metadata.csv
-```
-
-`metadata.csv`:
-
-```
-file_name,text
-00.jpg,a dog
-01.jpg,a dog
-02.jpg,a dog
-03.jpg,a dog
-04.jpg,a dog
-```
-
-Note that if the model is Chinese model (for example, Hunyuan-DiT and Kolors), we recommand to use Chinese texts in the dataset. For example
-
-```
-file_name,text
-00.jpg,一只小狗
-01.jpg,一只小狗
-02.jpg,一只小狗
-03.jpg,一只小狗
-04.jpg,一只小狗
-```
-
-## Train a LoRA model
-
-General options:
-
-```
- --lora_target_modules LORA_TARGET_MODULES
- Layers with LoRA modules.
- --dataset_path DATASET_PATH
- The path of the Dataset.
- --output_path OUTPUT_PATH
- Path to save the model.
- --steps_per_epoch STEPS_PER_EPOCH
- Number of steps per epoch.
- --height HEIGHT Image height.
- --width WIDTH Image width.
- --center_crop Whether to center crop the input images to the resolution. If not set, the images will be randomly cropped. The images will be resized to the resolution first before cropping.
- --random_flip Whether to randomly flip images horizontally
- --batch_size BATCH_SIZE
- Batch size (per device) for the training dataloader.
- --dataloader_num_workers DATALOADER_NUM_WORKERS
- Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process.
- --precision {32,16,16-mixed}
- Training precision
- --learning_rate LEARNING_RATE
- Learning rate.
- --lora_rank LORA_RANK
- The dimension of the LoRA update matrices.
- --lora_alpha LORA_ALPHA
- The weight of the LoRA update matrices.
- --use_gradient_checkpointing
- Whether to use gradient checkpointing.
- --accumulate_grad_batches ACCUMULATE_GRAD_BATCHES
- The number of batches in gradient accumulation.
- --training_strategy {auto,deepspeed_stage_1,deepspeed_stage_2,deepspeed_stage_3}
- Training strategy
- --max_epochs MAX_EPOCHS
- Number of epochs.
- --modelscope_model_id MODELSCOPE_MODEL_ID
- Model ID on ModelScope (https://www.modelscope.cn/). The model will be uploaded to ModelScope automatically if you provide a Model ID.
- --modelscope_access_token MODELSCOPE_ACCESS_TOKEN
- Access key on ModelScope (https://www.modelscope.cn/). Required if you want to upload the model to ModelScope.
-```
-
-### Kolors
-
-The following files will be used for constructing Kolors. You can download Kolors from [huggingface](https://huggingface.co/Kwai-Kolors/Kolors) or [modelscope](https://modelscope.cn/models/Kwai-Kolors/Kolors). Due to precision overflow issues, we need to download an additional VAE model (from [huggingface](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) or [modelscope](https://modelscope.cn/models/AI-ModelScope/sdxl-vae-fp16-fix)). You can use the following code to download these files:
-
-```python
-from diffsynth import download_models
-
-download_models(["Kolors", "SDXL-vae-fp16-fix"])
-```
-
-```
-models
-├── kolors
-│ └── Kolors
-│ ├── text_encoder
-│ │ ├── config.json
-│ │ ├── pytorch_model-00001-of-00007.bin
-│ │ ├── pytorch_model-00002-of-00007.bin
-│ │ ├── pytorch_model-00003-of-00007.bin
-│ │ ├── pytorch_model-00004-of-00007.bin
-│ │ ├── pytorch_model-00005-of-00007.bin
-│ │ ├── pytorch_model-00006-of-00007.bin
-│ │ ├── pytorch_model-00007-of-00007.bin
-│ │ └── pytorch_model.bin.index.json
-│ ├── unet
-│ │ └── diffusion_pytorch_model.safetensors
-│ └── vae
-│ └── diffusion_pytorch_model.safetensors
-└── sdxl-vae-fp16-fix
- └── diffusion_pytorch_model.safetensors
-```
-
-Launch the training task using the following command:
-
-```
-CUDA_VISIBLE_DEVICES="0" python examples/train/kolors/train_kolors_lora.py \
- --pretrained_unet_path models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors \
- --pretrained_text_encoder_path models/kolors/Kolors/text_encoder \
- --pretrained_fp16_vae_path models/sdxl-vae-fp16-fix/diffusion_pytorch_model.safetensors \
- --dataset_path data/dog \
- --output_path ./models \
- --max_epochs 1 \
- --steps_per_epoch 500 \
- --height 1024 \
- --width 1024 \
- --center_crop \
- --precision "16-mixed" \
- --learning_rate 1e-4 \
- --lora_rank 4 \
- --lora_alpha 4 \
- --use_gradient_checkpointing
-```
-
-For more information about the parameters, please use `python examples/train/kolors/train_kolors_lora.py -h` to see the details.
-
-After training, use `model_manager.load_lora` to load the LoRA for inference.
-
-```python
-from diffsynth import ModelManager, SDXLImagePipeline
-import torch
-
-model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
- file_path_list=[
- "models/kolors/Kolors/text_encoder",
- "models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors",
- "models/sdxl-vae-fp16-fix/diffusion_pytorch_model.safetensors"
- ])
-model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
-pipe = SDXLImagePipeline.from_model_manager(model_manager)
-
-torch.manual_seed(0)
-image = pipe(
- prompt="一只小狗蹦蹦跳跳,周围是姹紫嫣红的鲜花,远处是山脉",
- negative_prompt="",
- cfg_scale=7.5,
- num_inference_steps=100, width=1024, height=1024,
-)
-image.save("image_with_lora.jpg")
-```
-
-### Stable Diffusion 3
-
-Only one file is required in the training script. You can use [`sd3_medium_incl_clips.safetensors`](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips.safetensors) (without T5 encoder) or [`sd3_medium_incl_clips_t5xxlfp16.safetensors`](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips_t5xxlfp16.safetensors) (with T5 encoder). Please use the following code to download these files:
-
-```python
-from diffsynth import download_models
-
-download_models(["StableDiffusion3", "StableDiffusion3_without_T5"])
-```
-
-```
-models/stable_diffusion_3/
-├── Put Stable Diffusion 3 checkpoints here.txt
-├── sd3_medium_incl_clips.safetensors
-└── sd3_medium_incl_clips_t5xxlfp16.safetensors
-```
-
-Launch the training task using the following command:
-
-```
-CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_3/train_sd3_lora.py \
- --pretrained_path models/stable_diffusion_3/sd3_medium_incl_clips.safetensors \
- --dataset_path data/dog \
- --output_path ./models \
- --max_epochs 1 \
- --steps_per_epoch 500 \
- --height 1024 \
- --width 1024 \
- --center_crop \
- --precision "16-mixed" \
- --learning_rate 1e-4 \
- --lora_rank 4 \
- --lora_alpha 4 \
- --use_gradient_checkpointing
-```
-
-For more information about the parameters, please use `python examples/train/stable_diffusion_3/train_sd3_lora.py -h` to see the details.
-
-After training, use `model_manager.load_lora` to load the LoRA for inference.
-
-```python
-from diffsynth import ModelManager, SD3ImagePipeline
-import torch
-
-model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
- file_path_list=["models/stable_diffusion_3/sd3_medium_incl_clips.safetensors"])
-model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
-pipe = SD3ImagePipeline.from_model_manager(model_manager)
-
-torch.manual_seed(0)
-image = pipe(
- prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
- negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
- cfg_scale=7.5,
- num_inference_steps=100, width=1024, height=1024,
-)
-image.save("image_with_lora.jpg")
-```
-
-### Hunyuan-DiT
-
-Four files will be used for constructing Hunyuan DiT. You can download them from [huggingface](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT) or [modelscope](https://www.modelscope.cn/models/modelscope/HunyuanDiT/summary). You can use the following code to download these files:
-
-```python
-from diffsynth import download_models
-
-download_models(["HunyuanDiT"])
-```
-
-```
-models/HunyuanDiT/
-├── Put Hunyuan DiT checkpoints here.txt
-└── t2i
- ├── clip_text_encoder
- │ └── pytorch_model.bin
- ├── model
- │ └── pytorch_model_ema.pt
- ├── mt5
- │ └── pytorch_model.bin
- └── sdxl-vae-fp16-fix
- └── diffusion_pytorch_model.bin
-```
-
-Launch the training task using the following command:
-
-```
-CUDA_VISIBLE_DEVICES="0" python examples/train/hunyuan_dit/train_hunyuan_dit_lora.py \
- --pretrained_path models/HunyuanDiT/t2i \
- --dataset_path data/dog \
- --output_path ./models \
- --max_epochs 1 \
- --steps_per_epoch 500 \
- --height 1024 \
- --width 1024 \
- --center_crop \
- --precision "16-mixed" \
- --learning_rate 1e-4 \
- --lora_rank 4 \
- --lora_alpha 4 \
- --use_gradient_checkpointing
-```
-
-For more information about the parameters, please use `python examples/train/hunyuan_dit/train_hunyuan_dit_lora.py -h` to see the details.
-
-After training, use `model_manager.load_lora` to load the LoRA for inference.
-
-```python
-from diffsynth import ModelManager, HunyuanDiTImagePipeline
-import torch
-
-model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
- file_path_list=[
- "models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin",
- "models/HunyuanDiT/t2i/model/pytorch_model_ema.pt",
- "models/HunyuanDiT/t2i/mt5/pytorch_model.bin",
- "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin"
- ])
-model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
-pipe = HunyuanDiTImagePipeline.from_model_manager(model_manager)
-
-torch.manual_seed(0)
-image = pipe(
- prompt="一只小狗蹦蹦跳跳,周围是姹紫嫣红的鲜花,远处是山脉",
- negative_prompt="",
- cfg_scale=7.5,
- num_inference_steps=100, width=1024, height=1024,
-)
-image.save("image_with_lora.jpg")
-```
-
-### Stable Diffusion
-
-Only one file is required in the training script. We support the mainstream checkpoints in [CivitAI](https://civitai.com/). By default, we use the base Stable Diffusion v1.5. You can download it from [huggingface](https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors) or [modelscope](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-v1-5/resolve/master/v1-5-pruned-emaonly.safetensors). You can use the following code to download this file:
-
-```python
-from diffsynth import download_models
-
-download_models(["StableDiffusion_v15"])
-```
-
-```
-models/stable_diffusion
-├── Put Stable Diffusion checkpoints here.txt
-└── v1-5-pruned-emaonly.safetensors
-```
-
-Launch the training task using the following command:
-
-```
-CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion/train_sd_lora.py \
- --pretrained_path models/stable_diffusion/v1-5-pruned-emaonly.safetensors \
- --dataset_path data/dog \
- --output_path ./models \
- --max_epochs 1 \
- --steps_per_epoch 500 \
- --height 512 \
- --width 512 \
- --center_crop \
- --precision "16-mixed" \
- --learning_rate 1e-4 \
- --lora_rank 4 \
- --lora_alpha 4 \
- --use_gradient_checkpointing
-```
-
-For more information about the parameters, please use `python examples/train/stable_diffusion/train_sd_lora.py -h` to see the details.
-
-After training, use `model_manager.load_lora` to load the LoRA for inference.
-
-```python
-from diffsynth import ModelManager, SDImagePipeline
-import torch
-
-model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
- file_path_list=["models/stable_diffusion/v1-5-pruned-emaonly.safetensors"])
-model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
-pipe = SDImagePipeline.from_model_manager(model_manager)
-
-torch.manual_seed(0)
-image = pipe(
- prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
- negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
- cfg_scale=7.5,
- num_inference_steps=100, width=512, height=512,
-)
-image.save("image_with_lora.jpg")
-```
-
-### Stable Diffusion XL
-
-Only one file is required in the training script. We support the mainstream checkpoints in [CivitAI](https://civitai.com/). By default, we use the base Stable Diffusion XL. You can download it from [huggingface](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors) or [modelscope](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-xl-base-1.0/resolve/master/sd_xl_base_1.0.safetensors). You can use the following code to download this file:
-
-```python
-from diffsynth import download_models
-
-download_models(["StableDiffusionXL_v1"])
-```
-
-```
-models/stable_diffusion_xl
-├── Put Stable Diffusion XL checkpoints here.txt
-└── sd_xl_base_1.0.safetensors
-```
-
-We observed that Stable Diffusion XL is not float16-safe, thus we recommand users to use float32.
-
-```
-CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_xl/train_sdxl_lora.py \
- --pretrained_path models/stable_diffusion_xl/sd_xl_base_1.0.safetensors \
- --dataset_path data/dog \
- --output_path ./models \
- --max_epochs 1 \
- --steps_per_epoch 500 \
- --height 1024 \
- --width 1024 \
- --center_crop \
- --precision "32" \
- --learning_rate 1e-4 \
- --lora_rank 4 \
- --lora_alpha 4 \
- --use_gradient_checkpointing
-```
-
-For more information about the parameters, please use `python examples/train/stable_diffusion_xl/train_sdxl_lora.py -h` to see the details.
-
-After training, use `model_manager.load_lora` to load the LoRA for inference.
-
-```python
-from diffsynth import ModelManager, SDXLImagePipeline
-import torch
-
-model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
- file_path_list=["models/stable_diffusion_xl/sd_xl_base_1.0.safetensors"])
-model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
-pipe = SDXLImagePipeline.from_model_manager(model_manager)
-
-torch.manual_seed(0)
-image = pipe(
- prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
- negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
- cfg_scale=7.5,
- num_inference_steps=100, width=1024, height=1024,
-)
-image.save("image_with_lora.jpg")
-```
+# Fine-Tuning
+
+We have implemented a training framework for text-to-image Diffusion models, enabling users to easily train LoRA models using our framework. Our provided scripts come with the following advantages:
+
+* **Comprehensive Functionality & User-Friendliness**: Our training framework supports multi-GPU and multi-machine setups, facilitates the use of DeepSpeed for acceleration, and includes gradient checkpointing optimizations for models with excessive memory demands.
+* **Code Conciseness & Researcher Accessibility**: We avoid large blocks of complicated code. General-purpose modules are implemented in `diffsynth/trainers/text_to_image.py`, while model-specific training scripts contain only minimal code pertinent to the model architecture, making it researcher-friendly.
+* **Modular Design & Developer Flexibility**: Built on the universal Pytorch-Lightning framework, our training framework is decoupled in terms of functionality, allowing developers to easily introduce additional training techniques by modifying our scripts to suit their needs.
+
+Image Examples of fine-tuned LoRA. The prompt is "一只小狗蹦蹦跳跳,周围是姹紫嫣红的鲜花,远处是山脉" (for Chinese models) or "a dog is jumping, flowers around the dog, the background is mountains and clouds" (for English models).
+
+||Kolors|Stable Diffusion 3|Hunyuan-DiT|
+|-|-|-|-|
+|Without LoRA||||
+|With LoRA||||
+
+## Install additional packages
+
+```bash
+pip install peft lightning
+```
+
+## Prepare your dataset
+
+We provide an example dataset [here](https://modelscope.cn/datasets/buptwq/lora-stable-diffusion-finetune/files). You need to manage the training images as follows:
+
+```
+data/dog/
+└── train
+ ├── 00.jpg
+ ├── 01.jpg
+ ├── 02.jpg
+ ├── 03.jpg
+ ├── 04.jpg
+ └── metadata.csv
+```
+
+`metadata.csv`:
+
+```
+file_name,text
+00.jpg,a dog
+01.jpg,a dog
+02.jpg,a dog
+03.jpg,a dog
+04.jpg,a dog
+```
+
+Note that if the model is a Chinese model (for example, Hunyuan-DiT and Kolors), we recommend using Chinese texts in the dataset. For example:
+
+```
+file_name,text
+00.jpg,一只小狗
+01.jpg,一只小狗
+02.jpg,一只小狗
+03.jpg,一只小狗
+04.jpg,一只小狗
+```
+
+## Train a LoRA model
+
+General options:
+
+```
+ --lora_target_modules LORA_TARGET_MODULES
+ Layers with LoRA modules.
+ --dataset_path DATASET_PATH
+ The path of the Dataset.
+ --output_path OUTPUT_PATH
+ Path to save the model.
+ --steps_per_epoch STEPS_PER_EPOCH
+ Number of steps per epoch.
+ --height HEIGHT Image height.
+ --width WIDTH Image width.
+ --center_crop Whether to center crop the input images to the resolution. If not set, the images will be randomly cropped. The images will be resized to the resolution first before cropping.
+ --random_flip Whether to randomly flip images horizontally
+ --batch_size BATCH_SIZE
+ Batch size (per device) for the training dataloader.
+ --dataloader_num_workers DATALOADER_NUM_WORKERS
+ Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process.
+ --precision {32,16,16-mixed}
+ Training precision
+ --learning_rate LEARNING_RATE
+ Learning rate.
+ --lora_rank LORA_RANK
+ The dimension of the LoRA update matrices.
+ --lora_alpha LORA_ALPHA
+ The weight of the LoRA update matrices.
+ --use_gradient_checkpointing
+ Whether to use gradient checkpointing.
+ --accumulate_grad_batches ACCUMULATE_GRAD_BATCHES
+ The number of batches in gradient accumulation.
+ --training_strategy {auto,deepspeed_stage_1,deepspeed_stage_2,deepspeed_stage_3}
+ Training strategy
+ --max_epochs MAX_EPOCHS
+ Number of epochs.
+ --modelscope_model_id MODELSCOPE_MODEL_ID
+ Model ID on ModelScope (https://www.modelscope.cn/). The model will be uploaded to ModelScope automatically if you provide a Model ID.
+ --modelscope_access_token MODELSCOPE_ACCESS_TOKEN
+ Access key on ModelScope (https://www.modelscope.cn/). Required if you want to upload the model to ModelScope.
+```
+
+### Kolors
+
+The following files will be used for constructing Kolors. You can download Kolors from [huggingface](https://huggingface.co/Kwai-Kolors/Kolors) or [modelscope](https://modelscope.cn/models/Kwai-Kolors/Kolors). Due to precision overflow issues, we need to download an additional VAE model (from [huggingface](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) or [modelscope](https://modelscope.cn/models/AI-ModelScope/sdxl-vae-fp16-fix)). You can use the following code to download these files:
+
+```python
+from diffsynth import download_models
+
+download_models(["Kolors", "SDXL-vae-fp16-fix"])
+```
+
+```
+models
+├── kolors
+│ └── Kolors
+│ ├── text_encoder
+│ │ ├── config.json
+│ │ ├── pytorch_model-00001-of-00007.bin
+│ │ ├── pytorch_model-00002-of-00007.bin
+│ │ ├── pytorch_model-00003-of-00007.bin
+│ │ ├── pytorch_model-00004-of-00007.bin
+│ │ ├── pytorch_model-00005-of-00007.bin
+│ │ ├── pytorch_model-00006-of-00007.bin
+│ │ ├── pytorch_model-00007-of-00007.bin
+│ │ └── pytorch_model.bin.index.json
+│ ├── unet
+│ │ └── diffusion_pytorch_model.safetensors
+│ └── vae
+│ └── diffusion_pytorch_model.safetensors
+└── sdxl-vae-fp16-fix
+ └── diffusion_pytorch_model.safetensors
+```
+
+Launch the training task using the following command:
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/kolors/train_kolors_lora.py \
+ --pretrained_unet_path models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors \
+ --pretrained_text_encoder_path models/kolors/Kolors/text_encoder \
+ --pretrained_fp16_vae_path models/sdxl-vae-fp16-fix/diffusion_pytorch_model.safetensors \
+ --dataset_path data/dog \
+ --output_path ./models \
+ --max_epochs 1 \
+ --steps_per_epoch 500 \
+ --height 1024 \
+ --width 1024 \
+ --center_crop \
+ --precision "16-mixed" \
+ --learning_rate 1e-4 \
+ --lora_rank 4 \
+ --lora_alpha 4 \
+ --use_gradient_checkpointing
+```
+
+For more information about the parameters, please use `python examples/train/kolors/train_kolors_lora.py -h` to see the details.
+
+After training, use `model_manager.load_lora` to load the LoRA for inference.
+
+```python
+from diffsynth import ModelManager, SDXLImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+ file_path_list=[
+ "models/kolors/Kolors/text_encoder",
+ "models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors",
+ "models/sdxl-vae-fp16-fix/diffusion_pytorch_model.safetensors"
+ ])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = SDXLImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+ prompt="一只小狗蹦蹦跳跳,周围是姹紫嫣红的鲜花,远处是山脉",
+ negative_prompt="",
+ cfg_scale=7.5,
+ num_inference_steps=100, width=1024, height=1024,
+)
+image.save("image_with_lora.jpg")
+```
+
+### Stable Diffusion 3
+
+Only one file is required in the training script. You can use [`sd3_medium_incl_clips.safetensors`](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips.safetensors) (without T5 encoder) or [`sd3_medium_incl_clips_t5xxlfp16.safetensors`](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips_t5xxlfp16.safetensors) (with T5 encoder). Please use the following code to download these files:
+
+```python
+from diffsynth import download_models
+
+download_models(["StableDiffusion3", "StableDiffusion3_without_T5"])
+```
+
+```
+models/stable_diffusion_3/
+├── Put Stable Diffusion 3 checkpoints here.txt
+├── sd3_medium_incl_clips.safetensors
+└── sd3_medium_incl_clips_t5xxlfp16.safetensors
+```
+
+Launch the training task using the following command:
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_3/train_sd3_lora.py \
+ --pretrained_path models/stable_diffusion_3/sd3_medium_incl_clips.safetensors \
+ --dataset_path data/dog \
+ --output_path ./models \
+ --max_epochs 1 \
+ --steps_per_epoch 500 \
+ --height 1024 \
+ --width 1024 \
+ --center_crop \
+ --precision "16-mixed" \
+ --learning_rate 1e-4 \
+ --lora_rank 4 \
+ --lora_alpha 4 \
+ --use_gradient_checkpointing
+```
+
+For more information about the parameters, please use `python examples/train/stable_diffusion_3/train_sd3_lora.py -h` to see the details.
+
+After training, use `model_manager.load_lora` to load the LoRA for inference.
+
+```python
+from diffsynth import ModelManager, SD3ImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+ file_path_list=["models/stable_diffusion_3/sd3_medium_incl_clips.safetensors"])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = SD3ImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+ prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
+ negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
+ cfg_scale=7.5,
+ num_inference_steps=100, width=1024, height=1024,
+)
+image.save("image_with_lora.jpg")
+```
+
+### Hunyuan-DiT
+
+Four files will be used for constructing Hunyuan DiT. You can download them from [huggingface](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT) or [modelscope](https://www.modelscope.cn/models/modelscope/HunyuanDiT/summary). You can use the following code to download these files:
+
+```python
+from diffsynth import download_models
+
+download_models(["HunyuanDiT"])
+```
+
+```
+models/HunyuanDiT/
+├── Put Hunyuan DiT checkpoints here.txt
+└── t2i
+ ├── clip_text_encoder
+ │ └── pytorch_model.bin
+ ├── model
+ │ └── pytorch_model_ema.pt
+ ├── mt5
+ │ └── pytorch_model.bin
+ └── sdxl-vae-fp16-fix
+ └── diffusion_pytorch_model.bin
+```
+
+Launch the training task using the following command:
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/hunyuan_dit/train_hunyuan_dit_lora.py \
+ --pretrained_path models/HunyuanDiT/t2i \
+ --dataset_path data/dog \
+ --output_path ./models \
+ --max_epochs 1 \
+ --steps_per_epoch 500 \
+ --height 1024 \
+ --width 1024 \
+ --center_crop \
+ --precision "16-mixed" \
+ --learning_rate 1e-4 \
+ --lora_rank 4 \
+ --lora_alpha 4 \
+ --use_gradient_checkpointing
+```
+
+For more information about the parameters, please use `python examples/train/hunyuan_dit/train_hunyuan_dit_lora.py -h` to see the details.
+
+After training, use `model_manager.load_lora` to load the LoRA for inference.
+
+```python
+from diffsynth import ModelManager, HunyuanDiTImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+ file_path_list=[
+ "models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin",
+ "models/HunyuanDiT/t2i/model/pytorch_model_ema.pt",
+ "models/HunyuanDiT/t2i/mt5/pytorch_model.bin",
+ "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin"
+ ])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = HunyuanDiTImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+ prompt="一只小狗蹦蹦跳跳,周围是姹紫嫣红的鲜花,远处是山脉",
+ negative_prompt="",
+ cfg_scale=7.5,
+ num_inference_steps=100, width=1024, height=1024,
+)
+image.save("image_with_lora.jpg")
+```
+
+### Stable Diffusion
+
+Only one file is required in the training script. We support the mainstream checkpoints in [CivitAI](https://civitai.com/). By default, we use the base Stable Diffusion v1.5. You can download it from [huggingface](https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors) or [modelscope](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-v1-5/resolve/master/v1-5-pruned-emaonly.safetensors). You can use the following code to download this file:
+
+```python
+from diffsynth import download_models
+
+download_models(["StableDiffusion_v15"])
+```
+
+```
+models/stable_diffusion
+├── Put Stable Diffusion checkpoints here.txt
+└── v1-5-pruned-emaonly.safetensors
+```
+
+Launch the training task using the following command:
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion/train_sd_lora.py \
+ --pretrained_path models/stable_diffusion/v1-5-pruned-emaonly.safetensors \
+ --dataset_path data/dog \
+ --output_path ./models \
+ --max_epochs 1 \
+ --steps_per_epoch 500 \
+ --height 512 \
+ --width 512 \
+ --center_crop \
+ --precision "16-mixed" \
+ --learning_rate 1e-4 \
+ --lora_rank 4 \
+ --lora_alpha 4 \
+ --use_gradient_checkpointing
+```
+
+For more information about the parameters, please use `python examples/train/stable_diffusion/train_sd_lora.py -h` to see the details.
+
+After training, use `model_manager.load_lora` to load the LoRA for inference.
+
+```python
+from diffsynth import ModelManager, SDImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+ file_path_list=["models/stable_diffusion/v1-5-pruned-emaonly.safetensors"])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = SDImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+ prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
+ negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
+ cfg_scale=7.5,
+ num_inference_steps=100, width=512, height=512,
+)
+image.save("image_with_lora.jpg")
+```
+
+### Stable Diffusion XL
+
+Only one file is required in the training script. We support the mainstream checkpoints in [CivitAI](https://civitai.com/). By default, we use the base Stable Diffusion XL. You can download it from [huggingface](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors) or [modelscope](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-xl-base-1.0/resolve/master/sd_xl_base_1.0.safetensors). You can use the following code to download this file:
+
+```python
+from diffsynth import download_models
+
+download_models(["StableDiffusionXL_v1"])
+```
+
+```
+models/stable_diffusion_xl
+├── Put Stable Diffusion XL checkpoints here.txt
+└── sd_xl_base_1.0.safetensors
+```
+
+We observed that Stable Diffusion XL is not float16-safe, so we recommend that users use float32.
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_xl/train_sdxl_lora.py \
+ --pretrained_path models/stable_diffusion_xl/sd_xl_base_1.0.safetensors \
+ --dataset_path data/dog \
+ --output_path ./models \
+ --max_epochs 1 \
+ --steps_per_epoch 500 \
+ --height 1024 \
+ --width 1024 \
+ --center_crop \
+ --precision "32" \
+ --learning_rate 1e-4 \
+ --lora_rank 4 \
+ --lora_alpha 4 \
+ --use_gradient_checkpointing
+```
+
+For more information about the parameters, please use `python examples/train/stable_diffusion_xl/train_sdxl_lora.py -h` to see the details.
+
+After training, use `model_manager.load_lora` to load the LoRA for inference.
+
+```python
+from diffsynth import ModelManager, SDXLImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+ file_path_list=["models/stable_diffusion_xl/sd_xl_base_1.0.safetensors"])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = SDXLImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+ prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
+ negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
+ cfg_scale=7.5,
+ num_inference_steps=100, width=1024, height=1024,
+)
+image.save("image_with_lora.jpg")
+```
diff --git a/docs/source_en/GetStarted/Installation.md b/docs/source_en/GetStarted/Installation.md
index 1afa8fc..6bfb809 100644
--- a/docs/source_en/GetStarted/Installation.md
+++ b/docs/source_en/GetStarted/Installation.md
@@ -1,24 +1,24 @@
-# Installation
-
-## From Source
-
-1. Clone the source repository:
-
- ```bash
- git clone https://github.com/modelscope/DiffSynth-Studio.git
- ```
-
-2. Navigate to the project directory and install:
-
- ```bash
- cd DiffSynth-Studio
- pip install -e .
- ```
-
-## From PyPI
-
-Install directly via PyPI:
-
-```bash
-pip install diffsynth
+# Installation
+
+## From Source
+
+1. Clone the source repository:
+
+ ```bash
+ git clone https://github.com/modelscope/DiffSynth-Studio.git
+ ```
+
+2. Navigate to the project directory and install:
+
+ ```bash
+ cd DiffSynth-Studio
+ pip install -e .
+ ```
+
+## From PyPI
+
+Install directly via PyPI:
+
+```bash
+pip install diffsynth
```
\ No newline at end of file
diff --git a/docs/source_en/GetStarted/Models.md b/docs/source_en/GetStarted/Models.md
index fe7e1a2..b7127db 100644
--- a/docs/source_en/GetStarted/Models.md
+++ b/docs/source_en/GetStarted/Models.md
@@ -1,17 +1,17 @@
-# Models
-
-Until now, DiffSynth Studio has supported the following models:
-
-* [FLUX](https://huggingface.co/black-forest-labs/FLUX.1-dev)
-* [ExVideo](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1)
-* [Kolors](https://huggingface.co/Kwai-Kolors/Kolors)
-* [Stable Diffusion 3](https://huggingface.co/stabilityai/stable-diffusion-3-medium)
-* [Stable Video Diffusion](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt)
-* [Hunyuan-DiT](https://github.com/Tencent/HunyuanDiT)
-* [RIFE](https://github.com/hzwer/ECCV2022-RIFE)
-* [ESRGAN](https://github.com/xinntao/ESRGAN)
-* [Ip-Adapter](https://github.com/tencent-ailab/IP-Adapter)
-* [AnimateDiff](https://github.com/guoyww/animatediff/)
-* [ControlNet](https://github.com/lllyasviel/ControlNet)
-* [Stable Diffusion XL](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
-* [Stable Diffusion](https://huggingface.co/runwayml/stable-diffusion-v1-5)
+# Models
+
+Until now, DiffSynth Studio has supported the following models:
+
+* [FLUX](https://huggingface.co/black-forest-labs/FLUX.1-dev)
+* [ExVideo](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1)
+* [Kolors](https://huggingface.co/Kwai-Kolors/Kolors)
+* [Stable Diffusion 3](https://huggingface.co/stabilityai/stable-diffusion-3-medium)
+* [Stable Video Diffusion](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt)
+* [Hunyuan-DiT](https://github.com/Tencent/HunyuanDiT)
+* [RIFE](https://github.com/hzwer/ECCV2022-RIFE)
+* [ESRGAN](https://github.com/xinntao/ESRGAN)
+* [Ip-Adapter](https://github.com/tencent-ailab/IP-Adapter)
+* [AnimateDiff](https://github.com/guoyww/animatediff/)
+* [ControlNet](https://github.com/lllyasviel/ControlNet)
+* [Stable Diffusion XL](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+* [Stable Diffusion](https://huggingface.co/runwayml/stable-diffusion-v1-5)
diff --git a/docs/source_en/GetStarted/Pipelines.md b/docs/source_en/GetStarted/Pipelines.md
index 9d5b7de..9ca2e73 100644
--- a/docs/source_en/GetStarted/Pipelines.md
+++ b/docs/source_en/GetStarted/Pipelines.md
@@ -1,27 +1,27 @@
-# Pipelines
-
-So far, the following table lists our pipelines and the models supported by each pipeline.
-
-## Image Pipelines
-
-Pipelines for generating images from text descriptions. Each pipeline relies on specific encoder and decoder models.
-
-| Pipeline | Models |
-|----------------------------|----------------------------------------------------------------|
-| HunyuanDiTImagePipeline | text_encoder: HunyuanDiTCLIPTextEncoder
text_encoder_t5: HunyuanDiTT5TextEncoder
dit: HunyuanDiT
vae_decoder: SDVAEDecoder
vae_encoder: SDVAEEncoder |
-| SDImagePipeline | text_encoder: SDTextEncoder
unet: SDUNet
vae_decoder: SDVAEDecoder
vae_encoder: SDVAEEncoder
controlnet: MultiControlNetManager
ipadapter_image_encoder: IpAdapterCLIPImageEmbedder
ipadapter: SDIpAdapter |
-| SD3ImagePipeline | text_encoder_1: SD3TextEncoder1
text_encoder_2: SD3TextEncoder2
text_encoder_3: SD3TextEncoder3
dit: SD3DiT
vae_decoder: SD3VAEDecoder
vae_encoder: SD3VAEEncoder |
-| SDXLImagePipeline | text_encoder: SDXLTextEncoder
text_encoder_2: SDXLTextEncoder2
text_encoder_kolors: ChatGLMModel
unet: SDXLUNet
vae_decoder: SDXLVAEDecoder
vae_encoder: SDXLVAEEncoder
controlnet: MultiControlNetManager
ipadapter_image_encoder: IpAdapterXLCLIPImageEmbedder
ipadapter: SDXLIpAdapter |
-
-## Video Pipelines
-
-Pipelines for generating videos from text descriptions. In addition to the models required for image generation, they include models for handling motion modules.
-
-| Pipeline | Models |
-|----------------------------|----------------------------------------------------------------|
-| SDVideoPipeline | text_encoder: SDTextEncoder
unet: SDUNet
vae_decoder: SDVAEDecoder
vae_encoder: SDVAEEncoder
controlnet: MultiControlNetManager
ipadapter_image_encoder: IpAdapterCLIPImageEmbedder
ipadapter: SDIpAdapter
motion_modules: SDMotionModel |
-| SDXLVideoPipeline | text_encoder: SDXLTextEncoder
text_encoder_2: SDXLTextEncoder2
text_encoder_kolors: ChatGLMModel
unet: SDXLUNet
vae_decoder: SDXLVAEDecoder
vae_encoder: SDXLVAEEncoder
ipadapter_image_encoder: IpAdapterXLCLIPImageEmbedder
ipadapter: SDXLIpAdapter
motion_modules: SDXLMotionModel |
-| SVDVideoPipeline | image_encoder: SVDImageEncoder
unet: SVDUNet
vae_encoder: SVDVAEEncoder
vae_decoder: SVDVAEDecoder |
-
-
-
+# Pipelines
+
+The following tables list the pipelines currently available and the models supported by each pipeline.
+
+## Image Pipelines
+
+Pipelines for generating images from text descriptions. Each pipeline relies on specific encoder and decoder models.
+
+| Pipeline | Models |
+|----------------------------|----------------------------------------------------------------|
+| HunyuanDiTImagePipeline | text_encoder: HunyuanDiTCLIPTextEncoder<br>text_encoder_t5: HunyuanDiTT5TextEncoder<br>dit: HunyuanDiT<br>vae_decoder: SDVAEDecoder<br>vae_encoder: SDVAEEncoder |
+| SDImagePipeline | text_encoder: SDTextEncoder<br>unet: SDUNet<br>vae_decoder: SDVAEDecoder<br>vae_encoder: SDVAEEncoder<br>controlnet: MultiControlNetManager<br>ipadapter_image_encoder: IpAdapterCLIPImageEmbedder<br>ipadapter: SDIpAdapter |
+| SD3ImagePipeline | text_encoder_1: SD3TextEncoder1<br>text_encoder_2: SD3TextEncoder2<br>text_encoder_3: SD3TextEncoder3<br>dit: SD3DiT<br>vae_decoder: SD3VAEDecoder<br>vae_encoder: SD3VAEEncoder |
+| SDXLImagePipeline | text_encoder: SDXLTextEncoder<br>text_encoder_2: SDXLTextEncoder2<br>text_encoder_kolors: ChatGLMModel<br>unet: SDXLUNet<br>vae_decoder: SDXLVAEDecoder<br>vae_encoder: SDXLVAEEncoder<br>controlnet: MultiControlNetManager<br>ipadapter_image_encoder: IpAdapterXLCLIPImageEmbedder<br>ipadapter: SDXLIpAdapter |
+
+## Video Pipelines
+
+Pipelines for generating videos from text descriptions. In addition to the models required for image generation, they include models for handling motion modules.
+
+| Pipeline | Models |
+|----------------------------|----------------------------------------------------------------|
+| SDVideoPipeline | text_encoder: SDTextEncoder<br>unet: SDUNet<br>vae_decoder: SDVAEDecoder<br>vae_encoder: SDVAEEncoder<br>controlnet: MultiControlNetManager<br>ipadapter_image_encoder: IpAdapterCLIPImageEmbedder<br>ipadapter: SDIpAdapter<br>motion_modules: SDMotionModel |
+| SDXLVideoPipeline | text_encoder: SDXLTextEncoder<br>text_encoder_2: SDXLTextEncoder2<br>text_encoder_kolors: ChatGLMModel<br>unet: SDXLUNet<br>vae_decoder: SDXLVAEDecoder<br>vae_encoder: SDXLVAEEncoder<br>ipadapter_image_encoder: IpAdapterXLCLIPImageEmbedder<br>ipadapter: SDXLIpAdapter<br>motion_modules: SDXLMotionModel |
+| SVDVideoPipeline | image_encoder: SVDImageEncoder<br>unet: SVDUNet<br>vae_encoder: SVDVAEEncoder<br>vae_decoder: SVDVAEDecoder |
+
+
+
diff --git a/docs/source_en/GetStarted/PromptProcessing.md b/docs/source_en/GetStarted/PromptProcessing.md
index c86c762..a2043b0 100644
--- a/docs/source_en/GetStarted/PromptProcessing.md
+++ b/docs/source_en/GetStarted/PromptProcessing.md
@@ -1,35 +1,35 @@
-# Prompt Processing
-
-DiffSynth includes prompt processing functionality, which is divided into:
-
-- **Prompt Refiners (`prompt_refiner_classes`)**: Includes prompt refinement, prompt translation from Chinese to English, and both refinement and translation of prompts. Available parameters are as follows:
-
- - **English Prompt Refinement**: 'BeautifulPrompt', using the model [pai-bloom-1b1-text2prompt-sd](https://modelscope.cn/models/AI-ModelScope/pai-bloom-1b1-text2prompt-sd).
-
- - **Prompt Translation from Chinese to English**: 'Translator', using the model [opus-mt-zh-e](https://modelscope.cn/models/moxying/opus-mt-zh-en).
-
- - **Prompt Translation and Refinement**: 'QwenPrompt', using the model [Qwen2-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2-1.5B-Instruct).
-
-- **Prompt Extenders (`prompt_extender_classes`)**: Based on Omost's prompt partition control expansion. Available parameter is:
-
- - **Prompt Partition Expansion**: 'OmostPromter'.
-
-## Usage Instructions
-
-### Prompt Refiners
-
-When loading the model pipeline, you can specify the desired prompt refiner functionality using the `prompt_refiner_classes` parameter. For example code, refer to [sd_prompt_refining.py](examples/image_synthesis/sd_prompt_refining.py).
-
-Available `prompt_refiner_classes` parameters include: Translator, BeautifulPrompt, QwenPrompt.
-
-```python
-pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[Translator, BeautifulPrompt])
-```
-
-### Prompt Extenders
-
-When loading the model pipeline, you can specify the desired prompt extender using the prompt_extender_classes parameter. For example code, refer to [omost_flux_text_to_image.py](examples/image_synthesis/omost_flux_text_to_image.py).
-
-```python
-pipe = FluxImagePipeline.from_model_manager(model_manager, prompt_extender_classes=[OmostPromter])
-```
+# Prompt Processing
+
+DiffSynth includes prompt processing functionality, which is divided into:
+
+- **Prompt Refiners (`prompt_refiner_classes`)**: Includes prompt refinement, prompt translation from Chinese to English, and both refinement and translation of prompts. Available parameters are as follows:
+
+ - **English Prompt Refinement**: 'BeautifulPrompt', using the model [pai-bloom-1b1-text2prompt-sd](https://modelscope.cn/models/AI-ModelScope/pai-bloom-1b1-text2prompt-sd).
+
+ - **Prompt Translation from Chinese to English**: 'Translator', using the model [opus-mt-zh-en](https://modelscope.cn/models/moxying/opus-mt-zh-en).
+
+ - **Prompt Translation and Refinement**: 'QwenPrompt', using the model [Qwen2-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2-1.5B-Instruct).
+
+- **Prompt Extenders (`prompt_extender_classes`)**: Based on Omost's prompt partition control expansion. Available parameter is:
+
+ - **Prompt Partition Expansion**: 'OmostPromter'.
+
+## Usage Instructions
+
+### Prompt Refiners
+
+When loading the model pipeline, you can specify the desired prompt refiner functionality using the `prompt_refiner_classes` parameter. For example code, refer to [sd_prompt_refining.py](examples/image_synthesis/sd_prompt_refining.py).
+
+Available `prompt_refiner_classes` parameters include: Translator, BeautifulPrompt, QwenPrompt.
+
+```python
+pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[Translator, BeautifulPrompt])
+```
+
+### Prompt Extenders
+
+When loading the model pipeline, you can specify the desired prompt extender using the `prompt_extender_classes` parameter. For example code, refer to [omost_flux_text_to_image.py](examples/image_synthesis/omost_flux_text_to_image.py).
+
+```python
+pipe = FluxImagePipeline.from_model_manager(model_manager, prompt_extender_classes=[OmostPromter])
+```
diff --git a/docs/source_en/GetStarted/Schedulers.md b/docs/source_en/GetStarted/Schedulers.md
index 4bc25e5..495293f 100644
--- a/docs/source_en/GetStarted/Schedulers.md
+++ b/docs/source_en/GetStarted/Schedulers.md
@@ -1,11 +1,11 @@
-# Schedulers
-
-Schedulers control the entire denoising (or sampling) process of the model. When loading the Pipeline, DiffSynth automatically selects the most suitable schedulers for the current Pipeline, requiring no additional configuration.
-
-The supported schedulers are:
-
-- **EnhancedDDIMScheduler**: Extends the denoising process introduced in the Denoising Diffusion Probabilistic Models (DDPM) with non-Markovian guidance.
-
-- **FlowMatchScheduler**: Implements the flow matching sampling method introduced in Stable Diffusion 3.
-
+# Schedulers
+
+Schedulers control the entire denoising (or sampling) process of the model. When loading the Pipeline, DiffSynth automatically selects the most suitable schedulers for the current Pipeline, requiring no additional configuration.
+
+The supported schedulers are:
+
+- **EnhancedDDIMScheduler**: Extends the denoising process introduced in the Denoising Diffusion Probabilistic Models (DDPM) with non-Markovian guidance.
+
+- **FlowMatchScheduler**: Implements the flow matching sampling method introduced in Stable Diffusion 3.
+
- **ContinuousODEScheduler**: A scheduler based on Ordinary Differential Equations (ODE).
\ No newline at end of file
diff --git a/docs/source_en/conf.py b/docs/source_en/conf.py
index 97d42e2..0aa024b 100644
--- a/docs/source_en/conf.py
+++ b/docs/source_en/conf.py
@@ -1,50 +1,50 @@
-# Configuration file for the Sphinx documentation builder.
-#
-# For the full list of built-in configuration values, see the documentation:
-# https://www.sphinx-doc.org/en/master/usage/configuration.html
-
-# -- Project information -----------------------------------------------------
-# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
-
-
-import os
-import sys
-sys.path.insert(0, os.path.abspath('../../diffsynth'))
-
-project = 'DiffSynth-Studio'
-copyright = '2024, ModelScope'
-author = 'ModelScope'
-release = '0.1.0'
-
-
-# -- General configuration ---------------------------------------------------
-# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
-
-extensions = [
- 'sphinx.ext.autodoc',
- 'sphinx.ext.napoleon',
- 'sphinx.ext.doctest',
- 'sphinx.ext.intersphinx',
- 'sphinx.ext.todo',
- 'sphinx.ext.coverage',
- 'sphinx.ext.imgmath',
- 'sphinx.ext.viewcode',
- 'recommonmark',
- 'sphinx_markdown_tables'
-]
-
-templates_path = ['_templates']
-exclude_patterns = []
-
-
-source_suffix = ['.rst', '.md']
-# -- Options for HTML output -------------------------------------------------
-# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
-
-html_theme = 'sphinx_rtd_theme'
-html_static_path = ['_static']
-# multi-language docs
-language = 'en'
-locale_dirs = ['../locales/'] # path is example but recommended.
-gettext_compact = False # optional.
+# Configuration file for the Sphinx documentation builder.
+#
+# For the full list of built-in configuration values, see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Project information -----------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+
+
+import os
+import sys
+sys.path.insert(0, os.path.abspath('../../diffsynth'))
+
+project = 'DiffSynth-Studio'
+copyright = '2024, ModelScope'
+author = 'ModelScope'
+release = '0.1.0'
+
+
+# -- General configuration ---------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
+
+extensions = [
+ 'sphinx.ext.autodoc',
+ 'sphinx.ext.napoleon',
+ 'sphinx.ext.doctest',
+ 'sphinx.ext.intersphinx',
+ 'sphinx.ext.todo',
+ 'sphinx.ext.coverage',
+ 'sphinx.ext.imgmath',
+ 'sphinx.ext.viewcode',
+ 'recommonmark',
+ 'sphinx_markdown_tables'
+]
+
+templates_path = ['_templates']
+exclude_patterns = []
+
+
+source_suffix = ['.rst', '.md']
+# -- Options for HTML output -------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
+
+html_theme = 'sphinx_rtd_theme'
+html_static_path = ['_static']
+# multi-language docs
+language = 'en'
+locale_dirs = ['../locales/'] # path is example but recommended.
+gettext_compact = False # optional.
gettext_uuid = True # optional.
\ No newline at end of file
diff --git a/docs/source_en/creating/AdaptersForImageSynthesis.md b/docs/source_en/creating/AdaptersForImageSynthesis.md
new file mode 100644
index 0000000..9f03cc2
--- /dev/null
+++ b/docs/source_en/creating/AdaptersForImageSynthesis.md
@@ -0,0 +1,135 @@
+# ControlNet, LoRA, IP-Adapter — Precision Control Technology
+
+On top of a text-to-image model, various adapter-based models can be used to control the generation process.
+
+Let's download the models we'll be using in the upcoming examples:
+
+* A highly praised Stable Diffusion XL architecture anime-style model
+* A ControlNet model that supports multiple control modes
+* A LoRA model for the Stable Diffusion XL model
+* An IP-Adapter model and its corresponding image encoder
+
+The identifiers passed to `download_models` below correspond to the models listed above.
+
+```python
+from diffsynth import download_models
+
+download_models([
+ "BluePencilXL_v200",
+ "ControlNet_union_sdxl_promax",
+ "SDXL_lora_zyd232_ChineseInkStyle_SDXL_v1_0",
+ "IP-Adapter-SDXL"
+])
+```
+
+Using basic text-to-image functionality to generate a picture.
+
+```python
+from diffsynth import ModelManager, SDXLImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
+model_manager.load_models(["models/stable_diffusion_xl/bluePencilXL_v200.safetensors"])
+pipe = SDXLImagePipeline.from_model_manager(model_manager)
+torch.manual_seed(1)
+image = pipe(
+ prompt="masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait,",
+ negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
+ cfg_scale=6, num_inference_steps=60,
+)
+image.save("image.jpg")
+```
+
+
+
+Next, let's transform this graceful underwater dancer into a fire mage! We'll activate the ControlNet to maintain the structure of the image while modifying the prompt.
+
+```python
+from diffsynth import ModelManager, SDXLImagePipeline, ControlNetConfigUnit
+import torch
+from PIL import Image
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
+model_manager.load_models([
+ "models/stable_diffusion_xl/bluePencilXL_v200.safetensors",
+ "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors"
+])
+pipe = SDXLImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
+ ControlNetConfigUnit("depth", "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors", scale=1)
+])
+torch.manual_seed(2)
+image = pipe(
+ prompt="masterpiece, best quality, solo, long hair, wavy hair, pink hair, red eyes, red dress, medium breasts, dress, fire ball, fire background, floating hair, refraction, portrait,",
+ negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw, white background",
+ cfg_scale=6, num_inference_steps=60,
+ controlnet_image=Image.open("image.jpg")
+)
+image.save("image_controlnet.jpg")
+```
+
+
+
+Isn't that cool? There's more! Add a LoRA to make the image closer to the flat style of hand-drawn comics. This LoRA requires certain trigger words to take effect, which is mentioned on the original author's model page. Remember to add the trigger words at the beginning of the prompt.
+
+```python
+from diffsynth import ModelManager, SDXLImagePipeline, ControlNetConfigUnit
+import torch
+from PIL import Image
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
+model_manager.load_models([
+ "models/stable_diffusion_xl/bluePencilXL_v200.safetensors",
+ "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors"
+])
+model_manager.load_lora("models/lora/zyd232_ChineseInkStyle_SDXL_v1_0.safetensors", lora_alpha=1.0)
+pipe = SDXLImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
+ ControlNetConfigUnit("depth", "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors", scale=1.0)
+])
+torch.manual_seed(3)
+image = pipe(
+ prompt="zydink, ink sketch, flat anime, masterpiece, best quality, solo, long hair, wavy hair, pink hair, red eyes, red dress, medium breasts, dress, fire ball, fire background, floating hair, refraction, portrait,",
+ negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw, white background",
+ cfg_scale=6, num_inference_steps=60,
+ controlnet_image=Image.open("image.jpg")
+)
+image.save("image_lora.jpg")
+```
+
+
+
+Not done yet! Find a Chinese painting with ink-wash style as a style guide, activate the IP-Adapter, and let classical art collide with modern aesthetics!
+
+| Let's use this image as a style guide. ||
+|-|-|
+
+```python
+from diffsynth import ModelManager, SDXLImagePipeline, ControlNetConfigUnit
+import torch
+from PIL import Image
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
+model_manager.load_models([
+ "models/stable_diffusion_xl/bluePencilXL_v200.safetensors",
+ "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors",
+ "models/IpAdapter/stable_diffusion_xl/ip-adapter_sdxl.bin",
+ "models/IpAdapter/stable_diffusion_xl/image_encoder/model.safetensors",
+])
+model_manager.load_lora("models/lora/zyd232_ChineseInkStyle_SDXL_v1_0.safetensors", lora_alpha=1.0)
+pipe = SDXLImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
+ ControlNetConfigUnit("depth", "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors", scale=1.0)
+])
+torch.manual_seed(2)
+image = pipe(
+ prompt="zydink, ink sketch, flat anime, masterpiece, best quality, solo, long hair, wavy hair, pink hair, red eyes, red dress, medium breasts, dress, fire ball, fire background, floating hair, refraction, portrait,",
+ negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw, white background",
+ cfg_scale=6, num_inference_steps=60,
+ controlnet_image=Image.open("image.jpg"),
+ ipadapter_images=[Image.open("ink_style.jpg")],
+ ipadapter_use_instant_style=True, ipadapter_scale=0.5
+)
+image.save("image_ipadapter.jpg")
+```
+
+
+
+The joy of generating images with Diffusion lies in the combination of various ecosystem models, which can realize all kinds of creative ideas.
diff --git a/docs/source_en/creating/BasicImageSynthesis.md b/docs/source_en/creating/BasicImageSynthesis.md
new file mode 100644
index 0000000..4f0a785
--- /dev/null
+++ b/docs/source_en/creating/BasicImageSynthesis.md
@@ -0,0 +1,64 @@
+# Text-to-Image, Image-to-Image, and High-Resolution Restoration - First Encounter with the Dazzling Diffusion.
+
+Load the text-to-image model, here we use an anime-style model from Civitai as an example.
+
+```python
+import torch
+from diffsynth import ModelManager, SDImagePipeline, download_models
+
+download_models(["AingDiffusion_v12"])
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
+model_manager.load_models(["models/stable_diffusion/aingdiffusion_v12.safetensors"])
+pipe = SDImagePipeline.from_model_manager(model_manager)
+```
+
+Generate a picture to give it a try.
+
+```python
+torch.manual_seed(0)
+image = pipe(
+ prompt="masterpiece, best quality, a girl with long silver hair",
+ negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
+ height=512, width=512, num_inference_steps=80,
+)
+image.save("image.jpg")
+```
+
+Ah, a lovely young lady.
+
+
+
+Use the image-to-image feature to turn her hair red, simply by adding `input_image` and `denoising_strength` as parameters. The `denoising_strength` parameter controls the intensity of the added noise: when set to 0, the generated image will be identical to the input image, and when set to 1, it will be completely randomly generated.
+
+```python
+torch.manual_seed(1)
+image_edited = pipe(
+ prompt="masterpiece, best quality, a girl with long red hair",
+ negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
+ height=512, width=512, num_inference_steps=80,
+ input_image=image, denoising_strength=0.6,
+)
+image_edited.save("image_edited.jpg")
+```
+
+Ah, a cute girl with red hair.
+
+
+
+Since the model itself was trained at a resolution of 512*512, the image appears a bit blurry. However, we can utilize the model's own capabilities to refine the image and add details. Specifically, this involves increasing the resolution and then using image-to-image generation.
+```python
+torch.manual_seed(2)
+image_highres = pipe(
+ prompt="masterpiece, best quality, a girl with long red hair",
+ negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
+ height=1024, width=1024, num_inference_steps=80,
+ input_image=image_edited.resize((1024, 1024)), denoising_strength=0.6,
+)
+image_highres.save("image_highres.jpg")
+```
+
+Ah, a clear and lovely girl with red hair.
+
+
+
+It's worth noting that the image-to-image and high-resolution restoration features are globally supported, and currently, all of our image generation pipelines can be used in this way.
\ No newline at end of file
diff --git a/docs/source_en/creating/PromptRefine.md b/docs/source_en/creating/PromptRefine.md
new file mode 100644
index 0000000..99d76e7
--- /dev/null
+++ b/docs/source_en/creating/PromptRefine.md
@@ -0,0 +1,77 @@
+# Translation and Polishing — The Magic of Prompt Words
+
+When generating images, we need to write prompt words to describe the content of the image. Prompt words directly affect the outcome of the generation, but crafting them is also an art. Good prompt words can produce images with a high degree of aesthetic appeal. We offer a range of models to help users handle prompt words effectively.
+
+## Translation
+
+Most text-to-image models currently only support English prompt words, which can be challenging for users who are not native English speakers. To address this, we can use open-source translation models to translate the prompt words into English. In the following example, we take "一个女孩" (a girl) as the prompt word and use the model opus-mt-zh-en for translation (which can be downloaded from [HuggingFace](https://huggingface.co/Helsinki-NLP/opus-mt-zh-en) or [ModelScope](https://modelscope.cn/models/moxying/opus-mt-zh-en)).
+```python
+from diffsynth import ModelManager, SDXLImagePipeline, Translator
+import torch
+
+model_manager = ModelManager(
+ torch_dtype=torch.float16, device="cuda",
+ model_id_list=["BluePencilXL_v200", "opus-mt-zh-en"]
+)
+pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[Translator])
+
+torch.manual_seed(0)
+prompt = "一个女孩"
+image = pipe(
+ prompt=prompt, negative_prompt="",
+ height=1024, width=1024, num_inference_steps=30
+)
+image.save("image_1.jpg")
+```
+
+
+
+## Polishing
+
+Detailed prompt words can generate images with richer details. We can use a prompt polishing model like BeautifulPrompt (which can be downloaded from [HuggingFace](https://huggingface.co/alibaba-pai/pai-bloom-1b1-text2prompt-sd) or [ModelScope](https://modelscope.cn/models/AI-ModelScope/pai-bloom-1b1-text2prompt-sd)) to embellish simple prompt words. This model can make the overall picture style more gorgeous.
+
+This module can be activated simultaneously with the translation module, but please pay attention to the order: translate first, then polish.
+
+```python
+from diffsynth import ModelManager, SDXLImagePipeline, Translator, BeautifulPrompt
+import torch
+
+model_manager = ModelManager(
+ torch_dtype=torch.float16, device="cuda",
+ model_id_list=["BluePencilXL_v200", "opus-mt-zh-en", "BeautifulPrompt"]
+)
+pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[Translator, BeautifulPrompt])
+
+torch.manual_seed(0)
+prompt = "一个女孩"
+image = pipe(
+ prompt=prompt, negative_prompt="",
+ height=1024, width=1024, num_inference_steps=30
+)
+image.save("image_2.jpg")
+```
+
+
+
+We have also integrated a Tongyi Qwen model that can seamlessly complete the translation and polishing of prompt words in one step.
+
+```python
+from diffsynth import ModelManager, SDXLImagePipeline, QwenPrompt
+import torch
+
+model_manager = ModelManager(
+ torch_dtype=torch.float16, device="cuda",
+ model_id_list=["BluePencilXL_v200", "QwenPrompt"]
+)
+pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[QwenPrompt])
+
+torch.manual_seed(0)
+prompt = "一个女孩"
+image = pipe(
+ prompt=prompt, negative_prompt="",
+ height=1024, width=1024, num_inference_steps=30
+)
+image.save("image_3.jpg")
+```
+
+
diff --git a/docs/source_en/creating/ToonShading.md b/docs/source_en/creating/ToonShading.md
new file mode 100644
index 0000000..87a29b8
--- /dev/null
+++ b/docs/source_en/creating/ToonShading.md
@@ -0,0 +1,95 @@
+# When Image Models Meet AnimateDiff—Model Combination Technology
+
+We have already witnessed the powerful image generation capabilities of the Stable Diffusion model and its ecosystem models. Now, we introduce a new module: AnimateDiff, which allows us to transfer the capabilities of image models to videos. In this article, we showcase an anime-style video rendering solution built on DiffSynth-Studio: Diffutoon.
+
+## Download Models
+
+The following examples will use many models, so let's download them first.
+
+* An anime-style Stable Diffusion architecture model
+* Two ControlNet models
+* A Textual Inversion model
+* An AnimateDiff model
+
+```python
+from diffsynth import download_models
+
+download_models([
+ "AingDiffusion_v12",
+ "AnimateDiff_v2",
+ "ControlNet_v11p_sd15_lineart",
+ "ControlNet_v11f1e_sd15_tile",
+ "TextualInversion_VeryBadImageNegative_v1.3"
+])
+```
+
+## Download Video
+
+You can choose any video you like. We use [this video](https://www.bilibili.com/video/BV1iG411a7sQ) as a demonstration. You can download this video file with the following command, but please note, do not use it for commercial purposes without obtaining the commercial copyright from the original video creator.
+
+```
+modelscope download --dataset Artiprocher/examples_in_diffsynth data/examples/diffutoon/input_video.mp4 --local_dir ./
+```
+
+## Generate Anime
+
+```python
+from diffsynth import ModelManager, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video
+import torch
+
+# Load models
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
+model_manager.load_models([
+ "models/stable_diffusion/aingdiffusion_v12.safetensors",
+ "models/AnimateDiff/mm_sd_v15_v2.ckpt",
+ "models/ControlNet/control_v11p_sd15_lineart.pth",
+ "models/ControlNet/control_v11f1e_sd15_tile.pth",
+])
+
+# Build pipeline
+pipe = SDVideoPipeline.from_model_manager(
+ model_manager,
+ [
+ ControlNetConfigUnit(
+ processor_id="tile",
+ model_path="models/ControlNet/control_v11f1e_sd15_tile.pth",
+ scale=0.5
+ ),
+ ControlNetConfigUnit(
+ processor_id="lineart",
+ model_path="models/ControlNet/control_v11p_sd15_lineart.pth",
+ scale=0.5
+ )
+ ]
+)
+pipe.prompter.load_textual_inversions(["models/textual_inversion/verybadimagenegative_v1.3.pt"])
+
+# Load video
+video = VideoData(
+ video_file="data/examples/diffutoon/input_video.mp4",
+ height=1536, width=1536
+)
+input_video = [video[i] for i in range(30)]
+
+# Generate
+torch.manual_seed(0)
+output_video = pipe(
+ prompt="best quality, perfect anime illustration, light, a girl is dancing, smile, solo",
+ negative_prompt="verybadimagenegative_v1.3",
+ cfg_scale=7, clip_skip=2,
+ input_frames=input_video, denoising_strength=1.0,
+ controlnet_frames=input_video, num_frames=len(input_video),
+ num_inference_steps=10, height=1536, width=1536,
+ animatediff_batch_size=16, animatediff_stride=8,
+)
+
+# Save video
+save_video(output_video, "output_video.mp4", fps=30)
+```
+
+## Effect Display
+
+
diff --git a/docs/source_en/finetune/overview.md b/docs/source_en/finetune/overview.md
new file mode 100644
index 0000000..b18abe1
--- /dev/null
+++ b/docs/source_en/finetune/overview.md
@@ -0,0 +1,98 @@
+# Training Framework
+
+We have implemented a training framework for text-to-image diffusion models, allowing users to effortlessly train LoRA models with our framework. Our provided scripts come with the following features:
+
+* **Comprehensive Functionality**: Our training framework supports multi-GPU and multi-node configurations, is optimized for acceleration with DeepSpeed, and includes gradient checkpointing to accommodate models with higher memory requirements.
+* **Succinct Code**: We have avoided large, complex code blocks. The general module is implemented in `diffsynth/trainers/text_to_image.py`, while model-specific training scripts contain only the minimal code necessary for the model architecture, facilitating ease of use for academic researchers.
+* **Modular Design**: Built on the versatile PyTorch Lightning framework, our training framework is decoupled in functionality, enabling developers to easily incorporate additional training techniques by modifying our scripts to suit their specific needs.
+
+Examples of images generated by models fine-tuned with LoRA. The prompt is "一只小狗蹦蹦跳跳,周围是姹紫嫣红的鲜花,远处是山脉" ("A little dog jumping around, surrounded by colorful flowers, with mountains in the distance"; for Chinese models) or "a dog is jumping, flowers around the dog, the background is mountains and clouds" (for English models).
+
+||FLUX.1-dev|Kolors|Stable Diffusion 3|Hunyuan-DiT|
+|-|-|-|-|-|
+|Without LoRA|||||
+|With LoRA|||||
+
+## Install Additional Packages
+
+```bash
+pip install peft lightning
+```
+
+## Prepare the Dataset
+
+We provide an [example dataset](https://modelscope.cn/datasets/buptwq/lora-stable-diffusion-finetune/files). You need to organize your training dataset in the following structure:
+
+```
+data/dog/
+└── train
+ ├── 00.jpg
+ ├── 01.jpg
+ ├── 02.jpg
+ ├── 03.jpg
+ ├── 04.jpg
+ └── metadata.csv
+```
+
+`metadata.csv`:
+
+```
+file_name,text
+00.jpg,a dog
+01.jpg,a dog
+02.jpg,a dog
+03.jpg,a dog
+04.jpg,a dog
+```
+
+Please note that if the model is a Chinese model (e.g., Hunyuan-DiT and Kolors), we recommend using Chinese text in the dataset. For example:
+
+```
+file_name,text
+00.jpg,一只小狗
+01.jpg,一只小狗
+02.jpg,一只小狗
+03.jpg,一只小狗
+04.jpg,一只小狗
+```
+
+## Train LoRA Model
+
+General parameter options:
+
+```
+ --lora_target_modules LORA_TARGET_MODULES
+ Layers where the LoRA modules are located.
+ --dataset_path DATASET_PATH
+ Path to the dataset.
+ --output_path OUTPUT_PATH
+ Path where the model will be saved.
+ --steps_per_epoch STEPS_PER_EPOCH
+ Number of steps per epoch.
+ --height HEIGHT The height of the image.
+ --width WIDTH The width of the image.
+ --center_crop Whether to center crop the input image to the specified resolution. If not set, the image will be randomly cropped. The image will be resized to the specified resolution before cropping.
+ --random_flip Whether to randomly horizontally flip the image.
+ --batch_size BATCH_SIZE
+ Batch size for the training data loader (per device).
+ --dataloader_num_workers DATALOADER_NUM_WORKERS
+ The number of subprocesses used for data loading. A value of 0 means the data will be loaded in the main process.
+ --precision {32,16,16-mixed}
+ The precision for training.
+ --learning_rate LEARNING_RATE
+ The learning rate.
+ --lora_rank LORA_RANK
+ The dimension of the LoRA update matrix.
+ --lora_alpha LORA_ALPHA
+ The weight of the LoRA update matrix.
+ --use_gradient_checkpointing
+ Whether to use gradient checkpointing.
+ --accumulate_grad_batches ACCUMULATE_GRAD_BATCHES
+ The number of batches for gradient accumulation.
+ --training_strategy {auto,deepspeed_stage_1,deepspeed_stage_2,deepspeed_stage_3}
+ The training strategy.
+ --max_epochs MAX_EPOCHS
+ The number of training epochs.
+ --modelscope_model_id MODELSCOPE_MODEL_ID
+ The model ID on ModelScope (https://www.modelscope.cn/). If the model ID is provided, the model will be automatically uploaded to ModelScope.
+```
diff --git a/docs/source_en/finetune/train_flux_lora.md b/docs/source_en/finetune/train_flux_lora.md
new file mode 100644
index 0000000..47ada15
--- /dev/null
+++ b/docs/source_en/finetune/train_flux_lora.md
@@ -0,0 +1,70 @@
+# Training FLUX LoRA
+
+The following files will be used to build the FLUX model. You can download them from [HuggingFace](https://huggingface.co/black-forest-labs/FLUX.1-dev) or [ModelScope](https://www.modelscope.cn/models/ai-modelscope/flux.1-dev), or you can use the following code to download these files:
+```python
+from diffsynth import download_models
+
+download_models(["FLUX.1-dev"])
+```
+
+```
+models/FLUX/
+└── FLUX.1-dev
+ ├── ae.safetensors
+ ├── flux1-dev.safetensors
+ ├── text_encoder
+ │ └── model.safetensors
+ └── text_encoder_2
+ ├── config.json
+ ├── model-00001-of-00002.safetensors
+ ├── model-00002-of-00002.safetensors
+ └── model.safetensors.index.json
+```
+
+Start the training task with the following command:
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/flux/train_flux_lora.py \
+ --pretrained_text_encoder_path models/FLUX/FLUX.1-dev/text_encoder/model.safetensors \
+ --pretrained_text_encoder_2_path models/FLUX/FLUX.1-dev/text_encoder_2 \
+ --pretrained_dit_path models/FLUX/FLUX.1-dev/flux1-dev.safetensors \
+ --pretrained_vae_path models/FLUX/FLUX.1-dev/ae.safetensors \
+ --dataset_path data/dog \
+ --output_path ./models \
+ --max_epochs 1 \
+ --steps_per_epoch 500 \
+ --height 1024 \
+ --width 1024 \
+ --center_crop \
+ --precision "bf16" \
+ --learning_rate 1e-4 \
+ --lora_rank 4 \
+ --lora_alpha 4 \
+ --use_gradient_checkpointing
+```
+
+For more information on the parameters, please use `python examples/train/flux/train_flux_lora.py -h` to view detailed information.
+
+After the training is complete, use `model_manager.load_lora` to load the LoRA for inference.
+
+```python
+from diffsynth import ModelManager, FluxImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+ file_path_list=[
+ "models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
+ "models/FLUX/FLUX.1-dev/ae.safetensors",
+ "models/FLUX/FLUX.1-dev/flux1-dev.safetensors"
+ ])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = FluxImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+    prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
+ num_inference_steps=30, embedded_guidance=3.5
+)
+image.save("image_with_lora.jpg")
+```
diff --git a/docs/source_en/finetune/train_hunyuan_dit_lora.md b/docs/source_en/finetune/train_hunyuan_dit_lora.md
new file mode 100644
index 0000000..602d1bf
--- /dev/null
+++ b/docs/source_en/finetune/train_hunyuan_dit_lora.md
@@ -0,0 +1,72 @@
+# Training Hunyuan-DiT LoRA
+
+Building the Hunyuan DiT model requires four files. You can download these files from [HuggingFace](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT) or [ModelScope](https://www.modelscope.cn/models/modelscope/HunyuanDiT/summary). You can use the following code to download these files:
+
+```python
+from diffsynth import download_models
+
+download_models(["HunyuanDiT"])
+```
+
+```
+models/HunyuanDiT/
+├── Put Hunyuan DiT checkpoints here.txt
+└── t2i
+ ├── clip_text_encoder
+ │ └── pytorch_model.bin
+ ├── model
+ │ └── pytorch_model_ema.pt
+ ├── mt5
+ │ └── pytorch_model.bin
+ └── sdxl-vae-fp16-fix
+ └── diffusion_pytorch_model.bin
+```
+
+Use the following command to start the training task:
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/hunyuan_dit/train_hunyuan_dit_lora.py \
+ --pretrained_path models/HunyuanDiT/t2i \
+ --dataset_path data/dog \
+ --output_path ./models \
+ --max_epochs 1 \
+ --steps_per_epoch 500 \
+ --height 1024 \
+ --width 1024 \
+ --center_crop \
+ --precision "16-mixed" \
+ --learning_rate 1e-4 \
+ --lora_rank 4 \
+ --lora_alpha 4 \
+ --use_gradient_checkpointing
+```
+
+For more information about the parameters, please use `python examples/train/hunyuan_dit/train_hunyuan_dit_lora.py -h` to view detailed information.
+
+After the training is complete, use `model_manager.load_lora` to load the LoRA for inference.
+
+
+```python
+from diffsynth import ModelManager, HunyuanDiTImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+ file_path_list=[
+ "models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin",
+ "models/HunyuanDiT/t2i/model/pytorch_model_ema.pt",
+ "models/HunyuanDiT/t2i/mt5/pytorch_model.bin",
+ "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin"
+ ])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = HunyuanDiTImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+    prompt="A little puppy hops and jumps playfully, surrounded by a profusion of colorful flowers, "
+           "with a mountain range visible in the distance.",
+ negative_prompt="",
+ cfg_scale=7.5,
+ num_inference_steps=100, width=1024, height=1024,
+)
+image.save("image_with_lora.jpg")
+```
diff --git a/docs/source_en/finetune/train_kolors_lora.md b/docs/source_en/finetune/train_kolors_lora.md
new file mode 100644
index 0000000..dae9d5c
--- /dev/null
+++ b/docs/source_en/finetune/train_kolors_lora.md
@@ -0,0 +1,78 @@
+# Training Kolors LoRA
+
+The following files will be used to build Kolors. You can download Kolors from [HuggingFace](https://huggingface.co/Kwai-Kolors/Kolors) or [ModelScope](https://modelscope.cn/models/Kwai-Kolors/Kolors). Due to precision overflow issues, an additional VAE model is required (available from [HuggingFace](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) or [ModelScope](https://modelscope.cn/models/AI-ModelScope/sdxl-vae-fp16-fix)). You can use the following code to download these files:
+
+
+```python
+from diffsynth import download_models
+
+download_models(["Kolors", "SDXL-vae-fp16-fix"])
+```
+
+```
+models
+├── kolors
+│ └── Kolors
+│ ├── text_encoder
+│ │ ├── config.json
+│ │ ├── pytorch_model-00001-of-00007.bin
+│ │ ├── pytorch_model-00002-of-00007.bin
+│ │ ├── pytorch_model-00003-of-00007.bin
+│ │ ├── pytorch_model-00004-of-00007.bin
+│ │ ├── pytorch_model-00005-of-00007.bin
+│ │ ├── pytorch_model-00006-of-00007.bin
+│ │ ├── pytorch_model-00007-of-00007.bin
+│ │ └── pytorch_model.bin.index.json
+│ ├── unet
+│ │ └── diffusion_pytorch_model.safetensors
+│ └── vae
+│ └── diffusion_pytorch_model.safetensors
+└── sdxl-vae-fp16-fix
+ └── diffusion_pytorch_model.safetensors
+```
+
+Start the training task with the following command:
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/kolors/train_kolors_lora.py \
+ --pretrained_unet_path models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors \
+ --pretrained_text_encoder_path models/kolors/Kolors/text_encoder \
+ --pretrained_fp16_vae_path models/sdxl-vae-fp16-fix/diffusion_pytorch_model.safetensors \
+ --dataset_path data/dog \
+ --output_path ./models \
+ --max_epochs 1 \
+ --steps_per_epoch 500 \
+ --height 1024 \
+ --width 1024 \
+ --center_crop \
+ --precision "16-mixed" \
+ --learning_rate 1e-4 \
+ --lora_rank 4 \
+ --lora_alpha 4 \
+ --use_gradient_checkpointing
+```
+
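+For more information about the parameters, please use `python examples/train/kolors/train_kolors_lora.py -h` to view detailed information.
+
+After the training is complete, use `model_manager.load_lora` to load the LoRA for inference.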
+
+
+
+```python
+from diffsynth import ModelManager, SDXLImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+    file_path_list=["models/kolors/Kolors/text_encoder", "models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors", "models/kolors/Kolors/vae/diffusion_pytorch_model.safetensors"])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = SDXLImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+ prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
+ negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
+ cfg_scale=7.5,
+ num_inference_steps=100, width=1024, height=1024,
+)
+image.save("image_with_lora.jpg")
+```
diff --git a/docs/source_en/finetune/train_sd3_lora.md b/docs/source_en/finetune/train_sd3_lora.md
new file mode 100644
index 0000000..e370175
--- /dev/null
+++ b/docs/source_en/finetune/train_sd3_lora.md
@@ -0,0 +1,59 @@
+# Training Stable Diffusion 3 LoRA
+
+The training script only requires one file. You can use [`sd3_medium_incl_clips.safetensors`](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips.safetensors) (without the T5 encoder) or [`sd3_medium_incl_clips_t5xxlfp16.safetensors`](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips_t5xxlfp16.safetensors) (with the T5 encoder). Please use the following code to download these files:
+
+
+```python
+from diffsynth import download_models
+
+download_models(["StableDiffusion3", "StableDiffusion3_without_T5"])
+```
+
+```
+models/stable_diffusion_3/
+├── Put Stable Diffusion 3 checkpoints here.txt
+├── sd3_medium_incl_clips.safetensors
+└── sd3_medium_incl_clips_t5xxlfp16.safetensors
+```
+
+Start the training task with the following command:
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_3/train_sd3_lora.py \
+ --pretrained_path models/stable_diffusion_3/sd3_medium_incl_clips.safetensors \
+ --dataset_path data/dog \
+ --output_path ./models \
+ --max_epochs 1 \
+ --steps_per_epoch 500 \
+ --height 1024 \
+ --width 1024 \
+ --center_crop \
+ --precision "16-mixed" \
+ --learning_rate 1e-4 \
+ --lora_rank 4 \
+ --lora_alpha 4 \
+ --use_gradient_checkpointing
+```
+
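+For more information about the parameters, please use `python examples/train/stable_diffusion_3/train_sd3_lora.py -h` to view detailed information.
+
+After the training is complete, use `model_manager.load_lora` to load the LoRA for inference.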
+
+```python
+from diffsynth import ModelManager, SD3ImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+ file_path_list=["models/stable_diffusion_3/sd3_medium_incl_clips.safetensors"])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = SD3ImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+ prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
+ negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
+ cfg_scale=7.5,
+ num_inference_steps=100, width=1024, height=1024,
+)
+image.save("image_with_lora.jpg")
+```
diff --git a/docs/source_en/finetune/train_sd_lora.md b/docs/source_en/finetune/train_sd_lora.md
new file mode 100644
index 0000000..e3d1abb
--- /dev/null
+++ b/docs/source_en/finetune/train_sd_lora.md
@@ -0,0 +1,59 @@
+# Training Stable Diffusion LoRA
+
+The training script only requires one file. We support the mainstream checkpoints on [CivitAI](https://civitai.com/). By default, we use the base Stable Diffusion v1.5. You can download it from [HuggingFace](https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors) or [ModelScope](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-v1-5/resolve/master/v1-5-pruned-emaonly.safetensors). You can use the following code to download this file:
+
+```python
+from diffsynth import download_models
+
+download_models(["StableDiffusion_v15"])
+```
+
+```
+models/stable_diffusion
+├── Put Stable Diffusion checkpoints here.txt
+└── v1-5-pruned-emaonly.safetensors
+```
+
+Start the training task with the following command:
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion/train_sd_lora.py \
+ --pretrained_path models/stable_diffusion/v1-5-pruned-emaonly.safetensors \
+ --dataset_path data/dog \
+ --output_path ./models \
+ --max_epochs 1 \
+ --steps_per_epoch 500 \
+ --height 512 \
+ --width 512 \
+ --center_crop \
+ --precision "16-mixed" \
+ --learning_rate 1e-4 \
+ --lora_rank 4 \
+ --lora_alpha 4 \
+ --use_gradient_checkpointing
+```
+
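+For more information about the parameters, please use `python examples/train/stable_diffusion/train_sd_lora.py -h` to view detailed information.
+
+After the training is complete, use `model_manager.load_lora` to load the LoRA for inference.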
+
+
+
+```python
+from diffsynth import ModelManager, SDImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+ file_path_list=["models/stable_diffusion/v1-5-pruned-emaonly.safetensors"])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = SDImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+ prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
+ negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
+ cfg_scale=7.5,
+ num_inference_steps=100, width=512, height=512,
+)
+image.save("image_with_lora.jpg")
+```
diff --git a/docs/source_en/finetune/train_sdxl_lora.md b/docs/source_en/finetune/train_sdxl_lora.md
new file mode 100644
index 0000000..0b0b746
--- /dev/null
+++ b/docs/source_en/finetune/train_sdxl_lora.md
@@ -0,0 +1,57 @@
+# Training Stable Diffusion XL LoRA
+
+The training script only requires one file. We support the mainstream checkpoints on [CivitAI](https://civitai.com/). By default, we use the base Stable Diffusion XL. You can download it from [HuggingFace](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors) or [ModelScope](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-xl-base-1.0/resolve/master/sd_xl_base_1.0.safetensors). You can also use the following code to download this file:
+
+```python
+from diffsynth import download_models
+
+download_models(["StableDiffusionXL_v1"])
+```
+
+```
+models/stable_diffusion_xl
+├── Put Stable Diffusion XL checkpoints here.txt
+└── sd_xl_base_1.0.safetensors
+```
+
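+We have observed that Stable Diffusion XL suffers from numerical overflow at float16 precision, so we recommend training with float32 precision. Start the training task with the following command: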
+
+```
+CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_xl/train_sdxl_lora.py \
+ --pretrained_path models/stable_diffusion_xl/sd_xl_base_1.0.safetensors \
+ --dataset_path data/dog \
+ --output_path ./models \
+ --max_epochs 1 \
+ --steps_per_epoch 500 \
+ --height 1024 \
+ --width 1024 \
+ --center_crop \
+ --precision "32" \
+ --learning_rate 1e-4 \
+ --lora_rank 4 \
+ --lora_alpha 4 \
+ --use_gradient_checkpointing
+```
+
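+For more information about the parameters, please use `python examples/train/stable_diffusion_xl/train_sdxl_lora.py -h` to view detailed information.
+
+After the training is complete, use `model_manager.load_lora` to load the LoRA for inference.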
+
+```python
+from diffsynth import ModelManager, SDXLImagePipeline
+import torch
+
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
+ file_path_list=["models/stable_diffusion_xl/sd_xl_base_1.0.safetensors"])
+model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
+pipe = SDXLImagePipeline.from_model_manager(model_manager)
+
+torch.manual_seed(0)
+image = pipe(
+ prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
+ negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
+ cfg_scale=7.5,
+ num_inference_steps=100, width=1024, height=1024,
+)
+image.save("image_with_lora.jpg")
+```
diff --git a/docs/source_en/index.rst b/docs/source_en/index.rst
index 7d55dba..cff3383 100644
--- a/docs/source_en/index.rst
+++ b/docs/source_en/index.rst
@@ -1,32 +1,32 @@
-.. DiffSynth-Studio documentation master file, created by
- sphinx-quickstart on Thu Sep 5 16:39:24 2024.
- You can adapt this file completely to your liking, but it should at least
- contain the root `toctree` directive.
-
-DiffSynth-Studio documentation
-==============================
-
-Add your content using ``reStructuredText`` syntax. See the
-`reStructuredText `_
-documentation for details.
-
-
-.. toctree::
- :maxdepth: 1
- :caption: Contents:
-
- GetStarted/A_simple_example.md
- GetStarted/Download_models.md
- GetStarted/ModelManager.md
- GetStarted/Models.md
- GetStarted/Pipelines.md
- GetStarted/PromptProcessing.md
- GetStarted/Schedulers.md
- GetStarted/Fine-tuning.md
- GetStarted/Extensions.md
- GetStarted/WebUI.md
-
-
-.. toctree::
- :maxdepth: 1
- :caption: API Docs
+.. DiffSynth-Studio documentation master file, created by
+ sphinx-quickstart on Thu Sep 5 16:39:24 2024.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+DiffSynth-Studio documentation
+==============================
+
+Add your content using ``reStructuredText`` syntax. See the
+`reStructuredText <https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html>`_
+documentation for details.
+
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Contents:
+
+ GetStarted/A_simple_example.md
+ GetStarted/Download_models.md
+ GetStarted/ModelManager.md
+ GetStarted/Models.md
+ GetStarted/Pipelines.md
+ GetStarted/PromptProcessing.md
+ GetStarted/Schedulers.md
+ GetStarted/Fine-tuning.md
+ GetStarted/Extensions.md
+ GetStarted/WebUI.md
+
+
+.. toctree::
+ :maxdepth: 1
+ :caption: API Docs
diff --git a/docs/source_en/requirement.txt b/docs/source_en/requirement.txt
index fa5b901..6f7f63b 100644
--- a/docs/source_en/requirement.txt
+++ b/docs/source_en/requirement.txt
@@ -1,4 +1,4 @@
-recommonmark
-sphinx_rtd_theme
-myst-parser
+recommonmark
+sphinx_rtd_theme
+myst-parser
sphinx-markdown-tables
\ No newline at end of file