mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-03-19 23:08:13 +00:00
Add files via upload
再改一次
This commit is contained in:
49
docs/source/conf.py
Normal file
49
docs/source/conf.py
Normal file
@@ -0,0 +1,49 @@
|
||||
# Configuration file for the Sphinx documentation builder.
|
||||
#
|
||||
# For the full list of built-in configuration values, see the documentation:
|
||||
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
||||
|
||||
# -- Project information -----------------------------------------------------
|
||||
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
|
||||
|
||||
import os
|
||||
import sys
|
||||
sys.path.insert(0, os.path.abspath('../../diffsynth'))
|
||||
|
||||
project = 'DiffSynth-Studio'
|
||||
copyright = '2024, ModelScope'
|
||||
author = 'ModelScope'
|
||||
release = '0.1.0'
|
||||
|
||||
|
||||
# -- General configuration ---------------------------------------------------
|
||||
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
|
||||
|
||||
extensions = [
|
||||
'sphinx.ext.autodoc',
|
||||
'sphinx.ext.napoleon',
|
||||
'sphinx.ext.doctest',
|
||||
'sphinx.ext.intersphinx',
|
||||
'sphinx.ext.todo',
|
||||
'sphinx.ext.coverage',
|
||||
'sphinx.ext.imgmath',
|
||||
'sphinx.ext.viewcode',
|
||||
'recommonmark',
|
||||
'sphinx_markdown_tables'
|
||||
]
|
||||
|
||||
templates_path = ['_templates']
|
||||
exclude_patterns = []
|
||||
|
||||
|
||||
source_suffix = ['.rst', '.md']
|
||||
# -- Options for HTML output -------------------------------------------------
|
||||
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
|
||||
|
||||
html_theme = 'sphinx_rtd_theme'
|
||||
html_static_path = ['_static']
|
||||
# multi-language docs
|
||||
language = 'zh_CN'
|
||||
locale_dirs = ['../locales/'] # path is example but recommended.
|
||||
gettext_compact = False # optional.
|
||||
gettext_uuid = True # optional.
|
||||
133
docs/source/creating/AdaptersForImageSynthesis.md
Normal file
133
docs/source/creating/AdaptersForImageSynthesis.md
Normal file
@@ -0,0 +1,133 @@
|
||||
# ControlNet、LoRA、IP-Adapter——精准控制技术
|
||||
|
||||
在文生图模型的基础上,还可以使用各种 Adapter 架构的模型对生成过程进行控制。
|
||||
|
||||
接下来的例子会用到很多模型,我们先把它们下载好。
|
||||
|
||||
* 一个广受好评的 Stable Diffusion XL 架构动漫风格模型
|
||||
* 一个支持多种控制模式的 ControlNet 模型
|
||||
* 一个 Stable Diffusion XL 模型的 LoRA 模型
|
||||
* 一个 IP-Adapter 模型及其对应的图像编码器
|
||||
|
||||
```python
|
||||
from diffsynth import download_models
|
||||
|
||||
download_models([
|
||||
"BluePencilXL_v200",
|
||||
"ControlNet_union_sdxl_promax",
|
||||
"SDXL_lora_zyd232_ChineseInkStyle_SDXL_v1_0",
|
||||
"IP-Adapter-SDXL"
|
||||
])
|
||||
```
|
||||
|
||||
用基础文生图功能生成一张图
|
||||
|
||||
```python
|
||||
from diffsynth import ModelManager, SDXLImagePipeline
|
||||
import torch
|
||||
|
||||
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
|
||||
model_manager.load_models(["models/stable_diffusion_xl/bluePencilXL_v200.safetensors"])
|
||||
pipe = SDXLImagePipeline.from_model_manager(model_manager)
|
||||
torch.manual_seed(1)
|
||||
image = pipe(
|
||||
prompt="masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait,",
|
||||
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
|
||||
cfg_scale=6, num_inference_steps=60,
|
||||
)
|
||||
image.save("image.jpg")
|
||||
```
|
||||
|
||||

|
||||
|
||||
接下来,我们让这位水下翩翩起舞的少女变成火系魔法师!启用 ControlNet 保持画面结构的同时,修改提示词。
|
||||
|
||||
```python
|
||||
from diffsynth import ModelManager, SDXLImagePipeline, ControlNetConfigUnit
|
||||
import torch
|
||||
from PIL import Image
|
||||
|
||||
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
|
||||
model_manager.load_models([
|
||||
"models/stable_diffusion_xl/bluePencilXL_v200.safetensors",
|
||||
"models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors"
|
||||
])
|
||||
pipe = SDXLImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
|
||||
ControlNetConfigUnit("depth", "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors", scale=1)
|
||||
])
|
||||
torch.manual_seed(2)
|
||||
image = pipe(
|
||||
prompt="masterpiece, best quality, solo, long hair, wavy hair, pink hair, red eyes, red dress, medium breasts, dress, fire ball, fire background, floating hair, refraction, portrait,",
|
||||
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw, white background",
|
||||
cfg_scale=6, num_inference_steps=60,
|
||||
controlnet_image=Image.open("image.jpg")
|
||||
)
|
||||
image.save("image_controlnet.jpg")
|
||||
```
|
||||
|
||||

|
||||
|
||||
很酷对不对?还有更酷的,加个 LoRA,让画面更贴近手绘漫画的扁平风格。这个 LoRA 需要一定的触发词才能生效,这在原作者的模型页面有提到,记得在提示词的开头加上触发词哦。
|
||||
|
||||
```python
|
||||
from diffsynth import ModelManager, SDXLImagePipeline, ControlNetConfigUnit
|
||||
import torch
|
||||
from PIL import Image
|
||||
|
||||
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
|
||||
model_manager.load_models([
|
||||
"models/stable_diffusion_xl/bluePencilXL_v200.safetensors",
|
||||
"models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors"
|
||||
])
|
||||
model_manager.load_lora("models/lora/zyd232_ChineseInkStyle_SDXL_v1_0.safetensors", lora_alpha=1.0)
|
||||
pipe = SDXLImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
|
||||
ControlNetConfigUnit("depth", "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors", scale=1.0)
|
||||
])
|
||||
torch.manual_seed(3)
|
||||
image = pipe(
|
||||
prompt="zydink, ink sketch, flat anime, masterpiece, best quality, solo, long hair, wavy hair, pink hair, red eyes, red dress, medium breasts, dress, fire ball, fire background, floating hair, refraction, portrait,",
|
||||
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw, white background",
|
||||
cfg_scale=6, num_inference_steps=60,
|
||||
controlnet_image=Image.open("image.jpg")
|
||||
)
|
||||
image.save("image_lora.jpg")
|
||||
```
|
||||
|
||||

|
||||
|
||||
还没结束呢!找一张水墨风的中国画作为风格引导,启动 IP-Adapter,让古典艺术和现代美学碰撞!
|
||||
|
||||
|就用这张图作为风格引导吧||
|
||||
|-|-|
|
||||
|
||||
```python
|
||||
from diffsynth import ModelManager, SDXLImagePipeline, ControlNetConfigUnit
|
||||
import torch
|
||||
from PIL import Image
|
||||
|
||||
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
|
||||
model_manager.load_models([
|
||||
"models/stable_diffusion_xl/bluePencilXL_v200.safetensors",
|
||||
"models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors",
|
||||
"models/IpAdapter/stable_diffusion_xl/ip-adapter_sdxl.bin",
|
||||
"models/IpAdapter/stable_diffusion_xl/image_encoder/model.safetensors",
|
||||
])
|
||||
model_manager.load_lora("models/lora/zyd232_ChineseInkStyle_SDXL_v1_0.safetensors", lora_alpha=1.0)
|
||||
pipe = SDXLImagePipeline.from_model_manager(model_manager, controlnet_config_units=[
|
||||
ControlNetConfigUnit("depth", "models/ControlNet/controlnet_union/diffusion_pytorch_model_promax.safetensors", scale=1.0)
|
||||
])
|
||||
torch.manual_seed(2)
|
||||
image = pipe(
|
||||
prompt="zydink, ink sketch, flat anime, masterpiece, best quality, solo, long hair, wavy hair, pink hair, red eyes, red dress, medium breasts, dress, fire ball, fire background, floating hair, refraction, portrait,",
|
||||
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw, white background",
|
||||
cfg_scale=6, num_inference_steps=60,
|
||||
controlnet_image=Image.open("image.jpg"),
|
||||
ipadapter_images=[Image.open("ink_style.jpg")],
|
||||
ipadapter_use_instant_style=True, ipadapter_scale=0.5
|
||||
)
|
||||
image.save("image_ipadapter.jpg")
|
||||
```
|
||||
|
||||

|
||||
|
||||
用 Diffusion 生成图像的乐趣在于,各种生态模型的组合,可以实现各种奇思妙想。
|
||||
65
docs/source/creating/BasicImageSynthesis.md
Normal file
65
docs/source/creating/BasicImageSynthesis.md
Normal file
@@ -0,0 +1,65 @@
|
||||
# 文生图、图生图、高分辨率修复——初识绚丽的 Diffusion
|
||||
|
||||
加载文生图模型,这里我们使用 CivitAI 上的一个动漫风格模型作为例子。
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffsynth import ModelManager, SDImagePipeline, download_models
|
||||
|
||||
download_models(["AingDiffusion_v12"])
|
||||
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
|
||||
model_manager.load_models(["models/stable_diffusion/aingdiffusion_v12.safetensors"])
|
||||
pipe = SDImagePipeline.from_model_manager(model_manager)
|
||||
```
|
||||
|
||||
生成一张图小试身手。
|
||||
|
||||
```python
|
||||
torch.manual_seed(0)
|
||||
image = pipe(
|
||||
prompt="masterpiece, best quality, a girl with long silver hair",
|
||||
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
|
||||
height=512, width=512, num_inference_steps=80,
|
||||
)
|
||||
image.save("image.jpg")
|
||||
```
|
||||
|
||||
嗯,一个可爱的小姐姐。
|
||||
|
||||

|
||||
|
||||
用图生图功能把她的头发变成红色,只需要添加 `input_image` 和 `denoising_strength` 两个参数。其中 `denoising_strength` 用于控制加噪声的强度,为 0 时生成的图与输入的图完全一致,为 1 时完全随机生成图。
|
||||
|
||||
```python
|
||||
torch.manual_seed(1)
|
||||
image_edited = pipe(
|
||||
prompt="masterpiece, best quality, a girl with long red hair",
|
||||
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
|
||||
height=512, width=512, num_inference_steps=80,
|
||||
input_image=image, denoising_strength=0.6,
|
||||
)
|
||||
image_edited.save("image_edited.jpg")
|
||||
```
|
||||
|
||||
嗯,一个红色头发的可爱小姐姐。
|
||||
|
||||

|
||||
|
||||
由于模型本身是在 512*512 分辨率下训练的,所以图片看起来有点模糊,不过我们可以利用模型自身的能力润色这张图,为其填充细节。具体来说,就是提高分辨率后进行图生图。
|
||||
|
||||
```python
|
||||
torch.manual_seed(2)
|
||||
image_highres = pipe(
|
||||
prompt="masterpiece, best quality, a girl with long red hair",
|
||||
negative_prompt="worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw,",
|
||||
height=1024, width=1024, num_inference_steps=80,
|
||||
input_image=image_edited.resize((1024, 1024)), denoising_strength=0.6,
|
||||
)
|
||||
image_highres.save("image_highres.jpg")
|
||||
```
|
||||
|
||||
嗯,一个清晰的红色头发可爱小姐姐。
|
||||
|
||||

|
||||
|
||||
值得注意的是,图生图和高分辨率修复功能是全局支持的,目前我们所有的图像生成流水线都可以这样使用。
|
||||
78
docs/source/creating/PromptRefine.md
Normal file
78
docs/source/creating/PromptRefine.md
Normal file
@@ -0,0 +1,78 @@
|
||||
# 翻译、润色——提示词的魔法
|
||||
|
||||
在生成图像时,我们需要编写提示词,用来描述图像的内容。提示词会直接影响生成的效果,但提示词的编写也是一门学问,好的提示词可以生成具有高度美感的图像,我们提供了一系列模型来帮助用户处理提示词。
|
||||
|
||||
## 翻译
|
||||
|
||||
目前大多数文生图模型都是只支持英文提示词的,对于非英文母语的用户,使用起来有些困难,我们可以使用开源的翻译模型把提示词翻译成英文。在下面这个例子中,我们以“一个女孩”为提示词,使用模型 opus-mt-zh-en(可在 [HuggingFace](https://huggingface.co/Helsinki-NLP/opus-mt-zh-en) 或 [ModelScope](https://modelscope.cn/models/moxying/opus-mt-zh-en) 下载)进行翻译。
|
||||
|
||||
```python
|
||||
from diffsynth import ModelManager, SDXLImagePipeline, Translator
|
||||
import torch
|
||||
|
||||
model_manager = ModelManager(
|
||||
torch_dtype=torch.float16, device="cuda",
|
||||
model_id_list=["BluePencilXL_v200", "opus-mt-zh-en"]
|
||||
)
|
||||
pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[Translator])
|
||||
|
||||
torch.manual_seed(0)
|
||||
prompt = "一个女孩"
|
||||
image = pipe(
|
||||
prompt=prompt, negative_prompt="",
|
||||
height=1024, width=1024, num_inference_steps=30
|
||||
)
|
||||
image.save("image_1.jpg")
|
||||
```
|
||||
|
||||

|
||||
|
||||
## 润色
|
||||
|
||||
详细的提示词可以生成细节更丰富的图像,我们可以使用提示词润色模型 BeautifulPrompt(可在 [HuggingFace](https://huggingface.co/alibaba-pai/pai-bloom-1b1-text2prompt-sd) 或 [ModelScope](https://modelscope.cn/models/AI-ModelScope/pai-bloom-1b1-text2prompt-sd) 下载)对简单的提示词进行润色,这个模型能够让整体画面风格更加华丽。
|
||||
|
||||
这个模块可以和翻译模块同时启用,但请注意顺序,先翻译,后润色。
|
||||
|
||||
```python
|
||||
from diffsynth import ModelManager, SDXLImagePipeline, Translator, BeautifulPrompt
|
||||
import torch
|
||||
|
||||
model_manager = ModelManager(
|
||||
torch_dtype=torch.float16, device="cuda",
|
||||
model_id_list=["BluePencilXL_v200", "opus-mt-zh-en", "BeautifulPrompt"]
|
||||
)
|
||||
pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[Translator, BeautifulPrompt])
|
||||
|
||||
torch.manual_seed(0)
|
||||
prompt = "一个女孩"
|
||||
image = pipe(
|
||||
prompt=prompt, negative_prompt="",
|
||||
height=1024, width=1024, num_inference_steps=30
|
||||
)
|
||||
image.save("image_2.jpg")
|
||||
```
|
||||
|
||||

|
||||
|
||||
我们还内置了一个通义千问模型,这个模型可以一步到位地完成提示词的翻译和润色工作。
|
||||
|
||||
```python
|
||||
from diffsynth import ModelManager, SDXLImagePipeline, QwenPrompt
|
||||
import torch
|
||||
|
||||
model_manager = ModelManager(
|
||||
torch_dtype=torch.float16, device="cuda",
|
||||
model_id_list=["BluePencilXL_v200", "QwenPrompt"]
|
||||
)
|
||||
pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[QwenPrompt])
|
||||
|
||||
torch.manual_seed(0)
|
||||
prompt = "一个女孩"
|
||||
image = pipe(
|
||||
prompt=prompt, negative_prompt="",
|
||||
height=1024, width=1024, num_inference_steps=30
|
||||
)
|
||||
image.save("image_3.jpg")
|
||||
```
|
||||
|
||||

|
||||
95
docs/source/creating/ToonShading.md
Normal file
95
docs/source/creating/ToonShading.md
Normal file
@@ -0,0 +1,95 @@
|
||||
# 当图像模型遇见 AnimateDiff——模型组合技术
|
||||
|
||||
我们已经领略到了 Stable Diffusion 模型及其生态模型的强大图像生成能力,现在我们引入一个新的模块:AnimateDiff,这样一来就可以把图像模型的能力迁移到视频中。在本篇文章中,我们为您展示基于 DiffSynth-Studio 搭建的动漫风格视频渲染方案:Diffutoon。
|
||||
|
||||
## 下载模型
|
||||
|
||||
接下来的例子会用到很多模型,我们先把它们下载好。
|
||||
|
||||
* 一个动漫风格的 Stable Diffusion 架构模型
|
||||
* 两个 ControlNet 模型
|
||||
* 一个 Textual Inversion 模型
|
||||
* 一个 AnimateDiff 模型
|
||||
|
||||
```python
|
||||
from diffsynth import download_models
|
||||
|
||||
download_models([
|
||||
"AingDiffusion_v12",
|
||||
"AnimateDiff_v2",
|
||||
"ControlNet_v11p_sd15_lineart",
|
||||
"ControlNet_v11f1e_sd15_tile",
|
||||
"TextualInversion_VeryBadImageNegative_v1.3"
|
||||
])
|
||||
```
|
||||
|
||||
## 下载视频
|
||||
|
||||
你可以随意选择任何你喜欢的视频,我们使用[这个视频](https://www.bilibili.com/video/BV1iG411a7sQ)作为演示,你可以通过以下命令下载这个视频文件,但请注意,在没有获得视频原作者的商用版权时,请不要将其用作商业用途。
|
||||
|
||||
```
|
||||
modelscope download --dataset Artiprocher/examples_in_diffsynth data/examples/diffutoon/input_video.mp4 --local_dir ./
|
||||
```
|
||||
|
||||
## 生成动漫
|
||||
|
||||
```python
|
||||
from diffsynth import ModelManager, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video
|
||||
import torch
|
||||
|
||||
# Load models
|
||||
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
|
||||
model_manager.load_models([
|
||||
"models/stable_diffusion/aingdiffusion_v12.safetensors",
|
||||
"models/AnimateDiff/mm_sd_v15_v2.ckpt",
|
||||
"models/ControlNet/control_v11p_sd15_lineart.pth",
|
||||
"models/ControlNet/control_v11f1e_sd15_tile.pth",
|
||||
])
|
||||
|
||||
# Build pipeline
|
||||
pipe = SDVideoPipeline.from_model_manager(
|
||||
model_manager,
|
||||
[
|
||||
ControlNetConfigUnit(
|
||||
processor_id="tile",
|
||||
model_path="models/ControlNet/control_v11f1e_sd15_tile.pth",
|
||||
scale=0.5
|
||||
),
|
||||
ControlNetConfigUnit(
|
||||
processor_id="lineart",
|
||||
model_path="models/ControlNet/control_v11p_sd15_lineart.pth",
|
||||
scale=0.5
|
||||
)
|
||||
]
|
||||
)
|
||||
pipe.prompter.load_textual_inversions(["models/textual_inversion/verybadimagenegative_v1.3.pt"])
|
||||
|
||||
# Load video
|
||||
video = VideoData(
|
||||
video_file="data/examples/diffutoon/input_video.mp4",
|
||||
height=1536, width=1536
|
||||
)
|
||||
input_video = [video[i] for i in range(30)]
|
||||
|
||||
# Generate
|
||||
torch.manual_seed(0)
|
||||
output_video = pipe(
|
||||
prompt="best quality, perfect anime illustration, light, a girl is dancing, smile, solo",
|
||||
negative_prompt="verybadimagenegative_v1.3",
|
||||
cfg_scale=7, clip_skip=2,
|
||||
input_frames=input_video, denoising_strength=1.0,
|
||||
controlnet_frames=input_video, num_frames=len(input_video),
|
||||
num_inference_steps=10, height=1536, width=1536,
|
||||
animatediff_batch_size=16, animatediff_stride=8,
|
||||
)
|
||||
|
||||
# Save video
|
||||
save_video(output_video, "output_video.mp4", fps=30)
|
||||
```
|
||||
|
||||
## 效果展示
|
||||
|
||||
<video width="512" height="256" controls>
|
||||
<source src="https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/b54c05c5-d747-4709-be5e-b39af82404dd" type="video/mp4">
|
||||
您的浏览器不支持Video标签。
|
||||
</video>
|
||||
98
docs/source/finetune/overview.md
Normal file
98
docs/source/finetune/overview.md
Normal file
@@ -0,0 +1,98 @@
|
||||
# 训练框架
|
||||
|
||||
我们实现了一个用于文本到图像扩散模型的训练框架,使用户能够轻松地使用我们的框架训练 LoRA 模型。我们提供的脚本具有以下特点:
|
||||
|
||||
* **功能全面**:我们的训练框架支持多GPU和多机器配置,便于使用 DeepSpeed 加速,并包括梯度检查点优化,适用于内存需求较大的模型。
|
||||
* **代码简洁**:我们避免了大块复杂的代码。通用模块实现于 `diffsynth/trainers/text_to_image.py` 中,而模型特定的训练脚本仅包含与模型架构相关的最少代码,便于学术研究人员使用。
|
||||
* **模块化设计**:基于通用的 Pytorch-Lightning 框架,我们的训练框架在功能上是解耦的,允许开发者通过修改我们的脚本轻松引入额外的训练技术,以满足他们的需求。
|
||||
|
||||
LoRA 微调的图像示例。提示词为 "一只小狗蹦蹦跳跳,周围是姹紫嫣红的鲜花,远处是山脉"(针对中文模型)或 "a dog is jumping, flowers around the dog, the background is mountains and clouds"(针对英文模型)。
|
||||
|
||||
||FLUX.1-dev|Kolors|Stable Diffusion 3|Hunyuan-DiT|
|
||||
|-|-|-|-|-|
|
||||
|Without LoRA|||||
|
||||
|With LoRA|||||
|
||||
|
||||
## 安装额外包
|
||||
|
||||
```
|
||||
pip install peft lightning
|
||||
```
|
||||
|
||||
## 准备数据集
|
||||
|
||||
我们提供了一个[示例数据集](https://modelscope.cn/datasets/buptwq/lora-stable-diffusion-finetune/files)。你需要将训练数据集按照如下形式组织:
|
||||
|
||||
```
|
||||
data/dog/
|
||||
└── train
|
||||
├── 00.jpg
|
||||
├── 01.jpg
|
||||
├── 02.jpg
|
||||
├── 03.jpg
|
||||
├── 04.jpg
|
||||
└── metadata.csv
|
||||
```
|
||||
|
||||
`metadata.csv`:
|
||||
|
||||
```
|
||||
file_name,text
|
||||
00.jpg,a dog
|
||||
01.jpg,a dog
|
||||
02.jpg,a dog
|
||||
03.jpg,a dog
|
||||
04.jpg,a dog
|
||||
```
|
||||
|
||||
请注意,如果模型是中文模型(例如,Hunyuan-DiT 和 Kolors),我们建议在数据集中使用中文文本。例如:
|
||||
|
||||
```
|
||||
file_name,text
|
||||
00.jpg,一只小狗
|
||||
01.jpg,一只小狗
|
||||
02.jpg,一只小狗
|
||||
03.jpg,一只小狗
|
||||
04.jpg,一只小狗
|
||||
```
|
||||
|
||||
## 训练 LoRA 模型
|
||||
|
||||
通用参数选项:
|
||||
|
||||
```
|
||||
--lora_target_modules LORA_TARGET_MODULES
|
||||
LoRA 模块所在的层。
|
||||
--dataset_path DATASET_PATH
|
||||
数据集的路径。
|
||||
--output_path OUTPUT_PATH
|
||||
模型保存路径。
|
||||
--steps_per_epoch STEPS_PER_EPOCH
|
||||
每个周期的步数。
|
||||
--height HEIGHT 图像高度。
|
||||
--width WIDTH 图像宽度。
|
||||
--center_crop 是否将输入图像中心裁剪到指定分辨率。如果未设置,图像将被随机裁剪。图像会在裁剪前先调整到指定分辨率。
|
||||
--random_flip 是否随机水平翻转图像。
|
||||
--batch_size BATCH_SIZE
|
||||
训练数据加载器的批量大小(每设备)。
|
||||
--dataloader_num_workers DATALOADER_NUM_WORKERS
|
||||
数据加载使用的子进程数量。0 表示数据将在主进程中加载。
|
||||
--precision {32,16,16-mixed}
|
||||
训练精度。
|
||||
--learning_rate LEARNING_RATE
|
||||
学习率。
|
||||
--lora_rank LORA_RANK
|
||||
LoRA 更新矩阵的维度。
|
||||
--lora_alpha LORA_ALPHA
|
||||
LoRA 更新矩阵的权重。
|
||||
--use_gradient_checkpointing
|
||||
是否使用梯度检查点。
|
||||
--accumulate_grad_batches ACCUMULATE_GRAD_BATCHES
|
||||
梯度累积的批次数量。
|
||||
--training_strategy {auto,deepspeed_stage_1,deepspeed_stage_2,deepspeed_stage_3}
|
||||
训练策略。
|
||||
--max_epochs MAX_EPOCHS
|
||||
训练轮数。
|
||||
--modelscope_model_id MODELSCOPE_MODEL_ID
|
||||
ModelScope 上的模型 ID (https://www.modelscope.cn/)。如果提供模型 ID,模型将自动上传到 ModelScope。
|
||||
```
|
||||
71
docs/source/finetune/train_flux_lora.md
Normal file
71
docs/source/finetune/train_flux_lora.md
Normal file
@@ -0,0 +1,71 @@
|
||||
# 训练 FLUX LoRA
|
||||
|
||||
以下文件将会被用于构建 FLUX 模型。 你可以从[huggingface](https://huggingface.co/black-forest-labs/FLUX.1-dev)或[modelscope](https://www.modelscope.cn/models/ai-modelscope/flux.1-dev)下载,也可以使用以下代码下载这些文件:
|
||||
|
||||
```python
|
||||
from diffsynth import download_models
|
||||
|
||||
download_models(["FLUX.1-dev"])
|
||||
```
|
||||
|
||||
```
|
||||
models/FLUX/
|
||||
└── FLUX.1-dev
|
||||
├── ae.safetensors
|
||||
├── flux1-dev.safetensors
|
||||
├── text_encoder
|
||||
│ └── model.safetensors
|
||||
└── text_encoder_2
|
||||
├── config.json
|
||||
├── model-00001-of-00002.safetensors
|
||||
├── model-00002-of-00002.safetensors
|
||||
└── model.safetensors.index.json
|
||||
```
|
||||
|
||||
使用以下命令启动训练任务:
|
||||
|
||||
```
|
||||
CUDA_VISIBLE_DEVICES="0" python examples/train/flux/train_flux_lora.py \
|
||||
--pretrained_text_encoder_path models/FLUX/FLUX.1-dev/text_encoder/model.safetensors \
|
||||
--pretrained_text_encoder_2_path models/FLUX/FLUX.1-dev/text_encoder_2 \
|
||||
--pretrained_dit_path models/FLUX/FLUX.1-dev/flux1-dev.safetensors \
|
||||
--pretrained_vae_path models/FLUX/FLUX.1-dev/ae.safetensors \
|
||||
--dataset_path data/dog \
|
||||
--output_path ./models \
|
||||
--max_epochs 1 \
|
||||
--steps_per_epoch 500 \
|
||||
--height 1024 \
|
||||
--width 1024 \
|
||||
--center_crop \
|
||||
--precision "bf16" \
|
||||
--learning_rate 1e-4 \
|
||||
--lora_rank 4 \
|
||||
--lora_alpha 4 \
|
||||
--use_gradient_checkpointing
|
||||
```
|
||||
|
||||
有关参数的更多信息,请使用 `python examples/train/flux/train_flux_lora.py -h` 查看详细信息。
|
||||
|
||||
训练完成后,使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
|
||||
|
||||
```python
|
||||
from diffsynth import ModelManager, FluxImagePipeline
|
||||
import torch
|
||||
|
||||
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
|
||||
file_path_list=[
|
||||
"models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
|
||||
"models/FLUX/FLUX.1-dev/text_encoder_2",
|
||||
"models/FLUX/FLUX.1-dev/ae.safetensors",
|
||||
"models/FLUX/FLUX.1-dev/flux1-dev.safetensors"
|
||||
])
|
||||
model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
|
||||
pipe = FluxImagePipeline.from_model_manager(model_manager)
|
||||
|
||||
torch.manual_seed(0)
|
||||
image = pipe(
|
||||
    prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
|
||||
num_inference_steps=30, embedded_guidance=3.5
|
||||
)
|
||||
image.save("image_with_lora.jpg")
|
||||
```
|
||||
72
docs/source/finetune/train_hunyuan_dit_lora.md
Normal file
72
docs/source/finetune/train_hunyuan_dit_lora.md
Normal file
@@ -0,0 +1,72 @@
|
||||
# 训练 Hunyuan-DiT LoRA
|
||||
|
||||
构建 Hunyuan DiT 需要四个文件。你可以从 [HuggingFace](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT) 或 [ModelScope](https://www.modelscope.cn/models/modelscope/HunyuanDiT/summary) 下载这些文件。你可以使用以下代码下载这些文件:
|
||||
|
||||
|
||||
```python
|
||||
from diffsynth import download_models
|
||||
|
||||
download_models(["HunyuanDiT"])
|
||||
```
|
||||
|
||||
```
|
||||
models/HunyuanDiT/
|
||||
├── Put Hunyuan DiT checkpoints here.txt
|
||||
└── t2i
|
||||
├── clip_text_encoder
|
||||
│ └── pytorch_model.bin
|
||||
├── model
|
||||
│ └── pytorch_model_ema.pt
|
||||
├── mt5
|
||||
│ └── pytorch_model.bin
|
||||
└── sdxl-vae-fp16-fix
|
||||
└── diffusion_pytorch_model.bin
|
||||
```
|
||||
|
||||
使用以下命令启动训练任务:
|
||||
|
||||
```
|
||||
CUDA_VISIBLE_DEVICES="0" python examples/train/hunyuan_dit/train_hunyuan_dit_lora.py \
|
||||
--pretrained_path models/HunyuanDiT/t2i \
|
||||
--dataset_path data/dog \
|
||||
--output_path ./models \
|
||||
--max_epochs 1 \
|
||||
--steps_per_epoch 500 \
|
||||
--height 1024 \
|
||||
--width 1024 \
|
||||
--center_crop \
|
||||
--precision "16-mixed" \
|
||||
--learning_rate 1e-4 \
|
||||
--lora_rank 4 \
|
||||
--lora_alpha 4 \
|
||||
--use_gradient_checkpointing
|
||||
```
|
||||
|
||||
有关参数的更多信息,请使用 `python examples/train/hunyuan_dit/train_hunyuan_dit_lora.py -h` 查看详细信息。
|
||||
|
||||
训练完成后,使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
|
||||
|
||||
|
||||
```python
|
||||
from diffsynth import ModelManager, HunyuanDiTImagePipeline
|
||||
import torch
|
||||
|
||||
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
|
||||
file_path_list=[
|
||||
"models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin",
|
||||
"models/HunyuanDiT/t2i/model/pytorch_model_ema.pt",
|
||||
"models/HunyuanDiT/t2i/mt5/pytorch_model.bin",
|
||||
"models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin"
|
||||
])
|
||||
model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
|
||||
pipe = HunyuanDiTImagePipeline.from_model_manager(model_manager)
|
||||
|
||||
torch.manual_seed(0)
|
||||
image = pipe(
|
||||
prompt="一只小狗蹦蹦跳跳,周围是姹紫嫣红的鲜花,远处是山脉",
|
||||
negative_prompt="",
|
||||
cfg_scale=7.5,
|
||||
num_inference_steps=100, width=1024, height=1024,
|
||||
)
|
||||
image.save("image_with_lora.jpg")
|
||||
```
|
||||
78
docs/source/finetune/train_kolors_lora.md
Normal file
78
docs/source/finetune/train_kolors_lora.md
Normal file
@@ -0,0 +1,78 @@
|
||||
# 训练 Kolors LoRA
|
||||
|
||||
以下文件将用于构建 Kolors。你可以从 [HuggingFace](https://huggingface.co/Kwai-Kolors/Kolors) 或 [ModelScope](https://modelscope.cn/models/Kwai-Kolors/Kolors) 下载 Kolors。由于精度溢出问题,我们需要下载额外的 VAE 模型(从 [HuggingFace](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) 或 [ModelScope](https://modelscope.cn/models/AI-ModelScope/sdxl-vae-fp16-fix))。你可以使用以下代码下载这些文件:
|
||||
|
||||
|
||||
```python
|
||||
from diffsynth import download_models
|
||||
|
||||
download_models(["Kolors", "SDXL-vae-fp16-fix"])
|
||||
```
|
||||
|
||||
```
|
||||
models
|
||||
├── kolors
|
||||
│ └── Kolors
|
||||
│ ├── text_encoder
|
||||
│ │ ├── config.json
|
||||
│ │ ├── pytorch_model-00001-of-00007.bin
|
||||
│ │ ├── pytorch_model-00002-of-00007.bin
|
||||
│ │ ├── pytorch_model-00003-of-00007.bin
|
||||
│ │ ├── pytorch_model-00004-of-00007.bin
|
||||
│ │ ├── pytorch_model-00005-of-00007.bin
|
||||
│ │ ├── pytorch_model-00006-of-00007.bin
|
||||
│ │ ├── pytorch_model-00007-of-00007.bin
|
||||
│ │ └── pytorch_model.bin.index.json
|
||||
│ ├── unet
|
||||
│ │ └── diffusion_pytorch_model.safetensors
|
||||
│ └── vae
|
||||
│ └── diffusion_pytorch_model.safetensors
|
||||
└── sdxl-vae-fp16-fix
|
||||
└── diffusion_pytorch_model.safetensors
|
||||
```
|
||||
|
||||
使用下面的命令启动训练任务:
|
||||
|
||||
```
|
||||
CUDA_VISIBLE_DEVICES="0" python examples/train/kolors/train_kolors_lora.py \
|
||||
--pretrained_unet_path models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors \
|
||||
--pretrained_text_encoder_path models/kolors/Kolors/text_encoder \
|
||||
--pretrained_fp16_vae_path models/sdxl-vae-fp16-fix/diffusion_pytorch_model.safetensors \
|
||||
--dataset_path data/dog \
|
||||
--output_path ./models \
|
||||
--max_epochs 1 \
|
||||
--steps_per_epoch 500 \
|
||||
--height 1024 \
|
||||
--width 1024 \
|
||||
--center_crop \
|
||||
--precision "16-mixed" \
|
||||
--learning_rate 1e-4 \
|
||||
--lora_rank 4 \
|
||||
--lora_alpha 4 \
|
||||
--use_gradient_checkpointing
|
||||
```
|
||||
|
||||
有关参数的更多信息,请使用 `python examples/train/kolors/train_kolors_lora.py -h` 查看详细信息。
|
||||
|
||||
训练完成后,使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
|
||||
|
||||
|
||||
|
||||
```python
|
||||
from diffsynth import ModelManager, SDXLImagePipeline
|
||||
import torch
|
||||
|
||||
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
|
||||
    file_path_list=["models/kolors/Kolors/text_encoder", "models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors", "models/kolors/Kolors/vae/diffusion_pytorch_model.safetensors"])
|
||||
model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
|
||||
pipe = SDXLImagePipeline.from_model_manager(model_manager)
|
||||
|
||||
torch.manual_seed(0)
|
||||
image = pipe(
|
||||
prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
|
||||
negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
|
||||
cfg_scale=7.5,
|
||||
num_inference_steps=100, width=1024, height=1024,
|
||||
)
|
||||
image.save("image_with_lora.jpg")
|
||||
```
|
||||
59
docs/source/finetune/train_sd3_lora.md
Normal file
59
docs/source/finetune/train_sd3_lora.md
Normal file
@@ -0,0 +1,59 @@
|
||||
# 训练 Stable Diffusion 3 LoRA
|
||||
|
||||
训练脚本只需要一个文件。你可以使用 [`sd3_medium_incl_clips.safetensors`](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips.safetensors)(没有 T5 Encoder)或 [`sd3_medium_incl_clips_t5xxlfp16.safetensors`](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips_t5xxlfp16.safetensors)(有 T5 Encoder)。请使用以下代码下载这些文件:
|
||||
|
||||
|
||||
```python
|
||||
from diffsynth import download_models
|
||||
|
||||
download_models(["StableDiffusion3", "StableDiffusion3_without_T5"])
|
||||
```
|
||||
|
||||
```
|
||||
models/stable_diffusion_3/
|
||||
├── Put Stable Diffusion 3 checkpoints here.txt
|
||||
├── sd3_medium_incl_clips.safetensors
|
||||
└── sd3_medium_incl_clips_t5xxlfp16.safetensors
|
||||
```
|
||||
|
||||
使用下面的命令启动训练任务:
|
||||
|
||||
```
|
||||
CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_3/train_sd3_lora.py \
|
||||
--pretrained_path models/stable_diffusion_3/sd3_medium_incl_clips.safetensors \
|
||||
--dataset_path data/dog \
|
||||
--output_path ./models \
|
||||
--max_epochs 1 \
|
||||
--steps_per_epoch 500 \
|
||||
--height 1024 \
|
||||
--width 1024 \
|
||||
--center_crop \
|
||||
--precision "16-mixed" \
|
||||
--learning_rate 1e-4 \
|
||||
--lora_rank 4 \
|
||||
--lora_alpha 4 \
|
||||
--use_gradient_checkpointing
|
||||
```
|
||||
|
||||
有关参数的更多信息,请使用 `python examples/train/stable_diffusion_3/train_sd3_lora.py -h` 查看详细信息。
|
||||
|
||||
训练完成后,使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
|
||||
|
||||
```python
|
||||
from diffsynth import ModelManager, SD3ImagePipeline
|
||||
import torch
|
||||
|
||||
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
|
||||
file_path_list=["models/stable_diffusion_3/sd3_medium_incl_clips.safetensors"])
|
||||
model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
|
||||
pipe = SD3ImagePipeline.from_model_manager(model_manager)
|
||||
|
||||
torch.manual_seed(0)
|
||||
image = pipe(
|
||||
prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
|
||||
negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
|
||||
cfg_scale=7.5,
|
||||
num_inference_steps=100, width=1024, height=1024,
|
||||
)
|
||||
image.save("image_with_lora.jpg")
|
||||
```
|
||||
59
docs/source/finetune/train_sd_lora.md
Normal file
59
docs/source/finetune/train_sd_lora.md
Normal file
@@ -0,0 +1,59 @@
|
||||
# 训练 Stable Diffusion LoRA
|
||||
|
||||
训练脚本只需要一个文件。我们支持 [CivitAI](https://civitai.com/) 中的主流检查点。默认情况下,我们使用基础的 Stable Diffusion v1.5。你可以从 [HuggingFace](https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors) 或 [ModelScope](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-v1-5/resolve/master/v1-5-pruned-emaonly.safetensors) 下载。你可以使用以下代码下载这个文件:
|
||||
|
||||
```python
|
||||
from diffsynth import download_models
|
||||
|
||||
download_models(["StableDiffusion_v15"])
|
||||
```
|
||||
|
||||
```
|
||||
models/stable_diffusion
|
||||
├── Put Stable Diffusion checkpoints here.txt
|
||||
└── v1-5-pruned-emaonly.safetensors
|
||||
```
|
||||
|
||||
使用以下命令启动训练任务:
|
||||
|
||||
```
|
||||
CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion/train_sd_lora.py \
|
||||
--pretrained_path models/stable_diffusion/v1-5-pruned-emaonly.safetensors \
|
||||
--dataset_path data/dog \
|
||||
--output_path ./models \
|
||||
--max_epochs 1 \
|
||||
--steps_per_epoch 500 \
|
||||
--height 512 \
|
||||
--width 512 \
|
||||
--center_crop \
|
||||
--precision "16-mixed" \
|
||||
--learning_rate 1e-4 \
|
||||
--lora_rank 4 \
|
||||
--lora_alpha 4 \
|
||||
--use_gradient_checkpointing
|
||||
```
|
||||
|
||||
有关参数的更多信息,请使用 `python examples/train/stable_diffusion/train_sd_lora.py -h` 查看详细信息。
|
||||
|
||||
训练完成后,使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
|
||||
|
||||
|
||||
|
||||
```python
|
||||
from diffsynth import ModelManager, SDImagePipeline
|
||||
import torch
|
||||
|
||||
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
|
||||
file_path_list=["models/stable_diffusion/v1-5-pruned-emaonly.safetensors"])
|
||||
model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
|
||||
pipe = SDImagePipeline.from_model_manager(model_manager)
|
||||
|
||||
torch.manual_seed(0)
|
||||
image = pipe(
|
||||
prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
|
||||
negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
|
||||
cfg_scale=7.5,
|
||||
num_inference_steps=100, width=512, height=512,
|
||||
)
|
||||
image.save("image_with_lora.jpg")
|
||||
```
|
||||
57
docs/source/finetune/train_sdxl_lora.md
Normal file
57
docs/source/finetune/train_sdxl_lora.md
Normal file
@@ -0,0 +1,57 @@
|
||||
# 训练 Stable Diffusion XL LoRA
|
||||
|
||||
训练脚本只需要一个文件。我们支持 [CivitAI](https://civitai.com/) 中的主流检查点。默认情况下,我们使用基础的 Stable Diffusion XL。你可以从 [HuggingFace](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors) 或 [ModelScope](https://www.modelscope.cn/models/AI-ModelScope/stable-diffusion-xl-base-1.0/resolve/master/sd_xl_base_1.0.safetensors) 下载。也可以使用以下代码下载这个文件:
|
||||
|
||||
```python
|
||||
from diffsynth import download_models
|
||||
|
||||
download_models(["StableDiffusionXL_v1"])
|
||||
```
|
||||
|
||||
```
|
||||
models/stable_diffusion_xl
|
||||
├── Put Stable Diffusion XL checkpoints here.txt
|
||||
└── sd_xl_base_1.0.safetensors
|
||||
```
|
||||
|
||||
我们观察到 Stable Diffusion XL 在 float16 精度下会出现数值精度溢出,因此我们建议用户使用 float32 精度训练,使用以下命令启动训练任务:
|
||||
|
||||
```
|
||||
CUDA_VISIBLE_DEVICES="0" python examples/train/stable_diffusion_xl/train_sdxl_lora.py \
|
||||
--pretrained_path models/stable_diffusion_xl/sd_xl_base_1.0.safetensors \
|
||||
--dataset_path data/dog \
|
||||
--output_path ./models \
|
||||
--max_epochs 1 \
|
||||
--steps_per_epoch 500 \
|
||||
--height 1024 \
|
||||
--width 1024 \
|
||||
--center_crop \
|
||||
--precision "32" \
|
||||
--learning_rate 1e-4 \
|
||||
--lora_rank 4 \
|
||||
--lora_alpha 4 \
|
||||
--use_gradient_checkpointing
|
||||
```
|
||||
|
||||
有关参数的更多信息,请使用 `python examples/train/stable_diffusion_xl/train_sdxl_lora.py -h` 查看详细信息。
|
||||
|
||||
训练完成后,使用 `model_manager.load_lora` 加载 LoRA 以进行推理。
|
||||
|
||||
```python
|
||||
from diffsynth import ModelManager, SDXLImagePipeline
|
||||
import torch
|
||||
|
||||
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
|
||||
file_path_list=["models/stable_diffusion_xl/sd_xl_base_1.0.safetensors"])
|
||||
model_manager.load_lora("models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt", lora_alpha=1.0)
|
||||
pipe = SDXLImagePipeline.from_model_manager(model_manager)
|
||||
|
||||
torch.manual_seed(0)
|
||||
image = pipe(
|
||||
prompt="a dog is jumping, flowers around the dog, the background is mountains and clouds",
|
||||
negative_prompt="bad quality, poor quality, doll, disfigured, jpg, toy, bad anatomy, missing limbs, missing fingers, 3d, cgi, extra tails",
|
||||
cfg_scale=7.5,
|
||||
num_inference_steps=100, width=1024, height=1024,
|
||||
)
|
||||
image.save("image_with_lora.jpg")
|
||||
```
|
||||
44
docs/source/index.rst
Normal file
44
docs/source/index.rst
Normal file
@@ -0,0 +1,44 @@
|
||||
.. DiffSynth-Studio documentation master file, created by
|
||||
sphinx-quickstart on Thu Sep 5 16:39:24 2024.
|
||||
You can adapt this file completely to your liking, but it should at least
|
||||
contain the root `toctree` directive.
|
||||
|
||||
DiffSynth-Studio 文档
|
||||
==============================
|
||||
|
||||
欢迎来到 DiffSynth-Studio,我们旨在构建 Diffusion 模型的开源互联生态,在这里,你可以体验到 AIGC(AI Generated Content)技术魔法般的魅力!
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: 快速开始
|
||||
|
||||
tutorial/ASimpleExample.md
|
||||
tutorial/Installation.md
|
||||
tutorial/DownloadModels.md
|
||||
tutorial/Models.md
|
||||
tutorial/Pipelines.md
|
||||
tutorial/PromptProcessing.md
|
||||
tutorial/Extensions.md
|
||||
tutorial/Schedulers.md
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: 开启创作之旅
|
||||
|
||||
creating/BasicImageSynthesis.md
|
||||
creating/AdaptersForImageSynthesis.md
|
||||
creating/ToonShading.md
|
||||
creating/PromptRefine.md
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: 微调
|
||||
|
||||
finetune/overview.md
|
||||
finetune/train_flux_lora.md
|
||||
finetune/train_kolors_lora.md
|
||||
finetune/train_sd3_lora.md
|
||||
finetune/train_hunyuan_dit_lora.md
|
||||
finetune/train_sdxl_lora.md
|
||||
finetune/train_sd_lora.md
|
||||
|
||||
4
docs/source/requirement.txt
Normal file
4
docs/source/requirement.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
recommonmark
|
||||
sphinx_rtd_theme
|
||||
myst-parser
|
||||
sphinx-markdown-tables
|
||||
85
docs/source/tutorial/ASimpleExample.md
Normal file
85
docs/source/tutorial/ASimpleExample.md
Normal file
@@ -0,0 +1,85 @@
|
||||
# 快速开始
|
||||
|
||||
在这篇文档中,我们通过一段代码为你介绍如何快速上手使用 DiffSynth-Studio 进行创作。
|
||||
|
||||
## 安装
|
||||
|
||||
使用以下命令从 GitHub 克隆并安装 DiffSynth-Studio。更多信息请参考[安装](./Installation.md)。
|
||||
|
||||
```shell
|
||||
git clone https://github.com/modelscope/DiffSynth-Studio.git
|
||||
cd DiffSynth-Studio
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
## 一键运行!
|
||||
|
||||
通过运行以下代码,我们将会下载模型、加载模型、生成图像。
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffsynth import ModelManager, FluxImagePipeline
|
||||
|
||||
model_manager = ModelManager(
|
||||
torch_dtype=torch.bfloat16,
|
||||
device="cuda",
|
||||
model_id_list=["FLUX.1-dev"]
|
||||
)
|
||||
pipe = FluxImagePipeline.from_model_manager(model_manager)
|
||||
|
||||
torch.manual_seed(0)
|
||||
image = pipe(
|
||||
prompt="In a forest, a wooden plank sign reading DiffSynth",
|
||||
height=576, width=1024,
|
||||
)
|
||||
image.save("image.jpg")
|
||||
```
|
||||
|
||||

|
||||
|
||||
从这个例子中,我们可以看到,DiffSynth 中有两个关键模块:`ModelManager` 和 `Pipeline`,接下来我们详细介绍。
|
||||
|
||||
## 下载和加载模型
|
||||
|
||||
`ModelManager` 负责下载和加载模型,通过以下代码可以直接一步完成。
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffsynth import ModelManager
|
||||
|
||||
model_manager = ModelManager(
|
||||
torch_dtype=torch.bfloat16,
|
||||
device="cuda",
|
||||
model_id_list=["FLUX.1-dev"]
|
||||
)
|
||||
```
|
||||
|
||||
当然,我们也支持分步完成,以下代码和上述代码的行为是等价的。
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffsynth import download_models, ModelManager
|
||||
|
||||
download_models(["FLUX.1-dev"])
|
||||
model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cuda")
|
||||
model_manager.load_models([
|
||||
"models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
|
||||
"models/FLUX/FLUX.1-dev/text_encoder_2",
|
||||
"models/FLUX/FLUX.1-dev/ae.safetensors",
|
||||
"models/FLUX/FLUX.1-dev/flux1-dev.safetensors"
|
||||
])
|
||||
```
|
||||
|
||||
下载模型时,我们支持从 [ModelScope](https://www.modelscope.cn/) 和 [HuggingFace](https://huggingface.co/) 下载模型,也支持下载非预置的模型,关于模型下载的更多信息请参考[模型下载](./DownloadModels.md)。
|
||||
|
||||
加载模型时,你可以把所有想要加载的模型路径放入其中。对于 `.safetensors` 等格式的模型权重文件,`ModelManager` 在加载后会自动判断模型类型;对于文件夹格式的模型,`ModelManager` 会尝试解析其中的 `config.json` 文件并尝试调用 `transformers` 等第三方库中的对应模块。关于 DiffSynth-Studio 支持的模型,请参考[支持的模型](./Models.md)。
|
||||
|
||||
## 构建 Pipeline
|
||||
|
||||
DiffSynth-Studio 提供了多个推理 `Pipeline`,这些 `Pipeline` 可以直接通过 `ModelManager` 获取所需的模型并初始化。例如,FLUX.1-dev 模型的文生图 `Pipeline` 可以这样构建:
|
||||
|
||||
```python
|
||||
pipe = FluxImagePipeline.from_model_manager(model_manager)
|
||||
```
|
||||
|
||||
更多用于图像生成和视频生成的 `Pipeline` 详见[推理流水线](./Pipelines.md)。
|
||||
34
docs/source/tutorial/DownloadModels.md
Normal file
34
docs/source/tutorial/DownloadModels.md
Normal file
@@ -0,0 +1,34 @@
|
||||
# 下载模型
|
||||
|
||||
我们在 DiffSynth-Studio 中预置了一些主流 Diffusion 模型的下载链接,你可以下载并使用这些模型。
|
||||
|
||||
## 下载预置模型
|
||||
|
||||
你可以直接使用 `download_models` 函数下载预置的模型文件,其中模型 ID 可参考 [config file](/diffsynth/configs/model_config.py)。
|
||||
|
||||
```python
|
||||
from diffsynth import download_models
|
||||
|
||||
download_models(["FLUX.1-dev"])
|
||||
```
|
||||
|
||||
对于 VSCode 用户,激活 Pylance 或其他 Python 语言服务后,在代码中输入 `""` 即可显示支持的所有模型 ID。
|
||||
|
||||

|
||||
|
||||
## 下载非预置模型
|
||||
|
||||
你可以选择 [ModelScope](https://modelscope.cn/models) 和 [HuggingFace](https://huggingface.co/models) 两个下载源中的模型。当然,你也可以通过浏览器等工具选择手动下载自己所需的模型。
|
||||
|
||||
```python
|
||||
from diffsynth import download_customized_models
|
||||
|
||||
download_customized_models(
|
||||
model_id="Kwai-Kolors/Kolors",
|
||||
origin_file_path="vae/diffusion_pytorch_model.fp16.bin",
|
||||
local_dir="models/kolors/Kolors/vae",
|
||||
downloading_priority=["ModelScope", "HuggingFace"]
|
||||
)
|
||||
```
|
||||
|
||||
在这段代码中,我们将会按照下载的优先级,优先从 `ModelScope` 下载,在 ID 为 `Kwai-Kolors/Kolors` 的[模型库](https://modelscope.cn/models/Kwai-Kolors/Kolors)中,把文件 `vae/diffusion_pytorch_model.fp16.bin` 下载到本地的路径 `models/kolors/Kolors/vae` 中。
|
||||
49
docs/source/tutorial/Extensions.md
Normal file
49
docs/source/tutorial/Extensions.md
Normal file
@@ -0,0 +1,49 @@
|
||||
# 扩展功能
|
||||
|
||||
本文档介绍了一些在 DiffSynth 实现的 Diffusion 模型之外的相关技术,这些模型在图像和视频处理方面具有显著的应用潜力。
|
||||
|
||||
- **[RIFE](https://github.com/hzwer/ECCV2022-RIFE)**:RIFE 是一个基于实时中间流估计的帧插值方法。采用 IFNet 结构的模型,能够以很快的速度端到端估计中间流。RIFE 不依赖于预训练的光流模型,能够支持任意时间步的帧插值,通过时间编码输入进行处理。
|
||||
|
||||
在这段代码中,我们用 RIFE 模型把视频的帧数提升到原来的两倍。
|
||||
|
||||
```python
|
||||
from diffsynth import VideoData, ModelManager, save_video
|
||||
from diffsynth.extensions.RIFE import RIFEInterpolater
|
||||
|
||||
model_manager = ModelManager(model_id_list=["RIFE"])
|
||||
rife = RIFEInterpolater.from_model_manager(model_manager)
|
||||
video = VideoData("input_video.mp4", height=512, width=768).raw_data()
|
||||
video = rife.interpolate(video)
|
||||
save_video(video, "output_video.mp4", fps=60)
|
||||
```
|
||||
|
||||
- **[ESRGAN](https://github.com/xinntao/ESRGAN)**: ESRGAN 是一个图像超分辨率模型,能够实现四倍的分辨率提升。该方法通过优化网络架构、对抗损失和感知损失,显著提升了生成图像的真实感。
|
||||
|
||||
在这段代码中,我们用 ESRGAN 模型把图像分辨率提升到原来的四倍。
|
||||
|
||||
```python
|
||||
from PIL import Image
|
||||
from diffsynth import ModelManager
|
||||
from diffsynth.extensions.ESRGAN import ESRGAN
|
||||
|
||||
model_manager = ModelManager(model_id_list=["ESRGAN_x4"])
|
||||
rife = ESRGAN.from_model_manager(model_manager)
|
||||
image = Image.open("input_image.jpg")
|
||||
image = rife.upscale(image)
|
||||
image.save("output_image.jpg")
|
||||
```
|
||||
|
||||
- **[FastBlend](https://arxiv.org/abs/2311.09265)**: FastBlend 不依赖模型的视频去闪烁算法,在使用图像生成模型逐帧处理过的视频(风格视频)中,通常会出现闪烁问题,FastBlend 则可以根据原视频(引导视频)中的运动特征,消除风格视频中的闪烁。
|
||||
|
||||
在这段代码中,我们用 FastBlend 把风格视频中的闪烁效果删除。
|
||||
|
||||
```python
|
||||
from diffsynth import VideoData, save_video
|
||||
from diffsynth.extensions.FastBlend import FastBlendSmoother
|
||||
|
||||
fastblend = FastBlendSmoother()
|
||||
guide_video = VideoData("guide_video.mp4", height=512, width=768).raw_data()
|
||||
style_video = VideoData("style_video.mp4", height=512, width=768).raw_data()
|
||||
output_video = fastblend(style_video, original_frames=guide_video)
|
||||
save_video(output_video, "output_video.mp4", fps=30)
|
||||
```
|
||||
26
docs/source/tutorial/Installation.md
Normal file
26
docs/source/tutorial/Installation.md
Normal file
@@ -0,0 +1,26 @@
|
||||
# 安装
|
||||
|
||||
目前,DiffSynth-Studio 支持从 GitHub 克隆安装或使用 pip 安装,我们建议用户从 GitHub 克隆安装,从而体验最新的功能。
|
||||
|
||||
## 从源码下载
|
||||
|
||||
1. 克隆源码仓库:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/modelscope/DiffSynth-Studio.git
|
||||
```
|
||||
|
||||
2. 进入项目目录并安装:
|
||||
|
||||
```bash
|
||||
cd DiffSynth-Studio
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
## 使用 PyPI 下载
|
||||
|
||||
直接通过 PyPI 安装(功能更新存在延后):
|
||||
|
||||
```bash
|
||||
pip install diffsynth
|
||||
```
|
||||
18
docs/source/tutorial/Models.md
Normal file
18
docs/source/tutorial/Models.md
Normal file
@@ -0,0 +1,18 @@
|
||||
# 模型
|
||||
|
||||
目前为止,DiffSynth Studio 支持的模型如下所示:
|
||||
|
||||
* [CogVideoX](https://huggingface.co/THUDM/CogVideoX-5b)
|
||||
* [FLUX](https://huggingface.co/black-forest-labs/FLUX.1-dev)
|
||||
* [ExVideo](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1)
|
||||
* [Kolors](https://huggingface.co/Kwai-Kolors/Kolors)
|
||||
* [Stable Diffusion 3](https://huggingface.co/stabilityai/stable-diffusion-3-medium)
|
||||
* [Stable Video Diffusion](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt)
|
||||
* [Hunyuan-DiT](https://github.com/Tencent/HunyuanDiT)
|
||||
* [RIFE](https://github.com/hzwer/ECCV2022-RIFE)
|
||||
* [ESRGAN](https://github.com/xinntao/ESRGAN)
|
||||
* [Ip-Adapter](https://github.com/tencent-ailab/IP-Adapter)
|
||||
* [AnimateDiff](https://github.com/guoyww/animatediff/)
|
||||
* [ControlNet](https://github.com/lllyasviel/ControlNet)
|
||||
* [Stable Diffusion XL](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
|
||||
* [Stable Diffusion](https://huggingface.co/runwayml/stable-diffusion-v1-5)
|
||||
22
docs/source/tutorial/Pipelines.md
Normal file
22
docs/source/tutorial/Pipelines.md
Normal file
@@ -0,0 +1,22 @@
|
||||
# 流水线
|
||||
|
||||
DiffSynth-Studio 中包括多个流水线,分为图像生成和视频生成两类。
|
||||
|
||||
## 图像生成流水线
|
||||
|
||||
| Pipeline | Models |
|
||||
|----------------------------|----------------------------------------------------------------|
|
||||
| SDImagePipeline | text_encoder: SDTextEncoder<br>unet: SDUNet<br>vae_decoder: SDVAEDecoder<br>vae_encoder: SDVAEEncoder<br>controlnet: MultiControlNetManager<br>ipadapter_image_encoder: IpAdapterCLIPImageEmbedder<br>ipadapter: SDIpAdapter |
|
||||
| SDXLImagePipeline | text_encoder: SDXLTextEncoder<br>text_encoder_2: SDXLTextEncoder2<br>text_encoder_kolors: ChatGLMModel<br>unet: SDXLUNet<br>vae_decoder: SDXLVAEDecoder<br>vae_encoder: SDXLVAEEncoder<br>controlnet: MultiControlNetManager<br>ipadapter_image_encoder: IpAdapterXLCLIPImageEmbedder<br>ipadapter: SDXLIpAdapter |
|
||||
| SD3ImagePipeline | text_encoder_1: SD3TextEncoder1<br>text_encoder_2: SD3TextEncoder2<br>text_encoder_3: SD3TextEncoder3<br>dit: SD3DiT<br>vae_decoder: SD3VAEDecoder<br>vae_encoder: SD3VAEEncoder |
|
||||
| HunyuanDiTImagePipeline | text_encoder: HunyuanDiTCLIPTextEncoder<br>text_encoder_t5: HunyuanDiTT5TextEncoder<br>dit: HunyuanDiT<br>vae_decoder: SDVAEDecoder<br>vae_encoder: SDVAEEncoder |
|
||||
| FluxImagePipeline | text_encoder_1: FluxTextEncoder1<br>text_encoder_2: FluxTextEncoder2<br>dit: FluxDiT<br>vae_decoder: FluxVAEDecoder<br>vae_encoder: FluxVAEEncoder |
|
||||
|
||||
## 视频生成流水线
|
||||
|
||||
| Pipeline | Models |
|
||||
|----------------------------|----------------------------------------------------------------|
|
||||
| SDVideoPipeline | text_encoder: SDTextEncoder<br>unet: SDUNet<br>vae_decoder: SDVAEDecoder<br>vae_encoder: SDVAEEncoder<br>controlnet: MultiControlNetManager<br>ipadapter_image_encoder: IpAdapterCLIPImageEmbedder<br>ipadapter: SDIpAdapter<br>motion_modules: SDMotionModel |
|
||||
| SDXLVideoPipeline | text_encoder: SDXLTextEncoder<br>text_encoder_2: SDXLTextEncoder2<br>text_encoder_kolors: ChatGLMModel<br>unet: SDXLUNet<br>vae_decoder: SDXLVAEDecoder<br>vae_encoder: SDXLVAEEncoder<br>ipadapter_image_encoder: IpAdapterXLCLIPImageEmbedder<br>ipadapter: SDXLIpAdapter<br>motion_modules: SDXLMotionModel |
|
||||
| SVDVideoPipeline | image_encoder: SVDImageEncoder<br>unet: SVDUNet<br>vae_encoder: SVDVAEEncoder<br>vae_decoder: SVDVAEDecoder |
|
||||
| CogVideoPipeline | text_encoder: FluxTextEncoder2<br>dit: CogDiT<br>vae_encoder: CogVAEEncoder<br>vae_decoder: CogVAEDecoder |
|
||||
37
docs/source/tutorial/PromptProcessing.md
Normal file
37
docs/source/tutorial/PromptProcessing.md
Normal file
@@ -0,0 +1,37 @@
|
||||
# 提示词处理
|
||||
|
||||
DiffSynth 内置了提示词处理功能,分为:
|
||||
|
||||
- **提示词润色器(`prompt_refiner_classes`)**:包括提示词润色、提示词中译英、提示词同时润色与中译英,可选参数如下:
|
||||
|
||||
- **英文提示词润色**:'BeautifulPrompt',使用到的是[pai-bloom-1b1-text2prompt-sd](https://modelscope.cn/models/AI-ModelScope/pai-bloom-1b1-text2prompt-sd)。
|
||||
|
||||
- **提示词中译英**:'Translator',使用到的是[opus-mt-zh-en](https://modelscope.cn/models/moxying/opus-mt-zh-en)。
|
||||
|
||||
- **提示词中译英并润色**:'QwenPrompt',使用到的是[Qwen2-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2-1.5B-Instruct)。
|
||||
|
||||
- **提示词扩展器(`prompt_extender_classes`)**:基于Omost的提示词分区控制扩写,可选参数如下:
|
||||
|
||||
- **提示词分区扩写**:'OmostPromter'。
|
||||
|
||||
|
||||
## 使用说明
|
||||
|
||||
### 提示词润色器
|
||||
|
||||
在加载模型 Pipeline 时,可以通过参数 `prompt_refiner_classes` 指定所需的提示词润色器功能。有关示例代码,请参考 [sd_prompt_refining.py](examples/image_synthesis/sd_prompt_refining.py)。
|
||||
|
||||
可选的 `prompt_refiner_classes` 参数包括:Translator、BeautifulPrompt、QwenPrompt。
|
||||
|
||||
```python
|
||||
pipe = SDXLImagePipeline.from_model_manager(model_manager, prompt_refiner_classes=[Translator, BeautifulPrompt])
|
||||
```
|
||||
|
||||
### 提示词扩展器
|
||||
|
||||
在加载模型 Pipeline 时,可以通过参数 `prompt_extender_classes` 指定所需的提示词扩展器。有关示例代码,请参考 [omost_flux_text_to_image.py](examples/image_synthesis/omost_flux_text_to_image.py)。
|
||||
|
||||
```python
|
||||
pipe = FluxImagePipeline.from_model_manager(model_manager, prompt_extender_classes=[OmostPromter])
|
||||
```
|
||||
|
||||
11
docs/source/tutorial/Schedulers.md
Normal file
11
docs/source/tutorial/Schedulers.md
Normal file
@@ -0,0 +1,11 @@
|
||||
# 调度器
|
||||
|
||||
调度器(Scheduler)控制模型的整个去噪(或采样)过程。在加载 Pipeline 时,DiffSynth 会自动选择最适合当前 Pipeline 的调度器,**无需额外配置**。
|
||||
|
||||
我们支持的调度器包括:
|
||||
|
||||
- **EnhancedDDIMScheduler**:扩展了去噪扩散概率模型(DDPM)中的去噪过程,引入了非马尔可夫的扩散过程,从而支持更快的确定性采样。
|
||||
|
||||
- **FlowMatchScheduler**:实现了 [Stable Diffusion 3](https://arxiv.org/abs/2403.03206) 中提出的流匹配(Flow Matching)采样方法。
|
||||
|
||||
- **ContinuousODEScheduler**:基于常微分方程(ODE)的调度器。
|
||||
Reference in New Issue
Block a user