Mirror of https://github.com/modelscope/DiffSynth-Studio.git (synced 2026-03-18 22:08:13 +00:00)

@@ -30,7 +30,7 @@ class ModelConfig:
             raise ValueError(f"""No valid model files. Please use `ModelConfig(path="xxx")` or `ModelConfig(model_id="xxx/yyy", origin_file_pattern="zzz")`. `skip_download=True` only supports the first one.""")
 
     def parse_original_file_pattern(self):
-        if self.origin_file_pattern is None or self.origin_file_pattern == "":
+        if self.origin_file_pattern in [None, "", "./"]:
             return "*"
         elif self.origin_file_pattern.endswith("/"):
             return self.origin_file_pattern + "*"

@@ -99,7 +99,7 @@ class ModelConfig:
         if self.require_downloading():
             self.download()
         if self.path is None:
-            if self.origin_file_pattern is None or self.origin_file_pattern == "":
+            if self.origin_file_pattern in [None, "", "./"]:
                 self.path = os.path.join(self.local_model_path, self.model_id)
             else:
                 self.path = glob.glob(os.path.join(self.local_model_path, self.model_id, self.origin_file_pattern))

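Both hunks above make `origin_file_pattern="./"` behave the same as `""` and `None`, in pattern parsing and in path resolution. A minimal standalone sketch of the normalization (the free function here is illustrative; in DiffSynth-Studio the logic lives on `ModelConfig`):

```python
# Illustrative re-statement of the normalization this commit introduces.
def parse_file_pattern(origin_file_pattern):
    if origin_file_pattern in [None, "", "./"]:
        return "*"                        # whole repository
    elif origin_file_pattern.endswith("/"):
        return origin_file_pattern + "*"  # everything under a folder
    return origin_file_pattern            # literal glob pattern

for p in [None, "", "./", "vae/", "model.safetensors"]:
    print(repr(p), "->", parse_file_pattern(p))
```
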
@@ -356,7 +356,7 @@ class AAATrainingModule(DiffusionTrainingModule):
                 ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="model.safetensors"),
                 ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
             ],
-            tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern=""),
+            tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="./"),
         )
         self.pipe.dit = AAADiT().to(dtype=torch.bfloat16, device=device)
         self.pipe.freeze_except(["dit"])

@@ -424,7 +424,7 @@ pipe = AAAImagePipeline.from_pretrained(
         ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="model.safetensors"),
         ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
     ],
-    tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern=""),
+    tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="./"),
 )
 pipe.dit = load_model(AAADiT, "models/DiffSynth-Studio/AAAMyModel/step-600000.safetensors", torch_dtype=torch.bfloat16, device="cuda")
 ```

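Once the pipeline and the custom DiT are assembled as in this hunk, generation goes through the pipeline's call interface. A hedged usage sketch (the prompt, seed, and output path are placeholders; the accepted kwargs depend on how `AAAImagePipeline` is defined):

```python
# Hypothetical invocation; adjust kwargs to whatever AAAImagePipeline accepts.
image = pipe(prompt="a photo of a cat", seed=0)
image.save("image.jpg")
```
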
@@ -295,7 +295,7 @@ class AAATrainingModule(DiffusionTrainingModule):
                 ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="model.safetensors"),
                 ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
             ],
-            tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern=""),
+            tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="./"),
         )
         self.pipe.dit = AAADiT().to(dtype=torch.bfloat16, device=device)
         self.pipe.freeze_except(["dit"])

@@ -6,7 +6,7 @@ This document introduces the basic principles of Diffusion models to help you un
 
 Diffusion models generate clear images or video content through iterative denoising. We start by explaining the generation process of a data sample $x_0$. Intuitively, in a complete round of denoising, we start from random Gaussian noise $x_T$ and iteratively obtain $x_{T-1}$, $x_{T-2}$, $x_{T-3}$, $\cdots$, gradually reducing the noise content at each step until we finally obtain the noise-free data sample $x_0$.
 
-(Figure)
+
 
 This process is intuitive, but to understand the details, we need to answer several questions:
 

@@ -28,7 +28,7 @@ As for the intermediate values $\sigma_{T-1}$, $\sigma_{T-2}$, $\cdots$, $\sigma
 
 At an intermediate step, we can directly synthesize noisy data samples $x_t=(1-\sigma_t)x_0+\sigma_t x_T$.
 
-(Figure)
+
 
 ## How is the iterative denoising computation performed?
 

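The interpolation in this hunk is easy to state in code. A minimal sketch, assuming a linear schedule $\sigma_t = t/T$ (real models use their own schedules) and random tensors standing in for the data:

```python
import torch

T = 1000                       # total number of diffusion steps
t = 700                        # an intermediate time step
sigma_t = t / T                # linear schedule, assumed for illustration

x_0 = torch.randn(3, 64, 64)   # stand-in for a clean data sample
x_T = torch.randn(3, 64, 64)   # pure Gaussian noise
x_t = (1 - sigma_t) * x_0 + sigma_t * x_T   # noisy sample at step t
```
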
@@ -40,8 +40,6 @@ Before understanding the iterative denoising computation, we need to clarify wha
 
 Among these, the guidance condition $c$ is a newly introduced parameter that is input by the user. It can be text describing the image content or a sketch outlining the image structure.
 
-(Figure)
-
 The model's output $\hat \epsilon(x_t,c,t)$ approximately equals $x_T-x_0$, which is the direction of the entire diffusion process (the reverse process of denoising).
 
 Next, we analyze the computation occurring in one iteration. At time step $t$, after the model computes an approximation of $x_T-x_0$, we calculate the next $x_{t-1}$:

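The update formula itself lies outside this hunk, but it follows from the definitions here: since $x_t=(1-\sigma_t)x_0+\sigma_t x_T$ and $\hat\epsilon(x_t,c,t)\approx x_T-x_0$, moving from noise level $\sigma_t$ to $\sigma_{t-1}$ gives $x_{t-1}=x_t-(\sigma_t-\sigma_{t-1})\,\hat\epsilon(x_t,c,t)$. A sketch under that assumption, with `model` a placeholder for the trained denoiser:

```python
import torch

def denoise_step(model, x_t, c, t, sigma_t, sigma_prev):
    eps_hat = model(x_t, c, t)                 # predicted x_T - x_0
    return x_t - (sigma_t - sigma_prev) * eps_hat

def sample(model, c, sigmas, shape):
    x = torch.randn(shape)                     # start from pure noise x_T
    for t in range(len(sigmas) - 1, 0, -1):    # iterate t = T, ..., 1
        x = denoise_step(model, x, c, t, sigmas[t], sigmas[t - 1])
    return x                                   # approximately x_0

# Smoke test with a placeholder denoiser that predicts zeros.
dummy = lambda x, c, t: torch.zeros_like(x)
sigmas = torch.linspace(0.0, 1.0, 11)          # sigma_0 = 0, ..., sigma_T = 1
print(sample(dummy, None, sigmas, (1, 3, 8, 8)).shape)
```
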
@@ -91,8 +89,6 @@ After understanding the iterative denoising process, we next consider how to tra
 
 The training process differs from the generation process. If we retained the multi-step iteration during training, gradients would have to backpropagate through every step, bringing catastrophic time and space complexity. To improve computational efficiency, we randomly select a single time step $t$ for each training step.
 
-(Figure)
-
 The following is pseudocode for the training process:
 
 > Obtain data sample $x_0$ and guidance condition $c$ from the dataset

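Each line of the pseudocode maps directly onto a line of code. A minimal training-step sketch under the same linear-schedule assumption as before (`model`, `x_0`, and `c` are placeholders):

```python
import torch
import torch.nn.functional as F

def training_step(model, x_0, c, T=1000):
    t = torch.randint(1, T + 1, ()).item()      # randomly select a time step
    sigma_t = t / T                             # assumed linear schedule
    x_T = torch.randn_like(x_0)                 # sample Gaussian noise
    x_t = (1 - sigma_t) * x_0 + sigma_t * x_T   # synthesize the noisy sample
    eps_hat = model(x_t, c, t)                  # model output
    return F.mse_loss(eps_hat, x_T - x_0)       # regress the diffusion direction
```
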
@@ -113,7 +109,7 @@ The following is pseudocode for the training process:
 
 From theory to practice, more details need to be filled in. Modern Diffusion model architectures have matured; mainstream designs follow the "three-stage" architecture proposed by Latent Diffusion, comprising a data encoder-decoder, a guidance condition encoder, and a denoising model.
 
-(Figure)
+
 
 ### Data Encoder-Decoder
 

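The three stages compose in a straightforward way: encode the condition once, denoise in latent space, then decode. A schematic sketch (all class and method names are placeholders, not DiffSynth-Studio API):

```python
class LatentDiffusionSystem:
    """Illustrative wiring of the three-stage architecture."""
    def __init__(self, vae, text_encoder, denoiser):
        self.vae = vae                     # data encoder-decoder
        self.text_encoder = text_encoder   # guidance condition encoder
        self.denoiser = denoiser           # denoising model

    def generate(self, prompt, z_T, sigmas):
        c = self.text_encoder(prompt)              # encode the condition once
        z = z_T                                    # latent-space noise
        for t in range(len(sigmas) - 1, 0, -1):    # iterative denoising
            eps = self.denoiser(z, c, t)
            z = z - (sigmas[t] - sigmas[t - 1]) * eps
        return self.vae.decode(z)                  # decode latents to pixels
```
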
@@ -357,7 +357,7 @@ class AAATrainingModule(DiffusionTrainingModule):
                 ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="model.safetensors"),
                 ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
             ],
-            tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern=""),
+            tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="./"),
         )
         self.pipe.dit = AAADiT().to(dtype=torch.bfloat16, device=device)
         self.pipe.freeze_except(["dit"])

@@ -425,7 +425,7 @@ pipe = AAAImagePipeline.from_pretrained(
         ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="model.safetensors"),
         ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
     ],
-    tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern=""),
+    tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="./"),
 )
 pipe.dit = load_model(AAADiT, "models/DiffSynth-Studio/AAAMyModel/step-600000.safetensors", torch_dtype=torch.bfloat16, device="cuda")
 ```

@@ -295,7 +295,7 @@ class AAATrainingModule(DiffusionTrainingModule):
                 ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="model.safetensors"),
                 ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
             ],
-            tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern=""),
+            tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="./"),
         )
         self.pipe.dit = AAADiT().to(dtype=torch.bfloat16, device=device)
         self.pipe.freeze_except(["dit"])

@@ -6,7 +6,7 @@
 
 Diffusion models generate clear images or video content through multi-step iterative denoising. We begin with the generation process of a single data sample $x_0$. Intuitively, in one complete round of denoising, we start from random Gaussian noise $x_T$ and obtain $x_{T-1}$, $x_{T-2}$, $x_{T-3}$, $\cdots$ step by step, gradually reducing the noise content until we finally obtain the noise-free data sample $x_0$.
 
-(Figure)
+
 
 This process is intuitive, but to understand its details we need to answer a few questions:
 

@@ -28,7 +28,7 @@ Diffusion models generate clear images or video content through multi-step iter
 
 Then at an intermediate step, we can directly synthesize the noisy data sample $x_t=(1-\sigma_t)x_0+\sigma_t x_T$.
 
-(Figure)
+
 
 ## How is the iterative denoising computation performed?
 

@@ -40,8 +40,6 @@ Diffusion models generate clear images or video content through multi-step iter
 
 Among these, the guidance condition $c$ is a newly introduced parameter supplied by the user; it can be text describing the image content or a line sketch outlining the image structure.
 
-(Figure)
-
 The model's output $\hat \epsilon(x_t,c,t)$ is approximately equal to $x_T-x_0$, i.e. the direction of the entire diffusion process (the reverse of the denoising process).
 
 Next we analyze the computation within one iteration. At time step $t$, after the model computes an approximation of $x_T-x_0$, we compute the next $x_{t-1}$:

@@ -89,8 +87,6 @@ $$
 
 The training process differs from the generation process: if we kept the multi-step iteration during training, gradients would have to backpropagate through many steps, at catastrophic cost in time and space complexity. To improve computational efficiency, we randomly select a single time step $t$ to train on.
 
-(Figure)
-
 The following is pseudocode for the training process:
 
 > Obtain a data sample $x_0$ and guidance condition $c$ from the dataset

@@ -111,7 +107,7 @@ $$
 
 From theory to practice, more details need to be filled in. Modern Diffusion model architectures have matured; mainstream designs follow the "three-stage" architecture proposed by Latent Diffusion, comprising a data encoder-decoder, a guidance condition encoder, and a denoising model.
 
-(Figure)
+
 
 ### Data Encoder-Decoder
 
