diff --git a/diffsynth/core/loader/config.py b/diffsynth/core/loader/config.py index 010483d..d4ce83c 100644 --- a/diffsynth/core/loader/config.py +++ b/diffsynth/core/loader/config.py @@ -30,7 +30,7 @@ class ModelConfig: raise ValueError(f"""No valid model files. Please use `ModelConfig(path="xxx")` or `ModelConfig(model_id="xxx/yyy", origin_file_pattern="zzz")`. `skip_download=True` only supports the first one.""") def parse_original_file_pattern(self): - if self.origin_file_pattern is None or self.origin_file_pattern == "": + if self.origin_file_pattern in [None, "", "./"]: return "*" elif self.origin_file_pattern.endswith("/"): return self.origin_file_pattern + "*" @@ -99,7 +99,7 @@ class ModelConfig: if self.require_downloading(): self.download() if self.path is None: - if self.origin_file_pattern is None or self.origin_file_pattern == "": + if self.origin_file_pattern in [None, "", "./"]: self.path = os.path.join(self.local_model_path, self.model_id) else: self.path = glob.glob(os.path.join(self.local_model_path, self.model_id, self.origin_file_pattern)) diff --git a/docs/en/Research_Tutorial/train_from_scratch.md b/docs/en/Research_Tutorial/train_from_scratch.md index 6d5ff76..2a63f82 100644 --- a/docs/en/Research_Tutorial/train_from_scratch.md +++ b/docs/en/Research_Tutorial/train_from_scratch.md @@ -356,7 +356,7 @@ class AAATrainingModule(DiffusionTrainingModule): ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="model.safetensors"), ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), ], - tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern=""), + tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="./"), ) self.pipe.dit = AAADiT().to(dtype=torch.bfloat16, device=device) self.pipe.freeze_except(["dit"]) @@ -424,7 +424,7 @@ pipe = AAAImagePipeline.from_pretrained( ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="model.safetensors"), ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), ], - tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern=""), + tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="./"), ) pipe.dit = load_model(AAADiT, "models/DiffSynth-Studio/AAAMyModel/step-600000.safetensors", torch_dtype=torch.bfloat16, device="cuda") ``` diff --git a/docs/en/Research_Tutorial/train_from_scratch.py b/docs/en/Research_Tutorial/train_from_scratch.py index 622e091..328c24d 100644 --- a/docs/en/Research_Tutorial/train_from_scratch.py +++ b/docs/en/Research_Tutorial/train_from_scratch.py @@ -295,7 +295,7 @@ class AAATrainingModule(DiffusionTrainingModule): ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="model.safetensors"), ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), ], - tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern=""), + tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="./"), ) self.pipe.dit = AAADiT().to(dtype=torch.bfloat16, device=device) self.pipe.freeze_except(["dit"]) diff --git a/docs/en/Training/Understanding_Diffusion_models.md b/docs/en/Training/Understanding_Diffusion_models.md index 5c81b6a..718df25 100644 --- a/docs/en/Training/Understanding_Diffusion_models.md +++ b/docs/en/Training/Understanding_Diffusion_models.md @@ -6,7 +6,7 @@ This document introduces the basic principles of Diffusion models to help you un Diffusion models generate clear images or video content through iterative denoising. We start by explaining the generation process of a data sample $x_0$. Intuitively, in a complete round of denoising, we start from random Gaussian noise $x_T$ and iteratively obtain $x_{T-1}$, $x_{T-2}$, $x_{T-3}$, $\cdots$, gradually reducing the noise content at each step until we finally obtain the noise-free data sample $x_0$. -(Figure) +![Image](https://github.com/user-attachments/assets/6471ae4c-a635-4924-8b36-b0bd4d42043d) This process is intuitive, but to understand the details, we need to answer several questions: @@ -28,7 +28,7 @@ As for the intermediate values $\sigma_{T-1}$, $\sigma_{T-2}$, $\cdots$, $\sigma At an intermediate step, we can directly synthesize noisy data samples $x_t=(1-\sigma_t)x_0+\sigma_t x_T$. -(Figure) +![Image](https://github.com/user-attachments/assets/e25a2f71-123c-4e18-8b34-3a066af15667) ## How is the iterative denoising computation performed? @@ -40,8 +40,6 @@ Before understanding the iterative denoising computation, we need to clarify wha Among these, the guidance condition $c$ is a newly introduced parameter that is input by the user. It can be text describing the image content or a sketch outlining the image structure. -(Figure) - The model's output $\hat \epsilon(x_t,c,t)$ approximately equals $x_T-x_0$, which is the direction of the entire diffusion process (the reverse process of denoising). Next, we analyze the computation occurring in one iteration. At time step $t$, after the model computes an approximation of $x_T-x_0$, we calculate the next $x_{t-1}$: @@ -91,8 +89,6 @@ After understanding the iterative denoising process, we next consider how to tra The training process differs from the generation process. If we retain multi-step iterations during training, the gradient would need to backpropagate through multiple steps, bringing catastrophic time and space complexity. To improve computational efficiency, we randomly select a time step $t$ for training. -(Figure) - The following is pseudocode for the training process: > Obtain data sample $x_0$ and guidance condition $c$ from the dataset @@ -113,7 +109,7 @@ The following is pseudocode for the training process: From theory to practice, more details need to be filled in. Modern Diffusion model architectures have matured, with mainstream architectures following the "three-stage" architecture proposed by Latent Diffusion, including data encoder-decoder, guidance condition encoder, and denoising model. -(Figure) +![Image](https://github.com/user-attachments/assets/43855430-6427-4aca-83a0-f684e01438b1) ### Data Encoder-Decoder diff --git a/docs/zh/Research_Tutorial/train_from_scratch.md b/docs/zh/Research_Tutorial/train_from_scratch.md index 3d5c6d0..2c620eb 100644 --- a/docs/zh/Research_Tutorial/train_from_scratch.md +++ b/docs/zh/Research_Tutorial/train_from_scratch.md @@ -357,7 +357,7 @@ class AAATrainingModule(DiffusionTrainingModule): ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="model.safetensors"), ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), ], - tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern=""), + tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="./"), ) self.pipe.dit = AAADiT().to(dtype=torch.bfloat16, device=device) self.pipe.freeze_except(["dit"]) @@ -425,7 +425,7 @@ pipe = AAAImagePipeline.from_pretrained( ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="model.safetensors"), ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), ], - tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern=""), + tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="./"), ) pipe.dit = load_model(AAADiT, "models/DiffSynth-Studio/AAAMyModel/step-600000.safetensors", torch_dtype=torch.bfloat16, device="cuda") ``` diff --git a/docs/zh/Research_Tutorial/train_from_scratch.py b/docs/zh/Research_Tutorial/train_from_scratch.py index 622e091..328c24d 100644 --- a/docs/zh/Research_Tutorial/train_from_scratch.py +++ b/docs/zh/Research_Tutorial/train_from_scratch.py @@ -295,7 +295,7 @@ class AAATrainingModule(DiffusionTrainingModule): ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="model.safetensors"), ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), ], - tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern=""), + tokenizer_config=ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="./"), ) self.pipe.dit = AAADiT().to(dtype=torch.bfloat16, device=device) self.pipe.freeze_except(["dit"]) diff --git a/docs/zh/Training/Understanding_Diffusion_models.md b/docs/zh/Training/Understanding_Diffusion_models.md index 576edc9..7613dc8 100644 --- a/docs/zh/Training/Understanding_Diffusion_models.md +++ b/docs/zh/Training/Understanding_Diffusion_models.md @@ -6,7 +6,7 @@ Diffusion 模型通过多步迭代式地去噪(denoise)生成清晰的图像或视频内容,我们从一个数据样本 $x_0$ 的生成过程开始讲起。直观地,在完整的一轮 denoise 过程中,我们从随机高斯噪声 $x_T$ 开始,通过迭代依次得到 $x_{T-1}$、$x_{T-2}$、$x_{T-3}$、$\cdots$,在每一步中逐渐减少噪声含量,最终得到不含噪声的数据样本 $x_0$。 -(图) +![Image](https://github.com/user-attachments/assets/6471ae4c-a635-4924-8b36-b0bd4d42043d) 这个过程是很直观的,但如果要理解其中的细节,我们就需要回答这几个问题: @@ -28,7 +28,7 @@ Diffusion 模型通过多步迭代式地去噪(denoise)生成清晰的图像 那么在中间的某一步,我们可以直接合成含噪声的数据样本 $x_t=(1-\sigma_t)x_0+\sigma_t x_T$。 -(图) +![Image](https://github.com/user-attachments/assets/e25a2f71-123c-4e18-8b34-3a066af15667) ## 迭代去噪的计算是如何进行的? @@ -40,8 +40,6 @@ Diffusion 模型通过多步迭代式地去噪(denoise)生成清晰的图像 其中,引导条件 $c$ 是新引入的参数,它是由用户输入的,可以是用于描述图像内容的文本,也可以是用于勾勒图像结构的线稿图。 -(图) - 而模型的输出 $\hat \epsilon(x_t,c,t)$,则近似地等于 $x_T-x_0$,也就是整个扩散过程(去噪过程的反向过程)的方向。 接下来我们分析一步迭代中发生的计算,在时间步 $t$,模型通过计算得到近似的 $x_T-x_0$ 后,我们计算下一步的 $x_{t-1}$: @@ -89,8 +87,6 @@ $$ 训练过程不同于生成过程,如果我们在训练过程中保留多步迭代,那么梯度需经过多步回传,带来的时间和空间复杂度是灾难性的。为了提高计算效率,我们在训练中随机选择某一时间步 $t$ 进行训练。 -(图) - 以下是训练过程的伪代码 > 从数据集获取数据样本 $x_0$ 和引导条件 $c$ @@ -111,7 +107,7 @@ $$ 从理论到实践,还需要填充更多细节。现代 Diffusion 模型架构已经发展成熟,主流的架构沿用了 Latent Diffusion 所提出的“三段式”架构,包括数据编解码器、引导条件编码器、去噪模型三部分。 -(图) +![Image](https://github.com/user-attachments/assets/43855430-6427-4aca-83a0-f684e01438b1) ### 数据编解码器