From 9ecb9d8fe783a5e7a6f5364c679e5aa62fb83d44 Mon Sep 17 00:00:00 2001 From: Artiprocher Date: Wed, 3 Dec 2025 19:29:18 +0800 Subject: [PATCH] update doc --- docs/en/API_Reference/core/attention.md | 79 +++ docs/en/API_Reference/core/data.md | 151 ++++++ docs/en/API_Reference/core/gradient.md | 69 +++ docs/en/API_Reference/core/loader.md | 141 ++++++ docs/en/API_Reference/core/vram.md | 66 +++ .../en/Developer_Guide/Building_a_Pipeline.md | 250 ++++++++++ .../Enabling_VRAM_management.md | 455 ++++++++++++++++++ .../Developer_Guide/Integrating_Your_Model.md | 186 +++++++ .../Training_Diffusion_Models.md | 66 +++ docs/en/Model_Details/FLUX.md | 210 ++++++++ docs/en/Model_Details/FLUX2.md | 138 ++++++ docs/en/Model_Details/Overview.md | 291 +++++++++++ docs/en/Model_Details/Qwen-Image.md | 191 ++++++++ docs/en/Model_Details/Wan.md | 253 ++++++++++ docs/en/Model_Details/Z-Image.md | 131 +++++ .../Pipeline_Usage/Environment_Variables.md | 39 ++ docs/en/Pipeline_Usage/Model_Inference.md | 105 ++++ docs/en/Pipeline_Usage/Model_Training.md | 247 ++++++++++ docs/en/Pipeline_Usage/Setup.md | 21 + docs/en/Pipeline_Usage/VRAM_management.md | 206 ++++++++ docs/en/QA.md | 28 ++ docs/en/README.md | 88 ++++ docs/en/Training/Differential_LoRA.md | 38 ++ docs/en/Training/Direct_Distill.md | 97 ++++ docs/en/Training/FP8_Precision.md | 20 + docs/en/Training/Split_Training.md | 97 ++++ docs/en/Training/Supervised_Fine_Tuning.md | 129 +++++ .../Understanding_Diffusion_models.md | 145 ++++++ docs/zh/Model_Details/Z-Image.md | 2 +- examples/dev_tools/fix_path.py | 38 +- 30 files changed, 3957 insertions(+), 20 deletions(-) create mode 100644 docs/en/API_Reference/core/attention.md create mode 100644 docs/en/API_Reference/core/data.md create mode 100644 docs/en/API_Reference/core/gradient.md create mode 100644 docs/en/API_Reference/core/loader.md create mode 100644 docs/en/API_Reference/core/vram.md create mode 100644 docs/en/Developer_Guide/Building_a_Pipeline.md create mode 100644 
docs/en/Developer_Guide/Enabling_VRAM_management.md create mode 100644 docs/en/Developer_Guide/Integrating_Your_Model.md create mode 100644 docs/en/Developer_Guide/Training_Diffusion_Models.md create mode 100644 docs/en/Model_Details/FLUX.md create mode 100644 docs/en/Model_Details/FLUX2.md create mode 100644 docs/en/Model_Details/Overview.md create mode 100644 docs/en/Model_Details/Qwen-Image.md create mode 100644 docs/en/Model_Details/Wan.md create mode 100644 docs/en/Model_Details/Z-Image.md create mode 100644 docs/en/Pipeline_Usage/Environment_Variables.md create mode 100644 docs/en/Pipeline_Usage/Model_Inference.md create mode 100644 docs/en/Pipeline_Usage/Model_Training.md create mode 100644 docs/en/Pipeline_Usage/Setup.md create mode 100644 docs/en/Pipeline_Usage/VRAM_management.md create mode 100644 docs/en/QA.md create mode 100644 docs/en/README.md create mode 100644 docs/en/Training/Differential_LoRA.md create mode 100644 docs/en/Training/Direct_Distill.md create mode 100644 docs/en/Training/FP8_Precision.md create mode 100644 docs/en/Training/Split_Training.md create mode 100644 docs/en/Training/Supervised_Fine_Tuning.md create mode 100644 docs/en/Training/Understanding_Diffusion_models.md diff --git a/docs/en/API_Reference/core/attention.md b/docs/en/API_Reference/core/attention.md new file mode 100644 index 0000000..9ec3123 --- /dev/null +++ b/docs/en/API_Reference/core/attention.md @@ -0,0 +1,79 @@ +# `diffsynth.core.attention`: Attention Mechanism Implementation + +`diffsynth.core.attention` provides routing mechanisms for attention mechanism implementations, automatically selecting efficient attention implementations based on available packages in the `Python` environment and [environment variables](/docs/en/Pipeline_Usage/Environment_Variables.md#diffsynth_attention_implementation). + +## Attention Mechanism + +The attention mechanism is a model structure proposed in the paper ["Attention Is All You Need"](https://arxiv.org/abs/1706.03762). 
In the original paper, the attention mechanism is implemented according to the following formula: + +$$ +\text{Attention}(Q, K, V) = \text{Softmax}\left( + \frac{QK^T}{\sqrt{d_k}} +\right) +V. +$$ + +In `PyTorch`, it can be implemented with the following code: +```python +import torch + +def attention(query, key, value): + scale_factor = 1 / query.size(-1)**0.5 + attn_weight = query @ key.transpose(-2, -1) * scale_factor + attn_weight = torch.softmax(attn_weight, dim=-1) + return attn_weight @ value + +query = torch.rand(32, 8, 128, 64, dtype=torch.bfloat16, device="cuda") +key = torch.rand(32, 8, 128, 64, dtype=torch.bfloat16, device="cuda") +value = torch.rand(32, 8, 128, 64, dtype=torch.bfloat16, device="cuda") +output_1 = attention(query, key, value) +``` + +The dimensions of `query`, `key`, and `value` are $(b, n, s, d)$: +* $b$: Batch size +* $n$: Number of attention heads +* $s$: Sequence length +* $d$: Dimension of each attention head + +This computation does not include any trainable parameters. Modern transformer architectures will pass through Linear layers before and after this computation, but the "attention mechanism" discussed in this article refers only to the computation in the above code, not including these calculations. + +## More Efficient Implementations + +Note that the dimension of the Attention Score in the attention mechanism ( $\text{Softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)$ in the formula, `attn_weight` in the code) is $(b, n, s, s)$, where the sequence length $s$ is typically very large, causing the time and space complexity of computation to reach quadratic level. Taking image generation models as an example, when the width and height of the image increase to 2 times, the sequence length increases to 4 times, and the computational load and memory requirements increase to 16 times. 
To avoid high computational costs, more efficient attention mechanism implementations are needed, including: +* Flash Attention 3: [GitHub](https://github.com/Dao-AILab/flash-attention), [Paper](https://arxiv.org/abs/2407.08608) +* Flash Attention 2: [GitHub](https://github.com/Dao-AILab/flash-attention), [Paper](https://arxiv.org/abs/2307.08691) +* Sage Attention: [GitHub](https://github.com/thu-ml/SageAttention), [Paper](https://arxiv.org/abs/2505.11594) +* xFormers: [GitHub](https://github.com/facebookresearch/xformers), [Documentation](https://facebookresearch.github.io/xformers/components/ops.html#module-xformers.ops) +* PyTorch: [GitHub](https://github.com/pytorch/pytorch), [Documentation](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) + +To call attention implementations other than `PyTorch`, please follow the instructions on their GitHub pages to install the corresponding packages. `DiffSynth-Studio` will automatically route to the corresponding implementation based on available packages in the Python environment, or can be controlled through [environment variables](/docs/en/Pipeline_Usage/Environment_Variables.md#diffsynth_attention_implementation). + +```python +from diffsynth.core.attention import attention_forward +import torch + +def attention(query, key, value): + scale_factor = 1 / query.size(-1)**0.5 + attn_weight = query @ key.transpose(-2, -1) * scale_factor + attn_weight = torch.softmax(attn_weight, dim=-1) + return attn_weight @ value + +query = torch.rand(32, 8, 128, 64, dtype=torch.bfloat16, device="cuda") +key = torch.rand(32, 8, 128, 64, dtype=torch.bfloat16, device="cuda") +value = torch.rand(32, 8, 128, 64, dtype=torch.bfloat16, device="cuda") +output_1 = attention(query, key, value) +output_2 = attention_forward(query, key, value) +print((output_1 - output_2).abs().mean()) +``` + +Please note that acceleration will introduce errors, but in most cases, the error is negligible. 
+ +## Developer Guide + +When integrating new models into `DiffSynth-Studio`, developers can decide whether to call `attention_forward` in `diffsynth.core.attention`, but we expect models to prioritize calling this module as much as possible, so that new attention mechanism implementations can take effect directly on these models. + +## Best Practices + +**In most cases, we recommend directly using the native `PyTorch` implementation without installing any additional packages.** Although other attention mechanism implementations can accelerate computation, the acceleration effect is relatively limited, and in a few cases, compatibility and precision issues may arise. + +In addition, efficient attention mechanism implementations will gradually be integrated into `PyTorch`. The `scaled_dot_product_attention` in `PyTorch` version 2.9.0 has already integrated Flash Attention 2. We still provide this interface in `DiffSynth-Studio` to allow some aggressive acceleration schemes to move quickly into practical use, even though they still need time to be verified for stability. 
\ No newline at end of file diff --git a/docs/en/API_Reference/core/data.md b/docs/en/API_Reference/core/data.md new file mode 100644 index 0000000..0a6f11d --- /dev/null +++ b/docs/en/API_Reference/core/data.md @@ -0,0 +1,151 @@ +# `diffsynth.core.data`: Data Processing Operators and Universal Dataset + +## Data Processing Operators + +### Available Data Processing Operators + +`diffsynth.core.data` provides a series of data processing operators for data processing, including: + +* Data format conversion operators + * `ToInt`: Convert to int format + * `ToFloat`: Convert to float format + * `ToStr`: Convert to str format + * `ToList`: Convert to list format, wrapping this data in a list + * `ToAbsolutePath`: Convert relative paths to absolute paths +* File loading operators + * `LoadImage`: Read image files + * `LoadVideo`: Read video files + * `LoadAudio`: Read audio files + * `LoadGIF`: Read GIF files + * `LoadTorchPickle`: Read binary files saved by [`torch.save`](https://docs.pytorch.org/docs/stable/generated/torch.save.html) [This operator may cause code injection attacks in binary files, please use with caution!] 
+* Media file processing operators + * `ImageCropAndResize`: Crop and resize images +* Meta operators + * `SequencialProcess`: Route each data in the sequence to an operator + * `RouteByExtensionName`: Route to specific operators by file extension + * `RouteByType`: Route to specific operators by data type + +### Operator Usage + +Data operators are connected with the `>>` symbol to form data processing pipelines, for example: + +```python +from diffsynth.core.data.operators import * + +data = "image.jpg" +data_pipeline = ToAbsolutePath(base_path="/data") >> LoadImage() >> ImageCropAndResize(max_pixels=512*512) +data = data_pipeline(data) +``` + +After passing through each operator, the data is processed in sequence: + +* `ToAbsolutePath(base_path="/data")`: `"/data/image.jpg"` +* `LoadImage()`: `` +* `ImageCropAndResize(max_pixels=512*512)`: `` + +We can compose functionally complete data pipelines, for example, the default video data operator for the universal dataset is: + +```python +RouteByType(operator_map=[ + (str, ToAbsolutePath(base_path) >> RouteByExtensionName(operator_map=[ + (("jpg", "jpeg", "png", "webp"), LoadImage() >> ImageCropAndResize(height, width, max_pixels, height_division_factor, width_division_factor) >> ToList()), + (("gif",), LoadGIF( + num_frames, time_division_factor, time_division_remainder, + frame_processor=ImageCropAndResize(height, width, max_pixels, height_division_factor, width_division_factor), + )), + (("mp4", "avi", "mov", "wmv", "mkv", "flv", "webm"), LoadVideo( + num_frames, time_division_factor, time_division_remainder, + frame_processor=ImageCropAndResize(height, width, max_pixels, height_division_factor, width_division_factor), + )), + ])), +]) +``` + +It includes the following logic: + +* If the data is of type `str` + * If it's a `"jpg", "jpeg", "png", "webp"` type file + * Load this image + * Crop and scale to a specific resolution + * Pack into a list, treating it as a single-frame video + * If it's a `"gif"` type 
file + * Load the GIF file content + * Crop and scale each frame to a specific resolution + * If it's a `"mp4", "avi", "mov", "wmv", "mkv", "flv", "webm"` type file + * Load the video file content + * Crop and scale each frame to a specific resolution +* If the data is not of type `str`, an error is reported + +## Universal Dataset + +`diffsynth.core.data` provides a unified dataset implementation. The dataset requires the following parameters: + +* `base_path`: Root directory. If the dataset contains relative paths to image files, this field needs to be filled in to load the files pointed to by these paths +* `metadata_path`: Metadata directory, records the file paths of all metadata, supports `csv`, `json`, `jsonl` formats +* `repeat`: Data repetition count, defaults to 1, this parameter affects the number of training steps in an epoch +* `data_file_keys`: Data field names that need to be loaded, for example `(image, edit_image)` +* `main_data_operator`: Main loading operator, needs to assemble the data processing pipeline through data processing operators +* `special_operator_map`: Special operator mapping, operator mappings built for fields that require special processing + +### Metadata + +The dataset's `metadata_path` points to a metadata file, supporting `csv`, `json`, `jsonl` formats. The following provides examples: + +* `csv` format: High readability, does not support list data, small memory footprint + +```csv +image,prompt +image_1.jpg,"a dog" +image_2.jpg,"a cat" +``` + +* `json` format: High readability, supports list data, large memory footprint + +```json +[ + { + "image": "image_1.jpg", + "prompt": "a dog" + }, + { + "image": "image_2.jpg", + "prompt": "a cat" + } +] +``` + +* `jsonl` format: Low readability, supports list data, small memory footprint + +```json +{"image": "image_1.jpg", "prompt": "a dog"} +{"image": "image_2.jpg", "prompt": "a cat"} +``` + +How to choose the best metadata format? 
+ +* If the data volume is large, reaching tens of millions, since `json` file parsing requires additional memory, it is not suitable. Please use `csv` or `jsonl` format +* If the dataset contains list data, such as edit models that require multiple images as input, since `csv` format cannot store list format data, it is not suitable. Please use `json` or `jsonl` format + +### Data Loading Logic + +When no additional settings are made, the dataset defaults to outputting data from the metadata set. Image and video file paths will be output in string format. To load these files, you need to set `data_file_keys`, `main_data_operator`, and `special_operator_map`. + +In the data processing flow, processing is done according to the following logic: +* If the field is in `special_operator_map`, call the corresponding operator in `special_operator_map` for processing +* If the field is not in `special_operator_map` + * If the field is in `data_file_keys`, call the `main_data_operator` operator for processing + * If the field is not in `data_file_keys`, no processing is done + +`special_operator_map` can be used to implement special data processing. For example, in the model [Wan-AI/Wan2.2-Animate-14B](https://www.modelscope.cn/models/Wan-AI/Wan2.2-Animate-14B), the input character face video `animate_face_video` is processed at a fixed resolution, inconsistent with the output video. Therefore, this field is processed by a dedicated operator: + +```python +special_operator_map={ + "animate_face_video": ToAbsolutePath(args.dataset_base_path) >> LoadVideo(args.num_frames, 4, 1, frame_processor=ImageCropAndResize(512, 512, None, 16, 16)), +} +``` + +### Other Notes + +When the data volume is too small, you can appropriately increase `repeat` to extend the training time of a single epoch, avoiding frequent model saving that generates considerable overhead. + +When data volume * `repeat` exceeds $10^9$, we observe that the dataset speed becomes significantly slower. 
This seems to be a `PyTorch` bug, and we are not sure if newer versions of `PyTorch` have fixed this issue. \ No newline at end of file diff --git a/docs/en/API_Reference/core/gradient.md b/docs/en/API_Reference/core/gradient.md new file mode 100644 index 0000000..eeca81c --- /dev/null +++ b/docs/en/API_Reference/core/gradient.md @@ -0,0 +1,69 @@ +# `diffsynth.core.gradient`: Gradient Checkpointing and Offload + +`diffsynth.core.gradient` provides encapsulated gradient checkpointing and its Offload version for model training. + +## Gradient Checkpointing + +Gradient checkpointing is a technique used to reduce memory usage during training. We provide an example to help you understand this technique. Here is a simple model structure: + +```python +import torch + +class ToyModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.activation = torch.nn.Sigmoid() + + def forward(self, x): + return self.activation(x) + +model = ToyModel() +x = torch.randn((2, 3)) +y = model(x) +``` + +In this model structure, the input parameter $x$ passes through the Sigmoid activation function to obtain the output value $y=\frac{1}{1+e^{-x}}$. + +During the training process, assuming our loss function value is $\mathcal L$, when backpropagating gradients, we obtain $\frac{\partial \mathcal L}{\partial y}$. At this point, we need to calculate $\frac{\partial \mathcal L}{\partial x}$. It's not difficult to find that $\frac{\partial y}{\partial x}=y(1-y)$, and thus $\frac{\partial \mathcal L}{\partial x}=\frac{\partial \mathcal L}{\partial y}\frac{\partial y}{\partial x}=\frac{\partial \mathcal L}{\partial y}y(1-y)$. If we save the value of $y$ during the model's forward propagation and directly compute $y(1-y)$ during gradient backpropagation, this will avoid complex exp computations, speeding up the calculation. However, this requires additional memory to store the intermediate variable $y$. 
+ +When gradient checkpointing is not enabled, the training framework will default to storing all intermediate variables that assist gradient computation, thereby achieving optimal computational speed. When gradient checkpointing is enabled, intermediate variables are not stored, but the input parameter $x$ is still stored, reducing memory usage. During gradient backpropagation, these variables need to be recomputed, slowing down the calculation. + +## Enabling Gradient Checkpointing and Its Offload + +`gradient_checkpoint_forward` in `diffsynth.core.gradient` implements gradient checkpointing and its Offload. Refer to the following code for calling: + +```python +import torch +from diffsynth.core.gradient import gradient_checkpoint_forward + +class ToyModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.activation = torch.nn.Sigmoid() + + def forward(self, x): + return self.activation(x) + +model = ToyModel() +x = torch.randn((2, 3)) +y = gradient_checkpoint_forward( + model, + use_gradient_checkpointing=True, + use_gradient_checkpointing_offload=False, + x=x, +) +``` + +* When `use_gradient_checkpointing=False` and `use_gradient_checkpointing_offload=False`, the computation process is exactly the same as the original computation, not affecting the model's inference and training. You can directly integrate it into your code. +* When `use_gradient_checkpointing=True` and `use_gradient_checkpointing_offload=False`, gradient checkpointing is enabled. +* When `use_gradient_checkpointing_offload=True`, gradient checkpointing is enabled, and all gradient checkpoint input parameters are stored in memory, further reducing memory usage and slowing down computation. + +## Best Practices + +> Q: Where should gradient checkpointing be enabled? +> +> A: When enabling gradient checkpointing for the entire model, computational efficiency and memory usage are not optimal. 
We need to set fine-grained gradient checkpoints, but we don't want to add too much complicated code to the framework. Therefore, we recommend implementing it in the `model_fn` of `Pipeline`, for example, `model_fn_qwen_image` in `diffsynth/pipelines/qwen_image.py`, enabling gradient checkpointing at the Block level without modifying any code in the model structure. + +> Q: When should gradient checkpointing be enabled? +> +> A: As model parameters become increasingly large, gradient checkpointing has become a necessary training technique. Gradient checkpointing usually needs to be enabled. Gradient checkpointing Offload should only be enabled in models where activation values occupy excessive memory (such as video generation models). \ No newline at end of file diff --git a/docs/en/API_Reference/core/loader.md b/docs/en/API_Reference/core/loader.md new file mode 100644 index 0000000..1dccf5f --- /dev/null +++ b/docs/en/API_Reference/core/loader.md @@ -0,0 +1,141 @@ +# `diffsynth.core.loader`: Model Download and Loading + +This document introduces the model download and loading functionalities in `diffsynth.core.loader`. + +## ModelConfig + +`ModelConfig` in `diffsynth.core.loader` is used to annotate model download sources, local paths, VRAM management configurations, and other information. + +### Downloading and Loading Models from Remote Sources + +Taking the model [DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny) as an example, after filling in `model_id` and `origin_file_pattern` in `ModelConfig`, the model can be automatically downloaded. By default, it downloads to the `./models` path, which can be modified through the [environment variable DIFFSYNTH_MODEL_BASE_PATH](/docs/en/Pipeline_Usage/Environment_Variables.md#diffsynth_model_base_path). 
+ +By default, even if the model has already been downloaded, the program will still query the remote for any missing files. To completely disable remote requests, set the [environment variable DIFFSYNTH_SKIP_DOWNLOAD](/docs/en/Pipeline_Usage/Environment_Variables.md#diffsynth_skip_download) to `True`. + +```python +from diffsynth.core import ModelConfig + +config = ModelConfig( + model_id="DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny", + origin_file_pattern="model.safetensors", +) +# Download models +config.download_if_necessary() +print(config.path) +``` + +After calling `download_if_necessary`, the model will be automatically downloaded, and the path will be returned to `config.path`. + +### Loading Models from Local Paths + +If loading models from local paths, you need to fill in `path`: + +```python +from diffsynth.core import ModelConfig + +config = ModelConfig(path="models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny/model.safetensors") +``` + +If the model contains multiple shard files, input them in list form: + +```python +from diffsynth.core import ModelConfig + +config = ModelConfig(path=[ + "models/Qwen/Qwen-Image/text_encoder/model-00001-of-00004.safetensors", + "models/Qwen/Qwen-Image/text_encoder/model-00002-of-00004.safetensors", + "models/Qwen/Qwen-Image/text_encoder/model-00003-of-00004.safetensors", + "models/Qwen/Qwen-Image/text_encoder/model-00004-of-00004.safetensors" +]) +``` + +### VRAM Management Configuration + +`ModelConfig` also contains VRAM management configuration information. See [VRAM Management](/docs/en/Pipeline_Usage/VRAM_management.md#more-usage-methods) for details. + +## Model File Loading + +`diffsynth.core.loader` provides a unified `load_state_dict` for loading state dicts from model files. 
+ +Loading a single model file: + +```python +from diffsynth.core import load_state_dict + +state_dict = load_state_dict("models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny/model.safetensors") +``` + +Loading multiple model files (merged into one state dict): + +```python +from diffsynth.core import load_state_dict + +state_dict = load_state_dict([ + "models/Qwen/Qwen-Image/text_encoder/model-00001-of-00004.safetensors", + "models/Qwen/Qwen-Image/text_encoder/model-00002-of-00004.safetensors", + "models/Qwen/Qwen-Image/text_encoder/model-00003-of-00004.safetensors", + "models/Qwen/Qwen-Image/text_encoder/model-00004-of-00004.safetensors" +]) +``` + +## Model Hash + +Model hash is used to determine the model type. The hash value can be obtained through `hash_model_file`: + +```python +from diffsynth.core import hash_model_file + +print(hash_model_file("models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny/model.safetensors")) +``` + +The hash value of multiple model files can also be calculated, which is equivalent to calculating the model hash value after merging the state dict: + +```python +from diffsynth.core import hash_model_file + +print(hash_model_file([ + "models/Qwen/Qwen-Image/text_encoder/model-00001-of-00004.safetensors", + "models/Qwen/Qwen-Image/text_encoder/model-00002-of-00004.safetensors", + "models/Qwen/Qwen-Image/text_encoder/model-00003-of-00004.safetensors", + "models/Qwen/Qwen-Image/text_encoder/model-00004-of-00004.safetensors" +])) +``` + +The model hash value is only related to the keys and tensor shapes in the state dict of the model file, and is unrelated to the numerical values of the model parameters, file saving time, and other information. When calculating the model hash value of `.safetensors` format files, `hash_model_file` is almost instantly completed without reading the model parameters. 
However, when calculating the model hash value of `.bin`, `.pth`, `.ckpt`, and other binary files, all model parameters need to be read, so **we do not recommend developers to continue using these formats of files.** + +By [writing model Config](/docs/en/Developer_Guide/Integrating_Your_Model.md#step-3-writing-model-config) and filling in model hash value and other information into `diffsynth/configs/model_configs.py`, developers can let `DiffSynth-Studio` automatically identify the model type and load it. + +## Model Loading + +`load_model` is the external entry for loading models in `diffsynth.core.loader`. It will call [skip_model_initialization](/docs/en/API_Reference/core/vram.md#skipping-model-parameter-initialization) to skip model parameter initialization. If [Disk Offload](/docs/en/Pipeline_Usage/VRAM_management.md#disk-offload) is enabled, it calls [DiskMap](/docs/en/API_Reference/core/vram.md#state-dict-disk-mapping) for lazy loading. If Disk Offload is not enabled, it calls [load_state_dict](#model-file-loading) to load model parameters. If necessary, it will also call [state dict converter](/docs/en/Developer_Guide/Integrating_Your_Model.md#step-2-model-file-format-conversion) for model format conversion. Finally, it calls `model.eval()` to switch to inference mode. 
+ +Here is a usage example with Disk Offload enabled: + +```python +from diffsynth.core import load_model, enable_vram_management, AutoWrappedLinear, AutoWrappedModule +from diffsynth.models.qwen_image_dit import QwenImageDiT, RMSNorm +import torch + +prefix = "models/Qwen/Qwen-Image/transformer/diffusion_pytorch_model" +model_path = [prefix + f"-0000{i}-of-00009.safetensors" for i in range(1, 10)] + +model = load_model( + QwenImageDiT, + model_path, + module_map={ + torch.nn.Linear: AutoWrappedLinear, + RMSNorm: AutoWrappedModule, + }, + vram_config={ + "offload_dtype": "disk", + "offload_device": "disk", + "onload_dtype": "disk", + "onload_device": "disk", + "preparing_dtype": torch.bfloat16, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", + }, + vram_limit=0, +) +``` \ No newline at end of file diff --git a/docs/en/API_Reference/core/vram.md b/docs/en/API_Reference/core/vram.md new file mode 100644 index 0000000..79e51fc --- /dev/null +++ b/docs/en/API_Reference/core/vram.md @@ -0,0 +1,66 @@ +# `diffsynth.core.vram`: VRAM Management + +This document introduces the underlying VRAM management functionalities in `diffsynth.core.vram`. If you wish to use these functionalities in other codebases, you can refer to this document. + +## Skipping Model Parameter Initialization + +When loading models in `PyTorch`, model parameters default to occupying VRAM or memory and initializing parameters, but these parameters will be overwritten when loading pretrained weights, leading to redundant computations. `PyTorch` does not provide an interface to skip these redundant computations. We provide `skip_model_initialization` in `diffsynth.core.vram` to skip model parameter initialization. 
+ +Default model loading approach: + +```python +from diffsynth.core import load_state_dict +from diffsynth.models.qwen_image_controlnet import QwenImageBlockWiseControlNet + +model = QwenImageBlockWiseControlNet() # Slow +path = "models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny/model.safetensors" +state_dict = load_state_dict(path, device="cpu") +model.load_state_dict(state_dict, assign=True) +``` + +Model loading approach that skips parameter initialization: + +```python +from diffsynth.core import load_state_dict, skip_model_initialization +from diffsynth.models.qwen_image_controlnet import QwenImageBlockWiseControlNet + +with skip_model_initialization(): + model = QwenImageBlockWiseControlNet() # Fast +path = "models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny/model.safetensors" +state_dict = load_state_dict(path, device="cpu") +model.load_state_dict(state_dict, assign=True) +``` + +In `DiffSynth-Studio`, all pretrained models follow this loading logic. After developers [integrate models](/docs/en/Developer_Guide/Integrating_Your_Model.md), they can directly load models quickly using this approach. + +## State Dict Disk Mapping + +For pretrained weight files of a model, if we only need to read a set of parameters rather than all parameters, State Dict Disk Mapping can accelerate this process. We provide `DiskMap` in `diffsynth.core.vram` for on-demand loading of model parameters. 
+ +Default weight loading approach: + +```python +from diffsynth.core import load_state_dict + +path = "models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny/model.safetensors" +state_dict = load_state_dict(path, device="cpu") # Slow +print(state_dict["img_in.weight"]) +``` + +Using `DiskMap` to load only specific parameters: + +```python +from diffsynth.core import DiskMap + +path = "models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny/model.safetensors" +state_dict = DiskMap(path, device="cpu") # Fast +print(state_dict["img_in.weight"]) +``` + +`DiskMap` is the basic component of Disk Offload in `DiffSynth-Studio`. After developers [configure fine-grained VRAM management schemes](/docs/en/Developer_Guide/Enabling_VRAM_management.md), they can directly enable Disk Offload. + +`DiskMap` is a functionality implemented using the characteristics of `.safetensors` files. Therefore, when using `.bin`, `.pth`, `.ckpt`, and other binary files, model parameters are fully loaded, which causes Disk Offload to not support these formats of files. **We do not recommend developers to continue using these formats of files.** + +## Replaceable Modules for VRAM Management + +When `DiffSynth-Studio`'s VRAM management is enabled, the modules inside the model will be replaced with replaceable modules in `diffsynth.core.vram.layers`. For usage, see [Fine-grained VRAM Management Scheme](/docs/en/Developer_Guide/Enabling_VRAM_management.md#writing-fine-grained-vram-management-schemes). \ No newline at end of file diff --git a/docs/en/Developer_Guide/Building_a_Pipeline.md b/docs/en/Developer_Guide/Building_a_Pipeline.md new file mode 100644 index 0000000..7d5e785 --- /dev/null +++ b/docs/en/Developer_Guide/Building_a_Pipeline.md @@ -0,0 +1,250 @@ +# Building a Pipeline + +After [integrating the required models for the Pipeline](/docs/en/Developer_Guide/Integrating_Your_Model.md), you also need to build a `Pipeline` for model inference. 
This document provides a standardized process for building a `Pipeline`. Developers can also refer to existing `Pipeline` implementations for construction. + +The `Pipeline` implementation is located in `diffsynth/pipelines`. Each `Pipeline` contains the following essential key components: + +* `__init__` +* `from_pretrained` +* `__call__` +* `units` +* `model_fn` + +## `__init__` + +In `__init__`, the `Pipeline` is initialized. Here is a simple implementation: + +```python +import torch +from PIL import Image +from typing import Union +from tqdm import tqdm +from ..diffusion import FlowMatchScheduler +from ..core import ModelConfig +from ..diffusion.base_pipeline import BasePipeline, PipelineUnit +from ..models.new_models import XXX_Model, YYY_Model, ZZZ_Model + +class NewDiffSynthPipeline(BasePipeline): + + def __init__(self, device="cuda", torch_dtype=torch.bfloat16): + super().__init__(device=device, torch_dtype=torch_dtype) + self.scheduler = FlowMatchScheduler() + self.text_encoder: XXX_Model = None + self.dit: YYY_Model = None + self.vae: ZZZ_Model = None + self.in_iteration_models = ("dit",) + self.units = [ + NewDiffSynthPipelineUnit_xxx(), + ... + ] + self.model_fn = model_fn_new +``` + +This includes the following parts: + +* `scheduler`: Scheduler, used to control the coefficients in the iterative formula during inference, controlling the noise content at each step. +* `text_encoder`, `dit`, `vae`: Models. Since [Latent Diffusion](https://arxiv.org/abs/2112.10752) was proposed, this three-stage model architecture has become the mainstream Diffusion model architecture. However, this is not immutable, and any number of models can be added to the `Pipeline`. +* `in_iteration_models`: Iteration models. This tuple marks which models will be called during iteration. +* `units`: Pre-processing units for model iteration. See [`units`](#units) for details. +* `model_fn`: The `forward` function of the denoising model during iteration. 
See [`model_fn`](#model_fn) for details. + +> Q: Model loading does not occur in `__init__`, why initialize each model as `None` here? +> +> A: By annotating the type of each model here, the code editor can provide code completion prompts based on each model, facilitating subsequent development. + +## `from_pretrained` + +`from_pretrained` is responsible for loading the required models to make the `Pipeline` callable. Here is a simple implementation: + +```python + @staticmethod + def from_pretrained( + torch_dtype: torch.dtype = torch.bfloat16, + device: Union[str, torch.device] = "cuda", + model_configs: list[ModelConfig] = [], + vram_limit: float = None, + ): + # Initialize pipeline + pipe = NewDiffSynthPipeline(device=device, torch_dtype=torch_dtype) + model_pool = pipe.download_and_load_models(model_configs, vram_limit) + + # Fetch models + pipe.text_encoder = model_pool.fetch_model("xxx_text_encoder") + pipe.dit = model_pool.fetch_model("yyy_dit") + pipe.vae = model_pool.fetch_model("zzz_vae") + # If necessary, load tokenizers here. + + # VRAM Management + pipe.vram_management_enabled = pipe.check_vram_management_state() + return pipe +``` + +Developers need to implement the logic for fetching models. The corresponding model names are the `"model_name"` in the [model Config filled in during model integration](/docs/en/Developer_Guide/Integrating_Your_Model.md#step-3-writing-model-config). + +Some models also need to load `tokenizer`. Extra `tokenizer_config` parameters can be added to `from_pretrained` as needed, and this part can be implemented after fetching the models. + +## `__call__` + +`__call__` implements the entire generation process of the Pipeline. Below is a common generation process template. Developers can modify it based on their needs. 
+ +```python + @torch.no_grad() + def __call__( + self, + prompt: str, + negative_prompt: str = "", + cfg_scale: float = 4.0, + input_image: Image.Image = None, + denoising_strength: float = 1.0, + height: int = 1328, + width: int = 1328, + seed: int = None, + rand_device: str = "cpu", + num_inference_steps: int = 30, + progress_bar_cmd = tqdm, + ): + # Scheduler + self.scheduler.set_timesteps( + num_inference_steps, + denoising_strength=denoising_strength + ) + + # Parameters + inputs_posi = { + "prompt": prompt, + } + inputs_nega = { + "negative_prompt": negative_prompt, + } + inputs_shared = { + "cfg_scale": cfg_scale, + "input_image": input_image, + "denoising_strength": denoising_strength, + "height": height, + "width": width, + "seed": seed, + "rand_device": rand_device, + "num_inference_steps": num_inference_steps, + } + for unit in self.units: + inputs_shared, inputs_posi, inputs_nega = self.unit_runner(unit, self, inputs_shared, inputs_posi, inputs_nega) + + # Denoise + self.load_models_to_device(self.in_iteration_models) + models = {name: getattr(self, name) for name in self.in_iteration_models} + for progress_id, timestep in enumerate(progress_bar_cmd(self.scheduler.timesteps)): + timestep = timestep.unsqueeze(0).to(dtype=self.torch_dtype, device=self.device) + + # Inference + noise_pred_posi = self.model_fn(**models, **inputs_shared, **inputs_posi, timestep=timestep, progress_id=progress_id) + if cfg_scale != 1.0: + noise_pred_nega = self.model_fn(**models, **inputs_shared, **inputs_nega, timestep=timestep, progress_id=progress_id) + noise_pred = noise_pred_nega + cfg_scale * (noise_pred_posi - noise_pred_nega) + else: + noise_pred = noise_pred_posi + + # Scheduler + inputs_shared["latents"] = self.step(self.scheduler, progress_id=progress_id, noise_pred=noise_pred, **inputs_shared) + + # Decode + self.load_models_to_device(['vae']) + image = self.vae.decode(inputs_shared["latents"], device=self.device) + image = self.vae_output_to_image(image) + 
self.load_models_to_device([]) + + return image +``` + +## `units` + +`units` contains all the preprocessing processes, such as: width/height checking, prompt encoding, initial noise generation, etc. In the entire model preprocessing process, data is abstracted into three mutually exclusive parts, stored in corresponding dictionaries: + +* `inputs_shared`: Shared inputs, parameters unrelated to [Classifier-Free Guidance](https://arxiv.org/abs/2207.12598) (CFG for short). +* `inputs_posi`: Positive side inputs for Classifier-Free Guidance, containing content related to positive prompts. +* `inputs_nega`: Negative side inputs for Classifier-Free Guidance, containing content related to negative prompts. + +Pipeline Unit implementations include three types: direct mode, CFG separation mode, and takeover mode. + +If some calculations are unrelated to CFG, direct mode can be used, for example, Qwen-Image's random noise initialization: + +```python +class QwenImageUnit_NoiseInitializer(PipelineUnit): + def __init__(self): + super().__init__( + input_params=("height", "width", "seed", "rand_device"), + output_params=("noise",), + ) + + def process(self, pipe: QwenImagePipeline, height, width, seed, rand_device): + noise = pipe.generate_noise((1, 16, height//8, width//8), seed=seed, rand_device=rand_device, rand_torch_dtype=pipe.torch_dtype) + return {"noise": noise} +``` + +If some calculations are related to CFG and need to separately process positive and negative prompts, but the input parameters on both sides are the same, CFG separation mode can be used, for example, Qwen-image's prompt encoding: + +```python +class QwenImageUnit_PromptEmbedder(PipelineUnit): + def __init__(self): + super().__init__( + seperate_cfg=True, + input_params_posi={"prompt": "prompt"}, + input_params_nega={"prompt": "negative_prompt"}, + input_params=("edit_image",), + output_params=("prompt_emb", "prompt_emb_mask"), + onload_model_names=("text_encoder",) + ) + + def process(self, pipe: 
QwenImagePipeline, prompt, edit_image=None) -> dict: + pipe.load_models_to_device(self.onload_model_names) + # Do something + return {"prompt_emb": prompt_embeds, "prompt_emb_mask": encoder_attention_mask} +``` + +If some calculations need global information, takeover mode is required, for example, Qwen-Image's entity partition control: + +```python +class QwenImageUnit_EntityControl(PipelineUnit): + def __init__(self): + super().__init__( + take_over=True, + input_params=("eligen_entity_prompts", "width", "height", "eligen_enable_on_negative", "cfg_scale"), + output_params=("entity_prompt_emb", "entity_masks", "entity_prompt_emb_mask"), + onload_model_names=("text_encoder",) + ) + + def process(self, pipe: QwenImagePipeline, inputs_shared, inputs_posi, inputs_nega): + # Do something + return inputs_shared, inputs_posi, inputs_nega +``` + +The following are the parameter configurations required for Pipeline Unit: + +* `seperate_cfg`: Whether to enable CFG separation mode +* `take_over`: Whether to enable takeover mode +* `input_params`: Shared input parameters +* `output_params`: Output parameters +* `input_params_posi`: Positive side input parameters +* `input_params_nega`: Negative side input parameters +* `onload_model_names`: Names of model components to be called + +When designing `unit`, please try to follow these principles: + +* Default fallback: For optional function `unit` input parameters, the default is `None` rather than `False` or other values. Please provide fallback processing for this default value. +* Parameter triggering: Some Adapter models may not be loaded, such as ControlNet. The corresponding `unit` should control triggering based on whether the parameter input is `None` rather than whether the model is loaded. For example, when the user inputs `controlnet_image` but does not load the ControlNet model, the code should give an error rather than ignore these input parameters and continue execution. 
+* Simplicity first: Use direct mode as much as possible, only use takeover mode when the function cannot be implemented. +* VRAM efficiency: When calling models in `unit`, please use `pipe.load_models_to_device(self.onload_model_names)` to activate the corresponding models. Do not call other models outside `onload_model_names`. After `unit` calculation is completed, do not manually release VRAM with `pipe.load_models_to_device([])`. + +> Q: Some parameters are not called during the inference process, such as `output_params`. Is it still necessary to configure them? +> +> A: These parameters will not affect the inference process, but they will affect some experimental features. Therefore, we recommend configuring them properly. For example, "split training" - we can complete the preprocessing offline during training, but some model calculations that require gradient backpropagation cannot be split. These parameters are used to build computational graphs to infer which calculations can be split. + +## `model_fn` + +`model_fn` is the unified `forward` interface during iteration. For models where the open-source ecosystem is not yet formed, you can directly use the denoising model's `forward`, for example: + +```python +def model_fn_new(dit=None, latents=None, timestep=None, prompt_emb=None, **kwargs): + return dit(latents, prompt_emb, timestep) +``` + +For models with rich open-source ecosystems, `model_fn` usually contains complex and chaotic cross-model inference. Taking `diffsynth/pipelines/qwen_image.py` as an example, the additional calculations implemented in this function include: entity partition control, three types of ControlNet, Gradient Checkpointing, etc. Developers need to be extra careful when implementing this part to avoid conflicts between module functions. 
\ No newline at end of file diff --git a/docs/en/Developer_Guide/Enabling_VRAM_management.md b/docs/en/Developer_Guide/Enabling_VRAM_management.md new file mode 100644 index 0000000..9bdd49f --- /dev/null +++ b/docs/en/Developer_Guide/Enabling_VRAM_management.md @@ -0,0 +1,455 @@ +# Fine-Grained VRAM Management Scheme + +This document introduces how to write reasonable fine-grained VRAM management schemes for models, and how to use the VRAM management functions in `DiffSynth-Studio` for other external code libraries. Before reading this document, please read the document [VRAM Management](/docs/en/Pipeline_Usage/VRAM_management.md). + +## How Much VRAM Does a 20B Model Need? + +Taking Qwen-Image's DiT model as an example, this model has reached 20B parameters. The following code will load this model and perform inference, requiring about 40G VRAM. This model obviously cannot run on consumer-grade GPUs with smaller VRAM. + +```python +from diffsynth.core import load_model +from diffsynth.models.qwen_image_dit import QwenImageDiT +from modelscope import snapshot_download +import torch + +snapshot_download( + model_id="Qwen/Qwen-Image", + local_dir="models/Qwen/Qwen-Image", + allow_file_pattern="transformer/*" +) +prefix = "models/Qwen/Qwen-Image/transformer/diffusion_pytorch_model" +model_path = [prefix + f"-0000{i}-of-00009.safetensors" for i in range(1, 10)] +inputs = { + "latents": torch.randn((1, 16, 128, 128), dtype=torch.bfloat16, device="cuda"), + "timestep": torch.zeros((1,), dtype=torch.bfloat16, device="cuda"), + "prompt_emb": torch.randn((1, 5, 3584), dtype=torch.bfloat16, device="cuda"), + "prompt_emb_mask": torch.ones((1, 5), dtype=torch.int64, device="cuda"), + "height": 1024, + "width": 1024, +} + +model = load_model(QwenImageDiT, model_path, torch_dtype=torch.bfloat16, device="cuda") +with torch.no_grad(): + output = model(**inputs) +``` + +## Writing Fine-Grained VRAM Management Scheme + +To write a fine-grained VRAM management scheme, we need to use 
`print(model)` to observe and analyze the model structure: + +``` +QwenImageDiT( + (pos_embed): QwenEmbedRope() + (time_text_embed): TimestepEmbeddings( + (time_proj): TemporalTimesteps() + (timestep_embedder): DiffusersCompatibleTimestepProj( + (linear_1): Linear(in_features=256, out_features=3072, bias=True) + (act): SiLU() + (linear_2): Linear(in_features=3072, out_features=3072, bias=True) + ) + ) + (txt_norm): RMSNorm() + (img_in): Linear(in_features=64, out_features=3072, bias=True) + (txt_in): Linear(in_features=3584, out_features=3072, bias=True) + (transformer_blocks): ModuleList( + (0-59): 60 x QwenImageTransformerBlock( + (img_mod): Sequential( + (0): SiLU() + (1): Linear(in_features=3072, out_features=18432, bias=True) + ) + (img_norm1): LayerNorm((3072,), eps=1e-06, elementwise_affine=False) + (attn): QwenDoubleStreamAttention( + (to_q): Linear(in_features=3072, out_features=3072, bias=True) + (to_k): Linear(in_features=3072, out_features=3072, bias=True) + (to_v): Linear(in_features=3072, out_features=3072, bias=True) + (norm_q): RMSNorm() + (norm_k): RMSNorm() + (add_q_proj): Linear(in_features=3072, out_features=3072, bias=True) + (add_k_proj): Linear(in_features=3072, out_features=3072, bias=True) + (add_v_proj): Linear(in_features=3072, out_features=3072, bias=True) + (norm_added_q): RMSNorm() + (norm_added_k): RMSNorm() + (to_out): Sequential( + (0): Linear(in_features=3072, out_features=3072, bias=True) + ) + (to_add_out): Linear(in_features=3072, out_features=3072, bias=True) + ) + (img_norm2): LayerNorm((3072,), eps=1e-06, elementwise_affine=False) + (img_mlp): QwenFeedForward( + (net): ModuleList( + (0): ApproximateGELU( + (proj): Linear(in_features=3072, out_features=12288, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=12288, out_features=3072, bias=True) + ) + ) + (txt_mod): Sequential( + (0): SiLU() + (1): Linear(in_features=3072, out_features=18432, bias=True) + ) + (txt_norm1): LayerNorm((3072,), 
eps=1e-06, elementwise_affine=False) + (txt_norm2): LayerNorm((3072,), eps=1e-06, elementwise_affine=False) + (txt_mlp): QwenFeedForward( + (net): ModuleList( + (0): ApproximateGELU( + (proj): Linear(in_features=3072, out_features=12288, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=12288, out_features=3072, bias=True) + ) + ) + ) + ) + (norm_out): AdaLayerNorm( + (linear): Linear(in_features=3072, out_features=6144, bias=True) + (norm): LayerNorm((3072,), eps=1e-06, elementwise_affine=False) + ) + (proj_out): Linear(in_features=3072, out_features=64, bias=True) +) +``` + +In VRAM management, we only care about layers containing parameters. In this model structure, `QwenEmbedRope`, `TemporalTimesteps`, `SiLU` and other Layers do not contain parameters. `LayerNorm` also does not contain parameters because `elementwise_affine=False` is set. Layers containing parameters are only `Linear` and `RMSNorm`. + +`diffsynth.core.vram` provides two replacement modules for VRAM management: +* `AutoWrappedLinear`: Used to replace `Linear` layers +* `AutoWrappedModule`: Used to replace any other layer + +Write a `module_map` to map `Linear` and `RMSNorm` in the model to the corresponding modules: + +```python +module_map={ + torch.nn.Linear: AutoWrappedLinear, + RMSNorm: AutoWrappedModule, +} +``` + +In addition, `vram_config` and `vram_limit` are also required, which have been introduced in [VRAM Management](/docs/en/Pipeline_Usage/VRAM_management.md#more-usage-methods). + +Call `enable_vram_management` to enable VRAM management. 
Note that the `device` when loading the model is `cpu`, consistent with `offload_device`: + +```python +from diffsynth.core import load_model, enable_vram_management, AutoWrappedLinear, AutoWrappedModule +from diffsynth.models.qwen_image_dit import QwenImageDiT, RMSNorm +import torch + +prefix = "models/Qwen/Qwen-Image/transformer/diffusion_pytorch_model" +model_path = [prefix + f"-0000{i}-of-00009.safetensors" for i in range(1, 10)] +inputs = { + "latents": torch.randn((1, 16, 128, 128), dtype=torch.bfloat16, device="cuda"), + "timestep": torch.zeros((1,), dtype=torch.bfloat16, device="cuda"), + "prompt_emb": torch.randn((1, 5, 3584), dtype=torch.bfloat16, device="cuda"), + "prompt_emb_mask": torch.ones((1, 5), dtype=torch.int64, device="cuda"), + "height": 1024, + "width": 1024, +} + +model = load_model(QwenImageDiT, model_path, torch_dtype=torch.bfloat16, device="cpu") +enable_vram_management( + model, + module_map={ + torch.nn.Linear: AutoWrappedLinear, + RMSNorm: AutoWrappedModule, + }, + vram_config = { + "offload_dtype": torch.bfloat16, + "offload_device": "cpu", + "onload_dtype": torch.bfloat16, + "onload_device": "cpu", + "preparing_dtype": torch.bfloat16, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", + }, + vram_limit=0, +) +with torch.no_grad(): + output = model(**inputs) +``` + +The above code only requires 2G VRAM to run the `forward` of a 20B model. + +## Disk Offload + +[Disk Offload](/docs/en/Pipeline_Usage/VRAM_management.md#disk-offload) is a special VRAM management scheme that needs to be enabled during the model loading process, not after the model is loaded. 
Usually, when the above code can run smoothly, Disk Offload can be directly enabled: + +```python +from diffsynth.core import load_model, enable_vram_management, AutoWrappedLinear, AutoWrappedModule +from diffsynth.models.qwen_image_dit import QwenImageDiT, RMSNorm +import torch + +prefix = "models/Qwen/Qwen-Image/transformer/diffusion_pytorch_model" +model_path = [prefix + f"-0000{i}-of-00009.safetensors" for i in range(1, 10)] +inputs = { + "latents": torch.randn((1, 16, 128, 128), dtype=torch.bfloat16, device="cuda"), + "timestep": torch.zeros((1,), dtype=torch.bfloat16, device="cuda"), + "prompt_emb": torch.randn((1, 5, 3584), dtype=torch.bfloat16, device="cuda"), + "prompt_emb_mask": torch.ones((1, 5), dtype=torch.int64, device="cuda"), + "height": 1024, + "width": 1024, +} + +model = load_model( + QwenImageDiT, + model_path, + module_map={ + torch.nn.Linear: AutoWrappedLinear, + RMSNorm: AutoWrappedModule, + }, + vram_config={ + "offload_dtype": "disk", + "offload_device": "disk", + "onload_dtype": "disk", + "onload_device": "disk", + "preparing_dtype": torch.bfloat16, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", + }, + vram_limit=0, +) +with torch.no_grad(): + output = model(**inputs) +``` + +Disk Offload is an extremely special VRAM management scheme. It only supports `.safetensors` format files, not binary files such as `.bin`, `.pth`, `.ckpt`, and does not support [state dict converter](/docs/en/Developer_Guide/Integrating_Your_Model.md#step-2-model-file-format-conversion) with Tensor reshape. + +If there are situations where Disk Offload cannot run normally but non-Disk Offload can run normally, please submit an issue to us on GitHub. + +## Writing Default Configuration + +To make it easier for users to use the VRAM management function, we write the fine-grained VRAM management configuration in `diffsynth/configs/vram_management_module_maps.py`. 
The configuration information for the above model is: + +```python +"diffsynth.models.qwen_image_dit.QwenImageDiT": { + "diffsynth.models.qwen_image_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule", + "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear", +} +``` \ No newline at end of file diff --git a/docs/en/Developer_Guide/Integrating_Your_Model.md b/docs/en/Developer_Guide/Integrating_Your_Model.md new file mode 100644 index 0000000..ae5e6f2 --- /dev/null +++ b/docs/en/Developer_Guide/Integrating_Your_Model.md @@ -0,0 +1,186 @@ +# Integrating Model Architecture + +This document introduces how to integrate models into the `DiffSynth-Studio` framework for use by modules such as `Pipeline`. + +## Step 1: Integrate Model Architecture Code + +All model architecture implementations in `DiffSynth-Studio` are unified in `diffsynth/models`. Each `.py` code file implements a model architecture, and all models are loaded through `ModelPool` in `diffsynth/models/model_loader.py`. When integrating new model architectures, please create a new `.py` file under this path. + +```shell +diffsynth/models/ +├── general_modules.py +├── model_loader.py +├── qwen_image_controlnet.py +├── qwen_image_dit.py +├── qwen_image_text_encoder.py +├── qwen_image_vae.py +└── ... +``` + +In most cases, we recommend integrating models in native `PyTorch` code form, with the model architecture class directly inheriting from `torch.nn.Module`, for example: + +```python +import torch + +class NewDiffSynthModel(torch.nn.Module): + def __init__(self, dim=1024): + super().__init__() + self.linear = torch.nn.Linear(dim, dim) + self.activation = torch.nn.Sigmoid() + + def forward(self, x): + x = self.linear(x) + x = self.activation(x) + return x +``` + +If the model architecture implementation contains additional dependencies, we strongly recommend removing them, otherwise this will cause heavy package dependency issues. 
In our existing models, Qwen-Image's Blockwise ControlNet is integrated in this way. The code is lightweight, please refer to `diffsynth/models/qwen_image_controlnet.py`. + +If the model has been integrated by Huggingface Library ([`transformers`](https://huggingface.co/docs/transformers/main/index), [`diffusers`](https://huggingface.co/docs/diffusers/main/index), etc.), we can integrate the model in a simpler way: + +
+Integrating Huggingface Library Style Model Architecture Code + +The loading method for these models in Huggingface Library is: + +```python +from transformers import XXX_Model + +model = XXX_Model.from_pretrained("path_to_your_model") +``` + +`DiffSynth-Studio` does not support loading models through `from_pretrained` because this conflicts with VRAM management and other functions. Please rewrite the model architecture in the following format: + +```python +import torch + +class DiffSynth_XXX_Model(torch.nn.Module): + def __init__(self): + super().__init__() + from transformers import XXX_Config, XXX_Model + config = XXX_Config(**{ + "architectures": ["XXX_Model"], + "other_configs": "Please copy and paste the other configs here.", + }) + self.model = XXX_Model(config) + + def forward(self, x): + outputs = self.model(x) + return outputs +``` + +Where `XXX_Config` is the Config class corresponding to the model. For example, the Config class for `Qwen2_5_VLModel` is `Qwen2_5_VLConfig`, which can be found by consulting its source code. The content inside Config can usually be found in the `config.json` file in the model library. `DiffSynth-Studio` will not read the `config.json` file, so the content needs to be copied and pasted into the code. + +In rare cases, version updates of `transformers` and `diffusers` may cause some models to be unable to import. Therefore, if possible, we still recommend using the model integration method in Step 1.1. + +In our existing models, Qwen-Image's Text Encoder is integrated in this way. The code is lightweight, please refer to `diffsynth/models/qwen_image_text_encoder.py`. + +
+ +## Step 2: Model File Format Conversion + +Due to the variety of model file formats provided by developers in the open-source community, we sometimes need to convert model file formats to form correctly formatted [state dict](https://docs.pytorch.org/tutorials/recipes/recipes/what_is_state_dict.html). This is common in the following situations: + +* Model files built by different code libraries, for example [Wan-AI/Wan2.1-T2V-1.3B](https://www.modelscope.cn/models/Wan-AI/Wan2.1-T2V-1.3B) and [Wan-AI/Wan2.1-T2V-1.3B-Diffusers](https://www.modelscope.cn/models/Wan-AI/Wan2.1-T2V-1.3B-Diffusers). +* Models modified during integration, for example, the Text Encoder of [Qwen/Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image) adds a `model.` prefix in `diffsynth/models/qwen_image_text_encoder.py`. +* Model files containing multiple models, for example, the VACE Adapter and base DiT model of [Wan-AI/Wan2.1-VACE-14B](https://www.modelscope.cn/models/Wan-AI/Wan2.1-VACE-14B) are mixed and stored in the same set of model files. + +In our development philosophy, we hope to respect the wishes of model authors as much as possible. If we repackage the model files, for example [Comfy-Org/Qwen-Image_ComfyUI](https://www.modelscope.cn/models/Comfy-Org/Qwen-Image_ComfyUI), although we can call the model more conveniently, traffic (model page views and downloads, etc.) will be directed elsewhere, and the original author of the model will also lose the power to delete the model. Therefore, we have added the `diffsynth/utils/state_dict_converters` module to the framework for file format conversion during model loading. + +This part of logic is very simple. Taking Qwen-Image's Text Encoder as an example, only 10 lines of code are needed: + +```python +def QwenImageTextEncoderStateDictConverter(state_dict): + state_dict_ = {} + for k in state_dict: + v = state_dict[k] + if k.startswith("visual."): + k = "model." 
+ k + elif k.startswith("model."): + k = k.replace("model.", "model.language_model.") + state_dict_[k] = v + return state_dict_ +``` + +## Step 3: Writing Model Config + +Model Config is located in `diffsynth/configs/model_configs.py`, used to identify model types and load them. The following fields need to be filled in: + +* `model_hash`: Model file hash value, which can be obtained through the `hash_model_file` function. This hash value is only related to the keys and tensor shapes in the model file's state dict, and is unrelated to other information in the file. +* `model_name`: Model name, used for `Pipeline` to identify the required model. If different structured models play the same role in `Pipeline`, the same `model_name` can be used. When integrating new models, just ensure that `model_name` is different from other existing functional models. The corresponding model is fetched through `model_name` in the `Pipeline`'s `from_pretrained`. +* `model_class`: Model architecture import path, pointing to the model architecture class implemented in Step 1, for example `diffsynth.models.qwen_image_text_encoder.QwenImageTextEncoder`. +* `state_dict_converter`: Optional parameter. If model file format conversion is needed, the import path of the model conversion logic needs to be filled in, for example `diffsynth.utils.state_dict_converters.qwen_image_text_encoder.QwenImageTextEncoderStateDictConverter`. +* `extra_kwargs`: Optional parameter. If additional parameters need to be passed when initializing the model, these parameters need to be filled in. 
For example, models [DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny) and [DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Inpaint](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Inpaint) both adopt the `QwenImageBlockWiseControlNet` structure in `diffsynth/models/qwen_image_controlnet.py`, but the latter also needs additional configuration `additional_in_dim=4`. Therefore, this configuration information needs to be filled in the `extra_kwargs` field. + +We provide a piece of code to quickly understand how models are loaded through this configuration information: + +```python +from diffsynth.core import hash_model_file, load_state_dict, skip_model_initialization +from diffsynth.models.qwen_image_text_encoder import QwenImageTextEncoder +from diffsynth.utils.state_dict_converters.qwen_image_text_encoder import QwenImageTextEncoderStateDictConverter +import torch + +model_hash = "8004730443f55db63092006dd9f7110e" +model_name = "qwen_image_text_encoder" +model_class = QwenImageTextEncoder +state_dict_converter = QwenImageTextEncoderStateDictConverter +extra_kwargs = {} + +model_path = [ + "models/Qwen/Qwen-Image/text_encoder/model-00001-of-00004.safetensors", + "models/Qwen/Qwen-Image/text_encoder/model-00002-of-00004.safetensors", + "models/Qwen/Qwen-Image/text_encoder/model-00003-of-00004.safetensors", + "models/Qwen/Qwen-Image/text_encoder/model-00004-of-00004.safetensors", +] +if hash_model_file(model_path) == model_hash: + with skip_model_initialization(): + model = model_class(**extra_kwargs) + state_dict = load_state_dict(model_path, torch_dtype=torch.bfloat16, device="cuda") + state_dict = state_dict_converter(state_dict) + model.load_state_dict(state_dict, assign=True) + print("Done!") +``` + +> Q: The logic of the above code looks very simple, why is this part of code in `DiffSynth-Studio` extremely complex? 
The `model_hash` in `diffsynth/configs/model_configs.py` is not guaranteed to be unique. Multiple models may exist in the same model file. For this situation, please use multiple model Configs to load each model separately, and write the corresponding `state_dict_converter` to separate the parameters required by each model.
See [Enabling VRAM Management](/docs/en/Developer_Guide/Enabling_VRAM_management.md) for details. \ No newline at end of file diff --git a/docs/en/Developer_Guide/Training_Diffusion_Models.md b/docs/en/Developer_Guide/Training_Diffusion_Models.md new file mode 100644 index 0000000..3fc92fc --- /dev/null +++ b/docs/en/Developer_Guide/Training_Diffusion_Models.md @@ -0,0 +1,66 @@ +# Integrating Model Training + +After [integrating models](/docs/en/Developer_Guide/Integrating_Your_Model.md) and [implementing Pipeline](/docs/en/Developer_Guide/Building_a_Pipeline.md), the next step is to integrate model training functionality. + +## Training-Inference Consistent Pipeline Modification + +To ensure strict consistency between training and inference processes, we will use most of the inference code during training, but still need to make minor modifications. + +First, add extra logic during inference to switch the image-to-image/video-to-video logic based on the `scheduler` state. Taking Qwen-Image as an example: + +```python +class QwenImageUnit_InputImageEmbedder(PipelineUnit): + def __init__(self): + super().__init__( + input_params=("input_image", "noise", "tiled", "tile_size", "tile_stride"), + output_params=("latents", "input_latents"), + onload_model_names=("vae",) + ) + + def process(self, pipe: QwenImagePipeline, input_image, noise, tiled, tile_size, tile_stride): + if input_image is None: + return {"latents": noise, "input_latents": None} + pipe.load_models_to_device(['vae']) + image = pipe.preprocess_image(input_image).to(device=pipe.device, dtype=pipe.torch_dtype) + input_latents = pipe.vae.encode(image, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride) + if pipe.scheduler.training: + return {"latents": noise, "input_latents": input_latents} + else: + latents = pipe.scheduler.add_noise(input_latents, noise, timestep=pipe.scheduler.timesteps[0]) + return {"latents": latents, "input_latents": input_latents} +``` + +Then, enable Gradient Checkpointing in 
`model_fn`, which will significantly reduce the VRAM required for training at the cost of computational speed. This is not mandatory, but we strongly recommend doing so. + +Taking Qwen-Image as an example, before modification: + +```python +text, image = block( + image=image, + text=text, + temb=conditioning, + image_rotary_emb=image_rotary_emb, + attention_mask=attention_mask, +) +``` + +After modification: + +```python +from ..core import gradient_checkpoint_forward + +text, image = gradient_checkpoint_forward( + block, + use_gradient_checkpointing, + use_gradient_checkpointing_offload, + image=image, + text=text, + temb=conditioning, + image_rotary_emb=image_rotary_emb, + attention_mask=attention_mask, +) +``` + +## Writing Training Scripts + +`DiffSynth-Studio` does not strictly encapsulate the training framework, but exposes the script content to developers. This approach makes it more convenient to modify training scripts to implement additional functions. Developers can refer to existing training scripts, such as `examples/qwen_image/model_training/train.py`, for modification to adapt to new model training. \ No newline at end of file diff --git a/docs/en/Model_Details/FLUX.md b/docs/en/Model_Details/FLUX.md new file mode 100644 index 0000000..3482e02 --- /dev/null +++ b/docs/en/Model_Details/FLUX.md @@ -0,0 +1,210 @@ +# FLUX + +![Image](https://github.com/user-attachments/assets/c01258e2-f251-441a-aa1e-ebb22f02594d) + +FLUX is an image generation model series developed and open-sourced by Black Forest Labs. + +## Installation + +Before using this project for model inference and training, please install DiffSynth-Studio first. + +```shell +git clone https://github.com/modelscope/DiffSynth-Studio.git +cd DiffSynth-Studio +pip install -e . +``` + +For more information about installation, please refer to [Install Dependencies](/docs/en/Pipeline_Usage/Setup.md). 
+ +## Quick Start + +Run the following code to quickly load the [black-forest-labs/FLUX.1-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.1-dev) model and perform inference. VRAM management is enabled, and the framework will automatically control model parameter loading based on remaining VRAM. Minimum 8GB VRAM is required to run. + +```python +import torch +from diffsynth.pipelines.flux_image import FluxImagePipeline, ModelConfig + +vram_config = { + "offload_dtype": torch.float8_e4m3fn, + "offload_device": "cpu", + "onload_dtype": torch.float8_e4m3fn, + "onload_device": "cpu", + "preparing_dtype": torch.float8_e4m3fn, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", +} +pipe = FluxImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors", **vram_config), + ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors", **vram_config), + ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/*.safetensors", **vram_config), + ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors", **vram_config), + ], + vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, +) + +prompt = "CG, masterpiece, best quality, solo, long hair, wavy hair, silver hair, blue eyes, blue dress, medium breasts, dress, underwater, air bubble, floating hair, refraction, portrait. The girl's flowing silver hair shimmers with every color of the rainbow and cascades down, merging with the floating flora around her." 
+negative_prompt = "worst quality, low quality, monochrome, zombie, interlocked fingers, Aissist, cleavage, nsfw," + +image = pipe(prompt=prompt, seed=0) +image.save("flux.jpg") + +image = pipe( + prompt=prompt, negative_prompt=negative_prompt, + seed=0, cfg_scale=2, num_inference_steps=50, +) +image.save("flux_cfg.jpg") +``` + +## Model Overview + +
+ +Model Lineage + +```mermaid +graph LR; + FLUX.1-Series-->black-forest-labs/FLUX.1-dev; + FLUX.1-Series-->black-forest-labs/FLUX.1-Krea-dev; + FLUX.1-Series-->black-forest-labs/FLUX.1-Kontext-dev; + black-forest-labs/FLUX.1-dev-->FLUX.1-dev-ControlNet-Series; + FLUX.1-dev-ControlNet-Series-->alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta; + FLUX.1-dev-ControlNet-Series-->InstantX/FLUX.1-dev-Controlnet-Union-alpha; + FLUX.1-dev-ControlNet-Series-->jasperai/Flux.1-dev-Controlnet-Upscaler; + black-forest-labs/FLUX.1-dev-->InstantX/FLUX.1-dev-IP-Adapter; + black-forest-labs/FLUX.1-dev-->ByteDance/InfiniteYou; + black-forest-labs/FLUX.1-dev-->DiffSynth-Studio/Eligen; + black-forest-labs/FLUX.1-dev-->DiffSynth-Studio/LoRA-Encoder-FLUX.1-Dev; + black-forest-labs/FLUX.1-dev-->DiffSynth-Studio/LoRAFusion-preview-FLUX.1-dev; + black-forest-labs/FLUX.1-dev-->ostris/Flex.2-preview; + black-forest-labs/FLUX.1-dev-->stepfun-ai/Step1X-Edit; + Qwen/Qwen2.5-VL-7B-Instruct-->stepfun-ai/Step1X-Edit; + black-forest-labs/FLUX.1-dev-->DiffSynth-Studio/Nexus-GenV2; + Qwen/Qwen2.5-VL-7B-Instruct-->DiffSynth-Studio/Nexus-GenV2; +``` + +
+ +| Model ID | Extra Parameters | Inference | Low VRAM Inference | Full Training | Validation After Full Training | LoRA Training | Validation After LoRA Training | +| - | - | - | - | - | - | - | - | +| [black-forest-labs/FLUX.1-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.1-dev) | | [code](/examples/flux/model_inference/FLUX.1-dev.py) | [code](/examples/flux/model_inference_low_vram/FLUX.1-dev.py) | [code](/examples/flux/model_training/full/FLUX.1-dev.sh) | [code](/examples/flux/model_training/validate_full/FLUX.1-dev.py) | [code](/examples/flux/model_training/lora/FLUX.1-dev.sh) | [code](/examples/flux/model_training/validate_lora/FLUX.1-dev.py) | +| [black-forest-labs/FLUX.1-Krea-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.1-Krea-dev) | | [code](/examples/flux/model_inference/FLUX.1-Krea-dev.py) | [code](/examples/flux/model_inference_low_vram/FLUX.1-Krea-dev.py) | [code](/examples/flux/model_training/full/FLUX.1-Krea-dev.sh) | [code](/examples/flux/model_training/validate_full/FLUX.1-Krea-dev.py) | [code](/examples/flux/model_training/lora/FLUX.1-Krea-dev.sh) | [code](/examples/flux/model_training/validate_lora/FLUX.1-Krea-dev.py) | +| [black-forest-labs/FLUX.1-Kontext-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.1-Kontext-dev) | `kontext_images` | [code](/examples/flux/model_inference/FLUX.1-Kontext-dev.py) | [code](/examples/flux/model_inference_low_vram/FLUX.1-Kontext-dev.py) | [code](/examples/flux/model_training/full/FLUX.1-Kontext-dev.sh) | [code](/examples/flux/model_training/validate_full/FLUX.1-Kontext-dev.py) | [code](/examples/flux/model_training/lora/FLUX.1-Kontext-dev.sh) | [code](/examples/flux/model_training/validate_lora/FLUX.1-Kontext-dev.py) | +| [alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta](https://www.modelscope.cn/models/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta) | `controlnet_inputs` | [code](/examples/flux/model_inference/FLUX.1-dev-Controlnet-Inpainting-Beta.py) | 
[code](/examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Inpainting-Beta.py) | [code](/examples/flux/model_training/full/FLUX.1-dev-Controlnet-Inpainting-Beta.sh) | [code](/examples/flux/model_training/validate_full/FLUX.1-dev-Controlnet-Inpainting-Beta.py) | [code](/examples/flux/model_training/lora/FLUX.1-dev-Controlnet-Inpainting-Beta.sh) | [code](/examples/flux/model_training/validate_lora/FLUX.1-dev-Controlnet-Inpainting-Beta.py) | +| [InstantX/FLUX.1-dev-Controlnet-Union-alpha](https://www.modelscope.cn/models/InstantX/FLUX.1-dev-Controlnet-Union-alpha) | `controlnet_inputs` | [code](/examples/flux/model_inference/FLUX.1-dev-Controlnet-Union-alpha.py) | [code](/examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Union-alpha.py) | [code](/examples/flux/model_training/full/FLUX.1-dev-Controlnet-Union-alpha.sh) | [code](/examples/flux/model_training/validate_full/FLUX.1-dev-Controlnet-Union-alpha.py) | [code](/examples/flux/model_training/lora/FLUX.1-dev-Controlnet-Union-alpha.sh) | [code](/examples/flux/model_training/validate_lora/FLUX.1-dev-Controlnet-Union-alpha.py) | +| [jasperai/Flux.1-dev-Controlnet-Upscaler](https://www.modelscope.cn/models/jasperai/Flux.1-dev-Controlnet-Upscaler) | `controlnet_inputs` | [code](/examples/flux/model_inference/FLUX.1-dev-Controlnet-Upscaler.py) | [code](/examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Upscaler.py) | [code](/examples/flux/model_training/full/FLUX.1-dev-Controlnet-Upscaler.sh) | [code](/examples/flux/model_training/validate_full/FLUX.1-dev-Controlnet-Upscaler.py) | [code](/examples/flux/model_training/lora/FLUX.1-dev-Controlnet-Upscaler.sh) | [code](/examples/flux/model_training/validate_lora/FLUX.1-dev-Controlnet-Upscaler.py) | +| [InstantX/FLUX.1-dev-IP-Adapter](https://www.modelscope.cn/models/InstantX/FLUX.1-dev-IP-Adapter) | `ipadapter_images`, `ipadapter_scale` | [code](/examples/flux/model_inference/FLUX.1-dev-IP-Adapter.py) | 
[code](/examples/flux/model_inference_low_vram/FLUX.1-dev-IP-Adapter.py) | [code](/examples/flux/model_training/full/FLUX.1-dev-IP-Adapter.sh) | [code](/examples/flux/model_training/validate_full/FLUX.1-dev-IP-Adapter.py) | [code](/examples/flux/model_training/lora/FLUX.1-dev-IP-Adapter.sh) | [code](/examples/flux/model_training/validate_lora/FLUX.1-dev-IP-Adapter.py) | +| [ByteDance/InfiniteYou](https://www.modelscope.cn/models/ByteDance/InfiniteYou) | `infinityou_id_image`, `infinityou_guidance`, `controlnet_inputs` | [code](/examples/flux/model_inference/FLUX.1-dev-InfiniteYou.py) | [code](/examples/flux/model_inference_low_vram/FLUX.1-dev-InfiniteYou.py) | [code](/examples/flux/model_training/full/FLUX.1-dev-InfiniteYou.sh) | [code](/examples/flux/model_training/validate_full/FLUX.1-dev-InfiniteYou.py) | [code](/examples/flux/model_training/lora/FLUX.1-dev-InfiniteYou.sh) | [code](/examples/flux/model_training/validate_lora/FLUX.1-dev-InfiniteYou.py) | +| [DiffSynth-Studio/Eligen](https://www.modelscope.cn/models/DiffSynth-Studio/Eligen) | `eligen_entity_prompts`, `eligen_entity_masks`, `eligen_enable_on_negative`, `eligen_enable_inpaint` | [code](/examples/flux/model_inference/FLUX.1-dev-EliGen.py) | [code](/examples/flux/model_inference_low_vram/FLUX.1-dev-EliGen.py) | - | - | [code](/examples/flux/model_training/lora/FLUX.1-dev-EliGen.sh) | [code](/examples/flux/model_training/validate_lora/FLUX.1-dev-EliGen.py) | +| [DiffSynth-Studio/LoRA-Encoder-FLUX.1-Dev](https://www.modelscope.cn/models/DiffSynth-Studio/LoRA-Encoder-FLUX.1-Dev) | `lora_encoder_inputs`, `lora_encoder_scale` | [code](/examples/flux/model_inference/FLUX.1-dev-LoRA-Encoder.py) | [code](/examples/flux/model_inference_low_vram/FLUX.1-dev-LoRA-Encoder.py) | [code](/examples/flux/model_training/full/FLUX.1-dev-LoRA-Encoder.sh) | [code](/examples/flux/model_training/validate_full/FLUX.1-dev-LoRA-Encoder.py) | - | - | +| 
[DiffSynth-Studio/LoRAFusion-preview-FLUX.1-dev](https://modelscope.cn/models/DiffSynth-Studio/LoRAFusion-preview-FLUX.1-dev) | | [code](/examples/flux/model_inference/FLUX.1-dev-LoRA-Fusion.py) | - | - | - | - | - | +| [stepfun-ai/Step1X-Edit](https://www.modelscope.cn/models/stepfun-ai/Step1X-Edit) | `step1x_reference_image` | [code](/examples/flux/model_inference/Step1X-Edit.py) | [code](/examples/flux/model_inference_low_vram/Step1X-Edit.py) | [code](/examples/flux/model_training/full/Step1X-Edit.sh) | [code](/examples/flux/model_training/validate_full/Step1X-Edit.py) | [code](/examples/flux/model_training/lora/Step1X-Edit.sh) | [code](/examples/flux/model_training/validate_lora/Step1X-Edit.py) | +| [ostris/Flex.2-preview](https://www.modelscope.cn/models/ostris/Flex.2-preview) | `flex_inpaint_image`, `flex_inpaint_mask`, `flex_control_image`, `flex_control_strength`, `flex_control_stop` | [code](/examples/flux/model_inference/FLEX.2-preview.py) | [code](/examples/flux/model_inference_low_vram/FLEX.2-preview.py) | [code](/examples/flux/model_training/full/FLEX.2-preview.sh) | [code](/examples/flux/model_training/validate_full/FLEX.2-preview.py) | [code](/examples/flux/model_training/lora/FLEX.2-preview.sh) | [code](/examples/flux/model_training/validate_lora/FLEX.2-preview.py) | +| [DiffSynth-Studio/Nexus-GenV2](https://www.modelscope.cn/models/DiffSynth-Studio/Nexus-GenV2) | `nexus_gen_reference_image` | [code](/examples/flux/model_inference/Nexus-Gen-Editing.py) | [code](/examples/flux/model_inference_low_vram/Nexus-Gen-Editing.py) | [code](/examples/flux/model_training/full/Nexus-Gen.sh) | [code](/examples/flux/model_training/validate_full/Nexus-Gen.py) | [code](/examples/flux/model_training/lora/Nexus-Gen.sh) | [code](/examples/flux/model_training/validate_lora/Nexus-Gen.py) | + +Special Training Scripts: + +* Differential LoRA Training: [doc](/docs/en/Training/Differential_LoRA.md), [code](/examples/flux/model_training/special/differential_training/) +* 
FP8 Precision Training: [doc](/docs/en/Training/FP8_Precision.md), [code](/examples/flux/model_training/special/fp8_training/) +* Two-stage Split Training: [doc](/docs/en/Training/Split_Training.md), [code](/examples/flux/model_training/special/split_training/) +* End-to-end Direct Distillation: [doc](/docs/en/Training/Direct_Distill.md), [code](/examples/flux/model_training/lora/FLUX.1-dev-Distill-LoRA.sh) + +## Model Inference + +Models are loaded via `FluxImagePipeline.from_pretrained`, see [Loading Models](/docs/en/Pipeline_Usage/Model_Inference.md#loading-models). + +Input parameters for `FluxImagePipeline` inference include: + +* `prompt`: Prompt describing the content appearing in the image. +* `negative_prompt`: Negative prompt describing content that should not appear in the image, default value is `""`. +* `cfg_scale`: Classifier-free guidance parameter, default value is 1. When set to a value greater than 1, CFG is enabled. +* `height`: Image height, must be a multiple of 16. +* `width`: Image width, must be a multiple of 16. +* `seed`: Random seed. Default is `None`, meaning completely random. +* `rand_device`: Computing device for generating random Gaussian noise matrix, default is `"cpu"`. When set to `cuda`, different GPUs will produce different generation results. +* `num_inference_steps`: Number of inference steps, default value is 30. +* `embedded_guidance`: Embedded guidance parameter, default value is 3.5. +* `t5_sequence_length`: Sequence length of the T5 text encoder, default is 512. +* `tiled`: Whether to enable VAE tiling inference, default is `False`. Setting to `True` can significantly reduce VRAM usage during VAE encoding/decoding stages, producing slight errors and slightly longer inference time. +* `tile_size`: Tile size during VAE encoding/decoding stages, default is 128, only effective when `tiled=True`. 
+* `tile_stride`: Tile stride during VAE encoding/decoding stages, default is 64, only effective when `tiled=True`, must be less than or equal to `tile_size`. +* `progress_bar_cmd`: Progress bar, default is `tqdm.tqdm`. Can be disabled by setting to `lambda x:x`. +* `controlnet_inputs`: ControlNet model inputs, type is `ControlNetInput` list. +* `ipadapter_images`: IP-Adapter model input image list. +* `ipadapter_scale`: Guidance strength of the IP-Adapter model. +* `infinityou_id_image`: InfiniteYou model input image. +* `infinityou_guidance`: Guidance strength of the InfiniteYou model. +* `kontext_images`: Kontext model input images. +* `eligen_entity_prompts`: EliGen partition control prompt list. +* `eligen_entity_masks`: EliGen partition control region mask image list. +* `eligen_enable_on_negative`: Whether to enable EliGen partition control on the negative side of CFG. +* `eligen_enable_inpaint`: Whether to enable EliGen partition control inpainting function. +* `lora_encoder_inputs`: LoRA encoder input image list. +* `lora_encoder_scale`: Guidance strength of the LoRA encoder. +* `step1x_reference_image`: Step1X model reference image. +* `flex_inpaint_image`: Flex model image to be inpainted. +* `flex_inpaint_mask`: Flex model inpainting mask. +* `flex_control_image`: Flex model control image. +* `flex_control_strength`: Flex model control strength. +* `flex_control_stop`: Flex model control stop timestep. +* `nexus_gen_reference_image`: Nexus-Gen model reference image. + +If VRAM is insufficient, please enable [VRAM Management](/docs/en/Pipeline_Usage/VRAM_management.md). We provide recommended low VRAM configurations for each model in the example code, see the table in the "Model Overview" section above. 
+ +## Model Training + +FLUX series models are uniformly trained through [`examples/flux/model_training/train.py`](/examples/flux/model_training/train.py), and the script parameters include: + +* General Training Parameters + * Dataset Basic Configuration + * `--dataset_base_path`: Root directory of the dataset. + * `--dataset_metadata_path`: Metadata file path of the dataset. + * `--dataset_repeat`: Number of times the dataset is repeated in each epoch. + * `--dataset_num_workers`: Number of processes for each DataLoader. + * `--data_file_keys`: Field names to be loaded from metadata, usually image or video file paths, separated by `,`. + * Model Loading Configuration + * `--model_paths`: Paths of models to be loaded. JSON format. + * `--model_id_with_origin_paths`: Model IDs with original paths, e.g., `"black-forest-labs/FLUX.1-dev:flux1-dev.safetensors"`. Separated by commas. + * `--extra_inputs`: Extra input parameters required by the model Pipeline, e.g., `controlnet_inputs` when training ControlNet models, separated by `,`. + * `--fp8_models`: Models loaded in FP8 format, consistent with `--model_paths` or `--model_id_with_origin_paths` format. Currently only supports models whose parameters are not updated by gradients (no gradient backpropagation, or gradients only update their LoRA). + * Training Basic Configuration + * `--learning_rate`: Learning rate. + * `--num_epochs`: Number of epochs. + * `--trainable_models`: Trainable models, e.g., `dit`, `vae`, `text_encoder`. + * `--find_unused_parameters`: Whether there are unused parameters in DDP training. Some models contain redundant parameters that do not participate in gradient calculation, and this setting needs to be enabled to avoid errors in multi-GPU training. + * `--weight_decay`: Weight decay size, see [torch.optim.AdamW](https://docs.pytorch.org/docs/stable/generated/torch.optim.AdamW.html). + * `--task`: Training task, default is `sft`. 
Some models support more training modes, please refer to the documentation of each specific model. + * Output Configuration + * `--output_path`: Model saving path. + * `--remove_prefix_in_ckpt`: Remove prefix in the state dict of the model file. + * `--save_steps`: Interval of training steps to save the model. If this parameter is left blank, the model is saved once per epoch. + * LoRA Configuration + * `--lora_base_model`: Which model to add LoRA to. + * `--lora_target_modules`: Which layers to add LoRA to. + * `--lora_rank`: Rank of LoRA. + * `--lora_checkpoint`: Path of the LoRA checkpoint. If this path is provided, LoRA will be loaded from this checkpoint. + * `--preset_lora_path`: Preset LoRA checkpoint path. If this path is provided, this LoRA will be loaded in the form of being merged into the base model. This parameter is used for LoRA differential training. + * `--preset_lora_model`: Model that the preset LoRA is merged into, e.g., `dit`. + * Gradient Configuration + * `--use_gradient_checkpointing`: Whether to enable gradient checkpointing. + * `--use_gradient_checkpointing_offload`: Whether to offload gradient checkpointing to memory. + * `--gradient_accumulation_steps`: Number of gradient accumulation steps. + * Image Width/Height Configuration (Applicable to Image Generation and Video Generation Models) + * `--height`: Height of image or video. Leave `height` and `width` blank to enable dynamic resolution. + * `--width`: Width of image or video. Leave `height` and `width` blank to enable dynamic resolution. + * `--max_pixels`: Maximum pixel area of image or video frames. When dynamic resolution is enabled, images with resolution larger than this value will be downscaled, and images with resolution smaller than this value will remain unchanged. +* FLUX Specific Parameters + * `--tokenizer_1_path`: Path of the CLIP tokenizer, leave blank to automatically download from remote. 
We have written recommended training scripts for each model, please refer to the table in the "Model Overview" section above. For how to write model training scripts, please refer to [Model Training](/docs/en/Pipeline_Usage/Model_Training.md); for more advanced training algorithms, please refer to [Training Framework Detailed Explanation](/docs/en/Training/).
+ +```python +from diffsynth.pipelines.flux2_image import Flux2ImagePipeline, ModelConfig +import torch + +vram_config = { + "offload_dtype": "disk", + "offload_device": "disk", + "onload_dtype": torch.float8_e4m3fn, + "onload_device": "cpu", + "preparing_dtype": torch.float8_e4m3fn, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", +} +pipe = Flux2ImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="black-forest-labs/FLUX.2-dev", origin_file_pattern="text_encoder/*.safetensors", **vram_config), + ModelConfig(model_id="black-forest-labs/FLUX.2-dev", origin_file_pattern="transformer/*.safetensors", **vram_config), + ModelConfig(model_id="black-forest-labs/FLUX.2-dev", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), + ], + tokenizer_config=ModelConfig(model_id="black-forest-labs/FLUX.2-dev", origin_file_pattern="tokenizer/"), + vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, +) +prompt = "Realistic macro photograph of a hermit crab using a soda can as its shell, partially emerging from the can, captured with sharp detail and natural colors, on a sunlit beach with soft shadows and a shallow depth of field, with blurred ocean waves in the background. The can has the text `BFL Diffusers` on it and it has a color gradient that start with #FF5733 at the top and transitions to #33FF57 at the bottom." 
+image = pipe(prompt, seed=42, rand_device="cuda", num_inference_steps=50) +image.save("image_FLUX.2-dev.jpg") +``` + +## Model Overview + +| Model ID | Inference | Low VRAM Inference | LoRA Training | Validation After LoRA Training | +| - | - | - | - | - | +| [black-forest-labs/FLUX.2-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.2-dev) | [code](/examples/flux2/model_inference/FLUX.2-dev.py) | [code](/examples/flux2/model_inference_low_vram/FLUX.2-dev.py) | [code](/examples/flux2/model_training/lora/FLUX.2-dev.sh) | [code](/examples/flux2/model_training/validate_lora/FLUX.2-dev.py) | + +Special Training Scripts: + +* Differential LoRA Training: [doc](/docs/en/Training/Differential_LoRA.md), [code](/examples/flux/model_training/special/differential_training/) +* FP8 Precision Training: [doc](/docs/en/Training/FP8_Precision.md), [code](/examples/flux/model_training/special/fp8_training/) +* Two-stage Split Training: [doc](/docs/en/Training/Split_Training.md), [code](/examples/flux/model_training/special/split_training/) +* End-to-end Direct Distillation: [doc](/docs/en/Training/Direct_Distill.md), [code](/examples/flux/model_training/lora/FLUX.1-dev-Distill-LoRA.sh) + +## Model Inference + +Models are loaded via `Flux2ImagePipeline.from_pretrained`, see [Loading Models](/docs/en/Pipeline_Usage/Model_Inference.md#loading-models). + +Input parameters for `Flux2ImagePipeline` inference include: + +* `prompt`: Prompt describing the content appearing in the image. +* `negative_prompt`: Negative prompt describing content that should not appear in the image, default value is `""`. +* `cfg_scale`: Classifier-free guidance parameter, default value is 1. When set to a value greater than 1, CFG is enabled. +* `height`: Image height, must be a multiple of 16. +* `width`: Image width, must be a multiple of 16. +* `seed`: Random seed. Default is `None`, meaning completely random. 
+* `rand_device`: Computing device for generating random Gaussian noise matrix, default is `"cpu"`. When set to `"cuda"`, different GPUs will produce different generation results.
+* `num_inference_steps`: Number of inference steps, default value is 30.
+* `embedded_guidance`: Embedded guidance parameter, default value is 3.5.
+* `t5_sequence_length`: Sequence length of the T5 text encoder, default is 512.
+* `tiled`: Whether to enable VAE tiling inference, default is `False`. Setting to `True` can significantly reduce VRAM usage during VAE encoding/decoding stages, at the cost of slight numerical errors and slightly longer inference time.
+* `tile_size`: Tile size during VAE encoding/decoding stages, default is 128, only effective when `tiled=True`.
+* `tile_stride`: Tile stride during VAE encoding/decoding stages, default is 64, only effective when `tiled=True`, must be less than or equal to `tile_size`.
+* `progress_bar_cmd`: Progress bar, default is `tqdm.tqdm`. Can be disabled by setting to `lambda x:x`.
+
+If VRAM is insufficient, please enable [VRAM Management](/docs/en/Pipeline_Usage/VRAM_management.md). We provide recommended low VRAM configurations for each model in the example code, see the table in the "Model Overview" section above.
+
+## Model Training
+
+FLUX.2 series models are uniformly trained through [`examples/flux2/model_training/train.py`](/examples/flux2/model_training/train.py), and the script parameters include:
+
+* General Training Parameters
+  * Dataset Basic Configuration
+    * `--dataset_base_path`: Root directory of the dataset.
+    * `--dataset_metadata_path`: Metadata file path of the dataset.
+    * `--dataset_repeat`: Number of times the dataset is repeated in each epoch.
+    * `--dataset_num_workers`: Number of processes for each DataLoader.
+    * `--data_file_keys`: Field names to be loaded from metadata, usually image or video file paths, separated by `,`.
+  * Model Loading Configuration
+    * `--model_paths`: Paths of models to be loaded. JSON format.
+ * `--model_id_with_origin_paths`: Model IDs with original paths, e.g., `"black-forest-labs/FLUX.2-dev:text_encoder/*.safetensors"`. Separated by commas. + * `--extra_inputs`: Extra input parameters required by the model Pipeline, e.g., `controlnet_inputs` when training ControlNet models, separated by `,`. + * `--fp8_models`: Models loaded in FP8 format, consistent with `--model_paths` or `--model_id_with_origin_paths` format. Currently only supports models whose parameters are not updated by gradients (no gradient backpropagation, or gradients only update their LoRA). + * Training Basic Configuration + * `--learning_rate`: Learning rate. + * `--num_epochs`: Number of epochs. + * `--trainable_models`: Trainable models, e.g., `dit`, `vae`, `text_encoder`. + * `--find_unused_parameters`: Whether there are unused parameters in DDP training. Some models contain redundant parameters that do not participate in gradient calculation, and this setting needs to be enabled to avoid errors in multi-GPU training. + * `--weight_decay`: Weight decay size, see [torch.optim.AdamW](https://docs.pytorch.org/docs/stable/generated/torch.optim.AdamW.html). + * `--task`: Training task, default is `sft`. Some models support more training modes, please refer to the documentation of each specific model. + * Output Configuration + * `--output_path`: Model saving path. + * `--remove_prefix_in_ckpt`: Remove prefix in the state dict of the model file. + * `--save_steps`: Interval of training steps to save the model. If this parameter is left blank, the model is saved once per epoch. + * LoRA Configuration + * `--lora_base_model`: Which model to add LoRA to. + * `--lora_target_modules`: Which layers to add LoRA to. + * `--lora_rank`: Rank of LoRA. + * `--lora_checkpoint`: Path of the LoRA checkpoint. If this path is provided, LoRA will be loaded from this checkpoint. + * `--preset_lora_path`: Preset LoRA checkpoint path. 
If this path is provided, the LoRA will be loaded and merged into the base model. This parameter is used for LoRA differential training.
+    * `--preset_lora_model`: Model that the preset LoRA is merged into, e.g., `dit`.
+  * Gradient Configuration
+    * `--use_gradient_checkpointing`: Whether to enable gradient checkpointing.
+    * `--use_gradient_checkpointing_offload`: Whether to offload gradient checkpointing to memory.
+    * `--gradient_accumulation_steps`: Number of gradient accumulation steps.
+  * Image Width/Height Configuration (Applicable to Image Generation and Video Generation Models)
+    * `--height`: Height of image or video. Leave `height` and `width` blank to enable dynamic resolution.
+    * `--width`: Width of image or video. Leave `height` and `width` blank to enable dynamic resolution.
+    * `--max_pixels`: Maximum pixel area of image or video frames. When dynamic resolution is enabled, images with resolution larger than this value will be downscaled, and images with resolution smaller than this value will remain unchanged.
+* FLUX.2 Specific Parameters
+  * `--tokenizer_path`: Path of the tokenizer, applicable to text-to-image models, leave blank to automatically download from remote.
+
+We have built a sample image dataset for your testing. You can download this dataset with the following command:
+
+```shell
+modelscope download --dataset DiffSynth-Studio/example_image_dataset --local_dir ./data/example_image_dataset
+```
+
+We have written recommended training scripts for each model, please refer to the table in the "Model Overview" section above. For how to write model training scripts, please refer to [Model Training](/docs/en/Pipeline_Usage/Model_Training.md); for more advanced training algorithms, please refer to [Training Framework Detailed Explanation](/docs/en/Training/).
\ No newline at end of file diff --git a/docs/en/Model_Details/Overview.md b/docs/en/Model_Details/Overview.md new file mode 100644 index 0000000..5df8593 --- /dev/null +++ b/docs/en/Model_Details/Overview.md @@ -0,0 +1,291 @@ +# Model Directory + +## Qwen-Image + +Documentation: [./Qwen-Image.md](/docs/en/Model_Details/Qwen-Image.md) + +
+ +Effect Preview + +![Image](https://github.com/user-attachments/assets/738078d8-8749-4a53-a046-571861541924) + +
+ +
+ +Quick Start + +```python +from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig +from PIL import Image +import torch + +pipe = QwenImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), + ], + tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), +) +prompt = "精致肖像,水下少女,蓝裙飘逸,发丝轻扬,光影透澈,气泡环绕,面容恬静,细节精致,梦幻唯美。" +image = pipe( + prompt, seed=0, num_inference_steps=40, + # edit_image=Image.open("xxx.jpg").resize((1328, 1328)) # For Qwen-Image-Edit +) +image.save("image.jpg") +``` + +
+ +
+ +Model Lineage + +```mermaid +graph LR; + Qwen/Qwen-Image-->Qwen/Qwen-Image-Edit; + Qwen/Qwen-Image-Edit-->Qwen/Qwen-Image-Edit-2509; + Qwen/Qwen-Image-->EliGen-Series; + EliGen-Series-->DiffSynth-Studio/Qwen-Image-EliGen; + DiffSynth-Studio/Qwen-Image-EliGen-->DiffSynth-Studio/Qwen-Image-EliGen-V2; + EliGen-Series-->DiffSynth-Studio/Qwen-Image-EliGen-Poster; + Qwen/Qwen-Image-->Distill-Series; + Distill-Series-->DiffSynth-Studio/Qwen-Image-Distill-Full; + Distill-Series-->DiffSynth-Studio/Qwen-Image-Distill-LoRA; + Qwen/Qwen-Image-->ControlNet-Series; + ControlNet-Series-->Blockwise-ControlNet-Series; + Blockwise-ControlNet-Series-->DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny; + Blockwise-ControlNet-Series-->DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth; + Blockwise-ControlNet-Series-->DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Inpaint; + ControlNet-Series-->DiffSynth-Studio/Qwen-Image-In-Context-Control-Union; + Qwen/Qwen-Image-->DiffSynth-Studio/Qwen-Image-Edit-Lowres-Fix; +``` + +
+ +| Model ID | Inference | Low VRAM Inference | Full Training | Validation After Full Training | LoRA Training | Validation After LoRA Training | +| - | - | - | - | - | - | - | +| [Qwen/Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image) | [code](/examples/qwen_image/model_inference/Qwen-Image.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image.py) | [code](/examples/qwen_image/model_training/full/Qwen-Image.sh) | [code](/examples/qwen_image/model_training/validate_full/Qwen-Image.py) | [code](/examples/qwen_image/model_training/lora/Qwen-Image.sh) | [code](/examples/qwen_image/model_training/validate_lora/Qwen-Image.py) | +| [Qwen/Qwen-Image-Edit](https://www.modelscope.cn/models/Qwen/Qwen-Image-Edit) | [code](/examples/qwen_image/model_inference/Qwen-Image-Edit.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit.py) | [code](/examples/qwen_image/model_training/full/Qwen-Image-Edit.sh) | [code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Edit.py) | [code](/examples/qwen_image/model_training/lora/Qwen-Image-Edit.sh) | [code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Edit.py) | +| [Qwen/Qwen-Image-Edit-2509](https://www.modelscope.cn/models/Qwen/Qwen-Image-Edit-2509) | [code](/examples/qwen_image/model_inference/Qwen-Image-Edit-2509.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-2509.py) | [code](/examples/qwen_image/model_training/full/Qwen-Image-Edit-2509.sh) | [code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Edit-2509.py) | [code](/examples/qwen_image/model_training/lora/Qwen-Image-Edit-2509.sh) | [code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Edit-2509.py) | +| [DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen) | [code](/examples/qwen_image/model_inference/Qwen-Image-EliGen.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen.py) | 
- | - | [code](/examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh) | [code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py) | +| [DiffSynth-Studio/Qwen-Image-EliGen-V2](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen-V2) | [code](/examples/qwen_image/model_inference/Qwen-Image-EliGen-V2.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen-V2.py) | - | - | [code](/examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh) | [code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py) | +| [DiffSynth-Studio/Qwen-Image-EliGen-Poster](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen-Poster) | [code](/examples/qwen_image/model_inference/Qwen-Image-EliGen-Poster.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen-Poster.py) | - | - | [code](/examples/qwen_image/model_training/lora/Qwen-Image-EliGen-Poster.sh) | [code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen-Poster.py) | +| [DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full) | [code](/examples/qwen_image/model_inference/Qwen-Image-Distill-Full.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Distill-Full.py) | [code](/examples/qwen_image/model_training/full/Qwen-Image-Distill-Full.sh) | [code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Distill-Full.py) | [code](/examples/qwen_image/model_training/lora/Qwen-Image-Distill-Full.sh) | [code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-Full.py) | +| [DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA) | [code](/examples/qwen_image/model_inference/Qwen-Image-Distill-LoRA.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Distill-LoRA.py) | - | - | 
[code](/examples/qwen_image/model_training/lora/Qwen-Image-Distill-LoRA.sh) | [code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-LoRA.py) | +| [DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny) | [code](/examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Canny.py) | [code](/examples/qwen_image/model_training/full/Qwen-Image-Blockwise-ControlNet-Canny.sh) | [code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Canny.py) | [code](/examples/qwen_image/model_training/lora/Qwen-Image-Blockwise-ControlNet-Canny.sh) | [code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Canny.py) | +| [DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth) | [code](/examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Depth.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Depth.py) | [code](/examples/qwen_image/model_training/full/Qwen-Image-Blockwise-ControlNet-Depth.sh) | [code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Depth.py) | [code](/examples/qwen_image/model_training/lora/Qwen-Image-Blockwise-ControlNet-Depth.sh) | [code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Depth.py) | +| [DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Inpaint](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Inpaint) | [code](/examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Inpaint.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Inpaint.py) | 
[code](/examples/qwen_image/model_training/full/Qwen-Image-Blockwise-ControlNet-Inpaint.sh) | [code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Inpaint.py) | [code](/examples/qwen_image/model_training/lora/Qwen-Image-Blockwise-ControlNet-Inpaint.sh) | [code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Inpaint.py) | +| [DiffSynth-Studio/Qwen-Image-In-Context-Control-Union](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-In-Context-Control-Union) | [code](/examples/qwen_image/model_inference/Qwen-Image-In-Context-Control-Union.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-In-Context-Control-Union.py) | - | - | [code](/examples/qwen_image/model_training/lora/Qwen-Image-In-Context-Control-Union.sh) | [code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-In-Context-Control-Union.py) | +| [DiffSynth-Studio/Qwen-Image-Edit-Lowres-Fix](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Edit-Lowres-Fix) | [code](/examples/qwen_image/model_inference/Qwen-Image-Edit-Lowres-Fix.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-Lowres-Fix.py) | - | - | - | - | + +## FLUX Series + +Documentation: [./FLUX.md](/docs/en/Model_Details/FLUX.md) + +
+ +Effect Preview + +![Image](https://github.com/user-attachments/assets/c01258e2-f251-441a-aa1e-ebb22f02594d) + +
+ +
+ +Quick Start + +```python +import torch +from diffsynth.pipelines.flux_image import FluxImagePipeline, ModelConfig + +pipe = FluxImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors"), + ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors"), + ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/*.safetensors"), + ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors"), + ], +) + +image = pipe(prompt="a cat", seed=0) +image.save("image.jpg") +``` + +
+ +
+ +Model Lineage + +```mermaid +graph LR; + FLUX.1-Series-->black-forest-labs/FLUX.1-dev; + FLUX.1-Series-->black-forest-labs/FLUX.1-Krea-dev; + FLUX.1-Series-->black-forest-labs/FLUX.1-Kontext-dev; + black-forest-labs/FLUX.1-dev-->FLUX.1-dev-ControlNet-Series; + FLUX.1-dev-ControlNet-Series-->alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta; + FLUX.1-dev-ControlNet-Series-->InstantX/FLUX.1-dev-Controlnet-Union-alpha; + FLUX.1-dev-ControlNet-Series-->jasperai/Flux.1-dev-Controlnet-Upscaler; + black-forest-labs/FLUX.1-dev-->InstantX/FLUX.1-dev-IP-Adapter; + black-forest-labs/FLUX.1-dev-->ByteDance/InfiniteYou; + black-forest-labs/FLUX.1-dev-->DiffSynth-Studio/Eligen; + black-forest-labs/FLUX.1-dev-->DiffSynth-Studio/LoRA-Encoder-FLUX.1-Dev; + black-forest-labs/FLUX.1-dev-->DiffSynth-Studio/LoRAFusion-preview-FLUX.1-dev; + black-forest-labs/FLUX.1-dev-->ostris/Flex.2-preview; + black-forest-labs/FLUX.1-dev-->stepfun-ai/Step1X-Edit; + Qwen/Qwen2.5-VL-7B-Instruct-->stepfun-ai/Step1X-Edit; + black-forest-labs/FLUX.1-dev-->DiffSynth-Studio/Nexus-GenV2; + Qwen/Qwen2.5-VL-7B-Instruct-->DiffSynth-Studio/Nexus-GenV2; +``` + +
+ +| Model ID | Extra Parameters | Inference | Low VRAM Inference | Full Training | Validation After Full Training | LoRA Training | Validation After LoRA Training | +| - | - | - | - | - | - | - | - | +| [black-forest-labs/FLUX.1-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.1-dev) | | [code](/examples/flux/model_inference/FLUX.1-dev.py) | [code](/examples/flux/model_inference_low_vram/FLUX.1-dev.py) | [code](/examples/flux/model_training/full/FLUX.1-dev.sh) | [code](/examples/flux/model_training/validate_full/FLUX.1-dev.py) | [code](/examples/flux/model_training/lora/FLUX.1-dev.sh) | [code](/examples/flux/model_training/validate_lora/FLUX.1-dev.py) | +| [black-forest-labs/FLUX.1-Krea-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.1-Krea-dev) | | [code](/examples/flux/model_inference/FLUX.1-Krea-dev.py) | [code](/examples/flux/model_inference_low_vram/FLUX.1-Krea-dev.py) | [code](/examples/flux/model_training/full/FLUX.1-Krea-dev.sh) | [code](/examples/flux/model_training/validate_full/FLUX.1-Krea-dev.py) | [code](/examples/flux/model_training/lora/FLUX.1-Krea-dev.sh) | [code](/examples/flux/model_training/validate_lora/FLUX.1-Krea-dev.py) | +| [black-forest-labs/FLUX.1-Kontext-dev](https://www.modelscope.cn/models/black-forest-labs/FLUX.1-Kontext-dev) | `kontext_images` | [code](/examples/flux/model_inference/FLUX.1-Kontext-dev.py) | [code](/examples/flux/model_inference_low_vram/FLUX.1-Kontext-dev.py) | [code](/examples/flux/model_training/full/FLUX.1-Kontext-dev.sh) | [code](/examples/flux/model_training/validate_full/FLUX.1-Kontext-dev.py) | [code](/examples/flux/model_training/lora/FLUX.1-Kontext-dev.sh) | [code](/examples/flux/model_training/validate_lora/FLUX.1-Kontext-dev.py) | +| [alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta](https://www.modelscope.cn/models/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta) | `controlnet_inputs` | [code](/examples/flux/model_inference/FLUX.1-dev-Controlnet-Inpainting-Beta.py) | 
[code](/examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Inpainting-Beta.py) | [code](/examples/flux/model_training/full/FLUX.1-dev-Controlnet-Inpainting-Beta.sh) | [code](/examples/flux/model_training/validate_full/FLUX.1-dev-Controlnet-Inpainting-Beta.py) | [code](/examples/flux/model_training/lora/FLUX.1-dev-Controlnet-Inpainting-Beta.sh) | [code](/examples/flux/model_training/validate_lora/FLUX.1-dev-Controlnet-Inpainting-Beta.py) | +| [InstantX/FLUX.1-dev-Controlnet-Union-alpha](https://www.modelscope.cn/models/InstantX/FLUX.1-dev-Controlnet-Union-alpha) | `controlnet_inputs` | [code](/examples/flux/model_inference/FLUX.1-dev-Controlnet-Union-alpha.py) | [code](/examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Union-alpha.py) | [code](/examples/flux/model_training/full/FLUX.1-dev-Controlnet-Union-alpha.sh) | [code](/examples/flux/model_training/validate_full/FLUX.1-dev-Controlnet-Union-alpha.py) | [code](/examples/flux/model_training/lora/FLUX.1-dev-Controlnet-Union-alpha.sh) | [code](/examples/flux/model_training/validate_lora/FLUX.1-dev-Controlnet-Union-alpha.py) | +| [jasperai/Flux.1-dev-Controlnet-Upscaler](https://www.modelscope.cn/models/jasperai/Flux.1-dev-Controlnet-Upscaler) | `controlnet_inputs` | [code](/examples/flux/model_inference/FLUX.1-dev-Controlnet-Upscaler.py) | [code](/examples/flux/model_inference_low_vram/FLUX.1-dev-Controlnet-Upscaler.py) | [code](/examples/flux/model_training/full/FLUX.1-dev-Controlnet-Upscaler.sh) | [code](/examples/flux/model_training/validate_full/FLUX.1-dev-Controlnet-Upscaler.py) | [code](/examples/flux/model_training/lora/FLUX.1-dev-Controlnet-Upscaler.sh) | [code](/examples/flux/model_training/validate_lora/FLUX.1-dev-Controlnet-Upscaler.py) | +| [InstantX/FLUX.1-dev-IP-Adapter](https://www.modelscope.cn/models/InstantX/FLUX.1-dev-IP-Adapter) | `ipadapter_images`, `ipadapter_scale` | [code](/examples/flux/model_inference/FLUX.1-dev-IP-Adapter.py) | 
[code](/examples/flux/model_inference_low_vram/FLUX.1-dev-IP-Adapter.py) | [code](/examples/flux/model_training/full/FLUX.1-dev-IP-Adapter.sh) | [code](/examples/flux/model_training/validate_full/FLUX.1-dev-IP-Adapter.py) | [code](/examples/flux/model_training/lora/FLUX.1-dev-IP-Adapter.sh) | [code](/examples/flux/model_training/validate_lora/FLUX.1-dev-IP-Adapter.py) | +| [ByteDance/InfiniteYou](https://www.modelscope.cn/models/ByteDance/InfiniteYou) | `infinityou_id_image`, `infinityou_guidance`, `controlnet_inputs` | [code](/examples/flux/model_inference/FLUX.1-dev-InfiniteYou.py) | [code](/examples/flux/model_inference_low_vram/FLUX.1-dev-InfiniteYou.py) | [code](/examples/flux/model_training/full/FLUX.1-dev-InfiniteYou.sh) | [code](/examples/flux/model_training/validate_full/FLUX.1-dev-InfiniteYou.py) | [code](/examples/flux/model_training/lora/FLUX.1-dev-InfiniteYou.sh) | [code](/examples/flux/model_training/validate_lora/FLUX.1-dev-InfiniteYou.py) | +| [DiffSynth-Studio/Eligen](https://www.modelscope.cn/models/DiffSynth-Studio/Eligen) | `eligen_entity_prompts`, `eligen_entity_masks`, `eligen_enable_on_negative`, `eligen_enable_inpaint` | [code](/examples/flux/model_inference/FLUX.1-dev-EliGen.py) | [code](/examples/flux/model_inference_low_vram/FLUX.1-dev-EliGen.py) | - | - | [code](/examples/flux/model_training/lora/FLUX.1-dev-EliGen.sh) | [code](/examples/flux/model_training/validate_lora/FLUX.1-dev-EliGen.py) | +| [DiffSynth-Studio/LoRA-Encoder-FLUX.1-Dev](https://www.modelscope.cn/models/DiffSynth-Studio/LoRA-Encoder-FLUX.1-Dev) | `lora_encoder_inputs`, `lora_encoder_scale` | [code](/examples/flux/model_inference/FLUX.1-dev-LoRA-Encoder.py) | [code](/examples/flux/model_inference_low_vram/FLUX.1-dev-LoRA-Encoder.py) | [code](/examples/flux/model_training/full/FLUX.1-dev-LoRA-Encoder.sh) | [code](/examples/flux/model_training/validate_full/FLUX.1-dev-LoRA-Encoder.py) | - | - | +| 
[DiffSynth-Studio/LoRAFusion-preview-FLUX.1-dev](https://modelscope.cn/models/DiffSynth-Studio/LoRAFusion-preview-FLUX.1-dev) | | [code](/examples/flux/model_inference/FLUX.1-dev-LoRA-Fusion.py) | - | - | - | - | - | +| [stepfun-ai/Step1X-Edit](https://www.modelscope.cn/models/stepfun-ai/Step1X-Edit) | `step1x_reference_image` | [code](/examples/flux/model_inference/Step1X-Edit.py) | [code](/examples/flux/model_inference_low_vram/Step1X-Edit.py) | [code](/examples/flux/model_training/full/Step1X-Edit.sh) | [code](/examples/flux/model_training/validate_full/Step1X-Edit.py) | [code](/examples/flux/model_training/lora/Step1X-Edit.sh) | [code](/examples/flux/model_training/validate_lora/Step1X-Edit.py) | +| [ostris/Flex.2-preview](https://www.modelscope.cn/models/ostris/Flex.2-preview) | `flex_inpaint_image`, `flex_inpaint_mask`, `flex_control_image`, `flex_control_strength`, `flex_control_stop` | [code](/examples/flux/model_inference/FLEX.2-preview.py) | [code](/examples/flux/model_inference_low_vram/FLEX.2-preview.py) | [code](/examples/flux/model_training/full/FLEX.2-preview.sh) | [code](/examples/flux/model_training/validate_full/FLEX.2-preview.py) | [code](/examples/flux/model_training/lora/FLEX.2-preview.sh) | [code](/examples/flux/model_training/validate_lora/FLEX.2-preview.py) | +| [DiffSynth-Studio/Nexus-GenV2](https://www.modelscope.cn/models/DiffSynth-Studio/Nexus-GenV2) | `nexus_gen_reference_image` | [code](/examples/flux/model_inference/Nexus-Gen-Editing.py) | [code](/examples/flux/model_inference_low_vram/Nexus-Gen-Editing.py) | [code](/examples/flux/model_training/full/Nexus-Gen.sh) | [code](/examples/flux/model_training/validate_full/Nexus-Gen.py) | [code](/examples/flux/model_training/lora/Nexus-Gen.sh) | [code](/examples/flux/model_training/validate_lora/Nexus-Gen.py) | + +## Wan Series + +Documentation: [./Wan.md](/docs/en/Model_Details/Wan.md) + +
+ +Effect Preview + +https://github.com/user-attachments/assets/1d66ae74-3b02-40a9-acc3-ea95fc039314 + +
+ +
+ +Quick Start + +```python +import torch +from diffsynth.utils.data import save_video +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig + +pipe = WanVideoPipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth"), + ], +) + +video = pipe( + prompt="纪实摄影风格画面,一只活泼的小狗在绿茵茵的草地上迅速奔跑。小狗毛色棕黄,两只耳朵立起,神情专注而欢快。阳光洒在它身上,使得毛发看上去格外柔软而闪亮。背景是一片开阔的草地,偶尔点缀着几朵野花,远处隐约可见蓝天和几片白云。透视感鲜明,捕捉小狗奔跑时的动感和四周草地的生机。中景侧面移动视角。", + negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走", + seed=0, tiled=True, +) +save_video(video, "video.mp4", fps=15, quality=5) +``` + +
+ +
+ +Model Lineage + +```mermaid +graph LR; + Wan-Series-->Wan2.1-Series; + Wan-Series-->Wan2.2-Series; + Wan2.1-Series-->Wan-AI/Wan2.1-T2V-1.3B; + Wan2.1-Series-->Wan-AI/Wan2.1-T2V-14B; + Wan-AI/Wan2.1-T2V-14B-->Wan-AI/Wan2.1-I2V-14B-480P; + Wan-AI/Wan2.1-I2V-14B-480P-->Wan-AI/Wan2.1-I2V-14B-720P; + Wan-AI/Wan2.1-T2V-14B-->Wan-AI/Wan2.1-FLF2V-14B-720P; + Wan-AI/Wan2.1-T2V-1.3B-->iic/VACE-Wan2.1-1.3B-Preview; + iic/VACE-Wan2.1-1.3B-Preview-->Wan-AI/Wan2.1-VACE-1.3B; + Wan-AI/Wan2.1-T2V-14B-->Wan-AI/Wan2.1-VACE-14B; + Wan-AI/Wan2.1-T2V-1.3B-->Wan2.1-Fun-1.3B-Series; + Wan2.1-Fun-1.3B-Series-->PAI/Wan2.1-Fun-1.3B-InP; + Wan2.1-Fun-1.3B-Series-->PAI/Wan2.1-Fun-1.3B-Control; + Wan-AI/Wan2.1-T2V-14B-->Wan2.1-Fun-14B-Series; + Wan2.1-Fun-14B-Series-->PAI/Wan2.1-Fun-14B-InP; + Wan2.1-Fun-14B-Series-->PAI/Wan2.1-Fun-14B-Control; + Wan-AI/Wan2.1-T2V-1.3B-->Wan2.1-Fun-V1.1-1.3B-Series; + Wan2.1-Fun-V1.1-1.3B-Series-->PAI/Wan2.1-Fun-V1.1-1.3B-Control; + Wan2.1-Fun-V1.1-1.3B-Series-->PAI/Wan2.1-Fun-V1.1-1.3B-InP; + Wan2.1-Fun-V1.1-1.3B-Series-->PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera; + Wan-AI/Wan2.1-T2V-14B-->Wan2.1-Fun-V1.1-14B-Series; + Wan2.1-Fun-V1.1-14B-Series-->PAI/Wan2.1-Fun-V1.1-14B-Control; + Wan2.1-Fun-V1.1-14B-Series-->PAI/Wan2.1-Fun-V1.1-14B-InP; + Wan2.1-Fun-V1.1-14B-Series-->PAI/Wan2.1-Fun-V1.1-14B-Control-Camera; + Wan-AI/Wan2.1-T2V-1.3B-->DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1; + Wan-AI/Wan2.1-T2V-14B-->krea/krea-realtime-video; + Wan-AI/Wan2.1-I2V-14B-720P-->ByteDance/Video-As-Prompt-Wan2.1-14B; + Wan-AI/Wan2.1-T2V-14B-->Wan-AI/Wan2.2-Animate-14B; + Wan-AI/Wan2.1-T2V-14B-->Wan-AI/Wan2.2-S2V-14B; + Wan2.2-Series-->Wan-AI/Wan2.2-T2V-A14B; + Wan2.2-Series-->Wan-AI/Wan2.2-I2V-A14B; + Wan2.2-Series-->Wan-AI/Wan2.2-TI2V-5B; + Wan-AI/Wan2.2-T2V-A14B-->Wan2.2-Fun-Series; + Wan2.2-Fun-Series-->PAI/Wan2.2-VACE-Fun-A14B; + Wan2.2-Fun-Series-->PAI/Wan2.2-Fun-A14B-InP; + Wan2.2-Fun-Series-->PAI/Wan2.2-Fun-A14B-Control; + 
Wan2.2-Fun-Series-->PAI/Wan2.2-Fun-A14B-Control-Camera; +``` + +
+ +| Model ID | Extra Parameters | Inference | Full Training | Validation After Full Training | LoRA Training | Validation After LoRA Training | +| - | - | - | - | - | - | - | +| [Wan-AI/Wan2.1-T2V-1.3B](https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-1.3B) | | [code](/examples/wanvideo/model_inference/Wan2.1-T2V-1.3B.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-T2V-1.3B.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-T2V-1.3B.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-T2V-1.3B.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-T2V-1.3B.py) | +| [Wan-AI/Wan2.1-T2V-14B](https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-14B) | | [code](/examples/wanvideo/model_inference/Wan2.1-T2V-14B.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-T2V-14B.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-T2V-14B.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-T2V-14B.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-T2V-14B.py) | +| [Wan-AI/Wan2.1-I2V-14B-480P](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-480P) | `input_image` | [code](/examples/wanvideo/model_inference/Wan2.1-I2V-14B-480P.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-I2V-14B-480P.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-480P.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-I2V-14B-480P.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-I2V-14B-480P.py) | +| [Wan-AI/Wan2.1-I2V-14B-720P](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-720P) | `input_image` | [code](/examples/wanvideo/model_inference/Wan2.1-I2V-14B-720P.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-I2V-14B-720P.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-720P.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-I2V-14B-720P.sh) | 
[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-I2V-14B-720P.py) | +| [Wan-AI/Wan2.1-FLF2V-14B-720P](https://modelscope.cn/models/Wan-AI/Wan2.1-FLF2V-14B-720P) | `input_image`, `end_image` | [code](/examples/wanvideo/model_inference/Wan2.1-FLF2V-14B-720P.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-FLF2V-14B-720P.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-FLF2V-14B-720P.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-FLF2V-14B-720P.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-FLF2V-14B-720P.py) | +| [iic/VACE-Wan2.1-1.3B-Preview](https://modelscope.cn/models/iic/VACE-Wan2.1-1.3B-Preview) | `vace_control_video`, `vace_reference_image` | [code](/examples/wanvideo/model_inference/Wan2.1-VACE-1.3B-Preview.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-VACE-1.3B-Preview.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-1.3B-Preview.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-VACE-1.3B-Preview.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-1.3B-Preview.py) | +| [Wan-AI/Wan2.1-VACE-1.3B](https://modelscope.cn/models/Wan-AI/Wan2.1-VACE-1.3B) | `vace_control_video`, `vace_reference_image` | [code](/examples/wanvideo/model_inference/Wan2.1-VACE-1.3B.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-VACE-1.3B.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-1.3B.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-VACE-1.3B.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-1.3B.py) | +| [Wan-AI/Wan2.1-VACE-14B](https://modelscope.cn/models/Wan-AI/Wan2.1-VACE-14B) | `vace_control_video`, `vace_reference_image` | [code](/examples/wanvideo/model_inference/Wan2.1-VACE-14B.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-VACE-14B.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-14B.py) | 
[code](/examples/wanvideo/model_training/lora/Wan2.1-VACE-14B.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-14B.py) | +| [PAI/Wan2.1-Fun-1.3B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-InP) | `input_image`, `end_image` | [code](/examples/wanvideo/model_inference/Wan2.1-Fun-1.3B-InP.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-Fun-1.3B-InP.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-InP.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-1.3B-InP.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-1.3B-InP.py) | +| [PAI/Wan2.1-Fun-1.3B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-Control) | `control_video` | [code](/examples/wanvideo/model_inference/Wan2.1-Fun-1.3B-Control.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-Fun-1.3B-Control.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-Control.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-1.3B-Control.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-1.3B-Control.py) | +| [PAI/Wan2.1-Fun-14B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-14B-InP) | `input_image`, `end_image` | [code](/examples/wanvideo/model_inference/Wan2.1-Fun-14B-InP.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-Fun-14B-InP.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-InP.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-14B-InP.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-14B-InP.py) | +| [PAI/Wan2.1-Fun-14B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-14B-Control) | `control_video` | [code](/examples/wanvideo/model_inference/Wan2.1-Fun-14B-Control.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-Fun-14B-Control.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-Control.py) | 
[code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-14B-Control.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-14B-Control.py) | +| [PAI/Wan2.1-Fun-V1.1-1.3B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-Control) | `control_video`, `reference_image` | [code](/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-Control.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-Control.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-Control.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control.py) | +| [PAI/Wan2.1-Fun-V1.1-14B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-Control) | `control_video`, `reference_image` | [code](/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-Control.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-Control.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-Control.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control.py) | +| [PAI/Wan2.1-Fun-V1.1-1.3B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-InP) | `input_image`, `end_image` | [code](/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-InP.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-InP.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-InP.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-InP.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-InP.py) | +| [PAI/Wan2.1-Fun-V1.1-14B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-InP) | `input_image`, `end_image` | [code](/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-InP.py) | 
[code](/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-InP.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-InP.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-InP.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-InP.py) | +| [PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera) | `control_camera_video`, `input_image` | [code](/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py) | +| [PAI/Wan2.1-Fun-V1.1-14B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-Control-Camera) | `control_camera_video`, `input_image` | [code](/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-Control-Camera.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-Control-Camera.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control-Camera.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-Control-Camera.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control-Camera.py) | +| [DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1](https://modelscope.cn/models/DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1) | `motion_bucket_id` | [code](/examples/wanvideo/model_inference/Wan2.1-1.3b-speedcontrol-v1.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-1.3b-speedcontrol-v1.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-1.3b-speedcontrol-v1.py) | 
[code](/examples/wanvideo/model_training/lora/Wan2.1-1.3b-speedcontrol-v1.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-1.3b-speedcontrol-v1.py) | +| [krea/krea-realtime-video](https://www.modelscope.cn/models/krea/krea-realtime-video) | | [code](/examples/wanvideo/model_inference/krea-realtime-video.py) | [code](/examples/wanvideo/model_training/full/krea-realtime-video.sh) | [code](/examples/wanvideo/model_training/validate_full/krea-realtime-video.py) | [code](/examples/wanvideo/model_training/lora/krea-realtime-video.sh) | [code](/examples/wanvideo/model_training/validate_lora/krea-realtime-video.py) | +| [meituan-longcat/LongCat-Video](https://www.modelscope.cn/models/meituan-longcat/LongCat-Video) | `longcat_video` | [code](/examples/wanvideo/model_inference/LongCat-Video.py) | [code](/examples/wanvideo/model_training/full/LongCat-Video.sh) | [code](/examples/wanvideo/model_training/validate_full/LongCat-Video.py) | [code](/examples/wanvideo/model_training/lora/LongCat-Video.sh) | [code](/examples/wanvideo/model_training/validate_lora/LongCat-Video.py) | +| [ByteDance/Video-As-Prompt-Wan2.1-14B](https://modelscope.cn/models/ByteDance/Video-As-Prompt-Wan2.1-14B) | `vap_video`, `vap_prompt` | [code](/examples/wanvideo/model_inference/Video-As-Prompt-Wan2.1-14B.py) | [code](/examples/wanvideo/model_training/full/Video-As-Prompt-Wan2.1-14B.sh) | [code](/examples/wanvideo/model_training/validate_full/Video-As-Prompt-Wan2.1-14B.py) | [code](/examples/wanvideo/model_training/lora/Video-As-Prompt-Wan2.1-14B.sh) | [code](/examples/wanvideo/model_training/validate_lora/Video-As-Prompt-Wan2.1-14B.py) | +| [Wan-AI/Wan2.2-T2V-A14B](https://modelscope.cn/models/Wan-AI/Wan2.2-T2V-A14B) | | [code](/examples/wanvideo/model_inference/Wan2.2-T2V-A14B.py) | [code](/examples/wanvideo/model_training/full/Wan2.2-T2V-A14B.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.2-T2V-A14B.py) | 
[code](/examples/wanvideo/model_training/lora/Wan2.2-T2V-A14B.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.2-T2V-A14B.py) | +| [Wan-AI/Wan2.2-I2V-A14B](https://modelscope.cn/models/Wan-AI/Wan2.2-I2V-A14B) | `input_image` | [code](/examples/wanvideo/model_inference/Wan2.2-I2V-A14B.py) | [code](/examples/wanvideo/model_training/full/Wan2.2-I2V-A14B.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.2-I2V-A14B.py) | [code](/examples/wanvideo/model_training/lora/Wan2.2-I2V-A14B.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.2-I2V-A14B.py) | +| [Wan-AI/Wan2.2-TI2V-5B](https://modelscope.cn/models/Wan-AI/Wan2.2-TI2V-5B) | `input_image` | [code](/examples/wanvideo/model_inference/Wan2.2-TI2V-5B.py) | [code](/examples/wanvideo/model_training/full/Wan2.2-TI2V-5B.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.2-TI2V-5B.py) | [code](/examples/wanvideo/model_training/lora/Wan2.2-TI2V-5B.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.2-TI2V-5B.py) | +| [Wan-AI/Wan2.2-Animate-14B](https://www.modelscope.cn/models/Wan-AI/Wan2.2-Animate-14B) | `input_image`, `animate_pose_video`, `animate_face_video`, `animate_inpaint_video`, `animate_mask_video` | [code](/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py) | [code](/examples/wanvideo/model_training/full/Wan2.2-Animate-14B.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py) | [code](/examples/wanvideo/model_training/lora/Wan2.2-Animate-14B.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py) | +| [Wan-AI/Wan2.2-S2V-14B](https://www.modelscope.cn/models/Wan-AI/Wan2.2-S2V-14B) | `input_image`, `input_audio`, `audio_sample_rate`, `s2v_pose_video` | [code](/examples/wanvideo/model_inference/Wan2.2-S2V-14B_multi_clips.py) | [code](/examples/wanvideo/model_training/full/Wan2.2-S2V-14B.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.2-S2V-14B.py) | 
[code](/examples/wanvideo/model_training/lora/Wan2.2-S2V-14B.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.2-S2V-14B.py) | +| [PAI/Wan2.2-VACE-Fun-A14B](https://www.modelscope.cn/models/PAI/Wan2.2-VACE-Fun-A14B) | `vace_control_video`, `vace_reference_image` | [code](/examples/wanvideo/model_inference/Wan2.2-VACE-Fun-A14B.py) | [code](/examples/wanvideo/model_training/full/Wan2.2-VACE-Fun-A14B.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.2-VACE-Fun-A14B.py) | [code](/examples/wanvideo/model_training/lora/Wan2.2-VACE-Fun-A14B.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.2-VACE-Fun-A14B.py) | +| [PAI/Wan2.2-Fun-A14B-InP](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-InP) | `input_image`, `end_image` | [code](/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-InP.py) | [code](/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-InP.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-InP.py) | [code](/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-InP.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-InP.py) | +| [PAI/Wan2.2-Fun-A14B-Control](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-Control) | `control_video`, `reference_image` | [code](/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-Control.py) | [code](/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-Control.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control.py) | [code](/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-Control.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control.py) | +| [PAI/Wan2.2-Fun-A14B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-Control-Camera) | `control_camera_video`, `input_image` | [code](/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-Control-Camera.py) | [code](/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-Control-Camera.sh) | 
[code](/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control-Camera.py) | [code](/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-Control-Camera.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control-Camera.py) | + +* FP8 Precision Training: [doc](/docs/en/Training/FP8_Precision.md), [code](/examples/wanvideo/model_training/special/fp8_training/) +* Two-stage Split Training: [doc](/docs/en/Training/Split_Training.md), [code](/examples/wanvideo/model_training/special/split_training/) +* End-to-end Direct Distillation: [doc](/docs/en/Training/Direct_Distill.md), [code](/examples/wanvideo/model_training/special/direct_distill/) \ No newline at end of file diff --git a/docs/en/Model_Details/Qwen-Image.md b/docs/en/Model_Details/Qwen-Image.md new file mode 100644 index 0000000..b0d9eb6 --- /dev/null +++ b/docs/en/Model_Details/Qwen-Image.md @@ -0,0 +1,191 @@ +# Qwen-Image + +![Image](https://github.com/user-attachments/assets/738078d8-8749-4a53-a046-571861541924) + +Qwen-Image is an image generation model trained and open-sourced by the Tongyi Lab Qwen Team of Alibaba. + +## Installation + +Before using this project for model inference and training, please install DiffSynth-Studio first. + +```shell +git clone https://github.com/modelscope/DiffSynth-Studio.git +cd DiffSynth-Studio +pip install -e . +``` + +For more information about installation, please refer to [Install Dependencies](/docs/en/Pipeline_Usage/Setup.md). + +## Quick Start + +Run the following code to quickly load the [Qwen/Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image) model and perform inference. VRAM management is enabled, and the framework will automatically control model parameter loading based on remaining VRAM. Minimum 8GB VRAM is required to run. 
+ +```python +from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig +import torch + +vram_config = { + "offload_dtype": "disk", + "offload_device": "disk", + "onload_dtype": torch.float8_e4m3fn, + "onload_device": "cpu", + "preparing_dtype": torch.float8_e4m3fn, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", +} +pipe = QwenImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors", **vram_config), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), + ], + tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), + vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, +) +prompt = "精致肖像,水下少女,蓝裙飘逸,发丝轻扬,光影透澈,气泡环绕,面容恬静,细节精致,梦幻唯美。" +image = pipe(prompt, seed=0, num_inference_steps=40) +image.save("image.jpg") +``` + +## Model Overview + +
+ +Model Lineage + +```mermaid +graph LR; + Qwen/Qwen-Image-->Qwen/Qwen-Image-Edit; + Qwen/Qwen-Image-Edit-->Qwen/Qwen-Image-Edit-2509; + Qwen/Qwen-Image-->EliGen-Series; + EliGen-Series-->DiffSynth-Studio/Qwen-Image-EliGen; + DiffSynth-Studio/Qwen-Image-EliGen-->DiffSynth-Studio/Qwen-Image-EliGen-V2; + EliGen-Series-->DiffSynth-Studio/Qwen-Image-EliGen-Poster; + Qwen/Qwen-Image-->Distill-Series; + Distill-Series-->DiffSynth-Studio/Qwen-Image-Distill-Full; + Distill-Series-->DiffSynth-Studio/Qwen-Image-Distill-LoRA; + Qwen/Qwen-Image-->ControlNet-Series; + ControlNet-Series-->Blockwise-ControlNet-Series; + Blockwise-ControlNet-Series-->DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny; + Blockwise-ControlNet-Series-->DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth; + Blockwise-ControlNet-Series-->DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Inpaint; + ControlNet-Series-->DiffSynth-Studio/Qwen-Image-In-Context-Control-Union; + Qwen/Qwen-Image-->DiffSynth-Studio/Qwen-Image-Edit-Lowres-Fix; +``` + +
+ +| Model ID | Inference | Low VRAM Inference | Full Training | Validation After Full Training | LoRA Training | Validation After LoRA Training | +| - | - | - | - | - | - | - | +| [Qwen/Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image) | [code](/examples/qwen_image/model_inference/Qwen-Image.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image.py) | [code](/examples/qwen_image/model_training/full/Qwen-Image.sh) | [code](/examples/qwen_image/model_training/validate_full/Qwen-Image.py) | [code](/examples/qwen_image/model_training/lora/Qwen-Image.sh) | [code](/examples/qwen_image/model_training/validate_lora/Qwen-Image.py) | +| [Qwen/Qwen-Image-Edit](https://www.modelscope.cn/models/Qwen/Qwen-Image-Edit) | [code](/examples/qwen_image/model_inference/Qwen-Image-Edit.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit.py) | [code](/examples/qwen_image/model_training/full/Qwen-Image-Edit.sh) | [code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Edit.py) | [code](/examples/qwen_image/model_training/lora/Qwen-Image-Edit.sh) | [code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Edit.py) | +| [Qwen/Qwen-Image-Edit-2509](https://www.modelscope.cn/models/Qwen/Qwen-Image-Edit-2509) | [code](/examples/qwen_image/model_inference/Qwen-Image-Edit-2509.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-2509.py) | [code](/examples/qwen_image/model_training/full/Qwen-Image-Edit-2509.sh) | [code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Edit-2509.py) | [code](/examples/qwen_image/model_training/lora/Qwen-Image-Edit-2509.sh) | [code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Edit-2509.py) | +| [DiffSynth-Studio/Qwen-Image-EliGen](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen) | [code](/examples/qwen_image/model_inference/Qwen-Image-EliGen.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen.py) | 
- | - | [code](/examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh) | [code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py) | +| [DiffSynth-Studio/Qwen-Image-EliGen-V2](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen-V2) | [code](/examples/qwen_image/model_inference/Qwen-Image-EliGen-V2.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen-V2.py) | - | - | [code](/examples/qwen_image/model_training/lora/Qwen-Image-EliGen.sh) | [code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen.py) | +| [DiffSynth-Studio/Qwen-Image-EliGen-Poster](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-EliGen-Poster) | [code](/examples/qwen_image/model_inference/Qwen-Image-EliGen-Poster.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-EliGen-Poster.py) | - | - | [code](/examples/qwen_image/model_training/lora/Qwen-Image-EliGen-Poster.sh) | [code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-EliGen-Poster.py) | +| [DiffSynth-Studio/Qwen-Image-Distill-Full](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full) | [code](/examples/qwen_image/model_inference/Qwen-Image-Distill-Full.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Distill-Full.py) | [code](/examples/qwen_image/model_training/full/Qwen-Image-Distill-Full.sh) | [code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Distill-Full.py) | [code](/examples/qwen_image/model_training/lora/Qwen-Image-Distill-Full.sh) | [code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-Full.py) | +| [DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA) | [code](/examples/qwen_image/model_inference/Qwen-Image-Distill-LoRA.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Distill-LoRA.py) | - | - | 
[code](/examples/qwen_image/model_training/lora/Qwen-Image-Distill-LoRA.sh) | [code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Distill-LoRA.py) | +| [DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Canny) | [code](/examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Canny.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Canny.py) | [code](/examples/qwen_image/model_training/full/Qwen-Image-Blockwise-ControlNet-Canny.sh) | [code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Canny.py) | [code](/examples/qwen_image/model_training/lora/Qwen-Image-Blockwise-ControlNet-Canny.sh) | [code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Canny.py) | +| [DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth) | [code](/examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Depth.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Depth.py) | [code](/examples/qwen_image/model_training/full/Qwen-Image-Blockwise-ControlNet-Depth.sh) | [code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Depth.py) | [code](/examples/qwen_image/model_training/lora/Qwen-Image-Blockwise-ControlNet-Depth.sh) | [code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Depth.py) | +| [DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Inpaint](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Inpaint) | [code](/examples/qwen_image/model_inference/Qwen-Image-Blockwise-ControlNet-Inpaint.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Blockwise-ControlNet-Inpaint.py) | 
[code](/examples/qwen_image/model_training/full/Qwen-Image-Blockwise-ControlNet-Inpaint.sh) | [code](/examples/qwen_image/model_training/validate_full/Qwen-Image-Blockwise-ControlNet-Inpaint.py) | [code](/examples/qwen_image/model_training/lora/Qwen-Image-Blockwise-ControlNet-Inpaint.sh) | [code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-Blockwise-ControlNet-Inpaint.py) | +| [DiffSynth-Studio/Qwen-Image-In-Context-Control-Union](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-In-Context-Control-Union) | [code](/examples/qwen_image/model_inference/Qwen-Image-In-Context-Control-Union.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-In-Context-Control-Union.py) | - | - | [code](/examples/qwen_image/model_training/lora/Qwen-Image-In-Context-Control-Union.sh) | [code](/examples/qwen_image/model_training/validate_lora/Qwen-Image-In-Context-Control-Union.py) | +| [DiffSynth-Studio/Qwen-Image-Edit-Lowres-Fix](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Edit-Lowres-Fix) | [code](/examples/qwen_image/model_inference/Qwen-Image-Edit-Lowres-Fix.py) | [code](/examples/qwen_image/model_inference_low_vram/Qwen-Image-Edit-Lowres-Fix.py) | - | - | - | - | + +Special Training Scripts: + +* Differential LoRA Training: [doc](/docs/en/Training/Differential_LoRA.md), [code](/examples/qwen_image/model_training/special/differential_training/) +* FP8 Precision Training: [doc](/docs/en/Training/FP8_Precision.md), [code](/examples/qwen_image/model_training/special/fp8_training/) +* Two-stage Split Training: [doc](/docs/en/Training/Split_Training.md), [code](/examples/qwen_image/model_training/special/split_training/) +* End-to-end Direct Distillation: [doc](/docs/en/Training/Direct_Distill.md), [code](/examples/qwen_image/model_training/lora/Qwen-Image-Distill-LoRA.sh) + +## Model Inference + +Models are loaded via `QwenImagePipeline.from_pretrained`, see [Loading 
Models](/docs/en/Pipeline_Usage/Model_Inference.md#loading-models). + +Input parameters for `QwenImagePipeline` inference include: + +* `prompt`: Prompt describing the content appearing in the image. +* `negative_prompt`: Negative prompt describing content that should not appear in the image, default value is `""`. +* `cfg_scale`: Classifier-free guidance parameter, default value is 4. When set to 1, it no longer takes effect. +* `input_image`: Input image for image-to-image generation, used in conjunction with `denoising_strength`. +* `denoising_strength`: Denoising strength, range is 0~1, default value is 1. When the value approaches 0, the generated image is similar to the input image; when the value approaches 1, the generated image differs more from the input image. When `input_image` parameter is not provided, do not set this to a non-1 value. +* `inpaint_mask`: Image inpainting mask image. +* `inpaint_blur_size`: Edge blur width for image inpainting. +* `inpaint_blur_sigma`: Edge blur strength for image inpainting. +* `height`: Image height, must be a multiple of 16. +* `width`: Image width, must be a multiple of 16. +* `seed`: Random seed. Default is `None`, meaning completely random. +* `rand_device`: Computing device for generating random Gaussian noise matrix, default is `"cpu"`. When set to `cuda`, different GPUs will produce different generation results. +* `num_inference_steps`: Number of inference steps, default value is 30. +* `exponential_shift_mu`: Fixed parameter used in sampling timesteps. Leave blank to sample based on image width and height. +* `blockwise_controlnet_inputs`: Blockwise ControlNet model inputs. +* `eligen_entity_prompts`: EliGen partition control prompts. +* `eligen_entity_masks`: EliGen partition control region mask images. +* `eligen_enable_on_negative`: Whether to enable EliGen partition control on the negative side of CFG. +* `edit_image`: Edit model images to be edited, supports multiple images. 
+* `edit_image_auto_resize`: Whether to automatically scale edit images. +* `edit_rope_interpolation`: Whether to enable ROPE interpolation on low-resolution edit images. +* `context_image`: In-Context Control input image. +* `tiled`: Whether to enable VAE tiling inference, default is `False`. Setting to `True` can significantly reduce VRAM usage during VAE encoding/decoding stages, producing slight errors and slightly longer inference time. +* `tile_size`: Tile size during VAE encoding/decoding stages, default is 128, only effective when `tiled=True`. +* `tile_stride`: Tile stride during VAE encoding/decoding stages, default is 64, only effective when `tiled=True`, must be less than or equal to `tile_size`. +* `progress_bar_cmd`: Progress bar, default is `tqdm.tqdm`. Can be disabled by setting to `lambda x:x`. + +If VRAM is insufficient, please enable [VRAM Management](/docs/en/Pipeline_Usage/VRAM_management.md). We provide recommended low VRAM configurations for each model in the example code, see the table in the "Model Overview" section above. + +## Model Training + +Qwen-Image series models are uniformly trained through [`examples/qwen_image/model_training/train.py`](/examples/qwen_image/model_training/train.py), and the script parameters include: + +* General Training Parameters + * Dataset Basic Configuration + * `--dataset_base_path`: Root directory of the dataset. + * `--dataset_metadata_path`: Metadata file path of the dataset. + * `--dataset_repeat`: Number of times the dataset is repeated in each epoch. + * `--dataset_num_workers`: Number of processes for each DataLoader. + * `--data_file_keys`: Field names to be loaded from metadata, usually image or video file paths, separated by `,`. + * Model Loading Configuration + * `--model_paths`: Paths of models to be loaded. JSON format. + * `--model_id_with_origin_paths`: Model IDs with original paths, e.g., `"Qwen/Qwen-Image:transformer/diffusion_pytorch_model*.safetensors"`. Separated by commas. 
+ * `--extra_inputs`: Extra input parameters required by the model Pipeline, e.g., extra parameters `edit_image` when training image editing model Qwen-Image-Edit, separated by `,`. + * `--fp8_models`: Models loaded in FP8 format, consistent with `--model_paths` or `--model_id_with_origin_paths` format. Currently only supports models whose parameters are not updated by gradients (no gradient backpropagation, or gradients only update their LoRA). + * Training Basic Configuration + * `--learning_rate`: Learning rate. + * `--num_epochs`: Number of epochs. + * `--trainable_models`: Trainable models, e.g., `dit`, `vae`, `text_encoder`. + * `--find_unused_parameters`: Whether there are unused parameters in DDP training. Some models contain redundant parameters that do not participate in gradient calculation, and this setting needs to be enabled to avoid errors in multi-GPU training. + * `--weight_decay`: Weight decay size, see [torch.optim.AdamW](https://docs.pytorch.org/docs/stable/generated/torch.optim.AdamW.html). + * `--task`: Training task, default is `sft`. Some models support more training modes, please refer to the documentation of each specific model. + * Output Configuration + * `--output_path`: Model saving path. + * `--remove_prefix_in_ckpt`: Remove prefix in the state dict of the model file. + * `--save_steps`: Interval of training steps to save the model. If this parameter is left blank, the model is saved once per epoch. + * LoRA Configuration + * `--lora_base_model`: Which model to add LoRA to. + * `--lora_target_modules`: Which layers to add LoRA to. + * `--lora_rank`: Rank of LoRA. + * `--lora_checkpoint`: Path of the LoRA checkpoint. If this path is provided, LoRA will be loaded from this checkpoint. + * `--preset_lora_path`: Preset LoRA checkpoint path. If this path is provided, this LoRA will be loaded in the form of being merged into the base model. This parameter is used for LoRA differential training. 
+ * `--preset_lora_model`: Model that the preset LoRA is merged into, e.g., `dit`.
+  * Gradient Configuration
+    * `--use_gradient_checkpointing`: Whether to enable gradient checkpointing.
+    * `--use_gradient_checkpointing_offload`: Whether to offload gradient checkpointing to memory.
+    * `--gradient_accumulation_steps`: Number of gradient accumulation steps.
+  * Image Width/Height Configuration (Applicable to Image Generation and Video Generation Models)
+    * `--height`: Height of image or video. Leave `height` and `width` blank to enable dynamic resolution.
+    * `--width`: Width of image or video. Leave `height` and `width` blank to enable dynamic resolution.
+    * `--max_pixels`: Maximum pixel area of image or video frames. When dynamic resolution is enabled, images with resolution larger than this value will be downscaled, and images with resolution smaller than this value will remain unchanged.
+* Qwen-Image Specific Parameters
+  * `--tokenizer_path`: Path of the tokenizer, applicable to text-to-image models, leave blank to automatically download from remote.
+  * `--processor_path`: Path of the processor, applicable to image editing models, leave blank to automatically download from remote.
+
+We have built a sample image dataset for your testing. You can download this dataset with the following command:
+
+```shell
+modelscope download --dataset DiffSynth-Studio/example_image_dataset --local_dir ./data/example_image_dataset
+```
+
+We have written recommended training scripts for each model, please refer to the table in the "Model Overview" section above. For how to write model training scripts, please refer to [Model Training](/docs/en/Pipeline_Usage/Model_Training.md); for more advanced training algorithms, please refer to [Training Framework Detailed Explanation](/docs/en/Training/).
\ No newline at end of file diff --git a/docs/en/Model_Details/Wan.md b/docs/en/Model_Details/Wan.md new file mode 100644 index 0000000..b6a3a1a --- /dev/null +++ b/docs/en/Model_Details/Wan.md @@ -0,0 +1,253 @@ +# Wan + +https://github.com/user-attachments/assets/1d66ae74-3b02-40a9-acc3-ea95fc039314 + +Wan is a video generation model series developed by the Tongyi Wanxiang Team of Alibaba Tongyi Lab. + +## Installation + +Before using this project for model inference and training, please install DiffSynth-Studio first. + +```shell +git clone https://github.com/modelscope/DiffSynth-Studio.git +cd DiffSynth-Studio +pip install -e . +``` + +For more information about installation, please refer to [Install Dependencies](/docs/en/Pipeline_Usage/Setup.md). + +## Quick Start + +Run the following code to quickly load the [Wan-AI/Wan2.1-T2V-1.3B](https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-1.3B) model and perform inference. VRAM management is enabled, and the framework will automatically control model parameter loading based on remaining VRAM. Minimum 8GB VRAM is required to run. 
+ +```python +import torch +from diffsynth.utils.data import save_video, VideoData +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig + + +vram_config = { + "offload_dtype": "disk", + "offload_device": "disk", + "onload_dtype": torch.bfloat16, + "onload_device": "cpu", + "preparing_dtype": torch.bfloat16, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", +} +pipe = WanVideoPipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", **vram_config), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", **vram_config), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", **vram_config), + ], + tokenizer_config=ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="google/umt5-xxl/"), + vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 2, +) + +video = pipe( + prompt="纪实摄影风格画面,一只活泼的小狗在绿茵茵的草地上迅速奔跑。小狗毛色棕黄,两只耳朵立起,神情专注而欢快。阳光洒在它身上,使得毛发看上去格外柔软而闪亮。背景是一片开阔的草地,偶尔点缀着几朵野花,远处隐约可见蓝天和几片白云。透视感鲜明,捕捉小狗奔跑时的动感和四周草地的生机。中景侧面移动视角。", + negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走", + seed=0, tiled=True, +) +save_video(video, "video.mp4", fps=15, quality=5) +``` + +## Model Overview + +
+ +Model Lineage + +```mermaid +graph LR; + Wan-Series-->Wan2.1-Series; + Wan-Series-->Wan2.2-Series; + Wan2.1-Series-->Wan-AI/Wan2.1-T2V-1.3B; + Wan2.1-Series-->Wan-AI/Wan2.1-T2V-14B; + Wan-AI/Wan2.1-T2V-14B-->Wan-AI/Wan2.1-I2V-14B-480P; + Wan-AI/Wan2.1-I2V-14B-480P-->Wan-AI/Wan2.1-I2V-14B-720P; + Wan-AI/Wan2.1-T2V-14B-->Wan-AI/Wan2.1-FLF2V-14B-720P; + Wan-AI/Wan2.1-T2V-1.3B-->iic/VACE-Wan2.1-1.3B-Preview; + iic/VACE-Wan2.1-1.3B-Preview-->Wan-AI/Wan2.1-VACE-1.3B; + Wan-AI/Wan2.1-T2V-14B-->Wan-AI/Wan2.1-VACE-14B; + Wan-AI/Wan2.1-T2V-1.3B-->Wan2.1-Fun-1.3B-Series; + Wan2.1-Fun-1.3B-Series-->PAI/Wan2.1-Fun-1.3B-InP; + Wan2.1-Fun-1.3B-Series-->PAI/Wan2.1-Fun-1.3B-Control; + Wan-AI/Wan2.1-T2V-14B-->Wan2.1-Fun-14B-Series; + Wan2.1-Fun-14B-Series-->PAI/Wan2.1-Fun-14B-InP; + Wan2.1-Fun-14B-Series-->PAI/Wan2.1-Fun-14B-Control; + Wan-AI/Wan2.1-T2V-1.3B-->Wan2.1-Fun-V1.1-1.3B-Series; + Wan2.1-Fun-V1.1-1.3B-Series-->PAI/Wan2.1-Fun-V1.1-1.3B-Control; + Wan2.1-Fun-V1.1-1.3B-Series-->PAI/Wan2.1-Fun-V1.1-1.3B-InP; + Wan2.1-Fun-V1.1-1.3B-Series-->PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera; + Wan-AI/Wan2.1-T2V-14B-->Wan2.1-Fun-V1.1-14B-Series; + Wan2.1-Fun-V1.1-14B-Series-->PAI/Wan2.1-Fun-V1.1-14B-Control; + Wan2.1-Fun-V1.1-14B-Series-->PAI/Wan2.1-Fun-V1.1-14B-InP; + Wan2.1-Fun-V1.1-14B-Series-->PAI/Wan2.1-Fun-V1.1-14B-Control-Camera; + Wan-AI/Wan2.1-T2V-1.3B-->DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1; + Wan-AI/Wan2.1-T2V-14B-->krea/krea-realtime-video; + Wan-AI/Wan2.1-I2V-14B-720P-->ByteDance/Video-As-Prompt-Wan2.1-14B; + Wan-AI/Wan2.1-T2V-14B-->Wan-AI/Wan2.2-Animate-14B; + Wan-AI/Wan2.1-T2V-14B-->Wan-AI/Wan2.2-S2V-14B; + Wan2.2-Series-->Wan-AI/Wan2.2-T2V-A14B; + Wan2.2-Series-->Wan-AI/Wan2.2-I2V-A14B; + Wan2.2-Series-->Wan-AI/Wan2.2-TI2V-5B; + Wan-AI/Wan2.2-T2V-A14B-->Wan2.2-Fun-Series; + Wan2.2-Fun-Series-->PAI/Wan2.2-VACE-Fun-A14B; + Wan2.2-Fun-Series-->PAI/Wan2.2-Fun-A14B-InP; + Wan2.2-Fun-Series-->PAI/Wan2.2-Fun-A14B-Control; + 
Wan2.2-Fun-Series-->PAI/Wan2.2-Fun-A14B-Control-Camera; +``` + +
+ +| Model ID | Extra Parameters | Inference | Full Training | Validation After Full Training | LoRA Training | Validation After LoRA Training | +| - | - | - | - | - | - | - | +| [Wan-AI/Wan2.1-T2V-1.3B](https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-1.3B) | | [code](/examples/wanvideo/model_inference/Wan2.1-T2V-1.3B.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-T2V-1.3B.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-T2V-1.3B.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-T2V-1.3B.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-T2V-1.3B.py) | +| [Wan-AI/Wan2.1-T2V-14B](https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-14B) | | [code](/examples/wanvideo/model_inference/Wan2.1-T2V-14B.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-T2V-14B.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-T2V-14B.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-T2V-14B.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-T2V-14B.py) | +| [Wan-AI/Wan2.1-I2V-14B-480P](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-480P) | `input_image` | [code](/examples/wanvideo/model_inference/Wan2.1-I2V-14B-480P.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-I2V-14B-480P.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-480P.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-I2V-14B-480P.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-I2V-14B-480P.py) | +| [Wan-AI/Wan2.1-I2V-14B-720P](https://modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-720P) | `input_image` | [code](/examples/wanvideo/model_inference/Wan2.1-I2V-14B-720P.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-I2V-14B-720P.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-720P.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-I2V-14B-720P.sh) | 
[code](/examples/wanvideo/model_training/validate_lora/Wan2.1-I2V-14B-720P.py) | +| [Wan-AI/Wan2.1-FLF2V-14B-720P](https://modelscope.cn/models/Wan-AI/Wan2.1-FLF2V-14B-720P) | `input_image`, `end_image` | [code](/examples/wanvideo/model_inference/Wan2.1-FLF2V-14B-720P.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-FLF2V-14B-720P.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-FLF2V-14B-720P.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-FLF2V-14B-720P.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-FLF2V-14B-720P.py) | +| [iic/VACE-Wan2.1-1.3B-Preview](https://modelscope.cn/models/iic/VACE-Wan2.1-1.3B-Preview) | `vace_control_video`, `vace_reference_image` | [code](/examples/wanvideo/model_inference/Wan2.1-VACE-1.3B-Preview.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-VACE-1.3B-Preview.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-1.3B-Preview.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-VACE-1.3B-Preview.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-1.3B-Preview.py) | +| [Wan-AI/Wan2.1-VACE-1.3B](https://modelscope.cn/models/Wan-AI/Wan2.1-VACE-1.3B) | `vace_control_video`, `vace_reference_image` | [code](/examples/wanvideo/model_inference/Wan2.1-VACE-1.3B.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-VACE-1.3B.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-1.3B.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-VACE-1.3B.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-1.3B.py) | +| [Wan-AI/Wan2.1-VACE-14B](https://modelscope.cn/models/Wan-AI/Wan2.1-VACE-14B) | `vace_control_video`, `vace_reference_image` | [code](/examples/wanvideo/model_inference/Wan2.1-VACE-14B.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-VACE-14B.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-14B.py) | 
[code](/examples/wanvideo/model_training/lora/Wan2.1-VACE-14B.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-14B.py) | +| [PAI/Wan2.1-Fun-1.3B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-InP) | `input_image`, `end_image` | [code](/examples/wanvideo/model_inference/Wan2.1-Fun-1.3B-InP.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-Fun-1.3B-InP.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-InP.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-1.3B-InP.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-1.3B-InP.py) | +| [PAI/Wan2.1-Fun-1.3B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-1.3B-Control) | `control_video` | [code](/examples/wanvideo/model_inference/Wan2.1-Fun-1.3B-Control.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-Fun-1.3B-Control.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-Control.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-1.3B-Control.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-1.3B-Control.py) | +| [PAI/Wan2.1-Fun-14B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-14B-InP) | `input_image`, `end_image` | [code](/examples/wanvideo/model_inference/Wan2.1-Fun-14B-InP.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-Fun-14B-InP.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-InP.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-14B-InP.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-14B-InP.py) | +| [PAI/Wan2.1-Fun-14B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-14B-Control) | `control_video` | [code](/examples/wanvideo/model_inference/Wan2.1-Fun-14B-Control.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-Fun-14B-Control.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-Control.py) | 
[code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-14B-Control.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-14B-Control.py) | +| [PAI/Wan2.1-Fun-V1.1-1.3B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-Control) | `control_video`, `reference_image` | [code](/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-Control.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-Control.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-Control.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control.py) | +| [PAI/Wan2.1-Fun-V1.1-14B-Control](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-Control) | `control_video`, `reference_image` | [code](/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-Control.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-Control.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-Control.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control.py) | +| [PAI/Wan2.1-Fun-V1.1-1.3B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-InP) | `input_image`, `end_image` | [code](/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-InP.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-InP.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-InP.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-InP.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-InP.py) | +| [PAI/Wan2.1-Fun-V1.1-14B-InP](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-InP) | `input_image`, `end_image` | [code](/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-InP.py) | 
[code](/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-InP.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-InP.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-InP.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-InP.py) | +| [PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera) | `control_camera_video`, `input_image` | [code](/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py) | +| [PAI/Wan2.1-Fun-V1.1-14B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.1-Fun-V1.1-14B-Control-Camera) | `control_camera_video`, `input_image` | [code](/examples/wanvideo/model_inference/Wan2.1-Fun-V1.1-14B-Control-Camera.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-Fun-V1.1-14B-Control-Camera.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control-Camera.py) | [code](/examples/wanvideo/model_training/lora/Wan2.1-Fun-V1.1-14B-Control-Camera.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control-Camera.py) | +| [DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1](https://modelscope.cn/models/DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1) | `motion_bucket_id` | [code](/examples/wanvideo/model_inference/Wan2.1-1.3b-speedcontrol-v1.py) | [code](/examples/wanvideo/model_training/full/Wan2.1-1.3b-speedcontrol-v1.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.1-1.3b-speedcontrol-v1.py) | 
[code](/examples/wanvideo/model_training/lora/Wan2.1-1.3b-speedcontrol-v1.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.1-1.3b-speedcontrol-v1.py) | +| [krea/krea-realtime-video](https://www.modelscope.cn/models/krea/krea-realtime-video) | | [code](/examples/wanvideo/model_inference/krea-realtime-video.py) | [code](/examples/wanvideo/model_training/full/krea-realtime-video.sh) | [code](/examples/wanvideo/model_training/validate_full/krea-realtime-video.py) | [code](/examples/wanvideo/model_training/lora/krea-realtime-video.sh) | [code](/examples/wanvideo/model_training/validate_lora/krea-realtime-video.py) | +| [meituan-longcat/LongCat-Video](https://www.modelscope.cn/models/meituan-longcat/LongCat-Video) | `longcat_video` | [code](/examples/wanvideo/model_inference/LongCat-Video.py) | [code](/examples/wanvideo/model_training/full/LongCat-Video.sh) | [code](/examples/wanvideo/model_training/validate_full/LongCat-Video.py) | [code](/examples/wanvideo/model_training/lora/LongCat-Video.sh) | [code](/examples/wanvideo/model_training/validate_lora/LongCat-Video.py) | +| [ByteDance/Video-As-Prompt-Wan2.1-14B](https://modelscope.cn/models/ByteDance/Video-As-Prompt-Wan2.1-14B) | `vap_video`, `vap_prompt` | [code](/examples/wanvideo/model_inference/Video-As-Prompt-Wan2.1-14B.py) | [code](/examples/wanvideo/model_training/full/Video-As-Prompt-Wan2.1-14B.sh) | [code](/examples/wanvideo/model_training/validate_full/Video-As-Prompt-Wan2.1-14B.py) | [code](/examples/wanvideo/model_training/lora/Video-As-Prompt-Wan2.1-14B.sh) | [code](/examples/wanvideo/model_training/validate_lora/Video-As-Prompt-Wan2.1-14B.py) | +| [Wan-AI/Wan2.2-T2V-A14B](https://modelscope.cn/models/Wan-AI/Wan2.2-T2V-A14B) | | [code](/examples/wanvideo/model_inference/Wan2.2-T2V-A14B.py) | [code](/examples/wanvideo/model_training/full/Wan2.2-T2V-A14B.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.2-T2V-A14B.py) | 
[code](/examples/wanvideo/model_training/lora/Wan2.2-T2V-A14B.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.2-T2V-A14B.py) | +| [Wan-AI/Wan2.2-I2V-A14B](https://modelscope.cn/models/Wan-AI/Wan2.2-I2V-A14B) | `input_image` | [code](/examples/wanvideo/model_inference/Wan2.2-I2V-A14B.py) | [code](/examples/wanvideo/model_training/full/Wan2.2-I2V-A14B.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.2-I2V-A14B.py) | [code](/examples/wanvideo/model_training/lora/Wan2.2-I2V-A14B.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.2-I2V-A14B.py) | +| [Wan-AI/Wan2.2-TI2V-5B](https://modelscope.cn/models/Wan-AI/Wan2.2-TI2V-5B) | `input_image` | [code](/examples/wanvideo/model_inference/Wan2.2-TI2V-5B.py) | [code](/examples/wanvideo/model_training/full/Wan2.2-TI2V-5B.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.2-TI2V-5B.py) | [code](/examples/wanvideo/model_training/lora/Wan2.2-TI2V-5B.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.2-TI2V-5B.py) | +| [Wan-AI/Wan2.2-Animate-14B](https://www.modelscope.cn/models/Wan-AI/Wan2.2-Animate-14B) | `input_image`, `animate_pose_video`, `animate_face_video`, `animate_inpaint_video`, `animate_mask_video` | [code](/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py) | [code](/examples/wanvideo/model_training/full/Wan2.2-Animate-14B.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py) | [code](/examples/wanvideo/model_training/lora/Wan2.2-Animate-14B.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py) | +| [Wan-AI/Wan2.2-S2V-14B](https://www.modelscope.cn/models/Wan-AI/Wan2.2-S2V-14B) | `input_image`, `input_audio`, `audio_sample_rate`, `s2v_pose_video` | [code](/examples/wanvideo/model_inference/Wan2.2-S2V-14B_multi_clips.py) | [code](/examples/wanvideo/model_training/full/Wan2.2-S2V-14B.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.2-S2V-14B.py) | 
[code](/examples/wanvideo/model_training/lora/Wan2.2-S2V-14B.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.2-S2V-14B.py) | +| [PAI/Wan2.2-VACE-Fun-A14B](https://www.modelscope.cn/models/PAI/Wan2.2-VACE-Fun-A14B) | `vace_control_video`, `vace_reference_image` | [code](/examples/wanvideo/model_inference/Wan2.2-VACE-Fun-A14B.py) | [code](/examples/wanvideo/model_training/full/Wan2.2-VACE-Fun-A14B.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.2-VACE-Fun-A14B.py) | [code](/examples/wanvideo/model_training/lora/Wan2.2-VACE-Fun-A14B.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.2-VACE-Fun-A14B.py) | +| [PAI/Wan2.2-Fun-A14B-InP](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-InP) | `input_image`, `end_image` | [code](/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-InP.py) | [code](/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-InP.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-InP.py) | [code](/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-InP.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-InP.py) | +| [PAI/Wan2.2-Fun-A14B-Control](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-Control) | `control_video`, `reference_image` | [code](/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-Control.py) | [code](/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-Control.sh) | [code](/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control.py) | [code](/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-Control.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control.py) | +| [PAI/Wan2.2-Fun-A14B-Control-Camera](https://modelscope.cn/models/PAI/Wan2.2-Fun-A14B-Control-Camera) | `control_camera_video`, `input_image` | [code](/examples/wanvideo/model_inference/Wan2.2-Fun-A14B-Control-Camera.py) | [code](/examples/wanvideo/model_training/full/Wan2.2-Fun-A14B-Control-Camera.sh) | 
[code](/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control-Camera.py) | [code](/examples/wanvideo/model_training/lora/Wan2.2-Fun-A14B-Control-Camera.sh) | [code](/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control-Camera.py) | + +* FP8 Precision Training: [doc](/docs/en/Training/FP8_Precision.md), [code](/examples/wanvideo/model_training/special/fp8_training/) +* Two-stage Split Training: [doc](/docs/en/Training/Split_Training.md), [code](/examples/wanvideo/model_training/special/split_training/) +* End-to-end Direct Distillation: [doc](/docs/en/Training/Direct_Distill.md), [code](/examples/wanvideo/model_training/special/direct_distill/) + +## Model Inference + +Models are loaded via `WanVideoPipeline.from_pretrained`, see [Loading Models](/docs/en/Pipeline_Usage/Model_Inference.md#loading-models). + +Input parameters for `WanVideoPipeline` inference include: + +* `prompt`: Prompt describing the content appearing in the video. +* `negative_prompt`: Negative prompt describing content that should not appear in the video, default value is `""`. +* `cfg_scale`: Classifier-free guidance parameter, default value is 5. When set to 1, it no longer takes effect. +* `input_image`: Input image for image-to-video generation, used in conjunction with `denoising_strength`. +* `end_image`: End image for first-and-last frame video generation. +* `input_video`: Input video for video-to-video generation, used in conjunction with `denoising_strength`. +* `denoising_strength`: Denoising strength, range is 0~1, default value is 1. When the value approaches 0, the generated video is similar to the input video; when the value approaches 1, the generated video differs more from the input video. +* `control_video`: Control video for controlling the video generation process. +* `reference_image`: Reference image for maintaining consistency of certain features in the generated video. 
+* `camera_control_direction`: Camera control direction, optional values are `"Left"`, `"Right"`, `"Up"`, `"Down"`, `"LeftUp"`, `"LeftDown"`, `"RightUp"`, `"RightDown"`. +* `camera_control_speed`: Camera control speed, default value is 1/54. +* `vace_video`: VACE control video. +* `vace_video_mask`: VACE control video mask. +* `vace_reference_image`: VACE reference image. +* `vace_scale`: VACE control strength, default value is 1.0. +* `animate_pose_video`: `animate` model pose video. +* `animate_face_video`: `animate` model face video. +* `animate_inpaint_video`: `animate` model local editing video. +* `animate_mask_video`: `animate` model mask video. +* `vap_video`: `video-as-prompt` input video. +* `vap_prompt`: `video-as-prompt` text description. +* `negative_vap_prompt`: `video-as-prompt` negative text description. +* `input_audio`: Input audio for speech-to-video generation. +* `audio_embeds`: Audio embedding vectors. +* `audio_sample_rate`: Audio sampling rate, default value is 16000. +* `s2v_pose_video`: S2V model pose video. +* `motion_video`: S2V model motion video. +* `height`: Video height, must be a multiple of 16. +* `width`: Video width, must be a multiple of 16. +* `num_frames`: Number of video frames, default value is 81, must be a multiple of 4 plus 1 (i.e., `num_frames % 4 == 1`, such as 81). +* `seed`: Random seed. Default is `None`, meaning completely random. +* `rand_device`: Computing device for generating random Gaussian noise matrix, default is `"cpu"`. When set to `cuda`, different GPUs will produce different generation results. +* `num_inference_steps`: Number of inference steps, default value is 50. +* `motion_bucket_id`: Motion control parameter, the larger the value, the greater the motion amplitude. +* `longcat_video`: LongCat input video. +* `tiled`: Whether to enable VAE tiling inference, default is `True`. Setting to `True` can significantly reduce VRAM usage during VAE encoding/decoding stages, producing slight errors and slightly longer inference time. 
+* `tile_size`: Tile size during VAE encoding/decoding stages, default is `(30, 52)`, only effective when `tiled=True`. +* `tile_stride`: Tile stride during VAE encoding/decoding stages, default is `(15, 26)`, only effective when `tiled=True`, must be less than or equal to `tile_size`. +* `switch_DiT_boundary`: Time boundary for switching DiT models, default value is 0.875. +* `sigma_shift`: Timestep offset parameter, default value is 5.0. +* `sliding_window_size`: Sliding window size. +* `sliding_window_stride`: Sliding window stride. +* `tea_cache_l1_thresh`: L1 threshold for TeaCache. +* `tea_cache_model_id`: Model ID used by TeaCache. +* `progress_bar_cmd`: Progress bar, default is `tqdm.tqdm`. Can be disabled by setting to `lambda x:x`. + +If VRAM is insufficient, please enable [VRAM Management](/docs/en/Pipeline_Usage/VRAM_management.md). We provide recommended low VRAM configurations for each model in the example code, see the table in the "Model Overview" section above. + +## Model Training + +Wan series models are uniformly trained through [`examples/wanvideo/model_training/train.py`](/examples/wanvideo/model_training/train.py), and the script parameters include: + +* General Training Parameters + * Dataset Basic Configuration + * `--dataset_base_path`: Root directory of the dataset. + * `--dataset_metadata_path`: Metadata file path of the dataset. + * `--dataset_repeat`: Number of times the dataset is repeated in each epoch. + * `--dataset_num_workers`: Number of processes for each DataLoader. + * `--data_file_keys`: Field names to be loaded from metadata, usually image or video file paths, separated by `,`. + * Model Loading Configuration + * `--model_paths`: Paths of models to be loaded. JSON format. + * `--model_id_with_origin_paths`: Model IDs with original paths, e.g., `"Wan-AI/Wan2.1-T2V-1.3B:diffusion_pytorch_model*.safetensors"`. Separated by commas. 
+ * `--extra_inputs`: Extra input parameters required by the model Pipeline, e.g., extra parameters when training image editing models, separated by `,`. + * `--fp8_models`: Models loaded in FP8 format, consistent with `--model_paths` or `--model_id_with_origin_paths` format. Currently only supports models whose parameters are not updated by gradients (no gradient backpropagation, or gradients only update their LoRA). + * Training Basic Configuration + * `--learning_rate`: Learning rate. + * `--num_epochs`: Number of epochs. + * `--trainable_models`: Trainable models, e.g., `dit`, `vae`, `text_encoder`. + * `--find_unused_parameters`: Whether there are unused parameters in DDP training. Some models contain redundant parameters that do not participate in gradient calculation, and this setting needs to be enabled to avoid errors in multi-GPU training. + * `--weight_decay`: Weight decay size, see [torch.optim.AdamW](https://docs.pytorch.org/docs/stable/generated/torch.optim.AdamW.html). + * `--task`: Training task, default is `sft`. Some models support more training modes, please refer to the documentation of each specific model. + * Output Configuration + * `--output_path`: Model saving path. + * `--remove_prefix_in_ckpt`: Remove prefix in the state dict of the model file. + * `--save_steps`: Interval of training steps to save the model. If this parameter is left blank, the model is saved once per epoch. + * LoRA Configuration + * `--lora_base_model`: Which model to add LoRA to. + * `--lora_target_modules`: Which layers to add LoRA to. + * `--lora_rank`: Rank of LoRA. + * `--lora_checkpoint`: Path of the LoRA checkpoint. If this path is provided, LoRA will be loaded from this checkpoint. + * `--preset_lora_path`: Preset LoRA checkpoint path. If this path is provided, this LoRA will be loaded in the form of being merged into the base model. This parameter is used for LoRA differential training. 
+ * `--preset_lora_model`: Model that the preset LoRA is merged into, e.g., `dit`. + * Gradient Configuration + * `--use_gradient_checkpointing`: Whether to enable gradient checkpointing. + * `--use_gradient_checkpointing_offload`: Whether to offload gradient checkpointing to memory. + * `--gradient_accumulation_steps`: Number of gradient accumulation steps. + * Video Width/Height Configuration + * `--height`: Height of the video. Leave `height` and `width` blank to enable dynamic resolution. + * `--width`: Width of the video. Leave `height` and `width` blank to enable dynamic resolution. + * `--max_pixels`: Maximum pixel area of video frames. When dynamic resolution is enabled, video frames with resolution larger than this value will be downscaled, and video frames with resolution smaller than this value will remain unchanged. + * `--num_frames`: Number of frames in the video. +* Wan Series Specific Parameters + * `--tokenizer_path`: Path of the tokenizer, applicable to text-to-video models, leave blank to automatically download from remote. + * `--audio_processor_path`: Path of the audio processor, applicable to speech-to-video models, leave blank to automatically download from remote. + +We have built a sample video dataset for your testing. You can download this dataset with the following command: + +```shell +modelscope download --dataset DiffSynth-Studio/example_video_dataset --local_dir ./data/example_video_dataset +``` + +We have written recommended training scripts for each model, please refer to the table in the "Model Overview" section above. For how to write model training scripts, please refer to [Model Training](/docs/en/Pipeline_Usage/Model_Training.md); for more advanced training algorithms, please refer to [Training Framework Detailed Explanation](/docs/en/Training/). 
\ No newline at end of file diff --git a/docs/en/Model_Details/Z-Image.md b/docs/en/Model_Details/Z-Image.md new file mode 100644 index 0000000..81f246e --- /dev/null +++ b/docs/en/Model_Details/Z-Image.md @@ -0,0 +1,131 @@ +# Z-Image + +Z-Image is an image generation model trained and open-sourced by the Multimodal Interaction Team of Alibaba Tongyi Lab. + +## Installation + +Before using this project for model inference and training, please install DiffSynth-Studio first. + +```shell +git clone https://github.com/modelscope/DiffSynth-Studio.git +cd DiffSynth-Studio +pip install -e . +``` + +For more information about installation, please refer to [Install Dependencies](/docs/en/Pipeline_Usage/Setup.md). + +## Quick Start + +Run the following code to quickly load the [Tongyi-MAI/Z-Image-Turbo](https://www.modelscope.cn/models/Tongyi-MAI/Z-Image-Turbo) model and perform inference. + +```python +from diffsynth.pipelines.z_image import ZImagePipeline, ModelConfig +import torch + + +pipe = ZImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="transformer/*.safetensors"), + ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="text_encoder/*.safetensors"), + ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), + ], + tokenizer_config=ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="tokenizer/"), +) +prompt = "Young Chinese woman in red Hanfu, intricate embroidery. Impeccable makeup, red floral forehead pattern. Elaborate high bun, golden phoenix headdress, red flowers, beads. Holds round folding fan with lady, trees, bird. Neon lightning-bolt lamp (⚡️), bright yellow glow, above extended left palm. Soft-lit outdoor night background, silhouetted tiered pagoda (西安大雁塔), blurred colorful distant lights." 
+image = pipe(prompt=prompt, seed=42, rand_device="cuda") +image.save("image.jpg") +``` + +## Model Overview + +| Model ID | Inference | Low VRAM Inference | Full Training | Validation After Full Training | LoRA Training | Validation After LoRA Training | +| - | - | - | - | - | - | - | +| [Tongyi-MAI/Z-Image-Turbo](https://www.modelscope.cn/models/Tongyi-MAI/Z-Image-Turbo) | [code](/examples/z_image/model_inference/Z-Image-Turbo.py) | [code](/examples/z_image/model_inference_low_vram/Z-Image-Turbo.py) | [code](/examples/z_image/model_training/full/Z-Image-Turbo.sh) | [code](/examples/z_image/model_training/validate_full/Z-Image-Turbo.py) | [code](/examples/z_image/model_training/lora/Z-Image-Turbo.sh) | [code](/examples/z_image/model_training/validate_lora/Z-Image-Turbo.py) | + +Special Training Scripts: + +* Differential LoRA Training: [doc](/docs/en/Training/Differential_LoRA.md), [code](/examples/z_image/model_training/special/differential_training/) +* Trajectory Imitation Distillation Training (Experimental Feature): [code](/examples/z_image/model_training/special/trajectory_imitation/) + +## Model Inference + +Models are loaded via `ZImagePipeline.from_pretrained`, see [Loading Models](/docs/en/Pipeline_Usage/Model_Inference.md#loading-models). + +Input parameters for `ZImagePipeline` inference include: + +* `prompt`: Prompt describing the content appearing in the image. +* `negative_prompt`: Negative prompt describing content that should not appear in the image, default value is `""`. +* `cfg_scale`: Classifier-free guidance parameter, default value is 1. +* `input_image`: Input image for image-to-image generation, used in conjunction with `denoising_strength`. +* `denoising_strength`: Denoising strength, range is 0~1, default value is 1. When the value approaches 0, the generated image is similar to the input image; when the value approaches 1, the generated image differs more from the input image. 
When `input_image` parameter is not provided, do not set this to a non-1 value. +* `height`: Image height, must be a multiple of 16. +* `width`: Image width, must be a multiple of 16. +* `seed`: Random seed. Default is `None`, meaning completely random. +* `rand_device`: Computing device for generating random Gaussian noise matrix, default is `"cpu"`. When set to `cuda`, different GPUs will produce different generation results. +* `num_inference_steps`: Number of inference steps, default value is 8. + +If VRAM is insufficient, please enable [VRAM Management](/docs/en/Pipeline_Usage/VRAM_management.md). We provide recommended low VRAM configurations for each model in the example code, see the table in the "Model Overview" section above. + +## Model Training + +Z-Image series models are uniformly trained through [`examples/z_image/model_training/train.py`](/examples/z_image/model_training/train.py), and the script parameters include: + +* General Training Parameters + * Dataset Basic Configuration + * `--dataset_base_path`: Root directory of the dataset. + * `--dataset_metadata_path`: Metadata file path of the dataset. + * `--dataset_repeat`: Number of times the dataset is repeated in each epoch. + * `--dataset_num_workers`: Number of processes for each DataLoader. + * `--data_file_keys`: Field names to be loaded from metadata, usually image or video file paths, separated by `,`. + * Model Loading Configuration + * `--model_paths`: Paths of models to be loaded. JSON format. + * `--model_id_with_origin_paths`: Model IDs with original paths, e.g., `"Tongyi-MAI/Z-Image-Turbo:transformer/*.safetensors"`. Separated by commas. + * `--extra_inputs`: Extra input parameters required by the model Pipeline, e.g., extra parameters when training image editing models, separated by `,`. + * `--fp8_models`: Models loaded in FP8 format, consistent with `--model_paths` or `--model_id_with_origin_paths` format. 
Currently only supports models whose parameters are not updated by gradients (no gradient backpropagation, or gradients only update their LoRA). + * Training Basic Configuration + * `--learning_rate`: Learning rate. + * `--num_epochs`: Number of epochs. + * `--trainable_models`: Trainable models, e.g., `dit`, `vae`, `text_encoder`. + * `--find_unused_parameters`: Whether there are unused parameters in DDP training. Some models contain redundant parameters that do not participate in gradient calculation, and this setting needs to be enabled to avoid errors in multi-GPU training. + * `--weight_decay`: Weight decay size, see [torch.optim.AdamW](https://docs.pytorch.org/docs/stable/generated/torch.optim.AdamW.html). + * `--task`: Training task, default is `sft`. Some models support more training modes, please refer to the documentation of each specific model. + * Output Configuration + * `--output_path`: Model saving path. + * `--remove_prefix_in_ckpt`: Remove prefix in the state dict of the model file. + * `--save_steps`: Interval of training steps to save the model. If this parameter is left blank, the model is saved once per epoch. + * LoRA Configuration + * `--lora_base_model`: Which model to add LoRA to. + * `--lora_target_modules`: Which layers to add LoRA to. + * `--lora_rank`: Rank of LoRA. + * `--lora_checkpoint`: Path of the LoRA checkpoint. If this path is provided, LoRA will be loaded from this checkpoint. + * `--preset_lora_path`: Preset LoRA checkpoint path. If this path is provided, this LoRA will be loaded in the form of being merged into the base model. This parameter is used for LoRA differential training. + * `--preset_lora_model`: Model that the preset LoRA is merged into, e.g., `dit`. + * Gradient Configuration + * `--use_gradient_checkpointing`: Whether to enable gradient checkpointing. + * `--use_gradient_checkpointing_offload`: Whether to offload gradient checkpointing to memory. 
+ + * `--gradient_accumulation_steps`: Number of gradient accumulation steps. + * Image Width/Height Configuration (Applicable to Image Generation and Video Generation Models) + * `--height`: Height of image or video. Leave `height` and `width` blank to enable dynamic resolution. + * `--width`: Width of image or video. Leave `height` and `width` blank to enable dynamic resolution. + * `--max_pixels`: Maximum pixel area of image or video frames. When dynamic resolution is enabled, images with resolution larger than this value will be downscaled, and images with resolution smaller than this value will remain unchanged. +* Z-Image Specific Parameters + * `--tokenizer_path`: Path of the tokenizer, applicable to text-to-image models, leave blank to automatically download from remote. + +We have built a sample image dataset for your testing. You can download this dataset with the following command: + +```shell +modelscope download --dataset DiffSynth-Studio/example_image_dataset --local_dir ./data/example_image_dataset +``` + +We have written recommended training scripts for each model, please refer to the table in the "Model Overview" section above. For how to write model training scripts, please refer to [Model Training](/docs/en/Pipeline_Usage/Model_Training.md); for more advanced training algorithms, please refer to [Training Framework Detailed Explanation](/docs/en/Training/). + +Training Tips: + +* [Tongyi-MAI/Z-Image-Turbo](https://www.modelscope.cn/models/Tongyi-MAI/Z-Image-Turbo) is a distilled acceleration model. Therefore, direct training will quickly cause the model to lose its acceleration capability. The effect of inference with "acceleration configuration" (`num_inference_steps=8`, `cfg_scale=1`) becomes worse, while the effect of inference with "no acceleration configuration" (`num_inference_steps=30`, `cfg_scale=2`) becomes better. 
The following training and inference schemes can be adopted: + * Standard SFT Training ([code](/examples/z_image/model_training/lora/Z-Image-Turbo.sh)) + No Acceleration Configuration Inference + * Differential LoRA Training ([code](/examples/z_image/model_training/special/differential_training/)) + Acceleration Configuration Inference + * An additional LoRA needs to be loaded in differential LoRA training, e.g., [ostris/zimage_turbo_training_adapter](https://www.modelscope.cn/models/ostris/zimage_turbo_training_adapter) + * Standard SFT Training ([code](/examples/z_image/model_training/lora/Z-Image-Turbo.sh)) + Trajectory Imitation Distillation Training ([code](/examples/z_image/model_training/special/trajectory_imitation/)) + Acceleration Configuration Inference + * Standard SFT Training ([code](/examples/z_image/model_training/lora/Z-Image-Turbo.sh)) + Load Distillation Acceleration LoRA During Inference ([model](https://www.modelscope.cn/models/DiffSynth-Studio/Z-Image-Turbo-DistillFix)) + Acceleration Configuration Inference \ No newline at end of file diff --git a/docs/en/Pipeline_Usage/Environment_Variables.md b/docs/en/Pipeline_Usage/Environment_Variables.md new file mode 100644 index 0000000..308bf16 --- /dev/null +++ b/docs/en/Pipeline_Usage/Environment_Variables.md @@ -0,0 +1,39 @@ +# Environment Variables + +`DiffSynth-Studio` can control some settings through environment variables. + +In `Python` code, you can set environment variables using `os.environ`. Please note that environment variables must be set before `import diffsynth`. + +```python +import os +os.environ["DIFFSYNTH_MODEL_BASE_PATH"] = "./path_to_my_models" +import diffsynth +``` + +On Linux operating systems, you can also temporarily set environment variables from the command line: + +```shell +DIFFSYNTH_MODEL_BASE_PATH="./path_to_my_models" python xxx.py +``` + +Below are the environment variables supported by `DiffSynth-Studio`. 
+ +## `DIFFSYNTH_SKIP_DOWNLOAD` + +Whether to skip model downloads. Can be set to `True`, `true`, `False`, `false`. If `skip_download` is not set in `ModelConfig`, this environment variable will determine whether to skip model downloads. + +## `DIFFSYNTH_MODEL_BASE_PATH` + +Model download root directory. Can be set to any local path. If `local_model_path` is not set in `ModelConfig`, model files will be downloaded to the path pointed to by this environment variable. If neither is set, model files will be downloaded to `./models`. + +## `DIFFSYNTH_ATTENTION_IMPLEMENTATION` + +Attention mechanism implementation method. Can be set to `flash_attention_3`, `flash_attention_2`, `sage_attention`, `xformers`, or `torch`. See [`./core/attention.md`](/docs/en/API_Reference/core/attention.md) for details. + +## `DIFFSYNTH_DISK_MAP_BUFFER_SIZE` + +Buffer size in disk mapping. Default is 1 GB (1000000000 bytes). Larger values occupy more memory but result in faster speeds. + +## `DIFFSYNTH_DOWNLOAD_RESOURCE` + +Remote model download source. Can be set to `modelscope` or `huggingface` to control the source of model downloads. Default value is `modelscope`. \ No newline at end of file diff --git a/docs/en/Pipeline_Usage/Model_Inference.md b/docs/en/Pipeline_Usage/Model_Inference.md new file mode 100644 index 0000000..8901ee6 --- /dev/null +++ b/docs/en/Pipeline_Usage/Model_Inference.md @@ -0,0 +1,105 @@ +# Model Inference + +This document uses the Qwen-Image model as an example to introduce how to use `DiffSynth-Studio` for model inference. 
+ +## Loading Models + +Models are loaded through `from_pretrained`: + +```python +from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig +import torch + +pipe = QwenImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), + ], + tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), +) +``` + +Where `torch_dtype` and `device` are computation precision and computation device (not model precision and device). `model_configs` can be configured in multiple ways for model paths. For how models are loaded internally in this project, please refer to [`diffsynth.core.loader`](/docs/en/API_Reference/core/loader.md). + +
+ +Download and load models from remote sources + +> `DiffSynth-Studio` downloads and loads models from [ModelScope](https://www.modelscope.cn/) by default. You need to fill in `model_id` and `origin_file_pattern`, for example: +> +> ```python +> ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), +> ``` +> +> Model files are downloaded to the `./models` path by default, which can be modified through [environment variable DIFFSYNTH_MODEL_BASE_PATH](/docs/en/Pipeline_Usage/Environment_Variables.md#diffsynth_model_base_path). + +
+ +
+ +Load models from local file paths + +> Fill in `path`, for example: +> +> ```python +> ModelConfig(path="models/xxx.safetensors") +> ``` +> +> For models loaded from multiple files, use a list, for example: +> +> ```python +> ModelConfig(path=[ +> "models/Qwen/Qwen-Image/text_encoder/model-00001-of-00004.safetensors", +> "models/Qwen/Qwen-Image/text_encoder/model-00002-of-00004.safetensors", +> "models/Qwen/Qwen-Image/text_encoder/model-00003-of-00004.safetensors", +> "models/Qwen/Qwen-Image/text_encoder/model-00004-of-00004.safetensors", +> ]) +> ``` + +
+ +By default, even after models have been downloaded, the program will still query remotely for missing files. To completely disable remote requests, set [environment variable DIFFSYNTH_SKIP_DOWNLOAD](/docs/en/Pipeline_Usage/Environment_Variables.md#diffsynth_skip_download) to `True`. + +```python +import os +os.environ["DIFFSYNTH_SKIP_DOWNLOAD"] = "True" +import diffsynth +``` + +To download models from [HuggingFace](https://huggingface.co/), set [environment variable DIFFSYNTH_DOWNLOAD_RESOURCE](/docs/en/Pipeline_Usage/Environment_Variables.md#diffsynth_download_resource) to `huggingface`. + +```python +import os +os.environ["DIFFSYNTH_DOWNLOAD_RESOURCE"] = "huggingface" +import diffsynth +``` + +## Starting Inference + +Input a prompt to start the inference process and generate an image. + +```python +from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig +import torch + +pipe = QwenImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), + ], + tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), +) +prompt = "Exquisite portrait, underwater girl, blue dress flowing, hair floating, translucent light, bubbles surrounding, peaceful face, intricate details, dreamy and ethereal." +image = pipe(prompt, seed=0, num_inference_steps=40) +image.save("image.jpg") +``` + +Each model `Pipeline` has different input parameters. Please refer to the documentation for each model. + +If the model parameters are too large, causing insufficient VRAM, please enable [VRAM management](/docs/en/Pipeline_Usage/VRAM_management.md). 
\ No newline at end of file diff --git a/docs/en/Pipeline_Usage/Model_Training.md b/docs/en/Pipeline_Usage/Model_Training.md new file mode 100644 index 0000000..3c5bffd --- /dev/null +++ b/docs/en/Pipeline_Usage/Model_Training.md @@ -0,0 +1,247 @@ +# Model Training + +This document introduces how to use `DiffSynth-Studio` for model training. + +## Script Parameters + +Training scripts typically include the following parameters: + +* Dataset base configuration + * `--dataset_base_path`: Root directory of the dataset. + * `--dataset_metadata_path`: Metadata file path of the dataset. + * `--dataset_repeat`: Number of times the dataset is repeated in each epoch. + * `--dataset_num_workers`: Number of processes for each Dataloader. + * `--data_file_keys`: Field names that need to be loaded from metadata, usually image or video file paths, separated by `,`. +* Model loading configuration + * `--model_paths`: Paths of models to be loaded. JSON format. + * `--model_id_with_origin_paths`: Model IDs with original paths, for example `"Qwen/Qwen-Image:transformer/diffusion_pytorch_model*.safetensors"`. Separated by commas. + * `--extra_inputs`: Extra input parameters required by the model Pipeline, for example, training image editing model Qwen-Image-Edit requires extra parameter `edit_image`, separated by `,`. + * `--fp8_models`: Models loaded in FP8 format, consistent with the format of `--model_paths` or `--model_id_with_origin_paths`. Currently only supports models whose parameters are not updated by gradients (no gradient backpropagation, or gradients only update their LoRA). +* Training base configuration + * `--learning_rate`: Learning rate. + * `--num_epochs`: Number of epochs. + * `--trainable_models`: Trainable models, for example `dit`, `vae`, `text_encoder`. + * `--find_unused_parameters`: Whether there are unused parameters in DDP training. 
Some models contain redundant parameters that do not participate in gradient calculation, and this setting needs to be enabled to avoid errors in multi-GPU training. + * `--weight_decay`: Weight decay size. See [torch.optim.AdamW](https://docs.pytorch.org/docs/stable/generated/torch.optim.AdamW.html) for details. + * `--task`: Training task, default is `sft`. Some models support more training modes. Please refer to the documentation for each specific model. +* Output configuration + * `--output_path`: Model save path. + * `--remove_prefix_in_ckpt`: Remove prefixes in the state dict of model files. + * `--save_steps`: Interval of training steps for saving models. If this parameter is left blank, the model will be saved once per epoch. +* LoRA configuration + * `--lora_base_model`: Which model LoRA is added to. + * `--lora_target_modules`: Which layers LoRA is added to. + * `--lora_rank`: Rank of LoRA. + * `--lora_checkpoint`: Path of LoRA checkpoint. If this path is provided, LoRA will be loaded from this checkpoint. + * `--preset_lora_path`: Preset LoRA checkpoint path. If this path is provided, this LoRA will be loaded in the form of being merged into the base model. This parameter is used for LoRA differential training. + * `--preset_lora_model`: Model that preset LoRA is merged into, for example `dit`. +* Gradient configuration + * `--use_gradient_checkpointing`: Whether to enable gradient checkpointing. + * `--use_gradient_checkpointing_offload`: Whether to offload gradient checkpointing to memory. + * `--gradient_accumulation_steps`: Number of gradient accumulation steps. +* Image dimension configuration (applicable to image generation models and video generation models) + * `--height`: Height of images or videos. Leave `height` and `width` blank to enable dynamic resolution. + * `--width`: Width of images or videos. Leave `height` and `width` blank to enable dynamic resolution. + * `--max_pixels`: Maximum pixel area of images or video frames. 
When dynamic resolution is enabled, images with resolution larger than this value will be scaled down, and images with resolution smaller than this value will remain unchanged. + +Some models' training scripts also contain additional parameters. See the documentation for each model for details. + +## Preparing Datasets + +`DiffSynth-Studio` adopts a universal dataset format. The dataset contains a series of data files (images, videos, etc.) and annotated metadata files. We recommend organizing dataset files as follows: + +``` +data/example_image_dataset/ +├── metadata.csv +├── image_1.jpg +└── image_2.jpg +``` + +Where `image_1.jpg`, `image_2.jpg` are training image data, and `metadata.csv` is the metadata list, for example: + +``` +image,prompt +image_1.jpg,"a dog" +image_2.jpg,"a cat" +``` + +We have built sample datasets for your testing. To understand how the universal dataset architecture is implemented, please refer to [`diffsynth.core.data`](/docs/en/API_Reference/core/data.md). + +
+ +Sample Image Dataset + +> ```shell +> modelscope download --dataset DiffSynth-Studio/example_image_dataset --local_dir ./data/example_image_dataset +> ``` +> +> Applicable to training of image generation models such as Qwen-Image and FLUX. + +
+ +
+ +Sample Video Dataset + +> ```shell +> modelscope download --dataset DiffSynth-Studio/example_video_dataset --local_dir ./data/example_video_dataset +> ``` +> +> Applicable to training of video generation models such as Wan. + +
+ +## Loading Models + +Similar to [model loading during inference](/docs/en/Pipeline_Usage/Model_Inference.md#loading-models), we support multiple ways to configure model paths, and the two methods can be mixed. + +
+ +Download and load models from remote sources + +> If we load models during inference through the following settings: +> +> ```python +> model_configs=[ +> ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), +> ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"), +> ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), +> ] +> ``` +> +> Then during training, fill in the following parameters to load the corresponding models: +> +> ```shell +> --model_id_with_origin_paths "Qwen/Qwen-Image:transformer/diffusion_pytorch_model*.safetensors,Qwen/Qwen-Image:text_encoder/model*.safetensors,Qwen/Qwen-Image:vae/diffusion_pytorch_model.safetensors" +> ``` +> +> Model files are downloaded to the `./models` path by default, which can be modified through [environment variable DIFFSYNTH_MODEL_BASE_PATH](/docs/en/Pipeline_Usage/Environment_Variables.md#diffsynth_model_base_path). +> +> By default, even after models have been downloaded, the program will still query remotely for missing files. To completely disable remote requests, set [environment variable DIFFSYNTH_SKIP_DOWNLOAD](/docs/en/Pipeline_Usage/Environment_Variables.md#diffsynth_skip_download) to `True`. + +
+ +
+ +
+ +Load models from local file paths + +> If loading models from local files during inference, for example: +> +> ```python +> model_configs=[ +> ModelConfig([ +> "models/Qwen/Qwen-Image/transformer/diffusion_pytorch_model-00001-of-00009.safetensors", +> "models/Qwen/Qwen-Image/transformer/diffusion_pytorch_model-00002-of-00009.safetensors", +> "models/Qwen/Qwen-Image/transformer/diffusion_pytorch_model-00003-of-00009.safetensors", +> "models/Qwen/Qwen-Image/transformer/diffusion_pytorch_model-00004-of-00009.safetensors", +> "models/Qwen/Qwen-Image/transformer/diffusion_pytorch_model-00005-of-00009.safetensors", +> "models/Qwen/Qwen-Image/transformer/diffusion_pytorch_model-00006-of-00009.safetensors", +> "models/Qwen/Qwen-Image/transformer/diffusion_pytorch_model-00007-of-00009.safetensors", +> "models/Qwen/Qwen-Image/transformer/diffusion_pytorch_model-00008-of-00009.safetensors", +> "models/Qwen/Qwen-Image/transformer/diffusion_pytorch_model-00009-of-00009.safetensors" +> ]), +> ModelConfig([ +> "models/Qwen/Qwen-Image/text_encoder/model-00001-of-00004.safetensors", +> "models/Qwen/Qwen-Image/text_encoder/model-00002-of-00004.safetensors", +> "models/Qwen/Qwen-Image/text_encoder/model-00003-of-00004.safetensors", +> "models/Qwen/Qwen-Image/text_encoder/model-00004-of-00004.safetensors" +> ]), +> ModelConfig("models/Qwen/Qwen-Image/vae/diffusion_pytorch_model.safetensors") +> ] +> ``` +> +> Then during training, set to: +> +> ```shell +> --model_paths '[ +> [ +> "models/Qwen/Qwen-Image/transformer/diffusion_pytorch_model-00001-of-00009.safetensors", +> "models/Qwen/Qwen-Image/transformer/diffusion_pytorch_model-00002-of-00009.safetensors", +> "models/Qwen/Qwen-Image/transformer/diffusion_pytorch_model-00003-of-00009.safetensors", +> "models/Qwen/Qwen-Image/transformer/diffusion_pytorch_model-00004-of-00009.safetensors", +> "models/Qwen/Qwen-Image/transformer/diffusion_pytorch_model-00005-of-00009.safetensors", +> 
"models/Qwen/Qwen-Image/transformer/diffusion_pytorch_model-00006-of-00009.safetensors", +> "models/Qwen/Qwen-Image/transformer/diffusion_pytorch_model-00007-of-00009.safetensors", +> "models/Qwen/Qwen-Image/transformer/diffusion_pytorch_model-00008-of-00009.safetensors", +> "models/Qwen/Qwen-Image/transformer/diffusion_pytorch_model-00009-of-00009.safetensors" +> ], +> [ +> "models/Qwen/Qwen-Image/text_encoder/model-00001-of-00004.safetensors", +> "models/Qwen/Qwen-Image/text_encoder/model-00002-of-00004.safetensors", +> "models/Qwen/Qwen-Image/text_encoder/model-00003-of-00004.safetensors", +> "models/Qwen/Qwen-Image/text_encoder/model-00004-of-00004.safetensors" +> ], +> "models/Qwen/Qwen-Image/vae/diffusion_pytorch_model.safetensors" +> ]' \ +> ``` +> +> Note that `--model_paths` is in JSON format, and extra `,` cannot appear in it, otherwise it cannot be parsed normally. + +
+ +## Setting Trainable Modules + +The training framework supports training of any model. Taking Qwen-Image as an example, to fully train the DiT model, set to: + +```shell +--trainable_models "dit" +``` + +To train LoRA of the DiT model, set to: + +```shell +--lora_base_model dit --lora_target_modules "to_q,to_k,to_v" --lora_rank 32 +``` + +We hope to leave enough room for technical exploration, so the framework supports training any number of modules simultaneously. For example, to train the text encoder, controlnet, and LoRA of the DiT simultaneously: + +```shell +--trainable_models "text_encoder,controlnet" --lora_base_model dit --lora_target_modules "to_q,to_k,to_v" --lora_rank 32 +``` + +Additionally, since the training script loads multiple modules (text encoder, dit, vae, etc.), prefixes need to be removed when saving model files. For example, when fully training the DiT part or training the LoRA model of the DiT part, please set `--remove_prefix_in_ckpt pipe.dit.`. If multiple modules are trained simultaneously, developers need to write code to split the state dict in the model file after training is completed. + +## Starting the Training Program + +The training framework is built on [`accelerate`](https://huggingface.co/docs/accelerate/index). Training commands are written in the following format: + +```shell +accelerate launch xxx/train.py \ + --xxx yyy \ + --xxxx yyyy +``` + +We have written preset training scripts for each model. See the documentation for each model for details. + +By default, `accelerate` will train according to the configuration in `~/.cache/huggingface/accelerate/default_config.yaml`. Use `accelerate config` to configure interactively in the terminal, including multi-GPU training, [`DeepSpeed`](https://www.deepspeed.ai/), etc. + +We provide recommended `accelerate` configuration files for some models, which can be set through `--config_file`. 
For example, full training of the Qwen-Image model: + +```shell +accelerate launch --config_file examples/qwen_image/model_training/full/accelerate_config_zero2offload.yaml examples/qwen_image/model_training/train.py \ + --dataset_base_path data/example_image_dataset \ + --dataset_metadata_path data/example_image_dataset/metadata.csv \ + --max_pixels 1048576 \ + --dataset_repeat 50 \ + --model_id_with_origin_paths "Qwen/Qwen-Image:transformer/diffusion_pytorch_model*.safetensors,Qwen/Qwen-Image:text_encoder/model*.safetensors,Qwen/Qwen-Image:vae/diffusion_pytorch_model.safetensors" \ + --learning_rate 1e-5 \ + --num_epochs 2 \ + --remove_prefix_in_ckpt "pipe.dit." \ + --output_path "./models/train/Qwen-Image_full" \ + --trainable_models "dit" \ + --use_gradient_checkpointing \ + --find_unused_parameters +``` + +## Training Considerations + +* In addition to the `csv` format, dataset metadata also supports `json` and `jsonl` formats. For how to choose the best metadata format, please refer to [/docs/en/API_Reference/core/data.md#metadata](/docs/en/API_Reference/core/data.md#metadata) +* Training effectiveness is usually strongly correlated with training steps and weakly correlated with epoch count. Therefore, we recommend using the `--save_steps` parameter to save model files at training step intervals. +* When data volume * `dataset_repeat` exceeds $10^9$, we observed that the dataset speed becomes significantly slower, which seems to be a `PyTorch` bug. We are not sure if newer versions of `PyTorch` have fixed this issue. +* For learning rate `--learning_rate`, it is recommended to set to `1e-4` in LoRA training and `1e-5` in full training. +* The training framework does not support batch size > 1. The reasons are complex. See [Q&A: Why doesn't the training framework support batch size > 1?](/docs/en/QA.md#why-doesnt-the-training-framework-support-batch-size--1) +* Some models contain redundant parameters. 
For example, the text encoding part of the last layer of Qwen-Image's DiT part. When training these models, `--find_unused_parameters` needs to be set to avoid errors in multi-GPU training. For compatibility with community models, we do not intend to remove these redundant parameters. +* The loss function value of Diffusion models has little relationship with actual effects. Therefore, we do not record loss function values during training. We recommend setting `--num_epochs` to a sufficiently large value, testing while training, and manually closing the training program after the effect converges. +* `--use_gradient_checkpointing` is usually enabled unless GPU VRAM is sufficient; `--use_gradient_checkpointing_offload` is enabled as needed. See [`diffsynth.core.gradient`](/docs/en/API_Reference/core/gradient.md) for details. \ No newline at end of file diff --git a/docs/en/Pipeline_Usage/Setup.md b/docs/en/Pipeline_Usage/Setup.md new file mode 100644 index 0000000..c9fba68 --- /dev/null +++ b/docs/en/Pipeline_Usage/Setup.md @@ -0,0 +1,21 @@ +# Installing Dependencies + +Install from source (recommended): + +``` +git clone https://github.com/modelscope/DiffSynth-Studio.git +cd DiffSynth-Studio +pip install -e . +``` + +Install from PyPI (there may be delays in version updates; for latest features, install from source): + +``` +pip install diffsynth +``` + +If you encounter issues during installation, they may be caused by upstream dependency packages. 
Please refer to the documentation for these packages: + +* [torch](https://pytorch.org/get-started/locally/) +* [sentencepiece](https://github.com/google/sentencepiece) +* [cmake](https://cmake.org) \ No newline at end of file diff --git a/docs/en/Pipeline_Usage/VRAM_management.md b/docs/en/Pipeline_Usage/VRAM_management.md new file mode 100644 index 0000000..ecf5379 --- /dev/null +++ b/docs/en/Pipeline_Usage/VRAM_management.md @@ -0,0 +1,206 @@ +# VRAM Management + +VRAM management is a distinctive feature of `DiffSynth-Studio` that enables GPUs with low VRAM to run inference with large parameter models. This document uses Qwen-Image as an example to introduce how to use the VRAM management solution. + +## Basic Inference + +The following code does not enable any VRAM management, occupying 56G VRAM as a reference. + +```python +from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig +import torch + +pipe = QwenImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), + ], + tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), +) +prompt = "Exquisite portrait, underwater girl, blue dress flowing, hair floating, translucent light, bubbles surrounding, peaceful face, intricate details, dreamy and ethereal." +image = pipe(prompt, seed=0, num_inference_steps=40) +image.save("image.jpg") +``` + +## CPU Offload + +Since the model `Pipeline` consists of multiple components that are not called simultaneously, we can move some components to memory when they are not needed for computation, reducing VRAM usage. 
The following code implements this logic, occupying 40G VRAM. + +```python +from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig +import torch + +vram_config = { + "offload_dtype": torch.bfloat16, + "offload_device": "cpu", + "onload_dtype": torch.bfloat16, + "onload_device": "cuda", + "preparing_dtype": torch.bfloat16, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", +} +pipe = QwenImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors", **vram_config), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), + ], + tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), +) +prompt = "Exquisite portrait, underwater girl, blue dress flowing, hair floating, translucent light, bubbles surrounding, peaceful face, intricate details, dreamy and ethereal." +image = pipe(prompt, seed=0, num_inference_steps=40) +image.save("image.jpg") +``` + +## FP8 Quantization + +Building upon CPU Offload, we further enable FP8 quantization to reduce VRAM requirements. The following code allows model parameters to be stored in VRAM with FP8 precision and temporarily converted to BF16 precision for computation during inference, occupying 21G VRAM. However, this quantization scheme has minor image quality degradation issues. 
+ +```python +from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig +import torch + +vram_config = { + "offload_dtype": torch.float8_e4m3fn, + "offload_device": "cpu", + "onload_dtype": torch.float8_e4m3fn, + "onload_device": "cuda", + "preparing_dtype": torch.float8_e4m3fn, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", +} +pipe = QwenImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors", **vram_config), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), + ], + tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), +) +prompt = "Exquisite portrait, underwater girl, blue dress flowing, hair floating, translucent light, bubbles surrounding, peaceful face, intricate details, dreamy and ethereal." +image = pipe(prompt, seed=0, num_inference_steps=40) +image.save("image.jpg") +``` + +> Q: Why temporarily convert to BF16 precision during inference instead of computing with FP8 precision? +> +> A: Native FP8 computation is only supported on Hopper architecture GPUs (such as H20) and has significant computational errors. We currently do not enable FP8 precision computation. The current FP8 quantization only reduces VRAM usage but does not improve computation speed. + +## Dynamic VRAM Management + +In CPU Offload, we control model components. In fact, we support Layer-level Offload, splitting a model into multiple Layers, keeping some resident in VRAM and storing others in memory for on-demand transfer to VRAM for computation. This feature requires model developers to provide detailed VRAM management solutions for each model. 
Related configurations are in `diffsynth/configs/vram_management_module_maps.py`. + +By adding the `vram_limit` parameter to the `Pipeline`, the framework can automatically sense the remaining VRAM of the device and decide how to split the model between VRAM and memory. The smaller the `vram_limit`, the less VRAM is occupied, but the slower the speed. +* When `vram_limit=None`, the default state, the framework assumes unlimited VRAM and dynamic VRAM management is disabled +* When `vram_limit=10`, the framework will limit the model after VRAM usage exceeds 10G, moving the excess parts to memory storage +* When `vram_limit=0`, the framework will do its best to reduce VRAM usage, storing all model parameters in memory and transferring them to VRAM for computation only when necessary + +When VRAM is insufficient to run model inference, the framework will attempt to exceed the `vram_limit` restriction to keep the model inference running. Therefore, the VRAM management framework cannot always guarantee that VRAM usage will be less than `vram_limit`. We recommend setting it to slightly less than the actual available VRAM. For example, when GPU VRAM is 16G, set it to `vram_limit=15.5`. In `PyTorch`, you can use `torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3)` to get the GPU's VRAM.
+ +```python +from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig +import torch + +vram_config = { + "offload_dtype": torch.float8_e4m3fn, + "offload_device": "cpu", + "onload_dtype": torch.float8_e4m3fn, + "onload_device": "cpu", + "preparing_dtype": torch.float8_e4m3fn, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", +} +pipe = QwenImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors", **vram_config), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), + ], + tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), + vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, +) +prompt = "Exquisite portrait, underwater girl, blue dress flowing, hair floating, translucent light, bubbles surrounding, peaceful face, intricate details, dreamy and ethereal." +image = pipe(prompt, seed=0, num_inference_steps=40) +image.save("image.jpg") +``` + +## Disk Offload + +In more extreme cases, when memory is also insufficient to store the entire model, the Disk Offload feature allows lazy loading of model parameters, meaning each Layer of the model only reads the corresponding parameters from disk when the forward function is called. When enabling this feature, we recommend using high-speed SSD drives. + +Disk Offload is a very special VRAM management solution that only supports `.safetensors` format files, not `.bin`, `.pth`, `.ckpt`, or other binary files, and does not support [state dict converter](/docs/en/Developer_Guide/Integrating_Your_Model.md#step-2-model-file-format-conversion) with Tensor reshape. 
+ +```python +from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig +import torch + +vram_config = { + "offload_dtype": "disk", + "offload_device": "disk", + "onload_dtype": "disk", + "onload_device": "disk", + "preparing_dtype": torch.float8_e4m3fn, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", +} +pipe = QwenImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors", **vram_config), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), + ], + tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), + vram_limit=10, +) +prompt = "Exquisite portrait, underwater girl, blue dress flowing, hair floating, translucent light, bubbles surrounding, peaceful face, intricate details, dreamy and ethereal." +image = pipe(prompt, seed=0, num_inference_steps=40) +image.save("image.jpg") +``` + +## More Usage Methods + +Information in `vram_config` can be filled in manually, for example, Disk Offload without FP8 quantization: + +```python +vram_config = { + "offload_dtype": "disk", + "offload_device": "disk", + "onload_dtype": "disk", + "onload_device": "disk", + "preparing_dtype": torch.bfloat16, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", +} +``` + +Specifically, the VRAM management module divides model Layers into the following four states: + +* Offload: This model will not be called in the short term. This state is controlled by switching `Pipeline` +* Onload: This model will be called at any time soon. 
This state is controlled by switching `Pipeline` +* Preparing: Intermediate state between Onload and Computation. A temporary storage state when VRAM allows. This state is controlled by the VRAM management mechanism and enters this state if and only if [vram_limit is set to unlimited] or [vram_limit is set and there is spare VRAM] +* Computation: The model is being computed. This state is controlled by the VRAM management mechanism and is temporarily entered only during `forward` + +If you are a model developer and want to control the VRAM management granularity of a specific model, please refer to [../Developer_Guide/Enabling_VRAM_management.md](/docs/en/Developer_Guide/Enabling_VRAM_management.md). + +## Best Practices + +* Sufficient VRAM -> Use [Basic Inference](#basic-inference) +* Insufficient VRAM + * Sufficient memory -> Use [Dynamic VRAM Management](#dynamic-vram-management) + * Insufficient memory -> Use [Disk Offload](#disk-offload) \ No newline at end of file diff --git a/docs/en/QA.md b/docs/en/QA.md new file mode 100644 index 0000000..fe75460 --- /dev/null +++ b/docs/en/QA.md @@ -0,0 +1,28 @@ +# Frequently Asked Questions + +## Why doesn't the training framework support batch size > 1? + +* **Larger batch sizes no longer achieve significant acceleration**: Due to acceleration technologies such as flash attention that have fully improved GPU utilization, larger batch sizes will only bring greater VRAM usage without significant acceleration. The experience with small models like Stable Diffusion 1.5 is no longer applicable to the latest large models. +* **Larger batch sizes can be achieved through other solutions**: Multi-GPU training and Gradient Accumulation can both mathematically equivalently achieve larger batch sizes. +* **Larger batch sizes contradict the framework's general design**: We hope to build a general training framework. 
Many models cannot accommodate larger batch sizes, such as text encodings of different lengths and images of different resolutions, which cannot be merged into larger batches. + +## Why aren't redundant parameters removed from certain models? + +In some models, redundant parameters exist. For example, in Qwen-Image's DiT model, the text portion of the last layer does not participate in any calculations. This is a minor bug left by the model developers. Setting it as trainable directly will also cause errors in multi-GPU training. + +To maintain compatibility with other models in the open-source community, we have decided to retain these parameters. These redundant parameters can avoid errors in multi-GPU training through the `--find_unused_parameters` parameter. + +## Why does FP8 quantization show no acceleration effect? + +Native FP8 computation relies on Hopper architecture GPUs and has significant precision errors. It is currently immature technology, so this project does not support native FP8 computation. + +FP8 computation in VRAM management refers to storing model parameters in memory or VRAM with FP8 precision and temporarily converting them to other precisions when needed for computation. Therefore, it can only reduce VRAM usage without acceleration effects. + +## Why doesn't the training framework support native FP8 precision training? + +Even with suitable hardware conditions, we currently have no plans to support native FP8 precision training. + +* The main challenge of native FP8 precision training is precision overflow caused by gradient explosion. To ensure training stability, the model structure needs to be redesigned accordingly. However, no model developers are willing to do so at present. +* Additionally, models trained with native FP8 precision can only be computed with BF16 precision during inference without Hopper architecture GPUs, theoretically resulting in generation quality inferior to FP8. 
+ +Therefore, native FP8 precision training technology is extremely immature. We will observe the technological developments in the open-source community. \ No newline at end of file diff --git a/docs/en/README.md b/docs/en/README.md new file mode 100644 index 0000000..17c8e88 --- /dev/null +++ b/docs/en/README.md @@ -0,0 +1,88 @@ +# DiffSynth-Studio Documentation + +Welcome to the magical world of Diffusion models! `DiffSynth-Studio` is an open-source Diffusion model engine developed and maintained by the [ModelScope Community](https://www.modelscope.cn/). We aim to build a universal Diffusion model framework that fosters technological innovation through framework construction, aggregates the power of the open-source community, and explores the boundaries of generative model technology! + +
+ +Documentation Reading Guide + +```mermaid +graph LR; + I_want_to_use_models_for_inference_and_training-->sec1[Section 1: Getting Started]; + I_want_to_use_models_for_inference_and_training-->sec2[Section 2: Model Details]; + I_want_to_use_models_for_inference_and_training-->sec3[Section 3: Training Framework]; + I_want_to_develop_based_on_this_framework-->sec3[Section 3: Training Framework]; + I_want_to_develop_based_on_this_framework-->sec4[Section 4: Model Integration]; + I_want_to_develop_based_on_this_framework-->sec5[Section 5: API Reference]; + I_want_to_explore_new_technologies_based_on_this_project-->sec4[Section 4: Model Integration]; + I_want_to_explore_new_technologies_based_on_this_project-->sec5[Section 5: API Reference]; + I_want_to_explore_new_technologies_based_on_this_project-->sec6[Section 6: Academic Guide]; + I_encountered_a_problem-->sec7[Section 7: Frequently Asked Questions]; +``` + +
+ +## Section 1: Getting Started + +This section introduces the basic usage of `DiffSynth-Studio`, including how to enable VRAM management for inference on GPUs with extremely low VRAM, and how to train various base models, LoRAs, ControlNets, and other models. + +* [Installation Dependencies](/docs/en/Pipeline_Usage/Setup.md) +* [Model Inference](/docs/en/Pipeline_Usage/Model_Inference.md) +* [VRAM Management](/docs/en/Pipeline_Usage/VRAM_management.md) +* [Model Training](/docs/en/Pipeline_Usage/Model_Training.md) +* [Environment Variables](/docs/en/Pipeline_Usage/Environment_Variables.md) + +## Section 2: Model Details + +This section introduces the Diffusion models supported by `DiffSynth-Studio`. Some model pipelines feature special functionalities such as controllable generation and parallel acceleration. + +* [FLUX.1](/docs/en/Model_Details/FLUX.md) +* [Wan](/docs/en/Model_Details/Wan.md) +* [Qwen-Image](/docs/en/Model_Details/Qwen-Image.md) +* [FLUX.2](/docs/en/Model_Details/FLUX2.md) +* [Z-Image](/docs/en/Model_Details/Z-Image.md) + +## Section 3: Training Framework + +This section introduces the design philosophy of the training framework in `DiffSynth-Studio`, helping developers understand the principles of Diffusion model training algorithms. 
+ +* [Basic Principles of Diffusion Models](/docs/en/Training/Understanding_Diffusion_models.md) +* [Standard Supervised Training](/docs/en/Training/Supervised_Fine_Tuning.md) +* [Enabling FP8 Precision in Training](/docs/en/Training/FP8_Precision.md) +* [End-to-End Distillation Accelerated Training](/docs/en/Training/Direct_Distill.md) +* [Two-Stage Split Training](/docs/en/Training/Split_Training.md) +* [Differential LoRA Training](/docs/en/Training/Differential_LoRA.md) + +## Section 4: Model Integration + +This section introduces how to integrate models into `DiffSynth-Studio` to utilize the framework's basic functions, helping developers provide support for new models in this project or perform inference and training of private models. + +* [Integrating Model Architecture](/docs/en/Developer_Guide/Integrating_Your_Model.md) +* [Building a Pipeline](/docs/en/Developer_Guide/Building_a_Pipeline.md) +* [Enabling Fine-Grained VRAM Management](/docs/en/Developer_Guide/Enabling_VRAM_management.md) +* [Model Training Integration](/docs/en/Developer_Guide/Training_Diffusion_Models.md) + +## Section 5: API Reference + +This section introduces the independent core module `diffsynth.core` in `DiffSynth-Studio`, explaining how internal functions are designed and operate. Developers can use these functional modules in other codebase developments if needed. 
+ +* [`diffsynth.core.attention`](/docs/en/API_Reference/core/attention.md): Attention mechanism implementation +* [`diffsynth.core.data`](/docs/en/API_Reference/core/data.md): Data processing operators and general datasets +* [`diffsynth.core.gradient`](/docs/en/API_Reference/core/gradient.md): Gradient checkpointing +* [`diffsynth.core.loader`](/docs/en/API_Reference/core/loader.md): Model download and loading +* [`diffsynth.core.vram`](/docs/en/API_Reference/core/vram.md): VRAM management + +## Section 6: Academic Guide + +This section introduces how to use `DiffSynth-Studio` to train new models, helping researchers explore new model technologies. + +* Training models from scratch 【coming soon】 +* Inference improvement techniques 【coming soon】 +* Designing controllable generation models 【coming soon】 +* Creating new training paradigms 【coming soon】 + +## Section 7: Frequently Asked Questions + +This section summarizes common developer questions. If you encounter issues during usage or development, please refer to this section. If you still cannot resolve the problem, please submit an issue on GitHub. + +* [Frequently Asked Questions](/docs/en/QA.md) \ No newline at end of file diff --git a/docs/en/Training/Differential_LoRA.md b/docs/en/Training/Differential_LoRA.md new file mode 100644 index 0000000..febe507 --- /dev/null +++ b/docs/en/Training/Differential_LoRA.md @@ -0,0 +1,38 @@ +# Differential LoRA Training + +Differential LoRA training is a special form of LoRA training designed to enable models to learn differences between images. + +## Training Approach + +We were unable to identify the original proposer of differential LoRA training, as this technique has been circulating in the open-source community for a long time. + +Assume we have two similar-content images: Image 1 and Image 2. For example, both images contain a car, but Image 1 has fewer details while Image 2 has more details. 
In differential LoRA training, we perform two-step training: + +* Train LoRA 1 using Image 1 as training data with [standard supervised training](/docs/en/Training/Supervised_Fine_Tuning.md) +* Train LoRA 2 using Image 2 as training data, after integrating LoRA 1 into the base model, with [standard supervised training](/docs/en/Training/Supervised_Fine_Tuning.md) + +In the first training step, since there is only one training image, the LoRA model easily overfits. Therefore, after training, LoRA 1 will cause the model to generate Image 1 without hesitation, regardless of the random seed. In the second training step, the LoRA model overfits again. Thus, after training, with the combined effect of LoRA 1 and LoRA 2, the model will generate Image 2 without hesitation. In short: + +* LoRA 1 = Generate Image 1 +* LoRA 1 + LoRA 2 = Generate Image 2 + +At this point, discarding LoRA 1 and using only LoRA 2, the model will understand the difference between Image 1 and Image 2, making the generated content tend toward "less like Image 1, more like Image 2." + +Training on a single data sample ensures that the model overfits to the training data, but this approach lacks stability. To improve stability, we can train with multiple image pairs and average the trained LoRA 2 models to obtain a more stable LoRA. + +Using this training approach, some functionally unique LoRA models can be trained. For example, using ugly and beautiful image pairs to train LoRAs that enhance image aesthetics; using low-detail and high-detail image pairs to train LoRAs that increase image detail. + +## Model Effects + +We have trained several aesthetic enhancement LoRAs using differential LoRA training techniques. You can visit the corresponding model pages to view the generation effects.
+ +* [DiffSynth-Studio/Qwen-Image-LoRA-ArtAug-v1](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-LoRA-ArtAug-v1) +* [DiffSynth-Studio/ArtAug-lora-FLUX.1dev-v1](https://modelscope.cn/models/DiffSynth-Studio/ArtAug-lora-FLUX.1dev-v1) + +## Using Differential LoRA Training in the Training Framework + +The first step of training is identical to ordinary LoRA training. In the second step's training command, fill in the path of the first step's LoRA model file through the `--preset_lora_path` parameter, and set `--preset_lora_model` to the same parameters as `lora_base_model` to load LoRA 1 into the base model. + +## Framework Design Concept + +In the training framework, the model pointed to by `--preset_lora_path` is loaded in the `switch_pipe_to_training_mode` of `DiffusionTrainingModule`. \ No newline at end of file diff --git a/docs/en/Training/Direct_Distill.md b/docs/en/Training/Direct_Distill.md new file mode 100644 index 0000000..4cbeb59 --- /dev/null +++ b/docs/en/Training/Direct_Distill.md @@ -0,0 +1,97 @@ +# End-to-End Distillation Accelerated Training + +## Distillation Accelerated Training + +The inference process of Diffusion models typically requires multi-step iterations, which improves generation quality but also makes the generation process slow. Through distillation accelerated training, the number of steps required to generate clear content can be reduced. The essence of distillation accelerated training technology is to align the generation effects of a small number of steps with those of a large number of steps. 
+ +There are diverse methods for distillation accelerated training, such as: + +* Adversarial training ADD (Adversarial Diffusion Distillation) + * Paper: https://arxiv.org/abs/2311.17042 + * Model: [stabilityai/sdxl-turbo](https://modelscope.cn/models/stabilityai/sdxl-turbo) +* Progressive training Hyper-SD + * Paper: https://arxiv.org/abs/2404.13686 + * Model: [ByteDance/Hyper-SD](https://www.modelscope.cn/models/ByteDance/Hyper-SD) + +## Direct Distillation + +At the framework level, supporting these distillation accelerated training schemes is extremely difficult. In the design of the training framework, we need to ensure that the training scheme meets the following conditions: + +* Generality: The training scheme applies to most Diffusion models supported within the framework, rather than only working for a specific model, which is a basic requirement for code framework construction. +* Stability: The training scheme must ensure stable training effects without requiring manual fine-tuning of parameters. Adversarial training in ADD cannot guarantee stability. +* Simplicity: The training scheme does not introduce additional complex modules. According to Occam's Razor principle, complex solutions may introduce potential risks. The Human Feedback Learning in Hyper-SD makes the training process overly complex. + +Therefore, in the training framework of `DiffSynth-Studio`, we designed an end-to-end distillation accelerated training scheme, which we call Direct Distillation. The pseudocode for the training process is as follows: + +``` +seed = xxx +with torch.no_grad(): + image_1 = pipe(prompt, steps=50, seed=seed, cfg=4) +image_2 = pipe(prompt, steps=4, seed=seed, cfg=1) +loss = torch.nn.functional.mse_loss(image_1, image_2) +``` + +Yes, it's a very end-to-end training scheme that produces immediate results with minimal training. 
+ +## Models Trained with Direct Distillation + +We trained two models based on Qwen-Image using this scheme: + +* [DiffSynth-Studio/Qwen-Image-Distill-Full](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-Full): Full distillation training +* [DiffSynth-Studio/Qwen-Image-Distill-LoRA](https://modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Distill-LoRA): LoRA distillation training + +Click on the model links to go to the model pages and view the model effects. + +## Using Distillation Accelerated Training in the Training Framework + +First, you need to generate training data. Please refer to the [Model Inference](/docs/en/Pipeline_Usage/Model_Inference.md) section to write inference code and generate training data with a sufficient number of inference steps. + +Taking Qwen-Image as an example, the following code can generate an image: + +```python +from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig +import torch + +pipe = QwenImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), + ], + tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), +) +prompt = "精致肖像,水下少女,蓝裙飘逸,发丝轻扬,光影透澈,气泡环绕,面容恬静,细节精致,梦幻唯美。" +image = pipe(prompt, seed=0, num_inference_steps=40) +image.save("image.jpg") +``` + +Then, we compile the necessary information into [metadata files](/docs/en/API_Reference/core/data.md#metadata): + +```csv +image,prompt,seed,rand_device,num_inference_steps,cfg_scale +distill_qwen/image.jpg,"精致肖像,水下少女,蓝裙飘逸,发丝轻扬,光影透澈,气泡环绕,面容恬静,细节精致,梦幻唯美。",0,cpu,4,1 +``` + +This sample dataset can be downloaded directly: + +```shell +modelscope download 
--dataset DiffSynth-Studio/example_image_dataset --local_dir ./data/example_image_dataset +``` + +Then start LoRA distillation accelerated training: + +```shell +bash examples/qwen_image/model_training/lora/Qwen-Image-Distill-LoRA.sh +``` + +Please note that in the [training script parameters](/docs/en/Pipeline_Usage/Model_Training.md#script-parameters), the image resolution setting for the dataset should avoid triggering scaling processing. When setting `--height` and `--width` to enable fixed resolution, all training data must be generated with exactly the same width and height. When setting `--max_pixels` to enable dynamic resolution, the value of `--max_pixels` must be greater than or equal to the pixel area of any training image. + +## Framework Design Concept + +Compared to [Standard Supervised Training](/docs/en/Training/Supervised_Fine_Tuning.md), Direct Distillation only differs in the training loss function. The loss function for Direct Distillation is `DirectDistillLoss` in `diffsynth.diffusion.loss`. + +## Future Work + +Direct Distillation is a highly general acceleration scheme, but it may not be the best-performing scheme. Therefore, we have not yet published this technology in paper form. We hope to leave this problem to the academic and open-source communities to solve together, and we look forward to developers providing more complete general training schemes. \ No newline at end of file diff --git a/docs/en/Training/FP8_Precision.md b/docs/en/Training/FP8_Precision.md new file mode 100644 index 0000000..5f23abb --- /dev/null +++ b/docs/en/Training/FP8_Precision.md @@ -0,0 +1,20 @@ +# Enabling FP8 Precision in Training + +Although `DiffSynth-Studio` supports [VRAM management](/docs/en/Pipeline_Usage/VRAM_management.md) in model inference, most of the techniques for reducing VRAM usage are not suitable for training. Offloading would cause extremely slow training processes. 
+ +FP8 precision is the only VRAM management strategy that can be enabled during training. However, this framework currently does not support native FP8 precision training. For reasons, see [Q&A: Why doesn't the training framework support native FP8 precision training?](/docs/en/QA.md#why-doesnt-the-training-framework-support-native-fp8-precision-training). It only supports storing models whose parameters are not updated by gradients (models that do not require gradient backpropagation, or whose gradients only update their LoRA) in FP8 precision. + +## Enabling FP8 + +In our provided training scripts, you can quickly set models to be stored in FP8 precision through the `--fp8_models` parameter. Taking Qwen-Image LoRA training as an example, we provide a script for enabling FP8 training located at [`/examples/qwen_image/model_training/special/fp8_training/Qwen-Image-LoRA.sh`](/examples/qwen_image/model_training/special/fp8_training/Qwen-Image-LoRA.sh). After training is completed, you can verify the training results with the script [`/examples/qwen_image/model_training/special/fp8_training/validate.py`](/examples/qwen_image/model_training/special/fp8_training/validate.py). + +Please note that this FP8 VRAM management strategy does not support gradient updates. When a model is set to be trainable, FP8 precision cannot be enabled for that model. Models that support FP8 include two types: + +* Parameters are not trainable, such as VAE models +* Gradients do not update their parameters, such as DiT models in LoRA training + +Experimental verification shows that LoRA training with FP8 enabled does not cause significant image quality degradation. However, theoretical errors do exist. If you encounter training results inferior to BF16 precision training when using this feature, please provide feedback through GitHub issues. 
+ +## Training Framework Design Concept + +The training framework completely reuses the inference VRAM management, and only parses VRAM management configurations through `parse_model_configs` in `DiffusionTrainingModule` during training. \ No newline at end of file diff --git a/docs/en/Training/Split_Training.md b/docs/en/Training/Split_Training.md new file mode 100644 index 0000000..07068d2 --- /dev/null +++ b/docs/en/Training/Split_Training.md @@ -0,0 +1,97 @@ +# Two-Stage Split Training + +This document introduces split training, which can automatically divide the training process into two stages, reducing VRAM usage while accelerating training speed. + +(Split training is an experimental feature that has not yet undergone large-scale validation. If you encounter any issues while using it, please submit an issue on GitHub.) + +## Split Training + +In the training process of most models, a large amount of computation occurs in "preprocessing," i.e., "computations unrelated to the denoising model," including VAE encoding, text encoding, etc. When the corresponding model parameters are fixed, the results of these computations are repetitive. For each data sample, the computational results are identical across multiple epochs. Therefore, we provide a "split training" feature that can automatically analyze and split the training process. + +For standard supervised training of ordinary text-to-image models, the splitting process is straightforward. It only requires splitting the computation of all [`Pipeline Units`](/docs/en/Developer_Guide/Building_a_Pipeline.md#units) into the first stage, storing the computational results to disk, and then reading these results from disk in the second stage for subsequent computations. However, if gradient backpropagation is required during preprocessing, the situation becomes extremely complex. To address this, we introduced a computational graph splitting algorithm to analyze how to split the computation. 
+ +## Computational Graph Splitting Algorithm + +> (We will supplement the detailed specifics of the computational graph splitting algorithm in future document updates) + +## Using Split Training + +Split training already supports [Standard Supervised Training](/docs/en/Training/Supervised_Fine_Tuning.md) and [Direct Distillation Training](/docs/en/Training/Direct_Distill.md). The `--task` parameter in the training command controls this. Taking LoRA training of the Qwen-Image model as an example, the pre-split training command is: + +```shell +accelerate launch examples/qwen_image/model_training/train.py \ + --dataset_base_path data/example_image_dataset \ + --dataset_metadata_path data/example_image_dataset/metadata.csv \ + --max_pixels 1048576 \ + --dataset_repeat 50 \ + --model_id_with_origin_paths "Qwen/Qwen-Image:transformer/diffusion_pytorch_model*.safetensors,Qwen/Qwen-Image:text_encoder/model*.safetensors,Qwen/Qwen-Image:vae/diffusion_pytorch_model.safetensors" \ + --learning_rate 1e-4 \ + --num_epochs 5 \ + --remove_prefix_in_ckpt "pipe.dit." 
\ + --output_path "./models/train/Qwen-Image_lora" \ + --lora_base_model "dit" \ + --lora_target_modules "to_q,to_k,to_v,add_q_proj,add_k_proj,add_v_proj,to_out.0,to_add_out,img_mlp.net.2,img_mod.1,txt_mlp.net.2,txt_mod.1" \ + --lora_rank 32 \ + --use_gradient_checkpointing \ + --dataset_num_workers 8 \ + --find_unused_parameters +``` + +After splitting, in the first stage, make the following modifications: + +* Change `--dataset_repeat` to 1 to avoid redundant computation +* Change `--output_path` to the path where the first-stage computation results are saved +* Add the additional parameter `--task "sft:data_process"` +* Remove the DiT model from `--model_id_with_origin_paths` + +```shell +accelerate launch examples/qwen_image/model_training/train.py \ + --dataset_base_path data/example_image_dataset \ + --dataset_metadata_path data/example_image_dataset/metadata.csv \ + --max_pixels 1048576 \ + --dataset_repeat 1 \ + --model_id_with_origin_paths "Qwen/Qwen-Image:text_encoder/model*.safetensors,Qwen/Qwen-Image:vae/diffusion_pytorch_model.safetensors" \ + --learning_rate 1e-4 \ + --num_epochs 5 \ + --remove_prefix_in_ckpt "pipe.dit." 
\ + --output_path "./models/train/Qwen-Image-LoRA-splited-cache" \ + --lora_base_model "dit" \ + --lora_target_modules "to_q,to_k,to_v,add_q_proj,add_k_proj,add_v_proj,to_out.0,to_add_out,img_mlp.net.2,img_mod.1,txt_mlp.net.2,txt_mod.1" \ + --lora_rank 32 \ + --use_gradient_checkpointing \ + --dataset_num_workers 8 \ + --find_unused_parameters \ + --task "sft:data_process" +``` + +In the second stage, make the following modifications: + +* Change `--dataset_base_path` to the `--output_path` of the first stage +* Remove `--dataset_metadata_path` +* Add the additional parameter `--task "sft:train"` +* Remove the Text Encoder and VAE models from `--model_id_with_origin_paths` + +```shell +accelerate launch examples/qwen_image/model_training/train.py \ + --dataset_base_path "./models/train/Qwen-Image-LoRA-splited-cache" \ + --max_pixels 1048576 \ + --dataset_repeat 50 \ + --model_id_with_origin_paths "Qwen/Qwen-Image:transformer/diffusion_pytorch_model*.safetensors" \ + --learning_rate 1e-4 \ + --num_epochs 5 \ + --remove_prefix_in_ckpt "pipe.dit." \ + --output_path "./models/train/Qwen-Image-LoRA-splited" \ + --lora_base_model "dit" \ + --lora_target_modules "to_q,to_k,to_v,add_q_proj,add_k_proj,add_v_proj,to_out.0,to_add_out,img_mlp.net.2,img_mod.1,txt_mlp.net.2,txt_mod.1" \ + --lora_rank 32 \ + --use_gradient_checkpointing \ + --dataset_num_workers 8 \ + --find_unused_parameters \ + --task "sft:train" +``` + +We provide sample training scripts and validation scripts located at `examples/qwen_image/model_training/special/split_training`. + +## Training Framework Design Concept + +The training framework splits the computational units in the `Pipeline` through the `split_pipeline_units` method of `DiffusionTrainingModule`. 
\ No newline at end of file diff --git a/docs/en/Training/Supervised_Fine_Tuning.md b/docs/en/Training/Supervised_Fine_Tuning.md new file mode 100644 index 0000000..fd29c10 --- /dev/null +++ b/docs/en/Training/Supervised_Fine_Tuning.md @@ -0,0 +1,129 @@ +# Standard Supervised Training + +After understanding the [Basic Principles of Diffusion Models](/docs/en/Training/Understanding_Diffusion_models.md), this document introduces how the framework implements Diffusion model training. This document explains the framework's principles to help developers write new training code. If you want to use our provided default training functions, please refer to [Model Training](/docs/en/Pipeline_Usage/Model_Training.md). + +Recalling the model training pseudocode from earlier, when we actually write code, the situation becomes extremely complex. Some models require additional guidance conditions and preprocessing, such as ControlNet; some models require cross-computation with the denoising model, such as VACE; some models require Gradient Checkpointing due to excessive VRAM demands, such as Qwen-Image's DiT. + +To achieve strict consistency between inference and training, we abstractly encapsulate components like `Pipeline`, reusing inference code extensively during training. Please refer to [Integrating Pipeline](/docs/en/Developer_Guide/Building_a_Pipeline.md) to understand the design of `Pipeline` components. Next, we'll introduce how the training framework utilizes `Pipeline` components to build training algorithms. + +## Framework Design Concept + +The training module is encapsulated on top of the `Pipeline`, inheriting `DiffusionTrainingModule` from `diffsynth.diffusion.training_module`. We need to provide the necessary `__init__` and `forward` methods for the training module. 
Taking Qwen-Image's LoRA training as an example, we provide a simple script containing only basic training functions in `examples/qwen_image/model_training/special/simple/train.py` to help developers understand the design concept of the training module. + +```python +class QwenImageTrainingModule(DiffusionTrainingModule): + def __init__(self, device): + # Initialize models here. + pass + + def forward(self, data): + # Compute loss here. + return loss +``` + +### `__init__` + +In `__init__`, model initialization is required. First load the model, then switch it to training mode. + +```python + def __init__(self, device): + super().__init__() + # Load the pipeline + self.pipe = QwenImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device=device, + model_configs=[ + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"), + ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), + ], + tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"), + ) + # Switch to training mode + self.switch_pipe_to_training_mode( + self.pipe, + lora_base_model="dit", + lora_target_modules="to_q,to_k,to_v,add_q_proj,add_k_proj,add_v_proj", + lora_rank=32, + ) +``` + +The logic for loading models is basically consistent with inference, supporting loading models from remote and local paths. See [Model Inference](/docs/en/Pipeline_Usage/Model_Inference.md) for details, but please note not to enable [VRAM Management](/docs/en/Pipeline_Usage/VRAM_management.md). + +`switch_pipe_to_training_mode` can switch the model to training mode. See `switch_pipe_to_training_mode` for details. + +### `forward` + +In `forward`, the loss function value needs to be calculated. 
First perform preprocessing, then compute the loss function through the `Pipeline`'s [`model_fn`](/docs/en/Developer_Guide/Building_a_Pipeline.md#model_fn). + +```python + def forward(self, data): + # Preprocess + inputs_posi = {"prompt": data["prompt"]} + inputs_nega = {"negative_prompt": ""} + inputs_shared = { + # Assume you are using this pipeline for inference, + # please fill in the input parameters. + "input_image": data["image"], + "height": data["image"].size[1], + "width": data["image"].size[0], + # Please do not modify the following parameters + # unless you clearly know what this will cause. + "cfg_scale": 1, + "rand_device": self.pipe.device, + "use_gradient_checkpointing": True, + "use_gradient_checkpointing_offload": False, + } + for unit in self.pipe.units: + inputs_shared, inputs_posi, inputs_nega = self.pipe.unit_runner(unit, self.pipe, inputs_shared, inputs_posi, inputs_nega) + # Loss + loss = FlowMatchSFTLoss(self.pipe, **inputs_shared, **inputs_posi) + return loss +``` + +The preprocessing process is consistent with the inference phase. Developers only need to assume they are using the `Pipeline` for inference and fill in the input parameters. + +The loss function calculation reuses `FlowMatchSFTLoss` from `diffsynth.diffusion.loss`. 
+ +### Starting Training + +The training framework requires other modules, including: + +* accelerator: Training launcher provided by `accelerate`, see [`accelerate`](https://huggingface.co/docs/accelerate/index) for details +* dataset: Generic dataset, see [`diffsynth.core.data`](/docs/en/API_Reference/core/data.md) for details +* model_logger: Model logger, see `diffsynth.diffusion.logger` for details + +```python +if __name__ == "__main__": + accelerator = accelerate.Accelerator( + kwargs_handlers=[accelerate.DistributedDataParallelKwargs(find_unused_parameters=True)], + ) + dataset = UnifiedDataset( + base_path="data/example_image_dataset", + metadata_path="data/example_image_dataset/metadata.csv", + repeat=50, + data_file_keys="image", + main_data_operator=UnifiedDataset.default_image_operator( + base_path="data/example_image_dataset", + height=512, + width=512, + height_division_factor=16, + width_division_factor=16, + ) + ) + model = QwenImageTrainingModule(accelerator.device) + model_logger = ModelLogger( + output_path="models/toy_model", + remove_prefix_in_ckpt="pipe.dit.", + ) + launch_training_task( + accelerator, dataset, model, model_logger, + learning_rate=1e-5, num_epochs=1, + ) +``` + +Assembling all the above code results in `examples/qwen_image/model_training/special/simple/train.py`. Use the following command to start training: + +``` +accelerate launch examples/qwen_image/model_training/special/simple/train.py +``` \ No newline at end of file diff --git a/docs/en/Training/Understanding_Diffusion_models.md b/docs/en/Training/Understanding_Diffusion_models.md new file mode 100644 index 0000000..5c81b6a --- /dev/null +++ b/docs/en/Training/Understanding_Diffusion_models.md @@ -0,0 +1,145 @@ +# Basic Principles of Diffusion Models + +This document introduces the basic principles of Diffusion models to help you understand how the training framework is constructed. 
To make these complex mathematical theories easier for readers to understand, we have reconstructed the theoretical framework of Diffusion models, abandoning complex stochastic differential equations and presenting them in a more concise and understandable form. + +## Introduction + +Diffusion models generate clear images or video content through iterative denoising. We start by explaining the generation process of a data sample $x_0$. Intuitively, in a complete round of denoising, we start from random Gaussian noise $x_T$ and iteratively obtain $x_{T-1}$, $x_{T-2}$, $x_{T-3}$, $\cdots$, gradually reducing the noise content at each step until we finally obtain the noise-free data sample $x_0$. + +(Figure) + +This process is intuitive, but to understand the details, we need to answer several questions: + +* How is the noise content at each step defined? +* How is the iterative denoising computation performed? +* How to train such Diffusion models? +* What is the architecture of modern Diffusion models? +* How does this project encapsulate and implement model training? + +## How is the noise content at each step defined? + +In the theoretical system of Diffusion models, the noise content is determined by a series of parameters $\sigma_T$, $\sigma_{T-1}$, $\sigma_{T-2}$, $\cdots$, $\sigma_0$. Where: + +* $\sigma_T=1$, corresponding to $x_T$ as pure Gaussian noise +* $\sigma_T>\sigma_{T-1}>\sigma_{T-2}>\cdots>x_0$, the noise content gradually decreases during iteration +* $\sigma_0=0$, corresponding to $x_0$ as a data sample without any noise + +As for the intermediate values $\sigma_{T-1}$, $\sigma_{T-2}$, $\cdots$, $\sigma_1$, they are not fixed and only need to satisfy the decreasing condition. + +At an intermediate step, we can directly synthesize noisy data samples $x_t=(1-\sigma_t)x_0+\sigma_t x_T$. + +(Figure) + +## How is the iterative denoising computation performed? 
+ +Before understanding the iterative denoising computation, we need to clarify what the input and output of the denoising model are. We abstract the model as a symbol $\hat \epsilon$, whose input typically consists of three parts: + +* Time step $t$, the model needs to understand which stage of the denoising process it is currently in +* Noisy data sample $x_t$, the model needs to understand what data to denoise +* Guidance condition $c$, the model needs to understand what kind of data sample to generate through denoising + +Among these, the guidance condition $c$ is a newly introduced parameter that is input by the user. It can be text describing the image content or a sketch outlining the image structure. + +(Figure) + +The model's output $\hat \epsilon(x_t,c,t)$ approximately equals $x_T-x_0$, which is the direction of the entire diffusion process (the reverse process of denoising). + +Next, we analyze the computation occurring in one iteration. At time step $t$, after the model computes an approximation of $x_T-x_0$, we calculate the next $x_{t-1}$: + +$$ +\begin{aligned} +x_{t-1}&=x_t + (\sigma_{t-1} - \sigma_t) \cdot \hat \epsilon(x_t,c,t)\\ +&\approx x_t + (\sigma_{t-1} - \sigma_t) \cdot (x_T-x_0)\\ +&=(1-\sigma_t)x_0+\sigma_t x_T + (\sigma_{t-1} - \sigma_t) \cdot (x_T-x_0)\\ +&=(1-\sigma_{t-1})x_0+\sigma_{t-1}x_T +\end{aligned} +$$ + +Perfect! It perfectly matches the noise content definition at time step $t-1$. + +> (This part might be a bit difficult to understand. Don't worry; it's recommended to skip this part on first reading without affecting the rest of the document.) +> +> After completing this somewhat complex formula derivation, let's consider a question: why should the model's output approximately equal $x_T-x_0$? Can it be set to other values? +> +> Actually, Diffusion models rely on two definitions to form a complete theory. 
From the above formulas, we can extract these two definitions and derive the iterative formula: +> +> * Data definition: $x_t=(1-\sigma_t)x_0+\sigma_t x_T$ +> * Model definition: $\hat \epsilon(x_t,c,t)=x_T-x_0$ +> * Derived iterative formula: $x_{t-1}=x_t + (\sigma_{t-1} - \sigma_t) \cdot \hat \epsilon(x_t,c,t)$ +> +> These three mathematical formulas are complete. For example, in the previous derivation, substituting the data definition and model definition into the iterative formula yields $x_{t-1}$ that matches the data definition. +> +> These are two definitions built on Flow Matching theory, but Diffusion models can also be implemented with other definitions. For example, early models based on DDPM (Denoising Diffusion Probabilistic Models) have their two definitions and derived iterative formulas as: +> +> * Data definition: $x_t=\sqrt{\alpha_t}x_0+\sqrt{1-\alpha_t}x_T$ +> * Model definition: $\hat \epsilon(x_t,c,t)=x_T$ +> * Derived iterative formula: $x_{t-1}=\sqrt{\alpha_{t-1}}\left(\frac{x_t-\sqrt{1-\alpha_t}\hat \epsilon(x_t,c,t)}{\sqrt{\sigma_t}}\right)+\sqrt{1-\alpha_{t-1}}\hat \epsilon(x_t,c,t)$ +> +> More generally, we describe the derivation process of the iterative formula using matrices. For any data definition and model definition: +> +> * Data definition: $x_t=C_T(x_0,x_T)^T$ +> * Model definition: $\hat \epsilon(x_t,c,t)=C_T^{[\epsilon]}(x_0,x_T)^T$ +> * Derived iterative formula: $x_{t-1}=C_{t-1}(C_t,C_t^{[\epsilon]})^{-T}(x_t,\hat \epsilon(x_t,c,t))^T$ +> +> Where $C_t$ and $C_t^{[\epsilon]}$ are $1\times 2$ coefficient matrices. It's not difficult to see that when constructing the two definitions, the matrix $(C_t,C_t^{[\epsilon]})^T$ must be invertible. +> +> Although Flow Matching and DDPM have been widely verified by numerous pre-trained models, this doesn't mean they are optimal solutions. We encourage developers to design new Diffusion model theories for better training results. + +## How to train such Diffusion models? 
+ +After understanding the iterative denoising process, we next consider how to train such Diffusion models. + +The training process differs from the generation process. If we retain multi-step iterations during training, the gradient would need to backpropagate through multiple steps, bringing catastrophic time and space complexity. To improve computational efficiency, we randomly select a time step $t$ for training. + +(Figure) + +The following is pseudocode for the training process: + +> Obtain data sample $x_0$ and guidance condition $c$ from the dataset +> +> Randomly sample time step $t\in(0,T]$ +> +> Randomly sample Gaussian noise $x_T\in \mathcal N(O,I)$ +> +> $x_t=(1-\sigma_t)x_0+\sigma_t x_T$ +> +> $\hat \epsilon(x_t,c,t)$ +> +> Loss function $\mathcal L=||\hat \epsilon(x_t,c,t)-(x_T-x_0)||_2^2$ +> +> Backpropagate gradients and update model parameters + +## What is the architecture of modern Diffusion models? + +From theory to practice, more details need to be filled in. Modern Diffusion model architectures have matured, with mainstream architectures following the "three-stage" architecture proposed by Latent Diffusion, including data encoder-decoder, guidance condition encoder, and denoising model. + +(Figure) + +### Data Encoder-Decoder + +In the previous text, we consistently referred to $x_0$ as a "data sample" rather than an image or video because modern Diffusion models typically don't process images or videos directly. Instead, they use an Encoder-Decoder architecture model, usually a VAE (Variational Auto-Encoders) model, to encode images or videos into Embedding tensors, obtaining $x_0$. + +After data is encoded by the encoder and then decoded by the decoder, the reconstructed content is approximately consistent with the original, with minor errors. So why process on the encoded Embedding tensor instead of directly on images or videos? 
The main reasons are twofold: + +* Encoding compresses the data simultaneously, reducing computational load during processing. +* Encoded data distribution is more similar to Gaussian distribution, making it easier for denoising models to model the data. + +During generation, the encoder part doesn't participate in computation. After iteration completes, the decoder part decodes $x_0$ to obtain clear images or videos. During training, the decoder part doesn't participate in computation; only the encoder is used to compute $x_0$. + +### Guidance Condition Encoder + +User-input guidance conditions $c$ can be complex and diverse, requiring specialized encoder models to process them into Embedding tensors. According to the type of guidance condition, we classify guidance condition encoders into the following categories: + +* Text type, such as CLIP, Qwen-VL +* Image type, such as ControlNet, IP-Adapter +* Video type, such as VAE + +> The model $\hat \epsilon$ mentioned in the previous text refers to the entirety of all guidance condition encoders and the denoising model. We list guidance condition encoders separately because these models are typically frozen during Diffusion training, and their output values are independent of time step $t$, allowing guidance condition encoder computations to be performed offline. + +### Denoising Model + +The denoising model is the true essence of Diffusion models, with diverse model structures such as UNet and DiT. Model developers can freely innovate on these structures. + +## How does this project encapsulate and implement model training? 
+ +Please read the next document: [Standard Supervised Training](/docs/en/Training/Supervised_Fine_Tuning.md) \ No newline at end of file diff --git a/docs/zh/Model_Details/Z-Image.md b/docs/zh/Model_Details/Z-Image.md index 4bd92a6..2aea18b 100644 --- a/docs/zh/Model_Details/Z-Image.md +++ b/docs/zh/Model_Details/Z-Image.md @@ -128,4 +128,4 @@ modelscope download --dataset DiffSynth-Studio/example_image_dataset --local_dir * 差分 LoRA 训练([code](/examples/z_image/model_training/special/differential_training/)) + 加速配置推理 * 差分 LoRA 训练中需加载一个额外的 LoRA,例如 [ostris/zimage_turbo_training_adapter](https://www.modelscope.cn/models/ostris/zimage_turbo_training_adapter) * 标准 SFT 训练([code](/examples/z_image/model_training/lora/Z-Image-Turbo.sh))+ 轨迹模仿蒸馏训练([code](/examples/z_image/model_training/special/trajectory_imitation/))+ 加速配置推理 - * 标准 SFT 训练([code](/examples/z_image/model_training/lora/Z-Image-Turbo.sh))+ 推理时加载蒸馏加速 LoRA([link](https://www.modelscope.cn/models/DiffSynth-Studio/Z-Image-Turbo-DistillFix)) + 加速配置推理 + * 标准 SFT 训练([code](/examples/z_image/model_training/lora/Z-Image-Turbo.sh))+ 推理时加载蒸馏加速 LoRA([model](https://www.modelscope.cn/models/DiffSynth-Studio/Z-Image-Turbo-DistillFix)) + 加速配置推理 diff --git a/examples/dev_tools/fix_path.py b/examples/dev_tools/fix_path.py index 1e0f000..0e6ef24 100644 --- a/examples/dev_tools/fix_path.py +++ b/examples/dev_tools/fix_path.py @@ -13,24 +13,24 @@ def get_files(files, path): elif path.endswith(".md"): files.append(path) +def fix_path(doc_root_path): + files = [] + get_files(files, doc_root_path) + file_map = {} + for file in files: + name = file.split("/")[-1] + file_map[name] = "/" + file -test_str = read_file("docs/zh/API_Reference/core/attention.md") -files = [] -get_files(files, "docs/zh") -file_map = {} -for file in files: - name = file.split("/")[-1] - file_map[name] = "/" + file + pattern = re.compile(r'\]\([^)]*\.md') + for file in files: + context = read_file(file) + matches = pattern.findall(context) + + for match in 
matches: + target = "](" + file_map[match.split("/")[-1].replace("](", "")] + context = context.replace(match, target) + print(match, target) + + with open(file, "w", encoding="utf-8") as f: + f.write(context) -pattern = re.compile(r'\]\([^)]*\.md') -for file in files: - context = read_file(file) - matches = pattern.findall(context) - - for match in matches: - target = "](" + file_map[match.split("/")[-1].replace("](", "")] - context = context.replace(match, target) - print(match, target) - - with open(file, "w", encoding="utf-8") as f: - f.write(context)