diff --git a/README.md b/README.md index 1fd1ed0..650f090 100644 --- a/README.md +++ b/README.md @@ -883,7 +883,7 @@ Example code for Wan is available at: [/examples/wanvideo/](/examples/wanvideo/) Quick Start -Running the following code will quickly load the [baidu/ERNIE-Image](https://www.modelscope.cn/models/baidu/ERNIE-Image) model and perform inference. VRAM management is enabled, and the framework will automatically control the loading of model parameters based on available VRAM. The model can run with a minimum of 3GB VRAM. +Running the following code will quickly load the [PaddlePaddle/ERNIE-Image](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image) model and perform inference. VRAM management is enabled, and the framework will automatically control the loading of model parameters based on available VRAM. The model can run with a minimum of 3GB VRAM. ```python from diffsynth.pipelines.ernie_image import ErnieImagePipeline, ModelConfig @@ -903,11 +903,11 @@ pipe = ErnieImagePipeline.from_pretrained( torch_dtype=torch.bfloat16, device='cuda', model_configs=[ - ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config), - ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors", **vram_config), - ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors", **vram_config), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), ], - tokenizer_config=ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="tokenizer/"), + tokenizer_config=ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="tokenizer/"), vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, ) @@ -933,7 +933,8 @@ Example code for ERNIE-Image is available at: [/examples/ernie_image/](/examples | Model ID | Inference | Low VRAM Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation | |-|-|-|-|-|-|-| -|[baidu/ERNIE-Image: T2I](https://www.modelscope.cn/models/baidu/ERNIE-Image)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_inference/Ernie-Image-T2I.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_inference_low_vram/Ernie-Image-T2I.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_training/full/Ernie-Image-T2I.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_training/validate_full/Ernie-Image-T2I.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_training/lora/Ernie-Image-T2I.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_training/validate_lora/Ernie-Image-T2I.py)| +|[PaddlePaddle/ERNIE-Image](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image)|[code](/examples/ernie_image/model_inference/ERNIE-Image.py)|[code](/examples/ernie_image/model_inference_low_vram/ERNIE-Image.py)|[code](/examples/ernie_image/model_training/full/ERNIE-Image.sh)|[code](/examples/ernie_image/model_training/validate_full/ERNIE-Image.py)|[code](/examples/ernie_image/model_training/lora/ERNIE-Image.sh)|[code](/examples/ernie_image/model_training/validate_lora/ERNIE-Image.py)| +|[PaddlePaddle/ERNIE-Image-Turbo](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image-Turbo)|[code](/examples/ernie_image/model_inference/ERNIE-Image-Turbo.py)|[code](/examples/ernie_image/model_inference_low_vram/ERNIE-Image-Turbo.py)|—|—|—|—| diff --git a/README_zh.md b/README_zh.md index 29e293a..f52a9b1 100644 --- a/README_zh.md +++ b/README_zh.md @@ -883,7 +883,7 @@ Wan 的示例代码位于:[/examples/wanvideo/](/examples/wanvideo/) 快速开始 -运行以下代码可以快速加载 [baidu/ERNIE-Image](https://www.modelscope.cn/models/baidu/ERNIE-Image) 模型并进行推理。显存管理已启动,框架会自动根据剩余显存控制模型参数的加载,最低 3G 显存即可运行。 +运行以下代码可以快速加载 [PaddlePaddle/ERNIE-Image](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image) 模型并进行推理。显存管理已启动,框架会自动根据剩余显存控制模型参数的加载,最低 3G 显存即可运行。 ```python from diffsynth.pipelines.ernie_image import ErnieImagePipeline, ModelConfig @@ -903,11 +903,11 @@ pipe = ErnieImagePipeline.from_pretrained( torch_dtype=torch.bfloat16, device='cuda', model_configs=[ - ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config), - ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors", **vram_config), - ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors", **vram_config), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), ], - tokenizer_config=ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="tokenizer/"), + tokenizer_config=ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="tokenizer/"), vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, ) @@ -933,7 +933,8 @@ ERNIE-Image 的示例代码位于:[/examples/ernie_image/](/examples/ernie_ima | 模型 ID | 推理 | 低显存推理 | 全量训练 | 全量训练后验证 | LoRA 训练 | LoRA 训练后验证 | |-|-|-|-|-|-|-| -|[baidu/ERNIE-Image: T2I](https://www.modelscope.cn/models/baidu/ERNIE-Image)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_inference/Ernie-Image-T2I.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_inference_low_vram/Ernie-Image-T2I.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_training/full/Ernie-Image-T2I.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_training/validate_full/Ernie-Image-T2I.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_training/lora/Ernie-Image-T2I.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_training/validate_lora/Ernie-Image-T2I.py)| +|[PaddlePaddle/ERNIE-Image](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image)|[code](/examples/ernie_image/model_inference/ERNIE-Image.py)|[code](/examples/ernie_image/model_inference_low_vram/ERNIE-Image.py)|[code](/examples/ernie_image/model_training/full/ERNIE-Image.sh)|[code](/examples/ernie_image/model_training/validate_full/ERNIE-Image.py)|[code](/examples/ernie_image/model_training/lora/ERNIE-Image.sh)|[code](/examples/ernie_image/model_training/validate_lora/ERNIE-Image.py)| +|[PaddlePaddle/ERNIE-Image-Turbo](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image-Turbo)|[code](/examples/ernie_image/model_inference/ERNIE-Image-Turbo.py)|[code](/examples/ernie_image/model_inference_low_vram/ERNIE-Image-Turbo.py)|—|—|—|—| diff --git a/diffsynth/configs/model_configs.py b/diffsynth/configs/model_configs.py index 7428962..de9dbdb 100644 --- a/diffsynth/configs/model_configs.py +++ b/diffsynth/configs/model_configs.py @@ -543,13 +543,13 @@ flux2_series = [ ernie_image_series = [ { - # Example: ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors") + # Example: ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors") "model_hash": "584c13713849f1af4e03d5f1858b8b7b", "model_name": "ernie_image_dit", "model_class": "diffsynth.models.ernie_image_dit.ErnieImageDiT", }, { - # Example: ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors") + # Example: ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors") "model_hash": "404ed9f40796a38dd34c1620f1920207", "model_name": "ernie_image_text_encoder", "model_class": "diffsynth.models.ernie_image_text_encoder.ErnieImageTextEncoder", diff --git a/diffsynth/diffusion/flow_match.py b/diffsynth/diffusion/flow_match.py index 1ac5c49..c4d87cb 100644 --- a/diffsynth/diffusion/flow_match.py +++ b/diffsynth/diffusion/flow_match.py @@ -131,11 +131,14 @@ class FlowMatchScheduler(): return sigmas, timesteps @staticmethod - def set_timesteps_ernie_image(num_inference_steps=50, denoising_strength=1.0): - """ERNIE-Image scheduler: pure linear sigmas from 1.0 to 0.0, no shift.""" + def set_timesteps_ernie_image(num_inference_steps=50, denoising_strength=1.0, shift=3.0): + sigma_min = 0.0 + sigma_max = 1.0 num_train_timesteps = 1000 - sigma_start = denoising_strength - sigmas = torch.linspace(sigma_start, 0.0, num_inference_steps + 1)[:-1] + sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength + sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps + 1)[:-1] + if shift is not None and shift != 1.0: + sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) timesteps = sigmas * num_train_timesteps return sigmas, timesteps @@ -185,9 +188,6 @@ class FlowMatchScheduler(): return sigmas, timesteps def set_training_weight(self): - if self.set_timesteps_fn == FlowMatchScheduler.set_timesteps_ernie_image: - self.set_uniform_training_weight() - return steps = 1000 x = self.timesteps y = torch.exp(-2 * ((x - steps / 2) / steps) ** 2) @@ -199,13 +199,6 @@ class FlowMatchScheduler(): bsmntw_weighing = bsmntw_weighing + bsmntw_weighing[1] self.linear_timesteps_weights = bsmntw_weighing - def set_uniform_training_weight(self): - """Assign equal weight to every timestep, suitable for linear schedulers like ERNIE-Image.""" - steps = 1000 - num_steps = len(self.timesteps) - uniform_weight = torch.full((num_steps,), steps / num_steps, dtype=self.timesteps.dtype) - self.linear_timesteps_weights = uniform_weight - def set_timesteps(self, num_inference_steps=100, denoising_strength=1.0, training=False, **kwargs): self.sigmas, self.timesteps = self.set_timesteps_fn( num_inference_steps=num_inference_steps, diff --git a/diffsynth/models/ernie_image_dit.py b/diffsynth/models/ernie_image_dit.py index fd0e022..1becf78 100644 --- a/diffsynth/models/ernie_image_dit.py +++ b/diffsynth/models/ernie_image_dit.py @@ -2,7 +2,7 @@ Ernie-Image DiT for DiffSynth-Studio. Refactored from diffusers ErnieImageTransformer2DModel to use DiffSynth core modules. -Default parameters from actual checkpoint config.json (baidu/ERNIE-Image transformer). +Default parameters from actual checkpoint config.json (PaddlePaddle/ERNIE-Image transformer). """ import torch diff --git a/diffsynth/pipelines/ernie_image.py b/diffsynth/pipelines/ernie_image.py index a2b411d..0be776e 100644 --- a/diffsynth/pipelines/ernie_image.py +++ b/diffsynth/pipelines/ernie_image.py @@ -46,7 +46,7 @@ class ErnieImagePipeline(BasePipeline): torch_dtype: torch.dtype = torch.bfloat16, device: Union[str, torch.device] = get_device_type(), model_configs: list[ModelConfig] = [], - tokenizer_config: ModelConfig = ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="tokenizer/"), + tokenizer_config: ModelConfig = ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="tokenizer/"), vram_limit: float = None, ): pipe = ErnieImagePipeline(device=device, torch_dtype=torch_dtype) @@ -78,11 +78,12 @@ class ErnieImagePipeline(BasePipeline): rand_device: str = "cuda", # Steps num_inference_steps: int = 50, + sigma_shift: float = 3.0, # Progress bar progress_bar_cmd=tqdm, ): # Scheduler - self.scheduler.set_timesteps(num_inference_steps=num_inference_steps) + self.scheduler.set_timesteps(num_inference_steps=num_inference_steps, shift=sigma_shift) # Parameters inputs_posi = {"prompt": prompt} diff --git a/docs/en/Model_Details/ERNIE-Image.md b/docs/en/Model_Details/ERNIE-Image.md index 601b26c..72683c5 100644 --- a/docs/en/Model_Details/ERNIE-Image.md +++ b/docs/en/Model_Details/ERNIE-Image.md @@ -16,7 +16,7 @@ For more information on installation, please refer to [Setup Dependencies](../Pi ## Quick Start -Running the following code will load the [baidu/ERNIE-Image](https://www.modelscope.cn/models/baidu/ERNIE-Image) model for inference. VRAM management is enabled, the framework automatically controls parameter loading based on available VRAM, requiring a minimum of 3G VRAM. +Running the following code will load the [PaddlePaddle/ERNIE-Image](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image) model for inference. VRAM management is enabled, the framework automatically controls parameter loading based on available VRAM, requiring a minimum of 3G VRAM. ```python from diffsynth.pipelines.ernie_image import ErnieImagePipeline, ModelConfig @@ -36,11 +36,11 @@ pipe = ErnieImagePipeline.from_pretrained( torch_dtype=torch.bfloat16, device='cuda', model_configs=[ - ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config), - ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors", **vram_config), - ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors", **vram_config), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), ], - tokenizer_config=ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="tokenizer/"), + tokenizer_config=ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="tokenizer/"), vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, ) @@ -60,7 +60,8 @@ image.save("output.jpg") |Model ID|Inference|Low VRAM Inference|Full Training|Full Training Validation|LoRA Training|LoRA Training Validation| |-|-|-|-|-|-|-| -|[baidu/ERNIE-Image: T2I](https://www.modelscope.cn/models/baidu/ERNIE-Image)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_inference/Ernie-Image-T2I.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_inference_low_vram/Ernie-Image-T2I.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_training/full/Ernie-Image-T2I.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_training/validate_full/Ernie-Image-T2I.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_training/lora/Ernie-Image-T2I.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_training/validate_lora/Ernie-Image-T2I.py)| +|[PaddlePaddle/ERNIE-Image](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_inference/ERNIE-Image.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_inference_low_vram/ERNIE-Image.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_training/full/ERNIE-Image.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_training/validate_full/ERNIE-Image.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_training/lora/ERNIE-Image.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_training/validate_lora/ERNIE-Image.py)| +|[PaddlePaddle/ERNIE-Image-Turbo](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image-Turbo)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_inference/ERNIE-Image-Turbo.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_inference_low_vram/ERNIE-Image-Turbo.py)|—|—|—|—| ## Model Inference @@ -92,7 +93,7 @@ ERNIE-Image series models are trained uniformly via [`examples/ernie_image/model * `--data_file_keys`: Field names to load from metadata, typically paths to image or video files, separated by `,`. * Model Loading Configuration * `--model_paths`: Paths to load models from, in JSON format. - * `--model_id_with_origin_paths`: Model IDs with original paths, e.g., `"baidu/ERNIE-Image:transformer/diffusion_pytorch_model*.safetensors"`, separated by commas. + * `--model_id_with_origin_paths`: Model IDs with original paths, e.g., `"PaddlePaddle/ERNIE-Image:transformer/diffusion_pytorch_model*.safetensors"`, separated by commas. * `--extra_inputs`: Additional input parameters required by the model Pipeline, separated by `,`. * `--fp8_models`: Models to load in FP8 format, currently only supported for models whose parameters are not updated by gradients. * Basic Training Configuration diff --git a/docs/zh/Model_Details/ERNIE-Image.md b/docs/zh/Model_Details/ERNIE-Image.md index f7acbe7..b286d04 100644 --- a/docs/zh/Model_Details/ERNIE-Image.md +++ b/docs/zh/Model_Details/ERNIE-Image.md @@ -16,7 +16,7 @@ pip install -e . ## 快速开始 -运行以下代码可以快速加载 [baidu/ERNIE-Image](https://www.modelscope.cn/models/baidu/ERNIE-Image) 模型并进行推理。显存管理已启动,框架会自动根据剩余显存控制模型参数的加载,最低 3G 显存即可运行。 +运行以下代码可以快速加载 [PaddlePaddle/ERNIE-Image](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image) 模型并进行推理。显存管理已启动,框架会自动根据剩余显存控制模型参数的加载,最低 3G 显存即可运行。 ```python from diffsynth.pipelines.ernie_image import ErnieImagePipeline, ModelConfig @@ -36,11 +36,11 @@ pipe = ErnieImagePipeline.from_pretrained( torch_dtype=torch.bfloat16, device='cuda', model_configs=[ - ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config), - ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors", **vram_config), - ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors", **vram_config), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), ], - tokenizer_config=ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="tokenizer/"), + tokenizer_config=ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="tokenizer/"), vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, ) @@ -60,7 +60,8 @@ image.save("output.jpg") |模型 ID|推理|低显存推理|全量训练|全量训练后验证|LoRA 训练|LoRA 训练后验证| |-|-|-|-|-|-|-| -|[baidu/ERNIE-Image: T2I](https://www.modelscope.cn/models/baidu/ERNIE-Image)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_inference/Ernie-Image-T2I.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_inference_low_vram/Ernie-Image-T2I.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_training/full/Ernie-Image-T2I.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_training/validate_full/Ernie-Image-T2I.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_training/lora/Ernie-Image-T2I.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_training/validate_lora/Ernie-Image-T2I.py)| +|[PaddlePaddle/ERNIE-Image](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_inference/ERNIE-Image.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_inference_low_vram/ERNIE-Image.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_training/full/ERNIE-Image.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_training/validate_full/ERNIE-Image.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_training/lora/ERNIE-Image.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_training/validate_lora/ERNIE-Image.py)| +|[PaddlePaddle/ERNIE-Image-Turbo](https://www.modelscope.cn/models/PaddlePaddle/ERNIE-Image-Turbo)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_inference/ERNIE-Image-Turbo.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ernie_image/model_inference_low_vram/ERNIE-Image-Turbo.py)|—|—|—|—| ## 模型推理 @@ -92,7 +93,7 @@ ERNIE-Image 系列模型统一通过 [`examples/ernie_image/model_training/train * `--data_file_keys`: 元数据中需要加载的字段名称,通常是图像或视频文件的路径,以 `,` 分隔。 * 模型加载配置 * `--model_paths`: 要加载的模型路径。JSON 格式。 - * `--model_id_with_origin_paths`: 带原始路径的模型 ID,例如 `"baidu/ERNIE-Image:transformer/diffusion_pytorch_model*.safetensors"`。用逗号分隔。 + * `--model_id_with_origin_paths`: 带原始路径的模型 ID,例如 `"PaddlePaddle/ERNIE-Image:transformer/diffusion_pytorch_model*.safetensors"`。用逗号分隔。 * `--extra_inputs`: 模型 Pipeline 所需的额外输入参数,以 `,` 分隔。 * `--fp8_models`:以 FP8 格式加载的模型,目前仅支持参数不被梯度更新的模型。 * 训练基础配置 diff --git a/examples/ernie_image/model_inference/ERNIE-Image-Turbo.py b/examples/ernie_image/model_inference/ERNIE-Image-Turbo.py new file mode 100644 index 0000000..afec5e5 --- /dev/null +++ b/examples/ernie_image/model_inference/ERNIE-Image-Turbo.py @@ -0,0 +1,25 @@ +from diffsynth.pipelines.ernie_image import ErnieImagePipeline, ModelConfig +import torch + +pipe = ErnieImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device='cuda', + model_configs=[ + ModelConfig(model_id="PaddlePaddle/ERNIE-Image-Turbo", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors"), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), + ], + tokenizer_config=ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="tokenizer/"), +) + +image = pipe( + prompt="一只黑白相间的中华田园犬", + negative_prompt="", + height=1024, + width=1024, + seed=42, + num_inference_steps=8, + cfg_scale=1.0, + sigma_shift=4.0, +) +image.save("output_turbo.jpg") diff --git a/examples/ernie_image/model_inference/ERNIE-Image.py b/examples/ernie_image/model_inference/ERNIE-Image.py new file mode 100644 index 0000000..29e78e8 --- /dev/null +++ b/examples/ernie_image/model_inference/ERNIE-Image.py @@ -0,0 +1,24 @@ +from diffsynth.pipelines.ernie_image import ErnieImagePipeline, ModelConfig +import torch + +pipe = ErnieImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device='cuda', + model_configs=[ + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors"), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), + ], + tokenizer_config=ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="tokenizer/"), +) + +image = pipe( + prompt="一只黑白相间的中华田园犬", + negative_prompt="", + height=1024, + width=1024, + seed=42, + num_inference_steps=50, + cfg_scale=4.0, +) +image.save("output.jpg") diff --git a/examples/ernie_image/model_inference/Ernie-Image-T2I.py b/examples/ernie_image/model_inference/Ernie-Image-T2I.py deleted file mode 100644 index 25332cf..0000000 --- a/examples/ernie_image/model_inference/Ernie-Image-T2I.py +++ /dev/null @@ -1,24 +0,0 @@ -from diffsynth.pipelines.ernie_image import ErnieImagePipeline, ModelConfig -import torch - -pipe = ErnieImagePipeline.from_pretrained( - torch_dtype=torch.bfloat16, - device='cuda', - model_configs=[ - ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), - ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors"), - ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), - ], - tokenizer_config=ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="tokenizer/"), -) - -image = pipe( - prompt="一只黑白相间的中华田园犬", - negative_prompt="", - height=1024, - width=1024, - seed=42, - num_inference_steps=50, - cfg_scale=4.0, -) -image.save("output.jpg") diff --git a/examples/ernie_image/model_inference_low_vram/ERNIE-Image-Turbo.py b/examples/ernie_image/model_inference_low_vram/ERNIE-Image-Turbo.py new file mode 100644 index 0000000..64b928c --- /dev/null +++ b/examples/ernie_image/model_inference_low_vram/ERNIE-Image-Turbo.py @@ -0,0 +1,37 @@ +from diffsynth.pipelines.ernie_image import ErnieImagePipeline, ModelConfig +import torch + +vram_config = { + "offload_dtype": torch.bfloat16, + "offload_device": "cpu", + "onload_dtype": torch.bfloat16, + "onload_device": "cpu", + "preparing_dtype": torch.bfloat16, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", +} + +pipe = ErnieImagePipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device='cuda', + model_configs=[ + ModelConfig(model_id="PaddlePaddle/ERNIE-Image-Turbo", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors", **vram_config), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), + ], + tokenizer_config=ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="tokenizer/"), + vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, +) + +image = pipe( + prompt="一只黑白相间的中华田园犬", + negative_prompt="", + height=1024, + width=1024, + seed=42, + num_inference_steps=8, + cfg_scale=1.0, + sigma_shift=4.0, +) +image.save("output_turbo.jpg") diff --git a/examples/ernie_image/model_inference_low_vram/Ernie-Image-T2I.py b/examples/ernie_image/model_inference_low_vram/ERNIE-Image.py similarity index 60% rename from examples/ernie_image/model_inference_low_vram/Ernie-Image-T2I.py rename to examples/ernie_image/model_inference_low_vram/ERNIE-Image.py index 26b427d..ca74947 100644 --- a/examples/ernie_image/model_inference_low_vram/Ernie-Image-T2I.py +++ b/examples/ernie_image/model_inference_low_vram/ERNIE-Image.py @@ -16,11 +16,11 @@ pipe = ErnieImagePipeline.from_pretrained( torch_dtype=torch.bfloat16, device='cuda', model_configs=[ - ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config), - ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors", **vram_config), - ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors", **vram_config), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config), ], - tokenizer_config=ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="tokenizer/"), + tokenizer_config=ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="tokenizer/"), vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, ) diff --git a/examples/ernie_image/model_training/full/Ernie-Image-T2I.sh b/examples/ernie_image/model_training/full/ERNIE-Image.sh similarity index 75% rename from examples/ernie_image/model_training/full/Ernie-Image-T2I.sh rename to examples/ernie_image/model_training/full/ERNIE-Image.sh index 550dde5..374c1a8 100644 --- a/examples/ernie_image/model_training/full/Ernie-Image-T2I.sh +++ b/examples/ernie_image/model_training/full/ERNIE-Image.sh @@ -6,7 +6,7 @@ accelerate launch --config_file examples/ernie_image/model_training/full/acceler --dataset_metadata_path data/diffsynth_example_dataset/ernie_image/Ernie-Image-T2I/metadata.csv \ --max_pixels 1048576 \ --dataset_repeat 50 \ - --model_id_with_origin_paths "baidu/ERNIE-Image:transformer/diffusion_pytorch_model*.safetensors,baidu/ERNIE-Image:text_encoder/model.safetensors,baidu/ERNIE-Image:vae/diffusion_pytorch_model.safetensors" \ + --model_id_with_origin_paths "PaddlePaddle/ERNIE-Image:transformer/diffusion_pytorch_model*.safetensors,PaddlePaddle/ERNIE-Image:text_encoder/model.safetensors,PaddlePaddle/ERNIE-Image:vae/diffusion_pytorch_model.safetensors" \ --learning_rate 1e-5 \ --num_epochs 2 \ --remove_prefix_in_ckpt "pipe.dit." \ diff --git a/examples/ernie_image/model_training/lora/Ernie-Image-T2I.sh b/examples/ernie_image/model_training/lora/ERNIE-Image.sh similarity index 79% rename from examples/ernie_image/model_training/lora/Ernie-Image-T2I.sh rename to examples/ernie_image/model_training/lora/ERNIE-Image.sh index 5c8732e..20f4ad5 100644 --- a/examples/ernie_image/model_training/lora/Ernie-Image-T2I.sh +++ b/examples/ernie_image/model_training/lora/ERNIE-Image.sh @@ -6,7 +6,7 @@ accelerate launch examples/ernie_image/model_training/train.py \ --dataset_metadata_path data/diffsynth_example_dataset/ernie_image/Ernie-Image-T2I/metadata.csv \ --max_pixels 1048576 \ --dataset_repeat 50 \ - --model_id_with_origin_paths "baidu/ERNIE-Image:transformer/diffusion_pytorch_model*.safetensors,baidu/ERNIE-Image:text_encoder/model.safetensors,baidu/ERNIE-Image:vae/diffusion_pytorch_model.safetensors" \ + --model_id_with_origin_paths "PaddlePaddle/ERNIE-Image:transformer/diffusion_pytorch_model*.safetensors,PaddlePaddle/ERNIE-Image:text_encoder/model.safetensors,PaddlePaddle/ERNIE-Image:vae/diffusion_pytorch_model.safetensors" \ --learning_rate 1e-4 \ --num_epochs 5 \ --remove_prefix_in_ckpt "pipe.dit." \ diff --git a/examples/ernie_image/model_training/train.py b/examples/ernie_image/model_training/train.py index 5fa0bc8..643bf8e 100644 --- a/examples/ernie_image/model_training/train.py +++ b/examples/ernie_image/model_training/train.py @@ -25,7 +25,7 @@ class ErnieImageTrainingModule(DiffusionTrainingModule): super().__init__() # Load models model_configs = self.parse_model_configs(model_paths, model_id_with_origin_paths, fp8_models=fp8_models, offload_models=offload_models, device=device) - tokenizer_config = ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="tokenizer/") if tokenizer_path is None else ModelConfig(tokenizer_path) + tokenizer_config = ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="tokenizer/") if tokenizer_path is None else ModelConfig(tokenizer_path) self.pipe = ErnieImagePipeline.from_pretrained(torch_dtype=torch.bfloat16, device=device, model_configs=model_configs, tokenizer_config=tokenizer_config) self.pipe = self.split_pipeline_units(task, self.pipe, trainable_models, lora_base_model) diff --git a/examples/ernie_image/model_training/validate_full/Ernie-Image-T2I.py b/examples/ernie_image/model_training/validate_full/ERNIE-Image.py similarity index 61% rename from examples/ernie_image/model_training/validate_full/Ernie-Image-T2I.py rename to examples/ernie_image/model_training/validate_full/ERNIE-Image.py index 4664126..11532b3 100644 --- a/examples/ernie_image/model_training/validate_full/Ernie-Image-T2I.py +++ b/examples/ernie_image/model_training/validate_full/ERNIE-Image.py @@ -6,9 +6,9 @@ pipe = ErnieImagePipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), - ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors"), - ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors"), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), ], ) diff --git a/examples/ernie_image/model_training/validate_lora/Ernie-Image-T2I.py b/examples/ernie_image/model_training/validate_lora/ERNIE-Image.py similarity index 65% rename from examples/ernie_image/model_training/validate_lora/Ernie-Image-T2I.py rename to examples/ernie_image/model_training/validate_lora/ERNIE-Image.py index 20f84eb..787b811 100644 --- a/examples/ernie_image/model_training/validate_lora/Ernie-Image-T2I.py +++ b/examples/ernie_image/model_training/validate_lora/ERNIE-Image.py @@ -6,9 +6,9 @@ pipe = ErnieImagePipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), - ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors"), - ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors"), + ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"), ], )