support ernie-image-turbo (#1391)

* support ernie-image-turbo * PR review fix * fix model name
2026-04-14 21:58:17 +00:00 · 2026-04-14 11:35:43 +08:00
parent 960d8c62c0
commit b5d04ceb30
18 changed files with 142 additions and 82 deletions
--- a/diffsynth/configs/model_configs.py
+++ b/diffsynth/configs/model_configs.py
@@ -543,13 +543,13 @@ flux2_series = [

 ernie_image_series = [
    {
-        # Example: ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors")
+        # Example: ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors")
        "model_hash": "584c13713849f1af4e03d5f1858b8b7b",
        "model_name": "ernie_image_dit",
        "model_class": "diffsynth.models.ernie_image_dit.ErnieImageDiT",
    },
    {
-        # Example: ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors")
+        # Example: ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="text_encoder/model.safetensors")
        "model_hash": "404ed9f40796a38dd34c1620f1920207",
        "model_name": "ernie_image_text_encoder",
        "model_class": "diffsynth.models.ernie_image_text_encoder.ErnieImageTextEncoder",
--- a/diffsynth/diffusion/flow_match.py
+++ b/diffsynth/diffusion/flow_match.py
@@ -131,11 +131,14 @@ class FlowMatchScheduler():
        return sigmas, timesteps

    @staticmethod
-    def set_timesteps_ernie_image(num_inference_steps=50, denoising_strength=1.0):
-        """ERNIE-Image scheduler: pure linear sigmas from 1.0 to 0.0, no shift."""
+    def set_timesteps_ernie_image(num_inference_steps=50, denoising_strength=1.0, shift=3.0):
+        sigma_min = 0.0
+        sigma_max = 1.0
        num_train_timesteps = 1000
-        sigma_start = denoising_strength
-        sigmas = torch.linspace(sigma_start, 0.0, num_inference_steps + 1)[:-1]
+        sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength
+        sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps + 1)[:-1]
+        if shift is not None and shift != 1.0:
+            sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
        timesteps = sigmas * num_train_timesteps
        return sigmas, timesteps

@@ -185,9 +188,6 @@ class FlowMatchScheduler():
        return sigmas, timesteps

    def set_training_weight(self):
-        if self.set_timesteps_fn == FlowMatchScheduler.set_timesteps_ernie_image:
-            self.set_uniform_training_weight()
-            return
        steps = 1000
        x = self.timesteps
        y = torch.exp(-2 * ((x - steps / 2) / steps) ** 2)
@@ -199,13 +199,6 @@ class FlowMatchScheduler():
            bsmntw_weighing = bsmntw_weighing + bsmntw_weighing[1]
        self.linear_timesteps_weights = bsmntw_weighing

-    def set_uniform_training_weight(self):
-        """Assign equal weight to every timestep, suitable for linear schedulers like ERNIE-Image."""
-        steps = 1000
-        num_steps = len(self.timesteps)
-        uniform_weight = torch.full((num_steps,), steps / num_steps, dtype=self.timesteps.dtype)
-        self.linear_timesteps_weights = uniform_weight
-        
    def set_timesteps(self, num_inference_steps=100, denoising_strength=1.0, training=False, **kwargs):
        self.sigmas, self.timesteps = self.set_timesteps_fn(
            num_inference_steps=num_inference_steps,
--- a/diffsynth/models/ernie_image_dit.py
+++ b/diffsynth/models/ernie_image_dit.py
@@ -2,7 +2,7 @@
 Ernie-Image DiT for DiffSynth-Studio.

 Refactored from diffusers ErnieImageTransformer2DModel to use DiffSynth core modules.
-Default parameters from actual checkpoint config.json (baidu/ERNIE-Image transformer).
+Default parameters from actual checkpoint config.json (PaddlePaddle/ERNIE-Image transformer).
 """

 import torch
--- a/diffsynth/pipelines/ernie_image.py
+++ b/diffsynth/pipelines/ernie_image.py
@@ -46,7 +46,7 @@ class ErnieImagePipeline(BasePipeline):
        torch_dtype: torch.dtype = torch.bfloat16,
        device: Union[str, torch.device] = get_device_type(),
        model_configs: list[ModelConfig] = [],
-        tokenizer_config: ModelConfig = ModelConfig(model_id="baidu/ERNIE-Image", origin_file_pattern="tokenizer/"),
+        tokenizer_config: ModelConfig = ModelConfig(model_id="PaddlePaddle/ERNIE-Image", origin_file_pattern="tokenizer/"),
        vram_limit: float = None,
    ):
        pipe = ErnieImagePipeline(device=device, torch_dtype=torch_dtype)
@@ -78,11 +78,12 @@ class ErnieImagePipeline(BasePipeline):
        rand_device: str = "cuda",
        # Steps
        num_inference_steps: int = 50,
+        sigma_shift: float = 3.0,
        # Progress bar
        progress_bar_cmd=tqdm,
    ):
        # Scheduler
-        self.scheduler.set_timesteps(num_inference_steps=num_inference_steps)
+        self.scheduler.set_timesteps(num_inference_steps=num_inference_steps, shift=sigma_shift)

        # Parameters
        inputs_posi = {"prompt": prompt}