support inference webui tools (#1409)

Zhongjie Duan
2026-04-24 10:13:36 +08:00
committed by GitHub
parent 5c89a15b9a
commit b1af4af8a9
11 changed files with 444 additions and 118 deletions

View File

@@ -1,4 +1,4 @@
-from transformers import DINOv3ViTModel, DINOv3ViTImageProcessorFast
+from transformers import DINOv3ViTModel, DINOv3ViTImageProcessor
 from transformers.models.dinov3_vit.modeling_dinov3_vit import DINOv3ViTConfig
 import torch
@@ -40,7 +40,7 @@ class DINOv3ImageEncoder(DINOv3ViTModel):
             value_bias = False
         )
         super().__init__(config)
-        self.processor = DINOv3ViTImageProcessorFast(
+        self.processor = DINOv3ViTImageProcessor(
             crop_size = None,
             data_format = "channels_first",
             default_to_square = True,
@@ -56,7 +56,7 @@ class DINOv3ImageEncoder(DINOv3ViTModel):
                 0.456,
                 0.406
             ],
-            image_processor_type = "DINOv3ViTImageProcessorFast",
+            image_processor_type = "DINOv3ViTImageProcessor",
             image_std = [
                 0.229,
                 0.224,
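The fast processor is swapped for its slow, PIL/NumPy-based counterpart. A minimal usage sketch, assuming the pinned transformers version exports `DINOv3ViTImageProcessor` with the constructor kwargs shown above (the third `image_std` value is the standard ImageNet 0.225, assumed here since the hunk is truncated):

```python
from PIL import Image
from transformers import DINOv3ViTImageProcessor

# Mirrors the constructor arguments from the diff above.
processor = DINOv3ViTImageProcessor(
    crop_size=None,
    data_format="channels_first",
    default_to_square=True,
    image_mean=[0.485, 0.456, 0.406],  # ImageNet mean, as in the diff
    image_std=[0.229, 0.224, 0.225],   # ImageNet std; last value assumed
)
inputs = processor(Image.new("RGB", (224, 224)), return_tensors="pt")
print(inputs["pixel_values"].shape)  # (1, 3, H, W)
```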

View File

@@ -1,5 +1,5 @@
 from transformers.models.siglip.modeling_siglip import SiglipVisionTransformer, SiglipVisionConfig
-from transformers import SiglipImageProcessor, Siglip2VisionModel, Siglip2VisionConfig, Siglip2ImageProcessorFast
+from transformers import SiglipImageProcessor, Siglip2VisionModel, Siglip2VisionConfig, Siglip2ImageProcessor
 import torch
 from diffsynth.core.device.npu_compatible_device import get_device_type
@@ -90,7 +90,7 @@ class Siglip2ImageEncoder428M(Siglip2VisionModel):
             transformers_version = "4.57.1"
         )
         super().__init__(config)
-        self.processor = Siglip2ImageProcessorFast(
+        self.processor = Siglip2ImageProcessor(
             **{
                 "data_format": "channels_first",
                 "default_to_square": True,
@@ -106,7 +106,7 @@ class Siglip2ImageEncoder428M(Siglip2VisionModel):
                     0.5,
                     0.5
                 ],
-                "image_processor_type": "Siglip2ImageProcessorFast",
+                "image_processor_type": "Siglip2ImageProcessor",
                 "image_std": [
                     0.5,
                     0.5,
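Same swap as in the DINOv3 encoder. A plausible motivation (an assumption, not stated in the commit) is that the `*ImageProcessorFast` classes in transformers are backed by torchvision, which a minimal WebUI deployment may not have installed, while the slow processors work either way:

```python
# Hedged sketch: probe for torchvision before relying on a fast processor.
try:
    import torchvision  # noqa: F401  # backs the *ImageProcessorFast classes
    has_torchvision = True
except ImportError:
    has_torchvision = False

from transformers import Siglip2ImageProcessor

# The slow processor has no torchvision requirement; kwargs mirror the
# diff above (third image_std value assumed to be 0.5, as is usual for SigLIP).
processor = Siglip2ImageProcessor(
    image_mean=[0.5, 0.5, 0.5],
    image_std=[0.5, 0.5, 0.5],
)
```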

View File

@@ -74,7 +74,7 @@ class AnimaImagePipeline(BasePipeline):
     def __call__(
         self,
         # Prompt
-        prompt: str,
+        prompt: str = "",
         negative_prompt: str = "",
         cfg_scale: float = 4.0,
         # Image
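Giving `prompt` a default of `""` lets a form-driven caller omit it entirely. A short sketch, where `pipe` is an instantiated `AnimaImagePipeline` and the form fields are hypothetical:

```python
# Hypothetical WebUI handler: only fields the user actually set are forwarded.
form_values = {"negative_prompt": "", "cfg_scale": 4.0}
image = pipe(**form_values)  # prompt falls back to "" instead of raising TypeError
```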

View File

@@ -75,7 +75,7 @@ class Flux2ImagePipeline(BasePipeline):
     def __call__(
         self,
         # Prompt
-        prompt: str,
+        prompt: str = "",
         negative_prompt: str = "",
         cfg_scale: float = 1.0,
         embedded_guidance: float = 4.0,
@@ -83,7 +83,7 @@ class Flux2ImagePipeline(BasePipeline):
         input_image: Image.Image = None,
         denoising_strength: float = 1.0,
         # Edit
-        edit_image: Union[Image.Image, List[Image.Image]] = None,
+        edit_image: List[Image.Image] = None,
         edit_image_auto_resize: bool = True,
         # Shape
         height: int = 1024,
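Narrowing `edit_image` from `Union[Image.Image, List[Image.Image]]` to `List[Image.Image]` gives the WebUI a single shape to serialize. Callers that previously passed a bare image should wrap it in a list; a hedged sketch (`pipe` is a hypothetical `Flux2ImagePipeline` instance, the path is illustrative):

```python
from PIL import Image

ref = Image.open("reference.png")  # hypothetical reference image
# Before: pipe(prompt="...", edit_image=ref) was accepted via the Union type.
# After: always pass a list, even for a single reference image.
image = pipe(prompt="a red car, edited to blue", edit_image=[ref])
```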

View File

@@ -181,7 +181,7 @@ class FluxImagePipeline(BasePipeline):
     def __call__(
         self,
         # Prompt
-        prompt: str,
+        prompt: str = "",
         negative_prompt: str = "",
         cfg_scale: float = 1.0,
         embedded_guidance: float = 3.5,
@@ -199,10 +199,6 @@ class FluxImagePipeline(BasePipeline):
         sigma_shift: float = None,
         # Steps
         num_inference_steps: int = 30,
-        # local prompts
-        multidiffusion_prompts=(),
-        multidiffusion_masks=(),
-        multidiffusion_scales=(),
         # Kontext
         kontext_images: Union[list[Image.Image], Image.Image] = None,
         # ControlNet
@@ -257,7 +253,6 @@ class FluxImagePipeline(BasePipeline):
             "height": height, "width": width,
             "seed": seed, "rand_device": rand_device,
             "sigma_shift": sigma_shift, "num_inference_steps": num_inference_steps,
-            "multidiffusion_prompts": multidiffusion_prompts, "multidiffusion_masks": multidiffusion_masks, "multidiffusion_scales": multidiffusion_scales,
             "kontext_images": kontext_images,
             "controlnet_inputs": controlnet_inputs,
             "ipadapter_images": ipadapter_images, "ipadapter_scale": ipadapter_scale,

View File

@@ -169,46 +169,46 @@ class LTX2AudioVideoPipeline(BasePipeline):
     def __call__(
         self,
         # Prompt
-        prompt: str,
-        negative_prompt: Optional[str] = "",
+        prompt: str = "",
+        negative_prompt: str = "",
         denoising_strength: float = 1.0,
         # Image-to-video
-        input_images: Optional[list[Image.Image]] = None,
-        input_images_indexes: Optional[list[int]] = [0],
-        input_images_strength: Optional[float] = 1.0,
+        input_images: list[Image.Image] = None,
+        input_images_indexes: list[int] = [0],
+        input_images_strength: float = 1.0,
         # In-Context Video Control
-        in_context_videos: Optional[list[list[Image.Image]]] = None,
-        in_context_downsample_factor: Optional[int] = 2,
+        in_context_videos: list[list[Image.Image]] = None,
+        in_context_downsample_factor: int = 2,
         # Video-to-video
-        retake_video: Optional[list[Image.Image]] = None,
-        retake_video_regions: Optional[list[tuple[float, float]]] = None,
+        retake_video: list[Image.Image] = None,
+        retake_video_regions: list[tuple[float, float]] = None,
         # Audio-to-video
-        retake_audio: Optional[torch.Tensor] = None,
-        audio_sample_rate: Optional[int] = 48000,
-        retake_audio_regions: Optional[list[tuple[float, float]]] = None,
+        retake_audio: torch.Tensor = None,
+        audio_sample_rate: int = 48000,
+        retake_audio_regions: list[tuple[float, float]] = None,
         # Randomness
-        seed: Optional[int] = None,
-        rand_device: Optional[str] = "cpu",
+        seed: int = None,
+        rand_device: str = "cpu",
         # Shape
-        height: Optional[int] = 512,
-        width: Optional[int] = 768,
-        num_frames: Optional[int] = 121,
-        frame_rate: Optional[int] = 24,
+        height: int = 512,
+        width: int = 768,
+        num_frames: int = 121,
+        frame_rate: int = 24,
         # Classifier-free guidance
-        cfg_scale: Optional[float] = 3.0,
+        cfg_scale: float = 3.0,
         # Scheduler
-        num_inference_steps: Optional[int] = 30,
+        num_inference_steps: int = 30,
         # VAE tiling
-        tiled: Optional[bool] = True,
-        tile_size_in_pixels: Optional[int] = 512,
-        tile_overlap_in_pixels: Optional[int] = 128,
-        tile_size_in_frames: Optional[int] = 128,
-        tile_overlap_in_frames: Optional[int] = 24,
+        tiled: bool = True,
+        tile_size_in_pixels: int = 512,
+        tile_overlap_in_pixels: int = 128,
+        tile_size_in_frames: int = 128,
+        tile_overlap_in_frames: int = 24,
         # Special Pipelines
-        use_two_stage_pipeline: Optional[bool] = False,
-        stage2_spatial_upsample_factor: Optional[int] = 2,
-        clear_lora_before_state_two: Optional[bool] = False,
-        use_distilled_pipeline: Optional[bool] = False,
+        use_two_stage_pipeline: bool = False,
+        stage2_spatial_upsample_factor: int = 2,
+        clear_lora_before_state_two: bool = False,
+        use_distilled_pipeline: bool = False,
         # progress_bar
         progress_bar_cmd=tqdm,
     ):
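At call time `Optional[int] = 30` and `int = 30` behave identically; stripping `Optional` leaves one concrete type per parameter, so a form generator never has to unwrap a `Union[T, None]`. A sketch of the simplification (the widget names are hypothetical):

```python
def widget_for(annotation):
    # With Optional stripped, a direct lookup on the concrete type suffices;
    # previously Union[T, None] had to be unwrapped to T first.
    return {int: "number", float: "slider", bool: "checkbox", str: "textbox"}.get(
        annotation, "json"
    )

assert widget_for(int) == "number"     # e.g. num_inference_steps: int = 30
assert widget_for(bool) == "checkbox"  # e.g. tiled: bool = True
```

One detail carried over unchanged: `input_images_indexes: list[int] = [0]` is a mutable default shared across calls, which stays safe only as long as the pipeline never mutates the list in place.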

View File

@@ -115,33 +115,33 @@ class MovaAudioVideoPipeline(BasePipeline):
     def __call__(
         self,
         # Prompt
-        prompt: str,
-        negative_prompt: Optional[str] = "",
+        prompt: str = "",
+        negative_prompt: str = "",
         # Image-to-video
-        input_image: Optional[Image.Image] = None,
+        input_image: Image.Image = None,
         # First-last-frame-to-video
-        end_image: Optional[Image.Image] = None,
+        end_image: Image.Image = None,
         # Video-to-video
-        denoising_strength: Optional[float] = 1.0,
+        denoising_strength: float = 1.0,
         # Randomness
-        seed: Optional[int] = None,
-        rand_device: Optional[str] = "cpu",
+        seed: int = None,
+        rand_device: str = "cpu",
         # Shape
-        height: Optional[int] = 352,
-        width: Optional[int] = 640,
-        num_frames: Optional[int] = 81,
-        frame_rate: Optional[int] = 24,
+        height: int = 352,
+        width: int = 640,
+        num_frames: int = 81,
+        frame_rate: int = 24,
         # Classifier-free guidance
-        cfg_scale: Optional[float] = 5.0,
+        cfg_scale: float = 5.0,
         # Boundary
-        switch_DiT_boundary: Optional[float] = 0.9,
+        switch_DiT_boundary: float = 0.9,
         # Scheduler
-        num_inference_steps: Optional[int] = 50,
-        sigma_shift: Optional[float] = 5.0,
+        num_inference_steps: int = 50,
+        sigma_shift: float = 5.0,
         # VAE tiling
-        tiled: Optional[bool] = True,
-        tile_size: Optional[tuple[int, int]] = (30, 52),
-        tile_stride: Optional[tuple[int, int]] = (15, 26),
+        tiled: bool = True,
+        tile_size: tuple[int, int] = (30, 52),
+        tile_stride: tuple[int, int] = (15, 26),
         # progress_bar
         progress_bar_cmd=tqdm,
     ):

View File

@@ -100,7 +100,7 @@ class QwenImagePipeline(BasePipeline):
     def __call__(
         self,
         # Prompt
-        prompt: str,
+        prompt: str = "",
         negative_prompt: str = "",
         cfg_scale: float = 4.0,
         # Image

View File

@@ -190,82 +190,82 @@ class WanVideoPipeline(BasePipeline):
     def __call__(
         self,
         # Prompt
-        prompt: str,
-        negative_prompt: Optional[str] = "",
+        prompt: str = "",
+        negative_prompt: str = "",
         # Image-to-video
-        input_image: Optional[Image.Image] = None,
+        input_image: Image.Image = None,
         # First-last-frame-to-video
-        end_image: Optional[Image.Image] = None,
+        end_image: Image.Image = None,
         # Video-to-video
-        input_video: Optional[list[Image.Image]] = None,
-        denoising_strength: Optional[float] = 1.0,
+        input_video: list[Image.Image] = None,
+        denoising_strength: float = 1.0,
         # Speech-to-video
-        input_audio: Optional[np.array] = None,
-        audio_embeds: Optional[torch.Tensor] = None,
-        audio_sample_rate: Optional[int] = 16000,
-        s2v_pose_video: Optional[list[Image.Image]] = None,
-        s2v_pose_latents: Optional[torch.Tensor] = None,
-        motion_video: Optional[list[Image.Image]] = None,
+        input_audio: np.array = None,
+        audio_embeds: torch.Tensor = None,
+        audio_sample_rate: int = 16000,
+        s2v_pose_video: list[Image.Image] = None,
+        s2v_pose_latents: torch.Tensor = None,
+        motion_video: list[Image.Image] = None,
         # ControlNet
-        control_video: Optional[list[Image.Image]] = None,
-        reference_image: Optional[Image.Image] = None,
+        control_video: list[Image.Image] = None,
+        reference_image: Image.Image = None,
         # Camera control
-        camera_control_direction: Optional[Literal["Left", "Right", "Up", "Down", "LeftUp", "LeftDown", "RightUp", "RightDown"]] = None,
-        camera_control_speed: Optional[float] = 1/54,
-        camera_control_origin: Optional[tuple] = (0, 0.532139961, 0.946026558, 0.5, 0.5, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0),
+        camera_control_direction: Literal["Left", "Right", "Up", "Down", "LeftUp", "LeftDown", "RightUp", "RightDown"] = None,
+        camera_control_speed: float = 1/54,
+        camera_control_origin: tuple = (0, 0.532139961, 0.946026558, 0.5, 0.5, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0),
         # VACE
-        vace_video: Optional[list[Image.Image]] = None,
-        vace_video_mask: Optional[Image.Image] = None,
-        vace_reference_image: Optional[Image.Image] = None,
-        vace_scale: Optional[float] = 1.0,
+        vace_video: list[Image.Image] = None,
+        vace_video_mask: Image.Image = None,
+        vace_reference_image: Image.Image = None,
+        vace_scale: float = 1.0,
         # Animate
-        animate_pose_video: Optional[list[Image.Image]] = None,
-        animate_face_video: Optional[list[Image.Image]] = None,
-        animate_inpaint_video: Optional[list[Image.Image]] = None,
-        animate_mask_video: Optional[list[Image.Image]] = None,
+        animate_pose_video: list[Image.Image] = None,
+        animate_face_video: list[Image.Image] = None,
+        animate_inpaint_video: list[Image.Image] = None,
+        animate_mask_video: list[Image.Image] = None,
         # VAP
-        vap_video: Optional[list[Image.Image]] = None,
-        vap_prompt: Optional[str] = " ",
-        negative_vap_prompt: Optional[str] = " ",
+        vap_video: list[Image.Image] = None,
+        vap_prompt: str = " ",
+        negative_vap_prompt: str = " ",
         # Randomness
-        seed: Optional[int] = None,
-        rand_device: Optional[str] = "cpu",
+        seed: int = None,
+        rand_device: str = "cpu",
         # Shape
-        height: Optional[int] = 480,
-        width: Optional[int] = 832,
-        num_frames=81,
+        height: int = 480,
+        width: int = 832,
+        num_frames: int = 81,
         # Classifier-free guidance
-        cfg_scale: Optional[float] = 5.0,
-        cfg_merge: Optional[bool] = False,
+        cfg_scale: float = 5.0,
+        cfg_merge: bool = False,
         # Boundary
-        switch_DiT_boundary: Optional[float] = 0.875,
+        switch_DiT_boundary: float = 0.875,
         # Scheduler
-        num_inference_steps: Optional[int] = 50,
-        sigma_shift: Optional[float] = 5.0,
+        num_inference_steps: int = 50,
+        sigma_shift: float = 5.0,
         # Speed control
-        motion_bucket_id: Optional[int] = None,
+        motion_bucket_id: int = None,
         # LongCat-Video
-        longcat_video: Optional[list[Image.Image]] = None,
+        longcat_video: list[Image.Image] = None,
         # VAE tiling
-        tiled: Optional[bool] = True,
-        tile_size: Optional[tuple[int, int]] = (30, 52),
-        tile_stride: Optional[tuple[int, int]] = (15, 26),
+        tiled: bool = True,
+        tile_size: tuple[int, int] = (30, 52),
+        tile_stride: tuple[int, int] = (15, 26),
         # Sliding window
-        sliding_window_size: Optional[int] = None,
-        sliding_window_stride: Optional[int] = None,
+        sliding_window_size: int = None,
+        sliding_window_stride: int = None,
         # Teacache
-        tea_cache_l1_thresh: Optional[float] = None,
-        tea_cache_model_id: Optional[str] = "",
+        tea_cache_l1_thresh: float = None,
+        tea_cache_model_id: str = "",
         # WanToDance
-        wantodance_music_path: Optional[str] = None,
-        wantodance_reference_image: Optional[Image.Image] = None,
-        wantodance_fps: Optional[float] = 30,
-        wantodance_keyframes: Optional[list[Image.Image]] = None,
-        wantodance_keyframes_mask: Optional[list[int]] = None,
+        wantodance_music_path: str = None,
+        wantodance_reference_image: Image.Image = None,
+        wantodance_fps: float = 30,
+        wantodance_keyframes: list[Image.Image] = None,
+        wantodance_keyframes_mask: list[int] = None,
         framewise_decoding: bool = False,
         # progress_bar
         progress_bar_cmd=tqdm,
-        output_type: Optional[Literal["quantized", "floatpoint"]] = "quantized",
+        output_type: Literal["quantized", "floatpoint"] = "quantized",
     ):
         # Scheduler
         self.scheduler.set_timesteps(num_inference_steps, denoising_strength=denoising_strength, shift=sigma_shift)
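The one body line shown ties three of the re-defaulted parameters together in the scheduler setup. A minimal sketch of what a flow-matching `set_timesteps` of this shape typically does, as an illustration of the common recipe rather than a transcription of this repo's scheduler:

```python
import torch

def set_timesteps(num_inference_steps: int = 50,
                  denoising_strength: float = 1.0,
                  shift: float = 5.0,
                  num_train_timesteps: int = 1000) -> torch.Tensor:
    # Video-to-video starts partway down the schedule when strength < 1.
    sigmas = torch.linspace(denoising_strength, 0.0, num_inference_steps + 1)[:-1]
    # Common flow-matching "sigma shift": s' = shift*s / (1 + (shift-1)*s).
    sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
    return sigmas * num_train_timesteps  # descending timesteps

print(set_timesteps(5))  # 5 descending timesteps starting at 1000.0
```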

View File

@@ -95,7 +95,7 @@ class ZImagePipeline(BasePipeline):
     def __call__(
         self,
         # Prompt
-        prompt: str,
+        prompt: str = "",
         negative_prompt: str = "",
         cfg_scale: float = 1.0,
         # Image
@@ -109,7 +109,7 @@ class ZImagePipeline(BasePipeline):
         width: int = 1024,
         # Randomness
         seed: int = None,
-        rand_device: str = "cpu",
+        rand_device: Union[str, torch.device] = "cpu",
         # Steps
         num_inference_steps: int = 8,
         sigma_shift: float = None,
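Widening `rand_device` to `Union[str, torch.device]` matches what torch itself accepts. A quick sketch showing the two forms are interchangeable when seeding noise:

```python
import torch

g = torch.Generator(device="cpu").manual_seed(42)
a = torch.randn(1, 4, 8, 8, generator=g, device="cpu")

g = torch.Generator(device=torch.device("cpu")).manual_seed(42)
b = torch.randn(1, 4, 8, 8, generator=g, device=torch.device("cpu"))

assert torch.equal(a, b)  # same seed, same device spec -> identical noise
```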