diff --git a/diffsynth/pipelines/flux2_image.py b/diffsynth/pipelines/flux2_image.py
index 7b6dcc4..4e837e9 100644
--- a/diffsynth/pipelines/flux2_image.py
+++ b/diffsynth/pipelines/flux2_image.py
@@ -83,7 +83,7 @@ class Flux2ImagePipeline(BasePipeline):
         input_image: Image.Image = None,
         denoising_strength: float = 1.0,
         # Edit
-        edit_image: Union[Image.Image, List[Image.Image]] = None,
+        edit_image: List[Image.Image] = None,
         edit_image_auto_resize: bool = True,
         # Shape
         height: int = 1024,
diff --git a/diffsynth/pipelines/flux_image.py b/diffsynth/pipelines/flux_image.py
index db2d522..fdace11 100644
--- a/diffsynth/pipelines/flux_image.py
+++ b/diffsynth/pipelines/flux_image.py
@@ -200,9 +200,9 @@ class FluxImagePipeline(BasePipeline):
         # Steps
         num_inference_steps: int = 30,
         # local prompts
-        multidiffusion_prompts=(),
-        multidiffusion_masks=(),
-        multidiffusion_scales=(),
+        multidiffusion_prompts: tuple[str, ...] = (),
+        multidiffusion_masks: tuple[Image.Image, ...] = (),
+        multidiffusion_scales: tuple[float, ...] = (),
         # Kontext
         kontext_images: Union[list[Image.Image], Image.Image] = None,
         # ControlNet
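Both Flux hunks narrow previously loose signatures: `Flux2ImagePipeline` callers must now pass `edit_image=[img]` rather than a bare `Image.Image`, and the multidiffusion arguments of `FluxImagePipeline` are annotated as variable-length parallel tuples (one prompt, one mask, one scale per local region). A minimal call sketch under those signatures; the `pipe` instance and the mask file are placeholders, not part of this patch:

```python
from PIL import Image

# Assumes `pipe` is an already-constructed FluxImagePipeline.
mask = Image.open("region_mask.png")  # hypothetical region mask

image = pipe(
    prompt="a city street at dusk",
    # Parallel tuples: the i-th prompt is applied inside the i-th mask
    # with the i-th blending scale.
    multidiffusion_prompts=("a neon storefront",),
    multidiffusion_masks=(mask,),
    multidiffusion_scales=(1.0,),
)
```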
diff --git a/diffsynth/pipelines/ltx2_audio_video.py b/diffsynth/pipelines/ltx2_audio_video.py
index 1263b43..af31e08 100644
--- a/diffsynth/pipelines/ltx2_audio_video.py
+++ b/diffsynth/pipelines/ltx2_audio_video.py
@@ -170,45 +170,45 @@ class LTX2AudioVideoPipeline(BasePipeline):
         self,
         # Prompt
         prompt: str,
-        negative_prompt: Optional[str] = "",
+        negative_prompt: str = "",
         denoising_strength: float = 1.0,
         # Image-to-video
-        input_images: Optional[list[Image.Image]] = None,
-        input_images_indexes: Optional[list[int]] = [0],
-        input_images_strength: Optional[float] = 1.0,
+        input_images: list[Image.Image] = None,
+        input_images_indexes: list[int] = [0],
+        input_images_strength: float = 1.0,
         # In-Context Video Control
-        in_context_videos: Optional[list[list[Image.Image]]] = None,
-        in_context_downsample_factor: Optional[int] = 2,
+        in_context_videos: list[list[Image.Image]] = None,
+        in_context_downsample_factor: int = 2,
         # Video-to-video
-        retake_video: Optional[list[Image.Image]] = None,
-        retake_video_regions: Optional[list[tuple[float, float]]] = None,
+        retake_video: list[Image.Image] = None,
+        retake_video_regions: list[tuple[float, float]] = None,
         # Audio-to-video
-        retake_audio: Optional[torch.Tensor] = None,
-        audio_sample_rate: Optional[int] = 48000,
-        retake_audio_regions: Optional[list[tuple[float, float]]] = None,
+        retake_audio: torch.Tensor = None,
+        audio_sample_rate: int = 48000,
+        retake_audio_regions: list[tuple[float, float]] = None,
         # Randomness
-        seed: Optional[int] = None,
-        rand_device: Optional[str] = "cpu",
+        seed: int = None,
+        rand_device: str = "cpu",
         # Shape
-        height: Optional[int] = 512,
-        width: Optional[int] = 768,
-        num_frames: Optional[int] = 121,
-        frame_rate: Optional[int] = 24,
+        height: int = 512,
+        width: int = 768,
+        num_frames: int = 121,
+        frame_rate: int = 24,
         # Classifier-free guidance
-        cfg_scale: Optional[float] = 3.0,
+        cfg_scale: float = 3.0,
         # Scheduler
-        num_inference_steps: Optional[int] = 30,
+        num_inference_steps: int = 30,
         # VAE tiling
-        tiled: Optional[bool] = True,
-        tile_size_in_pixels: Optional[int] = 512,
-        tile_overlap_in_pixels: Optional[int] = 128,
-        tile_size_in_frames: Optional[int] = 128,
-        tile_overlap_in_frames: Optional[int] = 24,
+        tiled: bool = True,
+        tile_size_in_pixels: int = 512,
+        tile_overlap_in_pixels: int = 128,
+        tile_size_in_frames: int = 128,
+        tile_overlap_in_frames: int = 24,
         # Special Pipelines
-        use_two_stage_pipeline: Optional[bool] = False,
-        stage2_spatial_upsample_factor: Optional[int] = 2,
-        clear_lora_before_state_two: Optional[bool] = False,
-        use_distilled_pipeline: Optional[bool] = False,
+        use_two_stage_pipeline: bool = False,
+        stage2_spatial_upsample_factor: int = 2,
+        clear_lora_before_state_two: bool = False,
+        use_distilled_pipeline: bool = False,
         # progress_bar
         progress_bar_cmd=tqdm,
     ):
diff --git a/diffsynth/pipelines/mova_audio_video.py b/diffsynth/pipelines/mova_audio_video.py
index d89d3ff..9475e16 100644
--- a/diffsynth/pipelines/mova_audio_video.py
+++ b/diffsynth/pipelines/mova_audio_video.py
@@ -116,32 +116,32 @@ class MovaAudioVideoPipeline(BasePipeline):
         self,
         # Prompt
         prompt: str,
-        negative_prompt: Optional[str] = "",
+        negative_prompt: str = "",
         # Image-to-video
-        input_image: Optional[Image.Image] = None,
+        input_image: Image.Image = None,
         # First-last-frame-to-video
-        end_image: Optional[Image.Image] = None,
+        end_image: Image.Image = None,
         # Video-to-video
-        denoising_strength: Optional[float] = 1.0,
+        denoising_strength: float = 1.0,
         # Randomness
-        seed: Optional[int] = None,
-        rand_device: Optional[str] = "cpu",
+        seed: int = None,
+        rand_device: str = "cpu",
         # Shape
-        height: Optional[int] = 352,
-        width: Optional[int] = 640,
-        num_frames: Optional[int] = 81,
-        frame_rate: Optional[int] = 24,
+        height: int = 352,
+        width: int = 640,
+        num_frames: int = 81,
+        frame_rate: int = 24,
         # Classifier-free guidance
-        cfg_scale: Optional[float] = 5.0,
+        cfg_scale: float = 5.0,
         # Boundary
-        switch_DiT_boundary: Optional[float] = 0.9,
+        switch_DiT_boundary: float = 0.9,
         # Scheduler
-        num_inference_steps: Optional[int] = 50,
-        sigma_shift: Optional[float] = 5.0,
+        num_inference_steps: int = 50,
+        sigma_shift: float = 5.0,
         # VAE tiling
-        tiled: Optional[bool] = True,
-        tile_size: Optional[tuple[int, int]] = (30, 52),
-        tile_stride: Optional[tuple[int, int]] = (15, 26),
+        tiled: bool = True,
+        tile_size: tuple[int, int] = (30, 52),
+        tile_stride: tuple[int, int] = (15, 26),
         # progress_bar
         progress_bar_cmd=tqdm,
     ):
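The LTX2 signature keeps the two-stage options as plain `bool`/`int` parameters with the same defaults. A sketch of how they compose in a call, assuming an already-constructed `LTX2AudioVideoPipeline` named `pipe`; the return format is not shown in this hunk, so `result` is left opaque (both are assumptions, not part of this patch):

```python
# Assumes `pipe` is an already-constructed LTX2AudioVideoPipeline.
result = pipe(
    prompt="a drummer performing on stage",
    height=512,
    width=768,
    num_frames=121,
    frame_rate=24,
    use_two_stage_pipeline=True,       # base pass first, then stage two
    stage2_spatial_upsample_factor=2,  # stage two spatially upsamples 2x
    seed=0,
)
```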
diff --git a/diffsynth/pipelines/wan_video.py b/diffsynth/pipelines/wan_video.py
index 1c1aa7e..9022b13 100644
--- a/diffsynth/pipelines/wan_video.py
+++ b/diffsynth/pipelines/wan_video.py
@@ -191,81 +191,81 @@ class WanVideoPipeline(BasePipeline):
         self,
         # Prompt
         prompt: str,
-        negative_prompt: Optional[str] = "",
+        negative_prompt: str = "",
         # Image-to-video
-        input_image: Optional[Image.Image] = None,
+        input_image: Image.Image = None,
         # First-last-frame-to-video
-        end_image: Optional[Image.Image] = None,
+        end_image: Image.Image = None,
         # Video-to-video
-        input_video: Optional[list[Image.Image]] = None,
-        denoising_strength: Optional[float] = 1.0,
+        input_video: list[Image.Image] = None,
+        denoising_strength: float = 1.0,
         # Speech-to-video
-        input_audio: Optional[np.array] = None,
-        audio_embeds: Optional[torch.Tensor] = None,
-        audio_sample_rate: Optional[int] = 16000,
-        s2v_pose_video: Optional[list[Image.Image]] = None,
-        s2v_pose_latents: Optional[torch.Tensor] = None,
-        motion_video: Optional[list[Image.Image]] = None,
+        input_audio: np.ndarray = None,
+        audio_embeds: torch.Tensor = None,
+        audio_sample_rate: int = 16000,
+        s2v_pose_video: list[Image.Image] = None,
+        s2v_pose_latents: torch.Tensor = None,
+        motion_video: list[Image.Image] = None,
         # ControlNet
-        control_video: Optional[list[Image.Image]] = None,
-        reference_image: Optional[Image.Image] = None,
+        control_video: list[Image.Image] = None,
+        reference_image: Image.Image = None,
         # Camera control
-        camera_control_direction: Optional[Literal["Left", "Right", "Up", "Down", "LeftUp", "LeftDown", "RightUp", "RightDown"]] = None,
-        camera_control_speed: Optional[float] = 1/54,
-        camera_control_origin: Optional[tuple] = (0, 0.532139961, 0.946026558, 0.5, 0.5, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0),
+        camera_control_direction: Literal["Left", "Right", "Up", "Down", "LeftUp", "LeftDown", "RightUp", "RightDown"] = None,
+        camera_control_speed: float = 1/54,
+        camera_control_origin: tuple = (0, 0.532139961, 0.946026558, 0.5, 0.5, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0),
         # VACE
-        vace_video: Optional[list[Image.Image]] = None,
-        vace_video_mask: Optional[Image.Image] = None,
-        vace_reference_image: Optional[Image.Image] = None,
-        vace_scale: Optional[float] = 1.0,
+        vace_video: list[Image.Image] = None,
+        vace_video_mask: Image.Image = None,
+        vace_reference_image: Image.Image = None,
+        vace_scale: float = 1.0,
         # Animate
-        animate_pose_video: Optional[list[Image.Image]] = None,
-        animate_face_video: Optional[list[Image.Image]] = None,
-        animate_inpaint_video: Optional[list[Image.Image]] = None,
-        animate_mask_video: Optional[list[Image.Image]] = None,
+        animate_pose_video: list[Image.Image] = None,
+        animate_face_video: list[Image.Image] = None,
+        animate_inpaint_video: list[Image.Image] = None,
+        animate_mask_video: list[Image.Image] = None,
         # VAP
-        vap_video: Optional[list[Image.Image]] = None,
-        vap_prompt: Optional[str] = " ",
-        negative_vap_prompt: Optional[str] = " ",
+        vap_video: list[Image.Image] = None,
+        vap_prompt: str = " ",
+        negative_vap_prompt: str = " ",
         # Randomness
-        seed: Optional[int] = None,
-        rand_device: Optional[str] = "cpu",
+        seed: int = None,
+        rand_device: str = "cpu",
         # Shape
-        height: Optional[int] = 480,
-        width: Optional[int] = 832,
-        num_frames=81,
+        height: int = 480,
+        width: int = 832,
+        num_frames: int = 81,
         # Classifier-free guidance
-        cfg_scale: Optional[float] = 5.0,
-        cfg_merge: Optional[bool] = False,
+        cfg_scale: float = 5.0,
+        cfg_merge: bool = False,
         # Boundary
-        switch_DiT_boundary: Optional[float] = 0.875,
+        switch_DiT_boundary: float = 0.875,
         # Scheduler
-        num_inference_steps: Optional[int] = 50,
-        sigma_shift: Optional[float] = 5.0,
+        num_inference_steps: int = 50,
+        sigma_shift: float = 5.0,
         # Speed control
-        motion_bucket_id: Optional[int] = None,
+        motion_bucket_id: int = None,
         # LongCat-Video
-        longcat_video: Optional[list[Image.Image]] = None,
+        longcat_video: list[Image.Image] = None,
         # VAE tiling
-        tiled: Optional[bool] = True,
-        tile_size: Optional[tuple[int, int]] = (30, 52),
-        tile_stride: Optional[tuple[int, int]] = (15, 26),
+        tiled: bool = True,
+        tile_size: tuple[int, int] = (30, 52),
+        tile_stride: tuple[int, int] = (15, 26),
         # Sliding window
-        sliding_window_size: Optional[int] = None,
-        sliding_window_stride: Optional[int] = None,
+        sliding_window_size: int = None,
+        sliding_window_stride: int = None,
         # Teacache
-        tea_cache_l1_thresh: Optional[float] = None,
-        tea_cache_model_id: Optional[str] = "",
+        tea_cache_l1_thresh: float = None,
+        tea_cache_model_id: str = "",
         # WanToDance
-        wantodance_music_path: Optional[str] = None,
-        wantodance_reference_image: Optional[Image.Image] = None,
-        wantodance_fps: Optional[float] = 30,
-        wantodance_keyframes: Optional[list[Image.Image]] = None,
-        wantodance_keyframes_mask: Optional[list[int]] = None,
+        wantodance_music_path: str = None,
+        wantodance_reference_image: Image.Image = None,
+        wantodance_fps: float = 30,
+        wantodance_keyframes: list[Image.Image] = None,
+        wantodance_keyframes_mask: list[int] = None,
         framewise_decoding: bool = False,
         # progress_bar
         progress_bar_cmd=tqdm,
-        output_type: Optional[Literal["quantized", "floatpoint"]] = "quantized",
+        output_type: Literal["quantized", "floatpoint"] = "quantized",
     ):
         # Scheduler
         self.scheduler.set_timesteps(num_inference_steps, denoising_strength=denoising_strength, shift=sigma_shift)
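Dropping `Optional` changes nothing at runtime: annotations like the `camera_control_direction` `Literal` are never enforced by Python itself. A self-contained sketch (not part of the patch) of how a caller can validate such a value before passing it in:

```python
from typing import Literal, get_args

CameraDirection = Literal["Left", "Right", "Up", "Down",
                          "LeftUp", "LeftDown", "RightUp", "RightDown"]

def validate_direction(direction: str) -> str:
    # Literal annotations are documentation only at runtime,
    # so check membership explicitly before calling the pipeline.
    if direction not in get_args(CameraDirection):
        raise ValueError(f"unsupported camera direction: {direction!r}")
    return direction

print(validate_direction("LeftUp"))  # LeftUp
```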
diff --git a/examples/dev_tools/webui.py b/examples/dev_tools/webui.py
index bc96b3e..c133b06 100644
--- a/examples/dev_tools/webui.py
+++ b/examples/dev_tools/webui.py
@@ -1,6 +1,7 @@
 import importlib, inspect, pkgutil, traceback, torch, os, re
 from typing import Union, List, Optional, Tuple, Iterable, Dict
 from contextlib import contextmanager
+
 import streamlit as st
 from diffsynth import ModelConfig
 from diffsynth.diffusion.base_pipeline import ControlNetInput
@@ -280,4 +281,3 @@ def launch_webui():
     print(f"unsupported result format: {result}")
 
 launch_webui()
-# streamlit run examples/dev_tools/webui.py --server.fileWatcherType none