diff --git a/diffsynth/pipelines/sd_video.py b/diffsynth/pipelines/sd_video.py index eb812cf..8a577b5 100644 --- a/diffsynth/pipelines/sd_video.py +++ b/diffsynth/pipelines/sd_video.py @@ -257,10 +257,10 @@ class SDVideoPipeline(SDImagePipeline): progress_bar_st.progress(progress_id / len(self.scheduler.timesteps)) # Decode image - image = self.decode_video(latents, **tiler_kwargs) + output_frames = self.decode_video(latents, **tiler_kwargs) # Post-process if smoother is not None and (num_inference_steps in smoother_progress_ids or -1 in smoother_progress_ids): output_frames = smoother(output_frames, original_frames=input_frames) - return image + return output_frames diff --git a/diffsynth/pipelines/sdxl_video.py b/diffsynth/pipelines/sdxl_video.py index d918ae7..faa8bff 100644 --- a/diffsynth/pipelines/sdxl_video.py +++ b/diffsynth/pipelines/sdxl_video.py @@ -214,10 +214,10 @@ class SDXLVideoPipeline(SDXLImagePipeline): progress_bar_st.progress(progress_id / len(self.scheduler.timesteps)) # Decode image - image = self.decode_video(latents, **tiler_kwargs) + output_frames = self.decode_video(latents, **tiler_kwargs) # Post-process if smoother is not None and (num_inference_steps in smoother_progress_ids or -1 in smoother_progress_ids): output_frames = smoother(output_frames, original_frames=input_frames) - return image + return output_frames diff --git a/examples/ExVideo/ExVideo_svd_train.py b/examples/ExVideo/ExVideo_svd_train.py index 4342315..6a6221e 100644 --- a/examples/ExVideo/ExVideo_svd_train.py +++ b/examples/ExVideo/ExVideo_svd_train.py @@ -4,7 +4,7 @@ import numpy as np from einops import rearrange, repeat import lightning as pl from diffsynth import ModelManager, SVDImageEncoder, SVDUNet, SVDVAEEncoder, ContinuousODEScheduler, load_state_dict -from diffsynth.pipelines.stable_video_diffusion import SVDCLIPImageProcessor +from diffsynth.pipelines.svd_video import SVDCLIPImageProcessor from diffsynth.models.svd_unet import TemporalAttentionBlock @@ -131,14 +131,14 @@ class LightningModel(pl.LightningModule): self.image_encoder.requires_grad_(False) self.unet = SVDUNet(add_positional_conv=add_positional_conv).to(dtype=torch.float16, device=self.device) - self.unet.load_state_dict(SVDUNet.state_dict_converter().from_civitai(state_dict), strict=False) + self.unet.load_state_dict(SVDUNet.state_dict_converter().from_civitai(state_dict, add_positional_conv=add_positional_conv), strict=False) self.unet.train() self.unet.requires_grad_(False) for block in self.unet.blocks: if isinstance(block, TemporalAttentionBlock): block.requires_grad_(True) - self.vae_encoder = SVDVAEEncoder.to(dtype=torch.float16, device=self.device) + self.vae_encoder = SVDVAEEncoder().to(dtype=torch.float16, device=self.device) self.vae_encoder.load_state_dict(SVDVAEEncoder.state_dict_converter().from_civitai(state_dict)) self.vae_encoder.eval() self.vae_encoder.requires_grad_(False) diff --git a/examples/image_synthesis/sd3_text_to_image_textual_inversion.py b/examples/image_synthesis/sd3_text_to_image_textual_inversion.py deleted file mode 100644 index 1cf5256..0000000 --- a/examples/image_synthesis/sd3_text_to_image_textual_inversion.py +++ /dev/null @@ -1,32 +0,0 @@ -from diffsynth import ModelManager, SD3ImagePipeline, download_models, load_state_dict -import torch - - -# Download models (automatically) -# `models/stable_diffusion_3/sd3_medium_incl_clips.safetensors`: [link](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips.safetensors) -# `models/textual_inversion/verybadimagenegative_v1.3.pt`: [link](https://civitai.com/api/download/models/25820?type=Model&format=PickleTensor&size=full&fp=fp16) -download_models(["StableDiffusion3_without_T5", "TextualInversion_VeryBadImageNegative_v1.3"]) -model_manager = ModelManager(torch_dtype=torch.float16, device="cuda") -model_manager.load_textual_inversions("models/textual_inversion") -model_manager.load_models(["models/stable_diffusion_3/sd3_medium_incl_clips.safetensors"]) -pipe = SD3ImagePipeline.from_model_manager(model_manager) - - -for seed in range(4): - torch.manual_seed(seed) - image = pipe( - prompt="a girl, highly detailed, absurd res, perfect image", - negative_prompt="verybadimagenegative_v1.3", - cfg_scale=4.5, - num_inference_steps=50, width=1024, height=1024, - ) - image.save(f"image_with_textual_inversion_{seed}.jpg") - - torch.manual_seed(seed) - image = pipe( - prompt="a girl, highly detailed, absurd res, perfect image", - negative_prompt="", - cfg_scale=4.5, - num_inference_steps=50, width=1024, height=1024, - ) - image.save(f"image_without_textual_inversion_{seed}.jpg")