mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-03-18 22:08:13 +00:00
@@ -73,6 +73,7 @@ class CogVideoPipeline(BasePipeline):
|
||||
tiled=False,
|
||||
tile_size=(60, 90),
|
||||
tile_stride=(30, 45),
|
||||
seed=None,
|
||||
progress_bar_cmd=tqdm,
|
||||
progress_bar_st=None,
|
||||
):
|
||||
@@ -83,7 +84,8 @@ class CogVideoPipeline(BasePipeline):
|
||||
self.scheduler.set_timesteps(num_inference_steps, denoising_strength=denoising_strength)
|
||||
|
||||
# Prepare latent tensors
|
||||
noise = torch.randn((1, 16, num_frames // 4 + 1, height//8, width//8), device="cpu", dtype=self.torch_dtype)
|
||||
noise = self.generate_noise((1, 16, num_frames // 4 + 1, height//8, width//8), seed=seed, device="cpu", dtype=self.torch_dtype)
|
||||
|
||||
if denoising_strength == 1.0:
|
||||
latents = noise.clone()
|
||||
else:
|
||||
|
||||
@@ -226,6 +226,7 @@ class HunyuanDiTImagePipeline(BasePipeline):
|
||||
tiled=False,
|
||||
tile_size=64,
|
||||
tile_stride=32,
|
||||
seed=None,
|
||||
progress_bar_cmd=tqdm,
|
||||
progress_bar_st=None,
|
||||
):
|
||||
@@ -233,7 +234,7 @@ class HunyuanDiTImagePipeline(BasePipeline):
|
||||
self.scheduler.set_timesteps(num_inference_steps, denoising_strength)
|
||||
|
||||
# Prepare latent tensors
|
||||
noise = torch.randn((1, 4, height//8, width//8), device=self.device, dtype=self.torch_dtype)
|
||||
noise = self.generate_noise((1, 4, height//8, width//8), seed=seed, device=self.device, dtype=self.torch_dtype)
|
||||
if input_image is not None:
|
||||
self.load_models_to_device(['vae_encoder'])
|
||||
image = self.preprocess_image(input_image).to(device=self.device, dtype=torch.float32)
|
||||
|
||||
@@ -87,6 +87,7 @@ class SD3ImagePipeline(BasePipeline):
|
||||
tiled=False,
|
||||
tile_size=128,
|
||||
tile_stride=64,
|
||||
seed=None,
|
||||
progress_bar_cmd=tqdm,
|
||||
progress_bar_st=None,
|
||||
):
|
||||
@@ -101,10 +102,10 @@ class SD3ImagePipeline(BasePipeline):
|
||||
self.load_models_to_device(['vae_encoder'])
|
||||
image = self.preprocess_image(input_image).to(device=self.device, dtype=self.torch_dtype)
|
||||
latents = self.encode_image(image, **tiler_kwargs)
|
||||
noise = torch.randn((1, 16, height//8, width//8), device=self.device, dtype=self.torch_dtype)
|
||||
noise = self.generate_noise((1, 16, height//8, width//8), seed=seed, device=self.device, dtype=self.torch_dtype)
|
||||
latents = self.scheduler.add_noise(latents, noise, timestep=self.scheduler.timesteps[0])
|
||||
else:
|
||||
latents = torch.randn((1, 16, height//8, width//8), device=self.device, dtype=self.torch_dtype)
|
||||
latents = self.generate_noise((1, 16, height//8, width//8), seed=seed, device=self.device, dtype=self.torch_dtype)
|
||||
|
||||
# Encode prompts
|
||||
self.load_models_to_device(['text_encoder_1', 'text_encoder_2', 'text_encoder_3'])
|
||||
|
||||
@@ -108,6 +108,7 @@ class SDImagePipeline(BasePipeline):
|
||||
tiled=False,
|
||||
tile_size=64,
|
||||
tile_stride=32,
|
||||
seed=None,
|
||||
progress_bar_cmd=tqdm,
|
||||
progress_bar_st=None,
|
||||
):
|
||||
@@ -122,10 +123,10 @@ class SDImagePipeline(BasePipeline):
|
||||
self.load_models_to_device(['vae_encoder'])
|
||||
image = self.preprocess_image(input_image).to(device=self.device, dtype=self.torch_dtype)
|
||||
latents = self.encode_image(image, **tiler_kwargs)
|
||||
noise = torch.randn((1, 4, height//8, width//8), device=self.device, dtype=self.torch_dtype)
|
||||
noise = self.generate_noise((1, 4, height//8, width//8), seed=seed, device=self.device, dtype=self.torch_dtype)
|
||||
latents = self.scheduler.add_noise(latents, noise, timestep=self.scheduler.timesteps[0])
|
||||
else:
|
||||
latents = torch.randn((1, 4, height//8, width//8), device=self.device, dtype=self.torch_dtype)
|
||||
latents = self.generate_noise((1, 4, height//8, width//8), seed=seed, device=self.device, dtype=self.torch_dtype)
|
||||
|
||||
# Encode prompts
|
||||
self.load_models_to_device(['text_encoder'])
|
||||
|
||||
@@ -166,6 +166,7 @@ class SDVideoPipeline(SDImagePipeline):
|
||||
tiled=False,
|
||||
tile_size=64,
|
||||
tile_stride=32,
|
||||
seed=None,
|
||||
progress_bar_cmd=tqdm,
|
||||
progress_bar_st=None,
|
||||
):
|
||||
@@ -182,9 +183,9 @@ class SDVideoPipeline(SDImagePipeline):
|
||||
|
||||
# Prepare latent tensors
|
||||
if self.motion_modules is None:
|
||||
noise = torch.randn((1, 4, height//8, width//8), device="cpu", dtype=self.torch_dtype).repeat(num_frames, 1, 1, 1)
|
||||
noise = self.generate_noise((1, 4, height//8, width//8), seed=seed, device="cpu", dtype=self.torch_dtype).repeat(num_frames, 1, 1, 1)
|
||||
else:
|
||||
noise = torch.randn((num_frames, 4, height//8, width//8), device="cpu", dtype=self.torch_dtype)
|
||||
noise = self.generate_noise((num_frames, 4, height//8, width//8), seed=seed, device="cpu", dtype=self.torch_dtype)
|
||||
if input_frames is None or denoising_strength == 1.0:
|
||||
latents = noise
|
||||
else:
|
||||
|
||||
@@ -131,6 +131,7 @@ class SDXLImagePipeline(BasePipeline):
|
||||
tiled=False,
|
||||
tile_size=64,
|
||||
tile_stride=32,
|
||||
seed=None,
|
||||
progress_bar_cmd=tqdm,
|
||||
progress_bar_st=None,
|
||||
):
|
||||
@@ -145,10 +146,10 @@ class SDXLImagePipeline(BasePipeline):
|
||||
self.load_models_to_device(['vae_encoder'])
|
||||
image = self.preprocess_image(input_image).to(device=self.device, dtype=self.torch_dtype)
|
||||
latents = self.encode_image(image, **tiler_kwargs)
|
||||
noise = torch.randn((1, 4, height//8, width//8), device=self.device, dtype=self.torch_dtype)
|
||||
noise = self.generate_noise((1, 4, height//8, width//8), seed=seed, device=self.device, dtype=self.torch_dtype)
|
||||
latents = self.scheduler.add_noise(latents, noise, timestep=self.scheduler.timesteps[0])
|
||||
else:
|
||||
latents = torch.randn((1, 4, height//8, width//8), device=self.device, dtype=self.torch_dtype)
|
||||
latents = self.generate_noise((1, 4, height//8, width//8), seed=seed, device=self.device, dtype=self.torch_dtype)
|
||||
|
||||
# Encode prompts
|
||||
self.load_models_to_device(['text_encoder', 'text_encoder_2', 'text_encoder_kolors'])
|
||||
|
||||
@@ -120,6 +120,7 @@ class SDXLVideoPipeline(SDXLImagePipeline):
|
||||
tiled=False,
|
||||
tile_size=64,
|
||||
tile_stride=32,
|
||||
seed=None,
|
||||
progress_bar_cmd=tqdm,
|
||||
progress_bar_st=None,
|
||||
):
|
||||
@@ -131,9 +132,9 @@ class SDXLVideoPipeline(SDXLImagePipeline):
|
||||
|
||||
# Prepare latent tensors
|
||||
if self.motion_modules is None:
|
||||
noise = torch.randn((1, 4, height//8, width//8), device="cpu", dtype=self.torch_dtype).repeat(num_frames, 1, 1, 1)
|
||||
noise = self.generate_noise((1, 4, height//8, width//8), seed=seed, device="cpu", dtype=self.torch_dtype).repeat(num_frames, 1, 1, 1)
|
||||
else:
|
||||
noise = torch.randn((num_frames, 4, height//8, width//8), device="cpu", dtype=self.torch_dtype)
|
||||
noise = self.generate_noise((num_frames, 4, height//8, width//8), seed=seed, device="cpu", dtype=self.torch_dtype)
|
||||
if input_frames is None or denoising_strength == 1.0:
|
||||
latents = noise
|
||||
else:
|
||||
|
||||
@@ -49,9 +49,9 @@ class SVDVideoPipeline(BasePipeline):
|
||||
return image_emb
|
||||
|
||||
|
||||
def encode_image_with_vae(self, image, noise_aug_strength):
|
||||
def encode_image_with_vae(self, image, noise_aug_strength, seed=None):
|
||||
image = self.preprocess_image(image).to(device=self.device, dtype=self.torch_dtype)
|
||||
noise = torch.randn(image.shape, device="cpu", dtype=self.torch_dtype).to(self.device)
|
||||
noise = self.generate_noise(image.shape, seed=seed, device=self.device, dtype=self.torch_dtype)
|
||||
image = image + noise_aug_strength * noise
|
||||
image_emb = self.vae_encoder(image) / self.vae_encoder.scaling_factor
|
||||
return image_emb
|
||||
@@ -126,6 +126,7 @@ class SVDVideoPipeline(BasePipeline):
|
||||
num_inference_steps=20,
|
||||
post_normalize=True,
|
||||
contrast_enhance_scale=1.2,
|
||||
seed=None,
|
||||
progress_bar_cmd=tqdm,
|
||||
progress_bar_st=None,
|
||||
):
|
||||
@@ -133,7 +134,7 @@ class SVDVideoPipeline(BasePipeline):
|
||||
self.scheduler.set_timesteps(num_inference_steps, denoising_strength=denoising_strength)
|
||||
|
||||
# Prepare latent tensors
|
||||
noise = torch.randn((num_frames, 4, height//8, width//8), device="cpu", dtype=self.torch_dtype).to(self.device)
|
||||
noise = self.generate_noise((num_frames, 4, height//8, width//8), seed=seed, device=self.device, dtype=self.torch_dtype)
|
||||
if denoising_strength == 1.0:
|
||||
latents = noise.clone()
|
||||
else:
|
||||
@@ -147,7 +148,7 @@ class SVDVideoPipeline(BasePipeline):
|
||||
# Encode image
|
||||
image_emb_clip_posi = self.encode_image_with_clip(input_image)
|
||||
image_emb_clip_nega = torch.zeros_like(image_emb_clip_posi)
|
||||
image_emb_vae_posi = repeat(self.encode_image_with_vae(input_image, noise_aug_strength), "B C H W -> (B T) C H W", T=num_frames)
|
||||
image_emb_vae_posi = repeat(self.encode_image_with_vae(input_image, noise_aug_strength, seed=seed), "B C H W -> (B T) C H W", T=num_frames)
|
||||
image_emb_vae_nega = torch.zeros_like(image_emb_vae_posi)
|
||||
|
||||
# Prepare classifier-free guidance
|
||||
|
||||
Reference in New Issue
Block a user