From 747572e62cdd2e98d45122c24e094573f437ace4 Mon Sep 17 00:00:00 2001 From: Qing112 Date: Mon, 21 Oct 2024 15:09:21 +0800 Subject: [PATCH 1/4] update noise generate --- diffsynth/pipelines/cog_video.py | 4 +++- diffsynth/pipelines/hunyuan_image.py | 3 ++- diffsynth/pipelines/sd3_image.py | 5 +++-- diffsynth/pipelines/sd_image.py | 5 +++-- diffsynth/pipelines/sd_video.py | 5 +++-- diffsynth/pipelines/sdxl_image.py | 5 +++-- diffsynth/pipelines/sdxl_video.py | 5 +++-- diffsynth/pipelines/svd_video.py | 5 +++-- 8 files changed, 23 insertions(+), 14 deletions(-) diff --git a/diffsynth/pipelines/cog_video.py b/diffsynth/pipelines/cog_video.py index 777ce75..959038f 100644 --- a/diffsynth/pipelines/cog_video.py +++ b/diffsynth/pipelines/cog_video.py @@ -73,6 +73,7 @@ class CogVideoPipeline(BasePipeline): tiled=False, tile_size=(60, 90), tile_stride=(30, 45), + seed=None, progress_bar_cmd=tqdm, progress_bar_st=None, ): @@ -83,7 +84,8 @@ class CogVideoPipeline(BasePipeline): self.scheduler.set_timesteps(num_inference_steps, denoising_strength=denoising_strength) # Prepare latent tensors - noise = torch.randn((1, 16, num_frames // 4 + 1, height//8, width//8), device="cpu", dtype=self.torch_dtype) + noise = self.generate_noise((1, 16, num_frames // 4 + 1, height//8, width//8), seed=seed, device="cpu", dtype=self.torch_dtype) + if denoising_strength == 1.0: latents = noise.clone() else: diff --git a/diffsynth/pipelines/hunyuan_image.py b/diffsynth/pipelines/hunyuan_image.py index 3407e54..9cc8c5e 100644 --- a/diffsynth/pipelines/hunyuan_image.py +++ b/diffsynth/pipelines/hunyuan_image.py @@ -226,6 +226,7 @@ class HunyuanDiTImagePipeline(BasePipeline): tiled=False, tile_size=64, tile_stride=32, + seed=None, progress_bar_cmd=tqdm, progress_bar_st=None, ): @@ -233,7 +234,7 @@ class HunyuanDiTImagePipeline(BasePipeline): self.scheduler.set_timesteps(num_inference_steps, denoising_strength) # Prepare latent tensors - noise = torch.randn((1, 4, height//8, width//8), device=self.device, dtype=self.torch_dtype) + noise = self.generate_noise((1, 4, height//8, width//8), seed=seed, device=self.device, dtype=self.torch_dtype) if input_image is not None: self.load_models_to_device(['vae_encoder']) image = self.preprocess_image(input_image).to(device=self.device, dtype=torch.float32) diff --git a/diffsynth/pipelines/sd3_image.py b/diffsynth/pipelines/sd3_image.py index c624ce4..e6e72e9 100644 --- a/diffsynth/pipelines/sd3_image.py +++ b/diffsynth/pipelines/sd3_image.py @@ -87,6 +87,7 @@ class SD3ImagePipeline(BasePipeline): tiled=False, tile_size=128, tile_stride=64, + seed=None, progress_bar_cmd=tqdm, progress_bar_st=None, ): @@ -101,10 +102,10 @@ class SD3ImagePipeline(BasePipeline): self.load_models_to_device(['vae_encoder']) image = self.preprocess_image(input_image).to(device=self.device, dtype=self.torch_dtype) latents = self.encode_image(image, **tiler_kwargs) - noise = torch.randn((1, 16, height//8, width//8), device=self.device, dtype=self.torch_dtype) + noise = self.generate_noise((1, 16, height//8, width//8), seed=seed, device=self.device, dtype=self.torch_dtype) latents = self.scheduler.add_noise(latents, noise, timestep=self.scheduler.timesteps[0]) else: - latents = torch.randn((1, 16, height//8, width//8), device=self.device, dtype=self.torch_dtype) + latents = self.generate_noise((1, 16, height//8, width//8), seed=seed, device=self.device, dtype=self.torch_dtype) # Encode prompts self.load_models_to_device(['text_encoder_1', 'text_encoder_2', 'text_encoder_3']) diff --git a/diffsynth/pipelines/sd_image.py b/diffsynth/pipelines/sd_image.py index 8847027..2c7821e 100644 --- a/diffsynth/pipelines/sd_image.py +++ b/diffsynth/pipelines/sd_image.py @@ -108,6 +108,7 @@ class SDImagePipeline(BasePipeline): tiled=False, tile_size=64, tile_stride=32, + seed=None, progress_bar_cmd=tqdm, progress_bar_st=None, ): @@ -122,10 +123,10 @@ class SDImagePipeline(BasePipeline): self.load_models_to_device(['vae_encoder']) image = self.preprocess_image(input_image).to(device=self.device, dtype=self.torch_dtype) latents = self.encode_image(image, **tiler_kwargs) - noise = torch.randn((1, 4, height//8, width//8), device=self.device, dtype=self.torch_dtype) + noise = self.generate_noise((1, 4, height//8, width//8), seed=seed, device=self.device, dtype=self.torch_dtype) latents = self.scheduler.add_noise(latents, noise, timestep=self.scheduler.timesteps[0]) else: - latents = torch.randn((1, 4, height//8, width//8), device=self.device, dtype=self.torch_dtype) + latents = self.generate_noise((1, 4, height//8, width//8), seed=seed, device=self.device, dtype=self.torch_dtype) # Encode prompts self.load_models_to_device(['text_encoder']) diff --git a/diffsynth/pipelines/sd_video.py b/diffsynth/pipelines/sd_video.py index 8a577b5..99d65a4 100644 --- a/diffsynth/pipelines/sd_video.py +++ b/diffsynth/pipelines/sd_video.py @@ -166,6 +166,7 @@ class SDVideoPipeline(SDImagePipeline): tiled=False, tile_size=64, tile_stride=32, + seed=None, progress_bar_cmd=tqdm, progress_bar_st=None, ): @@ -182,9 +183,9 @@ class SDVideoPipeline(SDImagePipeline): # Prepare latent tensors if self.motion_modules is None: - noise = torch.randn((1, 4, height//8, width//8), device="cpu", dtype=self.torch_dtype).repeat(num_frames, 1, 1, 1) + noise = self.generate_noise((1, 4, height//8, width//8), seed=seed, device="cpu", dtype=self.torch_dtype).repeat(num_frames, 1, 1, 1) else: - noise = torch.randn((num_frames, 4, height//8, width//8), device="cpu", dtype=self.torch_dtype) + noise = self.generate_noise((1, 4, height//8, width//8), seed=seed, device="cpu", dtype=self.torch_dtype) if input_frames is None or denoising_strength == 1.0: latents = noise else: diff --git a/diffsynth/pipelines/sdxl_image.py b/diffsynth/pipelines/sdxl_image.py index 09b7dd4..9b522ba 100644 --- a/diffsynth/pipelines/sdxl_image.py +++ b/diffsynth/pipelines/sdxl_image.py @@ -131,6 +131,7 @@ class SDXLImagePipeline(BasePipeline): tiled=False, tile_size=64, tile_stride=32, + seed=None, progress_bar_cmd=tqdm, progress_bar_st=None, ): @@ -145,10 +146,10 @@ class SDXLImagePipeline(BasePipeline): self.load_models_to_device(['vae_encoder']) image = self.preprocess_image(input_image).to(device=self.device, dtype=self.torch_dtype) latents = self.encode_image(image, **tiler_kwargs) - noise = torch.randn((1, 4, height//8, width//8), device=self.device, dtype=self.torch_dtype) + noise = self.generate_noise((1, 4, height//8, width//8), seed=seed, device=self.device, dtype=self.torch_dtype) latents = self.scheduler.add_noise(latents, noise, timestep=self.scheduler.timesteps[0]) else: - latents = torch.randn((1, 4, height//8, width//8), device=self.device, dtype=self.torch_dtype) + latents = self.generate_noise((1, 4, height//8, width//8), seed=seed, device=self.device, dtype=self.torch_dtype) # Encode prompts self.load_models_to_device(['text_encoder', 'text_encoder_2', 'text_encoder_kolors']) diff --git a/diffsynth/pipelines/sdxl_video.py b/diffsynth/pipelines/sdxl_video.py index faa8bff..cfcccd5 100644 --- a/diffsynth/pipelines/sdxl_video.py +++ b/diffsynth/pipelines/sdxl_video.py @@ -120,6 +120,7 @@ class SDXLVideoPipeline(SDXLImagePipeline): tiled=False, tile_size=64, tile_stride=32, + seed=None, progress_bar_cmd=tqdm, progress_bar_st=None, ): @@ -131,9 +132,9 @@ class SDXLVideoPipeline(SDXLImagePipeline): # Prepare latent tensors if self.motion_modules is None: - noise = torch.randn((1, 4, height//8, width//8), device="cpu", dtype=self.torch_dtype).repeat(num_frames, 1, 1, 1) + noise = self.generate_noise((1, 4, height//8, width//8), seed=seed, device="cpu", dtype=self.torch_dtype).repeat(num_frames, 1, 1, 1) else: - noise = torch.randn((num_frames, 4, height//8, width//8), device="cpu", dtype=self.torch_dtype) + noise = self.generate_noise((num_frames, 4, height//8, width//8), seed=seed, device="cpu", dtype=self.torch_dtype) if input_frames is None or denoising_strength == 1.0: latents = noise else: diff --git a/diffsynth/pipelines/svd_video.py b/diffsynth/pipelines/svd_video.py index 6d99331..8c93b1f 100644 --- a/diffsynth/pipelines/svd_video.py +++ b/diffsynth/pipelines/svd_video.py @@ -51,7 +51,7 @@ class SVDVideoPipeline(BasePipeline): def encode_image_with_vae(self, image, noise_aug_strength): image = self.preprocess_image(image).to(device=self.device, dtype=self.torch_dtype) - noise = torch.randn(image.shape, device="cpu", dtype=self.torch_dtype).to(self.device) + noise = self.generate_noise(image.shape, seed=seed, device=self.device, dtype=self.torch_dtype) image = image + noise_aug_strength * noise image_emb = self.vae_encoder(image) / self.vae_encoder.scaling_factor return image_emb @@ -126,6 +126,7 @@ class SVDVideoPipeline(BasePipeline): num_inference_steps=20, post_normalize=True, contrast_enhance_scale=1.2, + seed=None, progress_bar_cmd=tqdm, progress_bar_st=None, ): @@ -133,7 +134,7 @@ class SVDVideoPipeline(BasePipeline): self.scheduler.set_timesteps(num_inference_steps, denoising_strength=denoising_strength) # Prepare latent tensors - noise = torch.randn((num_frames, 4, height//8, width//8), device="cpu", dtype=self.torch_dtype).to(self.device) + noise = self.generate_noise((num_frames, 4, height//8, width//8), seed=seed, device=self.device, dtype=self.torch_dtype) if denoising_strength == 1.0: latents = noise.clone() else: From d381c7b18608bc8ab951c1675fd012bb8b132b21 Mon Sep 17 00:00:00 2001 From: Qianyi Zhao <49068354+Qing112@users.noreply.github.com> Date: Wed, 23 Oct 2024 03:27:59 -0500 Subject: [PATCH 2/4] Update svd_video.py --- diffsynth/pipelines/svd_video.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/diffsynth/pipelines/svd_video.py b/diffsynth/pipelines/svd_video.py index 8c93b1f..01b358b 100644 --- a/diffsynth/pipelines/svd_video.py +++ b/diffsynth/pipelines/svd_video.py @@ -49,7 +49,7 @@ class SVDVideoPipeline(BasePipeline): return image_emb - def encode_image_with_vae(self, image, noise_aug_strength): + def encode_image_with_vae(self, image, noise_aug_strength, seed): image = self.preprocess_image(image).to(device=self.device, dtype=self.torch_dtype) noise = self.generate_noise(image.shape, seed=seed, device=self.device, dtype=self.torch_dtype) image = image + noise_aug_strength * noise @@ -148,7 +148,7 @@ class SVDVideoPipeline(BasePipeline): # Encode image image_emb_clip_posi = self.encode_image_with_clip(input_image) image_emb_clip_nega = torch.zeros_like(image_emb_clip_posi) - image_emb_vae_posi = repeat(self.encode_image_with_vae(input_image, noise_aug_strength), "B C H W -> (B T) C H W", T=num_frames) + image_emb_vae_posi = repeat(self.encode_image_with_vae(input_image, noise_aug_strength, seed=seed), "B C H W -> (B T) C H W", T=num_frames) image_emb_vae_nega = torch.zeros_like(image_emb_vae_posi) # Prepare classifier-free guidance From c8021d422404063c7c05c09763cd769c4713f3fd Mon Sep 17 00:00:00 2001 From: Qianyi Zhao <49068354+Qing112@users.noreply.github.com> Date: Fri, 25 Oct 2024 01:44:09 -0500 Subject: [PATCH 3/4] Update svd_video.py --- diffsynth/pipelines/svd_video.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/diffsynth/pipelines/svd_video.py b/diffsynth/pipelines/svd_video.py index 01b358b..2dc6311 100644 --- a/diffsynth/pipelines/svd_video.py +++ b/diffsynth/pipelines/svd_video.py @@ -49,7 +49,7 @@ class SVDVideoPipeline(BasePipeline): return image_emb - def encode_image_with_vae(self, image, noise_aug_strength, seed): + def encode_image_with_vae(self, image, noise_aug_strength, seed=None): image = self.preprocess_image(image).to(device=self.device, dtype=self.torch_dtype) noise = self.generate_noise(image.shape, seed=seed, device=self.device, dtype=self.torch_dtype) image = image + noise_aug_strength * noise From 8cfe4820f6c5e19d5e41e4d3b1c283a643d40fdb Mon Sep 17 00:00:00 2001 From: Qianyi Zhao <49068354+Qing112@users.noreply.github.com> Date: Fri, 25 Oct 2024 03:23:01 -0500 Subject: [PATCH 4/4] Update sd_video.py --- diffsynth/pipelines/sd_video.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/diffsynth/pipelines/sd_video.py b/diffsynth/pipelines/sd_video.py index 99d65a4..e9be94a 100644 --- a/diffsynth/pipelines/sd_video.py +++ b/diffsynth/pipelines/sd_video.py @@ -185,7 +185,7 @@ class SDVideoPipeline(SDImagePipeline): if self.motion_modules is None: noise = self.generate_noise((1, 4, height//8, width//8), seed=seed, device="cpu", dtype=self.torch_dtype).repeat(num_frames, 1, 1, 1) else: - noise = self.generate_noise((1, 4, height//8, width//8), seed=seed, device="cpu", dtype=self.torch_dtype) + noise = self.generate_noise((num_frames, 4, height//8, width//8), seed=seed, device="cpu", dtype=self.torch_dtype) if input_frames is None or denoising_strength == 1.0: latents = noise else: