mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-03-25 18:58:11 +00:00
align schedulers
This commit is contained in:
@@ -1,6 +1,5 @@
|
|||||||
from ..models import ModelManager, SVDImageEncoder, SVDUNet, SVDVAEEncoder, SVDVAEDecoder
|
from ..models import ModelManager, SVDImageEncoder, SVDUNet, SVDVAEEncoder, SVDVAEDecoder
|
||||||
from ..schedulers import ContinuousODEScheduler
|
from ..schedulers import ContinuousODEScheduler
|
||||||
from ..data import save_video
|
|
||||||
import torch
|
import torch
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
@@ -93,16 +92,14 @@ class SVDVideoPipeline(torch.nn.Module):
|
|||||||
image_emb_vae_posi, image_emb_clip_posi,
|
image_emb_vae_posi, image_emb_clip_posi,
|
||||||
image_emb_vae_nega, image_emb_clip_nega
|
image_emb_vae_nega, image_emb_clip_nega
|
||||||
):
|
):
|
||||||
latents_input = self.scheduler.scale_model_input(latents, timestep)
|
|
||||||
|
|
||||||
# Positive side
|
# Positive side
|
||||||
noise_pred_posi = self.unet(
|
noise_pred_posi = self.unet(
|
||||||
torch.cat([latents_input, image_emb_vae_posi], dim=1),
|
torch.cat([latents, image_emb_vae_posi], dim=1),
|
||||||
timestep, image_emb_clip_posi, add_time_id
|
timestep, image_emb_clip_posi, add_time_id
|
||||||
)
|
)
|
||||||
# Negative side
|
# Negative side
|
||||||
noise_pred_nega = self.unet(
|
noise_pred_nega = self.unet(
|
||||||
torch.cat([latents_input, image_emb_vae_nega], dim=1),
|
torch.cat([latents, image_emb_vae_nega], dim=1),
|
||||||
timestep, image_emb_clip_nega, add_time_id
|
timestep, image_emb_clip_nega, add_time_id
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -136,7 +133,7 @@ class SVDVideoPipeline(torch.nn.Module):
|
|||||||
# Prepare latent tensors
|
# Prepare latent tensors
|
||||||
noise = torch.randn((num_frames, 4, height//8, width//8), device="cpu", dtype=self.torch_dtype).to(self.device)
|
noise = torch.randn((num_frames, 4, height//8, width//8), device="cpu", dtype=self.torch_dtype).to(self.device)
|
||||||
if denoising_strength == 1.0:
|
if denoising_strength == 1.0:
|
||||||
latents = noise * self.scheduler.init_noise_sigma
|
latents = noise
|
||||||
else:
|
else:
|
||||||
latents = self.encode_video_with_vae(input_video)
|
latents = self.encode_video_with_vae(input_video)
|
||||||
latents = self.scheduler.add_noise(latents, noise, self.scheduler.timesteps[0])
|
latents = self.scheduler.add_noise(latents, noise, self.scheduler.timesteps[0])
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
import torch, math
|
import torch
|
||||||
|
|
||||||
|
|
||||||
class ContinuousODEScheduler():
|
class ContinuousODEScheduler():
|
||||||
@@ -7,12 +7,11 @@ class ContinuousODEScheduler():
|
|||||||
self.sigma_max = sigma_max
|
self.sigma_max = sigma_max
|
||||||
self.sigma_min = sigma_min
|
self.sigma_min = sigma_min
|
||||||
self.rho = rho
|
self.rho = rho
|
||||||
self.init_noise_sigma = math.sqrt(sigma_max*sigma_max + 1)
|
|
||||||
self.set_timesteps(num_inference_steps)
|
self.set_timesteps(num_inference_steps)
|
||||||
|
|
||||||
|
|
||||||
def set_timesteps(self, num_inference_steps=100, denoising_strength=1.0):
|
def set_timesteps(self, num_inference_steps=100, denoising_strength=1.0):
|
||||||
ramp = torch.linspace(0, denoising_strength, num_inference_steps)
|
ramp = torch.linspace(1-denoising_strength, 1, num_inference_steps)
|
||||||
min_inv_rho = torch.pow(torch.tensor((self.sigma_min,)), (1 / self.rho))
|
min_inv_rho = torch.pow(torch.tensor((self.sigma_min,)), (1 / self.rho))
|
||||||
max_inv_rho = torch.pow(torch.tensor((self.sigma_max,)), (1 / self.rho))
|
max_inv_rho = torch.pow(torch.tensor((self.sigma_max,)), (1 / self.rho))
|
||||||
self.sigmas = torch.pow(max_inv_rho + ramp * (min_inv_rho - max_inv_rho), self.rho)
|
self.sigmas = torch.pow(max_inv_rho + ramp * (min_inv_rho - max_inv_rho), self.rho)
|
||||||
@@ -22,22 +21,17 @@ class ContinuousODEScheduler():
|
|||||||
def step(self, model_output, timestep, sample, to_final=False):
|
def step(self, model_output, timestep, sample, to_final=False):
|
||||||
timestep_id = torch.argmin((self.timesteps - timestep).abs())
|
timestep_id = torch.argmin((self.timesteps - timestep).abs())
|
||||||
sigma = self.sigmas[timestep_id]
|
sigma = self.sigmas[timestep_id]
|
||||||
|
sample *= (sigma*sigma + 1).sqrt()
|
||||||
estimated_sample = -sigma / (sigma*sigma + 1).sqrt() * model_output + 1 / (sigma*sigma + 1) * sample
|
estimated_sample = -sigma / (sigma*sigma + 1).sqrt() * model_output + 1 / (sigma*sigma + 1) * sample
|
||||||
if to_final or timestep_id + 1 >= len(self.timesteps):
|
if to_final or timestep_id + 1 >= len(self.timesteps):
|
||||||
prev_sample = estimated_sample
|
prev_sample = estimated_sample
|
||||||
else:
|
else:
|
||||||
dt = self.sigmas[timestep_id + 1] - sigma
|
sigma_ = self.sigmas[timestep_id + 1]
|
||||||
derivative = 1 / sigma * (sample - estimated_sample)
|
derivative = 1 / sigma * (sample - estimated_sample)
|
||||||
prev_sample = sample + derivative * dt
|
prev_sample = sample + derivative * (sigma_ - sigma)
|
||||||
|
prev_sample /= (sigma_*sigma_ + 1).sqrt()
|
||||||
return prev_sample
|
return prev_sample
|
||||||
|
|
||||||
|
|
||||||
def scale_model_input(self, sample, timestep):
|
|
||||||
timestep_id = torch.argmin((self.timesteps - timestep).abs())
|
|
||||||
sigma = self.sigmas[timestep_id]
|
|
||||||
sample = sample / (sigma*sigma + 1).sqrt()
|
|
||||||
return sample
|
|
||||||
|
|
||||||
|
|
||||||
def return_to_timestep(self, timestep, sample, sample_stablized):
|
def return_to_timestep(self, timestep, sample, sample_stablized):
|
||||||
# This scheduler doesn't support this function.
|
# This scheduler doesn't support this function.
|
||||||
@@ -47,6 +41,5 @@ class ContinuousODEScheduler():
|
|||||||
def add_noise(self, original_samples, noise, timestep):
|
def add_noise(self, original_samples, noise, timestep):
|
||||||
timestep_id = torch.argmin((self.timesteps - timestep).abs())
|
timestep_id = torch.argmin((self.timesteps - timestep).abs())
|
||||||
sigma = self.sigmas[timestep_id]
|
sigma = self.sigmas[timestep_id]
|
||||||
sample = original_samples + noise * sigma
|
sample = (original_samples + noise * sigma) / (sigma*sigma + 1).sqrt()
|
||||||
return sample
|
return sample
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user